# Script to check that a biogeme estimation data file has no error.
# Authors: Ricardo Hurtubia and Michel Bierlaire
# Wed Mar 14 08:30:18 2012

import builtins
import string
import sys
import random
import time
import re
import functools
import os
import math
import html


def mean(x):
    """Return the arithmetic mean of the numbers in ``x``.

    Raises ``ZeroDivisionError`` when ``x`` is empty.
    """
    total = 0.0
    for value in x:
        total += value
    return total / len(x)


def variance(x, mean):
    """Return the population variance of ``x`` around the given ``mean``.

    The sum of squared deviations is divided by ``len(x)`` (not
    ``len(x) - 1``).  Raises ``ZeroDivisionError`` when ``x`` is empty.
    """
    total = 0.0
    for value in x:
        total += (value - mean) * (value - mean)
    return total / len(x)


def isAscii(s):
    """Return True if the string ``s`` can be encoded as ASCII."""
    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    else:
        return True


def round_to_n(x, n):
    """Round ``x`` to ``n`` significant digits; a falsy ``x`` yields 0."""
    if not x:
        return 0
    power = -int(math.floor(math.log10(abs(x)))) + (n - 1)
    factor = (10 ** power)
    return round(x * factor) / factor


# Fields are separated by exactly one whitespace character, so two
# consecutive separators produce an empty field (reported later as text).
_FIELD_SEPARATOR = re.compile(r'\s')

# Progress messages printed once the given number of lines has been read.
_PROGRESS_MESSAGES = {
    500000: "500000 lines read",
    1000000: "1000000 lines read",
    1500000: "1500000 lines read",
    2000000: "more than 2000000 lines read",
}

# Column titles of the statistics table, in output order.
_STAT_TITLES = (
    "Variable",
    "Minimum",
    "Mean",
    "Variance",
    "Std dev.",
    "Maximum",
    "Nbr of zeros",
    "Percentage of zeros",
)


def _read_table(path):
    """Read ``path`` and return its lines as lists of string fields."""
    table = []
    with open(path, 'r') as source:
        for count, line in enumerate(source, start=1):
            row = _FIELD_SEPARATOR.split(line.rstrip())
            table.append(row)
            if count == 1:
                print(len(row), "headers: ", row)
            if count in _PROGRESS_MESSAGES:
                print(_PROGRESS_MESSAGES[count])
    return table


def _check_table(table, n_columns, merged_rows):
    """Validate the data rows of ``table`` and convert them to floats in place.

    Every problem is reported on stdout; the return value is True when no
    problem was found.
    """
    ok = True
    for i, row in enumerate(table):
        if i == 0:
            # The first row holds the headers, which are not numeric.
            continue
        width = merged_rows * len(row)
        if width > n_columns:
            print("Length [", i, "]: ", len(row))
            print("Length [0]: ", n_columns)
            print("error in line", i + 1, " (more columns (", width,
                  ") than headers (", n_columns, "))")
            ok = False
        if width < n_columns:
            print("error in line", i + 1, " (less columns (", width,
                  ") than headers (", n_columns, "))")
            ok = False
        for j, field in enumerate(row):
            try:
                row[j] = float(field)
            except ValueError:
                print("error in line", i + 1, " (column ", j + 1,
                      " contains text: ", field, ")")
                ok = False
    return ok


def _write_cell(out, text):
    """Write one table cell containing ``text`` to the file ``out``."""
    print("<td>" + text + "</td>", file=out)


def _write_statistics(htmlfile, filewithpath, headers, rows, current_time):
    """Write per-column statistics of ``rows`` as an HTML table in ``htmlfile``.

    A message is printed on stdout for each column whose standard deviation
    is zero.
    """
    # NOTE(review): the exact original markup could not be recovered from
    # this copy of the script; the tags below are a minimal reconstruction.
    # Confirm against the upstream version if specific attributes matter.
    n_data = len(rows)
    with open(htmlfile, 'w', encoding='utf-8') as h:
        print("<html>", file=h)
        print("<head>", file=h)
        print("<meta http-equiv='Content-Type' "
              "content='text/html; charset=utf-8' />", file=h)
        print("<title>" + html.escape(htmlfile) + " Statistics "
              + current_time + "</title>", file=h)
        print("</head>", file=h)
        print("<body>", file=h)
        print("<p>Biogeme home page: "
              "<a href='http://biogeme.epfl.ch'>http://biogeme.epfl.ch</a>"
              "</p>", file=h)
        print("<p>Michel Bierlaire, Transport and Mobility Laboratory, "
              "Ecole Polytechnique Fédérale de Lausanne (EPFL)</p>", file=h)
        print("<p>This file has automatically been generated on ", file=h)
        print(current_time + "</p>", file=h)
        print("<h1>Statistics on {}</h1>".format(html.escape(filewithpath)),
              file=h)
        print("<p>Total number of data: {}</p>".format(n_data), file=h)
        print("<table border='1'>", file=h)
        print("<tr>", file=h)
        for title in _STAT_TITLES:
            print("<th>" + title + "</th>", file=h)
        print("</tr>", file=h)
        # NOTE(review): every header column is read from every data row, so
        # with more than one merged row (rows shorter than the header) this
        # lookup looks like it would fail -- confirm the intended semantics.
        for c, name in enumerate(headers):
            column = [row[c] for row in rows]
            the_mean = mean(column)
            the_variance = variance(column, the_mean)
            std_dev = math.sqrt(the_variance)
            zeros = column.count(0.0)
            print("<tr>", file=h)
            # Header names come from the data file: escape them for HTML.
            _write_cell(h, html.escape(name))
            _write_cell(h, format(round_to_n(min(column), 3)))
            _write_cell(h, format(round_to_n(the_mean, 3)))
            _write_cell(h, format(round_to_n(the_variance, 3)))
            _write_cell(h, format(round_to_n(std_dev, 3)))
            if std_dev == 0:
                print("Variable " + name + " does not vary in the sample")
            _write_cell(h, format(round_to_n(max(column), 3)))
            _write_cell(h, "{}".format(zeros))
            percentage = 100.0 * float(zeros) / float(n_data)
            _write_cell(h, format(round_to_n(percentage, 3)) + "%")
            print("</tr>", file=h)
        print("</table>", file=h)
        print("</body>", file=h)
        print("</html>", file=h)


def main(argv=None):
    """Check a biogeme data file and write an HTML report of its statistics.

    ``argv`` defaults to ``sys.argv``: ``argv[1]`` is the data file and the
    optional ``argv[2]`` is the number of merged rows (default 1).  The
    process exits early when the arguments are wrong or the file contains
    errors.
    """
    argv = sys.argv if argv is None else argv
    print(sys.path)

    # Validate the arguments before touching argv[1]; otherwise a missing
    # file name raises IndexError instead of showing the syntax message.
    argc = len(argv)
    if argc == 1:
        print("Syntax: ", argv[0], " mydata.dat [rowsMerged=1]")
        sys.exit()
    elif argc == 2:
        mergedRows = 1
    else:
        mergedRows = int(argv[2])
        if mergedRows < 1:
            print("The number of merged rows is incorrect: ", mergedRows)
            sys.exit()
        else:
            print("Each ", mergedRows, "consecutive rows are merged")

    filewithpath = str(argv[1])
    filename = os.path.basename(filewithpath)
    htmlfile = filename.replace('.', '_') + ".html"

    print("Check if the file ", filewithpath,
          " is complying with biogeme's requirements.")
    print("Reading data")
    tab_data = _read_table(filewithpath)
    print(" ")

    if not tab_data:
        print("The file ", filewithpath, " is empty")
        sys.exit()

    headers = tab_data[0]
    nRows = len(tab_data)
    nColumns = len(headers)
    print(nRows, "lines")
    print(nColumns, "columns")

    if _check_table(tab_data, nColumns, mergedRows):
        print("data check finalized, no errors.")
    else:
        print("The file does not comply with biogeme's requirements")
        sys.exit()

    if nRows == 1:
        # Only the header line is present: nothing to compute statistics on.
        print("The file contains no data row: no statistics are generated")
        sys.exit()

    # Calculate statistics
    currentTime = time.strftime("%c")
    _write_statistics(htmlfile, filewithpath, headers, tab_data[1:],
                      currentTime)
    print("Statistics are available in " + htmlfile)


if __name__ == "__main__":
    main()