"""Audit the dataframe"""fromtypingimportNamedTupleimportnumpyasnpimportpandasaspdfrombiogeme.audit_tupleimportAuditTuplefrombiogeme.exceptionsimportBiogemeErrorfrombiogeme.expressionsimportExpressionfrombiogeme.toolsimportcount_number_of_groupsfrom.containerimportDatabase
def check_availability_of_chosen_alt(
    database: Database, avail: dict[int, Expression], choice: Expression
) -> pd.Series:
    """Check if the chosen alternative is available for each entry in the database.

    :param database: object containing the data
    :param avail: dictionary mapping each alternative identifier to the
        expression evaluating its availability condition.
    :param choice: expression for the chosen alternative.
    :return: boolean array-like, as long as the number of evaluated choices,
        containing True if the chosen alternative is available (availability
        different from zero), False otherwise. The value is built with
        ``np.array(...) != 0``.
    :raise BiogemeError: if a chosen alternative does not appear in the
        availability dict
    :raise KeyError: if the lookup fails for a reason other than a missing
        alternative.
    """
    # Imported inside the function as in the original code -- presumably to
    # avoid a circular import with biogeme.calculator (TODO confirm).
    from biogeme.calculator import evaluate_expression

    choice_array = evaluate_expression(
        expression=choice,
        numerically_safe=False,
        database=database,
        use_jit=True,
    )
    calculated_avail = {
        key: evaluate_expression(
            expression=expression,
            numerically_safe=False,
            database=database,
            use_jit=True,
        )
        for key, expression in avail.items()
    }

    try:
        # For row i, pick the availability of the alternative chosen in row i.
        avail_chosen = np.array(
            [calculated_avail[c][i] for i, c in enumerate(choice_array)]
        )
    except KeyError as exc:
        # Report the first chosen alternative that has no availability entry.
        for c in choice_array:
            if c not in calculated_avail:
                err_msg = (
                    f'Chosen alternative {c} does not appear in '
                    f'availability dict: {calculated_avail.keys()}'
                )
                raise BiogemeError(err_msg) from exc
        # Every chosen alternative is known, so the KeyError has another
        # cause: propagate it instead of silently returning None.
        raise
    return avail_chosen != 0
def choice_availability_statistics(
    database: Database, avail: dict[int, Expression], choice: Expression
) -> dict[int, ChosenAvailable]:
    """Calculate the number of times each alternative is chosen and available.

    :param database: object containing the data
    :param avail: dictionary mapping each alternative identifier to the
        expression evaluating its availability condition.
    :param choice: expression for the chosen alternative.
    :return: for each alternative that is chosen at least once, a tuple
        containing the number of times it is chosen, and the number of times
        it is available (sum of its evaluated availability values).
    :raise KeyError: if a chosen alternative has no entry in ``avail``.

    NOTE(review): the previous documentation mentioned a BiogemeError raised
    when the database is empty. Nothing in this function raises it directly;
    it presumably originates from ``evaluate_expression`` -- to be confirmed.
    """
    # Imported inside the function as in the original code -- presumably to
    # avoid a circular import with biogeme.calculator (TODO confirm).
    from biogeme.calculator import evaluate_expression

    choice_array = evaluate_expression(
        expression=choice,
        numerically_safe=False,
        database=database,
        use_jit=True,
    )
    calculated_avail = {
        key: evaluate_expression(
            expression=expression,
            numerically_safe=False,
            database=database,
            use_jit=True,
        )
        for key, expression in avail.items()
    }

    # Distinct chosen alternatives and how often each one occurs.
    alternatives, counts = np.unique(choice_array, return_counts=True)
    choice_stat = {alt: int(count) for alt, count in zip(alternatives, counts)}

    # Availability count per alternative: sum of the evaluated indicators.
    avail_stat = {k: sum(a) for k, a in calculated_avail.items()}

    # Only alternatives that are actually chosen appear in the result.
    return {
        alt: ChosenAvailable(chosen=c, available=avail_stat[alt])
        for alt, c in choice_stat.items()
    }
def audit_dataframe(data: pd.DataFrame) -> AuditTuple:
    """Perform a series of checks on a pandas DataFrame and report the issues.

    Two checks are performed: every column must have a numeric dtype, and
    the DataFrame must not contain any missing value.

    :param data: The DataFrame to audit.
    :return: an AuditTuple holding the list of errors and the list of
        warnings. No check currently produces a warning, so the list of
        warnings is always empty.
    """
    list_of_warnings: list[str] = []
    list_of_errors: list[str] = []

    for col, dtype in data.dtypes.items():
        try:
            is_numeric = np.issubdtype(dtype, np.number)
        except TypeError:
            # NumPy cannot interpret pandas extension dtypes (e.g. category
            # or nullable dtypes). Report the column instead of crashing
            # the audit.
            is_numeric = False
        if not is_numeric:
            list_of_errors.append(
                f'Column {col} in the database contains non-numeric type: {dtype}'
            )

    # Compute the mask of missing values once and reuse it for the report.
    nan_locations = data.isnull()
    if nan_locations.values.any():
        rows_with_nan = data.index[nan_locations.any(axis=1)].tolist()
        cols_with_nan = data.columns[nan_locations.any(axis=0)].tolist()
        list_of_errors.append(
            f"The database contains NaN value(s).\n"
            f"Columns with NaN: {cols_with_nan}\n"
            f"Rows with NaN: {rows_with_nan}\n"
            f"Use database.dataframe.isna() to inspect further."
        )

    return AuditTuple(errors=list_of_errors, warnings=list_of_warnings)
def audit_panel_dataframe(
    data: pd.DataFrame, id_column: str
) -> tuple[list[str], list[str]]:
    """Perform panel-specific checks on a pandas DataFrame.

    Ensures that the identifier column exists and that the entries for the
    same individual are contiguous.

    :param data: The DataFrame to audit.
    :param id_column: The name of the column identifying individuals.
    :return: A tuple (list_of_errors, list_of_warnings). No check currently
        produces a warning, so the list of warnings is always empty.
    """
    list_of_errors: list[str] = []
    list_of_warnings: list[str] = []

    if id_column not in data.columns:
        # The column name is a string: it must be formatted without the
        # integer format code, which raises ValueError on a str.
        list_of_errors.append(
            f"The column '{id_column}' is missing from the dataset."
        )
        return list_of_errors, list_of_warnings

    # The group count is compared before and after sorting by the identifier.
    # Presumably count_number_of_groups counts runs of consecutive identical
    # ids, so that a mismatch reveals non-contiguous individuals -- TODO
    # confirm against biogeme.tools.
    original_groups = count_number_of_groups(data, id_column)
    sorted_data = data.sort_values(by=id_column).reset_index(drop=True)
    sorted_groups = count_number_of_groups(sorted_data, id_column)

    if original_groups != sorted_groups:
        list_of_errors.append(
            f"The data must be sorted so that entries for the same individual "
            f"are contiguous. Found {original_groups} original groups, "
            f"but {sorted_groups} after sorting."
        )

    return list_of_errors, list_of_warnings