import logging

import pandas as pd
from biogeme.expressions import Variable

from .segmentation import DiscreteSegmentationTuple
from ..exceptions import BiogemeError

logger = logging.getLogger(__name__)
"""Logger that controls the output of messages to the screen and log file. """
def generate_segmentation(
    dataframe: pd.DataFrame,
    variable: Variable | str,
    mapping: dict[int, str] | None = None,
    reference: str | None = None,
) -> DiscreteSegmentationTuple:
    """Generate a segmentation tuple for a variable.

    :param dataframe: data frame.
    :param variable: Variable object or name of the variable
    :param mapping: mapping associating values of the variable to
        names. If incomplete, default names are provided. If None,
        default names of the form ``<variable name>_<value>`` are used
        for every value found in the data.
    :param reference: name of the reference category. If None, an
        arbitrary category is selected as reference.
    :return: segmentation tuple built from the variable, the completed
        mapping and the reference.
    :raise BiogemeError: if the variable is not a column of the data
        frame, if the mapping contains values that do not appear in the
        data, or if the reference is not the name of a category of the
        completed mapping.
    """
    the_variable = (
        variable if isinstance(variable, Variable) else Variable(variable)
    )

    # Check if the variable is in the database.
    if the_variable.name not in dataframe.columns:
        error_msg = f'Unknown variable {the_variable.name}'
        raise BiogemeError(error_msg)

    # Extract all unique values from the data base.
    unique_values = set(dataframe[the_variable.name].unique())

    if len(unique_values) >= 10:
        warning_msg = (
            f'Variable {the_variable.name} takes a total of '
            f'{len(unique_values)} different values in the database. It is '
            f'likely to be too large for a discrete segmentation.'
        )
        logger.warning(warning_msg)

    # ``mapping`` is optional: treat a missing mapping as an empty one so
    # that the consistency checks below do not fail on None.
    user_mapping = {} if mapping is None else mapping

    # Check that the provided mapping is consistent with the data
    values_not_in_data = [
        value for value in user_mapping if value not in unique_values
    ]
    if values_not_in_data:
        error_msg = (
            f'The following values in the mapping do not exist in the data for '
            f'variable {the_variable.name}: {values_not_in_data}'
        )
        raise BiogemeError(error_msg)

    # Start from default names for every value observed in the data, then
    # let the user-provided names override them.
    the_mapping = {
        int(value): f'{the_variable.name}_{int(value)}'
        for value in unique_values
    }
    the_mapping.update(user_mapping)

    # The reference must be checked against the completed mapping: it may
    # legitimately designate a category that received a default name.
    if reference is not None and reference not in the_mapping.values():
        error_msg = (
            f'Level {reference} of variable {the_variable.name} does not '
            f'appear in the mapping: {list(the_mapping.values())}'
        )
        raise BiogemeError(error_msg)

    return DiscreteSegmentationTuple(
        variable=the_variable,
        mapping=the_mapping,
        reference=reference,
    )
def verify_segmentation(
    dataframe: pd.DataFrame, segmentation: DiscreteSegmentationTuple
) -> None:
    """Verifies if the definition of the segmentation is consistent with the data

    :param dataframe: dataframe to check.
    :param segmentation: definition of the segmentation
    :raise BiogemeError: if the segmentation is not consistent with the
        data: the variable is not a column of the data frame, some values
        found in the data have no entry in the segmentation mapping, or
        some entries of the mapping never appear in the data.
    """
    variable = segmentation.variable

    # Check if the variable is in the database.
    if variable.name not in dataframe.columns:
        error_msg = f'Unknown variable {variable.name}'
        raise BiogemeError(error_msg)

    # Extract all unique values from the data base.
    unique_values = set(dataframe[variable.name].unique())
    segmentation_values = set(segmentation.mapping.keys())

    in_data_not_in_segmentation = unique_values - segmentation_values
    in_segmentation_not_in_data = segmentation_values - unique_values

    # Collect every inconsistency so that a single exception reports all
    # of them at once.
    error_messages = []
    if in_data_not_in_segmentation:
        error_messages.append(
            f'The following entries are missing in the segmentation: '
            f'{in_data_not_in_segmentation}.'
        )
    if in_segmentation_not_in_data:
        error_messages.append(
            f'Segmentation entries do not exist in the data: '
            f'{in_segmentation_not_in_data}.'
        )

    if error_messages:
        # Separate the sentences with a space so they do not run together.
        raise BiogemeError(' '.join(error_messages))