Source code for biogeme.sampling_of_alternatives.sampling_context
"""Defines a class that characterizes the context needed to apply
sampling of alternatives.

:author: Michel Bierlaire
:date: Wed Sep 6 14:38:31 2023
"""

import logging
from collections.abc import Iterable
from dataclasses import dataclass
from typing import NamedTuple

import pandas as pd

from biogeme.exceptions import BiogemeError
from biogeme.expressions import (
    Expression,
    list_of_variables_in_expression,
)
from biogeme.nests import NestsForCrossNestedLogit
from biogeme.partition import Partition, Segment

logger = logging.getLogger(__name__)

# Names/prefixes of columns related to the sampling of alternatives.
# Only MEV_PREFIX is used in the visible part of this module; the other
# constants are presumably consumed by other modules -- to be confirmed.
MEV_PREFIX = '_MEV_'
LOG_PROBA_COL = '_log_proba'
MEV_WEIGHT = '_mev_weight'
CNL_PREFIX = '_CNL_'
class StratumTuple(NamedTuple):
    """One stratum of the sampling protocol.

    A stratum pairs an element of a partition of the full choice set
    with the number of alternatives that must be sampled from it.
    """

    # Element of the partition the alternatives are drawn from.
    subset: Segment
    # Number of alternatives to sample from ``subset``.
    sample_size: int
class CrossVariableTuple(NamedTuple):
    """Definition of a cross variable.

    A cross variable involves both socio-economic attributes of the
    individuals and attributes of the alternatives. For that reason it
    can only be calculated after the sampling has been made.
    """

    # Name under which the variable is referred to in expressions.
    name: str
    # Expression used to calculate the variable.
    formula: Expression
@dataclass
class SamplingContext:
    """Class gathering the data needed to perform an estimation with
    samples of alternatives.

    :param the_partition: Partition used for the sampling.

    :param sample_sizes: number of alternatives to draw from each
        segment of ``the_partition``, in the same order as the segments.

    :param individuals: Pandas data frame containing all the
        individuals as rows. One column must contain the choice of
        each individual.

    :param choice_column: name of the column containing the choice of
        each individual.

    :param alternatives: Pandas data frame containing all the
        alternatives as rows. One column must contain a unique ID
        identifying the alternatives. The other columns contain
        variables to include in the data file.

    :param id_column: name of the column containing the Ids of the
        alternatives.

    :param biogeme_file_name: name of a Biogeme file. It is not used by
        the code visible in this class (presumably the name of the data
        file generated from the sample -- to be confirmed).

    :param utility_function: definition of the generic utility function

    :param combined_variables: definition of interaction variables

    :param mev_partition: If a second choice set needs to be sampled for
        the MEV terms, the corresponding partition is provided here.

    :param mev_sample_sizes: number of alternatives to draw from each
        segment of ``mev_partition``. Must be provided if and only if
        ``mev_partition`` is provided.

    :param cnl_nests: nests of a cross-nested logit model, if any. All
        their alpha parameters must be fixed and all nests must be named.

    After initialization, the attributes ``number_of_alternatives`` and
    ``number_of_individuals`` contain the number of rows of the
    corresponding data frames.

    :raise BiogemeError: if any of the validations performed at
        initialization fails.
    """

    the_partition: Partition
    sample_sizes: Iterable[int]
    individuals: pd.DataFrame
    choice_column: str
    alternatives: pd.DataFrame
    id_column: str
    biogeme_file_name: str
    utility_function: Expression
    combined_variables: list[CrossVariableTuple]
    mev_partition: Partition | None = None
    mev_sample_sizes: Iterable[int] | None = None
    cnl_nests: NestsForCrossNestedLogit | None = None

    def check_expression(self, expression: Expression) -> None:
        """Verifies if the variables contained in the expression can be
        found in the databases.

        A variable is accepted if it is a column of ``individuals``, a
        column of ``alternatives``, or the name of a combined variable.

        :param expression: expression to verify.

        :raise BiogemeError: if a variable of the expression is unknown.
        """
        known_names = set(self.individuals.columns) | self.attributes
        variables = {
            var.name
            for var in list_of_variables_in_expression(the_expression=expression)
        }
        for variable in variables:
            if variable not in known_names:
                error_msg = (
                    f'Invalid expression. Variable "{variable}" has not been found in '
                    f'the provided database'
                )
                raise BiogemeError(error_msg)

    def check_partition(self) -> None:
        """Check that each stratum of the sampling protocol is valid.

        Whether the segments are disjoint and cover the full choice set
        is not verified here (presumably enforced by the ``Partition``
        object itself -- to be confirmed).

        :raise BiogemeError: if a segment is empty

        :raise BiogemeError: if the number of sampled alternatives in a
            stratum is incorrect, that is zero, or larger than the
            stratum size.

        :raise BiogemeError: if an alternative in the partition does not
            appear in the database of alternatives
        """
        # Collect the known IDs once, instead of scanning the column for
        # each alternative of each stratum.
        known_ids = set(self.alternatives[self.id_column])

        for stratum in self.sampling_protocol:
            n = len(stratum.subset)
            if n == 0:
                error_msg = 'A stratum is empty'
                raise BiogemeError(error_msg)
            k = stratum.sample_size
            if k > n:
                error_msg = f'Cannot draw {k} elements in a stratum of size {n}'
                raise BiogemeError(error_msg)
            if k == 0:
                error_msg = 'At least one alternative must be selected in each segment'
                raise BiogemeError(error_msg)
            # Verify that all requested alternatives appear in the
            # database of alternatives
            for alt in stratum.subset:
                if alt not in known_ids:
                    error_msg = (
                        f'Alternative {alt} does not appear in the database of '
                        f'alternatives'
                    )
                    raise BiogemeError(error_msg)

    def check_mev_partition(self) -> None:
        """Check if the partition is a partition of the MEV
        alternatives. It does not need to cover the full choice set.

        :raise BiogemeError: if only one of ``mev_partition`` and
            ``mev_sample_sizes`` is defined.

        :raise BiogemeError: if CNL nests are defined and the set of
            their MEV alternatives differs from the set of alternatives
            covered by the MEV partition.
        """
        if self.mev_partition:
            if self.mev_sample_sizes is None:
                error_msg = (
                    'If mev_partition is defined, mev_sample_size must also be defined'
                )
                raise BiogemeError(error_msg)

        if self.mev_sample_sizes:
            if self.mev_partition is None:
                error_msg = (
                    'If mev_sample_sizes is defined, mev_partition must also be defined'
                )
                raise BiogemeError(error_msg)

        if self.cnl_nests and self.mev_partition:
            if self.cnl_nests.mev_alternatives != self.mev_partition.full_set:
                in_nest_not_in_partition = (
                    self.cnl_nests.mev_alternatives - self.mev_partition.full_set
                )
                in_partition_not_in_nest = (
                    self.mev_partition.full_set - self.cnl_nests.mev_alternatives
                )
                error_msg = ''
                if in_nest_not_in_partition:
                    error_msg += (
                        f'The following alternative(s) belong to a nest but not to the'
                        f' partition for the sample: {in_nest_not_in_partition}. '
                    )
                if in_partition_not_in_nest:
                    error_msg += (
                        f'The following alternative(s) belong to the partition for '
                        f'the MEV sample, but not to any nest: {in_partition_not_in_nest}'
                    )
                # The message used to be built and then dropped, so the
                # inconsistency was never reported.
                raise BiogemeError(error_msg)

    def check_valid_alternatives(self, set_of_ids: set[int]) -> None:
        """Check if the IDs in set are indeed valid alternatives.
        Typically used to check if a nest is well defined

        :param set_of_ids: set of identifiers to check

        :raise BiogemeError: if at least one id is invalid.
        """
        missing_values = set_of_ids - set(self.alternatives[self.id_column])
        if missing_values:
            raise BiogemeError(
                f'The following IDs are not valid alternative IDs: {missing_values}'
            )

    def __post_init__(self) -> None:
        """Validate the context and set the derived attributes.

        :raise BiogemeError: if one of the validations fails.
        """
        # Check for empty utility function
        if self.utility_function is None:
            raise BiogemeError('No utility function has been provided')

        # Check for empty strings
        if not self.choice_column:
            raise BiogemeError('choice_column should not be an empty string.')
        if not self.id_column:
            raise BiogemeError('id_column should not be an empty string.')

        # Validate that the DataFrames are not empty
        if self.individuals.empty or self.alternatives.empty:
            raise BiogemeError(
                'DataFrames individuals or alternatives should not be empty.'
            )

        # The columns must be validated before check_partition, which
        # reads the id column: otherwise a missing column is reported as
        # a pandas KeyError instead of a BiogemeError.

        # Validate that choice_column is in the individuals DataFrame
        if self.choice_column not in self.individuals.columns:
            raise BiogemeError(
                f'{self.choice_column} is not a column in the individuals DataFrame.'
            )

        # Validate that id_column is in the alternatives DataFrame
        if self.id_column not in self.alternatives.columns:
            raise BiogemeError(
                f'{self.id_column} is not a column in the alternatives DataFrame.'
            )

        self.check_partition()

        logger.debug('Check if there is a MEV partition')
        if self.mev_partition or self.mev_sample_sizes:
            logger.debug('Yes, there is a MEV partition')
            self.check_mev_partition()
        else:
            logger.debug('No, there is no MEV partition')

        # If CNL nests are defined, check that the alphas are all
        # fixed and that the nests have a name.
        if self.cnl_nests:
            if not self.cnl_nests.all_alphas_fixed():
                error_msg = 'For the CNL model, all alpha parameters must be fixed.'
                raise BiogemeError(error_msg)
            if not self.cnl_nests.check_names():
                error_msg = 'For the CNL model, all nests must have a name.'
                raise BiogemeError(error_msg)

        self.number_of_alternatives = self.alternatives.shape[0]
        self.number_of_individuals = self.individuals.shape[0]

        # Check for data types.
        # NOTE(review): the dtype is compared with the Python types int
        # and float, so narrower numeric dtypes (e.g. int32) are
        # presumably rejected -- confirm that this is intended.
        if self.individuals[self.choice_column].dtype not in (int, float):
            raise BiogemeError(
                f'Column {self.choice_column} in data frame "individuals" should '
                f'be of type int or float.'
            )
        if self.alternatives[self.id_column].dtype not in (int, float):
            raise BiogemeError(
                f'Column {self.id_column} in alternatives should be of type int or '
                f'float.'
            )

        self.check_expression(self.utility_function)
        for cross_variable in self.combined_variables:
            self.check_expression(cross_variable.formula)

        # NOTE(review): include_cnl_alphas is not defined in the visible
        # part of this class; it is assumed to be defined elsewhere.
        self.include_cnl_alphas()

    @property
    def attributes(self) -> set[str]:
        """Set of attributes for the choice model: the columns of the
        database of alternatives and the names of the combined variables.
        """
        return set(self.alternatives.columns) | {
            combined_variable.name for combined_variable in self.combined_variables
        }

    @property
    def mev_prefix(self) -> str:
        """Build the prefix for the MEV columns. It is empty if there is
        no MEV partition.
        """
        return '' if self.mev_partition is None else MEV_PREFIX

    @property
    def sampling_protocol(self) -> list[StratumTuple]:
        """Provides a list of strata characterizing the sampling"""
        return [
            StratumTuple(subset=segment, sample_size=size)
            for segment, size in zip(self.the_partition, self.sample_sizes)
        ]

    @property
    def mev_sampling_protocol(self) -> list[StratumTuple] | None:
        """Provides a list of strata characterizing the MEV sampling, or
        None if there is no MEV partition.
        """
        if self.mev_partition is None:
            return None
        return [
            StratumTuple(subset=segment, sample_size=size)
            for segment, size in zip(self.mev_partition, self.mev_sample_sizes)
        ]

    @property
    def total_sample_size(self) -> int:
        """Sample size: sum of the sample sizes of all strata."""
        return sum(stratum.sample_size for stratum in self.sampling_protocol)

    @property
    def total_mev_sample_size(self) -> int:
        """MEV sample size: sum of the sample sizes of all MEV strata,
        or 0 if there is no MEV partition.
        """
        if self.mev_partition is None:
            return 0
        return sum(stratum.sample_size for stratum in self.mev_sampling_protocol)

    def reporting(self) -> str:
        """Summarizes the configuration specified by the context object.

        :return: a string with one line per item, formatted as
            ``item: description``.
        """
        main_strata = ', '.join(
            [
                f'{stratum.sample_size}/{len(stratum.subset)}'
                for stratum in self.sampling_protocol
            ]
        )
        result = {
            'Size of the choice set': self.alternatives.shape[0],
            'Main partition': (
                f'{self.the_partition.number_of_segments()} segment(s) of size '
                f'{", ".join([str(len(segment)) for segment in self.the_partition])}'
            ),
            'Main sample': f'{self.total_sample_size}: ' + main_strata,
        }

        if self.mev_partition:
            mev_strata = ', '.join(
                [
                    f'{stratum.sample_size}/{len(stratum.subset)}'
                    for stratum in self.mev_sampling_protocol
                ]
            )
            result['Nbr of MEV alternatives'] = len(self.mev_partition.full_set)
            result['MEV partition'] = (
                f'{self.mev_partition.number_of_segments()} segment(s) of size '
                f'{", ".join([str(len(segment)) for segment in self.mev_partition])}'
            )
            result['MEV sample'] = f'{self.total_mev_sample_size}: ' + mev_strata

        return ''.join(
            f'{section}: {description}\n' for section, description in result.items()
        )