# Source code for biogeme.sampling_of_alternatives.sampling_of_alternatives
"""Module in charge of functionalities related to the sampling of alternatives

:author: Michel Bierlaire
:date: Thu Sep 7 10:14:54 2023
"""

import copy
import logging

import numpy as np
import pandas as pd

from biogeme.exceptions import BiogemeError
from .sampling_context import SamplingContext, LOG_PROBA_COL, MEV_WEIGHT, CNL_PREFIX

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def generate_segment_size(sample_size: int, number_of_segments: int) -> list[int]:
    """Split a sample into segments whose sizes are as equal as possible.

    The returned sizes sum to ``sample_size`` and differ from each other by
    at most one. When the split is uneven, the larger segments come first.

    :param sample_size: total size of the sample
    :type sample_size: int

    :param number_of_segments: number of segments
    :type number_of_segments: int

    :return: list of length number_of_segments, containing the segment sizes
    :rtype: list[int]

    :raise ValueError: if sample_size is negative, or if number_of_segments
        is not strictly positive.
    """
    if sample_size < 0:
        raise ValueError("Sample size cannot be negative.")
    if number_of_segments <= 0:
        raise ValueError("Number of segments must be positive.")

    # Every segment receives the quotient; the first `leftover` segments
    # receive one extra unit so that the total matches the sample size.
    quotient, leftover = divmod(sample_size, number_of_segments)
    return [
        quotient + 1 if position < leftover else quotient
        for position in range(number_of_segments)
    ]
class SamplingOfAlternatives:
    """Class dealing with the various methods needed to estimate models
    with samples of alternatives
    """

    def __init__(self, context: SamplingContext):
        """Constructor

        :param context: contains all the information that is needed to
            perform the sampling of alternatives.
        """
        self.alternatives = context.alternatives
        self.id_column = context.id_column
        # Strata used to sample the alternatives of the choice set.
        self.partition = context.sampling_protocol
        # Strata used to sample the alternatives for the MEV terms.
        self.second_partition = context.mev_sampling_protocol
        self.cnl_nests = context.cnl_nests

    def add_alphas(self, the_sample: pd.DataFrame) -> pd.DataFrame:
        """Add the alpha parameters in the sampled database

        The input data frame is not modified: a new data frame is built and
        returned, so callers must use the return value.

        :param the_sample: sampled alternatives. It must contain the column
            ``self.id_column``.

        :return: the sample with one additional column per entry returned
            by ``cnl_nests.get_alpha_values``, each name being prefixed with
            ``CNL_PREFIX``.

        :raise BiogemeError: if no nests have been defined.
        """
        nests = self.cnl_nests
        if nests is None:
            raise BiogemeError(
                'No nests have been defined for the cross-nested logit model'
            )

        def get_alphas(alternative_id: int) -> pd.Series:
            """Prepare the alphas for insertion in the data frame"""
            # NOTE(review): get_alpha_values presumably returns a dict
            # mapping nest names (str) to alpha values -- confirm. The
            # renaming below requires the keys to be strings.
            return pd.Series(nests.get_alpha_values(alternative_id))

        # Applying a function that returns a Series expands the result into
        # a data frame with one column per key.
        new_columns = the_sample[self.id_column].apply(get_alphas)
        new_columns = new_columns.rename(columns=lambda x: CNL_PREFIX + x)
        return pd.concat([the_sample, new_columns], axis="columns")

    def sample_alternatives(self, chosen: int) -> pd.DataFrame:
        """Performing the sampling of alternatives

        For each stratum of the partition, ``stratum.sample_size``
        alternatives are drawn without replacement. If the chosen
        alternative belongs to the stratum, it is always included and one
        alternative less is drawn from the rest of the stratum. Each row
        receives, in column ``LOG_PROBA_COL``, the value
        log(sample size) - log(stratum size) of its stratum.

        :param chosen: ID of the chosen alternative, that must be
            included in the choice set.

        :return: data frame containing a sample of alternatives. The
            first one is the chosen alternative. If CNL nests have been
            defined, the alpha parameters are included as well.

        :raise BiogemeError: if the chosen alternative is unknown, or if
            it appears more than once in the list of alternatives.
        """
        chosen_alternative = self.alternatives[
            self.alternatives[self.id_column] == chosen
        ].copy()
        if len(chosen_alternative) < 1:
            error_msg = f"Unknown alternative: {chosen}"
            raise BiogemeError(error_msg)
        if len(chosen_alternative) > 1:
            error_msg = f"Duplicate alternative: {chosen}"
            raise BiogemeError(error_msg)

        results = []
        for stratum in self.partition:
            # stratum.subset is a set of int. We work on a copy because the
            # chosen alternative may have to be dropped from it.
            candidates = set(stratum.subset)
            stratum_size = len(stratum.subset)
            sample_size = stratum.sample_size
            # The correction term is based on the full stratum and the full
            # sample size, before the chosen alternative is set aside.
            logproba = np.log(sample_size) - np.log(stratum_size)
            if chosen in stratum.subset:
                # Discard the chosen alternative
                candidates.discard(chosen)
                # And we sample one alternative less
                sample_size -= 1
                # Include the correction term
                chosen_alternative[LOG_PROBA_COL] = logproba

            # subset is a pandas data frame containing the description
            # of all alternatives that can be drawn
            subset = self.alternatives[
                self.alternatives[self.id_column].isin(candidates)
            ]
            # Perform the sampling
            sample = subset.sample(
                n=sample_size, replace=False, axis="index", ignore_index=True
            )
            sample[LOG_PROBA_COL] = logproba
            results.append(sample)

        the_sample = pd.concat(results, ignore_index=True)
        # Add the chosen alternative. By construction, it is not in the sample.
        the_sample = pd.concat([chosen_alternative, the_sample], ignore_index=True)
        if self.cnl_nests:
            # add_alphas returns a new data frame: its result must be kept,
            # otherwise the alpha columns are silently lost.
            the_sample = self.add_alphas(the_sample)
        return the_sample

    def sample_mev_alternatives(self) -> pd.DataFrame:
        """Performing the sampling of alternatives for the MEV terms.
        Here, the chosen alternative is ignored.

        For each stratum of the second partition, ``stratum.sample_size``
        alternatives are drawn without replacement. Each row receives, in
        column ``MEV_WEIGHT``, the ratio stratum size / sample size.

        :return: data frame containing a sample of alternatives. If CNL
            nests have been defined, the alpha parameters are included as
            well.
        """
        results = []
        for stratum in self.second_partition:
            stratum_size = len(stratum.subset)
            sample_size = stratum.sample_size
            mev_weight = stratum_size / sample_size
            subset = self.alternatives[
                self.alternatives[self.id_column].isin(stratum.subset)
            ]
            sample = subset.sample(
                n=sample_size, replace=False, axis="index", ignore_index=True
            )
            sample[MEV_WEIGHT] = mev_weight
            results.append(sample)

        the_sample = pd.concat(results, ignore_index=True)
        if self.cnl_nests:
            # add_alphas returns a new data frame: its result must be kept,
            # otherwise the alpha columns are silently lost.
            the_sample = self.add_alphas(the_sample)
        return the_sample