"""PanelStructure: Handles organization and indexing of panel data,where observations are grouped by individuals.Michel BierlaireWed Mar 26 19:33:13 2025"""importloggingimportpandasaspdfrom.containerimportDatabaseRELEVANT_PREFIX='relevant_'logger=logging.getLogger(__name__)
def observation_suffix(index: int) -> str:
    """Build the column-name suffix that identifies one observation.

    :param index: zero-based position of the observation.
    :return: the text ``__panel__`` followed by ``index + 1`` written on at
        least two digits, padded with zeros on the left.
    """
    one_based_rank = index + 1
    return "__panel__" + format(one_based_rank, "02d")
class PanelDatabase:
    """Encapsulates a Biogeme database in panel format.

    The object keeps a reference to the original database, whose rows are
    individual observations, together with the name of the column that
    identifies the individual each observation belongs to. It can produce a
    wide-format ("flattened") dataframe with one row per individual.
    """

    def __init__(self, database: Database, panel_column: str):
        """Store the database and the name of the individual-identifier column.

        :param database: original database, with one row per observation.
        :param panel_column: name of the column with the identifier of the
            individuals.
        """
        self.original_database = database
        # Entries for panel data
        self.panel_column: str = panel_column

    def __str__(self) -> str:
        """Return a short description including the wrapped database name."""
        # The wrapped database is stored as ``original_database``; this class
        # defines no ``database`` attribute.
        return f'biogeme panel database {self.original_database.name}'

    def flatten_database(self, missing_data: float) -> tuple[pd.DataFrame, int]:
        """Flatten the long-format dataframe into a wide-format dataframe.

        Each row of the result represents one individual, and the observations
        of that individual are spread over several columns.

        :param missing_data: value to use if data is missing, that is, for
            individuals with fewer observations than the largest group.
        :return: a tuple containing two things:

            - A wide-format DataFrame where each row corresponds to one
              individual. It contains the grouping column, then one unsuffixed
              copy of every column whose value is constant within each
              individual. For every observation index and every column of the
              original DataFrame (excluding the grouping column), it contains
              a column named ``<column><suffix>``, where ``<suffix>`` is
              produced by ``observation_suffix`` (``__panel__01``,
              ``__panel__02``, ...). For every observation index, a column
              named ``RELEVANT_PREFIX`` followed by the suffix (e.g.
              ``relevant___panel__01``) holds 1 if the observation exists and
              0 if it has been padded with ``missing_data``.
            - The size of the largest group, as a plain ``int`` (0 if the
              dataframe is empty or contains no group).
        """
        dataframe = self.original_database.dataframe
        logger.info(f'Flattening database [{dataframe.shape}].')
        grouping_column = self.panel_column

        if dataframe.empty:
            return dataframe.copy(), 0

        # Build the grouping once and reuse it for every query below.
        grouped = dataframe.groupby(grouping_column)

        # Find non-ID columns
        value_columns = [col for col in dataframe.columns if col != grouping_column]

        # Identify columns with constant values within each group
        constant_columns = [
            col for col in value_columns if grouped[col].nunique().max() == 1
        ]

        # Determine max number of observations per individual. There may be no
        # group at all (e.g. every identifier is missing), in which case the
        # maximum is undefined and 0 is reported.
        group_sizes = grouped.size()
        largest_group = int(group_sizes.max()) if len(group_sizes) else 0

        # Prepare list to collect rows
        flattened_rows = []
        for individual_id, group in grouped:
            # Renumber the rows 0..n-1 so that observations can be addressed
            # by their position within the individual.
            group = group.reset_index(drop=True)
            obs_count = len(group)
            row = {grouping_column: individual_id}
            for col in constant_columns:
                row[col] = group.at[0, col]
            for obs_index in range(largest_group):
                suffix = observation_suffix(obs_index)
                # Beyond the last real observation, the slot is padding.
                is_valid = obs_index < obs_count
                row[f"{RELEVANT_PREFIX}{suffix}"] = 1 if is_valid else 0
                for col in value_columns:
                    row[f"{col}{suffix}"] = (
                        group.at[obs_index, col] if is_valid else missing_data
                    )
            flattened_rows.append(row)

        flat_dataframe = pd.DataFrame(flattened_rows)
        logger.info(f'Database flattened [{flat_dataframe.shape}]')
        return flat_dataframe, largest_group