# Source code for biogeme.database.sampling
"""
This module provides utility functions for performing sampling operations
on pandas DataFrames, including standard bootstrapping and panel-based sampling.
Michel Bierlaire
Wed Mar 26 19:39:21 2025
"""
import pandas as pd
import numpy as np
from biogeme.exceptions import BiogemeError
# [docs]
def sample_with_replacement(df: pd.DataFrame, size: int | None = None) -> pd.DataFrame:
if size is None:
size = len(df)
indices = np.random.randint(0, len(df), size=size)
return df.iloc[indices].reset_index(drop=True)
# [docs]
def sample_panel_with_replacement(
df: pd.DataFrame, individual_map: pd.DataFrame, size: int | None = None
) -> pd.DataFrame:
"""
Draws a sample of individuals with replacement from a panel dataset.
:param df: The input DataFrame representing the full dataset.
:param individual_map: A DataFrame mapping each individual ID to (start, end) row indices.
:param size: The number of individuals to sample. Defaults to the number of individuals in the map.
:return: A new DataFrame with the sampled individuals' rows, with reset index.
:raises BiogemeError: if the individual_map is missing or empty.
"""
if individual_map is None or individual_map.empty:
raise BiogemeError("Panel individual map is missing or empty.")
if size is None:
size = len(individual_map)
sampled_rows = []
sampled_ids = np.random.choice(individual_map.index, size=size, replace=True)
for individual_id in sampled_ids:
start_idx, end_idx = individual_map.loc[individual_id]
rows = df.loc[start_idx:end_idx]
sampled_rows.append(rows)
return pd.concat(sampled_rows, ignore_index=True)
# [docs]
def split_validation_sets(
df: pd.DataFrame, slices: int, group_column: str | None = None
) -> list[tuple[pd.DataFrame, pd.DataFrame]]:
"""
Splits a DataFrame into multiple (estimation, validation) pairs for cross-validation.
:param df: The input DataFrame to split.
:param slices: The number of folds (must be >= 2).
:param group_column: Optional column name used to group rows (e.g., individual ID).
If provided, groups are kept together in folds.
:return: A list of (estimation, validation) DataFrame tuples.
:raises BiogemeError: if the number of slices is less than 2 or group column is not found.
"""
if slices < 2:
raise BiogemeError("Validation requires at least 2 slices.")
if group_column is None:
shuffled = df.sample(frac=1)
folds = np.array_split(shuffled, slices)
else:
if group_column not in df.columns:
raise BiogemeError(f"Grouping column '{group_column}' not found.")
ids = df[group_column].unique()
np.random.shuffle(ids)
folds = [
df[df[group_column].isin(group.tolist())]
for group in np.array_split(ids, slices)
]
estimation_sets = []
validation_sets = []
for i, validation in enumerate(folds):
estimation = pd.concat(folds[:i] + folds[i + 1 :])
estimation_sets.append(estimation.reset_index(drop=True))
validation_sets.append(validation.reset_index(drop=True))
return list(zip(estimation_sets, validation_sets))