Note
Go to the end to download the full example code.
Logit
Estimation of a logit model using sampling of alternatives.
Michel Bierlaire Fri Jul 25 2025, 17:36:23
import pandas as pd
from alternatives import ID_COLUMN, alternatives, partitions
from compare import compare
from IPython.core.display_functions import display
from specification_sampling import V, combined_variables
import biogeme.biogeme_logging as blog
from biogeme.biogeme import BIOGEME
from biogeme.results_processing import (
EstimationResults,
get_pandas_estimated_parameters,
)
from biogeme.sampling_of_alternatives import (
ChoiceSetsGeneration,
GenerateModel,
SamplingContext,
generate_segment_size,
)
from biogeme.tools import timeit
ID rating price ... rest_lon distance downtown
0 0 1 4 ... 42.220972 71.735518 1.0
1 1 2 2 ... 50.549434 106.267205 0.0
2 2 3 3 ... 97.830520 136.298409 0.0
3 3 4 1 ... 69.152206 85.941147 0.0
4 4 4 3 ... 89.145620 96.773021 0.0
.. .. ... ... ... ... ... ...
95 95 4 3 ... 9.511387 84.166441 0.0
96 96 1 1 ... 92.144641 95.601366 0.0
97 97 4 2 ... 27.657518 30.440555 1.0
98 98 4 4 ... 32.303213 45.027143 1.0
99 99 4 1 ... 13.672495 25.703295 1.0
[100 rows x 16 columns]
Number of asian restaurants: 33
# Screen logger at INFO level; the report of the sampling context is emitted
# through it further down.
logger = blog.get_screen_logger(level=blog.INFO)
The data file contains several columns associated with synthetic choices. Here we arbitrarily select logit_4.
# --- Settings of this run -------------------------------------------------
# Column of the observations holding the choice to be explained.
CHOICE_COLUMN = 'logit_4'
# Total number of alternatives in each sampled choice set.
SAMPLE_SIZE = 10
# Key of the partition of the alternatives used for the sampling.
PARTITION = 'asian'
# The model name and the name of the generated data file both derive from
# the partition and the sample size.
MODEL_NAME = f'logit_{PARTITION}_{SAMPLE_SIZE}_alt'
FILE_NAME = f'{MODEL_NAME}.dat'
# File containing the observed choices.
OBS_FILE = 'obs_choice.dat'

# Fail early, with an explicit message, if the requested partition is unknown.
the_partition = partitions.get(PARTITION)
if the_partition is None:
    raise ValueError(f'Unknown partition: {PARTITION}')

# Split the sample size across the segments of the partition.
number_of_segments = the_partition.number_of_segments()
segment_sizes = generate_segment_size(SAMPLE_SIZE, number_of_segments)

observations = pd.read_csv(OBS_FILE)

# Everything the sampling procedure needs, gathered in a single object.
context = SamplingContext(
    the_partition=the_partition,
    sample_sizes=segment_sizes,
    individuals=observations,
    choice_column=CHOICE_COLUMN,
    alternatives=alternatives,
    id_column=ID_COLUMN,
    biogeme_file_name=FILE_NAME,
    utility_function=V,
    combined_variables=combined_variables,
)

# Report the resulting configuration through the logger.
context_report = context.reporting()
logger.info(context_report)
Size of the choice set: 100
Main partition: 2 segment(s) of size 33, 67
Main sample: 10: 5/33, 5/67
# Two helpers built from the same sampling context: the first one produces
# the data, the second one provides the model (see get_logit() further down).
the_data_generation = ChoiceSetsGeneration(context=context)
the_model_generation = GenerateModel(context=context)
# sample_and_merge() returns the database that is handed to BIOGEME later on.
# NOTE(review): recycle=False presumably forces a fresh draw instead of
# reusing a previously generated file -- confirm against the biogeme docs.
biogeme_database = the_data_generation.sample_and_merge(recycle=False)
Generating 10 + 0 alternatives for 10000 observations
0%| | 0/10000 [00:00<?, ?it/s]
1%|▏ | 138/10000 [00:00<00:07, 1379.05it/s]
3%|▎ | 281/10000 [00:00<00:06, 1404.12it/s]
4%|▍ | 427/10000 [00:00<00:06, 1426.23it/s]
6%|▌ | 576/10000 [00:00<00:06, 1450.76it/s]
7%|▋ | 724/10000 [00:00<00:06, 1458.96it/s]
9%|▊ | 871/10000 [00:00<00:06, 1460.08it/s]
10%|█ | 1018/10000 [00:00<00:06, 1450.46it/s]
12%|█▏ | 1167/10000 [00:00<00:06, 1461.08it/s]
13%|█▎ | 1314/10000 [00:00<00:05, 1453.78it/s]
15%|█▍ | 1460/10000 [00:01<00:05, 1452.25it/s]
16%|█▌ | 1608/10000 [00:01<00:05, 1460.61it/s]
18%|█▊ | 1755/10000 [00:01<00:05, 1430.75it/s]
19%|█▉ | 1899/10000 [00:01<00:05, 1425.83it/s]
20%|██ | 2043/10000 [00:01<00:05, 1428.34it/s]
22%|██▏ | 2188/10000 [00:01<00:05, 1432.57it/s]
23%|██▎ | 2332/10000 [00:01<00:05, 1428.69it/s]
25%|██▍ | 2475/10000 [00:01<00:05, 1428.85it/s]
26%|██▌ | 2618/10000 [00:01<00:05, 1424.55it/s]
28%|██▊ | 2761/10000 [00:01<00:05, 1412.73it/s]
29%|██▉ | 2908/10000 [00:02<00:04, 1429.50it/s]
31%|███ | 3051/10000 [00:02<00:04, 1405.55it/s]
32%|███▏ | 3192/10000 [00:02<00:04, 1395.03it/s]
33%|███▎ | 3332/10000 [00:02<00:04, 1365.40it/s]
35%|███▍ | 3469/10000 [00:02<00:05, 1275.66it/s]
36%|███▌ | 3598/10000 [00:02<00:05, 1274.47it/s]
37%|███▋ | 3738/10000 [00:02<00:04, 1307.83it/s]
39%|███▉ | 3880/10000 [00:02<00:04, 1338.45it/s]
40%|████ | 4015/10000 [00:02<00:04, 1273.92it/s]
41%|████▏ | 4144/10000 [00:02<00:04, 1244.04it/s]
43%|████▎ | 4273/10000 [00:03<00:04, 1256.82it/s]
44%|████▍ | 4400/10000 [00:03<00:04, 1170.57it/s]
45%|████▌ | 4530/10000 [00:03<00:04, 1205.79it/s]
47%|████▋ | 4654/10000 [00:03<00:04, 1214.93it/s]
48%|████▊ | 4777/10000 [00:03<00:04, 1211.25it/s]
49%|████▉ | 4912/10000 [00:03<00:04, 1249.01it/s]
51%|█████ | 5051/10000 [00:03<00:03, 1287.75it/s]
52%|█████▏ | 5189/10000 [00:03<00:03, 1313.32it/s]
53%|█████▎ | 5331/10000 [00:03<00:03, 1342.16it/s]
55%|█████▍ | 5471/10000 [00:04<00:03, 1358.06it/s]
56%|█████▌ | 5609/10000 [00:04<00:03, 1364.30it/s]
58%|█████▊ | 5752/10000 [00:04<00:03, 1381.57it/s]
59%|█████▉ | 5892/10000 [00:04<00:02, 1386.92it/s]
60%|██████ | 6033/10000 [00:04<00:02, 1392.62it/s]
62%|██████▏ | 6175/10000 [00:04<00:02, 1398.73it/s]
63%|██████▎ | 6318/10000 [00:04<00:02, 1408.02it/s]
65%|██████▍ | 6461/10000 [00:04<00:02, 1413.79it/s]
66%|██████▌ | 6607/10000 [00:04<00:02, 1426.06it/s]
68%|██████▊ | 6752/10000 [00:04<00:02, 1432.16it/s]
69%|██████▉ | 6896/10000 [00:05<00:02, 1430.00it/s]
70%|███████ | 7040/10000 [00:05<00:02, 1415.40it/s]
72%|███████▏ | 7182/10000 [00:05<00:02, 1394.33it/s]
73%|███████▎ | 7323/10000 [00:05<00:01, 1398.44it/s]
75%|███████▍ | 7463/10000 [00:05<00:01, 1395.54it/s]
76%|███████▌ | 7607/10000 [00:05<00:01, 1407.98it/s]
78%|███████▊ | 7752/10000 [00:05<00:01, 1419.86it/s]
79%|███████▉ | 7897/10000 [00:05<00:01, 1426.22it/s]
80%|████████ | 8042/10000 [00:05<00:01, 1430.24it/s]
82%|████████▏ | 8186/10000 [00:05<00:01, 1417.39it/s]
83%|████████▎ | 8328/10000 [00:06<00:01, 1416.63it/s]
85%|████████▍ | 8472/10000 [00:06<00:01, 1421.38it/s]
86%|████████▌ | 8615/10000 [00:06<00:00, 1420.26it/s]
88%|████████▊ | 8760/10000 [00:06<00:00, 1428.70it/s]
89%|████████▉ | 8903/10000 [00:06<00:00, 1402.36it/s]
90%|█████████ | 9044/10000 [00:06<00:00, 1388.51it/s]
92%|█████████▏| 9183/10000 [00:06<00:00, 1382.44it/s]
93%|█████████▎| 9322/10000 [00:06<00:00, 1380.04it/s]
95%|█████████▍| 9466/10000 [00:06<00:00, 1397.10it/s]
96%|█████████▌| 9611/10000 [00:06<00:00, 1411.66it/s]
98%|█████████▊| 9753/10000 [00:07<00:00, 1408.98it/s]
99%|█████████▉| 9897/10000 [00:07<00:00, 1416.27it/s]
100%|██████████| 10000/10000 [00:07<00:00, 1345.03it/s]
Define new variables
Defining new variables...: 0%| | 0/10 [00:00<?, ?it/s]
Defining new variables...: 10%|█ | 1/10 [00:00<00:01, 8.58it/s]
Defining new variables...: 30%|███ | 3/10 [00:00<00:00, 14.43it/s]
Defining new variables...: 50%|█████ | 5/10 [00:00<00:00, 16.23it/s]
Defining new variables...: 70%|███████ | 7/10 [00:00<00:00, 17.07it/s]
Defining new variables...: 90%|█████████ | 9/10 [00:00<00:00, 17.61it/s]
Defining new variables...: 100%|██████████| 10/10 [00:00<00:00, 16.65it/s]
File logit_asian_10_alt.dat has been created.
# Expression returned by get_logit(): the model that BIOGEME will estimate
# on the sampled database.
logprob = the_model_generation.get_logit()
the_biogeme = BIOGEME(biogeme_database, logprob)
# Assign through 'model_name': the camelCase 'modelName' attribute is
# deprecated and raises a DeprecationWarning. 'model_name' is also the
# attribute read when locating saved results and labelling the estimation.
the_biogeme.model_name = MODEL_NAME
Biogeme parameters read from biogeme.toml.
/Users/bierlair/MyFiles/github/biogeme/docs/source/examples/sampling/plot_b01logit.py:88: DeprecationWarning: 'modelName' is deprecated. Please use 'model_name' instead.
the_biogeme.modelName = MODEL_NAME
Calculate the null log likelihood for reporting.
# Map each of the SAMPLE_SIZE alternative indices (0 .. SAMPLE_SIZE - 1) to 1.
# NOTE(review): presumably availability indicators, i.e. every sampled
# alternative is available -- confirm against the biogeme documentation.
alternative_flags = dict.fromkeys(range(SAMPLE_SIZE), 1)
the_biogeme.calculate_null_loglikelihood(alternative_flags)
-23025.850929940458
Estimate the parameters.
# Reuse previously saved estimation results when they exist; estimate the
# model (and time the estimation) only when the file is missing.
saved_results_file = f'saved_results/{the_biogeme.model_name}.yaml'
try:
    results = EstimationResults.from_yaml_file(filename=saved_results_file)
except FileNotFoundError:
    with timeit(f'Estimate of model {the_biogeme.model_name}'):
        results = the_biogeme.estimate()

# The summary is printed whichever way the results were obtained.
summary = results.short_summary()
print(summary)
Results for model logit_asian_10_alt
Nbr of parameters: 11
Sample size: 10000
Excluded data: 0
Null log likelihood: -23025.85
Final log likelihood: -18387.49
Likelihood ratio test (null): 9276.731
Rho square (null): 0.201
Rho bar square (null): 0.201
Akaike Information Criterion: 36796.97
Bayesian Information Criterion: 36876.28
# Extract the estimated parameters from the results and display them.
parameters_tables = get_pandas_estimated_parameters(estimation_results=results)
# Only the 'Estimated parameters' entry is used; it is also the input of
# compare() further down.
estimated_parameters = parameters_tables['Estimated parameters']
display(estimated_parameters)
Name Value Robust std err. Robust t-stat. Robust p-value
0 beta_rating 0.760645 0.015553 48.907369 0.0
1 beta_price -0.404156 0.012784 -31.613034 0.0
2 beta_chinese 0.610563 0.050194 12.164170 0.0
3 beta_japanese 1.163799 0.046703 24.918974 0.0
4 beta_korean 0.711613 0.042585 16.710548 0.0
5 beta_indian 0.916606 0.043000 21.316651 0.0
6 beta_french 0.676296 0.062329 10.850397 0.0
7 beta_mexican 1.206568 0.036418 33.130919 0.0
8 beta_lebanese 0.707055 0.063077 11.209424 0.0
9 beta_ethiopian 0.433145 0.050572 8.564941 0.0
10 beta_log_dist -0.604461 0.015182 -39.813848 0.0
# compare() returns a pair: a table and a message (the message is printed
# separately). In the run shown, the table lists, for each parameter, its
# true value, its estimated value and a t-test.
df, msg = compare(estimated_parameters)
print(df)
Name True Value Estimated Value T-Test
0 beta_rating 0.75 0.760645 -0.684464
1 beta_price -0.40 -0.404156 0.325073
2 beta_chinese 0.75 0.610563 2.777997
3 beta_japanese 1.25 1.163799 1.845721
4 beta_korean 0.75 0.711613 0.901433
5 beta_indian 1.00 0.916606 1.939414
6 beta_french 0.75 0.676296 1.182499
7 beta_mexican 1.25 1.206568 1.192585
8 beta_lebanese 0.75 0.707055 0.680830
9 beta_ethiopian 0.50 0.433145 1.321982
10 beta_log_dist -0.60 -0.604461 0.293811
# Message returned by compare(); in the run shown it lists the parameters
# that were not estimated.
print(msg)
Parameters not estimated: ['mu_asian', 'mu_downtown']
Total running time of the script: (0 minutes 8.879 seconds)