Logit

Estimation of a logit model using sampling of alternatives.

Michel Bierlaire Fri Jul 25 2025, 17:36:23

import pandas as pd
from alternatives import ID_COLUMN, alternatives, partitions
from compare import compare
from IPython.core.display_functions import display
from specification_sampling import V, combined_variables

import biogeme.biogeme_logging as blog
from biogeme.biogeme import BIOGEME
from biogeme.results_processing import (
    EstimationResults,
    get_pandas_estimated_parameters,
)
from biogeme.sampling_of_alternatives import (
    ChoiceSetsGeneration,
    GenerateModel,
    SamplingContext,
    generate_segment_size,
)
from biogeme.tools import timeit
    ID  rating  price  ...   rest_lon    distance  downtown
0    0       1      4  ...  42.220972   71.735518       1.0
1    1       2      2  ...  50.549434  106.267205       0.0
2    2       3      3  ...  97.830520  136.298409       0.0
3    3       4      1  ...  69.152206   85.941147       0.0
4    4       4      3  ...  89.145620   96.773021       0.0
..  ..     ...    ...  ...        ...         ...       ...
95  95       4      3  ...   9.511387   84.166441       0.0
96  96       1      1  ...  92.144641   95.601366       0.0
97  97       4      2  ...  27.657518   30.440555       1.0
98  98       4      4  ...  32.303213   45.027143       1.0
99  99       4      1  ...  13.672495   25.703295       1.0

[100 rows x 16 columns]
Number of asian restaurants: 33
# Screen logger at INFO level; it is used below to report the sampling context.
logger = blog.get_screen_logger(level=blog.INFO)

The data file contains several columns associated with synthetic choices. Here we arbitrarily select logit_4.

# Column of the observations passed to the sampling context as the choice column.
CHOICE_COLUMN = 'logit_4'
# Total number of alternatives in the sample; it is split across the segments
# of the partition below.
SAMPLE_SIZE = 10
# Key used to look up the partition of the alternatives.
PARTITION = 'asian'
# The model name and the Biogeme data file name both encode the partition and
# the sample size.
MODEL_NAME = f'logit_{PARTITION}_{SAMPLE_SIZE}_alt'
FILE_NAME = f'{MODEL_NAME}.dat'
# File with the observations, read with pandas below.
OBS_FILE = 'obs_choice.dat'
# Fail early, with an explicit message, if the requested partition is unknown.
the_partition = partitions.get(PARTITION)
if the_partition is None:
    raise ValueError(f'Unknown partition: {PARTITION}')
# One sample size per segment of the partition (the log further down reports
# 5/33 and 5/67 for the two segments).
segment_sizes = generate_segment_size(SAMPLE_SIZE, the_partition.number_of_segments())
observations = pd.read_csv(OBS_FILE)
# Gather everything needed for the sampling of alternatives in one object:
# the partition and its sample sizes, the observations, the alternatives, and
# the model specification (utility function and combined variables).
context = SamplingContext(
    the_partition=the_partition,
    sample_sizes=segment_sizes,
    individuals=observations,
    choice_column=CHOICE_COLUMN,
    alternatives=alternatives,
    id_column=ID_COLUMN,
    biogeme_file_name=FILE_NAME,
    utility_function=V,
    combined_variables=combined_variables,
)
# Log a summary of the context (choice set size, partition, sample sizes).
logger.info(context.reporting())
Size of the choice set: 100
Main partition: 2 segment(s) of size 33, 67
Main sample: 10: 5/33, 5/67
# Both generators are built from the same sampling context: one produces the
# data (sampled choice sets), the other the model expression.
the_data_generation = ChoiceSetsGeneration(context=context)
the_model_generation = GenerateModel(context=context)
# Sample the alternatives for every observation and build the Biogeme database.
# NOTE(review): recycle=False presumably forces a fresh sample instead of
# reusing a previously generated data file — confirm against the Biogeme docs.
biogeme_database = the_data_generation.sample_and_merge(recycle=False)
Generating 10 + 0 alternatives for 10000 observations

  0%|          | 0/10000 [00:00<?, ?it/s]
  1%|▏         | 138/10000 [00:00<00:07, 1379.05it/s]
  3%|▎         | 281/10000 [00:00<00:06, 1404.12it/s]
  4%|▍         | 427/10000 [00:00<00:06, 1426.23it/s]
  6%|▌         | 576/10000 [00:00<00:06, 1450.76it/s]
  7%|▋         | 724/10000 [00:00<00:06, 1458.96it/s]
  9%|▊         | 871/10000 [00:00<00:06, 1460.08it/s]
 10%|█         | 1018/10000 [00:00<00:06, 1450.46it/s]
 12%|█▏        | 1167/10000 [00:00<00:06, 1461.08it/s]
 13%|█▎        | 1314/10000 [00:00<00:05, 1453.78it/s]
 15%|█▍        | 1460/10000 [00:01<00:05, 1452.25it/s]
 16%|█▌        | 1608/10000 [00:01<00:05, 1460.61it/s]
 18%|█▊        | 1755/10000 [00:01<00:05, 1430.75it/s]
 19%|█▉        | 1899/10000 [00:01<00:05, 1425.83it/s]
 20%|██        | 2043/10000 [00:01<00:05, 1428.34it/s]
 22%|██▏       | 2188/10000 [00:01<00:05, 1432.57it/s]
 23%|██▎       | 2332/10000 [00:01<00:05, 1428.69it/s]
 25%|██▍       | 2475/10000 [00:01<00:05, 1428.85it/s]
 26%|██▌       | 2618/10000 [00:01<00:05, 1424.55it/s]
 28%|██▊       | 2761/10000 [00:01<00:05, 1412.73it/s]
 29%|██▉       | 2908/10000 [00:02<00:04, 1429.50it/s]
 31%|███       | 3051/10000 [00:02<00:04, 1405.55it/s]
 32%|███▏      | 3192/10000 [00:02<00:04, 1395.03it/s]
 33%|███▎      | 3332/10000 [00:02<00:04, 1365.40it/s]
 35%|███▍      | 3469/10000 [00:02<00:05, 1275.66it/s]
 36%|███▌      | 3598/10000 [00:02<00:05, 1274.47it/s]
 37%|███▋      | 3738/10000 [00:02<00:04, 1307.83it/s]
 39%|███▉      | 3880/10000 [00:02<00:04, 1338.45it/s]
 40%|████      | 4015/10000 [00:02<00:04, 1273.92it/s]
 41%|████▏     | 4144/10000 [00:02<00:04, 1244.04it/s]
 43%|████▎     | 4273/10000 [00:03<00:04, 1256.82it/s]
 44%|████▍     | 4400/10000 [00:03<00:04, 1170.57it/s]
 45%|████▌     | 4530/10000 [00:03<00:04, 1205.79it/s]
 47%|████▋     | 4654/10000 [00:03<00:04, 1214.93it/s]
 48%|████▊     | 4777/10000 [00:03<00:04, 1211.25it/s]
 49%|████▉     | 4912/10000 [00:03<00:04, 1249.01it/s]
 51%|█████     | 5051/10000 [00:03<00:03, 1287.75it/s]
 52%|█████▏    | 5189/10000 [00:03<00:03, 1313.32it/s]
 53%|█████▎    | 5331/10000 [00:03<00:03, 1342.16it/s]
 55%|█████▍    | 5471/10000 [00:04<00:03, 1358.06it/s]
 56%|█████▌    | 5609/10000 [00:04<00:03, 1364.30it/s]
 58%|█████▊    | 5752/10000 [00:04<00:03, 1381.57it/s]
 59%|█████▉    | 5892/10000 [00:04<00:02, 1386.92it/s]
 60%|██████    | 6033/10000 [00:04<00:02, 1392.62it/s]
 62%|██████▏   | 6175/10000 [00:04<00:02, 1398.73it/s]
 63%|██████▎   | 6318/10000 [00:04<00:02, 1408.02it/s]
 65%|██████▍   | 6461/10000 [00:04<00:02, 1413.79it/s]
 66%|██████▌   | 6607/10000 [00:04<00:02, 1426.06it/s]
 68%|██████▊   | 6752/10000 [00:04<00:02, 1432.16it/s]
 69%|██████▉   | 6896/10000 [00:05<00:02, 1430.00it/s]
 70%|███████   | 7040/10000 [00:05<00:02, 1415.40it/s]
 72%|███████▏  | 7182/10000 [00:05<00:02, 1394.33it/s]
 73%|███████▎  | 7323/10000 [00:05<00:01, 1398.44it/s]
 75%|███████▍  | 7463/10000 [00:05<00:01, 1395.54it/s]
 76%|███████▌  | 7607/10000 [00:05<00:01, 1407.98it/s]
 78%|███████▊  | 7752/10000 [00:05<00:01, 1419.86it/s]
 79%|███████▉  | 7897/10000 [00:05<00:01, 1426.22it/s]
 80%|████████  | 8042/10000 [00:05<00:01, 1430.24it/s]
 82%|████████▏ | 8186/10000 [00:05<00:01, 1417.39it/s]
 83%|████████▎ | 8328/10000 [00:06<00:01, 1416.63it/s]
 85%|████████▍ | 8472/10000 [00:06<00:01, 1421.38it/s]
 86%|████████▌ | 8615/10000 [00:06<00:00, 1420.26it/s]
 88%|████████▊ | 8760/10000 [00:06<00:00, 1428.70it/s]
 89%|████████▉ | 8903/10000 [00:06<00:00, 1402.36it/s]
 90%|█████████ | 9044/10000 [00:06<00:00, 1388.51it/s]
 92%|█████████▏| 9183/10000 [00:06<00:00, 1382.44it/s]
 93%|█████████▎| 9322/10000 [00:06<00:00, 1380.04it/s]
 95%|█████████▍| 9466/10000 [00:06<00:00, 1397.10it/s]
 96%|█████████▌| 9611/10000 [00:06<00:00, 1411.66it/s]
 98%|█████████▊| 9753/10000 [00:07<00:00, 1408.98it/s]
 99%|█████████▉| 9897/10000 [00:07<00:00, 1416.27it/s]
100%|██████████| 10000/10000 [00:07<00:00, 1345.03it/s]
Define new variables

Defining new variables...:   0%|          | 0/10 [00:00<?, ?it/s]
Defining new variables...:  10%|█         | 1/10 [00:00<00:01,  8.58it/s]
Defining new variables...:  30%|███       | 3/10 [00:00<00:00, 14.43it/s]
Defining new variables...:  50%|█████     | 5/10 [00:00<00:00, 16.23it/s]
Defining new variables...:  70%|███████   | 7/10 [00:00<00:00, 17.07it/s]
Defining new variables...:  90%|█████████ | 9/10 [00:00<00:00, 17.61it/s]
Defining new variables...: 100%|██████████| 10/10 [00:00<00:00, 16.65it/s]
File logit_asian_10_alt.dat has been created.
# Expression of the logit model, produced by the model generator.
logprob = the_model_generation.get_logit()
the_biogeme = BIOGEME(biogeme_database, logprob)
# Use `model_name`: the former `modelName` attribute is deprecated and setting
# it triggers a DeprecationWarning. The same attribute is read back later in
# this script to locate the saved results and to label the estimation.
the_biogeme.model_name = MODEL_NAME
Biogeme parameters read from biogeme.toml.
/Users/bierlair/MyFiles/github/biogeme/docs/source/examples/sampling/plot_b01logit.py:88: DeprecationWarning: 'modelName' is deprecated. Please use 'model_name' instead.
  the_biogeme.modelName = MODEL_NAME

Calculate the null log likelihood for reporting.

# Map every index 0..SAMPLE_SIZE-1 of the sampled alternatives to 1
# (presumably an availability indicator — confirm against the Biogeme API).
the_biogeme.calculate_null_loglikelihood(dict.fromkeys(range(SAMPLE_SIZE), 1))
-23025.850929940458

Estimate the parameters.

# Reuse the estimation results saved in a YAML file when that file exists;
# otherwise run the estimation, timing it.
saved_results_file = f'saved_results/{the_biogeme.model_name}.yaml'
try:
    results = EstimationResults.from_yaml_file(filename=saved_results_file)
except FileNotFoundError:
    estimation_label = f'Estimate of model {the_biogeme.model_name}'
    with timeit(estimation_label):
        results = the_biogeme.estimate()
print(results.short_summary())
Results for model logit_asian_10_alt
Nbr of parameters:              11
Sample size:                    10000
Excluded data:                  0
Null log likelihood:            -23025.85
Final log likelihood:           -18387.49
Likelihood ratio test (null):           9276.731
Rho square (null):                      0.201
Rho bar square (null):                  0.201
Akaike Information Criterion:   36796.97
Bayesian Information Criterion: 36876.28
# Estimated parameters, converted to pandas for display and for the comparison
# with the true values performed afterwards.
parameters_tables = get_pandas_estimated_parameters(estimation_results=results)
# NOTE(review): the Biogeme documentation describes the return value as a
# single DataFrame, whereas this script looked the table up under the key
# 'Estimated parameters', which would raise a KeyError on such a DataFrame.
# Both shapes are accepted here so that the script runs either way — TODO
# confirm against the installed Biogeme version and keep only one branch.
if isinstance(parameters_tables, pd.DataFrame):
    estimated_parameters = parameters_tables
else:
    estimated_parameters = parameters_tables['Estimated parameters']
display(estimated_parameters)
              Name     Value  Robust std err.  Robust t-stat.  Robust p-value
0      beta_rating  0.760645         0.015553       48.907369             0.0
1       beta_price -0.404156         0.012784      -31.613034             0.0
2     beta_chinese  0.610563         0.050194       12.164170             0.0
3    beta_japanese  1.163799         0.046703       24.918974             0.0
4      beta_korean  0.711613         0.042585       16.710548             0.0
5      beta_indian  0.916606         0.043000       21.316651             0.0
6      beta_french  0.676296         0.062329       10.850397             0.0
7     beta_mexican  1.206568         0.036418       33.130919             0.0
8    beta_lebanese  0.707055         0.063077       11.209424             0.0
9   beta_ethiopian  0.433145         0.050572        8.564941             0.0
10   beta_log_dist -0.604461         0.015182      -39.813848             0.0
# Compare the estimates with the true values of the parameters. The helper
# returns a table (true value, estimated value, t-test) and a message.
df, msg = compare(estimated_parameters)
print(df)
              Name  True Value  Estimated Value    T-Test
0      beta_rating        0.75         0.760645 -0.684464
1       beta_price       -0.40        -0.404156  0.325073
2     beta_chinese        0.75         0.610563  2.777997
3    beta_japanese        1.25         1.163799  1.845721
4      beta_korean        0.75         0.711613  0.901433
5      beta_indian        1.00         0.916606  1.939414
6      beta_french        0.75         0.676296  1.182499
7     beta_mexican        1.25         1.206568  1.192585
8    beta_lebanese        0.75         0.707055  0.680830
9   beta_ethiopian        0.50         0.433145  1.321982
10   beta_log_dist       -0.60        -0.604461  0.293811
# Message returned by the comparison; here it lists the parameters that were
# not estimated by this model.
print(msg)
Parameters not estimated: ['mu_asian', 'mu_downtown']

Total running time of the script: (0 minutes 8.879 seconds)

Gallery generated by Sphinx-Gallery