# Source code for biogeme.results_processing.compilation

"""
Compilation of estimation results

Michel Bierlaire
Thu Oct 3 18:54:13 2024
"""

import glob
import logging
import os

import pandas as pd

from biogeme.results_processing.estimation_results import (
    EstimationResults,
    EstimateVarianceCovariance,
)
from biogeme.tools import ModelNames

logger = logging.getLogger(__name__)



# [docs]
def compile_estimation_results(
    dict_of_results: dict[str, EstimationResults | str],
    variance_covariance_type: EstimateVarianceCovariance = EstimateVarianceCovariance.ROBUST,
    statistics: tuple[str, ...] = (
        'Number of estimated parameters',
        'Sample size',
        'Final log likelihood',
        'Akaike Information Criterion',
        'Bayesian Information Criterion',
    ),
    include_parameter_estimates: bool = True,
    include_stderr: bool = False,
    include_t_test: bool = True,
    formatted: bool = True,
    use_short_names: bool = False,
) -> tuple[pd.DataFrame, dict[str, str]]:
    """Compile estimation results into a common table

    The table has one column per model. Its rows are the requested
    statistics, followed by the parameters if requested.

    :param dict_of_results: dict mapping the name of each model to its
        estimation results, or to the name of the YAML file containing
        them. A file that cannot be found is reported with a warning
        and the corresponding column is left empty.
    :param variance_covariance_type: type of variance-covariance estimate to be
        used for the standard errors and the t-tests.
    :param statistics: statistics to include in the summary table. Each
        entry is looked up by name in the general statistics of the
        estimation results.
    :param include_parameter_estimates: if True, the parameter
        estimates are included.
    :param include_stderr: if True, the standard errors of the
         parameters are included.
    :param include_t_test: if True, the t-tests of the parameters are
         included.
    :param formatted: if True, a formatted string is included in the
         table results: one row per parameter, containing the value
         followed by the requested standard error and t-test between
         parentheses. If False, the numerical values are stored, with a
         separate row for each requested quantity. Use "True" if you
         need to print the results. Use "False" if you need to use them
         for further calculation.
    :param use_short_names: if True, short names generated by
        :class:`ModelNames` are used to identify the models. It is nicer
        for the reporting.
    :return: pandas dataframe with the requested results (missing entries
        are replaced by empty strings), and a dictionary mapping each
        name used in the table to the original name of the model.

    """
    model_names = ModelNames()

    def the_name(col: str) -> str:
        """Replace the name of a model by a shorter version for reporting

        :param col: name of the column, that is, name of the model.
        :return: name to be used in the reporting.
        """
        return model_names(col) if use_short_names else col

    # Resolve the reporting name of each model once, so that the columns,
    # the configurations and the cells filled below all use the same name.
    reported_names = {model: the_name(model) for model in dict_of_results}

    df = pd.DataFrame(columns=list(reported_names.values()))
    configurations = {
        reported: model for model, reported in reported_names.items()
    }

    for model, estimation_results in dict_of_results.items():
        col = reported_names[model]

        if not isinstance(estimation_results, EstimationResults):
            # Anything that is not a result object is treated as a file name.
            try:
                estimation_results = EstimationResults.from_yaml_file(
                    filename=estimation_results
                )
            except FileNotFoundError:
                logger.warning(
                    'Impossible to access result file %s', estimation_results
                )
                # The column of this model stays empty.
                continue

        stats_results = estimation_results.get_general_statistics()
        for s in statistics:
            df.loc[s, col] = stats_results[s]

        if not include_parameter_estimates:
            continue

        for parameter_index, parameter_name in enumerate(
            estimation_results.beta_names
        ):
            parameter_value = estimation_results.get_parameter_value_from_index(
                parameter_index=parameter_index
            )
            # The standard error and the t-test are retrieved only when they
            # are actually reported.
            std_err_value = (
                estimation_results.get_parameter_std_err_from_index(
                    parameter_index=parameter_index,
                    estimate_var_covar=variance_covariance_type,
                )
                if include_stderr
                else None
            )
            t_test_value = (
                estimation_results.get_parameter_t_test_from_index(
                    parameter_index=parameter_index,
                    estimate_var_covar=variance_covariance_type,
                )
                if include_t_test
                else None
            )

            if formatted:
                std_err_report = f'({std_err_value:.3g})' if include_stderr else ''
                t_test_report = f'({t_test_value:.3g})' if include_t_test else ''
                # The layout of the cell is kept as it is: a report that is
                # not requested is empty, but its separating space remains.
                the_value = (
                    f'{parameter_value:.3g} {std_err_report} {t_test_report}'
                )
                row_std = ' (std)' if include_stderr else ''
                row_t_test = ' (t-test)' if include_t_test else ''
                row_title = f'{parameter_name}{row_std}{row_t_test}'
                df.loc[row_title, col] = the_value
            else:
                df.loc[parameter_name, col] = parameter_value
                if include_stderr:
                    df.loc[f'{parameter_name} (std)', col] = std_err_value
                if include_t_test:
                    df.loc[f'{parameter_name} (t-test)', col] = t_test_value

    return df.fillna(''), configurations




# [docs]
def compile_results_in_directory(
    statistics: tuple[str, ...] = (
        'Number of estimated parameters',
        'Sample size',
        'Final log likelihood',
        'Akaike Information Criterion',
        'Bayesian Information Criterion',
    ),
    file_extension: str = 'yaml',
    variance_covariance_type: EstimateVarianceCovariance = EstimateVarianceCovariance.ROBUST,
    include_parameter_estimates: bool = True,
    include_stderr: bool = False,
    include_t_test: bool = True,
    formatted: bool = True,
    use_short_names: bool = False,
) -> tuple[pd.DataFrame, dict[str, str]] | None:
    """Compile estimation results found in the local directory into a
        common table. The results are supposed to be in files with
        the extension ``file_extension``.

    Each file found is used both as the name of the model and as the
    file to load. The files are processed in alphabetical order, so
    that the order of the columns does not depend on the order in which
    the file system lists them.

    :param statistics: statistics to include in the summary table
    :param file_extension: extension of the files containing the estimation
        results, without the leading dot.
    :param variance_covariance_type: type of variance-covariance estimate to be used.
    :param include_parameter_estimates: if True, the parameter
        estimates are included.
    :param include_stderr: if True, the standard errors of the
         parameters are included.
    :param include_t_test: if True, the t-tests of the parameters are
         included.
    :param formatted: if True, a formatted string is included in the
         table results. If False, the numerical values are stored. Use
         "True" if you need to print the results. Use "False" if you
         need to use them for further calculation.
    :param use_short_names: if True, short names are used to identify
        the models. It is nicer for the reporting.
    :return: pandas dataframe with the requested results, and a dictionary reporting the
        specification of each model. If no file with the requested
        extension is found in the current working directory, a warning
        is logged and None is returned.

    """
    # glob returns the matches in arbitrary order: sort them to obtain a
    # reproducible table.
    files = sorted(glob.glob(f'*.{file_extension}'))
    if not files:
        logger.warning('No .%s file found in %s', file_extension, os.getcwd())
        return None

    return compile_estimation_results(
        dict_of_results={filename: filename for filename in files},
        variance_covariance_type=variance_covariance_type,
        statistics=statistics,
        include_parameter_estimates=include_parameter_estimates,
        include_stderr=include_stderr,
        include_t_test=include_t_test,
        formatted=formatted,
        use_short_names=use_short_names,
    )