"""
Implementation of the main Biogeme class
:author: Michel Bierlaire
:date: Tue Mar 26 16:45:15 2019
It combines the database and the model specification.
"""
from __future__ import annotations
import difflib
import logging
import warnings
from datetime import datetime
import arviz as az
import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt
from tqdm import tqdm
from biogeme.bayesian_estimation import (
BayesianResults,
FigureSize,
RawBayesianResults,
generate_html_file as generate_bayesian_html_file,
run_sampling,
)
from biogeme.bayesian_estimation.sampling_strategy import make_sampling_config
from biogeme.biogeme_logging import suppress_logs
from biogeme.catalog import (
CentralController,
Configuration,
SelectedConfigurationsIterator,
)
from biogeme.constants import LOG_LIKE, WEIGHT
from biogeme.database import ContiguousPanelMap, Database, build_contiguous_panel_map
from biogeme.default_parameters import ParameterValue
from biogeme.deprecated import deprecated_parameters
from biogeme.dict_of_formulas import (
check_validity,
get_expression,
insert_valid_keyword,
)
from biogeme.draws import RandomNumberGeneratorTuple
from biogeme.exceptions import BiogemeError, ValueOutOfRange
from biogeme.expressions import (
Dimension,
Expression,
ExpressionOrNumeric,
MultipleSum,
log,
)
from biogeme.expressions.prepare_for_panel import prepare_for_panel
from biogeme.expressions.set_panel_id import set_panel_id
from biogeme.filenames import get_new_file_name
from biogeme.function_output import (
FunctionOutput,
)
from biogeme.jax_calculator import (
CallableExpression,
CompiledFormulaEvaluator,
MultiRowEvaluator,
evaluate_simple_expression,
function_from_compiled_formula,
)
from biogeme.likelihood import AlgorithmResults, model_estimation
from biogeme.likelihood.bootstrap import bootstrap
from biogeme.model_elements import (
FlatPanelAdapter,
ModelElements,
RegularAdapter,
)
from biogeme.optimization import OptimizationAlgorithm, algorithms
from biogeme.parameters import (
DEFAULT_FILE_NAME as DEFAULT_PARAMETER_FILE_NAME,
Parameters,
)
from biogeme.pymc_calculator import pymc_formula_evaluator
from biogeme.results_processing import (
RawEstimationResults,
generate_html_file,
)
from biogeme.results_processing.estimation_results import EstimationResults
from biogeme.second_derivatives import SecondDerivativesMode
from biogeme.tools import (
CheckDerivativesResults,
ModelNames,
check_derivatives,
report_jax_cpu_devices,
safe_serialize_array,
warning_cpu_devices,
)
from biogeme.tools.files import files_of_type
from biogeme.validation import ValidationResult, cross_validate_model
# Name used for a model when the user does not set `model_name` explicitly.
DEFAULT_MODEL_NAME = 'biogeme_model_default_name'
# Module-level logger; configuration is left to the application.
logger = logging.getLogger(__name__)
[docs]
class BIOGEME:
"""Main class that combines the database and the model
specification.
It works in two modes: estimation and simulation.
The following attributes are imported from the parameter file.
"""
# Type hints for dynamically injected attributes
__annotations__ = {
'seed': int,
'tolerance': float,
'steptol': float,
'max_iterations': int,
'number_of_threads': int,
'enlarging_factor': float,
'initial_radius': float,
'infeasible_cg': bool,
'second_derivatives': int,
'dogleg': bool,
'bootstrap_samples': int,
'save_iterations': bool,
'generate_html': bool,
'generate_yaml': bool,
'generate_netcdf': bool,
'optimization_algorithm': str,
'maximum_number_catalog_expressions': int,
'max_number_parameters_to_report': int,
}
def __init__(
    self,
    database: Database,
    formulas: Expression | dict[str, Expression],
    random_number_generators: dict[str, RandomNumberGeneratorTuple] | None = None,
    user_notes: str | None = None,
    parameters: str | Parameters | None = None,
    **kwargs,
):
    """Constructor

    :param database: choice data.
    :param formulas: expression or dictionary of expressions that
        define the model specification. The concept is that each
        expression is applied to each entry of the database. The
        keys of the dictionary allow to provide a name to each
        formula. In the estimation mode, two formulas are
        needed, with the keys 'loglike' and 'weight'. If only one
        formula is provided, it is associated with the label
        'loglike'. If no formula is labeled 'weight', the weight
        of each piece of data is supposed to be 1.0. In the
        simulation mode, the labels of each formula are used as
        labels of the resulting database.
    :param random_number_generators: user defined random number generators.
    :param user_notes: these notes will be included in the report file.
    :param parameters: name of the .toml file where the parameters are
        read, or a Parameters object containing the parameter values.
    :param kwargs: values of individual parameters, overriding the ones
        read from the parameter file.
    :raise BiogemeError: an audit of the formulas is performed.
        If a formula has issues, an error is detected and an
        exception is raised.
    :raise AttributeError: if the 'parameters' argument is of an
        unexpected type.
    """
    # Resolve the source of the parameters: explicit object, file name,
    # or the default .toml file.
    if isinstance(parameters, Parameters):
        logger.info('Biogeme parameters provided by the user.')
        self.biogeme_parameters: Parameters = parameters
    else:
        self.biogeme_parameters: Parameters = Parameters()
        if isinstance(parameters, str):
            self.biogeme_parameters.read_file(parameters)
        else:
            if parameters is None:
                self.biogeme_parameters.read_file(DEFAULT_PARAMETER_FILE_NAME)
            else:
                error_msg = (
                    f'Argument "parameters" is of wrong type: {type(parameters)}'
                )
                raise AttributeError(error_msg)
    database.reset_indices()
    self.parameter_file: str = self.biogeme_parameters.file_name
    # Output file names, set when the corresponding files are generated.
    self.html_filename: str | None = None
    self.yaml_filename: str | None = None
    self.netcdf_filename: str | None = (
        None  # Network Common Data Form, for the posterior draws in Bayesian.
    )
    # We allow the values of the parameters to be set with arguments
    self.biogeme_parameters.set_several_parameters(dict_of_parameters=kwargs)
    # Expose each parameter as a read-only property on the class, so that
    # e.g. `self.seed` delegates to the Parameters object.
    for name in self.biogeme_parameters.parameter_names:
        self._define_property(name)
    self.maximum_number_of_observations_per_individual: int | None = None
    self.database = database
    self.log_like_name: str = LOG_LIKE
    """ Keywords used for the name of the loglikelihood formula.
    Default: 'log_like'"""
    self.log_like_valid_names: list[str] = [LOG_LIKE, 'loglike']
    self.weight_name = WEIGHT
    """Keyword used for the name of the weight formula. Default: 'weight'
    """
    self.weight_valid_names = [WEIGHT, 'weights']
    self.model_name = DEFAULT_MODEL_NAME
    """Name of the model. Default: 'biogeme_model_default_name'
    """
    self.random_number_generators = random_number_generators
    self.second_derivatives_mode = SecondDerivativesMode(
        self.calculating_second_derivatives
    )
    # A seed of 0 means: do not seed the random number generator.
    if self.seed != 0:
        np.random.seed(self.seed)
    self.short_names: ModelNames | None = None
    self.formulas = self._normalize_formulas(formulas)
    self.user_notes = user_notes  #: User notes
    self.init_loglikelihood = None  #: Init value of the likelihood function
    self.null_loglikelihood = None  #: Log likelihood of the null model
    self.best_iteration = None  #: Store the best iteration found so far.
    # Lazily built caches; see the `model_elements` and `function_evaluator`
    # properties.
    self._model_elements = None
    self._function_evaluator = None
    self._use_flatten_database = False
[docs]
@classmethod
def from_configuration_and_controller(
    cls,
    config_id: str,
    central_controller: CentralController,
    database: Database,
    user_notes: str | None = None,
    parameters: str | Parameters | None = None,
    **kwargs,
) -> BIOGEME:
    """Build a BIOGEME object for one configuration of a multiple expression.

    :param config_id: identifier of the configuration.
    :param central_controller: central controller for the multiple expression
        containing all the catalogs.
    :param database: database to be passed to the Biogeme object.
    :param user_notes: these notes will be included in the report file.
    :param parameters: object with the parameters.
    :raise BiogemeError: if the identifier does not correspond to any known
        configuration.
    """
    if central_controller.all_configurations:
        # Validate the identifier against the set of known configurations.
        known_ids = central_controller.all_configurations_ids
        if not known_ids:
            raise BiogemeError('No configuration found in the expression')
        if config_id not in known_ids:
            suggestions = difflib.get_close_matches(config_id, known_ids)
            if suggestions:
                message = (
                    f'Unknown configuration: [{config_id}]. '
                    f'Did you mean [{suggestions[0]}]?'
                )
            else:
                message = f'Unknown configuration: {config_id}.'
            raise BiogemeError(message)
    the_configuration = Configuration.from_string(config_id)
    central_controller.set_configuration(the_configuration)
    if user_notes is None:
        # Document the selected configuration in the report by default.
        user_notes = the_configuration.get_html()
    flat_expression = central_controller.expression.deep_flat_copy()
    return cls(
        database=database,
        formulas=flat_expression,
        user_notes=user_notes,
        parameters=parameters,
        **kwargs,
    )
[docs]
@classmethod
def from_configuration(
    cls,
    config_id: str,
    multiple_expression: Expression,
    database: Database,
    user_notes: str | None = None,
    parameters: str | Parameters | None = None,
    **kwargs,
) -> BIOGEME:
    """Build a BIOGEME object for one configuration of a multiple expression.

    Convenience wrapper that creates the central controller before delegating
    to :meth:`from_configuration_and_controller`.

    :param config_id: identifier of the configuration.
    :param multiple_expression: multiple expression containing the catalog.
    :param database: database to be passed to the Biogeme object.
    :param user_notes: these notes will be included in the report file.
    :param parameters: object with the parameters.
    """
    controller = CentralController(expression=multiple_expression)
    return cls.from_configuration_and_controller(
        config_id=config_id,
        central_controller=controller,
        database=database,
        user_notes=user_notes,
        parameters=parameters,
        **kwargs,
    )
def _define_property(self, name: str):
def getter(obj):
return obj.biogeme_parameters.get_value(name=name)
def setter(obj, value):
error_msg = (
f"Direct assignment to '{name}' is not allowed. Please set the value in the "
f'constructor or in the .toml file.'
)
raise BiogemeError(error_msg)
prop = property(getter, setter)
setattr(self.__class__, name, prop)
@property
def sample_size(self):
return self.model_elements.sample_size
@property
def number_of_observations(self):
return self.model_elements.number_of_observations
@property
def generate_pickle(self) -> bool:
warnings.warn(
"'generate_pickle' is deprecated. Use 'generate_yaml' instead.",
DeprecationWarning,
stacklevel=2,
)
return False
@property
def use_flatten_database(self) -> bool:
return self._use_flatten_database
@use_flatten_database.setter
def use_flatten_database(self, value: bool):
self._use_flatten_database = value
self._model_elements = None
@generate_pickle.setter
def generate_pickle(self, value: bool) -> None:
warnings.warn(
"'generate_pickle' is deprecated. Use 'generate_yaml' instead. This statement is ignored",
DeprecationWarning,
stacklevel=2,
)
@property
def modelName(self) -> str:
return self.model_name
@modelName.setter
def modelName(self, value: str) -> None:
warnings.warn(
"'modelName' is deprecated. Please use 'model_name' instead.",
DeprecationWarning,
stacklevel=2,
)
self.model_name = value
@property
def expressions_registry(self):
return self.model_elements.expressions_registry
@property
def log_like(self) -> Expression | None:
    """The formula labeled as the log likelihood, or None if absent."""
    return get_expression(
        dict_of_formulas=self.formulas,
        valid_keywords=self.log_like_valid_names,
    )
[docs]
def contains_log_likelihood(self) -> bool:
    """Report whether a log likelihood formula has been provided."""
    the_log_like = self.log_like
    return the_log_like is not None
@property
def weight(self) -> Expression | None:
    """The formula labeled as the weight, or None if absent."""
    return get_expression(
        dict_of_formulas=self.formulas,
        valid_keywords=self.weight_valid_names,
    )
@property
def model_elements(self) -> ModelElements | None:
if self._model_elements is None:
adapter = (
FlatPanelAdapter(database=self.database)
if self.use_flatten_database
else RegularAdapter(database=self.database)
)
self._model_elements = ModelElements(
expressions=self.formulas,
adapter=adapter,
number_of_draws=self.biogeme_parameters.get_value(
name='number_of_draws'
),
user_defined_draws=self.random_number_generators,
use_jit=self.biogeme_parameters.get_value(name='use_jit'),
)
return self._model_elements
@property
def function_evaluator(self) -> CompiledFormulaEvaluator:
if self._function_evaluator is None:
self._function_evaluator = (
CompiledFormulaEvaluator(
model_elements=self.model_elements,
second_derivatives_mode=self.second_derivatives_mode,
numerically_safe=self.numerically_safe,
)
if self.contains_log_likelihood()
else None
)
return self._function_evaluator
def _normalize_formulas(
    self, formulas: Expression | dict[str, Expression]
) -> dict[str, Expression]:
    """Coerce the user supplied formulas into a dictionary of expressions.

    :param formulas: a single expression or a dictionary of named expressions.
    :return: a dictionary mapping names to expressions.
    :raise BiogemeError: if the input is of an invalid type.
    """
    if isinstance(formulas, Expression):
        # A single expression is interpreted as the log likelihood.
        return {self.log_like_name: formulas}
    if not isinstance(formulas, dict):
        raise BiogemeError(
            f'Invalid type for formulas: {type(formulas)}. Expected Expression or dict.'
        )
    check_validity(formulas)
    return insert_valid_keyword(
        dict_of_formulas=formulas,
        reference_keyword=self.log_like_name,
        valid_keywords=self.log_like_valid_names,
    )
[docs]
def is_model_complex(self) -> bool:
    """Check if the model is potentially complex to estimate"""
    the_log_like = self.log_like
    return the_log_like.is_complex()
@property
def loglike(self) -> Expression:
    """Deprecated alias of :attr:`log_like`, kept for backward compatibility."""
    return self.log_like
@property
def function_parameters(self) -> dict[str, ParameterValue]:
"""Prepare the parameters for the function"""
return {
'tolerance': self.biogeme_parameters.get_value('tolerance'),
'steptol': self.biogeme_parameters.get_value('steptol'),
}
@property
def algo_parameters(self) -> dict[str, ParameterValue]:
"""Prepare the parameters for the optimization algorithm."""
common_bounds_params = {
'infeasibleConjugateGradient': self.biogeme_parameters.get_value(
'infeasible_cg'
),
'radius': self.biogeme_parameters.get_value('initial_radius'),
'enlargingFactor': self.biogeme_parameters.get_value('enlarging_factor'),
'maxiter': self.biogeme_parameters.get_value('max_iterations'),
'max_number_parameters_to_report': self.biogeme_parameters.get_value(
'max_number_parameters_to_report'
),
}
if self.optimization_algorithm == 'automatic':
if (
self.is_model_complex()
or self.second_derivatives_mode == SecondDerivativesMode.NEVER
):
logger.info(
'As the model is rather complex, we cancel the calculation of second derivatives. '
'If you want to control the parameters, change the algorithm from "automatic" '
'to "simple_bounds" in the TOML file.'
)
algo_parameters = common_bounds_params | {
'proportionAnalyticalHessian': 0,
}
else:
logger.info(
'As the model is not too complex, we activate the calculation of second derivatives. '
'To change this behavior, modify the algorithm to "simple_bounds" in the TOML file.'
)
algo_parameters = common_bounds_params | {
'proportionAnalyticalHessian': 1,
}
return algo_parameters
if self.optimization_algorithm == 'simple_bounds':
algo_parameters = common_bounds_params | {
'proportionAnalyticalHessian': self.biogeme_parameters.get_value(
'second_derivatives'
),
}
if (
self.second_derivatives_mode == SecondDerivativesMode.FINITE_DIFFERENCES
and self.biogeme_parameters.get_value('second_derivatives') > 0
):
warning = (
'The proportion of the analytical hessian is not zero, and the second derivatives are approximated by '
'finite difference. It may not be the desired configuration.'
)
logger.warning(warning)
if (
self.second_derivatives_mode == SecondDerivativesMode.NEVER
and self.biogeme_parameters.get_value('second_derivatives') > 0
):
error_msg = (
'The proportion of the analytical hessian is not zero, and the second derivatives cannot be '
'evaluated. The parameters "calculating_second_derivatives" and "second_derivatives" are inconsistent.'
)
return algo_parameters
if self.optimization_algorithm in {
'simple_bounds_newton',
'simple_bounds_BFGS',
}:
return common_bounds_params
if self.optimization_algorithm in {'TR-newton', 'TR-BFGS'}:
algo_parameters = common_bounds_params | {
'dogleg': self.biogeme_parameters.get_value('dogleg'),
'radius': self.biogeme_parameters.get_value('initial_radius'),
'maxiter': self.biogeme_parameters.get_value('max_iterations'),
}
return algo_parameters
if self.optimization_algorithm in {'LS-newton', 'LS-BFGS'}:
return common_bounds_params
return common_bounds_params
@property
def optimization_parameters(self) -> dict[str, ParameterValue]:
return self.function_parameters | self.algo_parameters
def _save_iterations_file_name(self) -> str:
"""
:return: The name of the file where the iterations are saved.
:rtype: str
"""
return f'__{self.model_name}.iter'
@property
def free_betas_names(self) -> list[str]:
"""Returns the names of the parameters that must be estimated
:return: list of names of the parameters
:rtype: list(str)
"""
return self.expressions_registry.free_betas_names
[docs]
def number_unknown_parameters(self) -> int:
"""Returns the number of parameters that must be estimated
:return: number of parameters
:rtype: int
"""
return self.expressions_registry.number_of_free_betas
[docs]
def calculate_null_loglikelihood(self, avail: dict[int, ExpressionOrNumeric]):
    """Log likelihood of the null model, predicting equal probability for
    each alternative.

    :param avail: availability condition of each alternative; 1 means
        always available.
    :return: value of the null log likelihood.
    """
    null_formula = -log(MultipleSum(avail))
    self.null_loglikelihood = evaluate_simple_expression(
        expression=null_formula,
        database=self.database,
        numerically_safe=False,
        use_jit=self.use_jit,
    )
    return self.null_loglikelihood
[docs]
def calculate_init_likelihood(self) -> float:
"""Calculate the value of the log likelihood function
The default values of the parameters are used.
:return: value of the log likelihood.
:rtype: float.
"""
# Value of the loglikelihood for the default values of the parameters.
result = self.function_evaluator.evaluate(
the_betas=self.expressions_registry.free_betas_init_values,
gradient=False,
hessian=False,
bhhh=False,
)
return result.function
def _get_likelihood_function(self) -> CallableExpression:
    """Wrap the compiled evaluator into a plain callable of the betas."""
    initial_betas = self.expressions_registry.free_betas_init_values
    return function_from_compiled_formula(
        the_compiled_function=self.function_evaluator,
        the_betas=initial_betas,
    )
[docs]
def report_array(self, array: np.ndarray, with_names: bool = True) -> str:
"""Reports the entries of the array up to the maximum number
:param array: array to report
:type array: numpy.array
:param with_names: if True, the names of the parameters are included
:type with_names: bool
:return: string reporting the values
:rtype: str
"""
length = min(
array.size,
self.biogeme_parameters.get_value('max_number_parameters_to_report'),
)
if with_names:
names = self.expressions_registry.free_betas_names
report = ', '.join(
[
f'{name}={value:.2g}'
for name, value in zip(names[:length], array[:length])
]
)
return report
report = ', '.join([f'{value:.2g}' for value in array[:length]])
return report
def _save_iteration(self, values_to_save: dict[str, float]) -> None:
filename = self._save_iterations_file_name()
with open(
filename,
"w",
encoding="utf-8",
) as pf:
for key, value in values_to_save.items():
print(
f"{key} = {value}",
file=pf,
)
def _load_saved_iteration(self) -> dict[str, float]:
    """Read parameter values from the iteration file and apply them.

    Each line of the file has the form ``name_of_beta = value_of_beta``.
    The values are used as initial values in all formulas.

    :return: dictionary of the restored values; empty if the file cannot
        be read.
    """
    filename = self._save_iterations_file_name()
    betas = {}
    try:
        with open(filename, encoding='utf-8') as fp:
            for line in fp:
                ell = line.split('=')
                betas[ell[0].strip()] = float(ell[1])
        self.change_init_values(betas)
        # Bug fix: the log messages contained a literal '(unknown)' inside
        # f-strings with no placeholder; the file name is now interpolated.
        logger.info(f'Parameter values restored from {filename}')
        return betas
    except OSError:
        logger.info(f'Cannot read file {filename}. Statement is ignored.')
        return {}
[docs]
def set_random_init_values(self, default_bound: float = 100.0) -> None:
"""Modifies the initial values of the parameters in all formulas,
using randomly generated values. The value is drawn from a
uniform distribution on the interval defined by the
bounds.
:param default_bound: If the upper bound is missing, it is
replaced by this value. If the lower bound is missing, it is
replaced by the opposite of this value. Default: 100.
:type default_bound: float
"""
random_betas = {
beta.name: np.random.uniform(
low=-default_bound if beta.lower_bound is None else beta.lower_bound,
high=default_bound if beta.upper_bound is None else beta.upper_bound,
)
for beta in self.expressions_registry.free_betas
}
self.change_init_values(random_betas)
[docs]
def change_init_values(self, betas: dict[str, float]) -> None:
"""Modifies the initial values of the parameters in all formula
:param betas: dictionary where the keys are the names of the
parameters, and the values are the new value for
the parameters.
:type betas: dict(string:float)
"""
for expression in self.formulas.values():
expression.change_init_values(betas)
for beta in self.expressions_registry.free_betas:
value = betas.get(beta.name)
if value is not None:
beta.init_value = value
def _load_saved_estimates(self) -> EstimationResults:
    """Load estimation results from the most recent yaml file of this model.

    :return: the estimation results read from file.
    :raise BiogemeError: if no yaml file is found for the model.
    """
    yaml_files = sorted(files_of_type('yaml', name=self.model_name))
    if not yaml_files:
        raise BiogemeError(f'No yaml file has been found for model {self.model_name}')
    yaml_to_read = yaml_files[-1]
    if len(yaml_files) > 1:
        # Several candidates: the last one (in sorted order) is used.
        warning_msg = (
            f'Several files .yaml are available for '
            f'this model: {yaml_files}. '
            f'The file {yaml_to_read} '
            f'is used to load the results.'
        )
        logger.warning(warning_msg)
    results = EstimationResults.from_yaml_file(filename=yaml_to_read)
    logger.warning(
        f'Estimation results read from {yaml_to_read}. '
        f'There is no guarantee that they correspond '
        f'to the specified model.'
    )
    return results
def _bootstrap(self, estimated_parameters: dict[str, float]) -> list[np.ndarray]:
    """Re-estimate the model on bootstrap samples of the data.

    :param estimated_parameters: estimates used as starting values for each
        re-estimation.
    :return: one solution vector per bootstrap sample; empty list when
        bootstrapping is disabled (bootstrap_samples == 0).
    """
    number_of_bootstrap_samples = self.biogeme_parameters.get_value(
        'bootstrap_samples'
    )
    if number_of_bootstrap_samples == 0:
        # Bootstrapping is disabled.
        return []
    logger.info(
        f'Re-estimate the model {self.biogeme_parameters.get_value("bootstrap_samples")} '
        f'times for bootstrapping'
    )
    # Logging is reduced during the repeated estimations.
    with suppress_logs(level=logging.WARNING):
        # For some reason, the self.optimization_parameters cannot be used as such in the function call below.
        # I have no clue why.
        opt_parameters = self.optimization_parameters
        bootstrap_results: list[AlgorithmResults] = bootstrap(
            number_of_bootstrap_samples=number_of_bootstrap_samples,
            the_algorithm=self._algorithm_configuration(),
            modeling_elements=self.model_elements,
            parameters=opt_parameters,
            starting_values=estimated_parameters,
            second_derivatives_mode=self.biogeme_parameters.get_value(
                name='calculating_second_derivatives'
            ),
            numerically_safe=self.numerically_safe,
            number_of_jobs=self.biogeme_parameters.get_value(name='number_of_jobs'),
            use_jit=self.use_jit,
        )
    return [result.solution for result in bootstrap_results]
[docs]
def retrieve_saved_estimates(self) -> EstimationResults | None:
    """
    Attempt to retrieve previously saved estimation results from a YAML file.

    :return: an EstimationResults object if a saved result is found; None
        (with a logged warning) if no file is found or loading fails.
    """
    try:
        return self._load_saved_estimates()
    except BiogemeError as the_error:
        logger.warning(the_error)
    return None
[docs]
def bayesian_estimation(
    self, starting_values: dict[str, float] | None = None
) -> BayesianResults:
    """Estimate the parameters of the model with Bayesian sampling.

    :param starting_values: optional starting values, completed with any
        values saved from a previous run when 'save_iterations' is active.
    :return: results of the Bayesian estimation.
    """
    restored_values: dict[str, float] = {}
    if self.biogeme_parameters.get_value('save_iterations'):
        logger.info(
            f'*** Initial values of the parameters are '
            f'obtained from the file {self._save_iterations_file_name()}'
        )
        restored_values = self._load_saved_iteration()
    if starting_values is None:
        starting_values = dict(restored_values)
    else:
        # Explicitly provided values take precedence over the restored ones.
        for key, value in restored_values.items():
            starting_values.setdefault(key, value)
    logger.info(f'Starting values for the algorithm: {starting_values}')
    if self.database.is_panel():
        return self.bayesian_estimation_panel(starting_values=starting_values)
    return self.bayesian_estimation_non_panel(starting_values=starting_values)
[docs]
def bayesian_estimation_non_panel(
    self, starting_values: dict[str, float]
) -> BayesianResults:
    """Bayesian estimation for non-panel (cross-sectional) data.

    Builds a PyMC model in which each observation contributes its log
    likelihood, optionally samples from the prior, runs the configured
    sampler, and packages the draws into a BayesianResults object.

    :param starting_values: starting values of the parameters for the sampler.
    :return: results of the Bayesian estimation.
    """
    warning_cpu_devices()
    logger.info(report_jax_cpu_devices())
    # Sampling configuration, read from the Biogeme parameters.
    bayesian_draws = self.biogeme_parameters.get_value('bayesian_draws')
    warmup = self.biogeme_parameters.get_value('warmup')
    chains = self.biogeme_parameters.get_value('chains')
    target_accept = self.biogeme_parameters.get_value('target_accept')
    sampling_strategy = self.biogeme_parameters.get_value('mcmc_sampling_strategy')
    calculate_likelihood = self.biogeme_parameters.get_value('calculate_likelihood')
    calculate_waic = self.biogeme_parameters.get_value('calculate_waic')
    calculate_loo = self.biogeme_parameters.get_value('calculate_loo')
    sample_from_prior = self.biogeme_parameters.get_value('sample_from_prior')
    sampling_config = make_sampling_config(
        strategy=sampling_strategy, target_accept=target_accept
    )
    start_time = datetime.now()
    obs_coord = np.arange(self.model_elements.number_of_observations)
    with pm.Model(coords={Dimension.OBS: obs_coord}) as model:
        loglike_total = pymc_formula_evaluator(model_elements=self.model_elements)
        # Per-observation log likelihood, stored for post-processing.
        pm.Deterministic(self.log_like_name, loglike_total, dims=Dimension.OBS)
        # Sampling target: total log likelihood over all observations.
        pm.Potential("choice_logp", pt.sum(loglike_total))
        # --- Prior draws (stored in the .nc file if generated) ---
        # If priors are defined in the model builder, PyMC can generate prior samples.
        # We use the same number of draws as for the posterior by default.
        try:
            prior_idata = (
                pm.sample_prior_predictive(
                    samples=int(bayesian_draws),
                    return_inferencedata=True,
                    # A seed of 0 means: no explicit seeding.
                    random_seed=None if self.seed == 0 else int(self.seed),
                )
                if sample_from_prior
                else None
            )
        except Exception as e:  # pragma: no cover
            # Prior sampling is a convenience feature. If it fails for any reason,
            # we continue without a prior group.
            logger.warning(
                f"Could not generate prior draws (they will not be saved in the NetCDF file): {e}"
            )
            prior_idata = None
    idata, used_numpyro = run_sampling(
        model=model,
        draws=int(bayesian_draws),
        tune=int(warmup),
        chains=int(chains),
        config=sampling_config,
        starting_values=starting_values,
    )
    # Attach the prior group(s) to the returned InferenceData, so they are
    # preserved when dumping to NetCDF.
    if prior_idata is not None:
        idata.extend(prior_idata)
    sampling_time = datetime.now() - start_time
    # Posterior means, saved so that a later run can start from them.
    posterior_means = {
        name: float(idata.posterior[name].mean().values)
        for name in self.free_betas_names
    }
    self._save_iteration(posterior_means)
    bayes_results = RawBayesianResults(
        idata=idata,
        model_name=self.model_name,
        log_like_name=self.log_like_name,
        number_of_observations=self.model_elements.number_of_observations,
        user_notes=self.user_notes,
        data_name=self.database.name,
        beta_names=self.free_betas_names,
        sampler='NUTS',
        target_accept=float(target_accept),
        run_time=sampling_time,
    )
    final_results = BayesianResults(
        raw=bayes_results,
        calculate_likelihood=calculate_likelihood,
        calculate_waic=calculate_waic,
        calculate_loo=calculate_loo,
    )
    # Optional report files, controlled by the Biogeme parameters.
    if self.biogeme_parameters.get_value('generate_html'):
        self.html_filename = get_new_file_name(self.model_name, 'html')
        generate_bayesian_html_file(
            filename=self.html_filename,
            estimation_results=final_results,
            figure_size=FigureSize.LARGE,
        )
    if self.biogeme_parameters.get_value('generate_netcdf'):
        self.netcdf_filename = get_new_file_name(self.model_name, 'nc')
        final_results.dump(path=self.netcdf_filename)
    return final_results
[docs]
def bayesian_estimation_panel(
    self, starting_values: dict[str, float]
) -> BayesianResults:
    """Bayesian estimation for panel data.

    Observation-level log likelihoods are aggregated per individual using a
    contiguous panel map before being handed to the sampler.

    :param starting_values: starting values of the parameters for the sampler.
    :return: results of the Bayesian estimation.
    :raise BiogemeError: if the panel-id column is missing or if the panel
        map is inconsistent with the dataframe.
    """
    panel_id = self.database.panel_column
    set_panel_id(expr=self.log_like, panel_id=panel_id)
    warning_cpu_devices()
    logger.info(report_jax_cpu_devices())
    # Sampling configuration, read from the Biogeme parameters.
    bayesian_draws = self.biogeme_parameters.get_value('bayesian_draws')
    warmup = self.biogeme_parameters.get_value('warmup')
    chains = self.biogeme_parameters.get_value('chains')
    target_accept = self.biogeme_parameters.get_value('target_accept')
    sampling_strategy = self.biogeme_parameters.get_value('mcmc_sampling_strategy')
    calculate_ll = self.biogeme_parameters.get_value('calculate_likelihood')
    calculate_waic = self.biogeme_parameters.get_value('calculate_waic')
    calculate_loo = self.biogeme_parameters.get_value('calculate_loo')
    sample_from_prior = self.biogeme_parameters.get_value('sample_from_prior')
    sampling_config = make_sampling_config(
        strategy=sampling_strategy, target_accept=target_accept
    )
    df = self.model_elements.database.dataframe
    # ---- Build coords for INDIVIDUALS (required by PanelLogLikelihood) ----
    if not panel_id or panel_id not in df.columns:
        raise BiogemeError(
            "Panel mode requires a panel-id column configured on the database "
            "(e.g., database.panel_id_column = 'ID') and present in the dataframe."
        )
    panel_map: ContiguousPanelMap = build_contiguous_panel_map(
        df, panel_column=panel_id
    )
    # Sanity check: the panel map must start at observation 0.
    if panel_map.indptr[0] != 0:
        first = int(panel_map.indptr[0])
        error_msg = (
            "Inconsistent panel mapping: panel_map.indptr[0] must be 0 "
            f"(first observation index), but is {first}. "
            "This usually indicates that the ContiguousPanelMap was built from a "
            "dataframe that is not aligned with the one used to compute the "
            "observation-level log-likelihood, or that observations for each "
            "individual are not stored in contiguous blocks by panel_id. "
            "Check that:\n"
            " 1. The same dataframe, with the same row order, is passed both to "
            "build_contiguous_panel_map and to the PyMC log-likelihood builder, and\n"
            " 2. The dataframe is sorted so that all rows for a given panel_id "
            "are contiguous."
        )
        raise BiogemeError(error_msg)
    # Sanity check: the panel map must cover every observation exactly once.
    if panel_map.indptr[-1] != df.shape[0]:
        last = int(panel_map.indptr[-1])
        n_obs = int(df.shape[0])
        error_msg = (
            "Inconsistent panel mapping: panel_map.indptr[-1] must equal the number "
            f"of observations ({n_obs}), but is {last}. "
            "This suggests a mismatch between the dataframe used to build the "
            "ContiguousPanelMap and the one used to compute logp_obs, or that some "
            "observations were dropped/filtered after building the panel map."
        )
        raise BiogemeError(error_msg)
    n_individuals = int(panel_map.unique_ids.size)
    obs_coord = np.arange(self.model_elements.number_of_observations)
    individual_coord = np.arange(n_individuals)
    prepare_for_panel(expr=self.loglike, panel_column=panel_id)
    logp_obs_builder = self.log_like.recursive_construct_pymc_model_builder()
    start_time = datetime.now()
    with pm.Model(
        coords={
            Dimension.INDIVIDUALS: individual_coord,
            Dimension.OBS: obs_coord,
        }
    ) as model:
        logp_obs = logp_obs_builder(dataframe=df)
        # Padded cumulative sum of the observation-level log likelihoods: the
        # per-individual sums can then be read off as differences of the
        # cumulative sums at the block boundaries.
        s = pt.cumsum(logp_obs)  # (N_obs,)
        zero = pt.zeros((), dtype=logp_obs.dtype)
        s_pad = pt.concatenate([zero[None], s])
        # pm.Deterministic(f"{LOG_LIKE}_obs", logp_obs, dims=Dimension.OBS)
        panel_block_ptr = pt.as_tensor_variable(
            panel_map.indptr.astype(np.int64)
        )  # (K+1,)
        ll_indiv = s_pad[panel_block_ptr[1:]] - s_pad[panel_block_ptr[:-1]]  # (K,)
        pm.Deterministic(self.log_like_name, ll_indiv, dims=Dimension.INDIVIDUALS)
        # Sampling target: sum over individuals
        pm.Potential("panel_choice_logp", pt.sum(ll_indiv))
        # --- Prior draws (stored in the .nc file if generated) ---
        # If priors are defined in the model builder, PyMC can generate prior samples.
        # We use the same number of draws as for the posterior by default.
        try:
            prior_idata = (
                pm.sample_prior_predictive(
                    samples=int(bayesian_draws),
                    return_inferencedata=True,
                    # A seed of 0 means: no explicit seeding.
                    random_seed=None if self.seed == 0 else int(self.seed),
                )
                if sample_from_prior
                else None
            )
        except Exception as e:  # pragma: no cover
            # Prior sampling is best-effort; continue without a prior group.
            logger.warning(
                f"Could not generate prior draws (they will not be saved in the NetCDF file): {e}"
            )
            prior_idata = None
    idata, used_numpyro = run_sampling(
        model=model,
        draws=int(bayesian_draws),
        tune=int(warmup),
        chains=int(chains),
        config=sampling_config,
        starting_values=starting_values,
    )
    # Attach the prior group(s) to the returned InferenceData, so they are
    # preserved when dumping to NetCDF.
    if prior_idata is not None:
        idata.extend(prior_idata)
    sampling_time = datetime.now() - start_time
    # Posterior means, saved so that a later run can start from them.
    posterior_means = {
        name: float(idata.posterior[name].mean().values)
        for name in self.free_betas_names
    }
    self._save_iteration(posterior_means)
    bayes_results = RawBayesianResults(
        idata=idata,
        model_name=self.model_name,
        log_like_name=self.log_like_name,
        number_of_observations=self.model_elements.number_of_observations,
        user_notes=self.user_notes,
        data_name=self.database.name,
        beta_names=self.free_betas_names,
        sampler='NUTS',
        target_accept=float(target_accept),
        run_time=sampling_time,
    )
    final_results = BayesianResults(
        raw=bayes_results,
        calculate_likelihood=calculate_ll,
        calculate_waic=calculate_waic,
        calculate_loo=calculate_loo,
    )
    # Optional report files, controlled by the Biogeme parameters.
    if self.biogeme_parameters.get_value('generate_html'):
        self.html_filename = get_new_file_name(self.model_name, 'html')
        generate_bayesian_html_file(
            filename=self.html_filename,
            estimation_results=final_results,
            figure_size=FigureSize.LARGE,
        )
    if self.biogeme_parameters.get_value('generate_netcdf'):
        self.netcdf_filename = get_new_file_name(self.model_name, 'nc')
        final_results.dump(path=self.netcdf_filename)
    return final_results
[docs]
def estimate(
    self,
    starting_values: dict[str, float] | None = None,
    recycle: bool = False,
    run_bootstrap: bool = False,
    **kwargs,
) -> EstimationResults:
    """Estimate the parameters of the model(s) by maximum likelihood.

    :param starting_values: initial values of the free parameters, indexed
        by name. Entries missing from this dict are completed from the
        saved-iterations file when ``save_iterations`` is active.
    :param recycle: if True, previously saved estimation results are
        loaded and returned instead of re-estimating (falls back to a
        fresh estimation if loading fails).
    :param run_bootstrap: if True, bootstrap draws of the estimates are
        generated after the optimization.
    :param kwargs: no other keyword argument is accepted; deprecated
        arguments trigger an explicit error with migration advice.
    :return: object containing the estimation results.
    :rtype: biogeme.bioResults

    Example::

        # Create an instance of biogeme
        biogeme = bio.BIOGEME(database, logprob)

        # Gives a name to the model
        biogeme.modelName = 'mymodel'

        # Estimate the parameters
        results = biogeme.estimate()

    :raises BiogemeError: if no expression has been provided for the
        likelihood, or if a deprecated or unknown keyword argument is used.
    """
    # Deprecated keyword arguments: fail fast with migration advice.
    # The insertion order reproduces the original check order.
    deprecated_kwargs = {
        'bootstrap': (
            'Parameter "bootstrap" is deprecated. In order to perform '
            'bootstrapping, set bootstrap_samples to a positive number in '
            'the biogeme.toml file [e.g. bootstrap_samples=100].'
        ),
        'algorithm': (
            'The parameter "algorithm" is deprecated. Instead, define the '
            'parameter "optimization_algorithm" in section "[Estimation]" '
            'of the TOML parameter file'
        ),
        'algo_parameters': (
            'The parameter "algo_parameters" is deprecated. Instead, define the '
            'parameters "max_iterations" and "tolerance" in section '
            '"[SimpleBounds]" '
            'of the TOML parameter file'
        ),
        'algoParameters': (
            'The parameter "algoParameters" is deprecated. Instead, define the '
            'parameters "max_iterations" and "tolerance" in section '
            '"[SimpleBounds]" '
            'of the TOML parameter file'
        ),
    }
    for deprecated_name, error_msg in deprecated_kwargs.items():
        if kwargs.get(deprecated_name) is not None:
            raise BiogemeError(error_msg)
    if kwargs:
        unexpected = ', '.join(kwargs.keys())
        raise BiogemeError(
            f'Unexpected arguments passed to estimate(): {unexpected}. '
            f'This method does not accept any parameters. Use biogeme.toml or the BIOGEME object to set parameters.'
        )
    # Panel databases are estimated on a flattened version of the data.
    self.use_flatten_database = self.database.is_panel()
    if self.log_like is None:
        raise BiogemeError('No log likelihood function has been specified')
    if recycle:
        try:
            return self._load_saved_estimates()
        except BiogemeError as e:
            # Nothing to recycle: warn and estimate from scratch.
            logger.warning(e)
    if self.model_name == DEFAULT_MODEL_NAME:
        logger.warning(
            f'You have not defined a name for the model. '
            f'The output files are named from the model name. '
            f'The default is [{DEFAULT_MODEL_NAME}]'
        )
    if self.database.is_panel():
        if self._function_evaluator is not None:
            raise BiogemeError('Function evaluator has already been created')
    if self.expressions_registry.number_of_free_betas == 0:
        raise BiogemeError(
            f'There is no parameter to estimate in the formula: {self.log_like}.'
        )
    save_iteration_file_name = None
    saved_starting_values = {}
    if self.biogeme_parameters.get_value('save_iterations'):
        logger.info(
            f'*** Initial values of the parameters are '
            f'obtained from the file {self._save_iterations_file_name()}'
        )
        save_iteration_file_name = self._save_iterations_file_name()
        saved_starting_values = self._load_saved_iteration()
    # Explicitly provided starting values take precedence over the values
    # loaded from the iteration file.
    if starting_values is None:
        starting_values = saved_starting_values.copy()
    else:
        for key, value in saved_starting_values.items():
            starting_values.setdefault(key, value)
    logger.info(f'Starting values for the algorithm: {starting_values}')
    init_log_likelihood = self.calculate_init_likelihood()
    logger.debug(f'Init log likelihood: {init_log_likelihood}')
    # For some reason, the self.optimization_parameters cannot be used as
    # such in the function call below. A local alias is required.
    opt_parameters = self.optimization_parameters
    algorithm_results: AlgorithmResults = model_estimation(
        the_algorithm=self._algorithm_configuration(),
        function_evaluator=self.function_evaluator,
        parameters=opt_parameters,
        some_starting_values=starting_values,
        save_iterations_filename=save_iteration_file_name,
    )
    if algorithm_results.convergence:
        logger.info('Optimization algorithm has converged.')
    else:
        logger.info('Optimization algorithm has *not* converged.')
    for key, msg in algorithm_results.optimization_messages.items():
        logger.info(f'{key}: {msg}')
    optimal_betas = self.expressions_registry.get_named_betas_values(
        algorithm_results.solution
    )
    # The full hessian is needed unless the configuration says never;
    # BHHH is always computed.
    calculate_hessian = self.second_derivatives_mode != SecondDerivativesMode.NEVER
    if calculate_hessian:
        logger.info('Calculate second derivatives and BHHH')
    else:
        logger.info('Calculate BHHH')
    f_g_h_b: FunctionOutput = self.function_evaluator.evaluate(
        the_betas=optimal_betas,
        gradient=True,
        hessian=calculate_hessian,
        bhhh=True,
    )
    if run_bootstrap:
        start_time = datetime.now()
        bootstrap_results = self._bootstrap(estimated_parameters=optimal_betas)
        # Time needed to generate the bootstrap results
        bootstrap_time = datetime.now() - start_time
    else:
        bootstrap_results = []
        bootstrap_time = None
    # Convert numpy scalars and arrays to plain Python types so that the
    # messages can be serialized with the results.
    clean_optimization_messages = {
        key: (
            float(value)
            if isinstance(value, np.floating)
            else value.tolist() if isinstance(value, np.ndarray) else value
        )
        for key, value in algorithm_results.optimization_messages.items()
    }
    null_log_likelihood = (
        float(self.null_loglikelihood)
        if self.null_loglikelihood is not None
        else None
    )
    the_hessian = (
        None if f_g_h_b.hessian is None else safe_serialize_array(f_g_h_b.hessian)
    )
    raw_estimation_results = RawEstimationResults(
        model_name=self.model_name,
        user_notes=self.user_notes,
        beta_names=self.expressions_registry.free_betas_names,
        beta_values=algorithm_results.solution.tolist(),
        lower_bounds=[bound[0] for bound in self.expressions_registry.bounds],
        upper_bounds=[bound[1] for bound in self.expressions_registry.bounds],
        gradient=safe_serialize_array(f_g_h_b.gradient),
        hessian=the_hessian,
        bhhh=safe_serialize_array(f_g_h_b.bhhh),
        null_log_likelihood=null_log_likelihood,
        initial_log_likelihood=init_log_likelihood,
        final_log_likelihood=f_g_h_b.function,
        data_name=self.model_elements.database.name,
        sample_size=self.model_elements.sample_size,
        number_of_observations=self.model_elements.number_of_observations,
        monte_carlo=self.expressions_registry.requires_draws,
        number_of_draws=int(self.model_elements.draws_management.number_of_draws),
        types_of_draws=self.model_elements.draws_management.draw_types,
        number_of_excluded_data=self.model_elements.database.number_of_excluded_data,
        draws_processing_time=self.model_elements.draws_management.processing_time,
        optimization_messages=clean_optimization_messages,
        convergence=algorithm_results.convergence,
        bootstrap=[one_result.tolist() for one_result in bootstrap_results],
        bootstrap_time=bootstrap_time,
    )
    estimation_results = EstimationResults(
        raw_estimation_results=raw_estimation_results
    )
    # The estimated values become the new initial values of the formulas,
    # so that subsequent runs start from the optimum.
    estimated_betas = estimation_results.get_beta_values()
    for f in self.formulas.values():
        f.change_init_values(estimated_betas)
    if not estimation_results.algorithm_has_converged:
        logger.warning(
            'It seems that the optimization algorithm did not converge. '
            'Therefore, the results may not correspond to the maximum '
            'likelihood estimator. Check the specification of the model, '
            'or the criteria for convergence of the algorithm.'
        )
    if self.biogeme_parameters.get_value('generate_html'):
        self.html_filename = get_new_file_name(self.model_name, 'html')
        generate_html_file(
            filename=self.html_filename, estimation_results=estimation_results
        )
    if self.biogeme_parameters.get_value('generate_yaml'):
        self.yaml_filename = get_new_file_name(self.model_name, 'yaml')
        estimation_results.dump_yaml_file(filename=self.yaml_filename)
    return estimation_results
[docs]
def quick_estimate(self) -> EstimationResults:
    """Estimate the parameters of the model, skipping every extra
    calculation (init loglikelihood, t-statistics, etc.).

    :return: object containing the estimation results.
    :rtype: biogeme.results.bioResults

    Example::

        # Create an instance of biogeme
        biogeme = bio.BIOGEME(database, logprob)

        # Gives a name to the model
        biogeme.modelName = 'mymodel'

        # Estimate the parameters
        results = biogeme.quickEstimate()

    :raises BiogemeError: if no expression has been provided for the
        likelihood
    """
    iterations_file = None
    initial_values: dict[str, float] = {}
    if self.biogeme_parameters.get_value('save_iterations'):
        logger.info(
            f'*** Initial values of the parameters are '
            f'obtained from the file {self._save_iterations_file_name()}'
        )
        iterations_file = self._save_iterations_file_name()
        initial_values = self._load_saved_iteration()
    # For some reason, the self.optimization_parameters cannot be used as
    # such in the function call below. I have no clue why.
    local_parameters = self.optimization_parameters
    outcome: AlgorithmResults = model_estimation(
        the_algorithm=self._algorithm_configuration(),
        function_evaluator=self.function_evaluator,
        parameters=local_parameters,
        some_starting_values=initial_values,
        save_iterations_filename=iterations_file,
    )
    named_solution = self.expressions_registry.get_named_betas_values(
        outcome.solution
    )
    # Only the function value is needed: no gradient, hessian or BHHH.
    only_function: FunctionOutput = self.function_evaluator.evaluate(
        the_betas=named_solution,
        gradient=False,
        hessian=False,
        bhhh=False,
    )
    # Numpy scalars/arrays are converted to plain Python types so that the
    # optimization messages can be serialized.
    sanitized_messages = {
        key: (
            float(value)
            if isinstance(value, np.floating)
            else value.tolist() if isinstance(value, np.ndarray) else value
        )
        for key, value in outcome.optimization_messages.items()
    }
    null_ll = (
        None
        if self.null_loglikelihood is None
        else float(self.null_loglikelihood)
    )
    raw = RawEstimationResults(
        model_name=self.model_name,
        user_notes=self.user_notes,
        beta_names=self.expressions_registry.free_betas_names,
        beta_values=outcome.solution.tolist(),
        lower_bounds=[b[0] for b in self.expressions_registry.bounds],
        upper_bounds=[b[1] for b in self.expressions_registry.bounds],
        gradient=[],
        hessian=[[]],
        bhhh=[[]],
        null_log_likelihood=null_ll,
        initial_log_likelihood=None,
        final_log_likelihood=only_function.function,
        data_name=self.model_elements.database.name,
        sample_size=self.model_elements.sample_size,
        number_of_observations=self.model_elements.number_of_observations,
        monte_carlo=self.expressions_registry.requires_draws,
        number_of_draws=int(self.model_elements.draws_management.number_of_draws),
        types_of_draws=self.model_elements.draws_management.draw_types,
        number_of_excluded_data=self.model_elements.database.number_of_excluded_data,
        draws_processing_time=self.model_elements.draws_management.processing_time,
        optimization_messages=sanitized_messages,
        convergence=outcome.convergence,
        bootstrap=[],
        bootstrap_time=None,
    )
    return EstimationResults(raw_estimation_results=raw)
[docs]
def estimate_catalog(
    self,
    selected_configurations: set[Configuration] | None = None,
    quick_estimate: bool = False,
    recycle: bool = False,
    run_bootstrap: bool = False,
) -> dict[str, EstimationResults]:
    """Estimate all or selected versions of a model with Catalog's,
    corresponding to multiple specifications.

    :param selected_configurations: set of configurations. If
        None, all configurations are considered.
    :param quick_estimate: if True, the final statistics are not calculated.
    :param recycle: if True, the results are read from the pickle
        file, if it exists. If False, the estimation is performed.
    :param run_bootstrap: if True, bootstrapping is applied.
    :return: object containing the estimation results associated
        with the name of each specification, as well as a
        description of each configuration
    :raises BiogemeError: if no log likelihood has been specified.
    :raises ValueOutOfRange: if the number of specifications exceeds
        ``maximum_number_catalog_expressions``.
    """
    if self.short_names is None:
        # Use model_name, consistently with the rest of this class.
        self.short_names = ModelNames(prefix=self.model_name)
    if self.log_like is None:
        raise BiogemeError('No log likelihood function has been specified')
    central_controller = CentralController(
        expression=self.log_like,
        maximum_number_of_configurations=self.biogeme_parameters.get_value(
            'maximum_number_catalog_expressions'
        ),
    )
    if selected_configurations is None:
        number_of_specifications = central_controller.number_of_configurations()
        logger.info(f'Estimating {number_of_specifications} models.')
        # Refuse to enumerate an unknown or excessive number of specifications.
        if (
            number_of_specifications is None
            or number_of_specifications > self.maximum_number_catalog_expressions
        ):
            error_msg = (
                f'There are too many [{number_of_specifications}] different '
                f'specifications for the log likelihood function. This is '
                f'above the maximum number: '
                f'{self.maximum_number_catalog_expressions}. Simplify '
                f'the specification, change the value of the parameter '
                f'maximum_number_catalog_expressions, or consider using '
                f'the AssistedSpecification object in the "biogeme.assisted" '
                f'module.'
            )
            raise ValueOutOfRange(error_msg)
        the_iterator = SelectedConfigurationsIterator(
            the_central_controller=central_controller
        )
    else:
        the_iterator = SelectedConfigurationsIterator(
            the_central_controller=central_controller,
            selected_configurations=selected_configurations,
        )
    configurations = {}
    for config in the_iterator:
        config_id = config.get_string_id()
        # Build a BIOGEME instance dedicated to this configuration.
        b = self.__class__.from_configuration(
            config_id=config_id,
            multiple_expression=self.log_like,
            database=self.database,
            user_notes=self.user_notes,
            parameters=self.biogeme_parameters,
        )
        b.model_name = self.short_names(config_id)
        results = b.retrieve_saved_estimates() if recycle else None
        if results is None:
            results = (
                b.quick_estimate()
                if quick_estimate
                else b.estimate(recycle=recycle, run_bootstrap=run_bootstrap)
            )
        configurations[config_id] = results
    return configurations
[docs]
def validate(
    self,
    estimation_results: EstimationResults,
    slices: int,
    groups: str | None = None,
) -> list[ValidationResult]:
    """Perform out-of-sample validation of the model.

    The dataset is split into ``slices`` parts. Each part serves once as
    the validation set: the model is re-estimated on the remaining data,
    then applied to the held-out part to compute its log likelihood.

    :param estimation_results: Estimation results obtained from the full dataset.
    :param slices: Number of data splits to create for cross-validation.
    :param groups: Optional column name used to group data entries (e.g.,
        panel data). If provided, splitting preserves groups.
    :return: List of validation results, one for each data slice.
    :raises BiogemeError: If the dataset is structured as panel data and
        incompatible with validation.
    """
    # A local alias is required here; self.optimization_parameters cannot
    # be passed directly to the call below (reason unknown).
    validation_parameters = self.optimization_parameters
    second_derivatives_flag = self.biogeme_parameters.get_value(
        name='calculating_second_derivatives'
    )
    validation_parameters['calculating_second_derivatives'] = second_derivatives_flag
    return cross_validate_model(
        the_algorithm=self._algorithm_configuration(),
        modeling_elements=self.model_elements,
        parameters=validation_parameters,
        starting_values=estimation_results.get_beta_values(),
        slices=slices,
        numerically_safe=self.numerically_safe,
        groups=groups,
    )
def _algorithm_configuration(self) -> OptimizationAlgorithm:
    """Resolve and return the optimization algorithm to use.

    The 'automatic' setting is an alias: it is rewritten in place to
    'simple_bounds' before the lookup.
    """
    chosen = self.biogeme_parameters.get_value('optimization_algorithm')
    if chosen == 'automatic':
        self.biogeme_parameters.set_value('optimization_algorithm', 'simple_bounds')
    return algorithms.get(self.optimization_algorithm)
[docs]
def simulate_bayesian(
    self,
    bayesian_estimation_results: BayesianResults,
    lower_quantile: float = 0.025,
    upper_quantile: float = 0.975,
    percentage_of_draws_to_use: float = 10.0,
) -> pd.DataFrame:
    """
    Simulate all formulas in self.formulas over posterior draws and
    summarize them per observation.

    For each observation and each simulation formula, this returns the
    mean, lower_quantile and upper_quantile across the selected
    posterior draws.

    :param bayesian_estimation_results: results of a Bayesian estimation,
        providing the posterior draws.
    :param lower_quantile: lower quantile reported for each formula.
    :param upper_quantile: upper quantile reported for each formula.
    :param percentage_of_draws_to_use: percentage of the posterior draws
        used for the simulation. Values above 100 are capped at 100.
    :return: data frame with one row per observation and, for each formula,
        the mean and the two quantiles across the selected draws.
    :raises BiogemeError: if the percentage is not positive, if no draw is
        selected, or if no simulated column matches a formula name.
    """
    # Internal column names tracking draw and observation ids. Defined once
    # here (not inside the loop, as before) because they are also needed
    # after the loop for the group-by.
    _INTERNAL_OBS_COL = "__biogeme_internal_obs_id__"
    _INTERNAL_DRAW_COL = "__biogeme_internal_draw_id__"
    if percentage_of_draws_to_use <= 0:
        error_msg = f'Percentage must be positive, not {percentage_of_draws_to_use}'
        raise BiogemeError(error_msg)
    if percentage_of_draws_to_use > 100:
        warning_msg = f'Percentage cannot exceed 100. The value of 100 is assumed instead of {percentage_of_draws_to_use}.'
        logger.warning(warning_msg)
        percentage_of_draws_to_use = 100.0
    total_draws = int(bayesian_estimation_results.posterior_draws)
    number_of_draws_to_use = int(
        np.ceil(total_draws * percentage_of_draws_to_use / 100.0)
    )
    if number_of_draws_to_use <= 0:
        raise BiogemeError(
            'No posterior draws selected for simulation. '
            'Check percentage_of_draws_to_use and posterior_draws.'
        )
    if number_of_draws_to_use < 100:
        ideal_percentage = int(np.ceil(10_000 / total_draws))
        warning_msg = (
            f'Bayesian simulation performed with {percentage_of_draws_to_use}% of the draws, that is '
            f'{number_of_draws_to_use}/{total_draws} draws. It is advised to use at least 100 draws. You may want to adjust the parameter "percentage_of_draws_to_use={ideal_percentage}"'
        )
        logger.warning(warning_msg)
    else:
        info_msg = (
            f'Bayesian simulation performed with {percentage_of_draws_to_use}% of the draws, that is '
            f'{number_of_draws_to_use}/{total_draws} draws. Adjust the parameter "percentage_of_draws_to_use" if you need a different number of draws.'
        )
        logger.info(info_msg)
    # Flatten posterior to (draw, parameter) DataFrame
    ds = az.extract(
        bayesian_estimation_results.idata,
        combined=True,  # chains × draws flattened into a single sample dim (if supported)
    )
    draws_df = ds.to_dataframe().reset_index(drop=True)
    # In case posterior_draws and len(draws_df) are not exactly aligned
    number_of_draws_to_use = min(number_of_draws_to_use, len(draws_df))
    # Evenly spaced draw indices
    idx = np.linspace(
        0,
        len(draws_df) - 1,
        number_of_draws_to_use,
        dtype=int,
    )
    all_simulations: list[pd.DataFrame] = []
    for draw_id in tqdm(idx):
        # Restrict to parameters that actually appear in the model
        beta_values = {
            name: draws_df.iloc[draw_id][name]
            for name in self.free_betas_names
            if name in draws_df.columns
        }
        sim_i = self.simulate(beta_values)
        # Keep track of which draw and which observation
        sim_i = sim_i.copy()
        sim_i[_INTERNAL_DRAW_COL] = int(draw_id)
        sim_i[_INTERNAL_OBS_COL] = sim_i.index
        all_simulations.append(sim_i)
    all_simulations_df = pd.concat(all_simulations, ignore_index=True)
    # Names of formulas to summarize: keys of self.formulas
    formula_names = [
        name for name in self.formulas.keys() if name in all_simulations_df.columns
    ]
    if not formula_names:
        raise BiogemeError(
            'No simulation columns found in the results matching formula '
            f'names. Expected one of: {list(self.formulas.keys())}, '
            f'but found columns: {list(all_simulations_df.columns)}'
        )
    # Group by observation and compute summary stats for each formula
    grouped = all_simulations_df.groupby(_INTERNAL_OBS_COL)[formula_names]

    def _named_quantile_func(q: float):
        """Return a quantile function with a stable __name__ like 'q025'."""

        def _q(x):
            return float(np.quantile(x, q))

        # e.g. q=0.025 -> "q025", q=0.975 -> "q975"
        _q.__name__ = f"q{int(round(q * 1000)):03d}"
        return _q

    q_lower_fn = _named_quantile_func(lower_quantile)
    q_upper_fn = _named_quantile_func(upper_quantile)
    summary = grouped.agg(
        {
            col: [
                "mean",
                q_lower_fn,
                q_upper_fn,
            ]
            for col in formula_names
        }
    )
    # Flatten the (formula, statistic) MultiIndex into single column names.
    summary.columns = [
        f"{col}_{stat}" for col, stat in summary.columns.to_flat_index()
    ]
    return summary
[docs]
def simulate(self, the_beta_values: dict[str, float] | None) -> pd.DataFrame:
    """Evaluate every simulation formula on each row of the database.

    :param the_beta_values: Dictionary mapping parameter names to values.
        If None, an exception is raised. Use results.get_beta_values()
        after estimation or provide explicit values.
    :return: A pandas DataFrame with one row per observation in the
        database and one column per simulation formula.
    :raises BiogemeError: If the_beta_values is None or is not a dict.
    """
    # Guard clause: beta values are now mandatory.
    if the_beta_values is None:
        current_beta_values = self.expressions_registry.free_betas
        err = (
            f'Contrarily to previous versions of Biogeme, '
            f'the values of Beta must '
            f'now be explicitly mentioned. If they have been estimated, they can be obtained from '
            f'results.get_beta_values(). If not, use the default values: {current_beta_values}'
        )
        raise BiogemeError(err)
    if not isinstance(the_beta_values, dict):
        error_msg = f'the_beta_values must be a dict, and not an object of type {type(the_beta_values)}'
        raise BiogemeError(error_msg)
    evaluator: MultiRowEvaluator = MultiRowEvaluator(
        model_elements=self.model_elements,
        numerically_safe=self.numerically_safe,
        use_jit=self.use_jit,
    )
    return evaluator.evaluate(the_beta_values)
[docs]
def confidence_intervals(
    self, beta_values: list[dict[str, float]], interval_size: float = 0.9
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Calculate confidence intervals on the simulated quantities.

    :param beta_values: array of parameters values to be used in
        the calculations. Typically, it is a sample drawn from
        a distribution.
    :type beta_values: list(dict(str: float))
    :param interval_size: size of the reported confidence interval,
        in percentage. If it is denoted by s, the interval
        is calculated for the quantiles (1-s)/2 and
        (1+s)/2. The default (0.9) corresponds to
        quantiles for the confidence interval [0.05, 0.95].
    :type interval_size: float
    :return: two pandas data frames 'left' and 'right' with the
        same dimensions. Each row corresponds to a row in the
        database, and each column to a formula. 'left' contains the
        left value of the confidence interval, and 'right' the right
        value.

    Example::

        # Read the estimation results from a file
        results = EstimationResults.from_yaml_file(filename='my_model.yaml')
        # Retrieve the names of the betas parameters that have been
        # estimated
        betas = biogeme.freeBetaNames
        # Draw 100 realization of the distribution of the estimators
        b = results.getBetasForSensitivityAnalysis(betas, size=100)
        # Simulate the formulas using the nominal values
        simulatedValues = biogeme.simulate(beta_values)
        # Calculate the confidence intervals for each formula
        left, right = biogeme.confidenceIntervals(b, 0.9)

    :rtype: tuple of two Pandas dataframes.
    """
    # One simulation per draw of the parameters.
    simulated = [self.simulate(draw) for draw in tqdm(beta_values)]
    stacked = pd.concat(simulated)
    # Tail probability on each side of the interval.
    alpha = (1.0 - interval_size) / 2.0
    per_observation = stacked.groupby(level=0)
    left = per_observation.quantile(alpha)
    right = per_observation.quantile(1.0 - alpha)
    return left, right
def __str__(self) -> str:
    """Readable summary: model name, database name, and the formulas."""
    header = f'{self.model_name}: database [{self.model_elements.database.name}]'
    return header + str(self.formulas)
[docs]
@deprecated_parameters({'beta': None})
def check_derivatives(self, verbose: bool = False) -> CheckDerivativesResults:
    """Verifies the implementation of the derivatives.

    It compares the analytical version with the finite differences
    approximation.

    :param verbose: if True, the comparisons are reported. Default: False.
    :return: object gathering

        - f, the value of the function,
        - g, the analytical gradient,
        - h, the analytical hessian,
        - gdiff, the difference between the analytical and the
          finite differences gradient,
        - hdiff, the difference between the analytical and the
          finite differences hessian.
    """
    # The check is performed at the current initial values of the free
    # parameters (an empty dict is completed with the defaults).
    starting_values = (
        self.model_elements.expressions_registry.complete_dict_of_free_beta_values(
            the_betas={}
        )
    )
    # Wrap the compiled log likelihood so it can be called as a plain
    # function of the beta values.
    the_log_likelihood: CallableExpression = function_from_compiled_formula(
        the_compiled_function=self.function_evaluator,
        the_betas=starting_values,
    )
    # Delegates to the module-level `check_derivatives` helper; inside this
    # method body the name resolves at module scope, not to this method.
    return check_derivatives(
        the_log_likelihood,
        np.asarray(
            self.model_elements.expressions_registry.list_of_free_betas_init_values
        ),
        self.model_elements.expressions_registry.free_betas_names,
        verbose,
    )