Source code for biogeme.biogeme

"""
Implementation of the main Biogeme class

:author: Michel Bierlaire
:date: Tue Mar 26 16:45:15 2019

It combines the database and the model specification.
"""

from __future__ import annotations

import difflib
import logging
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from biogeme.biogeme_logging import suppress_logs
from biogeme.calculator import (
    CallableExpression,
    CompiledFormulaEvaluator,
    MultiRowEvaluator,
    function_from_compiled_formula,
)
from biogeme.calculator.simple_formula import evaluate_simple_expression
from biogeme.catalog import (
    CentralController,
    Configuration,
    SelectedConfigurationsIterator,
)
from biogeme.constants import LOG_LIKE, WEIGHT
from biogeme.database import Database
from biogeme.default_parameters import ParameterValue
from biogeme.deprecated import deprecated_parameters
from biogeme.dict_of_formulas import (
    check_validity,
    get_expression,
    insert_valid_keyword,
)
from biogeme.draws import RandomNumberGeneratorTuple
from biogeme.exceptions import BiogemeError, ValueOutOfRange
from biogeme.expressions import (
    Expression,
    ExpressionOrNumeric,
    MultipleSum,
    log,
)
from biogeme.filenames import get_new_file_name
from biogeme.function_output import (
    FunctionOutput,
)
from biogeme.likelihood import AlgorithmResults, model_estimation
from biogeme.likelihood.bootstrap import bootstrap
from biogeme.model_elements import ModelElements
from biogeme.optimization import OptimizationAlgorithm, algorithms
from biogeme.parameters import (
    DEFAULT_FILE_NAME as DEFAULT_PARAMETER_FILE_NAME,
    Parameters,
)
from biogeme.results_processing import (
    RawEstimationResults,
    generate_html_file,
)
from biogeme.results_processing.estimation_results import EstimationResults
from biogeme.second_derivatives import SecondDerivativesMode
from biogeme.tools import (
    CheckDerivativesResults,
    ModelNames,
    check_derivatives,
    safe_serialize_array,
)
from biogeme.tools.files import files_of_type
from biogeme.validation import ValidationResult, cross_validate_model
from tqdm import tqdm

DEFAULT_MODEL_NAME = 'biogeme_model_default_name'
logger = logging.getLogger(__name__)


[docs] class BIOGEME: """Main class that combines the database and the model specification. It works in two modes: estimation and simulation. The following attributes are imported from the parameter file. """ # Type hints for dynamically injected attributes __annotations__ = { 'seed': int, 'tolerance': float, 'steptol': float, 'max_iterations': int, 'number_of_threads': int, 'enlarging_factor': float, 'initial_radius': float, 'infeasible_cg': bool, 'second_derivatives': int, 'dogleg': bool, 'bootstrap_samples': int, 'save_iterations': bool, 'generate_html': bool, 'generate_yaml': bool, 'optimization_algorithm': str, 'maximum_number_catalog_expressions': int, 'max_number_parameters_to_report': int, } def __init__( self, database: Database, formulas: Expression | dict[str, Expression], random_number_generators: dict[str:RandomNumberGeneratorTuple] | None = None, user_notes: str | None = None, parameters: str | Parameters | None = None, **kwargs, ): """Constructor :param database: choice data. :param formulas: expression or dictionary of expressions that define the model specification. The concept is that each expression is applied to each entry of the database. The keys of the dictionary allow to provide a name to each formula. In the estimation mode, two formulas are needed, with the keys 'loglike' and 'weight'. If only one formula is provided, it is associated with the label 'loglike'. If no formula is labeled 'weight', the weight of each piece of data is supposed to be 1.0. In the simulation mode, the labels of each formula are used as labels of the resulting database. :param random_number_generators: user defined random number generators. :param user_notes: these notes will be included in the report file. :param parameters: name of the .toml file where the parameters are read, or Parameters object containing :raise BiogemeError: an audit of the formulas is performed. If a formula has issues, an error is detected and an exception is raised. """ if isinstance(parameters, Parameters): logger.info('Biogeme parameters provided by the user.') self.biogeme_parameters: Parameters = parameters else: self.biogeme_parameters: Parameters = Parameters() if isinstance(parameters, str): self.biogeme_parameters.read_file(parameters) else: if parameters is None: self.biogeme_parameters.read_file(DEFAULT_PARAMETER_FILE_NAME) else: error_msg = ( f'Argument "parameters" is of wrong type: {type(parameters)}' ) raise AttributeError(error_msg) database.reset_indices() self.parameter_file: str = self.biogeme_parameters.file_name self.html_filename: str | None = None self.yaml_filename: str | None = None # We allow the values of the parameters to be set with arguments self.biogeme_parameters.set_several_parameters(dict_of_parameters=kwargs) for name in self.biogeme_parameters.parameter_names: self._define_property(name) self.maximum_number_of_observations_per_individual: int | None = None self.database = database self.log_like_name: str = LOG_LIKE """ Keywords used for the name of the loglikelihood formula. Default: 'log_like'""" self.log_like_valid_names: list[str] = [LOG_LIKE, 'loglike'] self.weight_name = WEIGHT """Keyword used for the name of the weight formula. Default: 'weight' """ self.weight_valid_names = [WEIGHT, 'weights'] self.model_name = DEFAULT_MODEL_NAME """Name of the model. Default: 'biogemeModelDefaultName' """ self.second_derivatives_mode = SecondDerivativesMode( self.calculating_second_derivatives ) if self.seed != 0: np.random.seed(self.seed) self.short_names: ModelNames | None = None self.formulas = self._normalize_formulas(formulas) self.user_notes = user_notes #: User notes self.init_loglikelihood = None #: Init value of the likelihood function self.null_loglikelihood = None #: Log likelihood of the null model self.best_iteration = None #: Store the best iteration found so far. self._model_elements = ModelElements( expressions=self.formulas, database=self.database, number_of_draws=self.biogeme_parameters.get_value(name='number_of_draws'), user_defined_draws=random_number_generators, use_jit=self.biogeme_parameters.get_value(name='use_jit'), ) self._function_evaluator = ( CompiledFormulaEvaluator( model_elements=self._model_elements, second_derivatives_mode=self.second_derivatives_mode, numerically_safe=self.numerically_safe, ) if self.contains_log_likelihood() else None )
[docs] @classmethod def from_configuration_and_controller( cls, config_id: str, central_controller: CentralController, database: Database, user_notes: str | None = None, parameters: str | Parameters | None = None, **kwargs, ) -> BIOGEME: """Obtain the Biogeme object corresponding to the configuration of a multiple expression :param config_id: identifier of the configuration :param central_controller: central controller for the multiple expression containing all the catalogs. :param database: database to be passed to the Biogeme object :param user_notes: these notes will be included in the report file. :param parameters: object with the parameters """ if central_controller.all_configurations: # We verify that the configuration is valid the_set = central_controller.all_configurations_ids if not the_set: error_msg = "No configuration found in the expression" raise BiogemeError(error_msg) if config_id not in the_set: close_matches = difflib.get_close_matches(config_id, the_set) if close_matches: error_msg = ( f"Unknown configuration: [{config_id}]. " f"Did you mean [{close_matches[0]}]?" ) else: error_msg = f"Unknown configuration: {config_id}." raise BiogemeError(error_msg) the_configuration = Configuration.from_string(config_id) central_controller.set_configuration(the_configuration) if user_notes is None: user_notes = the_configuration.get_html() flat_expression = central_controller.expression.deep_flat_copy() return cls( database=database, formulas=flat_expression, user_notes=user_notes, parameters=parameters, **kwargs, )
[docs] @classmethod def from_configuration( cls, config_id: str, multiple_expression: Expression, database: Database, user_notes: str | None = None, parameters: str | Parameters | None = None, **kwargs, ) -> BIOGEME: """Obtain the Biogeme object corresponding to the configuration of a multiple expression :param config_id: identifier of the configuration :param multiple_expression: multiple expression containing the catalog. :param database: database to be passed to the Biogeme object :param user_notes: these notes will be included in the report file. :param parameters: object with the parameters """ central_controller = CentralController(expression=multiple_expression) return cls.from_configuration_and_controller( config_id=config_id, central_controller=central_controller, database=database, user_notes=user_notes, parameters=parameters, **kwargs, )
def _define_property(self, name: str): def getter(obj): return obj.biogeme_parameters.get_value(name=name) def setter(obj, value): error_msg = ( f"Direct assignment to '{name}' is not allowed. Please set the value in the " f"constructor or in the .toml file." ) raise BiogemeError(error_msg) prop = property(getter, setter) setattr(self.__class__, name, prop) @property def sample_size(self): return self.model_elements.sample_size @property def number_of_observations(self): return self.model_elements.number_of_observations @property def generate_pickle(self) -> bool: warnings.warn( "'generate_pickle' is deprecated. Use 'generate_yaml' instead.", DeprecationWarning, stacklevel=2, ) return False @generate_pickle.setter def generate_pickle(self, value: bool) -> None: warnings.warn( "'generate_pickle' is deprecated. Use 'generate_yaml' instead. This statement is ignored", DeprecationWarning, stacklevel=2, ) @property def modelName(self) -> str: return self.model_name @modelName.setter def modelName(self, value: str) -> None: warnings.warn( "'modelName' is deprecated. Please use 'model_name' instead.", DeprecationWarning, stacklevel=2, ) self.model_name = value @property def expressions_registry(self): return self._model_elements.expressions_registry @property def log_like(self) -> Expression | None: return get_expression( dict_of_formulas=self.formulas, valid_keywords=self.log_like_valid_names )
[docs] def contains_log_likelihood(self) -> bool: return self.log_like is not None
@property def weight(self) -> Expression | None: return get_expression( dict_of_formulas=self.formulas, valid_keywords=self.weight_valid_names ) @property def model_elements(self) -> ModelElements | None: return self._model_elements @property def function_evaluator(self) -> CompiledFormulaEvaluator: return self._function_evaluator def _normalize_formulas( self, formulas: Expression | dict[str, Expression] ) -> dict[str, Expression]: """ Normalize user input to a dictionary of expressions. Raises BiogemeError if input is invalid. :param formulas: a single expression or a dictionary of named expressions. :return: a dictionary mapping names to expressions. """ if isinstance(formulas, Expression): return {self.log_like_name: formulas} if isinstance(formulas, dict): check_validity(formulas) return insert_valid_keyword( dict_of_formulas=formulas, reference_keyword=self.log_like_name, valid_keywords=self.log_like_valid_names, ) raise BiogemeError( f'Invalid type for formulas: {type(formulas)}. Expected Expression or dict.' )
[docs] def is_model_complex(self) -> bool: """Check if the model is potentially complex to estimate""" return self.log_like.is_complex()
@property def loglike(self) -> Expression: """For backward compatibility""" return self.log_like @property def function_parameters(self) -> dict[str, ParameterValue]: """Prepare the parameters for the function""" return { "tolerance": self.biogeme_parameters.get_value('tolerance'), "steptol": self.biogeme_parameters.get_value('steptol'), } @property def algo_parameters(self) -> dict[str, ParameterValue]: """Prepare the parameters for the optimization algorithm.""" common_bounds_params = { 'infeasibleConjugateGradient': self.biogeme_parameters.get_value( 'infeasible_cg' ), 'radius': self.biogeme_parameters.get_value('initial_radius'), 'enlargingFactor': self.biogeme_parameters.get_value('enlarging_factor'), 'maxiter': self.biogeme_parameters.get_value('max_iterations'), 'max_number_parameters_to_report': self.biogeme_parameters.get_value( 'max_number_parameters_to_report' ), } if self.optimization_algorithm == 'automatic': if ( self.is_model_complex() or self.second_derivatives_mode == SecondDerivativesMode.NEVER ): logger.info( 'As the model is rather complex, we cancel the calculation of second derivatives. ' 'If you want to control the parameters, change the algorithm from "automatic" ' 'to "simple_bounds" in the TOML file.' ) algo_parameters = common_bounds_params | { "proportionAnalyticalHessian": 0, } else: logger.info( 'As the model is not too complex, we activate the calculation of second derivatives. ' 'To change this behavior, modify the algorithm to "simple_bounds" in the TOML file.' ) algo_parameters = common_bounds_params | { "proportionAnalyticalHessian": 1, } return algo_parameters if self.optimization_algorithm == "simple_bounds": algo_parameters = common_bounds_params | { "proportionAnalyticalHessian": self.biogeme_parameters.get_value( 'second_derivatives' ), } if ( self.second_derivatives_mode == SecondDerivativesMode.FINITE_DIFFERENCES and self.biogeme_parameters.get_value('second_derivatives') > 0 ): warning = ( f'The proportion of the analytical hessian is not zero, and the second derivatives are approximated by ' f'finite difference. It may not be the desired configuration.' ) logger.warning(warning) if ( self.second_derivatives_mode == SecondDerivativesMode.NEVER and self.biogeme_parameters.get_value('second_derivatives') > 0 ): error_msg = ( f'The proportion of the analytical hessian is not zero, and the second derivatives cannot be ' f'evaluated. The parameters "calculating_second_derivatives" and "second_derivatives" are inconsistent.' ) return algo_parameters if self.optimization_algorithm in { "simple_bounds_newton", "simple_bounds_BFGS", }: return common_bounds_params if self.optimization_algorithm in {"TR-newton", "TR-BFGS"}: algo_parameters = common_bounds_params | { "dogleg": self.biogeme_parameters.get_value('dogleg'), "radius": self.biogeme_parameters.get_value('initial_radius'), "maxiter": self.biogeme_parameters.get_value('max_iterations'), } return algo_parameters if self.optimization_algorithm in {"LS-newton", "LS-BFGS"}: return common_bounds_params return common_bounds_params @property def optimization_parameters(self) -> dict[str, ParameterValue]: return self.function_parameters | self.algo_parameters def _save_iterations_file_name(self) -> str: """ :return: The name of the file where the iterations are saved. :rtype: str """ return f"__{self.model_name}.iter" @property def free_betas_names(self) -> list[str]: """Returns the names of the parameters that must be estimated :return: list of names of the parameters :rtype: list(str) """ return self.expressions_registry.free_betas_names
[docs] def number_unknown_parameters(self) -> int: """Returns the number of parameters that must be estimated :return: number of parameters :rtype: int """ return self.expressions_registry.number_of_free_betas
[docs] def calculate_null_loglikelihood(self, avail: dict[int, ExpressionOrNumeric]): """Calculate the log likelihood of the null model that predicts equal probability for each alternative :param avail: list of expressions to evaluate the availability conditions for each alternative. If 1 is provided, it is always available. :type avail: list of :class:`biogeme.expressions.Expression` :return: value of the log likelihood :rtype: float """ expression = -log(MultipleSum(avail)) result = evaluate_simple_expression( expression=expression, database=self.database, numerically_safe=False, use_jit=self.use_jit, ) self.null_loglikelihood = result return self.null_loglikelihood
[docs] def calculate_init_likelihood(self) -> float: """Calculate the value of the log likelihood function The default values of the parameters are used. :return: value of the log likelihood. :rtype: float. """ # Value of the loglikelihood for the default values of the parameters. result = self.function_evaluator.evaluate( the_betas=self.expressions_registry.free_betas_init_values, gradient=False, hessian=False, bhhh=False, ) return result.function
def _get_likelihood_function( self, ) -> CallableExpression: return function_from_compiled_formula( the_compiled_function=self.function_evaluator, the_betas=self.expressions_registry.free_betas_init_values, )
[docs] def report_array(self, array: np.ndarray, with_names: bool = True) -> str: """Reports the entries of the array up to the maximum number :param array: array to report :type array: numpy.array :param with_names: if True, the names of the parameters are included :type with_names: bool :return: string reporting the values :rtype: str """ length = min( array.size, self.biogeme_parameters.get_value('max_number_parameters_to_report'), ) if with_names: names = self.expressions_registry.free_betas_names report = ", ".join( [ f"{name}={value:.2g}" for name, value in zip(names[:length], array[:length]) ] ) return report report = ", ".join([f"{value:.2g}" for value in array[:length]]) return report
def _load_saved_iteration(self) -> dict[str, float]: """Reads the values of the parameters from a text file where each line has the form name_of_beta = value_of_beta, and use these values in all formulas. """ filename = self._save_iterations_file_name() betas = {} try: with open(filename, encoding="utf-8") as fp: for line in fp: ell = line.split("=") betas[ell[0].strip()] = float(ell[1]) self.change_init_values(betas) logger.info(f"Parameter values restored from {filename}") return betas except OSError: logger.info(f"Cannot read file {filename}. Statement is ignored.") return {}
[docs] def set_random_init_values(self, default_bound: float = 100.0) -> None: """Modifies the initial values of the parameters in all formulas, using randomly generated values. The value is drawn from a uniform distribution on the interval defined by the bounds. :param default_bound: If the upper bound is missing, it is replaced by this value. If the lower bound is missing, it is replaced by the opposite of this value. Default: 100. :type default_bound: float """ random_betas = { beta.name: np.random.uniform( low=-default_bound if beta.lower_bound is None else beta.lower_bound, high=default_bound if beta.upper_bound is None else beta.upper_bound, ) for beta in self.expressions_registry.free_betas } self.change_init_values(random_betas)
[docs] def change_init_values(self, betas: dict[str, float]) -> None: """Modifies the initial values of the parameters in all formula :param betas: dictionary where the keys are the names of the parameters, and the values are the new value for the parameters. :type betas: dict(string:float) """ for expression in self.formulas.values(): expression.change_init_values(betas) for beta in self.expressions_registry.free_betas: value = betas.get(beta.name) if value is not None: beta.init_value = value
def _load_saved_estimates(self) -> EstimationResults: yaml_files = files_of_type('yaml', name=self.model_name) yaml_files.sort() if yaml_files: yaml_to_read = yaml_files[-1] if len(yaml_files) > 1: warning_msg = ( f"Several files .yaml are available for " f"this model: {yaml_files}. " f"The file {yaml_to_read} " f"is used to load the results." ) logger.warning(warning_msg) results = EstimationResults.from_yaml_file(filename=yaml_to_read) logger.warning( f'Estimation results read from {yaml_to_read}. ' f'There is no guarantee that they correspond ' f'to the specified model.' ) return results raise BiogemeError(f'No yaml file has been found for model {self.model_name}') def _bootstrap(self, estimated_parameters: dict[str, float]) -> list[np.ndarray]: number_of_bootstrap_samples = self.biogeme_parameters.get_value( 'bootstrap_samples' ) if number_of_bootstrap_samples == 0: return [] logger.info( f"Re-estimate the model {self.biogeme_parameters.get_value('bootstrap_samples')} " f"times for bootstrapping" ) with suppress_logs(level=logging.WARNING): # For some reason, the self.optimization_parameters cannot be used as such in the function call below. # I have no clue why. opt_parameters = self.optimization_parameters bootstrap_results: list[AlgorithmResults] = bootstrap( number_of_bootstrap_samples=number_of_bootstrap_samples, the_algorithm=self._algorithm_configuration(), modeling_elements=self.model_elements, parameters=opt_parameters, starting_values=estimated_parameters, second_derivatives_mode=self.biogeme_parameters.get_value( name='calculating_second_derivatives' ), numerically_safe=self.numerically_safe, number_of_jobs=self.biogeme_parameters.get_value(name='number_of_jobs'), use_jit=self.use_jit, ) return [result.solution for result in bootstrap_results]
[docs] def retrieve_saved_estimates(self) -> EstimationResults | None: """ Attempt to retrieve previously saved estimation results from a YAML file. :return: An EstimationResults object if a saved result is found. If no file is found or loading fails, None is returned and a warning is logged. :raises BiogemeError: Raised internally by _load_saved_estimates if loading fails, and is caught to log a warning instead. """ try: return self._load_saved_estimates() except BiogemeError as e: logger.warning(e) return None
[docs] def estimate( self, starting_values: dict[str, float] | None = None, recycle: bool = False, run_bootstrap: bool = False, **kwargs, ) -> EstimationResults: """Estimate the parameters of the model(s). :return: object containing the estimation results. :rtype: biogeme.bioResults Example:: # Create an instance of biogeme biogeme = bio.BIOGEME(database, logprob) # Gives a name to the model biogeme.modelName = 'mymodel' # Estimate the parameters results = biogeme.estimate() :raises BiogemeError: if no expression has been provided for the likelihood """ if kwargs.get("bootstrap") is not None: error_msg = ( 'Parameter "bootstrap" is deprecated. In order to perform ' "bootstrapping, bootstrap_samples=100 to a positive number in the biogeme.toml file [" "e.g. bootstrap_samples=100]." ) raise BiogemeError(error_msg) if kwargs.get("algorithm") is not None: error_msg = ( 'The parameter "algorithm" is deprecated. Instead, define the ' 'parameter "optimization_algorithm" in section "[Estimation]" ' "of the TOML parameter file" ) raise BiogemeError(error_msg) if kwargs.get("algo_parameters") is not None: error_msg = ( 'The parameter "algo_parameters" is deprecated. Instead, define the ' 'parameters "max_iterations" and "tolerance" in section ' '"[SimpleBounds]" ' "of the TOML parameter file" ) raise BiogemeError(error_msg) if kwargs.get("algoParameters") is not None: error_msg = ( 'The parameter "algoParameters" is deprecated. Instead, define the ' 'parameters "max_iterations" and "tolerance" in section ' '"[SimpleBounds]" ' "of the TOML parameter file" ) raise BiogemeError(error_msg) if kwargs: unexpected = ', '.join(kwargs.keys()) raise BiogemeError( f'Ignoring unexpected arguments passed to estimate(): {unexpected}. ' f'This method does not accept any parameters. Use biogeme.toml or the BIOGEME object to set parameters.' ) if self.log_like is None: raise BiogemeError("No log likelihood function has been specified") if recycle: try: return self._load_saved_estimates() except BiogemeError as e: logger.warning(e) if self.model_name == DEFAULT_MODEL_NAME: logger.warning( f'You have not defined a name for the model. ' f'The output files are named from the model name. ' f'The default is [{DEFAULT_MODEL_NAME}]' ) if self.expressions_registry.number_of_free_betas == 0: raise BiogemeError( f"There is no parameter to estimate" f" in the formula: {self.log_like}." ) save_iteration_file_name = None saved_starting_values = {} if self.biogeme_parameters.get_value('save_iterations'): logger.info( f"*** Initial values of the parameters are " f"obtained from the file {self._save_iterations_file_name()}" ) save_iteration_file_name = self._save_iterations_file_name() saved_starting_values = self._load_saved_iteration() if starting_values is None: starting_values = saved_starting_values.copy() else: for key, value in saved_starting_values.items(): starting_values.setdefault(key, value) logger.info(f'Starting values for the algorithm: {starting_values}') init_log_likelihood = self.calculate_init_likelihood() logger.debug(f'Init log likelihood: {init_log_likelihood}') # For some reason, the self.optimization_parameters cannot be used as such in the function call below. # I have no clue why. opt_parameters = self.optimization_parameters algorithm_results: AlgorithmResults = model_estimation( the_algorithm=self._algorithm_configuration(), function_evaluator=self.function_evaluator, parameters=opt_parameters, some_starting_values=starting_values, save_iterations_filename=save_iteration_file_name, ) if algorithm_results.convergence: logger.info('Optimization algorithm has converged.') else: logger.info('Optimization algorithm has *not* converged.') for key, msg in algorithm_results.optimization_messages.items(): logger.info(f'{key}: {msg}') optimal_betas = self.expressions_registry.get_named_betas_values( algorithm_results.solution ) calculate_hessian = self.second_derivatives_mode != SecondDerivativesMode.NEVER if calculate_hessian: logger.info('Calculate second derivatives and BHHH') else: logger.info('Calculate BHHH') f_g_h_b: FunctionOutput = self.function_evaluator.evaluate( the_betas=optimal_betas, gradient=True, hessian=calculate_hessian, bhhh=True, ) if run_bootstrap: start_time = datetime.now() bootstrap_results = self._bootstrap(estimated_parameters=optimal_betas) # Time needed to generate the bootstrap results bootstrap_time = datetime.now() - start_time else: bootstrap_results = [] bootstrap_time = None clean_optimization_messages = {} for key, value in algorithm_results.optimization_messages.items(): if isinstance(value, np.floating): clean_optimization_messages[key] = float(value) elif isinstance(value, np.ndarray): clean_optimization_messages[key] = value.tolist() else: clean_optimization_messages[key] = value null_log_likelihood = ( float(self.null_loglikelihood) if self.null_loglikelihood is not None else None ) the_hessian = ( None if f_g_h_b.hessian is None else safe_serialize_array(f_g_h_b.hessian) ) raw_estimation_results = RawEstimationResults( model_name=self.model_name, user_notes=self.user_notes, beta_names=self.expressions_registry.free_betas_names, beta_values=algorithm_results.solution.tolist(), lower_bounds=[bound[0] for bound in self.expressions_registry.bounds], upper_bounds=[bound[1] for bound in self.expressions_registry.bounds], gradient=safe_serialize_array(f_g_h_b.gradient), hessian=the_hessian, bhhh=safe_serialize_array(f_g_h_b.bhhh), null_log_likelihood=null_log_likelihood, initial_log_likelihood=init_log_likelihood, final_log_likelihood=f_g_h_b.function, data_name=self.model_elements.database.name, sample_size=self.model_elements.sample_size, number_of_observations=self.model_elements.number_of_observations, monte_carlo=self.expressions_registry.requires_draws, number_of_draws=int(self.model_elements.draws_management.number_of_draws), types_of_draws=self.model_elements.draws_management.draw_types, number_of_excluded_data=self.model_elements.database.number_of_excluded_data, draws_processing_time=self.model_elements.draws_management.processing_time, optimization_messages=clean_optimization_messages, convergence=algorithm_results.convergence, bootstrap=[one_result.tolist() for one_result in bootstrap_results], bootstrap_time=bootstrap_time, ) estimation_results = EstimationResults( raw_estimation_results=raw_estimation_results ) estimated_betas = estimation_results.get_beta_values() for f in self.formulas.values(): f.change_init_values(estimated_betas) if not estimation_results.algorithm_has_converged: logger.warning( 'It seems that the optimization algorithm did not converge. ' 'Therefore, the results may not correspond to the maximum ' 'likelihood estimator. Check the specification of the model, ' 'or the criteria for convergence of the algorithm.' ) if self.biogeme_parameters.get_value('generate_html'): self.html_filename = get_new_file_name(self.model_name, 'html') generate_html_file( filename=self.html_filename, estimation_results=estimation_results ) if self.biogeme_parameters.get_value('generate_yaml'): self.yaml_filename = get_new_file_name(self.model_name, 'yaml') estimation_results.dump_yaml_file(filename=self.yaml_filename) return estimation_results
[docs] def quick_estimate(self) -> EstimationResults: """| Estimate the parameters of the model. Same as estimate, where any extra calculation is skipped (init loglikelihood, t-statistics, etc.) :return: object containing the estimation results. :rtype: biogeme.results.bioResults Example:: # Create an instance of biogeme biogeme = bio.BIOGEME(database, logprob) # Gives a name to the model biogeme.modelName = 'mymodel' # Estimate the parameters results = biogeme.quickEstimate() :raises BiogemeError: if no expression has been provided for the likelihood """ # For some reason, the self.optimization_parameters cannot be used as such in the function call below. # I have no clue why. save_iteration_file_name = None starting_values = {} if self.biogeme_parameters.get_value('save_iterations'): logger.info( f"*** Initial values of the parameters are " f"obtained from the file {self._save_iterations_file_name()}" ) save_iteration_file_name = self._save_iterations_file_name() starting_values = self._load_saved_iteration() # For some reason, the self.optimization_parameters cannot be used as such in the function call below. # I have no clue why. opt_parameters = self.optimization_parameters algorithm_results: AlgorithmResults = model_estimation( the_algorithm=self._algorithm_configuration(), function_evaluator=self.function_evaluator, parameters=opt_parameters, some_starting_values=starting_values, save_iterations_filename=save_iteration_file_name, ) optimal_betas = self.expressions_registry.get_named_betas_values( algorithm_results.solution ) f_g_h_b: FunctionOutput = self.function_evaluator.evaluate( the_betas=optimal_betas, gradient=False, hessian=False, bhhh=False, ) clean_optimization_messages = {} for key, value in algorithm_results.optimization_messages.items(): if isinstance(value, np.floating): clean_optimization_messages[key] = float(value) elif isinstance(value, np.ndarray): clean_optimization_messages[key] = value.tolist() else: clean_optimization_messages[key] = value null_log_likelihood = ( float(self.null_loglikelihood) if self.null_loglikelihood is not None else None ) raw_estimation_results = RawEstimationResults( model_name=self.model_name, user_notes=self.user_notes, beta_names=self.expressions_registry.free_betas_names, beta_values=algorithm_results.solution.tolist(), lower_bounds=[bound[0] for bound in self.expressions_registry.bounds], upper_bounds=[bound[1] for bound in self.expressions_registry.bounds], gradient=[], hessian=[[]], bhhh=[[]], null_log_likelihood=null_log_likelihood, initial_log_likelihood=None, final_log_likelihood=f_g_h_b.function, data_name=self.model_elements.database.name, sample_size=self.model_elements.sample_size, number_of_observations=self.model_elements.number_of_observations, monte_carlo=self.expressions_registry.requires_draws, number_of_draws=int(self.model_elements.draws_management.number_of_draws), types_of_draws=self.model_elements.draws_management.draw_types, number_of_excluded_data=self.model_elements.database.number_of_excluded_data, draws_processing_time=self.model_elements.draws_management.processing_time, optimization_messages=clean_optimization_messages, convergence=algorithm_results.convergence, bootstrap=[], bootstrap_time=None, ) return EstimationResults(raw_estimation_results=raw_estimation_results)
[docs] def estimate_catalog( self, selected_configurations: set[Configuration] = None, quick_estimate: bool = False, recycle: bool = False, run_bootstrap: bool = False, ) -> dict[str, EstimationResults]: """Estimate all or selected versions of a model with Catalog's, corresponding to multiple specifications. :param selected_configurations: set of configurations. If None, all configurations are considered. :param quick_estimate: if True, the final statistics are not calculated. :param recycle: if True, the results are read from the pickle file, if it exists. If False, the estimation is performed. :param run_bootstrap: if True, bootstrapping is applied. :return: object containing the estimation results associated with the name of each specification, as well as a description of each configuration """ if self.short_names is None: self.short_names = ModelNames(prefix=self.modelName) if self.log_like is None: raise BiogemeError("No log likelihood function has been specified") central_controller = CentralController( expression=self.log_like, maximum_number_of_configurations=self.biogeme_parameters.get_value( 'maximum_number_catalog_expressions' ), ) if selected_configurations is None: number_of_specifications = central_controller.number_of_configurations() logger.info(f'Estimating {number_of_specifications} models.') if ( number_of_specifications is None or number_of_specifications > self.maximum_number_catalog_expressions ): error_msg = ( f"There are too many [{number_of_specifications}] different " f"specifications for the log likelihood function. This is " f"above the maximum number: " f"{self.maximum_number_catalog_expressions}. Simplify " f"the specification, change the value of the parameter " f"maximum_number_catalog_expressions, or consider using " f'the AssistedSpecification object in the "biogeme.assisted" ' f"module." ) raise ValueOutOfRange(error_msg) the_iterator = SelectedConfigurationsIterator( the_central_controller=central_controller ) else: the_iterator = SelectedConfigurationsIterator( the_central_controller=central_controller, selected_configurations=selected_configurations, ) configurations = {} for config in the_iterator: config_id = config.get_string_id() b = self.__class__.from_configuration( config_id=config_id, multiple_expression=self.log_like, database=self.database, user_notes=self.user_notes, parameters=self.biogeme_parameters, ) b.model_name = self.short_names(config_id) results = b.retrieve_saved_estimates() if recycle else None if results is None: results = ( b.quick_estimate() if quick_estimate else b.estimate(recycle=recycle, run_bootstrap=run_bootstrap) ) configurations[config_id] = results return configurations
[docs] def validate( self, estimation_results: EstimationResults, slices: int, groups: str | None = None, ) -> list[ValidationResult]: """ Perform out-of-sample validation of the model. The validation procedure operates by dividing the dataset into a number of slices. For each slice: - The slice is used as the validation set. - The remaining data forms the estimation set. - The model is re-estimated on the estimation set. - The model is applied to the validation set to compute the log likelihood. :param estimation_results: Estimation results obtained from the full dataset. :param slices: Number of data splits to create for cross-validation. :param groups: Optional column name used to group data entries (e.g., panel data). If provided, splitting preserves groups. :return: List of validation results, one for each data slice. :raises BiogemeError: If the dataset is structured as panel data and incompatible with validation. """ # For some reason, the self.optimization_parameters cannot be used as such in the function call below. # I have no clue why. parameters = self.optimization_parameters parameters['calculating_second_derivatives'] = ( self.biogeme_parameters.get_value(name='calculating_second_derivatives') ) return cross_validate_model( the_algorithm=self._algorithm_configuration(), modeling_elements=self.model_elements, parameters=parameters, starting_values=estimation_results.get_beta_values(), slices=slices, numerically_safe=self.numerically_safe, groups=groups, )
def _algorithm_configuration(self) -> OptimizationAlgorithm: if self.biogeme_parameters.get_value('optimization_algorithm') == "automatic": self.biogeme_parameters.set_value('optimization_algorithm', 'simple_bounds') return algorithms.get(self.optimization_algorithm)
[docs] def simulate(self, the_beta_values: dict[str, float] | None) -> pd.DataFrame: """Evaluate all simulation formulas on each row of the database using the specified parameter values. :param the_beta_values: Dictionary mapping parameter names to values. If None, an exception is raised. Use results.get_beta_values() after estimation or provide explicit values. :return: A pandas DataFrame where each row corresponds to an observation in the database, and each column corresponds to a simulation formula. :raises BiogemeError: If the_beta_values is None or if the number of parameters is incorrect. """ if the_beta_values is None: current_beta_values = self.expressions_registry.free_betas err = ( f'Contrarily to previous versions of Biogeme, ' f'the values of Beta must ' f'now be explicitly mentioned. If they have been estimated, they can be obtained from ' f'results.get_beta_values(). If not, use the default values: {current_beta_values}' ) raise BiogemeError(err) if not isinstance(the_beta_values, dict): error_msg = f'the_beta_values must be a dict, and not an object of type {type(the_beta_values)}' raise BiogemeError(error_msg) the_evaluator: MultiRowEvaluator = MultiRowEvaluator( model_elements=self.model_elements, numerically_safe=self.numerically_safe, use_jit=self.use_jit, ) results = the_evaluator.evaluate(the_beta_values) return results
[docs] def confidence_intervals( self, beta_values: list[dict[str, float]], interval_size: float = 0.9 ) -> tuple[pd.DataFrame, pd.DataFrame]: """Calculate confidence intervals on the simulated quantities :param beta_values: array of parameters values to be used in the calculations. Typically, it is a sample drawn from a distribution. :type beta_values: list(dict(str: float)) :param interval_size: size of the reported confidence interval, in percentage. If it is denoted by s, the interval is calculated for the quantiles (1-s)/2 and (1+s)/2. The default (0.9) corresponds to quantiles for the confidence interval [0.05, 0.95]. :type interval_size: float :return: two pandas data frames 'left' and 'right' with the same dimensions. Each row corresponds to a row in the database, and each column to a formula. 'left' contains the left value of the confidence interval, and 'right' the right value Example:: # Read the estimation results from a file results = EstimationEResults.from_yaml_file(filename = 'my_model.yaml') # Retrieve the names of the betas parameters that have been # estimated betas = biogeme.freeBetaNames # Draw 100 realization of the distribution of the estimators b = results.getBetasForSensitivityAnalysis(betas, size = 100) # Simulate the formulas using the nominal values simulatedValues = biogeme.simulate(beta_values) # Calculate the confidence intervals for each formula left, right = biogeme.confidenceIntervals(b, 0.9) :rtype: tuple of two Pandas dataframes. """ list_of_results = [] for b in tqdm(beta_values): r = self.simulate(b) list_of_results += [r] all_results = pd.concat(list_of_results) r = (1.0 - interval_size) / 2.0 left = all_results.groupby(level=0).quantile(r) right = all_results.groupby(level=0).quantile(1.0 - r) return left, right
def __str__(self) -> str: r = f"{self.model_name}: database [{self.model_elements.database.name}]" r += str(self.formulas) return r
[docs] @deprecated_parameters({'beta': None}) def check_derivatives(self, verbose: bool = False) -> CheckDerivativesResults: """Verifies the implementation of the derivatives. It compares the analytical version with the finite differences approximation. :param verbose: if True, the comparisons are reported. Default: False. :return: f, g, h, gdiff, hdiff where - f is the value of the function, - g is the analytical gradient, - h is the analytical hessian, - gdiff is the difference between the analytical and the finite differences gradient, - hdiff is the difference between the analytical and the finite differences hessian, """ starting_values = ( self.model_elements.expressions_registry.complete_dict_of_free_beta_values( the_betas={} ) ) the_log_likelihood: CallableExpression = function_from_compiled_formula( the_compiled_function=self.function_evaluator, the_betas=starting_values, ) return check_derivatives( the_log_likelihood, np.asarray( self.model_elements.expressions_registry.list_of_free_betas_init_values ), self.model_elements.expressions_registry.free_betas_names, verbose, )