# Source code for biogeme.tools

```"""Implements some useful functions

:author: Michel Bierlaire

:date: Sun Apr 14 10:46:10 2019

"""

# Too constraining
# pylint: disable=invalid-name, too-many-locals

import numpy as np
from scipy.stats import chi2
import biogeme.messaging as msg
import biogeme.exceptions as excep

logger = msg.bioMessage()

[docs]def findiff_g(theFunction, x):
"""Calculates the gradient of a function :math`f` using finite differences

:param theFunction: A function object that takes a vector as an
argument, and returns a tuple. The first
element of the tuple is the value of the
function :math:`f`. The other elements are not
used.
:type theFunction: function

:param x: argument of the function
:type x: numpy.array

:return: numpy vector, same dimension as x, containing the gradient
calculated by finite differences.
:rtype: numpy.array

"""
tau = 0.0000001
n = len(x)
g = np.zeros(n)
f = theFunction(x)
for i in range(n):
xi = x.item(i)
xp = x.copy()
if abs(xi) >= 1:
s = tau * xi
elif xi >= 0:
s = tau
else:
s = -tau
xp[i] = xi + s
fp = theFunction(xp)
g[i] = (fp - f) / s
return g

[docs]def findiff_H(theFunction, x):
"""Calculates the hessian of a function :math:`f` using finite differences

:param theFunction: A function object that takes a vector as an
argument, and returns a tuple. The first
element of the tuple is the value of the
function :math:`f`, and the second is the
gradient of the function.  The other elements
are not used.

:type theFunction: function

:param x: argument of the function
:type x: numpy.array

:return: numpy matrix containing the hessian calculated by
finite differences.
:rtype: numpy.array

"""
tau = 1.0e-7
n = len(x)
H = np.zeros((n, n))
g = theFunction(x)
eye = np.eye(n, n)
for i in range(n):
xi = x.item(i)
if abs(xi) >= 1:
s = tau * xi
elif xi >= 0:
s = tau
else:
s = -tau
ei = eye[i]
gp = theFunction(x + s * ei)
H[:, i] = (gp - g).flatten() / s
return H

[docs]def checkDerivatives(theFunction, x, names=None, logg=False):
"""Verifies the analytical derivatives of a function by comparing
them with finite difference approximations.

:param theFunction: A function object that takes a vector as an argument,
and returns a tuple:

- The first element of the tuple is the value of the
function :math:`f`,
- the second is the gradient of the function,
- the third is the hessian.

:type theFunction: function

:param x: arguments of the function
:type x: numpy.array

:param names: the names of the entries of x (for reporting).
:type names: list(string)
:param logg: if True, messages will be displayed.
:type logg: bool

:return: tuple f, g, h, gdiff, hdiff where

- f is the value of the function at x,
- g is the analytical gradient,
- h is the analytical hessian,
- gdiff is the difference between the analytical gradient
and the finite difference approximation
- hdiff is the difference between the analytical hessian
and the finite difference approximation

:rtype: float, numpy.array,numpy.array,  numpy.array,numpy.array

"""
f, g, h = theFunction(x)
g_num = findiff_g(theFunction, x)
gdiff = g - g_num
if logg:
if names is None:
names = [f'x[{i}]' for i in range(len(x))]
for k, v in enumerate(gdiff):
logger.detailed(f'{names[k]:15}\t{g[k]:+E}\t{g_num[k]:+E}\t{v:+E}')

h_num = findiff_H(theFunction, x)
hdiff = h - h_num
if logg:
logger.detailed('Row\t\tCol\t\tHessian\tFinDiff\t\tDifference')
for row in range(len(hdiff)):
for col in range(len(hdiff)):
logger.detailed(
f'{names[row]:15}\t{names[col]:15}\t{h[row,col]:+E}\t'
f'{h_num[row,col]:+E}\t{hdiff[row,col]:+E}'
)
return f, g, h, gdiff, hdiff

"""Get a given number of prime numbers

:param n: number of primes that are requested
:type n: int

:return: array with prime numbers
:rtype: list(int)
"""
total = 0
upperBound = 100
while total < n:
upperBound *= 10
total = len(primes)
return primes[0:n]

"""Calculate prime numbers

:param upperBound: prime numbers up to this value will be computed
:type upperBound: int

:return: array with prime numbers
:rtype: list(int)

[2, 3, 5, 7]

"""
mywork = list(range(0, upperBound + 1))
largest = int(np.ceil(np.sqrt(float(upperBound))))
# Remove all multiples
for i in range(2, largest + 1):
if mywork[i] != 0:
for k in range(2 * i, upperBound + 1, i):
mywork[k] = 0
# Gather non zero entries, which are the prime numbers
myprimes = []
for i in range(1, upperBound + 1):
if mywork[i] != 0 and mywork[i] != 1:
myprimes += [mywork[i]]

return myprimes

[docs]def countNumberOfGroups(df, column):
"""
This function counts the number of groups of same value in a column.
For instance: 1,2,2,3,3,3,4,1,1  would give 5.

Example::

>>>df = pd.DataFrame({'ID': [1, 1, 2, 3, 3, 1, 2, 3],
'value':[1000,
2000,
3000,
4000,
5000,
5000,
10000,
20000]})
>>>tools.countNumberOfGroups(df,'ID')
6

>>>tools.countNumberOfGroups(df,'value')
7

"""
df['_biogroups'] = (df[column] != df[column].shift(1)).cumsum()
return len(df['_biogroups'].unique())

[docs]def likelihood_ratio_test(model1, model2, significance_level=0.95):
"""This function performs a likelihood ratio test between a
restricted and an unrestricted model.

:param model1: the final loglikelood of one model, and the number of
estimated parameters.
:type model1: tuple(float, int)

:param model2: the final loglikelood of the other model, and
the number of estimated parameters.
:type model2: tuple(float, int)

:param significance_level: level of significance of the test. Default: 0.95
:type significance_level: float

:return: a tuple containing:

- a message with the outcome of the test
- the statistic, that is minus two times the difference
between the loglikelihood  of the two models
- the threshold of the chi square distribution.

:rtype: (str, float, float)

:raise biogemeError: if the unrestricted model has a lower log
likelihood than the restricted model.

"""

loglike_m1, df_m1 = model1
loglike_m2, df_m2 = model2
if loglike_m1 > loglike_m2:
if df_m1 < df_m2:
raise excep.biogemeError(
f'The unrestricted model {model2} has a '
f'lower log likelihood than the restricted one {model1}'
)
loglike_ur = loglike_m1
loglike_r = loglike_m2
df_ur = df_m1
df_r = df_m2
else:
if df_m1 >= df_m2:
raise excep.biogemeError(
f'The unrestricted model {model1} has a '
f'lower log likelihood than the restricted one {model2}'
)
loglike_ur = loglike_m2
loglike_r = loglike_m1
df_ur = df_m2
df_r = df_m1

stat = -2 * (loglike_r - loglike_ur)
chi_df = df_ur - df_r
threshold = chi2.ppf(significance_level, chi_df)
if stat <= threshold:
final_msg = f'H0 cannot be rejected at level {significance_level}'
else:
final_msg = f'H0 can be rejected at level {significance_level}'
return final_msg, stat, threshold
```