"""
Obtain sampling weights using locality-sensitive hashing
:author: Nicola Ortelli
:date: Fri Aug 11 18:25:39 2023
"""
import numpy as np
import pandas as pd
def get_lsh_weights(
    df: pd.DataFrame, w: float, a: np.ndarray, max_weight: int | None = None
) -> np.ndarray:
"""Compute weights using Locality-Sensitive Hashing (LSH) on input data.
This function applies LSH to the input data frame, generating weights
based on bucketing of the data. It also provides an option to limit
the maximum weight assigned to a group of data points.
:param df: The input data frame containing the data to compute weights for.
The DataFrame should have at least one target column and one
weight column.
:type df: pandas DataFrame
:param w: The width of the LSH buckets.
:type w: float
:param a: The LSH hash functions as a 2D array. Each row of this array
represents an LSH hash function.
:type a: numpy.ndarray
:param max_weight: The maximum weight allowed for a group of data points. If not
provided, no maximum weight constraint is applied.
:type max_weight: int, optional
:return: An array of weights corresponding to the input data frame.
:rtype: numpy.ndarray
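
    Example (illustrative toy data; any numeric feature columns work)::

        >>> rng = np.random.default_rng(0)
        >>> df = pd.DataFrame({'x1': rng.random(100), 'x2': rng.random(100)})
        >>> df['target'] = rng.integers(0, 2, 100)
        >>> df['weight'] = 1.0
        >>> a = rng.standard_normal((2, 4))  # 2 features, 4 hash functions
        >>> weights = get_lsh_weights(df, w=0.25, a=a, max_weight=10)
        >>> bool(weights.sum() == len(df))  # every row counted exactly once
        True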
"""
    # normalizing the explanatory variables to [0, 1] (min-max scaling)
    df_expla = df.drop(
        columns=['target', 'weight']
    )  # an intercept column, if present, should also be dropped here: a
    # constant column would yield division by zero in the normalization
    df_expla_norm = (df_expla - df_expla.min()) / (df_expla.max() - df_expla.min())
    # hashing into buckets according to LSH
    b = np.random.rand(a.shape[1]) * w  # random offsets, uniform in [0, w)
    buckets = np.floor((df_expla_norm.dot(a) + b) / w).astype(int)
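    # each column j of `a` defines a hash h_j(x) = floor((x . a_j + b_j) / w);
    # rows whose hashes agree on all n_hashes columns land in the same bucket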
    # if buckets should depend on the target, uncomment this line...
    # buckets['target'] = df['target']
    # saving names of columns storing buckets
    groupby_cols = list(buckets.columns)
    # shuffling rows so that each bucket's representative row is random
    buckets = buckets.sample(frac=1).reset_index()
    # adding a column that guarantees max_weight is never exceeded
    if max_weight is not None:
        group_counts = buckets.groupby(groupby_cols, sort=False).cumcount()
        buckets['sieve'] = group_counts // max_weight
        groupby_cols.append('sieve')
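        # within each bucket, cumcount numbers the rows 0, 1, 2, ..., so
        # 'sieve' increases by one every max_weight rows; grouping on it
        # splits oversized buckets into chunks of at most max_weight points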
    # adding a column to store weights
    buckets['weight'] = 1
    # preparing aggregation dictionary for final grouping: keep the first
    # (randomly shuffled) row index of each group and sum its weights
    agg_dict = {'index': 'first', 'weight': 'sum'}
    # final grouping
    selected = buckets.groupby(groupby_cols, sort=False).agg(agg_dict)
    # building the vector of weights: each selected row receives the size of
    # its group, all other rows receive zero
    weights = np.zeros(len(df))
    np.put(weights, selected['index'], selected['weight'])
    return weights
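

if __name__ == '__main__':
    # minimal usage sketch on illustrative random data (not part of the
    # original module): three explanatory features, five hash functions,
    # bucket width 0.2, and at most 20 points per group
    rng = np.random.default_rng(42)
    n, n_features, n_hashes = 1000, 3, 5
    demo = pd.DataFrame(
        rng.random((n, n_features)),
        columns=[f'x{i}' for i in range(n_features)],
    )
    demo['target'] = rng.integers(0, 2, n)
    demo['weight'] = 1.0
    # one hash function per column of `a`, as expected by get_lsh_weights
    a = rng.standard_normal((n_features, n_hashes))
    weights = get_lsh_weights(demo, w=0.2, a=a, max_weight=20)
    print(f'{(weights > 0).sum()} rows selected, total weight {weights.sum():.0f}')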