# Source code for interpreTS.core.features.feature_histogram_dominant

import pandas as pd
import numpy as np


# [docs]
def calculate_dominant(data, bins=10, return_bin_center=False):
    """
    Calculate the dominant value (mode) of a time series histogram.

    The data is binned with ``numpy.histogram`` and the most populated bin is
    reported. When several bins share the maximum count, the first (lowest)
    one is used, because ``numpy.argmax`` returns the first maximum.

    Parameters
    ----------
    data : pd.Series, np.ndarray or array-like
        The time series data for which the dominant value is to be calculated.
        Anything ``numpy.asarray`` accepts (e.g. a list or tuple) is allowed;
        multi-dimensional input is flattened by ``numpy.histogram``.
    bins : int, optional
        The number of bins to use for creating the histogram, by default 10.
        The value is forwarded unchanged to ``numpy.histogram``.
    return_bin_center : bool, optional
        If True, return the center of the bin with the maximum frequency.
        Otherwise, return the lower bound of the bin (default is False).

    Returns
    -------
    float
        The dominant value of the histogram (either the center or the lower
        bound of the most populated bin), or ``np.nan`` if `data` contains no
        elements.

    Raises
    ------
    ValueError
        If floating-point (or complex) data contains NaN values.

    Notes
    -----
    Any other error raised by ``numpy.histogram`` (for example for input it
    cannot bin) propagates unchanged.

    Examples
    --------
    >>> import numpy as np
    >>> data = np.array([1, 2, 2, 3, 3, 3, 4, 4, 5])
    >>> float(calculate_dominant(data, bins=5))
    2.6

    >>> data = np.array([10, 20, 20, 30, 30, 30, 40, 40, 50])
    >>> float(calculate_dominant(data, bins=5, return_bin_center=True))
    30.0

    >>> data = np.array([1, 1, 1, 1, 1])
    >>> float(calculate_dominant(data, bins=3))
    0.8333333333333333

    >>> calculate_dominant([])
    nan
    """
    # Normalise once so every array-like (Series, DataFrame, ndarray, list,
    # tuple, ...) goes through the same empty/NaN checks.
    values = np.asarray(data)

    # Without this guard np.histogram would bin empty input over its default
    # [0, 1] range and the function would report a meaningless 0.0.
    if values.size == 0:
        return np.nan

    # Only float/complex arrays can hold NaN in a way np.isnan understands;
    # other dtypes (e.g. object) are left to np.histogram so that its own
    # error behaviour is preserved.
    if values.dtype.kind in "fc" and np.isnan(values).any():
        raise ValueError("Data contains NaN values.")

    counts, bin_edges = np.histogram(values, bins=bins)
    max_bin_index = np.argmax(counts)

    # bin_edges has len(counts) + 1 entries, so index + 1 is always valid.
    lower_edge = bin_edges[max_bin_index]
    if return_bin_center:
        return (lower_edge + bin_edges[max_bin_index + 1]) / 2
    return lower_edge