# Source code for interpreTS.core.features.feature_outliers_std
import numpy as np
import pandas as pd
# [docs]
def calculate_outliers_std(data, training_data):
    """
    Compute the fraction of window observations lying more than 3 standard
    deviations away from the mean of the training dataset.

    The mean and (population, ``ddof=0``) standard deviation are estimated
    from ``training_data`` only; ``data`` is then compared against the
    resulting ``mean +/- 3 * std`` band.

    Parameters
    ----------
    data : array-like (np.ndarray, pd.Series, list, ...)
        Window data to analyze.
    training_data : array-like (np.ndarray, pd.Series, list, ...)
        Training data used to calculate the mean and standard deviation.

    Returns
    -------
    float
        Fraction (between 0 and 1, *not* multiplied by 100) of observations
        in the window that deviate by more than 3 standard deviations from
        the training mean. If the training standard deviation is 0, every
        observation that differs from the training mean counts as an
        outlier. Returns ``nan`` when the window is empty.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> training_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9])
    >>> data = pd.Series([0, 10, 2, 3, 15])
    >>> calculate_outliers_std(data, training_data)
    0.2
    """
    # Normalize every array-like input (Series, list, tuple, ndarray) to an
    # ndarray so the element-wise comparisons below behave the same way
    # regardless of the container type that was passed in.
    data = np.asarray(data)
    training_data = np.asarray(training_data)

    # An empty window has no defined outlier ratio; return NaN explicitly
    # instead of triggering a 0/0 division warning.
    n_observations = data.size
    if n_observations == 0:
        return float("nan")

    # Reference statistics come from the training data only.
    mean_value = np.mean(training_data)
    std_dev = np.std(training_data)

    if std_dev == 0:
        # Degenerate band: with no spread in the training data, any value
        # that differs from the mean is treated as an outlier.
        outlier_mask = data != mean_value
    else:
        # Bounds for outliers (3 standard deviations from the mean).
        lower_bound = mean_value - 3 * std_dev
        upper_bound = mean_value + 3 * std_dev
        outlier_mask = (data < lower_bound) | (data > upper_bound)

    # Return a built-in float so the result prints as documented (e.g. 0.2)
    # independently of the installed NumPy version's scalar repr.
    return float(np.count_nonzero(outlier_mask) / n_observations)