Source code for interpreTS.utils.data_validation
import pandas as pd
import numpy as np
[docs]
def validate_time_series_data(
data,
feature_name=None,
validation_requirements=None,
**kwargs
):
"""
Validate the input time series data against dynamically provided requirements.
Parameters
----------
data : pd.Series, pd.DataFrame, or np.ndarray
The time series data to be validated.
feature_name : str, optional
The name of the feature to validate.
validation_requirements : dict, optional
A dictionary specifying the validation requirements for each feature.
**kwargs : dict
Additional validation parameters (overrides validation_requirements).
Returns
-------
bool
True if the data is valid; raises an error otherwise.
Raises
------
TypeError
If data is not a pd.Series, pd.DataFrame, or np.ndarray.
ValueError
If any validation requirement is not met.
"""
# Determine feature-specific requirements
if feature_name and validation_requirements:
requirements = validation_requirements.get(feature_name, {})
else:
requirements = kwargs
# Check data type
if not isinstance(data, (pd.Series, pd.DataFrame, np.ndarray)):
raise TypeError("Data must be a pandas Series, DataFrame, or numpy array.")
# Handle empty data
if isinstance(data, (pd.Series, pd.DataFrame)) and data.empty:
raise ValueError("Input data is empty.")
elif isinstance(data, np.ndarray) and data.size == 0:
raise ValueError("Input data is empty.")
# Validate pandas Series or DataFrame
if isinstance(data, (pd.Series, pd.DataFrame)):
# Check for NaN values
if not requirements.get('allow_nan', True) and data.isnull().any().any():
raise ValueError("Data contains NaN values.")
# Check index type if datetime index is required
if requirements.get('require_datetime_index', False) and not isinstance(data.index, pd.DatetimeIndex):
raise ValueError("Data must have a DateTime index for time-based operations.")
# Check for numeric values
if not np.issubdtype(data.to_numpy().dtype, np.number):
raise TypeError("Data must contain only numeric values.")
# Validate numpy array
elif isinstance(data, np.ndarray):
# Check if numeric
if not np.issubdtype(data.dtype, np.number):
raise TypeError("Data must contain only numeric values.")
if data.ndim > 1 and not requirements.get('allow_nan', True):
if np.isnan(data).any():
raise ValueError("Data contains NaN values.")
# Check for minimum length
min_length = requirements.get('min_length', None)
if min_length:
if isinstance(min_length, str): # Evaluate dynamic length expressions
min_length = eval(min_length)
if len(data) < min_length:
raise ValueError(f"Data must have at least {min_length} points.")
# Check for one-dimensional data
if requirements.get('check_one_dimensional', False):
if isinstance(data, np.ndarray) and data.ndim != 1:
raise ValueError("Data must be one-dimensional.")
if isinstance(data, pd.DataFrame) and data.shape[1] != 1:
raise ValueError("Data must be one-dimensional.")
# Additional validations
if 'check_nonzero_mean' in requirements and requirements['check_nonzero_mean']:
if np.isclose(data.mean(), 0):
raise ValueError("Data mean must not be zero.")
if 'validate_positive_parameters' in requirements:
for param, error_message in requirements['validate_positive_parameters'].items():
if param in kwargs and kwargs[param] <= 0:
raise ValueError(error_message)
if 'positive_integer_params' in requirements:
for param in requirements['positive_integer_params']:
if param in kwargs and (not isinstance(kwargs[param], int) or kwargs[param] <= 0):
raise ValueError(f"{param} must be a positive integer.")
return True