Regression with interpreTS#

In this tutorial, we show how you can use interpreTS for regression.

[1]:
import urllib.request as urllib2
from io import BytesIO
from zipfile import ZipFile

import numpy as np
import pandas as pd
import interpreTS as it
WARNING:interpreTS:scikit-learn is not installed. Please install it to use interpreTS.

Loading in the data

[2]:
zip_url: str = "https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip"
zipped_file_name: str = "household_power_consumption.txt"


df_power_consumption: pd.DataFrame = pd.read_csv(
    ZipFile(BytesIO(urllib2.urlopen(zip_url).read())).open(zipped_file_name),
    sep=";",
    parse_dates={"timestamp": ["Date", "Time"]},
    infer_datetime_format=True,
    low_memory=False,
    na_values=["nan", "?"],
    index_col="timestamp",
    dtype="float32",
)

df_power_consumption = df_power_consumption.dropna()
df_power_consumption.index.to_series().diff().value_counts().sample(3)
C:\Users\slawek\AppData\Local\Temp\ipykernel_130648\302813920.py:5: FutureWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.
  df_power_consumption: pd.DataFrame = pd.read_csv(
C:\Users\slawek\AppData\Local\Temp\ipykernel_130648\302813920.py:5: UserWarning: Parsing dates in %d/%m/%Y %H:%M:%S format when dayfirst=False (the default) was specified. Pass `dayfirst=True` or specify a format to silence this warning.
  df_power_consumption: pd.DataFrame = pd.read_csv(
[2]:
timestamp
0 days 01:11:00    1
0 days 01:24:00    1
5 days 00:27:00    1
Name: count, dtype: int64
[3]:
df_power_consumption["avg_15min_GAP"] = df_power_consumption.rolling("15min")[
    "Global_active_power"
].aggregate(np.nanmean)
C:\Users\slawek\AppData\Local\Temp\ipykernel_130648\192953477.py:1: FutureWarning: The provided callable <function nanmean at 0x0000028DE8235B80> is currently using Rolling.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  df_power_consumption["avg_15min_GAP"] = df_power_consumption.rolling("15min")[
[4]:
train_columns = [f"Sub_metering_{i}" for i in range(1, 4)] + ["timestamp"]
target_col = "avg_15min_GAP"

# The percentage of data used for testing
test_pct = 0.2
day_margin = 3

# add the timestamp col
df_power_consumption["timestamp"] = df_power_consumption.index

# Ensure timestamp is in datetime format
df_power_consumption['timestamp'] = pd.to_datetime(df_power_consumption['timestamp'])

# Add 'year' and 'month' columns
df_power_consumption['year'] = df_power_consumption['timestamp'].dt.year
df_power_consumption['month'] = df_power_consumption['timestamp'].dt.month

# Add 'adjusted_month' column
df_power_consumption['adjusted_month'] = (df_power_consumption['year'] - 2007) * 12 + df_power_consumption['month']

# Temporal split logic
df_train = df_power_consumption[: -int(len(df_power_consumption) * test_pct)].copy()
df_test = df_power_consumption[df_train.index[-1] + pd.Timedelta(days=day_margin):].copy()

# Add MultiIndex for training data (adjusted_month, timestamp)
df_train = df_train.set_index(['adjusted_month', 'timestamp'])
df_train.sort_index(inplace=True)

# Add MultiIndex for testing data (adjusted_month, timestamp)
df_test = df_test.set_index(['adjusted_month', 'timestamp'])
df_test.sort_index(inplace=True)

# Output the training data head
df_train
[4]:
Global_active_power Global_reactive_power Voltage Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3 avg_15min_GAP year month
adjusted_month timestamp
0 2006-12-16 17:24:00 4.216 0.418 234.839996 18.4 0.0 1.0 17.0 4.216000 2006 12
2006-12-16 17:25:00 5.360 0.436 233.630005 23.0 0.0 1.0 16.0 4.788000 2006 12
2006-12-16 17:26:00 5.374 0.498 233.289993 23.0 0.0 2.0 17.0 4.983333 2006 12
2006-12-16 17:27:00 5.388 0.502 233.740005 23.0 0.0 1.0 17.0 5.084500 2006 12
2006-12-16 17:28:00 3.666 0.528 235.679993 15.8 0.0 1.0 17.0 4.800800 2006 12
... ... ... ... ... ... ... ... ... ... ... ...
38 2010-02-05 04:24:00 0.340 0.076 245.979996 1.4 0.0 1.0 0.0 0.344000 2010 2
2010-02-05 04:25:00 0.344 0.076 245.889999 1.6 0.0 1.0 1.0 0.344267 2010 2
2010-02-05 04:26:00 0.344 0.074 245.660004 1.6 0.0 1.0 1.0 0.343733 2010 2
2010-02-05 04:27:00 0.344 0.076 246.190002 1.6 0.0 1.0 0.0 0.344000 2010 2
2010-02-05 04:28:00 0.420 0.162 246.740005 2.0 0.0 1.0 1.0 0.349333 2010 2

1639424 rows × 10 columns

[5]:
# Reset the index to remove the MultiIndex
df_test_reshaped = df_test.reset_index()
df_train_reshaped = df_train.reset_index()
train_y = df_train_reshaped['avg_15min_GAP']
train_y_monthly = df_train.groupby(level='adjusted_month').mean()
test_y = df_test_reshaped['avg_15min_GAP']
test_y_monthly = df_test.groupby(level='adjusted_month').mean()
df_test_reshaped.drop(columns=['timestamp', 'month', 'avg_15min_GAP'], inplace=True)
df_train_reshaped.drop(columns=['timestamp', 'month', 'avg_15min_GAP'], inplace=True)
train_y.shape, df_train_reshaped.shape, test_y.shape, df_test_reshaped.shape
[5]:
((1639424,), (1639424, 9), (405537,), (405537, 9))

Feature extraction with interpreTS

[6]:
extractor = it.FeatureExtractor(id_column="adjusted_month")
features_train = extractor.extract_features(df_train_reshaped)
features_test = extractor.extract_features(df_test_reshaped)
features_train.head()
[6]:
length_Global_active_power length_Global_reactive_power length_Voltage length_Global_intensity length_Sub_metering_1 length_Sub_metering_2 length_Sub_metering_3 length_year mean_Global_active_power mean_Global_reactive_power ... spikeness_Sub_metering_3 spikeness_year seasonality_strength_Global_active_power seasonality_strength_Global_reactive_power seasonality_strength_Voltage seasonality_strength_Global_intensity seasonality_strength_Sub_metering_1 seasonality_strength_Sub_metering_2 seasonality_strength_Sub_metering_3 seasonality_strength_year
0 21992 21992 21992 21992 21992 21992 21992 21992 1.901295 0.131386 ... 0.321571 0.0 0.883480 0.852107 0.958830 0.891263 0.820514 0.930466 0.980202 0.0
1 44638 44638 44638 44638 44638 44638 44638 44638 1.546034 0.132676 ... 0.325868 0.0 0.901837 0.861384 0.954732 0.902753 0.823161 0.931576 0.980758 0.0
2 40318 40318 40318 40318 40318 40318 40318 40318 1.401084 0.113637 ... 0.488657 0.0 0.940750 0.866933 0.940974 0.941904 0.786104 0.926132 0.982762 0.0
3 44639 44639 44639 44639 44639 44639 44639 44639 1.318627 0.114747 ... 0.536607 0.0 0.945153 0.871813 0.943282 0.944359 0.801939 0.930233 0.979583 0.0
4 39477 39477 39477 39477 39477 39477 39477 39477 0.891189 0.118778 ... 1.007143 0.0 0.929497 0.874644 0.963638 0.927853 0.802403 0.896615 0.978974 0.0

5 rows × 56 columns

Using interpreTS for regression

[85]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
[87]:
gb_regressor = xgb.XGBRegressor(random_state=42)

gb_regressor.fit(features_train, train_y_monthly)

y_pred = gb_regressor.predict(features_test)

rmse = np.sqrt(mean_squared_error(test_y_monthly, y_pred))
print(f"RMSE: {rmse:.4f}")

RMSE: 1.8527