Regression with interpreTS#

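In this tutorial, we show how to use interpreTS to extract features from time series and feed them to a regression model, using the UCI household power consumption dataset as an example.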

[1]:

import urllib.request as urllib2
from io import BytesIO
from zipfile import ZipFile

import numpy as np
import pandas as pd
import interpreTS as it

WARNING:interpreTS:scikit-learn is not installed. Please install it to use interpreTS.

Loading in the data

[2]:

zip_url: str = "https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip"
zipped_file_name: str = "household_power_consumption.txt"

# Download the archive into memory; the context manager closes the HTTP
# response as soon as the bytes have been read.
with urllib2.urlopen(zip_url) as response:
    zip_buffer = BytesIO(response.read())

# Read the CSV member straight out of the in-memory archive. Both the archive
# and the member handle are closed once parsing has finished.
with ZipFile(zip_buffer) as archive, archive.open(zipped_file_name) as raw_csv:
    df_power_consumption: pd.DataFrame = pd.read_csv(
        raw_csv,
        sep=";",
        # "Date" and "Time" are merged into a single "timestamp" column.
        parse_dates={"timestamp": ["Date", "Time"]},
        # Dates are written day-first (%d/%m/%Y); stating it explicitly avoids
        # the parser warning. The deprecated `infer_datetime_format` is dropped.
        dayfirst=True,
        low_memory=False,
        # "?" marks a missing measurement in this file.
        na_values=["nan", "?"],
        index_col="timestamp",
        dtype="float32",
    )

# Discard rows with missing measurements, then peek at three of the distinct
# gaps between consecutive timestamps (randomly sampled, so the displayed rows
# vary between runs).
df_power_consumption = df_power_consumption.dropna()
df_power_consumption.index.to_series().diff().value_counts().sample(3)

C:\Users\slawek\AppData\Local\Temp\ipykernel_130648\302813920.py:5: FutureWarning: The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.
  df_power_consumption: pd.DataFrame = pd.read_csv(
C:\Users\slawek\AppData\Local\Temp\ipykernel_130648\302813920.py:5: UserWarning: Parsing dates in %d/%m/%Y %H:%M:%S format when dayfirst=False (the default) was specified. Pass `dayfirst=True` or specify a format to silence this warning.
  df_power_consumption: pd.DataFrame = pd.read_csv(

[2]:

timestamp
0 days 01:11:00    1
0 days 01:24:00    1
5 days 00:27:00    1
Name: count, dtype: int64

[3]:

# Trailing 15-minute average of the global active power, stored as a new column.
# The built-in rolling mean is used instead of passing `np.nanmean` as a
# callable: pandas already dispatches that callable to Rolling.mean (with a
# FutureWarning) and will stop doing so in a future version.
df_power_consumption["avg_15min_GAP"] = (
    df_power_consumption["Global_active_power"].rolling("15min").mean()
)

C:\Users\slawek\AppData\Local\Temp\ipykernel_130648\192953477.py:1: FutureWarning: The provided callable <function nanmean at 0x0000028DE8235B80> is currently using Rolling.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  df_power_consumption["avg_15min_GAP"] = df_power_consumption.rolling("15min")[

[4]:

train_columns = [f"Sub_metering_{i}" for i in range(1, 4)] + ["timestamp"]
target_col = "avg_15min_GAP"

# Fraction of rows, taken from the end of the series, held out for testing
test_pct = 0.2
# Gap (in days) left between the last training row and the first test row
day_margin = 3

# Expose the index as a regular "timestamp" column
df_power_consumption["timestamp"] = df_power_consumption.index

# Ensure timestamp is in datetime format
df_power_consumption['timestamp'] = pd.to_datetime(df_power_consumption['timestamp'])

# Add 'year' and 'month' columns
df_power_consumption['year'] = df_power_consumption['timestamp'].dt.year
df_power_consumption['month'] = df_power_consumption['timestamp'].dt.month

# Add 'adjusted_month' column: a running month counter where January 2007 is 1
# (so December 2006 is 0). It is used as the series id for feature extraction.
df_power_consumption['adjusted_month'] = (df_power_consumption['year'] - 2007) * 12 + df_power_consumption['month']

# Temporal split logic. The split position is computed explicitly rather than
# slicing with `[:-n_test_rows]`, because `[:-0]` would yield an EMPTY training
# frame whenever the holdout rounds down to zero rows.
n_test_rows = int(len(df_power_consumption) * test_pct)
split_pos = len(df_power_consumption) - n_test_rows
df_train = df_power_consumption.iloc[:split_pos].copy()
# The test set starts `day_margin` days after the last training timestamp.
test_start = df_train.index[-1] + pd.Timedelta(days=day_margin)
df_test = df_power_consumption.loc[test_start:].copy()

# Add MultiIndex for training data (adjusted_month, timestamp)
df_train = df_train.set_index(['adjusted_month', 'timestamp']).sort_index()

# Add MultiIndex for testing data (adjusted_month, timestamp)
df_test = df_test.set_index(['adjusted_month', 'timestamp']).sort_index()

# Display the training data (pandas truncates the view to the first and last rows)
df_train

[4]:

		Global_active_power	Global_reactive_power	Voltage	Global_intensity	Sub_metering_1	Sub_metering_2	Sub_metering_3	avg_15min_GAP	year	month
adjusted_month	timestamp
0	2006-12-16 17:24:00	4.216	0.418	234.839996	18.4	0.0	1.0	17.0	4.216000	2006	12
	2006-12-16 17:25:00	5.360	0.436	233.630005	23.0	0.0	1.0	16.0	4.788000	2006	12
	2006-12-16 17:26:00	5.374	0.498	233.289993	23.0	0.0	2.0	17.0	4.983333	2006	12
	2006-12-16 17:27:00	5.388	0.502	233.740005	23.0	0.0	1.0	17.0	5.084500	2006	12
	2006-12-16 17:28:00	3.666	0.528	235.679993	15.8	0.0	1.0	17.0	4.800800	2006	12
...	...	...	...	...	...	...	...	...	...	...	...
38	2010-02-05 04:24:00	0.340	0.076	245.979996	1.4	0.0	1.0	0.0	0.344000	2010	2
	2010-02-05 04:25:00	0.344	0.076	245.889999	1.6	0.0	1.0	1.0	0.344267	2010	2
	2010-02-05 04:26:00	0.344	0.074	245.660004	1.6	0.0	1.0	1.0	0.343733	2010	2
	2010-02-05 04:27:00	0.344	0.076	246.190002	1.6	0.0	1.0	0.0	0.344000	2010	2
	2010-02-05 04:28:00	0.420	0.162	246.740005	2.0	0.0	1.0	1.0	0.349333	2010	2

1639424 rows × 10 columns

[5]:

# Flatten the (adjusted_month, timestamp) MultiIndex back into ordinary columns
df_train_reshaped = df_train.reset_index()
df_test_reshaped = df_test.reset_index()

# Per-row targets
train_y = df_train_reshaped[target_col]
test_y = df_test_reshaped[target_col]

# Monthly targets: one value per adjusted_month, the mean of the TARGET column
# only. Averaging the whole frame would produce one column per measurement
# (plus year and month), so the regressor would be fit as a multi-output model
# and the reported RMSE would be averaged over all of those columns.
train_y_monthly = df_train.groupby(level='adjusted_month')[target_col].mean()
test_y_monthly = df_test.groupby(level='adjusted_month')[target_col].mean()

# Remove the columns that must not be passed to the feature extractor.
# Non-inplace drops keep this cell safe to re-run.
non_feature_cols = ['timestamp', 'month', target_col]
df_train_reshaped = df_train_reshaped.drop(columns=non_feature_cols)
df_test_reshaped = df_test_reshaped.drop(columns=non_feature_cols)

train_y.shape, df_train_reshaped.shape, test_y.shape, df_test_reshaped.shape

[5]:

((1639424,), (1639424, 9), (405537,), (405537, 9))

Feature extraction with interpreTS

[6]:

# One extractor, configured to treat each `adjusted_month` value as a separate
# series, is applied to the training frame first and then to the test frame.
extractor = it.FeatureExtractor(id_column="adjusted_month")
features_train, features_test = (
    extractor.extract_features(frame)
    for frame in (df_train_reshaped, df_test_reshaped)
)

# Preview the first rows of the extracted training features
features_train.head()

[6]:

	length_Global_active_power	length_Global_reactive_power	length_Voltage	length_Global_intensity	length_Sub_metering_1	length_Sub_metering_2	length_Sub_metering_3	length_year	mean_Global_active_power	mean_Global_reactive_power	...	spikeness_Sub_metering_3	seasonality_strength_Global_active_power	seasonality_strength_Global_reactive_power	seasonality_strength_Voltage	seasonality_strength_Global_intensity	seasonality_strength_Sub_metering_1	seasonality_strength_Sub_metering_2	seasonality_strength_Sub_metering_3
0	21992	21992	21992	21992	21992	21992	21992	21992	1.901295	0.131386	...	0.321571	0.883480	0.852107	0.958830	0.891263	0.820514	0.930466	0.980202
1	44638	44638	44638	44638	44638	44638	44638	44638	1.546034	0.132676	...	0.325868	0.901837	0.861384	0.954732	0.902753	0.823161	0.931576	0.980758
2	40318	40318	40318	40318	40318	40318	40318	40318	1.401084	0.113637	...	0.488657	0.940750	0.866933	0.940974	0.941904	0.786104	0.926132	0.982762
3	44639	44639	44639	44639	44639	44639	44639	44639	1.318627	0.114747	...	0.536607	0.945153	0.871813	0.943282	0.944359	0.801939	0.930233	0.979583
4	39477	39477	39477	39477	39477	39477	39477	39477	0.891189	0.118778	...	1.007143	0.929497	0.874644	0.963638	0.927853	0.802403	0.896615	0.978974

5 rows × 56 columns

Using interpreTS for regression

[85]:

import xgboost as xgb
from sklearn.metrics import mean_squared_error

[87]:

# Fit a gradient-boosted regressor (fixed seed) on the extracted training
# features against the monthly targets.
gb_regressor = xgb.XGBRegressor(random_state=42)
gb_regressor.fit(features_train, train_y_monthly)

# Predict for the held-out months and report the root mean squared error.
y_pred = gb_regressor.predict(features_test)
mse = mean_squared_error(test_y_monthly, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.4f}")

RMSE: 1.8527

Regression with interpreTS#

This Page