import numpy as np
import polars as pl
import matplotlib.pyplot as plt
from sklearn import linear_model, model_selection
from statsmodels.tsa import deterministic
%matplotlib inline
This is a demonstration of carrying out a part of the Hybrid Models tutorial of the Time Series course on Kaggle using polars instead of pandas.
Download the example time series data:
# Create the local data directory (-p: do not fail if it already exists).
!mkdir -p _local
# Download the us-retail-sales dataset with the Kaggle CLI and unzip it into _local.
!kaggle datasets download jazidesigns/us-retail-sales --unzip -p _local
Downloading us-retail-sales.zip to _local 0%| | 0.00/6.91k [00:00<?, ?B/s] 100%|██████████████████████████████████████| 6.91k/6.91k [00:00<00:00, 5.98MB/s]
# Retail sectors to model; each name is a column in the CSV.
industries = ['BuildingMaterials', 'FoodAndBeverage']

# Read only the month label and the selected sector columns.
df = pl.read_csv(
    '_local/us-retail-sales.csv',
    columns=['Month', *industries],
)
df.head()
Month | FoodAndBeverage | BuildingMaterials |
---|---|---|
str | i64 | i64 |
"1992-01" | 29589 | 8964 |
"1992-02" | 28570 | 9023 |
"1992-03" | 29682 | 10608 |
"1992-04" | 30228 | 11630 |
"1992-05" | 31677 | 12327 |
Add 'date' column.
Append '-01' to each 'Month' string before parsing it, because parsing directly with the format '%Y-%m' does not work (presumably because a date needs a day component).
# Build a Date column from the 'YYYY-MM' strings in 'Month'.
# The '-01' suffix is appended with a native string expression rather than a
# per-row Python lambda (`apply`), so the whole transformation runs inside the
# polars engine. `with_columns` replaces the deprecated `with_column`.
df = df.with_columns([
    (pl.col('Month') + '-01')
    .str.strptime(pl.Date, '%Y-%m-%d')
    .alias('date'),
])
df.head()
Month | FoodAndBeverage | BuildingMaterials | date |
---|---|---|---|
str | i64 | i64 | date |
"1992-01" | 29589 | 8964 | 1992-01-01 |
"1992-02" | 28570 | 9023 | 1992-02-01 |
"1992-03" | 29682 | 10608 | 1992-03-01 |
"1992-04" | 30228 | 11630 | 1992-04-01 |
"1992-05" | 31677 | 12327 | 1992-05-01 |
Split the time series data into train part and test part:
# Hold out 12 * 4 = 48 rows as the test set. shuffle=False keeps the rows in
# their original (chronological) order instead of sampling them at random.
idx_train, idx_test = model_selection.train_test_split(
    range(len(df)),
    test_size=12 * 4,
    shuffle=False,
)
df_train = df[idx_train]
df_test = df[idx_test]
Generate quadratic trend dataframe in order to fit the linear regression model:
def get_trend_dataset(df_train, df_test):
    """Build quadratic-trend feature frames aligned with the train/test frames.

    One DeterministicProcess (constant term, polynomial order 2) is generated
    over an integer index covering both frames, then cut at the train/test
    boundary.

    Returns ``(X_train, X_test, y_train, y_test)``; the ``y_*`` values are the
    input frames themselves, passed through unchanged.
    """
    n_train = len(df_train)
    n_total = n_train + len(df_test)

    process = deterministic.DeterministicProcess(
        index=range(n_total),
        constant=True,
        order=2,
        drop=True,
    )
    features = pl.DataFrame(process.in_sample())

    return features[:n_train], features[n_train:], df_train, df_test


X_train, X_test, y_train, y_test = get_trend_dataset(df_train, df_test)
X_train.head()
const | trend | trend_squared |
---|---|---|
f64 | f64 | f64 |
1 | 1 | 1 |
1 | 2 | 4 |
1 | 3 | 9 |
1 | 4 | 16 |
1 | 5 | 25 |
# Preview the first rows of y_train, the training frame returned by get_trend_dataset.
y_train.head()
Month | FoodAndBeverage | BuildingMaterials | date |
---|---|---|---|
str | i64 | i64 | date |
"1992-01" | 29589 | 8964 | 1992-01-01 |
"1992-02" | 28570 | 9023 | 1992-02-01 |
"1992-03" | 29682 | 10608 | 1992-03-01 |
"1992-04" | 30228 | 11630 | 1992-04-01 |
"1992-05" | 31677 | 12327 | 1992-05-01 |
Fit the linear regression model and plot the fitted and predicted trends:
# Convert the design matrices once; they are reused for fitting and predicting.
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()

# Fit all industries in one multi-output regression.
# fit_intercept=False because the design matrix already has a 'const' column.
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(X_train_np, y_train[industries].to_numpy())

# predict() returns one column per industry, in the order of `industries`.
# The frames are built from a name -> column dict, which avoids the
# version-dependent `columns=` keyword of the DataFrame constructor.
fit_values = model.predict(X_train_np)
pred_values = model.predict(X_test_np)
y_fit = pl.DataFrame(
    {name: fit_values[:, i] for i, name in enumerate(industries)}
)
y_pred = pl.DataFrame(
    {name: pred_values[:, i] for i, name in enumerate(industries)}
)

# The date axes are the same for every industry, so convert them only once.
train_dates = y_train['date'].to_numpy()
test_dates = y_test['date'].to_numpy()

for industry in industries:
    fig, ax = plt.subplots()
    ax.set_title(industry)
    # Same drawing order as before; labels make the four series distinguishable.
    ax.plot(train_dates, y_train[industry].to_numpy(), label='train (actual)')
    ax.plot(test_dates, y_test[industry].to_numpy(), label='test (actual)')
    ax.plot(train_dates, y_fit[industry].to_numpy(), label='trend (fitted)')
    ax.plot(test_dates, y_pred[industry].to_numpy(), label='trend (predicted)')
    ax.legend()
plt.show()
TODO: predict annual seasonality and cyclical features from detrended residuals.