import numpy as np
import polars as pl
import matplotlib.pyplot as plt
from sklearn import linear_model, model_selection
from statsmodels.tsa import deterministic

%matplotlib inline


!mkdir -p _local
!kaggle datasets download jazidesigns/us-retail-sales --unzip -p _local

Downloading us-retail-sales.zip to _local
  0%|                                               | 0.00/6.91k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 6.91k/6.91k [00:00<00:00, 5.98MB/s]


# Retail sectors to model; these are also the column names read from the CSV.
industries = ['BuildingMaterials', 'FoodAndBeverage']


# Load only the month label and the selected sector columns.
df = pl.read_csv('_local/us-retail-sales.csv', columns=['Month'] + industries)


df.head()


# 'Month' is a 'YYYY-MM' string; append a day component so it parses as a full
# date. The concatenation is a native polars expression rather than a per-row
# Python lambda, so it stays inside the query engine.
df = df.with_columns([
    (pl.col('Month') + pl.lit('-01')).str.strptime(pl.Date, '%Y-%m-%d').alias('date'),
])


df.head()


# Chronological split (no shuffling): the last 12 * 4 rows are held out for
# testing, everything before them is used for training.
idx_train, idx_test = model_selection.train_test_split(
    range(len(df)), test_size=12 * 4, shuffle=False)
df_train, df_test = df[idx_train], df[idx_test]


def get_trend_dataset(df_train, df_test):
    """Build deterministic time-trend features for the train and test periods.

    A single ``DeterministicProcess`` is created over a positional index that
    spans both frames, with a constant term and polynomial trend terms up to
    order 2. The resulting feature table is then cut at ``len(df_train)`` so
    the test features continue the time index where the training ones stop.

    Args:
        df_train: Training frame; only its length is used to size the
            features. It is returned unchanged as the training target.
        df_test: Test frame; only its length is used to size the features.
            It is returned unchanged as the test target.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the ``X`` entries
        are polars DataFrames of trend features and the ``y`` entries are the
        input frames themselves.
    """
    n_train = len(df_train)
    n_total = n_train + len(df_test)

    trend_terms = deterministic.DeterministicProcess(
        index=range(n_total),
        constant=True,
        order=2,
        drop=True,
    )
    # in_sample() yields one row per index entry, covering both periods.
    features = pl.DataFrame(trend_terms.in_sample())

    return features[:n_train], features[n_train:], df_train, df_test


# Trend features for the training and hold-out periods; the targets are the
# original frames (sector columns plus 'date').
X_train, X_test, y_train, y_test = get_trend_dataset(df_train, df_test)


X_train.head()


y_train.head()


# The features come from get_trend_dataset, which requests a constant term, so
# the estimator is told not to add an intercept of its own.
model = linear_model.LinearRegression(fit_intercept=False)
features_train = X_train.to_numpy()
features_test = X_test.to_numpy()
model.fit(features_train, y_train[industries].to_numpy())

# One fitted / forecast column per sector, in the same order as `industries`.
y_fit = pl.DataFrame(model.predict(features_train), columns=industries)
y_pred = pl.DataFrame(model.predict(features_test), columns=industries)

# Convert the date axes once instead of once per plotted line.
dates_train = y_train['date'].to_numpy()
dates_test = y_test['date'].to_numpy()

for industry in industries:
    fig, ax = plt.subplots()
    ax.set_title(industry)
    ax.plot(dates_train, y_train[industry].to_numpy(), label='train (actual)')
    ax.plot(dates_test, y_test[industry].to_numpy(), label='test (actual)')
    ax.plot(dates_train, y_fit[industry].to_numpy(), label='trend (fitted)')
    ax.plot(dates_test, y_pred[industry].to_numpy(), label='trend (forecast)')
    # Label the four series so actual, fitted and forecast lines can be told apart.
    ax.legend()
plt.show()

Month	FoodAndBeverage	BuildingMaterials
str	i64	i64
"1992-01"	29589	8964
"1992-02"	28570	9023
"1992-03"	29682	10608
"1992-04"	30228	11630
"1992-05"	31677	12327

Month	FoodAndBeverage	BuildingMaterials	date
str	i64	i64	date
"1992-01"	29589	8964	1992-01-01
"1992-02"	28570	9023	1992-02-01
"1992-03"	29682	10608	1992-03-01
"1992-04"	30228	11630	1992-04-01
"1992-05"	31677	12327	1992-05-01

const	trend	trend_squared
f64	f64	f64
1	1	1
1	2	4
1	3	9
1	4	16
1	5	25

Month	FoodAndBeverage	BuildingMaterials	date
str	i64	i64	date
"1992-01"	29589	8964	1992-01-01
"1992-02"	28570	9023	1992-02-01
"1992-03"	29682	10608	1992-03-01
"1992-04"	30228	11630	1992-04-01
"1992-05"	31677	12327	1992-05-01

Predict trend from time series data