pq / v2 /test_predictor_statsforecast.py
Daniel Varga
docs
1fee7d7
raw
history blame contribute delete
No virus
5.99 kB
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsforecast import StatsForecast
from statsforecast.models import (
ARIMA,
AutoARIMA,
AutoETS,
AutoCES,
DynamicOptimizedTheta,
MSTL,
SeasonalNaive,
)
from datasetsforecast.losses import rmse, mae
os.environ["NIXTLA_NUMBA_RELEASE_GIL"] = "1"
os.environ["NIXTLA_NUMBA_CACHE"] = "1"
data = pd.read_csv("terheles_fixed.tsv", sep="\t")
data['ds'] = pd.to_datetime(data['Korrigált időpont'])
data['y'] = data['Hatásos teljesítmény']
data = data[['ds', 'y']]
data['unique_id'] = 1
data = data[data['ds'] < '2019-03-01']
Y_df = data
train_df = Y_df[Y_df['ds'] < '2019-02-01']
horizon = 4 * 24 * 7 # 7 days
def ensemble_forecasts(
fcsts_df,
quantiles,
name_models,
model_name,
) -> pd.DataFrame:
fcsts_df[model_name] = fcsts_df[name_models].mean(axis=1).values # type: ignore
# compute quantiles based on the mean of the forecasts
sigma_models = []
for model in name_models:
fcsts_df[f"sigma_{model}"] = fcsts_df[f"{model}-hi-68.27"] - fcsts_df[model]
sigma_models.append(f"sigma_{model}")
fcsts_df[f"std_{model_name}"] = (
fcsts_df[sigma_models].pow(2).sum(axis=1).div(len(sigma_models) ** 2).pow(0.5)
)
z = norm.ppf(quantiles)
q_cols = []
for q, zq in zip(quantiles, z):
q_col = f"{model_name}-q-{q}"
fcsts_df[q_col] = fcsts_df[model_name] + zq * fcsts_df[f"std_{model_name}"]
q_cols.append(q_col)
fcsts_df = fcsts_df[["unique_id", "ds"] + [model_name] + q_cols]
return fcsts_df
def run_statistical_ensemble(
train_df: pd.DataFrame,
horizon: int,
freq: str,
seasonality: int,
quantiles,
):
os.environ["NIXTLA_ID_AS_COL"] = "true"
models = [
AutoARIMA(season_length=seasonality),
AutoETS(season_length=seasonality),
AutoCES(season_length=seasonality),
DynamicOptimizedTheta(season_length=seasonality),
]
init_time = time()
series_per_core = 15
n_series = train_df["unique_id"].nunique()
n_jobs = min(n_series // series_per_core, os.cpu_count())
sf = StatsForecast(
models=models,
freq=freq,
n_jobs=n_jobs,
)
fcsts_df = sf.forecast(df=train_df, h=horizon, level=[68.27])
name_models = [repr(model) for model in models]
model_name = "StatisticalEnsemble"
fcsts_df = ensemble_forecasts(
fcsts_df,
quantiles,
name_models,
model_name,
)
total_time = time() - init_time
return fcsts_df, total_time, model_name
# unlike MSTL, the others only allow a single season_length:
seasonality = 4 * 24 * 1 # 1 day
models = [
MSTL(
season_length=[4 * 24, 4 * 24 * 7], # seasonalities of the time series
trend_forecaster=AutoARIMA() # model used to forecast trend
),
SeasonalNaive(season_length=seasonality)
]
EXTENDED_TEST = True
if EXTENDED_TEST:
models += [
# AutoARIMA(season_length=4 * 24) is just too slow, never even finishes,
# spends all its time in scipy bfgs.
# which is weird, because it's works okay as trend-detector of MSTL.
AutoARIMA(),
AutoETS(season_length=seasonality),
AutoCES(season_length=seasonality),
DynamicOptimizedTheta(season_length=seasonality),
]
freq = '15min'
sf = StatsForecast(
models=models,
freq=freq,
n_jobs=-1,
)
model_names = [repr(model) for model in models]
n_windows = len(train_df) // horizon - 1
print("crossvalidation with", n_windows, "windows")
print("models:", ", ".join(model_names))
crossvalidation_df = sf.cross_validation(df=train_df, h=horizon, step_size=horizon, n_windows=n_windows)
for model_name in model_names:
rmse_crossval = rmse(crossvalidation_df['y'], crossvalidation_df[model_name])
mae_crossval = mae(crossvalidation_df['y'], crossvalidation_df[model_name])
print(model_name, "RMSE", rmse_crossval, "MAE", mae_crossval)
exit()
sf.fit(train_df)
sf.fitted_[0, 0].model_.tail(4 * 24 * 7 * 2).plot(subplots=True, grid=True)
plt.tight_layout()
plt.show()
print("starting forecast, dataset size", len(train_df))
# Y_hat_df = sf.forecast(df=train_df, h=horizon, level=[68.27])
Y_hat_df = sf.predict(h=horizon, level=[68.27])
print(Y_hat_df)
Y_hat_df = Y_hat_df.reset_index()
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
# plot_df = pd.concat([Y_df, Y_hat_df]).set_index('ds') # Concatenate the train and forecast dataframes
# plot_df[['y', 'LSTM', 'NHITS']].plot(ax=ax, linewidth=2)
plot_Y_df = Y_df # [Y_df['ds'] > '2019-07-01']
plot_Y_df = plot_Y_df.set_index('ds')[['y']]
plot_Y_df.plot(ax=ax, linewidth=1)
Y_hat_df.set_index('ds').plot(ax=ax, linewidth=1)
ax.set_title('AirPassengers Forecast', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()
plt.savefig("neuralforecast.pdf")
exit()
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
fcst_df, total_time, model_name = run_statistical_ensemble(
train_df,
horizon=horizon,
freq='15m',
seasonality=4 * 24 * 7,
quantiles=quantiles
)
nf.fit(df=train_df)
Y_hat_df = nf.predict()
print(Y_df)
Y_hat_df = Y_hat_df.reset_index()
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
# plot_df = pd.concat([Y_df, Y_hat_df]).set_index('ds') # Concatenate the train and forecast dataframes
# plot_df[['y', 'LSTM', 'NHITS']].plot(ax=ax, linewidth=2)
plot_Y_df = Y_df[Y_df['ds'] > '2019-07-01']
plot_Y_df = plot_Y_df.set_index('ds')[['y']]
plot_Y_df.plot(ax=ax, linewidth=1)
Y_hat_df.set_index('ds')[['PatchTST', 'NHITS']].plot(ax=ax, linewidth=1)
ax.set_title('AirPassengers Forecast', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()
plt.savefig("neuralforecast.pdf")