Spaces:

danielvarga
/

pq

Sleeping

App Files Files Community

pq / v2 /test_predictor_statsforecast.py

Daniel Varga

docs

1fee7d7 5 months ago

raw

history blame

No virus

5.99 kB

	import os

	# These numba-related switches have to be in the environment *before*
	# statsforecast is imported: the library evaluates them while its modules are
	# being loaded, so setting them after the import has no effect.
	os.environ["NIXTLA_NUMBA_RELEASE_GIL"] = "1"
	os.environ["NIXTLA_NUMBA_CACHE"] = "1"

	import numpy as np  # noqa: E402
	import pandas as pd  # noqa: E402
	import matplotlib.pyplot as plt  # noqa: E402

	from statsforecast import StatsForecast  # noqa: E402
	from statsforecast.models import (  # noqa: E402
	    ARIMA,
	    AutoARIMA,
	    AutoETS,
	    AutoCES,
	    DynamicOptimizedTheta,
	    MSTL,
	    SeasonalNaive,
	)
	from datasetsforecast.losses import rmse, mae  # noqa: E402


	# Read the tab-separated measurement file.
	raw = pd.read_csv("terheles_fixed.tsv", sep="\t")

	# Build the three-column frame (timestamp `ds`, target `y`, series id
	# `unique_id`) in one step. Creating a fresh DataFrame, rather than adding a
	# column to a column-subset of the raw frame, avoids pandas'
	# SettingWithCopyWarning.
	data = pd.DataFrame(
	    {
	        'ds': pd.to_datetime(raw['Korrigált időpont']),
	        'y': raw['Hatásos teljesítmény'],
	        'unique_id': 1,  # a single series, so one constant id
	    }
	)

	# Keep only observations before March 2019; this is the full data set used
	# by the rest of the script.
	data = data[data['ds'] < '2019-03-01']
	Y_df = data

	# Training split: everything before February 2019.
	train_df = Y_df[Y_df['ds'] < '2019-02-01']


	# Forecast horizon in samples: 4 samples/hour * 24 hours * 7 days.
	horizon = 4 * 24 * 7 # 7 days



	def ensemble_forecasts(
	    fcsts_df,
	    quantiles,
	    name_models,
	    model_name,
	) -> pd.DataFrame:
	    """Combine per-model forecasts into a mean ensemble with Gaussian quantiles.

	    Args:
	        fcsts_df: Frame with ``unique_id`` and ``ds`` columns, one point
	            forecast column per entry of ``name_models`` and a matching
	            ``"<model>-hi-68.27"`` upper-bound column for each of them.
	        quantiles: Probabilities in (0, 1) for which quantile columns are
	            produced.
	        name_models: Names of the model columns to average.
	        model_name: Name of the resulting ensemble column.

	    Returns:
	        A new frame with columns ``unique_id``, ``ds``, ``model_name`` and one
	        ``"<model_name>-q-<q>"`` column per requested quantile. The input
	        frame is left unmodified.
	    """
	    # scipy is not imported at module level, so bring `norm` into scope here.
	    from scipy.stats import norm

	    # Work on a copy so the helper columns below do not leak into the
	    # caller's frame.
	    work = fcsts_df.copy()
	    work[model_name] = work[name_models].mean(axis=1).values  # type: ignore

	    # The 68.27% level is one standard deviation of a normal distribution,
	    # so (upper bound - point forecast) is used as each model's sigma.
	    sigma_cols = []
	    for model in name_models:
	        sigma_col = f"sigma_{model}"
	        work[sigma_col] = work[f"{model}-hi-68.27"] - work[model]
	        sigma_cols.append(sigma_col)

	    # sqrt(sum(sigma_i^2)) / n: the standard deviation of the average if the
	    # individual model errors were independent.
	    std_col = f"std_{model_name}"
	    work[std_col] = (
	        work[sigma_cols].pow(2).sum(axis=1).div(len(sigma_cols) ** 2).pow(0.5)
	    )

	    # Shift the ensemble mean by the normal z-score of every quantile.
	    q_cols = []
	    for q, zq in zip(quantiles, norm.ppf(quantiles)):
	        q_col = f"{model_name}-q-{q}"
	        work[q_col] = work[model_name] + zq * work[std_col]
	        q_cols.append(q_col)

	    return work[["unique_id", "ds"] + [model_name] + q_cols]


	def run_statistical_ensemble(
	    train_df: pd.DataFrame,
	    horizon: int,
	    freq: str,
	    seasonality: int,
	    quantiles,
	):
	    """Fit four automatic statistical models and average them into an ensemble.

	    Args:
	        train_df: Training data with a ``unique_id`` column (the other
	            columns are passed through to ``StatsForecast.forecast``).
	        horizon: Number of future steps to forecast.
	        freq: Frequency string handed to ``StatsForecast``.
	        seasonality: Season length used by every model.
	        quantiles: Probabilities for which ensemble quantiles are computed.

	    Returns:
	        A tuple ``(fcsts_df, total_time, model_name)``: the ensemble forecast
	        frame produced by ``ensemble_forecasts``, the elapsed wall time in
	        seconds, and the name of the ensemble column.
	    """
	    # The module does not import a timer at top level, so import one here.
	    from time import perf_counter

	    # Side effect: sets a process-wide flag so the forecast frame carries
	    # `unique_id` as a regular column, which ensemble_forecasts selects.
	    os.environ["NIXTLA_ID_AS_COL"] = "true"
	    models = [
	        AutoARIMA(season_length=seasonality),
	        AutoETS(season_length=seasonality),
	        AutoCES(season_length=seasonality),
	        DynamicOptimizedTheta(season_length=seasonality),
	    ]
	    init_time = perf_counter()

	    # One worker per 15 series, capped by the number of CPUs. The result is
	    # clamped to at least 1: with fewer than 15 series the integer division
	    # yields 0, and os.cpu_count() may return None.
	    series_per_core = 15
	    n_series = train_df["unique_id"].nunique()
	    n_jobs = max(1, min(n_series // series_per_core, os.cpu_count() or 1))

	    sf = StatsForecast(
	        models=models,
	        freq=freq,
	        n_jobs=n_jobs,
	    )
	    # level=68.27 requests the one-sigma interval that ensemble_forecasts
	    # reads from the "<model>-hi-68.27" columns.
	    fcsts_df = sf.forecast(df=train_df, h=horizon, level=[68.27])
	    name_models = [repr(model) for model in models]
	    model_name = "StatisticalEnsemble"
	    fcsts_df = ensemble_forecasts(
	        fcsts_df,
	        quantiles,
	        name_models,
	        model_name,
	    )
	    total_time = perf_counter() - init_time
	    return fcsts_df, total_time, model_name


	# MSTL accepts a list of season lengths; every other model here takes one.
	seasonality = 4 * 24 * 1 # 1 day

	models = [
	    MSTL(
	        # daily and weekly seasonalities of the time series
	        season_length=[4 * 24, 4 * 24 * 7],
	        # model used to forecast the trend component
	        trend_forecaster=AutoARIMA(),
	    ),
	    SeasonalNaive(season_length=seasonality),
	]

	# Switch for evaluating the additional (slower) automatic models as well.
	EXTENDED_TEST = True
	if EXTENDED_TEST:
	    models.extend(
	        [
	            # AutoARIMA(season_length=4 * 24) is just too slow, it never even
	            # finishes and spends all its time in scipy bfgs. This is odd,
	            # because it works okay as the trend forecaster of MSTL.
	            AutoARIMA(),
	            AutoETS(season_length=seasonality),
	            AutoCES(season_length=seasonality),
	            DynamicOptimizedTheta(season_length=seasonality),
	        ]
	    )


	freq = '15min'
	sf = StatsForecast(models=models, freq=freq, n_jobs=-1)


	# The cross-validation frame has one prediction column per model, named by
	# the model's repr().
	model_names = list(map(repr, models))


	# Non-overlapping windows of one horizon each (step_size == horizon).
	n_windows = len(train_df) // horizon - 1
	print("crossvalidation with", n_windows, "windows")
	print("models:", ", ".join(model_names))
	crossvalidation_df = sf.cross_validation(
	    df=train_df,
	    h=horizon,
	    step_size=horizon,
	    n_windows=n_windows,
	)

	# Report the error of every model over all cross-validation windows.
	actual = crossvalidation_df['y']
	for name in model_names:
	    predicted = crossvalidation_df[name]
	    print(name, "RMSE", rmse(actual, predicted), "MAE", mae(actual, predicted))


	# The script stops here; nothing below this call is executed.
	exit()

	# NOTE(review): this whole section is unreachable as the script stands,
	# because exit() is called right after the cross-validation report.
	# It fits all models on the training split, plots the fitted components of
	# one model, forecasts one horizon and saves a plot of data plus forecasts.
	sf.fit(train_df)

	# Plot the last two weeks (4 * 24 * 7 * 2 samples) of the fitted model at
	# index [0, 0]; presumably the MSTL decomposition of the first series, since
	# MSTL is the first entry of `models` - TODO confirm.
	sf.fitted_[0, 0].model_.tail(4 * 24 * 7 * 2).plot(subplots=True, grid=True)
	plt.tight_layout()
	plt.show()


	print("starting forecast, dataset size", len(train_df))
	# Y_hat_df = sf.forecast(df=train_df, h=horizon, level=[68.27])
	# predict() reuses the models fitted by sf.fit() above.
	Y_hat_df = sf.predict(h=horizon, level=[68.27])


	print(Y_hat_df)

	# NOTE(review): assumes the forecast frame carries its identifier in the
	# index; verify against the installed statsforecast version.
	Y_hat_df = Y_hat_df.reset_index()

	fig, ax = plt.subplots(1, 1, figsize = (20, 7))

	# plot_df = pd.concat([Y_df, Y_hat_df]).set_index('ds') # Concatenate the train and forecast dataframes
	# plot_df[['y', 'LSTM', 'NHITS']].plot(ax=ax, linewidth=2)

	# Draw the observed series and every forecast column on the same axes.
	plot_Y_df = Y_df # [Y_df['ds'] > '2019-07-01']
	plot_Y_df = plot_Y_df.set_index('ds')[['y']]
	plot_Y_df.plot(ax=ax, linewidth=1)
	Y_hat_df.set_index('ds').plot(ax=ax, linewidth=1)


	# NOTE(review): title, axis label and output file name look copied from a
	# different example (AirPassengers / neuralforecast) and do not describe
	# this data set - confirm before relying on the figure.
	ax.set_title('AirPassengers Forecast', fontsize=22)
	ax.set_ylabel('Monthly Passengers', fontsize=20)
	ax.set_xlabel('Timestamp [t]', fontsize=20)
	ax.legend(prop={'size': 15})
	ax.grid()

	plt.savefig("neuralforecast.pdf")



	# Second hard stop; the code further down is never executed either.
	exit()






















	# Deciles for which the ensemble's quantile forecasts are requested.
	quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
	fcst_df, total_time, model_name = run_statistical_ensemble(
	    train_df,
	    horizon=horizon,
	    # '15min' is the pandas alias for a 15-minute step (the same string the
	    # main StatsForecast instance uses); '15m' is not a minute alias.
	    freq='15min',
	    # weekly season length: 4 samples/hour * 24 hours * 7 days
	    seasonality=4 * 24 * 7,
	    quantiles=quantiles
	)


	# NOTE(review): `nf` is not defined anywhere in the visible source; this
	# section looks like a leftover from a neuralforecast variant of the script
	# and would raise NameError if it were ever reached - confirm and remove or
	# restore the missing setup.
	nf.fit(df=train_df)

	Y_hat_df = nf.predict()

	print(Y_df)

	Y_hat_df = Y_hat_df.reset_index()

	fig, ax = plt.subplots(1, 1, figsize = (20, 7))

	# plot_df = pd.concat([Y_df, Y_hat_df]).set_index('ds') # Concatenate the train and forecast dataframes
	# plot_df[['y', 'LSTM', 'NHITS']].plot(ax=ax, linewidth=2)

	# NOTE(review): Y_df was restricted to dates before 2019-03-01 when it was
	# loaded, so this filter would select no rows - verify the intended window.
	plot_Y_df = Y_df[Y_df['ds'] > '2019-07-01']
	plot_Y_df = plot_Y_df.set_index('ds')[['y']]
	plot_Y_df.plot(ax=ax, linewidth=1)
	# Expects forecast columns named 'PatchTST' and 'NHITS' in Y_hat_df.
	Y_hat_df.set_index('ds')[['PatchTST', 'NHITS']].plot(ax=ax, linewidth=1)


	# NOTE(review): title and axis label appear to be copied from an
	# AirPassengers example - confirm.
	ax.set_title('AirPassengers Forecast', fontsize=22)
	ax.set_ylabel('Monthly Passengers', fontsize=20)
	ax.set_xlabel('Timestamp [t]', fontsize=20)
	ax.legend(prop={'size': 15})
	ax.grid()

	plt.savefig("neuralforecast.pdf")