Commit 5abe8d4

Create Advanced Time series method
1 parent 4e259a0 commit 5abe8d4

1 file changed: +331 -0 lines changed
Advanced Time series method

Lines changed: 331 additions & 0 deletions
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import yfinance as yf
import pmdarima as pm
from prophet import Prophet

# Set random seeds for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# %% [markdown]
# 1) Download dataset (example: AAPL stock prices, using Open, High, Low, Close, Volume).
# You can change the ticker and period as needed.

# %%
TICKER = 'AAPL'
START = '2015-01-01'
END = '2024-12-31'

df = yf.download(TICKER, start=START, end=END, progress=False)
print(f"Downloaded {len(df)} rows for {TICKER}")

# Basic inspection
print(df.head())
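
# Depending on the yfinance version, a single-ticker download may come back
# with MultiIndex columns (field, ticker); flatten so the column selection
# below works either way.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)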

# %% [markdown]
# 2) Preprocessing and feature engineering

# %%
# Use Close as target and include Volume + returns as covariates
data = df[['Open', 'High', 'Low', 'Close', 'Volume']].copy()
# One-day percent returns (pct_change gives simple, not log, returns)
data['ret_1'] = data['Close'].pct_change().fillna(0)
# Rolling features; bfill() fills the initial window (fillna(method=...) is deprecated)
data['rmean_7'] = data['Close'].rolling(7).mean().bfill()
data['rstd_7'] = data['Close'].rolling(7).std().bfill()
# Time features
data['dayofweek'] = data.index.dayofweek
data['month'] = data.index.month

# Forward fill any remaining NaNs
data = data.ffill().dropna()

TARGET = 'Close'
FEATURES = [c for c in data.columns if c != TARGET]
print('Features:', FEATURES)

# %% [markdown]
# 3) Train/Validation/Test split (time-based)

# %%
n = len(data)
train_end = int(n * 0.7)
val_end = int(n * 0.85)

train_df = data.iloc[:train_end]
val_df = data.iloc[train_end:val_end]
test_df = data.iloc[val_end:]

print(len(train_df), len(val_df), len(test_df))

# %% [markdown]
# 4) Scaling (fit scalers on the training split only, to avoid leaking test statistics)

# %%
scaler_X = StandardScaler()
scaler_y = StandardScaler()

scaler_X.fit(train_df[FEATURES])
scaler_y.fit(train_df[[TARGET]])

X_train = scaler_X.transform(train_df[FEATURES])
X_val = scaler_X.transform(val_df[FEATURES])
X_test = scaler_X.transform(test_df[FEATURES])

y_train = scaler_y.transform(train_df[[TARGET]]).flatten()
y_val = scaler_y.transform(val_df[[TARGET]]).flatten()
y_test = scaler_y.transform(test_df[[TARGET]]).flatten()

# %% [markdown]
# 5) Create sequences for supervised learning

# %%
LOOKBACK = 60   # how many past timesteps to use
HORIZON = 1     # forecast horizon (1-step ahead); can expand to multi-step

def create_sequences(X, y, lookback=LOOKBACK, horizon=HORIZON):
    # Slide a window of `lookback` steps over X; the target is the value
    # `horizon` steps after the end of each window.
    Xs, ys = [], []
    for i in range(lookback, len(X) - horizon + 1):
        Xs.append(X[i - lookback:i])
        ys.append(y[i + horizon - 1])
    return np.array(Xs), np.array(ys)

Xtr, ytr = create_sequences(X_train, y_train)
Xv, yv = create_sequences(X_val, y_val)
Xt, yt = create_sequences(X_test, y_test)

print('Shapes:', Xtr.shape, ytr.shape, Xv.shape, yv.shape, Xt.shape, yt.shape)
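
# A multi-step variant of the sequence builder (a sketch, not used below):
# return the next `horizon` values as a vector target instead of one scalar.
def create_sequences_multistep(X, y, lookback=LOOKBACK, horizon=3):
    Xs, ys = [], []
    for i in range(lookback, len(X) - horizon + 1):
        Xs.append(X[i - lookback:i])
        ys.append(y[i:i + horizon])
    return np.array(Xs), np.array(ys)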

# %% [markdown]
# 6) Build Quantile Regression LSTM

# %%
# Pinball (quantile) loss: L_q(y, f) = mean(max(q*(y - f), (q - 1)*(y - f)))
def quantile_loss(q, y, f):
    e = y - f
    return tf.reduce_mean(tf.maximum(q * e, (q - 1) * e), axis=-1)

# Model factory: shared LSTM trunk with one linear head per quantile
def build_lstm_quantile(input_shape, quantiles=(0.1, 0.5, 0.9), units=64, dropout_rate=0.2):
    inp = keras.Input(shape=input_shape)
    x = layers.LSTM(units, return_sequences=True)(inp)
    x = layers.Dropout(dropout_rate)(x)
    x = layers.LSTM(units // 2)(x)
    x = layers.Dropout(dropout_rate)(x)
    outputs = [layers.Dense(1, name=f'q_{int(q*100)}')(x) for q in quantiles]
    model = keras.Model(inp, outputs)
    return model

quantiles = [0.1, 0.5, 0.9]
model_q = build_lstm_quantile(Xtr.shape[1:], quantiles=quantiles, units=64, dropout_rate=0.2)

# Compile with one pinball loss per output head; the immediately-invoked outer
# lambda binds each q to its own loss function.
losses = {f'q_{int(q*100)}': (lambda q: (lambda y, f: quantile_loss(q, y, f)))(q) for q in quantiles}
model_q.compile(optimizer='adam', loss=losses)
model_q.summary()

# %% [markdown]
# 7) Train Quantile model

# %%
# Prepare targets as a dict keyed by output-head name
train_targets = {f'q_{int(q*100)}': ytr for q in quantiles}
val_targets = {f'q_{int(q*100)}': yv for q in quantiles}

EPOCHS = 30
BATCH = 32

history = model_q.fit(Xtr, train_targets, validation_data=(Xv, val_targets),
                      epochs=EPOCHS, batch_size=BATCH)
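
# Optional (a sketch): stop early on validation loss rather than always running
# all EPOCHS; the patience value is an arbitrary choice.
# es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# history = model_q.fit(Xtr, train_targets, validation_data=(Xv, val_targets),
#                       epochs=EPOCHS, batch_size=BATCH, callbacks=[es])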

# %% [markdown]
# 8) Predict and compute intervals

# %%
# Predict (returns a list of arrays, one per output head, in output order)
preds = model_q.predict(Xt)
# preds is a list of length 3, each array of shape (n_samples, 1)
q_preds = np.hstack(preds)  # shape (n_samples, 3)
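
# Independently trained quantile heads can "cross" (e.g. q_10 > q_50 on some
# samples); sorting each row is a common post-hoc fix, added here as a safeguard.
q_preds = np.sort(q_preds, axis=1)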

# Inverse transform the scaled predictions and targets
q_preds_inv = scaler_y.inverse_transform(q_preds)
yt_inv = scaler_y.inverse_transform(yt.reshape(-1, 1)).flatten()

# Build dataframe for evaluation
pred_df = pd.DataFrame({
    'y_true': yt_inv,
    'q_10': q_preds_inv[:, 0],
    'q_50': q_preds_inv[:, 1],
    'q_90': q_preds_inv[:, 2],
})

# Metrics for the point forecast (the median, q_50)
rmse_q = math.sqrt(mean_squared_error(pred_df['y_true'], pred_df['q_50']))
mae_q = mean_absolute_error(pred_df['y_true'], pred_df['q_50'])
print('Quantile-LSTM (median) RMSE:', rmse_q, 'MAE:', mae_q)

# Coverage: fraction of true values inside [q_10, q_90] (nominal coverage 80%)
coverage = ((pred_df['y_true'] >= pred_df['q_10']) & (pred_df['y_true'] <= pred_df['q_90'])).mean()
print('Empirical coverage (10-90):', coverage)
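
# Sharpness companion to coverage (an added diagnostic): the mean width of the
# 10-90 interval; narrower is better at equal coverage.
print('Mean 10-90 interval width:', (pred_df['q_90'] - pred_df['q_10']).mean())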

# %% [markdown]
# 9) Plot results

# %%
plt.figure(figsize=(12, 5))
plt.plot(pred_df['y_true'][-200:], label='True')
plt.plot(pred_df['q_50'][-200:], label='Median Forecast')
plt.fill_between(pred_df.index[-200:], pred_df['q_10'][-200:], pred_df['q_90'][-200:],
                 alpha=0.3, label='10-90 PI')
plt.legend()
plt.title('Quantile-LSTM Forecast (test subset)')
plt.show()

# %% [markdown]
# 10) LSTM with Monte Carlo Dropout
# We keep the Dropout layers active at inference time by calling the model with
# `training=True`, so repeated predictions differ and their spread approximates
# predictive uncertainty.

# %%
def build_lstm_mc(input_shape, units=64, dropout_rate=0.2):
    inp = keras.Input(shape=input_shape)
    x = layers.LSTM(units, return_sequences=True)(inp)
    x = layers.Dropout(dropout_rate)(x)
    x = layers.LSTM(units // 2)(x)
    x = layers.Dropout(dropout_rate)(x)
    out = layers.Dense(1)(x)
    model = keras.Model(inp, out)
    return model

model_mc = build_lstm_mc(Xtr.shape[1:], units=64, dropout_rate=0.2)
model_mc.compile(optimizer='adam', loss='mse')
model_mc.summary()

# Train
history_mc = model_mc.fit(Xtr, ytr, validation_data=(Xv, yv), epochs=EPOCHS, batch_size=BATCH)

# Monte Carlo sampling function
def mc_dropout_predict(model, X, n_samples=100):
    # Run predictions with dropout active (training=True) and stack the draws
    f_preds = []
    for _ in range(n_samples):
        preds_i = model(X, training=True).numpy().flatten()
        f_preds.append(preds_i)
    f_preds = np.array(f_preds)  # shape (n_samples, n_points)
    mean_pred = f_preds.mean(axis=0)
    std_pred = f_preds.std(axis=0)
    return mean_pred, std_pred, f_preds

mc_mean_scaled, mc_std_scaled, mc_all = mc_dropout_predict(model_mc, Xt, n_samples=100)
mc_mean = scaler_y.inverse_transform(mc_mean_scaled.reshape(-1, 1)).flatten()
mc_std = mc_std_scaled * scaler_y.scale_[0]  # undo the StandardScaler on the std

# 95% interval under a Gaussian approximation of the MC samples
mc_lower = mc_mean - 1.96 * mc_std
mc_upper = mc_mean + 1.96 * mc_std

# Evaluate
rmse_mc = math.sqrt(mean_squared_error(yt_inv, mc_mean))
mae_mc = mean_absolute_error(yt_inv, mc_mean)
coverage_mc = ((yt_inv >= mc_lower) & (yt_inv <= mc_upper)).mean()
print('MC Dropout RMSE:', rmse_mc, 'MAE:', mae_mc, 'Coverage (95%):', coverage_mc)

# Plot subset
plt.figure(figsize=(12, 5))
plt.plot(yt_inv[-200:], label='True')
plt.plot(mc_mean[-200:], label='MC Mean')
plt.fill_between(range(len(mc_mean[-200:])), mc_lower[-200:], mc_upper[-200:],
                 alpha=0.2, label='MC 95% PI')
plt.legend()
plt.title('LSTM + MC Dropout Forecast (test subset)')
plt.show()

# %% [markdown]
# 11) Baseline: ARIMA (pmdarima auto_arima) on the Close series (univariate)

# %%
close_series = data[TARGET]
train_close = close_series.iloc[:train_end]
val_close = close_series.iloc[train_end:val_end]
test_close = close_series.iloc[val_end:]

# Fit auto_arima on train to select the order
arima = pm.auto_arima(train_close, seasonal=False, stepwise=True,
                      suppress_warnings=True, error_action='ignore')
print('ARIMA order:', arima.order)

# Forecast over the test horizon (one-step rolling forecast, for fairness with
# the one-step LSTMs; refitting every step is slow, see the sketch below)
history_arima = list(train_close)
preds_arima = []
for t in range(len(test_close)):
    model = pm.ARIMA(order=arima.order).fit(np.array(history_arima))
    yhat = model.predict(n_periods=1)[0]
    preds_arima.append(yhat)
    history_arima.append(test_close.iloc[t])

rmse_arima = math.sqrt(mean_squared_error(test_close.values, preds_arima))
mae_arima = mean_absolute_error(test_close.values, preds_arima)
print('ARIMA RMSE:', rmse_arima, 'MAE:', mae_arima)
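
# Faster rolling forecast (a sketch using pmdarima's update()): reuse the
# fitted model and append each new observation instead of refitting every step.
# preds_fast = []
# for t in range(len(test_close)):
#     preds_fast.append(arima.predict(n_periods=1)[0])
#     arima.update([test_close.iloc[t]])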

# %% [markdown]
# 12) Baseline: Prophet (optional); requires 'ds' and 'y' columns

# %%
prophet_df = pd.DataFrame({'ds': close_series.index, 'y': close_series.values})
train_prophet = prophet_df.iloc[:train_end]
model_prophet = Prophet()
model_prophet.fit(train_prophet)

# Predict directly on the test dates: make_future_dataframe(freq='D') counts
# calendar days from the end of the training data, so its dates would not
# line up with the trading-day test index.
future = pd.DataFrame({'ds': test_close.index})
forecast_prophet = model_prophet.predict(future)
fc_prophet = forecast_prophet['yhat'].values

rmse_prophet = math.sqrt(mean_squared_error(test_close.values, fc_prophet))
mae_prophet = mean_absolute_error(test_close.values, fc_prophet)
print('Prophet RMSE:', rmse_prophet, 'MAE:', mae_prophet)

# %% [markdown]
# 13) Collect results and compare

# %%
results = pd.DataFrame({
    'model': ['Quantile-LSTM (median)', 'LSTM MC Dropout (mean)', 'ARIMA', 'Prophet'],
    'RMSE': [rmse_q, rmse_mc, rmse_arima, rmse_prophet],
    'MAE': [mae_q, mae_mc, mae_arima, mae_prophet]
})
print(results)

# %% [markdown]
# 14) Save models and artifacts

# %%
os.makedirs('models', exist_ok=True)
# Keras 3 expects a .keras (or .h5) extension when saving a whole model
model_q.save('models/quantile_lstm.keras')
model_mc.save('models/lstm_mc.keras')

# Save scalers
import joblib
joblib.dump(scaler_X, 'models/scaler_X.pkl')
joblib.dump(scaler_y, 'models/scaler_y.pkl')
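
# Reloading later (a sketch): the quantile model was compiled with custom
# losses, so load it with compile=False for inference-only use.
# model_q2 = keras.models.load_model('models/quantile_lstm.keras', compile=False)
# scaler_y2 = joblib.load('models/scaler_y.pkl')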

# %% [markdown]
# 15) Notes & next steps
# - For probabilistic scoring you can compute CRPS (requires the properscoring
#   package); a commented sketch follows below.
# - For multi-step forecasts, increase HORIZON and adapt sequence creation
#   accordingly (see create_sequences_multistep above).
# - To speed up training, use a GPU and reduce EPOCHS for quick debugging.
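
# %%
# CRPS from the MC dropout samples (a sketch; assumes properscoring is installed):
# import properscoring as ps
# mc_all_inv = scaler_y.inverse_transform(mc_all.reshape(-1, 1)).reshape(mc_all.shape)
# crps = ps.crps_ensemble(yt_inv, mc_all_inv.T).mean()  # forecasts axis: (n_points, n_samples)
# print('MC Dropout CRPS:', crps)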

print('Script finished. Review plots and results.')
