I'm developing a code that chooses the best model out of 4 options for doing forecasts of commercial products after performing a preliminary test section to understand the behaviour of the data and classify each product in its best forecasting model. Nonetheless, I'm having trouble to store or generate the forecasts and RMSE for further analysis.
CODE:
CODE:
################################################################################# import pandas as pd import numpy as np from statsmodels.tsa.stattools import adfuller from statsmodels.tsa.statespace.sarima import SARIMA from statsmodels.tsa.api import VAR #from croston import croston #import xlsxwriter from sklearn.metrics import mean_absolute_error, mean_squared_error from sklearn.model_selection import train_test_split from xgboost import XGBRegressor from statsmodels.tsa.holtwinters import ExponentialSmoothing from prophet import Prophet from pmdarima import auto_arima file_path = "/Users/beto/Desktop/df_usd.xlsx" df = pd.read_excel(file_path) df.head() df.info() df = df.set_index("FECHA") missing_values = df.isnull().sum().sum() # Summary statistics for the distribution of the data #summary_stats = df.describe() #missing_values, summary_stats # output_file = "/Users/beto/Desktop/summary_statistics.xlsx" # with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer: # summary_stats.to_excel(writer, sheet_name="Summary Statistics") # pd.DataFrame({"Missing Values": [missing_values]}).to_excel(writer, sheet_name="Missing Values") #output_file ############################################################################## ################################ Preliminary Tests ################################### ############################################################################## # Identify variable categories for model selection # 1.Intermittent Demand - Croston's zero_counts = (df.iloc[:, 1:] == 0).sum() # Count of zero values per column intermittent_threshold = 0.5 # More than 50% zeros is considered intermittent intermittent_vars = zero_counts[zero_counts / len(df) > intermittent_threshold].index.tolist() # 2. ARIMA / ETS candidates continuous_vars = zero_counts[zero_counts / len(df) <= intermittent_threshold].index.tolist() # 3. Checking stationarity for continuous variables) stationary_vars = [] non_stationary_vars = [] for var in continuous_vars: result = adfuller(df[var].dropna()) if result[1] <= 0.05: # If p-value <= 0.05, it's stationary stationary_vars.append(var) else: non_stationary_vars.append(var) # 4. Final model recommendations model_recommendations = { # "Intermittent (Croston's)": intermittent_vars, "ARIMA/ETS (Continuous & Non-Stationary)": non_stationary_vars, "VAR/BVAR (Stationary Variables)": stationary_vars, } #print(model_recommendations) model_recommendations = pd.DataFrame.from_dict(model_recommendations, orient="index") model_recommendations = model_recommendations.fillna(0) model_recommendations = model_recommendations.T # output_model_file = "/Users/beto/Desktop/model_recommendations.xlsx" # with pd.ExcelWriter(output_model_file, engine="xlsxwriter") as writer: # model_recommendations.to_excel(writer, sheet_name="Model classification") # output_model_file ############################################################################## ################################ Modeling Set up ################################### ############################################################################## def create_lagged_features(series, lags=12): df = pd.DataFrame({'y': series}) for lag in range(1, lags + 1): df[f'lag_{lag}'] = df['y'].shift(lag) df.dropna(inplace=True) return df # Load model recommendations croston_vars = intermittent_vars arima_vars = non_stationary_vars var_vars = stationary_vars arima_errors = {} var_errors = {} forecast_results = {} rmse_results = {} forecasts = {} rmse_scores = {} test_size = 12 lags = 12 forecast_horizon=12 # Generate forecast dates forecast_dates = pd.date_range( start=df.index[-1] + pd.DateOffset(months=1), periods=forecast_horizon, freq='ME' ) ############################################################################## ################################ Implementation ################################### ############################################################################## # --- 1. Modeling set-up --- # Iterate over each product in the dataset for product in df.columns: print(f"\n🔹 Processing: {product}") product_data = df[product].dropna() train = product_data.iloc[:-12] test = product_data.iloc[-12:] # ✅ SARIMA if product in arima_vars: try: sarima_model = auto_arima( train, seasonal=True, m=12, stepwise=True, suppress_warnings=True) sarima_forecast = sarima_model.predict(n_periods=12) rmse_sarima = np.sqrt(mean_squared_error(test, sarima_forecast)) rmse_scores["SARIMA"] = rmse_sarima forecasts["SARIMA"] = sarima_forecast except Exception as e: print(f"⚠️ SARIMA failed for {product}: {e}") # ✅ Holt-Winters (for highly seasonal products) try: hw_model = ExponentialSmoothing(train, seasonal='add', seasonal_periods=12).fit() hw_forecast = hw_model.forecast(12) rmse_scores["Holt-Winters"] = np.sqrt(mean_squared_error(test, hw_forecast)) forecasts["Holt-Winters"] = hw_forecast except Exception as e: print(f"⚠️ Holt-Winters failed for {product}: {e}") # ✅ XGBoost (for volatile, non-seasonal products) try: product_series = df[product] df_lagged = create_lagged_features(product_series, lags=12) # Split into features (X) and target (y) X = df_lagged.drop(columns='y').values # Features: lag_1, lag_2, ..., lag_12 y = df_lagged['y'].values # Target: current value # Train/test split (last 12 months for testing) X_train, X_test = X[:-forecast_horizon], X[-forecast_horizon:] y_train, y_test = y[:-forecast_horizon], y[-forecast_horizon:] # Train XGBoost model = XGBRegressor( objective='reg:squarederror', n_estimators=200, learning_rate=0.05, random_state=42 ) model.fit(X_train, y_train) # --- Recursive Forecasting --- # Initialize with the last observed window of data last_window = df_lagged.drop(columns='y').iloc[-1].values forecast = [] for _ in range(forecast_horizon): # Predict next step next_pred = model.predict(last_window.reshape(1, -1))[0] forecast.append(next_pred) last_window = np.roll(last_window, -1) # Shift lags left last_window[-1] = next_pred # Add new prediction as latest lag y_pred_test = model.predict(X_test) rmse_xgboost = np.sqrt(mean_squared_error(y_test, y_pred_test)) forecasts["XGBoost"] = y_pred_test rmse_scores["XGBoost"] = rmse_xgboost except Exception as e: print(f"⚠️ Error: {str(e)}") # ✅ Prophet (for trend-driven products) try: df_prophet = train.reset_index().rename(columns={'FECHA': 'ds', product: 'y'}) prophet_model = Prophet() prophet_model.fit(df_prophet) future = prophet_model.make_future_dataframe(periods=12, freq='ME') prophet_forecast = prophet_model.predict(future)['yhat'][-12:].values rmse_scores["Prophet"] = np.sqrt(mean_squared_error(test, prophet_forecast)) forecasts["Prophet"] = prophet_forecast except Exception as e: print(f"⚠️ Prophet failed for {product}: {e}") forecast_results[product] = forecasts rmse_results[product] = rmse_scores # RMSE & Forecasts to DataFrame rmse_df = pd.DataFrame(rmse_results).T # Transpose for readability forecast_df = pd.DataFrame().T forecast_index = pd.date_range(start=df.index[-1] + pd.DateOffset(months=1), periods=12, freq='ME') for product, model_forecasts in forecast_results.items(): for model_name, forecast_values in model_forecasts.items(): temp_df = pd.DataFrame({ (product, model_name): forecast_values }, index=forecast_index) forecast_df = pd.concat([forecast_df, temp_df], axis=1) forecast_df = forecast_df.sort_index(axis=1) rmse_df = rmse_df[rmse_df.lt(10).any(axis=1)] holt_winters_rmse_df = pd.DataFrame() xgboost_rmse_df = pd.DataFrame() prophet_rmse_df = pd.DataFrame() sarima_rmse_df = pd.DataFrame() min_methods = rmse_df[['Holt-Winters', 'XGBoost', 'Prophet', 'SARIMA']].idxmin(axis=1) holt_winters_rmse_df = rmse_df[min_methods == 'Holt-Winters'] xgboost_rmse_df = rmse_df[min_methods == 'XGBoost'] prophet_rmse_df = rmse_df[min_methods == 'Prophet'] sarima_rmse_df = rmse_df[min_methods == 'SARIMA'] output_path = "/Users/beto/Desktop/rmse_models_comparison.xlsx" with pd.ExcelWriter(output_path) as writer: holt_winters_rmse_df.to_excel(writer, sheet_name='Holt-Winters') xgboost_rmse_df.to_excel(writer, sheet_name='XGBoost') prophet_rmse_df.to_excel(writer, sheet_name='Prophet') sarima_rmse_df.to_excel(writer, sheet_name='SARIMA') output_path
buran write Mar-24-2025, 07:26 PM:
Please, use proper tags when post code, traceback, output, etc. This time I have added tags for you.
See BBcode help for more info.
Please, use proper tags when post code, traceback, output, etc. This time I have added tags for you.
See BBcode help for more info.