(Semi)automatic model selection

betop316 · (This post was last modified: Mar-24-2025, 07:26 PM by buran.)

I'm developing a code that chooses the best model out of 4 options for doing forecasts of commercial products after performing a preliminary test section to understand the behaviour of the data and classify each product in its best forecasting model. Nonetheless, I'm having trouble to store or generate the forecasts and RMSE for further analysis.

CODE:

#################################################################################
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarima import SARIMA
from statsmodels.tsa.api import VAR
#from croston import croston
#import xlsxwriter
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from prophet import Prophet
from pmdarima import auto_arima




file_path = "/Users/beto/Desktop/df_usd.xlsx"
df = pd.read_excel(file_path)

df.head()

df.info()

df = df.set_index("FECHA")

missing_values = df.isnull().sum().sum()



# Summary statistics for the distribution of the data
#summary_stats = df.describe()

#missing_values, summary_stats


# output_file = "/Users/beto/Desktop/summary_statistics.xlsx"
# with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
#    summary_stats.to_excel(writer, sheet_name="Summary Statistics")
#    pd.DataFrame({"Missing Values": [missing_values]}).to_excel(writer, sheet_name="Missing Values")

#output_file

##############################################################################
################################ Preliminary Tests ################################### 
##############################################################################

# Identify variable categories for model selection

# 1.Intermittent Demand - Croston's 
zero_counts = (df.iloc[:, 1:] == 0).sum()  # Count of zero values per column
intermittent_threshold = 0.5  # More than 50% zeros is considered intermittent
intermittent_vars = zero_counts[zero_counts / len(df) > intermittent_threshold].index.tolist()


# 2. ARIMA / ETS candidates
continuous_vars = zero_counts[zero_counts / len(df) <= intermittent_threshold].index.tolist()


# 3. Checking stationarity for continuous variables)

stationary_vars = []
non_stationary_vars = []

for var in continuous_vars:
    result = adfuller(df[var].dropna())
    if result[1] <= 0.05:  # If p-value <= 0.05, it's stationary
        stationary_vars.append(var)
    else:
        non_stationary_vars.append(var)

# 4. Final model recommendations
model_recommendations = {
#    "Intermittent (Croston's)": intermittent_vars,
    "ARIMA/ETS (Continuous & Non-Stationary)": non_stationary_vars,
    "VAR/BVAR (Stationary Variables)": stationary_vars,
}

#print(model_recommendations)

model_recommendations = pd.DataFrame.from_dict(model_recommendations, orient="index")
model_recommendations = model_recommendations.fillna(0)
model_recommendations = model_recommendations.T

# output_model_file = "/Users/beto/Desktop/model_recommendations.xlsx"
# with pd.ExcelWriter(output_model_file, engine="xlsxwriter") as writer:
#    model_recommendations.to_excel(writer, sheet_name="Model classification")

# output_model_file



##############################################################################
################################ Modeling Set up ################################### 
##############################################################################

def create_lagged_features(series, lags=12):
    df = pd.DataFrame({'y': series})
    for lag in range(1, lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag)
    df.dropna(inplace=True)
    return df



# Load model recommendations
croston_vars = intermittent_vars
arima_vars = non_stationary_vars  
var_vars = stationary_vars  


arima_errors = {}
var_errors = {}
forecast_results = {}
rmse_results = {}
forecasts = {}  
rmse_scores = {}  


test_size = 12
lags = 12
forecast_horizon=12


# Generate forecast dates
forecast_dates = pd.date_range(
    start=df.index[-1] + pd.DateOffset(months=1), 
    periods=forecast_horizon, 
    freq='ME'
)

##############################################################################
################################ Implementation ################################### 
##############################################################################

# --- 1. Modeling set-up ---

# Iterate over each product in the dataset
for product in df.columns:
    print(f"\n🔹 Processing: {product}")

    product_data = df[product].dropna()

    train = product_data.iloc[:-12]
    test = product_data.iloc[-12:]

    # ✅ SARIMA
    if product in arima_vars:
        try:
            sarima_model = auto_arima(
                train, 
                seasonal=True, 
                m=12, 
                stepwise=True, 
                suppress_warnings=True)
            sarima_forecast = sarima_model.predict(n_periods=12)
            rmse_sarima = np.sqrt(mean_squared_error(test, sarima_forecast))
            
            rmse_scores["SARIMA"] = rmse_sarima
            forecasts["SARIMA"] = sarima_forecast
            
        except Exception as e:
            print(f"⚠️ SARIMA failed for {product}: {e}")

    # ✅ Holt-Winters (for highly seasonal products)
    try:
        
        hw_model = ExponentialSmoothing(train, seasonal='add', seasonal_periods=12).fit()
        hw_forecast = hw_model.forecast(12)
        
        
        rmse_scores["Holt-Winters"] = np.sqrt(mean_squared_error(test, hw_forecast))
        forecasts["Holt-Winters"] = hw_forecast
    except Exception as e:
        print(f"⚠️ Holt-Winters failed for {product}: {e}")

    # ✅ XGBoost (for volatile, non-seasonal products)
    try:
        
        product_series = df[product]  

        df_lagged = create_lagged_features(product_series, lags=12)
        
        # Split into features (X) and target (y)
        X = df_lagged.drop(columns='y').values  # Features: lag_1, lag_2, ..., lag_12
        y = df_lagged['y'].values               # Target: current value
        
        # Train/test split (last 12 months for testing)
        X_train, X_test = X[:-forecast_horizon], X[-forecast_horizon:]
        y_train, y_test = y[:-forecast_horizon], y[-forecast_horizon:]
        
        # Train XGBoost
        model = XGBRegressor(
            objective='reg:squarederror',
            n_estimators=200,
            learning_rate=0.05,
            random_state=42
        )
        
        model.fit(X_train, y_train)
        
        # --- Recursive Forecasting ---
        # Initialize with the last observed window of data
        last_window = df_lagged.drop(columns='y').iloc[-1].values
        
        forecast = []
        for _ in range(forecast_horizon):
            # Predict next step
            next_pred = model.predict(last_window.reshape(1, -1))[0]
            forecast.append(next_pred)
            
            last_window = np.roll(last_window, -1)          # Shift lags left
            last_window[-1] = next_pred                     # Add new prediction as latest lag
        
        y_pred_test = model.predict(X_test)
        rmse_xgboost = np.sqrt(mean_squared_error(y_test, y_pred_test))
        
        forecasts["XGBoost"] = y_pred_test
        rmse_scores["XGBoost"] = rmse_xgboost
            
    except Exception as e:
        print(f"⚠️ Error: {str(e)}")

    # ✅ Prophet (for trend-driven products)
    try:
        df_prophet = train.reset_index().rename(columns={'FECHA': 'ds', product: 'y'})
        prophet_model = Prophet()
        prophet_model.fit(df_prophet)

        future = prophet_model.make_future_dataframe(periods=12, freq='ME')
        prophet_forecast = prophet_model.predict(future)['yhat'][-12:].values

        rmse_scores["Prophet"] = np.sqrt(mean_squared_error(test, prophet_forecast))
        forecasts["Prophet"] = prophet_forecast
    
    except Exception as e:
        print(f"⚠️ Prophet failed for {product}: {e}")

    
    forecast_results[product] = forecasts
    rmse_results[product] = rmse_scores

# RMSE & Forecasts to DataFrame
rmse_df = pd.DataFrame(rmse_results).T  # Transpose for readability
forecast_df = pd.DataFrame().T

forecast_index = pd.date_range(start=df.index[-1] + pd.DateOffset(months=1), periods=12, freq='ME')

for product, model_forecasts in forecast_results.items():
    for model_name, forecast_values in model_forecasts.items():
        temp_df = pd.DataFrame({
            (product, model_name): forecast_values
        }, index=forecast_index)
        forecast_df = pd.concat([forecast_df, temp_df], axis=1)

forecast_df = forecast_df.sort_index(axis=1)

rmse_df = rmse_df[rmse_df.lt(10).any(axis=1)]

holt_winters_rmse_df = pd.DataFrame()
xgboost_rmse_df = pd.DataFrame()
prophet_rmse_df = pd.DataFrame()
sarima_rmse_df = pd.DataFrame()

min_methods = rmse_df[['Holt-Winters', 'XGBoost', 'Prophet', 'SARIMA']].idxmin(axis=1)

holt_winters_rmse_df = rmse_df[min_methods == 'Holt-Winters']
xgboost_rmse_df = rmse_df[min_methods == 'XGBoost']
prophet_rmse_df = rmse_df[min_methods == 'Prophet']
sarima_rmse_df = rmse_df[min_methods == 'SARIMA']

output_path = "/Users/beto/Desktop/rmse_models_comparison.xlsx"
with pd.ExcelWriter(output_path) as writer:
    holt_winters_rmse_df.to_excel(writer, sheet_name='Holt-Winters')
    xgboost_rmse_df.to_excel(writer, sheet_name='XGBoost')
    prophet_rmse_df.to_excel(writer, sheet_name='Prophet')
    sarima_rmse_df.to_excel(writer, sheet_name='SARIMA')

output_path

buran write Mar-24-2025, 07:26 PM:
Please, use proper tags when post code, traceback, output, etc. This time I have added tags for you.
See BBcode help for more info.

**Larz60+** · Mar-25-2025, 04:57 PM

you can use to_pckle to save a DataFrame, and read_pickle to read from a savef file

# to save model_recomendations
# uncomment output file path
    model_recommendations.to_pickle(output_file)

to load back at a leter time:

    model_recommendations = pd.read_pickel(output_file)

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Structuring a semi structured dataset	klllmmm	1	3,696	Apr-16-2017, 11:53 AM Last Post: zivoni

(Semi)automatic model selection

User Panel Messages

Announcements