In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, RidgeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor, StackingRegressor, StackingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import make_scorer, f1_score
from sklearn.base import RegressorMixin, ClassifierMixin, BaseEstimator
from sklearn.ensemble import VotingClassifier, VotingRegressor

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

Data Collection¶

In [4]:
import requests

# URLs of the files
data_train_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_train.csv'
data_test_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_test.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(data_train_url, 'module6_course_train.csv')
download_file(data_train_url, 'module6_course_test.csv')
Downloaded module6_course_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_train.csv
Downloaded module6_course_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_train.csv
In [5]:
data_train = pd.read_csv('module6_course_train.csv')
data_test = pd.read_csv('module6_course_test.csv')

Data Analysis¶

In [6]:
data_train
Out[6]:
Temperature Humidity Humex CO2 Bright weekday_0 weekday_1 weekday_2 weekday_3 weekday_4 weekday_5 weekday_6 hour_sine_wave Score
0 -0.151174 2.695116 1.175429 -0.258951 -0.528247 False False False True False False False -0.129410 3
1 -0.089558 2.573765 1.175429 -0.397135 -0.528247 False False False False True False False 0.000000 3
2 -0.027943 2.573765 1.252984 -0.046741 -0.528247 False False False False True False False 0.129410 3
3 0.033673 2.573765 1.330538 0.214821 -0.528247 False False False False True False False 0.250000 1
4 0.033673 2.452414 1.252984 0.431967 -0.528247 False False False False True False False 0.433013 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6995 -1.260252 -1.066767 -1.500198 -0.984416 -0.007760 False False False False False False True -0.433013 1
6996 -1.321868 -1.066767 -1.577752 -1.028832 -0.189931 False False False False False False True -0.433013 0
6997 -1.445098 -0.945416 -1.616530 -0.930129 -0.320052 False False False False False False True -0.482963 0
6998 -1.568329 -0.945416 -1.694084 -1.004156 -0.515235 False False False False False False True -0.500000 1
6999 -1.691560 -0.945416 -1.810416 -0.984416 -0.528247 False False False False False False True -0.482963 1

7000 rows × 14 columns

In [5]:
data_train.isnull().sum()
Out[5]:
Temperature       0
Humidity          0
Humex             0
CO2               0
Bright            0
weekday_0         0
weekday_1         0
weekday_2         0
weekday_3         0
weekday_4         0
weekday_5         0
weekday_6         0
hour_sine_wave    0
Score             0
dtype: int64
In [6]:
data_train['Score'].value_counts(normalize=True)
Out[6]:
Score
2    0.420143
1    0.272429
3    0.232571
0    0.044857
4    0.030000
Name: proportion, dtype: float64

Model Building and Evaluate¶

In [7]:
y = data_train.pop('Score')
X = data_train.copy()
In [8]:
# Function to plot the evaluation results
def plot_results(mse_train, mse_test, f1_train, f1_test):
    plt.figure(figsize=(12, 6))

    # MSE plot
    plt.subplot(1, 2, 1)
    plt.plot(mse_train, label="Train MSE", marker='o')
    plt.plot(mse_test, label="Test MSE", marker='o')
    plt.fill_between(range(len(mse_train)), np.min(mse_train), np.max(mse_train), color='blue', alpha=0.1)
    plt.fill_between(range(len(mse_test)), np.min(mse_test), np.max(mse_test), color='orange', alpha=0.1)
    plt.title("MSE over Folds")
    plt.xlabel("Fold")
    plt.ylabel("MSE")
    plt.legend()
    plt.grid(True)

    # f1_score plot
    plt.subplot(1, 2, 2)
    plt.plot(f1_train, label="Train f1_score", marker='o')
    plt.plot(f1_test, label="Test f1_score", marker='o')
    plt.fill_between(range(len(f1_train)), np.min(f1_train), np.max(f1_train), color='blue', alpha=0.1)
    plt.fill_between(range(len(f1_test)), np.min(f1_test), np.max(f1_test), color='orange', alpha=0.1)
    plt.title("f1_score over Folds")
    plt.xlabel("Fold")
    plt.ylabel("f1_score")
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()

def plot_multi_model_results(results):
    # Set up the plot
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 20))
    
    # Colors for train and test
    train_color = 'skyblue'
    test_color = 'lightgreen'
    
    # Plot MSE
    ax1.set_title('Mean Squared Error (MSE) Comparison', fontsize=16)
    ax1.set_ylabel('MSE', fontsize=12)
    ax1.set_xlabel('Models', fontsize=12)
    ax1.grid(True, linestyle='--', alpha=0.7)
    
    # Plot f1_score
    ax2.set_title('f1_score Comparison', fontsize=16)
    ax2.set_ylabel('f1_score', fontsize=12)
    ax2.set_xlabel('Models', fontsize=12)
    ax2.grid(True, linestyle='--', alpha=0.7)
    
    x = np.arange(len(results))
    width = 0.35
    
    for i, (model_name, scores) in enumerate(results.items()):
        # MSE
        mse_train = scores['mse_train']
        mse_test = scores['mse_test']
        
        ax1.bar(x[i] - width/2, np.mean(mse_train), width, label='Train' if i == 0 else "", 
                color=train_color, alpha=0.7)
        ax1.bar(x[i] + width/2, np.mean(mse_test), width, label='Test' if i == 0 else "", 
                color=test_color, alpha=0.7)
        
        ax1.errorbar(x[i] - width/2, np.mean(mse_train), 
                     yerr=[[np.mean(mse_train)-np.min(mse_train)], [np.max(mse_train)-np.mean(mse_train)]], 
                     fmt='none', ecolor='black', capsize=5)
        ax1.errorbar(x[i] + width/2, np.mean(mse_test), 
                     yerr=[[np.mean(mse_test)-np.min(mse_test)], [np.max(mse_test)-np.mean(mse_test)]], 
                     fmt='none', ecolor='black', capsize=5)
        
        # f1_score
        f1_train = scores['f1_train']
        f1_test = scores['f1_test']
        
        ax2.bar(x[i] - width/2, np.mean(f1_train), width, label='Train' if i == 0 else "", 
                color=train_color, alpha=0.7)
        ax2.bar(x[i] + width/2, np.mean(f1_test), width, label='Test' if i == 0 else "", 
                color=test_color, alpha=0.7)
        
        ax2.errorbar(x[i] - width/2, np.mean(f1_train), 
                     yerr=[[np.mean(f1_train)-np.min(f1_train)], [np.max(f1_train)-np.mean(f1_train)]], 
                     fmt='none', ecolor='black', capsize=5)
        ax2.errorbar(x[i] + width/2, np.mean(f1_test), 
                     yerr=[[np.mean(f1_test)-np.min(f1_test)], [np.max(f1_test)-np.mean(f1_test)]], 
                     fmt='none', ecolor='black', capsize=5)
    
    ax1.set_xticks(x)
    ax1.set_xticklabels(results.keys(), rotation=45, ha='right')
    ax2.set_xticks(x)
    ax2.set_xticklabels(results.keys(), rotation=45, ha='right')
    
    ax1.legend(loc='upper left')
    ax2.legend(loc='upper left')
    
    plt.tight_layout()
    plt.show()
In [9]:
def cast_and_clip_predictions(y_pred):
    # Round the predictions to the nearest integer
    y_pred_int = np.round(y_pred).astype(int)
    
    # Clip the predictions to be within the range of y_true
    y_pred_int = np.clip(y_pred_int, np.min(0), np.max(4))
    
    return y_pred_int

# Custom scorer that incorporates casting and clipping
def custom_f1_score(y_true, y_pred):
    y_pred_cast = cast_and_clip_predictions(y_pred)
    return f1_score(y_true, y_pred_cast, average="weighted")
In [10]:
# Function to handle train-test evaluation in a fold
def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on train set
    y_pred_train = model.predict(X_train)
    # Make predictions on train set
    y_pred_test = model.predict(X_test)
    
    # Compute MSE for train and test
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    
    # Compute f1_score
    
    f1_train = custom_f1_score(y_train, y_pred_train)
    f1_test = custom_f1_score(y_test, y_pred_test)
    
    return mse_train, mse_test, f1_train, f1_test


def run_multi_model_cv(X, y, models, n_splits=5):
    sfold = StratifiedKFold(n_splits=n_splits)
    results = {name: {'mse_train': [], 'mse_test': [], 'f1_train': [], 'f1_test': []} 
               for name in models.keys()}
    
    for train_index, test_index in sfold.split(X, y):
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()
        
        for name, model in models.items():
            mse_train, mse_test, f1_train, f1_test = train_and_evaluate(
                X_train, X_test, y_train, y_test, model
            )
            results[name]['mse_train'].append(mse_train)
            results[name]['mse_test'].append(mse_test)
            results[name]['f1_train'].append(f1_train)
            results[name]['f1_test'].append(f1_test)
    
    # Find the model with the best mean F1 test score
    best_mean_f1_score = -1
    best_model = None
    best_min_f1 = None
    best_max_f1 = None
    
    for name, result in results.items():
        f1_test_scores = result['f1_test']
        mean_f1_test = sum(f1_test_scores) / len(f1_test_scores)  # Calculate mean F1 score
        min_f1_test = min(f1_test_scores)  # Minimum F1 score
        max_f1_test = max(f1_test_scores)  # Maximum F1 score
        
        if mean_f1_test > best_mean_f1_score:
            best_mean_f1_score = mean_f1_test
            best_min_f1 = min_f1_test
            best_max_f1 = max_f1_test
            best_model = name
    
    # Print the best mean F1 test score, min, max, and the associated model
    print(f"Best mean F1 test score: {best_mean_f1_score:.4f} by model: {best_model}")
    print(f"Min F1 test score: {best_min_f1:.4f}, Max F1 test score: {best_max_f1:.4f}")
    
    return results

Simple Baseline¶

In [11]:
# Step 1: Initialize model

model = RandomForestClassifier()
    
# Step 2: Run cross-validation
results = run_multi_model_cv(X, y, {"RandomForestClassifier": model})
    
# Step 3: Plot the results
plot_results(results["RandomForestClassifier"]["mse_train"],
             results["RandomForestClassifier"]["mse_test"],
             results["RandomForestClassifier"]["f1_train"],
             results["RandomForestClassifier"]["f1_test"])
Best mean F1 test score: 0.3970 by model: RandomForestClassifier
Min F1 test score: 0.3733, Max F1 test score: 0.4264
No description has been provided for this image

Compare different models¶

In [12]:
models = {
    'Ridge': Ridge(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'SVR': SVR(),
    'Lasso': Lasso(max_iter=5000),
    'KNN Regressor': KNeighborsRegressor(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'SVC': SVC(),
    'KNN Classifier': KNeighborsClassifier(),
    'XGBRegressor': XGBRegressor(),
    'LGBMRegressor': LGBMRegressor(verbose=-1)
}
In [13]:
# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)
Best mean F1 test score: 0.6188 by model: LGBMRegressor
Min F1 test score: 0.5145, Max F1 test score: 0.7604
In [14]:
# Plot MSE results for regression models
plot_multi_model_results(results)
No description has been provided for this image

Optimize models¶

In [15]:
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from skopt.callbacks import DeltaYStopper

# Define the search spaces for each model
spaces = {
    'RandomForestRegressor': {
        'n_estimators': Integer(10, 500),
        'max_depth': Integer(1, 50),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 20)
    },
    'XGBRegressor': {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(1, 50),
        'learning_rate': Real(0.01, 2.0, 'log-uniform'),
        'subsample': Real(0.5, 1.0, 'uniform'),
        'colsample_bytree': Real(0.5, 1.0, 'uniform')
    },
    'LGBMRegressor': {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(1, 50),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'num_leaves': Integer(20, 300),
        'min_child_samples': Integer(1, 100)
    },
    'SVR': {
        'C': Real(0.1, 10.0, 'log-uniform'),
        'epsilon': Real(0.001, 1.0, 'log-uniform'),
        'kernel': Categorical(['linear', 'rbf', 'poly'])
    },
    'KNNRegressor': {
        'n_neighbors': Integer(1, 50),
        'weights': Categorical(['uniform', 'distance']),
        'p': Integer(1, 2)
    },
    'Lasso': {
        'alpha': Real(0.0001, 10.0, 'log-uniform')
    },
    'Ridge': {
        'alpha': Real(0.01, 10.0, 'log-uniform'),
    },
    'LogisticRegression': {
        'C': Real(0.01, 10.0, 'log-uniform'),
        'solver': Categorical(['lbfgs', 'liblinear'])
    },
    'RandomForestClassifier': {
        'n_estimators': Integer(10, 500),
        'max_depth': Integer(1, 50),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 20)
    },
    'XGBClassifier': {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(1, 50),
        'learning_rate': Real(0.01, 2.0, 'log-uniform'),
        'subsample': Real(0.5, 1.0, 'uniform'),
        'colsample_bytree': Real(0.5, 1.0, 'uniform')
    }
}

def optimizer_callback(res):
    if len(res.func_vals) % 5 == 0:  # Print every 5 iterations
        print(f"Iteration {len(res.func_vals)}: Best score = {-res.fun:.4f}")

delta_stopper = DeltaYStopper(delta=0.001, n_best=10)

# Function to optimize models
def optimize_model(X, y, model, space, n_iter=200):
    sfold = StratifiedKFold(n_splits=5)
    
    scorer = make_scorer(custom_f1_score)
    
    opt = BayesSearchCV(
        model,
        space,
        n_iter=n_iter,
        n_points=5,
        cv=sfold,
        n_jobs=-1,
        scoring=scorer,
        random_state=42
    )
    
    opt.fit(X, y, callback=[optimizer_callback, delta_stopper])
    
    return opt

# Optimize models
models = {
    'RandomForestRegressor': RandomForestRegressor(),
    'XGBRegressor': XGBRegressor(),
    'LGBMRegressor': LGBMRegressor(verbose=-1),
    'SVR': SVR(),
    'KNNRegressor': KNeighborsRegressor(),
    'Lasso': Lasso(max_iter=5000),
    'Ridge': Ridge(),
    'LogisticRegression': LogisticRegression(max_iter=500),
    'RandomForestClassifier': RandomForestClassifier(),
    'XGBClassifier': XGBClassifier(),
}

# models_opt = {}
# for name, model in models.items():
#     print(f"Optimizing {name}...")
#     opt = optimize_model(X, y, model, spaces[name])
#     models_opt[name] = opt
#     print(f"Best parameters: {opt.best_params_}")
#     print(f"Best score: {opt.best_score_:.4f}")
#     print()
In [16]:
# Optimizing RandomForestRegressor...
# Iteration 5: Best score = 0.6446
# Iteration 10: Best score = 0.6446
# Iteration 15: Best score = 0.6462
# Iteration 20: Best score = 0.6462
# Iteration 25: Best score = 0.6462
# Iteration 30: Best score = 0.6462
# Iteration 35: Best score = 0.6462
# Iteration 40: Best score = 0.6462
# Iteration 45: Best score = 0.6462
# Iteration 50: Best score = 0.6462
# Iteration 55: Best score = 0.6462
# Iteration 60: Best score = 0.6462
# Iteration 65: Best score = 0.6462
# Iteration 70: Best score = 0.6462
# Iteration 75: Best score = 0.6462
# Iteration 80: Best score = 0.6462
# Iteration 85: Best score = 0.6509
# Iteration 90: Best score = 0.6509
# Iteration 95: Best score = 0.6509
# Iteration 100: Best score = 0.6509
# Iteration 105: Best score = 0.6509
# Iteration 110: Best score = 0.6509
# Iteration 115: Best score = 0.6509
# Iteration 120: Best score = 0.6509
# Iteration 125: Best score = 0.6509
# Iteration 130: Best score = 0.6509
# Iteration 135: Best score = 0.6509
# Iteration 140: Best score = 0.6509
# Iteration 145: Best score = 0.6509
# Iteration 150: Best score = 0.6509
# Iteration 155: Best score = 0.6509
# Iteration 160: Best score = 0.6509
# Iteration 165: Best score = 0.6509
# Iteration 170: Best score = 0.6509
# Iteration 175: Best score = 0.6509
# Iteration 180: Best score = 0.6509
# Iteration 185: Best score = 0.6509
# Iteration 190: Best score = 0.6509
# Iteration 195: Best score = 0.6509
# Iteration 200: Best score = 0.6509
# Best parameters: OrderedDict({'max_depth': 25, 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 10})
# Best score: 0.6509

# Optimizing XGBRegressor...
# Iteration 5: Best score = 0.5863
# Iteration 10: Best score = 0.5863
# Iteration 15: Best score = 0.5863
# Iteration 20: Best score = 0.6188
# Iteration 25: Best score = 0.6188
# Iteration 30: Best score = 0.6188
# Iteration 35: Best score = 0.6188
# Iteration 40: Best score = 0.6207
# Iteration 45: Best score = 0.6207
# Iteration 50: Best score = 0.6207
# Iteration 55: Best score = 0.6207
# Iteration 60: Best score = 0.6273
# Iteration 65: Best score = 0.6273
# Iteration 70: Best score = 0.6328
# Iteration 75: Best score = 0.6328
# Iteration 80: Best score = 0.6330
# Iteration 85: Best score = 0.6378
# Iteration 90: Best score = 0.6378
# Iteration 95: Best score = 0.6378
# Iteration 100: Best score = 0.6378
# Iteration 105: Best score = 0.6378
# Iteration 110: Best score = 0.6378
# Iteration 115: Best score = 0.6378
# Iteration 120: Best score = 0.6378
# Iteration 125: Best score = 0.6378
# Iteration 130: Best score = 0.6378
# Iteration 135: Best score = 0.6378
# Iteration 140: Best score = 0.6378
# Iteration 145: Best score = 0.6378
# Iteration 150: Best score = 0.6378
# Iteration 155: Best score = 0.6378
# Iteration 160: Best score = 0.6378
# Iteration 165: Best score = 0.6378
# Iteration 170: Best score = 0.6378
# Iteration 175: Best score = 0.6378
# Iteration 180: Best score = 0.6378
# Iteration 185: Best score = 0.6378
# Iteration 190: Best score = 0.6378
# Iteration 195: Best score = 0.6378
# Iteration 200: Best score = 0.6378
# Best parameters: OrderedDict({'colsample_bytree': 0.7461282882550149, 'learning_rate': 0.9936330838999975, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.9395779652231953})
# Best score: 0.6378

# Optimizing LGBMRegressor...
# Iteration 5: Best score = 0.6416
# Iteration 10: Best score = 0.6497
# Iteration 15: Best score = 0.6497
# Iteration 20: Best score = 0.6497
# Iteration 25: Best score = 0.6497
# Iteration 30: Best score = 0.6497
# Iteration 35: Best score = 0.6497
# Iteration 40: Best score = 0.6521
# Iteration 45: Best score = 0.6521
# Iteration 50: Best score = 0.6521
# Iteration 55: Best score = 0.6521
# Iteration 60: Best score = 0.6521
# Iteration 65: Best score = 0.6521
# Iteration 70: Best score = 0.6521
# Iteration 75: Best score = 0.6521
# Iteration 80: Best score = 0.6521
# Iteration 85: Best score = 0.6521
# Iteration 90: Best score = 0.6521
# Iteration 95: Best score = 0.6521
# Iteration 100: Best score = 0.6521
# Iteration 105: Best score = 0.6521
# Iteration 110: Best score = 0.6521
# Iteration 115: Best score = 0.6521
# Iteration 120: Best score = 0.6521
# Iteration 125: Best score = 0.6521
# Iteration 130: Best score = 0.6521
# Iteration 135: Best score = 0.6521
# Iteration 140: Best score = 0.6521
# Iteration 145: Best score = 0.6521
# Iteration 150: Best score = 0.6521
# Iteration 155: Best score = 0.6521
# Iteration 160: Best score = 0.6521
# Iteration 165: Best score = 0.6521
# Iteration 170: Best score = 0.6521
# Iteration 175: Best score = 0.6521
# Iteration 180: Best score = 0.6521
# Iteration 185: Best score = 0.6521
# Iteration 190: Best score = 0.6521
# Iteration 195: Best score = 0.6521
# Iteration 200: Best score = 0.6521
# Best parameters: OrderedDict({'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208})
# Best score: 0.6521

# Optimizing SVR...
# Iteration 5: Best score = 0.5336
# Iteration 10: Best score = 0.5336
# Iteration 15: Best score = 0.5522
# Iteration 20: Best score = 0.5522
# Iteration 25: Best score = 0.5522
# Iteration 30: Best score = 0.5635
# Iteration 35: Best score = 0.5635
# Iteration 40: Best score = 0.5635
# Iteration 45: Best score = 0.5635
# Iteration 50: Best score = 0.5635
# Iteration 55: Best score = 0.5655
# Iteration 60: Best score = 0.5674
# Iteration 65: Best score = 0.5674
# Iteration 70: Best score = 0.5674
# Iteration 75: Best score = 0.5674
# Iteration 80: Best score = 0.5674
# Iteration 85: Best score = 0.5674
# Iteration 90: Best score = 0.5674
# Iteration 95: Best score = 0.5674
# Iteration 100: Best score = 0.5677
# Best parameters: OrderedDict({'C': 0.1, 'epsilon': 0.38002726414738297, 'kernel': 'rbf'})
# Best score: 0.5677

# Optimizing KNNRegressor...
# Iteration 5: Best score = 0.5224
# Iteration 10: Best score = 0.5261
# Iteration 15: Best score = 0.5282
# Iteration 20: Best score = 0.5282
# Iteration 25: Best score = 0.5282
# Iteration 30: Best score = 0.5282
# Iteration 35: Best score = 0.5282
# Iteration 40: Best score = 0.5282
# Iteration 45: Best score = 0.5282
# Iteration 50: Best score = 0.5282
# Iteration 55: Best score = 0.5282
# Iteration 60: Best score = 0.5282
# Iteration 65: Best score = 0.5282
# Iteration 70: Best score = 0.5282
# Iteration 75: Best score = 0.5282
# Iteration 80: Best score = 0.5282
# Iteration 85: Best score = 0.5282
# Iteration 90: Best score = 0.5282
# Iteration 95: Best score = 0.5282
# Iteration 100: Best score = 0.5282
# Iteration 105: Best score = 0.5282
# Iteration 110: Best score = 0.5282
# Iteration 115: Best score = 0.5282
# Iteration 120: Best score = 0.5282
# Iteration 125: Best score = 0.5282
# Iteration 130: Best score = 0.5282
# Iteration 135: Best score = 0.5282
# Iteration 140: Best score = 0.5282
# Iteration 145: Best score = 0.5282
# Iteration 150: Best score = 0.5282
# Iteration 155: Best score = 0.5282
# Iteration 160: Best score = 0.5282
# Iteration 165: Best score = 0.5282
# Iteration 170: Best score = 0.5282
# Iteration 175: Best score = 0.5282
# Iteration 180: Best score = 0.5282
# Iteration 185: Best score = 0.5282
# Iteration 190: Best score = 0.5282
# Iteration 195: Best score = 0.5282
# Iteration 200: Best score = 0.5282
# Best parameters: OrderedDict({'n_neighbors': 50, 'p': 1, 'weights': 'uniform'})
# Best score: 0.5282

# Optimizing Lasso...
# Iteration 5: Best score = 0.4257
# Iteration 10: Best score = 0.4299
# Iteration 15: Best score = 0.4471
# Iteration 20: Best score = 0.4473
# Iteration 25: Best score = 0.4476
# Iteration 30: Best score = 0.4476
# Iteration 35: Best score = 0.4476
# Iteration 40: Best score = 0.4476
# Iteration 45: Best score = 0.4476
# Iteration 50: Best score = 0.4481
# Iteration 55: Best score = 0.4481
# Iteration 60: Best score = 0.4481
# Iteration 65: Best score = 0.4481
# Iteration 70: Best score = 0.4481
# Best parameters: OrderedDict({'alpha': 0.00039834351977457706})
# Best score: 0.4481

# Optimizing Ridge...
# Iteration 5: Best score = 0.4483
# Iteration 10: Best score = 0.4502
# Iteration 15: Best score = 0.4502
# Iteration 20: Best score = 0.4502
# Iteration 25: Best score = 0.4502
# Iteration 30: Best score = 0.4502
# Iteration 35: Best score = 0.4517
# Iteration 40: Best score = 0.4517
# Iteration 45: Best score = 0.4517
# Iteration 50: Best score = 0.4518
# Iteration 55: Best score = 0.4518
# Iteration 60: Best score = 0.4518
# Iteration 65: Best score = 0.4518
# Iteration 70: Best score = 0.4518
# Iteration 75: Best score = 0.4518
# Iteration 80: Best score = 0.4518
# Iteration 85: Best score = 0.4518
# Best parameters: OrderedDict({'alpha': 4.822503882256502})
# Best score: 0.4518

# Optimizing LogisticRegression...
# Iteration 5: Best score = 0.4351
# Iteration 10: Best score = 0.4361
# Iteration 15: Best score = 0.4361
# Iteration 20: Best score = 0.4369
# Iteration 25: Best score = 0.4380
# Iteration 30: Best score = 0.4380
# Iteration 35: Best score = 0.4380
# Iteration 40: Best score = 0.4380
# Iteration 45: Best score = 0.4380
# Iteration 50: Best score = 0.4380
# Iteration 55: Best score = 0.4380
# Iteration 60: Best score = 0.4380
# Iteration 65: Best score = 0.4380
# Iteration 70: Best score = 0.4380
# Iteration 75: Best score = 0.4380
# Iteration 80: Best score = 0.4380
# Iteration 85: Best score = 0.4380
# Iteration 90: Best score = 0.4380
# Iteration 95: Best score = 0.4380
# Iteration 100: Best score = 0.4380
# Iteration 105: Best score = 0.4380
# Iteration 110: Best score = 0.4380
# Iteration 115: Best score = 0.4380
# Iteration 120: Best score = 0.4380
# Iteration 125: Best score = 0.4380
# Iteration 130: Best score = 0.4380
# Iteration 135: Best score = 0.4380
# Iteration 140: Best score = 0.4380
# Iteration 145: Best score = 0.4380
# Iteration 150: Best score = 0.4380
# Iteration 155: Best score = 0.4380
# Iteration 160: Best score = 0.4380
# Iteration 165: Best score = 0.4380
# Iteration 170: Best score = 0.4380
# Iteration 175: Best score = 0.4380
# Best parameters: OrderedDict({'C': 9.96900468467878, 'solver': 'lbfgs'})
# Best score: 0.4380

# Optimizing RandomForestClassifier...
# Iteration 5: Best score = 0.5795
# Iteration 10: Best score = 0.5795
# Iteration 15: Best score = 0.5795
# Iteration 20: Best score = 0.5795
# Iteration 25: Best score = 0.5809
# Iteration 30: Best score = 0.5809
# Iteration 35: Best score = 0.5809
# Iteration 40: Best score = 0.5809
# Iteration 45: Best score = 0.5809
# Iteration 50: Best score = 0.5809
# Iteration 55: Best score = 0.5809
# Iteration 60: Best score = 0.5809
# Iteration 65: Best score = 0.5852
# Iteration 70: Best score = 0.5852
# Iteration 75: Best score = 0.5852
# Iteration 80: Best score = 0.5852
# Iteration 85: Best score = 0.5852
# Iteration 90: Best score = 0.5852
# Iteration 95: Best score = 0.5852
# Iteration 100: Best score = 0.5852
# Iteration 105: Best score = 0.5852
# Iteration 110: Best score = 0.5852
# Iteration 115: Best score = 0.5852
# Iteration 120: Best score = 0.5852
# Iteration 125: Best score = 0.5852
# Iteration 130: Best score = 0.5852
# Iteration 135: Best score = 0.5852
# Iteration 140: Best score = 0.5852
# Iteration 145: Best score = 0.5852
# Iteration 150: Best score = 0.5852
# Iteration 155: Best score = 0.5852
# Iteration 160: Best score = 0.5852
# Iteration 165: Best score = 0.5852
# Iteration 170: Best score = 0.5852
# Iteration 175: Best score = 0.5852
# Iteration 180: Best score = 0.5852
# Iteration 185: Best score = 0.5852
# Iteration 190: Best score = 0.5852
# Iteration 195: Best score = 0.5852
# Iteration 200: Best score = 0.5852
# Best parameters: OrderedDict({'max_depth': 16, 'min_samples_leaf': 20, 'min_samples_split': 5, 'n_estimators': 10})
# Best score: 0.5852

# Optimizing XGBClassifier...
# Iteration 5: Best score = 0.5183
# Iteration 10: Best score = 0.5183
# Iteration 15: Best score = 0.5183
# Iteration 20: Best score = 0.5726
# Iteration 25: Best score = 0.5829
# Iteration 30: Best score = 0.5908
# Iteration 35: Best score = 0.5908
# Iteration 40: Best score = 0.5908
# Iteration 45: Best score = 0.5908
# Iteration 50: Best score = 0.5908
# Iteration 55: Best score = 0.5912
# Iteration 60: Best score = 0.5912
# Iteration 65: Best score = 0.5912
# Iteration 70: Best score = 0.5912
# Iteration 75: Best score = 0.5912
# Iteration 80: Best score = 0.5912
# Iteration 85: Best score = 0.5912
# Iteration 90: Best score = 0.5912
# Iteration 95: Best score = 0.5912
# Iteration 100: Best score = 0.5912
# Iteration 105: Best score = 0.5912
# Iteration 110: Best score = 0.5912
# Iteration 115: Best score = 0.5912
# Iteration 120: Best score = 0.5912
# Iteration 125: Best score = 0.5912
# Iteration 130: Best score = 0.5912
# Iteration 135: Best score = 0.5912
# Iteration 140: Best score = 0.5912
# Iteration 145: Best score = 0.5912
# Iteration 150: Best score = 0.5912
# Iteration 155: Best score = 0.5912
# Iteration 160: Best score = 0.5912
# Iteration 165: Best score = 0.5912
# Iteration 170: Best score = 0.5912
# Iteration 175: Best score = 0.5912
# Iteration 180: Best score = 0.5912
# Iteration 185: Best score = 0.5912
# Iteration 190: Best score = 0.5912
# Iteration 195: Best score = 0.5912
# Iteration 200: Best score = 0.5912
# Best parameters: OrderedDict({'colsample_bytree': 1.0, 'learning_rate': 0.6788945751630601, 'max_depth': 1, 'n_estimators': 81, 'subsample': 1.0})
# Best score: 0.5912
In [109]:
# models_opt = {}
# for name, model in models.items():
#     print(f"Optimizing {name}...")
#     opt = optimize_model(X, y, model, spaces[name])
#     models_opt[name] = opt
#     print(f"Best parameters: {opt.best_params_}")
#     print(f"Best score: {opt.best_score_:.4f}")
#     print()

models_opt = {}

models_opt['RandomForestRegressor'] = {
    'estimator': RandomForestRegressor(**{'max_depth': 25, 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 10}),
    'best_params_': {'max_depth': 25, 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 10},
    'best_score_': 0.6509
}

models_opt['XGBRegressor'] = {
    'estimator': XGBRegressor(**{'colsample_bytree': 0.7461282882550149, 'learning_rate': 0.9936330838999975, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.9395779652231953}),
    'best_params_': {'colsample_bytree': 0.7461282882550149, 'learning_rate': 0.9936330838999975, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.9395779652231953},
    'best_score_': 0.6378
}

models_opt['LGBMRegressor'] = {
    'estimator': LGBMRegressor(**{'verbose': -1, 'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208}),
    'best_params_': {'verbose': -1, 'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208},
    'best_score_': 0.6521
}

models_opt['SVR'] = {
    'estimator': SVR(**{'C': 0.1, 'epsilon': 0.38002726414738297, 'kernel': 'rbf'}),
    'best_params_': {'C': 0.1, 'epsilon': 0.38002726414738297, 'kernel': 'rbf'},
    'best_score_': 0.5677
}

models_opt['KNNRegressor'] = {
    'estimator': KNeighborsRegressor(**{'n_neighbors': 50, 'p': 1, 'weights': 'uniform'}),
    'best_params_': {'n_neighbors': 50, 'p': 1, 'weights': 'uniform'},
    'best_score_': 0.5282
}

models_opt['Lasso'] = {
    'estimator': Lasso(**{'alpha': 0.00039834351977457706, "max_iter": 5000}),
    'best_params_': {'alpha': 0.00039834351977457706, "max_iter": 5000},
    'best_score_': 0.4481
}

models_opt['Ridge'] = {
    'estimator': Ridge(**{'alpha': 4.822503882256502}),
    'best_params_': {'alpha': 4.822503882256502},
    'best_score_': 0.4518
}

models_opt['LogisticRegression'] = {
    'estimator': LogisticRegression(**{'C': 9.96900468467878, 'solver': 'lbfgs', "max_iter": 500}),
    'best_params_': {'C': 9.96900468467878, 'solver': 'lbfgs', "max_iter": 500},
    'best_score_': 0.4380
}

models_opt['RandomForestClassifier'] = {
    'estimator': RandomForestClassifier(**{'max_depth': 16, 'min_samples_leaf': 20, 'min_samples_split': 5, 'n_estimators': 10}),
    'best_params_': {'max_depth': 16, 'min_samples_leaf': 20, 'min_samples_split': 5, 'n_estimators': 10},
    'best_score_': 0.5852
}

models_opt['XGBClassifier'] = {
    'estimator': XGBClassifier(**{'colsample_bytree': 1.0, 'learning_rate': 0.6788945751630601, 'max_depth': 1, 'n_estimators': 81, 'subsample': 1.0}),
    'best_params_': {'colsample_bytree': 1.0, 'learning_rate': 0.6788945751630601, 'max_depth': 1, 'n_estimators': 81, 'subsample': 1.0},
    'best_score_': 0.5912
}
In [18]:
models = {}
for name, opt in models_opt.items():
    model_class = type(opt["estimator"])  # Get the model class (e.g., RandomForestRegressor)
    best_params = opt["best_params_"]  # Get the best parameters from the optimization

    # Add the model with the best parameters to the new dictionary
    models[f"{name} opt"] = model_class(**best_params)

# Add a baseline model manually (if needed)
models['Random Forest Baseline'] = RandomForestClassifier()
In [19]:
# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)
Best mean F1 test score: 0.6521 by model: LGBMRegressor opt
Min F1 test score: 0.5816, Max F1 test score: 0.7691
In [20]:
# Plot MSE results for regression models
plot_multi_model_results(results)
No description has been provided for this image

Bagging¶

In [110]:
# Sort the models by their best score in descending order
top_3_models = sorted(models_opt.items(), key=lambda x: x[1]["best_score_"], reverse=True)

# Filter only classifiers from the sorted models
top_3_classifiers = [(name, opt["estimator"]) for name, opt in top_3_models if isinstance(opt["estimator"], ClassifierMixin)][:3]

# Print the top 3 classifiers for verification
print("Top 3 classifiers used in VotingClassifier:")
for name, estimator in top_3_classifiers:
    print(f"{name}: {type(estimator).__name__}")

# Define the voting ensemble using the top 3 classifiers
bagging_vote_ensemble = VotingClassifier(estimators=top_3_classifiers)


# Sort the models by their best score in descending order
top_models = sorted(models_opt.items(), key=lambda x: x[1]["best_score_"], reverse=True)

# Filter only regressors from the sorted models
top_2_regressors = [(name, opt["estimator"]) for name, opt in top_models if isinstance(opt["estimator"], RegressorMixin)][:2]

# Print the top 2 regressors for verification
print("Top 2 regressors used in VotingRegressor:")
for name, estimator in top_2_regressors:
    print(f"{name}: {type(estimator).__name__}")

# Define the voting ensemble using the top 2 regressors
voting_regressor_ensemble = VotingRegressor(estimators=top_2_regressors)


bagging_ensemble = BaggingRegressor(
    estimator=RandomForestRegressor(),
    n_estimators=10,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=-1
)


models = {
    'bagging_ensemble': bagging_ensemble,
    'bagging_vote_ensemble': bagging_vote_ensemble,
    'voting_regressor_ensemble': voting_regressor_ensemble,
    'Random Forest Baseline': RandomForestClassifier(),
}
Top 3 classifiers used in VotingClassifier:
XGBClassifier: XGBClassifier
RandomForestClassifier: RandomForestClassifier
LogisticRegression: LogisticRegression
Top 2 regressors used in VotingRegressor:
LGBMRegressor: LGBMRegressor
RandomForestRegressor: RandomForestRegressor
In [111]:
# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)
Best mean F1 test score: 0.6358 by model: voting_regressor_ensemble
Min F1 test score: 0.5475, Max F1 test score: 0.7640
In [112]:
# Plot MSE results for regression models
plot_multi_model_results(results)
No description has been provided for this image

Stacking¶

In [28]:
# Separate regressors and classifiers from models_opt
regressor_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], RegressorMixin)]
classifier_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], ClassifierMixin)]

# Define the meta-models
ridge_regressor = Ridge()
random_forest_regressor = RandomForestRegressor()

ridge_classifier = RidgeClassifier()  # RidgeClassifier for classification tasks
random_forest_classifier = RandomForestClassifier()

# Stacking Regressor with Ridge as meta-model
stacking_regressor_ridge = StackingRegressor(
    estimators=regressor_estimators,
    final_estimator=ridge_regressor
)

# Stacking Regressor with RandomForest as meta-model
stacking_regressor_rf = StackingRegressor(
    estimators=regressor_estimators,
    final_estimator=random_forest_regressor
)

# Stacking Classifier with Ridge as meta-model
stacking_classifier_ridge = StackingClassifier(
    estimators=classifier_estimators,
    final_estimator=ridge_classifier
)

# Stacking Classifier with RandomForest as meta-model
stacking_classifier_rf = StackingClassifier(
    estimators=classifier_estimators,
    final_estimator=random_forest_classifier
)

# Stacking Classifier with Ridge as meta-model
stacking_classifier_ridge_proba = StackingClassifier(
    estimators=classifier_estimators,
    final_estimator=ridge_classifier,
    stack_method='predict_proba' 
)

# Stacking Classifier with RandomForest as meta-model
stacking_classifier_rf_proba = StackingClassifier(
    estimators=classifier_estimators,
    final_estimator=random_forest_classifier,
    stack_method='predict_proba' 
)
In [29]:
models = {
    'Stacking Regressor (Ridge Meta)': stacking_regressor_ridge,
    'Stacking Regressor (RandomForest Meta)': stacking_regressor_rf,
    'Stacking Classifier (Ridge Meta)': stacking_classifier_ridge,
    'Stacking Classifier (RandomForest Meta)': stacking_classifier_rf,
    'Stacking Classifier (Ridge Meta, Proba)': stacking_classifier_ridge_proba,
    'Stacking Classifier (RandomForest Meta, Proba)': stacking_classifier_rf_proba,
    'Random Forest Baseline': RandomForestClassifier(),
}

# Run cross-validation for all models
results = run_multi_model_cv(X, y, models)

# Plot results for all models
plot_multi_model_results(results)
Best mean F1 test score: 0.6504 by model: Stacking Regressor (Ridge Meta)
Min F1 test score: 0.5665, Max F1 test score: 0.7761
No description has been provided for this image

Custom Stacking¶

In [82]:
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import KFold

class CustomStackingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators, final_estimator=Ridge(), cv=20):
        """
        Custom stacking model that can combine both classifiers and regressors as base learners.
        
        Parameters:
        - estimators: list of (name, estimator) tuples.
        - final_estimator: the meta-model to train on the stacked outputs.
        - cv: number of cross-validation folds for generating out-of-fold predictions.
        """
        self.estimators = estimators
        self.final_estimator = final_estimator
        self.cv = cv
        self.fitted_estimators_ = []
        self.n_classes = 5

    def fit(self, X, y):
        self.fitted_estimators_ = []
        # Initialize list to hold stacked features
        stacked_features_list = []

        n_samples = X.shape[0]
    
        # Cross-validation setup
        kf = KFold(n_splits=self.cv)
        
        # For each estimator, generate out-of-fold predictions
        for i, (name, estimator) in enumerate(self.estimators):
            # Determine the shape of oof_predictions based on predict or predict_proba
            if hasattr(estimator, "predict_proba"):
                oof_predictions = np.zeros((n_samples, self.n_classes))
            else:
                oof_predictions = np.zeros((n_samples, 1))

            for train_idx, valid_idx in kf.split(X):
                # Use .iloc to access the rows based on integer-based indexing
                X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
                y_train = y.iloc[train_idx]


                # Fit the estimator on the training fold
                fitted_estimator = estimator.fit(X_train, y_train)
                
                # Predict on the validation fold
                if hasattr(fitted_estimator, "predict_proba"):
                    # Store multi-class probabilities in the oof_predictions matrix
                    predictions = fitted_estimator.predict_proba(X_valid)
                    oof_predictions[valid_idx] = predictions
                else:
                    predictions = fitted_estimator.predict(X_valid).reshape(-1, 1)
                    oof_predictions[valid_idx] = predictions

            # Store the out-of-fold predictions in the list of stacked features
            stacked_features_list.append(oof_predictions)
            
            # Fit the full estimator on the entire dataset (for future predictions)
            self.fitted_estimators_.append(estimator.fit(X, y))
    
        # Concatenate all stacked features (column-wise) after all estimators are fitted
        stacked_features = np.hstack(stacked_features_list)
    
        # Fit the final meta-model on the stacked features
        self.final_estimator.fit(stacked_features, y)
        
        return self

    def _get_stacked_features(self, X):
        # Generate predictions from the base estimators for the new data
        stacked_features_list = []
        for estimator in self.fitted_estimators_:
            if hasattr(estimator, "predict_proba"):
                # Concatenate all class probabilities for multi-class classification
                stacked_features_list.append(estimator.predict_proba(X))
            else:
                stacked_features_list.append(estimator.predict(X).reshape(-1, 1))
        
        # Concatenate stacked features column-wise
        stacked_features = np.hstack(stacked_features_list)
        return stacked_features

    def predict(self, X):
        # Get stacked features for the test set
        stacked_features = self._get_stacked_features(X)
        
        # Predict using the final estimator (meta-model)
        return self.final_estimator.predict(stacked_features)


# Example usage of CustomStackingModel
regressor_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], RegressorMixin)]
classifier_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], ClassifierMixin)]

# Combine both regressors and classifiers as base learners
combined_estimators = regressor_estimators + classifier_estimators

# Define the custom stacking model with both classifiers and regressors, using Ridge as the meta-model
custom_stacking_model_ridge = CustomStackingModel(
    estimators=combined_estimators,
    final_estimator=Ridge({'alpha': 5.787655426374446})
)

custom_stacking_model_rf = CustomStackingModel(
    estimators=combined_estimators,
    final_estimator=RandomForestRegressor()
)

custom_stacking_model_lgbm = CustomStackingModel(
    estimators=combined_estimators,
    final_estimator= LGBMRegressor(verbose=-1)
)

custom_stacking_model_rf_class = CustomStackingModel(
    estimators=combined_estimators,
    final_estimator= RandomForestClassifier()
)
In [44]:
models = {
    'custom_stacking_model_ridge': stacking_regressor_ridge,
    'custom_stacking_model_rf': custom_stacking_model_rf,
    'custom_stacking_model_rf_class': custom_stacking_model_rf_class,
    'custom_stacking_model_lgbm': custom_stacking_model_lgbm,
    'Random Forest Baseline': RandomForestClassifier(),
}

# Run cross-validation for all models
results = run_multi_model_cv(X, y, models)

# Plot results for all models
plot_multi_model_results(results)
Best mean F1 test score: 0.6510 by model: custom_stacking_model_ridge
Min F1 test score: 0.5617, Max F1 test score: 0.7781
No description has been provided for this image

Eval on unseen data¶

In [83]:
data_train = pd.read_csv('module6_course_train.csv')
data_test = pd.read_csv('module6_course_test.csv')
In [84]:
y_train = data_train.pop('Score')
X_train = data_train.copy()
In [85]:
y_test = data_test.pop('Score')
X_test = data_test.copy()
In [86]:
custom_stacking_model_rf.fit(X_train, y_train)
Out[86]:
CustomStackingModel(estimators=[('RandomForestRegressor',
                                 RandomForestRegressor(max_depth=25,
                                                       min_samples_leaf=20,
                                                       min_samples_split=20,
                                                       n_estimators=10)),
                                ('XGBRegressor',
                                 XGBRegressor(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=0.7461282882550149,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable...
                                               interaction_constraints=None,
                                               learning_rate=0.6788945751630601,
                                               max_bin=None,
                                               max_cat_threshold=None,
                                               max_cat_to_onehot=None,
                                               max_delta_step=None, max_depth=1,
                                               max_leaves=None,
                                               min_child_weight=None,
                                               missing=nan,
                                               monotone_constraints=None,
                                               multi_strategy=None,
                                               n_estimators=81, n_jobs=None,
                                               num_parallel_tree=None,
                                               objective='multi:softprob', ...))],
                    final_estimator=RandomForestRegressor())
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CustomStackingModel(estimators=[('RandomForestRegressor',
                                 RandomForestRegressor(max_depth=25,
                                                       min_samples_leaf=20,
                                                       min_samples_split=20,
                                                       n_estimators=10)),
                                ('XGBRegressor',
                                 XGBRegressor(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=0.7461282882550149,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable...
                                               interaction_constraints=None,
                                               learning_rate=0.6788945751630601,
                                               max_bin=None,
                                               max_cat_threshold=None,
                                               max_cat_to_onehot=None,
                                               max_delta_step=None, max_depth=1,
                                               max_leaves=None,
                                               min_child_weight=None,
                                               missing=nan,
                                               monotone_constraints=None,
                                               multi_strategy=None,
                                               n_estimators=81, n_jobs=None,
                                               num_parallel_tree=None,
                                               objective='multi:softprob', ...))],
                    final_estimator=RandomForestRegressor())
RandomForestRegressor()
RandomForestRegressor()
In [87]:
y_pred_test = custom_stacking_model_rf.predict(X_test)
y_pred_train = custom_stacking_model_rf.predict(X_train)
In [88]:
f1_train = custom_f1_score(y_train, y_pred_train)
f1_test = custom_f1_score(y_test, y_pred_test)

print(f"f1_train: {f1_train}")
print(f"f1_test: {f1_test}")
f1_train: 0.7262607298491914
f1_test: 0.7111708369916983
In [105]:
lgmb_best = LGBMRegressor(**{'verbose': -1, 'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208})
In [106]:
lgmb_best.fit(X_train, y_train)
Out[106]:
LGBMRegressor(learning_rate=0.05545444518979953, max_depth=8,
              min_child_samples=37, n_estimators=159, num_leaves=208,
              verbose=-1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LGBMRegressor(learning_rate=0.05545444518979953, max_depth=8,
              min_child_samples=37, n_estimators=159, num_leaves=208,
              verbose=-1)
In [107]:
y_pred_test = lgmb_best.predict(X_test)
y_pred_train = lgmb_best.predict(X_train)
In [108]:
f1_train = custom_f1_score(y_train, y_pred_train)
f1_test = custom_f1_score(y_test, y_pred_test)

print(f"f1_train: {f1_train}")
print(f"f1_test: {f1_test}")
f1_train: 0.7292455204969205
f1_test: 0.7086080988315853