import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, RidgeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor, StackingRegressor, StackingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import make_scorer, f1_score
from sklearn.base import RegressorMixin, ClassifierMixin, BaseEstimator
from sklearn.ensemble import VotingClassifier, VotingRegressor

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

import requests

# URLs of the files
data_train_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_train.csv'
data_test_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_test.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(data_train_url, 'module6_course_train.csv')
download_file(data_train_url, 'module6_course_test.csv')

Downloaded module6_course_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_train.csv
Downloaded module6_course_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_train.csv

data_train = pd.read_csv('module6_course_train.csv')
data_test = pd.read_csv('module6_course_test.csv')

data_train

data_train.isnull().sum()

Temperature       0
Humidity          0
Humex             0
CO2               0
Bright            0
weekday_0         0
weekday_1         0
weekday_2         0
weekday_3         0
weekday_4         0
weekday_5         0
weekday_6         0
hour_sine_wave    0
Score             0
dtype: int64

data_train['Score'].value_counts(normalize=True)

Score
2    0.420143
1    0.272429
3    0.232571
0    0.044857
4    0.030000
Name: proportion, dtype: float64

y = data_train.pop('Score')
X = data_train.copy()

# Function to plot the evaluation results
def plot_results(mse_train, mse_test, f1_train, f1_test):
    plt.figure(figsize=(12, 6))

    # MSE plot
    plt.subplot(1, 2, 1)
    plt.plot(mse_train, label="Train MSE", marker='o')
    plt.plot(mse_test, label="Test MSE", marker='o')
    plt.fill_between(range(len(mse_train)), np.min(mse_train), np.max(mse_train), color='blue', alpha=0.1)
    plt.fill_between(range(len(mse_test)), np.min(mse_test), np.max(mse_test), color='orange', alpha=0.1)
    plt.title("MSE over Folds")
    plt.xlabel("Fold")
    plt.ylabel("MSE")
    plt.legend()
    plt.grid(True)

    # f1_score plot
    plt.subplot(1, 2, 2)
    plt.plot(f1_train, label="Train f1_score", marker='o')
    plt.plot(f1_test, label="Test f1_score", marker='o')
    plt.fill_between(range(len(f1_train)), np.min(f1_train), np.max(f1_train), color='blue', alpha=0.1)
    plt.fill_between(range(len(f1_test)), np.min(f1_test), np.max(f1_test), color='orange', alpha=0.1)
    plt.title("f1_score over Folds")
    plt.xlabel("Fold")
    plt.ylabel("f1_score")
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()

def plot_multi_model_results(results):
    # Set up the plot
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 20))
    
    # Colors for train and test
    train_color = 'skyblue'
    test_color = 'lightgreen'
    
    # Plot MSE
    ax1.set_title('Mean Squared Error (MSE) Comparison', fontsize=16)
    ax1.set_ylabel('MSE', fontsize=12)
    ax1.set_xlabel('Models', fontsize=12)
    ax1.grid(True, linestyle='--', alpha=0.7)
    
    # Plot f1_score
    ax2.set_title('f1_score Comparison', fontsize=16)
    ax2.set_ylabel('f1_score', fontsize=12)
    ax2.set_xlabel('Models', fontsize=12)
    ax2.grid(True, linestyle='--', alpha=0.7)
    
    x = np.arange(len(results))
    width = 0.35
    
    for i, (model_name, scores) in enumerate(results.items()):
        # MSE
        mse_train = scores['mse_train']
        mse_test = scores['mse_test']
        
        ax1.bar(x[i] - width/2, np.mean(mse_train), width, label='Train' if i == 0 else "", 
                color=train_color, alpha=0.7)
        ax1.bar(x[i] + width/2, np.mean(mse_test), width, label='Test' if i == 0 else "", 
                color=test_color, alpha=0.7)
        
        ax1.errorbar(x[i] - width/2, np.mean(mse_train), 
                     yerr=[[np.mean(mse_train)-np.min(mse_train)], [np.max(mse_train)-np.mean(mse_train)]], 
                     fmt='none', ecolor='black', capsize=5)
        ax1.errorbar(x[i] + width/2, np.mean(mse_test), 
                     yerr=[[np.mean(mse_test)-np.min(mse_test)], [np.max(mse_test)-np.mean(mse_test)]], 
                     fmt='none', ecolor='black', capsize=5)
        
        # f1_score
        f1_train = scores['f1_train']
        f1_test = scores['f1_test']
        
        ax2.bar(x[i] - width/2, np.mean(f1_train), width, label='Train' if i == 0 else "", 
                color=train_color, alpha=0.7)
        ax2.bar(x[i] + width/2, np.mean(f1_test), width, label='Test' if i == 0 else "", 
                color=test_color, alpha=0.7)
        
        ax2.errorbar(x[i] - width/2, np.mean(f1_train), 
                     yerr=[[np.mean(f1_train)-np.min(f1_train)], [np.max(f1_train)-np.mean(f1_train)]], 
                     fmt='none', ecolor='black', capsize=5)
        ax2.errorbar(x[i] + width/2, np.mean(f1_test), 
                     yerr=[[np.mean(f1_test)-np.min(f1_test)], [np.max(f1_test)-np.mean(f1_test)]], 
                     fmt='none', ecolor='black', capsize=5)
    
    ax1.set_xticks(x)
    ax1.set_xticklabels(results.keys(), rotation=45, ha='right')
    ax2.set_xticks(x)
    ax2.set_xticklabels(results.keys(), rotation=45, ha='right')
    
    ax1.legend(loc='upper left')
    ax2.legend(loc='upper left')
    
    plt.tight_layout()
    plt.show()

def cast_and_clip_predictions(y_pred):
    # Round the predictions to the nearest integer
    y_pred_int = np.round(y_pred).astype(int)
    
    # Clip the predictions to be within the range of y_true
    y_pred_int = np.clip(y_pred_int, np.min(0), np.max(4))
    
    return y_pred_int

# Custom scorer that incorporates casting and clipping
def custom_f1_score(y_true, y_pred):
    y_pred_cast = cast_and_clip_predictions(y_pred)
    return f1_score(y_true, y_pred_cast, average="weighted")

# Function to handle train-test evaluation in a fold
def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on train set
    y_pred_train = model.predict(X_train)
    # Make predictions on train set
    y_pred_test = model.predict(X_test)
    
    # Compute MSE for train and test
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    
    # Compute f1_score
    
    f1_train = custom_f1_score(y_train, y_pred_train)
    f1_test = custom_f1_score(y_test, y_pred_test)
    
    return mse_train, mse_test, f1_train, f1_test


def run_multi_model_cv(X, y, models, n_splits=5):
    sfold = StratifiedKFold(n_splits=n_splits)
    results = {name: {'mse_train': [], 'mse_test': [], 'f1_train': [], 'f1_test': []} 
               for name in models.keys()}
    
    for train_index, test_index in sfold.split(X, y):
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()
        
        for name, model in models.items():
            mse_train, mse_test, f1_train, f1_test = train_and_evaluate(
                X_train, X_test, y_train, y_test, model
            )
            results[name]['mse_train'].append(mse_train)
            results[name]['mse_test'].append(mse_test)
            results[name]['f1_train'].append(f1_train)
            results[name]['f1_test'].append(f1_test)
    
    # Find the model with the best mean F1 test score
    best_mean_f1_score = -1
    best_model = None
    best_min_f1 = None
    best_max_f1 = None
    
    for name, result in results.items():
        f1_test_scores = result['f1_test']
        mean_f1_test = sum(f1_test_scores) / len(f1_test_scores)  # Calculate mean F1 score
        min_f1_test = min(f1_test_scores)  # Minimum F1 score
        max_f1_test = max(f1_test_scores)  # Maximum F1 score
        
        if mean_f1_test > best_mean_f1_score:
            best_mean_f1_score = mean_f1_test
            best_min_f1 = min_f1_test
            best_max_f1 = max_f1_test
            best_model = name
    
    # Print the best mean F1 test score, min, max, and the associated model
    print(f"Best mean F1 test score: {best_mean_f1_score:.4f} by model: {best_model}")
    print(f"Min F1 test score: {best_min_f1:.4f}, Max F1 test score: {best_max_f1:.4f}")
    
    return results

# Step 1: Initialize model

model = RandomForestClassifier()
    
# Step 2: Run cross-validation
results = run_multi_model_cv(X, y, {"RandomForestClassifier": model})
    
# Step 3: Plot the results
plot_results(results["RandomForestClassifier"]["mse_train"],
             results["RandomForestClassifier"]["mse_test"],
             results["RandomForestClassifier"]["f1_train"],
             results["RandomForestClassifier"]["f1_test"])

Best mean F1 test score: 0.3970 by model: RandomForestClassifier
Min F1 test score: 0.3733, Max F1 test score: 0.4264

models = {
    'Ridge': Ridge(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'SVR': SVR(),
    'Lasso': Lasso(max_iter=5000),
    'KNN Regressor': KNeighborsRegressor(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'SVC': SVC(),
    'KNN Classifier': KNeighborsClassifier(),
    'XGBRegressor': XGBRegressor(),
    'LGBMRegressor': LGBMRegressor(verbose=-1)
}

# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)

Best mean F1 test score: 0.6188 by model: LGBMRegressor
Min F1 test score: 0.5145, Max F1 test score: 0.7604

# Plot MSE results for regression models
plot_multi_model_results(results)

from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from skopt.callbacks import DeltaYStopper

# Define the search spaces for each model
spaces = {
    'RandomForestRegressor': {
        'n_estimators': Integer(10, 500),
        'max_depth': Integer(1, 50),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 20)
    },
    'XGBRegressor': {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(1, 50),
        'learning_rate': Real(0.01, 2.0, 'log-uniform'),
        'subsample': Real(0.5, 1.0, 'uniform'),
        'colsample_bytree': Real(0.5, 1.0, 'uniform')
    },
    'LGBMRegressor': {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(1, 50),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'num_leaves': Integer(20, 300),
        'min_child_samples': Integer(1, 100)
    },
    'SVR': {
        'C': Real(0.1, 10.0, 'log-uniform'),
        'epsilon': Real(0.001, 1.0, 'log-uniform'),
        'kernel': Categorical(['linear', 'rbf', 'poly'])
    },
    'KNNRegressor': {
        'n_neighbors': Integer(1, 50),
        'weights': Categorical(['uniform', 'distance']),
        'p': Integer(1, 2)
    },
    'Lasso': {
        'alpha': Real(0.0001, 10.0, 'log-uniform')
    },
    'Ridge': {
        'alpha': Real(0.01, 10.0, 'log-uniform'),
    },
    'LogisticRegression': {
        'C': Real(0.01, 10.0, 'log-uniform'),
        'solver': Categorical(['lbfgs', 'liblinear'])
    },
    'RandomForestClassifier': {
        'n_estimators': Integer(10, 500),
        'max_depth': Integer(1, 50),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 20)
    },
    'XGBClassifier': {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(1, 50),
        'learning_rate': Real(0.01, 2.0, 'log-uniform'),
        'subsample': Real(0.5, 1.0, 'uniform'),
        'colsample_bytree': Real(0.5, 1.0, 'uniform')
    }
}

def optimizer_callback(res):
    if len(res.func_vals) % 5 == 0:  # Print every 5 iterations
        print(f"Iteration {len(res.func_vals)}: Best score = {-res.fun:.4f}")

delta_stopper = DeltaYStopper(delta=0.001, n_best=10)

# Function to optimize models
def optimize_model(X, y, model, space, n_iter=200):
    sfold = StratifiedKFold(n_splits=5)
    
    scorer = make_scorer(custom_f1_score)
    
    opt = BayesSearchCV(
        model,
        space,
        n_iter=n_iter,
        n_points=5,
        cv=sfold,
        n_jobs=-1,
        scoring=scorer,
        random_state=42
    )
    
    opt.fit(X, y, callback=[optimizer_callback, delta_stopper])
    
    return opt

# Optimize models
models = {
    'RandomForestRegressor': RandomForestRegressor(),
    'XGBRegressor': XGBRegressor(),
    'LGBMRegressor': LGBMRegressor(verbose=-1),
    'SVR': SVR(),
    'KNNRegressor': KNeighborsRegressor(),
    'Lasso': Lasso(max_iter=5000),
    'Ridge': Ridge(),
    'LogisticRegression': LogisticRegression(max_iter=500),
    'RandomForestClassifier': RandomForestClassifier(),
    'XGBClassifier': XGBClassifier(),
}

# models_opt = {}
# for name, model in models.items():
#     print(f"Optimizing {name}...")
#     opt = optimize_model(X, y, model, spaces[name])
#     models_opt[name] = opt
#     print(f"Best parameters: {opt.best_params_}")
#     print(f"Best score: {opt.best_score_:.4f}")
#     print()

# Optimizing RandomForestRegressor...
# Iteration 5: Best score = 0.6446
# Iteration 10: Best score = 0.6446
# Iteration 15: Best score = 0.6462
# Iteration 20: Best score = 0.6462
# Iteration 25: Best score = 0.6462
# Iteration 30: Best score = 0.6462
# Iteration 35: Best score = 0.6462
# Iteration 40: Best score = 0.6462
# Iteration 45: Best score = 0.6462
# Iteration 50: Best score = 0.6462
# Iteration 55: Best score = 0.6462
# Iteration 60: Best score = 0.6462
# Iteration 65: Best score = 0.6462
# Iteration 70: Best score = 0.6462
# Iteration 75: Best score = 0.6462
# Iteration 80: Best score = 0.6462
# Iteration 85: Best score = 0.6509
# Iteration 90: Best score = 0.6509
# Iteration 95: Best score = 0.6509
# Iteration 100: Best score = 0.6509
# Iteration 105: Best score = 0.6509
# Iteration 110: Best score = 0.6509
# Iteration 115: Best score = 0.6509
# Iteration 120: Best score = 0.6509
# Iteration 125: Best score = 0.6509
# Iteration 130: Best score = 0.6509
# Iteration 135: Best score = 0.6509
# Iteration 140: Best score = 0.6509
# Iteration 145: Best score = 0.6509
# Iteration 150: Best score = 0.6509
# Iteration 155: Best score = 0.6509
# Iteration 160: Best score = 0.6509
# Iteration 165: Best score = 0.6509
# Iteration 170: Best score = 0.6509
# Iteration 175: Best score = 0.6509
# Iteration 180: Best score = 0.6509
# Iteration 185: Best score = 0.6509
# Iteration 190: Best score = 0.6509
# Iteration 195: Best score = 0.6509
# Iteration 200: Best score = 0.6509
# Best parameters: OrderedDict({'max_depth': 25, 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 10})
# Best score: 0.6509

# Optimizing XGBRegressor...
# Iteration 5: Best score = 0.5863
# Iteration 10: Best score = 0.5863
# Iteration 15: Best score = 0.5863
# Iteration 20: Best score = 0.6188
# Iteration 25: Best score = 0.6188
# Iteration 30: Best score = 0.6188
# Iteration 35: Best score = 0.6188
# Iteration 40: Best score = 0.6207
# Iteration 45: Best score = 0.6207
# Iteration 50: Best score = 0.6207
# Iteration 55: Best score = 0.6207
# Iteration 60: Best score = 0.6273
# Iteration 65: Best score = 0.6273
# Iteration 70: Best score = 0.6328
# Iteration 75: Best score = 0.6328
# Iteration 80: Best score = 0.6330
# Iteration 85: Best score = 0.6378
# Iteration 90: Best score = 0.6378
# Iteration 95: Best score = 0.6378
# Iteration 100: Best score = 0.6378
# Iteration 105: Best score = 0.6378
# Iteration 110: Best score = 0.6378
# Iteration 115: Best score = 0.6378
# Iteration 120: Best score = 0.6378
# Iteration 125: Best score = 0.6378
# Iteration 130: Best score = 0.6378
# Iteration 135: Best score = 0.6378
# Iteration 140: Best score = 0.6378
# Iteration 145: Best score = 0.6378
# Iteration 150: Best score = 0.6378
# Iteration 155: Best score = 0.6378
# Iteration 160: Best score = 0.6378
# Iteration 165: Best score = 0.6378
# Iteration 170: Best score = 0.6378
# Iteration 175: Best score = 0.6378
# Iteration 180: Best score = 0.6378
# Iteration 185: Best score = 0.6378
# Iteration 190: Best score = 0.6378
# Iteration 195: Best score = 0.6378
# Iteration 200: Best score = 0.6378
# Best parameters: OrderedDict({'colsample_bytree': 0.7461282882550149, 'learning_rate': 0.9936330838999975, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.9395779652231953})
# Best score: 0.6378

# Optimizing LGBMRegressor...
# Iteration 5: Best score = 0.6416
# Iteration 10: Best score = 0.6497
# Iteration 15: Best score = 0.6497
# Iteration 20: Best score = 0.6497
# Iteration 25: Best score = 0.6497
# Iteration 30: Best score = 0.6497
# Iteration 35: Best score = 0.6497
# Iteration 40: Best score = 0.6521
# Iteration 45: Best score = 0.6521
# Iteration 50: Best score = 0.6521
# Iteration 55: Best score = 0.6521
# Iteration 60: Best score = 0.6521
# Iteration 65: Best score = 0.6521
# Iteration 70: Best score = 0.6521
# Iteration 75: Best score = 0.6521
# Iteration 80: Best score = 0.6521
# Iteration 85: Best score = 0.6521
# Iteration 90: Best score = 0.6521
# Iteration 95: Best score = 0.6521
# Iteration 100: Best score = 0.6521
# Iteration 105: Best score = 0.6521
# Iteration 110: Best score = 0.6521
# Iteration 115: Best score = 0.6521
# Iteration 120: Best score = 0.6521
# Iteration 125: Best score = 0.6521
# Iteration 130: Best score = 0.6521
# Iteration 135: Best score = 0.6521
# Iteration 140: Best score = 0.6521
# Iteration 145: Best score = 0.6521
# Iteration 150: Best score = 0.6521
# Iteration 155: Best score = 0.6521
# Iteration 160: Best score = 0.6521
# Iteration 165: Best score = 0.6521
# Iteration 170: Best score = 0.6521
# Iteration 175: Best score = 0.6521
# Iteration 180: Best score = 0.6521
# Iteration 185: Best score = 0.6521
# Iteration 190: Best score = 0.6521
# Iteration 195: Best score = 0.6521
# Iteration 200: Best score = 0.6521
# Best parameters: OrderedDict({'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208})
# Best score: 0.6521

# Optimizing SVR...
# Iteration 5: Best score = 0.5336
# Iteration 10: Best score = 0.5336
# Iteration 15: Best score = 0.5522
# Iteration 20: Best score = 0.5522
# Iteration 25: Best score = 0.5522
# Iteration 30: Best score = 0.5635
# Iteration 35: Best score = 0.5635
# Iteration 40: Best score = 0.5635
# Iteration 45: Best score = 0.5635
# Iteration 50: Best score = 0.5635
# Iteration 55: Best score = 0.5655
# Iteration 60: Best score = 0.5674
# Iteration 65: Best score = 0.5674
# Iteration 70: Best score = 0.5674
# Iteration 75: Best score = 0.5674
# Iteration 80: Best score = 0.5674
# Iteration 85: Best score = 0.5674
# Iteration 90: Best score = 0.5674
# Iteration 95: Best score = 0.5674
# Iteration 100: Best score = 0.5677
# Best parameters: OrderedDict({'C': 0.1, 'epsilon': 0.38002726414738297, 'kernel': 'rbf'})
# Best score: 0.5677

# Optimizing KNNRegressor...
# Iteration 5: Best score = 0.5224
# Iteration 10: Best score = 0.5261
# Iteration 15: Best score = 0.5282
# Iteration 20: Best score = 0.5282
# Iteration 25: Best score = 0.5282
# Iteration 30: Best score = 0.5282
# Iteration 35: Best score = 0.5282
# Iteration 40: Best score = 0.5282
# Iteration 45: Best score = 0.5282
# Iteration 50: Best score = 0.5282
# Iteration 55: Best score = 0.5282
# Iteration 60: Best score = 0.5282
# Iteration 65: Best score = 0.5282
# Iteration 70: Best score = 0.5282
# Iteration 75: Best score = 0.5282
# Iteration 80: Best score = 0.5282
# Iteration 85: Best score = 0.5282
# Iteration 90: Best score = 0.5282
# Iteration 95: Best score = 0.5282
# Iteration 100: Best score = 0.5282
# Iteration 105: Best score = 0.5282
# Iteration 110: Best score = 0.5282
# Iteration 115: Best score = 0.5282
# Iteration 120: Best score = 0.5282
# Iteration 125: Best score = 0.5282
# Iteration 130: Best score = 0.5282
# Iteration 135: Best score = 0.5282
# Iteration 140: Best score = 0.5282
# Iteration 145: Best score = 0.5282
# Iteration 150: Best score = 0.5282
# Iteration 155: Best score = 0.5282
# Iteration 160: Best score = 0.5282
# Iteration 165: Best score = 0.5282
# Iteration 170: Best score = 0.5282
# Iteration 175: Best score = 0.5282
# Iteration 180: Best score = 0.5282
# Iteration 185: Best score = 0.5282
# Iteration 190: Best score = 0.5282
# Iteration 195: Best score = 0.5282
# Iteration 200: Best score = 0.5282
# Best parameters: OrderedDict({'n_neighbors': 50, 'p': 1, 'weights': 'uniform'})
# Best score: 0.5282

# Optimizing Lasso...
# Iteration 5: Best score = 0.4257
# Iteration 10: Best score = 0.4299
# Iteration 15: Best score = 0.4471
# Iteration 20: Best score = 0.4473
# Iteration 25: Best score = 0.4476
# Iteration 30: Best score = 0.4476
# Iteration 35: Best score = 0.4476
# Iteration 40: Best score = 0.4476
# Iteration 45: Best score = 0.4476
# Iteration 50: Best score = 0.4481
# Iteration 55: Best score = 0.4481
# Iteration 60: Best score = 0.4481
# Iteration 65: Best score = 0.4481
# Iteration 70: Best score = 0.4481
# Best parameters: OrderedDict({'alpha': 0.00039834351977457706})
# Best score: 0.4481

# Optimizing Ridge...
# Iteration 5: Best score = 0.4483
# Iteration 10: Best score = 0.4502
# Iteration 15: Best score = 0.4502
# Iteration 20: Best score = 0.4502
# Iteration 25: Best score = 0.4502
# Iteration 30: Best score = 0.4502
# Iteration 35: Best score = 0.4517
# Iteration 40: Best score = 0.4517
# Iteration 45: Best score = 0.4517
# Iteration 50: Best score = 0.4518
# Iteration 55: Best score = 0.4518
# Iteration 60: Best score = 0.4518
# Iteration 65: Best score = 0.4518
# Iteration 70: Best score = 0.4518
# Iteration 75: Best score = 0.4518
# Iteration 80: Best score = 0.4518
# Iteration 85: Best score = 0.4518
# Best parameters: OrderedDict({'alpha': 4.822503882256502})
# Best score: 0.4518

# Optimizing LogisticRegression...
# Iteration 5: Best score = 0.4351
# Iteration 10: Best score = 0.4361
# Iteration 15: Best score = 0.4361
# Iteration 20: Best score = 0.4369
# Iteration 25: Best score = 0.4380
# Iteration 30: Best score = 0.4380
# Iteration 35: Best score = 0.4380
# Iteration 40: Best score = 0.4380
# Iteration 45: Best score = 0.4380
# Iteration 50: Best score = 0.4380
# Iteration 55: Best score = 0.4380
# Iteration 60: Best score = 0.4380
# Iteration 65: Best score = 0.4380
# Iteration 70: Best score = 0.4380
# Iteration 75: Best score = 0.4380
# Iteration 80: Best score = 0.4380
# Iteration 85: Best score = 0.4380
# Iteration 90: Best score = 0.4380
# Iteration 95: Best score = 0.4380
# Iteration 100: Best score = 0.4380
# Iteration 105: Best score = 0.4380
# Iteration 110: Best score = 0.4380
# Iteration 115: Best score = 0.4380
# Iteration 120: Best score = 0.4380
# Iteration 125: Best score = 0.4380
# Iteration 130: Best score = 0.4380
# Iteration 135: Best score = 0.4380
# Iteration 140: Best score = 0.4380
# Iteration 145: Best score = 0.4380
# Iteration 150: Best score = 0.4380
# Iteration 155: Best score = 0.4380
# Iteration 160: Best score = 0.4380
# Iteration 165: Best score = 0.4380
# Iteration 170: Best score = 0.4380
# Iteration 175: Best score = 0.4380
# Best parameters: OrderedDict({'C': 9.96900468467878, 'solver': 'lbfgs'})
# Best score: 0.4380

# Optimizing RandomForestClassifier...
# Iteration 5: Best score = 0.5795
# Iteration 10: Best score = 0.5795
# Iteration 15: Best score = 0.5795
# Iteration 20: Best score = 0.5795
# Iteration 25: Best score = 0.5809
# Iteration 30: Best score = 0.5809
# Iteration 35: Best score = 0.5809
# Iteration 40: Best score = 0.5809
# Iteration 45: Best score = 0.5809
# Iteration 50: Best score = 0.5809
# Iteration 55: Best score = 0.5809
# Iteration 60: Best score = 0.5809
# Iteration 65: Best score = 0.5852
# Iteration 70: Best score = 0.5852
# Iteration 75: Best score = 0.5852
# Iteration 80: Best score = 0.5852
# Iteration 85: Best score = 0.5852
# Iteration 90: Best score = 0.5852
# Iteration 95: Best score = 0.5852
# Iteration 100: Best score = 0.5852
# Iteration 105: Best score = 0.5852
# Iteration 110: Best score = 0.5852
# Iteration 115: Best score = 0.5852
# Iteration 120: Best score = 0.5852
# Iteration 125: Best score = 0.5852
# Iteration 130: Best score = 0.5852
# Iteration 135: Best score = 0.5852
# Iteration 140: Best score = 0.5852
# Iteration 145: Best score = 0.5852
# Iteration 150: Best score = 0.5852
# Iteration 155: Best score = 0.5852
# Iteration 160: Best score = 0.5852
# Iteration 165: Best score = 0.5852
# Iteration 170: Best score = 0.5852
# Iteration 175: Best score = 0.5852
# Iteration 180: Best score = 0.5852
# Iteration 185: Best score = 0.5852
# Iteration 190: Best score = 0.5852
# Iteration 195: Best score = 0.5852
# Iteration 200: Best score = 0.5852
# Best parameters: OrderedDict({'max_depth': 16, 'min_samples_leaf': 20, 'min_samples_split': 5, 'n_estimators': 10})
# Best score: 0.5852

# Optimizing XGBClassifier...
# Iteration 5: Best score = 0.5183
# Iteration 10: Best score = 0.5183
# Iteration 15: Best score = 0.5183
# Iteration 20: Best score = 0.5726
# Iteration 25: Best score = 0.5829
# Iteration 30: Best score = 0.5908
# Iteration 35: Best score = 0.5908
# Iteration 40: Best score = 0.5908
# Iteration 45: Best score = 0.5908
# Iteration 50: Best score = 0.5908
# Iteration 55: Best score = 0.5912
# Iteration 60: Best score = 0.5912
# Iteration 65: Best score = 0.5912
# Iteration 70: Best score = 0.5912
# Iteration 75: Best score = 0.5912
# Iteration 80: Best score = 0.5912
# Iteration 85: Best score = 0.5912
# Iteration 90: Best score = 0.5912
# Iteration 95: Best score = 0.5912
# Iteration 100: Best score = 0.5912
# Iteration 105: Best score = 0.5912
# Iteration 110: Best score = 0.5912
# Iteration 115: Best score = 0.5912
# Iteration 120: Best score = 0.5912
# Iteration 125: Best score = 0.5912
# Iteration 130: Best score = 0.5912
# Iteration 135: Best score = 0.5912
# Iteration 140: Best score = 0.5912
# Iteration 145: Best score = 0.5912
# Iteration 150: Best score = 0.5912
# Iteration 155: Best score = 0.5912
# Iteration 160: Best score = 0.5912
# Iteration 165: Best score = 0.5912
# Iteration 170: Best score = 0.5912
# Iteration 175: Best score = 0.5912
# Iteration 180: Best score = 0.5912
# Iteration 185: Best score = 0.5912
# Iteration 190: Best score = 0.5912
# Iteration 195: Best score = 0.5912
# Iteration 200: Best score = 0.5912
# Best parameters: OrderedDict({'colsample_bytree': 1.0, 'learning_rate': 0.6788945751630601, 'max_depth': 1, 'n_estimators': 81, 'subsample': 1.0})
# Best score: 0.5912

# models_opt = {}
# for name, model in models.items():
#     print(f"Optimizing {name}...")
#     opt = optimize_model(X, y, model, spaces[name])
#     models_opt[name] = opt
#     print(f"Best parameters: {opt.best_params_}")
#     print(f"Best score: {opt.best_score_:.4f}")
#     print()

models_opt = {}

models_opt['RandomForestRegressor'] = {
    'estimator': RandomForestRegressor(**{'max_depth': 25, 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 10}),
    'best_params_': {'max_depth': 25, 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 10},
    'best_score_': 0.6509
}

models_opt['XGBRegressor'] = {
    'estimator': XGBRegressor(**{'colsample_bytree': 0.7461282882550149, 'learning_rate': 0.9936330838999975, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.9395779652231953}),
    'best_params_': {'colsample_bytree': 0.7461282882550149, 'learning_rate': 0.9936330838999975, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.9395779652231953},
    'best_score_': 0.6378
}

models_opt['LGBMRegressor'] = {
    'estimator': LGBMRegressor(**{'verbose': -1, 'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208}),
    'best_params_': {'verbose': -1, 'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208},
    'best_score_': 0.6521
}

models_opt['SVR'] = {
    'estimator': SVR(**{'C': 0.1, 'epsilon': 0.38002726414738297, 'kernel': 'rbf'}),
    'best_params_': {'C': 0.1, 'epsilon': 0.38002726414738297, 'kernel': 'rbf'},
    'best_score_': 0.5677
}

models_opt['KNNRegressor'] = {
    'estimator': KNeighborsRegressor(**{'n_neighbors': 50, 'p': 1, 'weights': 'uniform'}),
    'best_params_': {'n_neighbors': 50, 'p': 1, 'weights': 'uniform'},
    'best_score_': 0.5282
}

models_opt['Lasso'] = {
    'estimator': Lasso(**{'alpha': 0.00039834351977457706, "max_iter": 5000}),
    'best_params_': {'alpha': 0.00039834351977457706, "max_iter": 5000},
    'best_score_': 0.4481
}

models_opt['Ridge'] = {
    'estimator': Ridge(**{'alpha': 4.822503882256502}),
    'best_params_': {'alpha': 4.822503882256502},
    'best_score_': 0.4518
}

models_opt['LogisticRegression'] = {
    'estimator': LogisticRegression(**{'C': 9.96900468467878, 'solver': 'lbfgs', "max_iter": 500}),
    'best_params_': {'C': 9.96900468467878, 'solver': 'lbfgs', "max_iter": 500},
    'best_score_': 0.4380
}

models_opt['RandomForestClassifier'] = {
    'estimator': RandomForestClassifier(**{'max_depth': 16, 'min_samples_leaf': 20, 'min_samples_split': 5, 'n_estimators': 10}),
    'best_params_': {'max_depth': 16, 'min_samples_leaf': 20, 'min_samples_split': 5, 'n_estimators': 10},
    'best_score_': 0.5852
}

models_opt['XGBClassifier'] = {
    'estimator': XGBClassifier(**{'colsample_bytree': 1.0, 'learning_rate': 0.6788945751630601, 'max_depth': 1, 'n_estimators': 81, 'subsample': 1.0}),
    'best_params_': {'colsample_bytree': 1.0, 'learning_rate': 0.6788945751630601, 'max_depth': 1, 'n_estimators': 81, 'subsample': 1.0},
    'best_score_': 0.5912
}

models = {}
for name, opt in models_opt.items():
    model_class = type(opt["estimator"])  # Get the model class (e.g., RandomForestRegressor)
    best_params = opt["best_params_"]  # Get the best parameters from the optimization

    # Add the model with the best parameters to the new dictionary
    models[f"{name} opt"] = model_class(**best_params)

# Add a baseline model manually (if needed)
models['Random Forest Baseline'] = RandomForestClassifier()

# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)

Best mean F1 test score: 0.6521 by model: LGBMRegressor opt
Min F1 test score: 0.5816, Max F1 test score: 0.7691

# Plot MSE results for regression models
plot_multi_model_results(results)

# Sort the models by their best score in descending order
top_3_models = sorted(models_opt.items(), key=lambda x: x[1]["best_score_"], reverse=True)

# Filter only classifiers from the sorted models
top_3_classifiers = [(name, opt["estimator"]) for name, opt in top_3_models if isinstance(opt["estimator"], ClassifierMixin)][:3]

# Print the top 3 classifiers for verification
print("Top 3 classifiers used in VotingClassifier:")
for name, estimator in top_3_classifiers:
    print(f"{name}: {type(estimator).__name__}")

# Define the voting ensemble using the top 3 classifiers
bagging_vote_ensemble = VotingClassifier(estimators=top_3_classifiers)


# Sort the models by their best score in descending order
top_models = sorted(models_opt.items(), key=lambda x: x[1]["best_score_"], reverse=True)

# Filter only regressors from the sorted models
top_2_regressors = [(name, opt["estimator"]) for name, opt in top_models if isinstance(opt["estimator"], RegressorMixin)][:2]

# Print the top 2 regressors for verification
print("Top 2 regressors used in VotingRegressor:")
for name, estimator in top_2_regressors:
    print(f"{name}: {type(estimator).__name__}")

# Define the voting ensemble using the top 2 regressors
voting_regressor_ensemble = VotingRegressor(estimators=top_2_regressors)


bagging_ensemble = BaggingRegressor(
    estimator=RandomForestRegressor(),
    n_estimators=10,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=-1
)


models = {
    'bagging_ensemble': bagging_ensemble,
    'bagging_vote_ensemble': bagging_vote_ensemble,
    'voting_regressor_ensemble': voting_regressor_ensemble,
    'Random Forest Baseline': RandomForestClassifier(),
}

Top 3 classifiers used in VotingClassifier:
XGBClassifier: XGBClassifier
RandomForestClassifier: RandomForestClassifier
LogisticRegression: LogisticRegression
Top 2 regressors used in VotingRegressor:
LGBMRegressor: LGBMRegressor
RandomForestRegressor: RandomForestRegressor

# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)

Best mean F1 test score: 0.6358 by model: voting_regressor_ensemble
Min F1 test score: 0.5475, Max F1 test score: 0.7640

# Plot MSE results for regression models
plot_multi_model_results(results)

# Separate regressors and classifiers from models_opt
regressor_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], RegressorMixin)]
classifier_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], ClassifierMixin)]

# Define the meta-models
ridge_regressor = Ridge()
random_forest_regressor = RandomForestRegressor()

ridge_classifier = RidgeClassifier()  # RidgeClassifier for classification tasks
random_forest_classifier = RandomForestClassifier()

# Stacking Regressor with Ridge as meta-model
stacking_regressor_ridge = StackingRegressor(
    estimators=regressor_estimators,
    final_estimator=ridge_regressor
)

# Stacking Regressor with RandomForest as meta-model
stacking_regressor_rf = StackingRegressor(
    estimators=regressor_estimators,
    final_estimator=random_forest_regressor
)

# Stacking Classifier with Ridge as meta-model
stacking_classifier_ridge = StackingClassifier(
    estimators=classifier_estimators,
    final_estimator=ridge_classifier
)

# Stacking Classifier with RandomForest as meta-model
stacking_classifier_rf = StackingClassifier(
    estimators=classifier_estimators,
    final_estimator=random_forest_classifier
)

# Stacking Classifier with Ridge as meta-model
stacking_classifier_ridge_proba = StackingClassifier(
    estimators=classifier_estimators,
    final_estimator=ridge_classifier,
    stack_method='predict_proba' 
)

# Stacking Classifier with RandomForest as meta-model
stacking_classifier_rf_proba = StackingClassifier(
    estimators=classifier_estimators,
    final_estimator=random_forest_classifier,
    stack_method='predict_proba' 
)

models = {
    'Stacking Regressor (Ridge Meta)': stacking_regressor_ridge,
    'Stacking Regressor (RandomForest Meta)': stacking_regressor_rf,
    'Stacking Classifier (Ridge Meta)': stacking_classifier_ridge,
    'Stacking Classifier (RandomForest Meta)': stacking_classifier_rf,
    'Stacking Classifier (Ridge Meta, Proba)': stacking_classifier_ridge_proba,
    'Stacking Classifier (RandomForest Meta, Proba)': stacking_classifier_rf_proba,
    'Random Forest Baseline': RandomForestClassifier(),
}

# Run cross-validation for all models
results = run_multi_model_cv(X, y, models)

# Plot results for all models
plot_multi_model_results(results)

Best mean F1 test score: 0.6504 by model: Stacking Regressor (Ridge Meta)
Min F1 test score: 0.5665, Max F1 test score: 0.7761

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import KFold

class CustomStackingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators, final_estimator=Ridge(), cv=20):
        """
        Custom stacking model that can combine both classifiers and regressors as base learners.
        
        Parameters:
        - estimators: list of (name, estimator) tuples.
        - final_estimator: the meta-model to train on the stacked outputs.
        - cv: number of cross-validation folds for generating out-of-fold predictions.
        """
        self.estimators = estimators
        self.final_estimator = final_estimator
        self.cv = cv
        self.fitted_estimators_ = []
        self.n_classes = 5

    def fit(self, X, y):
        self.fitted_estimators_ = []
        # Initialize list to hold stacked features
        stacked_features_list = []

        n_samples = X.shape[0]
    
        # Cross-validation setup
        kf = KFold(n_splits=self.cv)
        
        # For each estimator, generate out-of-fold predictions
        for i, (name, estimator) in enumerate(self.estimators):
            # Determine the shape of oof_predictions based on predict or predict_proba
            if hasattr(estimator, "predict_proba"):
                oof_predictions = np.zeros((n_samples, self.n_classes))
            else:
                oof_predictions = np.zeros((n_samples, 1))

            for train_idx, valid_idx in kf.split(X):
                # Use .iloc to access the rows based on integer-based indexing
                X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
                y_train = y.iloc[train_idx]


                # Fit the estimator on the training fold
                fitted_estimator = estimator.fit(X_train, y_train)
                
                # Predict on the validation fold
                if hasattr(fitted_estimator, "predict_proba"):
                    # Store multi-class probabilities in the oof_predictions matrix
                    predictions = fitted_estimator.predict_proba(X_valid)
                    oof_predictions[valid_idx] = predictions
                else:
                    predictions = fitted_estimator.predict(X_valid).reshape(-1, 1)
                    oof_predictions[valid_idx] = predictions

            # Store the out-of-fold predictions in the list of stacked features
            stacked_features_list.append(oof_predictions)
            
            # Fit the full estimator on the entire dataset (for future predictions)
            self.fitted_estimators_.append(estimator.fit(X, y))
    
        # Concatenate all stacked features (column-wise) after all estimators are fitted
        stacked_features = np.hstack(stacked_features_list)
    
        # Fit the final meta-model on the stacked features
        self.final_estimator.fit(stacked_features, y)
        
        return self

    def _get_stacked_features(self, X):
        # Generate predictions from the base estimators for the new data
        stacked_features_list = []
        for estimator in self.fitted_estimators_:
            if hasattr(estimator, "predict_proba"):
                # Concatenate all class probabilities for multi-class classification
                stacked_features_list.append(estimator.predict_proba(X))
            else:
                stacked_features_list.append(estimator.predict(X).reshape(-1, 1))
        
        # Concatenate stacked features column-wise
        stacked_features = np.hstack(stacked_features_list)
        return stacked_features

    def predict(self, X):
        # Get stacked features for the test set
        stacked_features = self._get_stacked_features(X)
        
        # Predict using the final estimator (meta-model)
        return self.final_estimator.predict(stacked_features)


# Example usage of CustomStackingModel
regressor_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], RegressorMixin)]
classifier_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], ClassifierMixin)]

# Combine both regressors and classifiers as base learners
combined_estimators = regressor_estimators + classifier_estimators

# Define the custom stacking model with both classifiers and regressors, using Ridge as the meta-model
custom_stacking_model_ridge = CustomStackingModel(
    estimators=combined_estimators,
    final_estimator=Ridge({'alpha': 5.787655426374446})
)

custom_stacking_model_rf = CustomStackingModel(
    estimators=combined_estimators,
    final_estimator=RandomForestRegressor()
)

custom_stacking_model_lgbm = CustomStackingModel(
    estimators=combined_estimators,
    final_estimator= LGBMRegressor(verbose=-1)
)

custom_stacking_model_rf_class = CustomStackingModel(
    estimators=combined_estimators,
    final_estimator= RandomForestClassifier()
)

models = {
    'custom_stacking_model_ridge': stacking_regressor_ridge,
    'custom_stacking_model_rf': custom_stacking_model_rf,
    'custom_stacking_model_rf_class': custom_stacking_model_rf_class,
    'custom_stacking_model_lgbm': custom_stacking_model_lgbm,
    'Random Forest Baseline': RandomForestClassifier(),
}

# Run cross-validation for all models
results = run_multi_model_cv(X, y, models)

# Plot results for all models
plot_multi_model_results(results)

Best mean F1 test score: 0.6510 by model: custom_stacking_model_ridge
Min F1 test score: 0.5617, Max F1 test score: 0.7781

data_train = pd.read_csv('module6_course_train.csv')
data_test = pd.read_csv('module6_course_test.csv')

y_train = data_train.pop('Score')
X_train = data_train.copy()

y_test = data_test.pop('Score')
X_test = data_test.copy()

custom_stacking_model_rf.fit(X_train, y_train)

CustomStackingModel(estimators=[('RandomForestRegressor',
                                 RandomForestRegressor(max_depth=25,
                                                       min_samples_leaf=20,
                                                       min_samples_split=20,
                                                       n_estimators=10)),
                                ('XGBRegressor',
                                 XGBRegressor(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=0.7461282882550149,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable...
                                               interaction_constraints=None,
                                               learning_rate=0.6788945751630601,
                                               max_bin=None,
                                               max_cat_threshold=None,
                                               max_cat_to_onehot=None,
                                               max_delta_step=None, max_depth=1,
                                               max_leaves=None,
                                               min_child_weight=None,
                                               missing=nan,
                                               monotone_constraints=None,
                                               multi_strategy=None,
                                               n_estimators=81, n_jobs=None,
                                               num_parallel_tree=None,
                                               objective='multi:softprob', ...))],
                    final_estimator=RandomForestRegressor())

CustomStackingModel(estimators=[('RandomForestRegressor',
                                 RandomForestRegressor(max_depth=25,
                                                       min_samples_leaf=20,
                                                       min_samples_split=20,
                                                       n_estimators=10)),
                                ('XGBRegressor',
                                 XGBRegressor(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=0.7461282882550149,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable...
                                               interaction_constraints=None,
                                               learning_rate=0.6788945751630601,
                                               max_bin=None,
                                               max_cat_threshold=None,
                                               max_cat_to_onehot=None,
                                               max_delta_step=None, max_depth=1,
                                               max_leaves=None,
                                               min_child_weight=None,
                                               missing=nan,
                                               monotone_constraints=None,
                                               multi_strategy=None,
                                               n_estimators=81, n_jobs=None,
                                               num_parallel_tree=None,
                                               objective='multi:softprob', ...))],
                    final_estimator=RandomForestRegressor())

RandomForestRegressor()

RandomForestRegressor()

y_pred_test = custom_stacking_model_rf.predict(X_test)
y_pred_train = custom_stacking_model_rf.predict(X_train)

f1_train = custom_f1_score(y_train, y_pred_train)
f1_test = custom_f1_score(y_test, y_pred_test)

print(f"f1_train: {f1_train}")
print(f"f1_test: {f1_test}")

f1_train: 0.7262607298491914
f1_test: 0.7111708369916983

lgmb_best = LGBMRegressor(**{'verbose': -1, 'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208})

lgmb_best.fit(X_train, y_train)

LGBMRegressor(learning_rate=0.05545444518979953, max_depth=8,
              min_child_samples=37, n_estimators=159, num_leaves=208,
              verbose=-1)

LGBMRegressor(learning_rate=0.05545444518979953, max_depth=8,
              min_child_samples=37, n_estimators=159, num_leaves=208,
              verbose=-1)

y_pred_test = lgmb_best.predict(X_test)
y_pred_train = lgmb_best.predict(X_train)

f1_train = custom_f1_score(y_train, y_pred_train)
f1_test = custom_f1_score(y_test, y_pred_test)

print(f"f1_train: {f1_train}")
print(f"f1_test: {f1_test}")

f1_train: 0.7292455204969205
f1_test: 0.7086080988315853

Data Collection¶

Data Analysis¶

Model Building and Evaluate¶

Simple Baseline¶

Compare different models¶

Optimize models¶

Bagging¶

Stacking¶

Custom Stacking¶

Eval on unseen data¶

	Temperature	Humidity	Humex	CO2	Bright	weekday_0	weekday_1	weekday_2	weekday_3	weekday_4	weekday_5	weekday_6	hour_sine_wave	Score
0	-0.151174	2.695116	1.175429	-0.258951	-0.528247	False	False	False	True	False	False	False	-0.129410	3
1	-0.089558	2.573765	1.175429	-0.397135	-0.528247	False	False	False	False	True	False	False	0.000000	3
2	-0.027943	2.573765	1.252984	-0.046741	-0.528247	False	False	False	False	True	False	False	0.129410	3
3	0.033673	2.573765	1.330538	0.214821	-0.528247	False	False	False	False	True	False	False	0.250000	1
4	0.033673	2.452414	1.252984	0.431967	-0.528247	False	False	False	False	True	False	False	0.433013	3
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
6995	-1.260252	-1.066767	-1.500198	-0.984416	-0.007760	False	False	False	False	False	False	True	-0.433013	1
6996	-1.321868	-1.066767	-1.577752	-1.028832	-0.189931	False	False	False	False	False	False	True	-0.433013	0
6997	-1.445098	-0.945416	-1.616530	-0.930129	-0.320052	False	False	False	False	False	False	True	-0.482963	0
6998	-1.568329	-0.945416	-1.694084	-1.004156	-0.515235	False	False	False	False	False	False	True	-0.500000	1
6999	-1.691560	-0.945416	-1.810416	-0.984416	-0.528247	False	False	False	False	False	False	True	-0.482963	1