In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, RidgeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor, StackingRegressor, StackingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import make_scorer, f1_score
from sklearn.base import RegressorMixin, ClassifierMixin, BaseEstimator
from sklearn.ensemble import VotingClassifier, VotingRegressor
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
Data Collection¶
In [4]:
import requests
# URLs of the files
data_train_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_train.csv'
data_test_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_test.csv'
# Function to download a file
def download_file(url, file_name):
response = requests.get(url)
response.raise_for_status() # Ensure we notice bad responses
with open(file_name, 'wb') as file:
file.write(response.content)
print(f'Downloaded {file_name} from {url}')
# Downloading the files
download_file(data_train_url, 'module6_course_train.csv')
download_file(data_train_url, 'module6_course_test.csv')
Downloaded module6_course_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_train.csv Downloaded module6_course_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module6/course/module6_course_train.csv
In [5]:
data_train = pd.read_csv('module6_course_train.csv')
data_test = pd.read_csv('module6_course_test.csv')
Data Analysis¶
In [6]:
data_train
Out[6]:
| Temperature | Humidity | Humex | CO2 | Bright | weekday_0 | weekday_1 | weekday_2 | weekday_3 | weekday_4 | weekday_5 | weekday_6 | hour_sine_wave | Score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.151174 | 2.695116 | 1.175429 | -0.258951 | -0.528247 | False | False | False | True | False | False | False | -0.129410 | 3 |
| 1 | -0.089558 | 2.573765 | 1.175429 | -0.397135 | -0.528247 | False | False | False | False | True | False | False | 0.000000 | 3 |
| 2 | -0.027943 | 2.573765 | 1.252984 | -0.046741 | -0.528247 | False | False | False | False | True | False | False | 0.129410 | 3 |
| 3 | 0.033673 | 2.573765 | 1.330538 | 0.214821 | -0.528247 | False | False | False | False | True | False | False | 0.250000 | 1 |
| 4 | 0.033673 | 2.452414 | 1.252984 | 0.431967 | -0.528247 | False | False | False | False | True | False | False | 0.433013 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6995 | -1.260252 | -1.066767 | -1.500198 | -0.984416 | -0.007760 | False | False | False | False | False | False | True | -0.433013 | 1 |
| 6996 | -1.321868 | -1.066767 | -1.577752 | -1.028832 | -0.189931 | False | False | False | False | False | False | True | -0.433013 | 0 |
| 6997 | -1.445098 | -0.945416 | -1.616530 | -0.930129 | -0.320052 | False | False | False | False | False | False | True | -0.482963 | 0 |
| 6998 | -1.568329 | -0.945416 | -1.694084 | -1.004156 | -0.515235 | False | False | False | False | False | False | True | -0.500000 | 1 |
| 6999 | -1.691560 | -0.945416 | -1.810416 | -0.984416 | -0.528247 | False | False | False | False | False | False | True | -0.482963 | 1 |
7000 rows × 14 columns
In [5]:
data_train.isnull().sum()
Out[5]:
Temperature 0 Humidity 0 Humex 0 CO2 0 Bright 0 weekday_0 0 weekday_1 0 weekday_2 0 weekday_3 0 weekday_4 0 weekday_5 0 weekday_6 0 hour_sine_wave 0 Score 0 dtype: int64
In [6]:
data_train['Score'].value_counts(normalize=True)
Out[6]:
Score 2 0.420143 1 0.272429 3 0.232571 0 0.044857 4 0.030000 Name: proportion, dtype: float64
Model Building and Evaluate¶
In [7]:
y = data_train.pop('Score')
X = data_train.copy()
In [8]:
# Function to plot the evaluation results
def plot_results(mse_train, mse_test, f1_train, f1_test):
plt.figure(figsize=(12, 6))
# MSE plot
plt.subplot(1, 2, 1)
plt.plot(mse_train, label="Train MSE", marker='o')
plt.plot(mse_test, label="Test MSE", marker='o')
plt.fill_between(range(len(mse_train)), np.min(mse_train), np.max(mse_train), color='blue', alpha=0.1)
plt.fill_between(range(len(mse_test)), np.min(mse_test), np.max(mse_test), color='orange', alpha=0.1)
plt.title("MSE over Folds")
plt.xlabel("Fold")
plt.ylabel("MSE")
plt.legend()
plt.grid(True)
# f1_score plot
plt.subplot(1, 2, 2)
plt.plot(f1_train, label="Train f1_score", marker='o')
plt.plot(f1_test, label="Test f1_score", marker='o')
plt.fill_between(range(len(f1_train)), np.min(f1_train), np.max(f1_train), color='blue', alpha=0.1)
plt.fill_between(range(len(f1_test)), np.min(f1_test), np.max(f1_test), color='orange', alpha=0.1)
plt.title("f1_score over Folds")
plt.xlabel("Fold")
plt.ylabel("f1_score")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
def plot_multi_model_results(results):
# Set up the plot
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 20))
# Colors for train and test
train_color = 'skyblue'
test_color = 'lightgreen'
# Plot MSE
ax1.set_title('Mean Squared Error (MSE) Comparison', fontsize=16)
ax1.set_ylabel('MSE', fontsize=12)
ax1.set_xlabel('Models', fontsize=12)
ax1.grid(True, linestyle='--', alpha=0.7)
# Plot f1_score
ax2.set_title('f1_score Comparison', fontsize=16)
ax2.set_ylabel('f1_score', fontsize=12)
ax2.set_xlabel('Models', fontsize=12)
ax2.grid(True, linestyle='--', alpha=0.7)
x = np.arange(len(results))
width = 0.35
for i, (model_name, scores) in enumerate(results.items()):
# MSE
mse_train = scores['mse_train']
mse_test = scores['mse_test']
ax1.bar(x[i] - width/2, np.mean(mse_train), width, label='Train' if i == 0 else "",
color=train_color, alpha=0.7)
ax1.bar(x[i] + width/2, np.mean(mse_test), width, label='Test' if i == 0 else "",
color=test_color, alpha=0.7)
ax1.errorbar(x[i] - width/2, np.mean(mse_train),
yerr=[[np.mean(mse_train)-np.min(mse_train)], [np.max(mse_train)-np.mean(mse_train)]],
fmt='none', ecolor='black', capsize=5)
ax1.errorbar(x[i] + width/2, np.mean(mse_test),
yerr=[[np.mean(mse_test)-np.min(mse_test)], [np.max(mse_test)-np.mean(mse_test)]],
fmt='none', ecolor='black', capsize=5)
# f1_score
f1_train = scores['f1_train']
f1_test = scores['f1_test']
ax2.bar(x[i] - width/2, np.mean(f1_train), width, label='Train' if i == 0 else "",
color=train_color, alpha=0.7)
ax2.bar(x[i] + width/2, np.mean(f1_test), width, label='Test' if i == 0 else "",
color=test_color, alpha=0.7)
ax2.errorbar(x[i] - width/2, np.mean(f1_train),
yerr=[[np.mean(f1_train)-np.min(f1_train)], [np.max(f1_train)-np.mean(f1_train)]],
fmt='none', ecolor='black', capsize=5)
ax2.errorbar(x[i] + width/2, np.mean(f1_test),
yerr=[[np.mean(f1_test)-np.min(f1_test)], [np.max(f1_test)-np.mean(f1_test)]],
fmt='none', ecolor='black', capsize=5)
ax1.set_xticks(x)
ax1.set_xticklabels(results.keys(), rotation=45, ha='right')
ax2.set_xticks(x)
ax2.set_xticklabels(results.keys(), rotation=45, ha='right')
ax1.legend(loc='upper left')
ax2.legend(loc='upper left')
plt.tight_layout()
plt.show()
In [9]:
def cast_and_clip_predictions(y_pred):
# Round the predictions to the nearest integer
y_pred_int = np.round(y_pred).astype(int)
# Clip the predictions to be within the range of y_true
y_pred_int = np.clip(y_pred_int, np.min(0), np.max(4))
return y_pred_int
# Custom scorer that incorporates casting and clipping
def custom_f1_score(y_true, y_pred):
y_pred_cast = cast_and_clip_predictions(y_pred)
return f1_score(y_true, y_pred_cast, average="weighted")
In [10]:
# Function to handle train-test evaluation in a fold
def train_and_evaluate(X_train, X_test, y_train, y_test, model):
# Train the model
model.fit(X_train, y_train)
# Make predictions on train set
y_pred_train = model.predict(X_train)
# Make predictions on train set
y_pred_test = model.predict(X_test)
# Compute MSE for train and test
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
# Compute f1_score
f1_train = custom_f1_score(y_train, y_pred_train)
f1_test = custom_f1_score(y_test, y_pred_test)
return mse_train, mse_test, f1_train, f1_test
def run_multi_model_cv(X, y, models, n_splits=5):
sfold = StratifiedKFold(n_splits=n_splits)
results = {name: {'mse_train': [], 'mse_test': [], 'f1_train': [], 'f1_test': []}
for name in models.keys()}
for train_index, test_index in sfold.split(X, y):
X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()
for name, model in models.items():
mse_train, mse_test, f1_train, f1_test = train_and_evaluate(
X_train, X_test, y_train, y_test, model
)
results[name]['mse_train'].append(mse_train)
results[name]['mse_test'].append(mse_test)
results[name]['f1_train'].append(f1_train)
results[name]['f1_test'].append(f1_test)
# Find the model with the best mean F1 test score
best_mean_f1_score = -1
best_model = None
best_min_f1 = None
best_max_f1 = None
for name, result in results.items():
f1_test_scores = result['f1_test']
mean_f1_test = sum(f1_test_scores) / len(f1_test_scores) # Calculate mean F1 score
min_f1_test = min(f1_test_scores) # Minimum F1 score
max_f1_test = max(f1_test_scores) # Maximum F1 score
if mean_f1_test > best_mean_f1_score:
best_mean_f1_score = mean_f1_test
best_min_f1 = min_f1_test
best_max_f1 = max_f1_test
best_model = name
# Print the best mean F1 test score, min, max, and the associated model
print(f"Best mean F1 test score: {best_mean_f1_score:.4f} by model: {best_model}")
print(f"Min F1 test score: {best_min_f1:.4f}, Max F1 test score: {best_max_f1:.4f}")
return results
Simple Baseline¶
In [11]:
# Step 1: Initialize model
model = RandomForestClassifier()
# Step 2: Run cross-validation
results = run_multi_model_cv(X, y, {"RandomForestClassifier": model})
# Step 3: Plot the results
plot_results(results["RandomForestClassifier"]["mse_train"],
results["RandomForestClassifier"]["mse_test"],
results["RandomForestClassifier"]["f1_train"],
results["RandomForestClassifier"]["f1_test"])
Best mean F1 test score: 0.3970 by model: RandomForestClassifier Min F1 test score: 0.3733, Max F1 test score: 0.4264
Compare different models¶
In [12]:
models = {
'Ridge': Ridge(),
'Decision Tree Regressor': DecisionTreeRegressor(),
'Random Forest Regressor': RandomForestRegressor(),
'SVR': SVR(),
'Lasso': Lasso(max_iter=5000),
'KNN Regressor': KNeighborsRegressor(),
'Logistic Regression': LogisticRegression(),
'Decision Tree Classifier': DecisionTreeClassifier(),
'Random Forest Classifier': RandomForestClassifier(),
'SVC': SVC(),
'KNN Classifier': KNeighborsClassifier(),
'XGBRegressor': XGBRegressor(),
'LGBMRegressor': LGBMRegressor(verbose=-1)
}
In [13]:
# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)
Best mean F1 test score: 0.6188 by model: LGBMRegressor Min F1 test score: 0.5145, Max F1 test score: 0.7604
In [14]:
# Plot MSE results for regression models
plot_multi_model_results(results)
Optimize models¶
In [15]:
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from skopt.callbacks import DeltaYStopper
# Define the search spaces for each model
spaces = {
'RandomForestRegressor': {
'n_estimators': Integer(10, 500),
'max_depth': Integer(1, 50),
'min_samples_split': Integer(2, 20),
'min_samples_leaf': Integer(1, 20)
},
'XGBRegressor': {
'n_estimators': Integer(10, 200),
'max_depth': Integer(1, 50),
'learning_rate': Real(0.01, 2.0, 'log-uniform'),
'subsample': Real(0.5, 1.0, 'uniform'),
'colsample_bytree': Real(0.5, 1.0, 'uniform')
},
'LGBMRegressor': {
'n_estimators': Integer(10, 200),
'max_depth': Integer(1, 50),
'learning_rate': Real(0.01, 1.0, 'log-uniform'),
'num_leaves': Integer(20, 300),
'min_child_samples': Integer(1, 100)
},
'SVR': {
'C': Real(0.1, 10.0, 'log-uniform'),
'epsilon': Real(0.001, 1.0, 'log-uniform'),
'kernel': Categorical(['linear', 'rbf', 'poly'])
},
'KNNRegressor': {
'n_neighbors': Integer(1, 50),
'weights': Categorical(['uniform', 'distance']),
'p': Integer(1, 2)
},
'Lasso': {
'alpha': Real(0.0001, 10.0, 'log-uniform')
},
'Ridge': {
'alpha': Real(0.01, 10.0, 'log-uniform'),
},
'LogisticRegression': {
'C': Real(0.01, 10.0, 'log-uniform'),
'solver': Categorical(['lbfgs', 'liblinear'])
},
'RandomForestClassifier': {
'n_estimators': Integer(10, 500),
'max_depth': Integer(1, 50),
'min_samples_split': Integer(2, 20),
'min_samples_leaf': Integer(1, 20)
},
'XGBClassifier': {
'n_estimators': Integer(10, 200),
'max_depth': Integer(1, 50),
'learning_rate': Real(0.01, 2.0, 'log-uniform'),
'subsample': Real(0.5, 1.0, 'uniform'),
'colsample_bytree': Real(0.5, 1.0, 'uniform')
}
}
def optimizer_callback(res):
if len(res.func_vals) % 5 == 0: # Print every 5 iterations
print(f"Iteration {len(res.func_vals)}: Best score = {-res.fun:.4f}")
delta_stopper = DeltaYStopper(delta=0.001, n_best=10)
# Function to optimize models
def optimize_model(X, y, model, space, n_iter=200):
sfold = StratifiedKFold(n_splits=5)
scorer = make_scorer(custom_f1_score)
opt = BayesSearchCV(
model,
space,
n_iter=n_iter,
n_points=5,
cv=sfold,
n_jobs=-1,
scoring=scorer,
random_state=42
)
opt.fit(X, y, callback=[optimizer_callback, delta_stopper])
return opt
# Optimize models
models = {
'RandomForestRegressor': RandomForestRegressor(),
'XGBRegressor': XGBRegressor(),
'LGBMRegressor': LGBMRegressor(verbose=-1),
'SVR': SVR(),
'KNNRegressor': KNeighborsRegressor(),
'Lasso': Lasso(max_iter=5000),
'Ridge': Ridge(),
'LogisticRegression': LogisticRegression(max_iter=500),
'RandomForestClassifier': RandomForestClassifier(),
'XGBClassifier': XGBClassifier(),
}
# models_opt = {}
# for name, model in models.items():
# print(f"Optimizing {name}...")
# opt = optimize_model(X, y, model, spaces[name])
# models_opt[name] = opt
# print(f"Best parameters: {opt.best_params_}")
# print(f"Best score: {opt.best_score_:.4f}")
# print()
In [16]:
# Optimizing RandomForestRegressor...
# Iteration 5: Best score = 0.6446
# Iteration 10: Best score = 0.6446
# Iteration 15: Best score = 0.6462
# Iteration 20: Best score = 0.6462
# Iteration 25: Best score = 0.6462
# Iteration 30: Best score = 0.6462
# Iteration 35: Best score = 0.6462
# Iteration 40: Best score = 0.6462
# Iteration 45: Best score = 0.6462
# Iteration 50: Best score = 0.6462
# Iteration 55: Best score = 0.6462
# Iteration 60: Best score = 0.6462
# Iteration 65: Best score = 0.6462
# Iteration 70: Best score = 0.6462
# Iteration 75: Best score = 0.6462
# Iteration 80: Best score = 0.6462
# Iteration 85: Best score = 0.6509
# Iteration 90: Best score = 0.6509
# Iteration 95: Best score = 0.6509
# Iteration 100: Best score = 0.6509
# Iteration 105: Best score = 0.6509
# Iteration 110: Best score = 0.6509
# Iteration 115: Best score = 0.6509
# Iteration 120: Best score = 0.6509
# Iteration 125: Best score = 0.6509
# Iteration 130: Best score = 0.6509
# Iteration 135: Best score = 0.6509
# Iteration 140: Best score = 0.6509
# Iteration 145: Best score = 0.6509
# Iteration 150: Best score = 0.6509
# Iteration 155: Best score = 0.6509
# Iteration 160: Best score = 0.6509
# Iteration 165: Best score = 0.6509
# Iteration 170: Best score = 0.6509
# Iteration 175: Best score = 0.6509
# Iteration 180: Best score = 0.6509
# Iteration 185: Best score = 0.6509
# Iteration 190: Best score = 0.6509
# Iteration 195: Best score = 0.6509
# Iteration 200: Best score = 0.6509
# Best parameters: OrderedDict({'max_depth': 25, 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 10})
# Best score: 0.6509
# Optimizing XGBRegressor...
# Iteration 5: Best score = 0.5863
# Iteration 10: Best score = 0.5863
# Iteration 15: Best score = 0.5863
# Iteration 20: Best score = 0.6188
# Iteration 25: Best score = 0.6188
# Iteration 30: Best score = 0.6188
# Iteration 35: Best score = 0.6188
# Iteration 40: Best score = 0.6207
# Iteration 45: Best score = 0.6207
# Iteration 50: Best score = 0.6207
# Iteration 55: Best score = 0.6207
# Iteration 60: Best score = 0.6273
# Iteration 65: Best score = 0.6273
# Iteration 70: Best score = 0.6328
# Iteration 75: Best score = 0.6328
# Iteration 80: Best score = 0.6330
# Iteration 85: Best score = 0.6378
# Iteration 90: Best score = 0.6378
# Iteration 95: Best score = 0.6378
# Iteration 100: Best score = 0.6378
# Iteration 105: Best score = 0.6378
# Iteration 110: Best score = 0.6378
# Iteration 115: Best score = 0.6378
# Iteration 120: Best score = 0.6378
# Iteration 125: Best score = 0.6378
# Iteration 130: Best score = 0.6378
# Iteration 135: Best score = 0.6378
# Iteration 140: Best score = 0.6378
# Iteration 145: Best score = 0.6378
# Iteration 150: Best score = 0.6378
# Iteration 155: Best score = 0.6378
# Iteration 160: Best score = 0.6378
# Iteration 165: Best score = 0.6378
# Iteration 170: Best score = 0.6378
# Iteration 175: Best score = 0.6378
# Iteration 180: Best score = 0.6378
# Iteration 185: Best score = 0.6378
# Iteration 190: Best score = 0.6378
# Iteration 195: Best score = 0.6378
# Iteration 200: Best score = 0.6378
# Best parameters: OrderedDict({'colsample_bytree': 0.7461282882550149, 'learning_rate': 0.9936330838999975, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.9395779652231953})
# Best score: 0.6378
# Optimizing LGBMRegressor...
# Iteration 5: Best score = 0.6416
# Iteration 10: Best score = 0.6497
# Iteration 15: Best score = 0.6497
# Iteration 20: Best score = 0.6497
# Iteration 25: Best score = 0.6497
# Iteration 30: Best score = 0.6497
# Iteration 35: Best score = 0.6497
# Iteration 40: Best score = 0.6521
# Iteration 45: Best score = 0.6521
# Iteration 50: Best score = 0.6521
# Iteration 55: Best score = 0.6521
# Iteration 60: Best score = 0.6521
# Iteration 65: Best score = 0.6521
# Iteration 70: Best score = 0.6521
# Iteration 75: Best score = 0.6521
# Iteration 80: Best score = 0.6521
# Iteration 85: Best score = 0.6521
# Iteration 90: Best score = 0.6521
# Iteration 95: Best score = 0.6521
# Iteration 100: Best score = 0.6521
# Iteration 105: Best score = 0.6521
# Iteration 110: Best score = 0.6521
# Iteration 115: Best score = 0.6521
# Iteration 120: Best score = 0.6521
# Iteration 125: Best score = 0.6521
# Iteration 130: Best score = 0.6521
# Iteration 135: Best score = 0.6521
# Iteration 140: Best score = 0.6521
# Iteration 145: Best score = 0.6521
# Iteration 150: Best score = 0.6521
# Iteration 155: Best score = 0.6521
# Iteration 160: Best score = 0.6521
# Iteration 165: Best score = 0.6521
# Iteration 170: Best score = 0.6521
# Iteration 175: Best score = 0.6521
# Iteration 180: Best score = 0.6521
# Iteration 185: Best score = 0.6521
# Iteration 190: Best score = 0.6521
# Iteration 195: Best score = 0.6521
# Iteration 200: Best score = 0.6521
# Best parameters: OrderedDict({'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208})
# Best score: 0.6521
# Optimizing SVR...
# Iteration 5: Best score = 0.5336
# Iteration 10: Best score = 0.5336
# Iteration 15: Best score = 0.5522
# Iteration 20: Best score = 0.5522
# Iteration 25: Best score = 0.5522
# Iteration 30: Best score = 0.5635
# Iteration 35: Best score = 0.5635
# Iteration 40: Best score = 0.5635
# Iteration 45: Best score = 0.5635
# Iteration 50: Best score = 0.5635
# Iteration 55: Best score = 0.5655
# Iteration 60: Best score = 0.5674
# Iteration 65: Best score = 0.5674
# Iteration 70: Best score = 0.5674
# Iteration 75: Best score = 0.5674
# Iteration 80: Best score = 0.5674
# Iteration 85: Best score = 0.5674
# Iteration 90: Best score = 0.5674
# Iteration 95: Best score = 0.5674
# Iteration 100: Best score = 0.5677
# Best parameters: OrderedDict({'C': 0.1, 'epsilon': 0.38002726414738297, 'kernel': 'rbf'})
# Best score: 0.5677
# Optimizing KNNRegressor...
# Iteration 5: Best score = 0.5224
# Iteration 10: Best score = 0.5261
# Iteration 15: Best score = 0.5282
# Iteration 20: Best score = 0.5282
# Iteration 25: Best score = 0.5282
# Iteration 30: Best score = 0.5282
# Iteration 35: Best score = 0.5282
# Iteration 40: Best score = 0.5282
# Iteration 45: Best score = 0.5282
# Iteration 50: Best score = 0.5282
# Iteration 55: Best score = 0.5282
# Iteration 60: Best score = 0.5282
# Iteration 65: Best score = 0.5282
# Iteration 70: Best score = 0.5282
# Iteration 75: Best score = 0.5282
# Iteration 80: Best score = 0.5282
# Iteration 85: Best score = 0.5282
# Iteration 90: Best score = 0.5282
# Iteration 95: Best score = 0.5282
# Iteration 100: Best score = 0.5282
# Iteration 105: Best score = 0.5282
# Iteration 110: Best score = 0.5282
# Iteration 115: Best score = 0.5282
# Iteration 120: Best score = 0.5282
# Iteration 125: Best score = 0.5282
# Iteration 130: Best score = 0.5282
# Iteration 135: Best score = 0.5282
# Iteration 140: Best score = 0.5282
# Iteration 145: Best score = 0.5282
# Iteration 150: Best score = 0.5282
# Iteration 155: Best score = 0.5282
# Iteration 160: Best score = 0.5282
# Iteration 165: Best score = 0.5282
# Iteration 170: Best score = 0.5282
# Iteration 175: Best score = 0.5282
# Iteration 180: Best score = 0.5282
# Iteration 185: Best score = 0.5282
# Iteration 190: Best score = 0.5282
# Iteration 195: Best score = 0.5282
# Iteration 200: Best score = 0.5282
# Best parameters: OrderedDict({'n_neighbors': 50, 'p': 1, 'weights': 'uniform'})
# Best score: 0.5282
# Optimizing Lasso...
# Iteration 5: Best score = 0.4257
# Iteration 10: Best score = 0.4299
# Iteration 15: Best score = 0.4471
# Iteration 20: Best score = 0.4473
# Iteration 25: Best score = 0.4476
# Iteration 30: Best score = 0.4476
# Iteration 35: Best score = 0.4476
# Iteration 40: Best score = 0.4476
# Iteration 45: Best score = 0.4476
# Iteration 50: Best score = 0.4481
# Iteration 55: Best score = 0.4481
# Iteration 60: Best score = 0.4481
# Iteration 65: Best score = 0.4481
# Iteration 70: Best score = 0.4481
# Best parameters: OrderedDict({'alpha': 0.00039834351977457706})
# Best score: 0.4481
# Optimizing Ridge...
# Iteration 5: Best score = 0.4483
# Iteration 10: Best score = 0.4502
# Iteration 15: Best score = 0.4502
# Iteration 20: Best score = 0.4502
# Iteration 25: Best score = 0.4502
# Iteration 30: Best score = 0.4502
# Iteration 35: Best score = 0.4517
# Iteration 40: Best score = 0.4517
# Iteration 45: Best score = 0.4517
# Iteration 50: Best score = 0.4518
# Iteration 55: Best score = 0.4518
# Iteration 60: Best score = 0.4518
# Iteration 65: Best score = 0.4518
# Iteration 70: Best score = 0.4518
# Iteration 75: Best score = 0.4518
# Iteration 80: Best score = 0.4518
# Iteration 85: Best score = 0.4518
# Best parameters: OrderedDict({'alpha': 4.822503882256502})
# Best score: 0.4518
# Optimizing LogisticRegression...
# Iteration 5: Best score = 0.4351
# Iteration 10: Best score = 0.4361
# Iteration 15: Best score = 0.4361
# Iteration 20: Best score = 0.4369
# Iteration 25: Best score = 0.4380
# Iteration 30: Best score = 0.4380
# Iteration 35: Best score = 0.4380
# Iteration 40: Best score = 0.4380
# Iteration 45: Best score = 0.4380
# Iteration 50: Best score = 0.4380
# Iteration 55: Best score = 0.4380
# Iteration 60: Best score = 0.4380
# Iteration 65: Best score = 0.4380
# Iteration 70: Best score = 0.4380
# Iteration 75: Best score = 0.4380
# Iteration 80: Best score = 0.4380
# Iteration 85: Best score = 0.4380
# Iteration 90: Best score = 0.4380
# Iteration 95: Best score = 0.4380
# Iteration 100: Best score = 0.4380
# Iteration 105: Best score = 0.4380
# Iteration 110: Best score = 0.4380
# Iteration 115: Best score = 0.4380
# Iteration 120: Best score = 0.4380
# Iteration 125: Best score = 0.4380
# Iteration 130: Best score = 0.4380
# Iteration 135: Best score = 0.4380
# Iteration 140: Best score = 0.4380
# Iteration 145: Best score = 0.4380
# Iteration 150: Best score = 0.4380
# Iteration 155: Best score = 0.4380
# Iteration 160: Best score = 0.4380
# Iteration 165: Best score = 0.4380
# Iteration 170: Best score = 0.4380
# Iteration 175: Best score = 0.4380
# Best parameters: OrderedDict({'C': 9.96900468467878, 'solver': 'lbfgs'})
# Best score: 0.4380
# Optimizing RandomForestClassifier...
# Iteration 5: Best score = 0.5795
# Iteration 10: Best score = 0.5795
# Iteration 15: Best score = 0.5795
# Iteration 20: Best score = 0.5795
# Iteration 25: Best score = 0.5809
# Iteration 30: Best score = 0.5809
# Iteration 35: Best score = 0.5809
# Iteration 40: Best score = 0.5809
# Iteration 45: Best score = 0.5809
# Iteration 50: Best score = 0.5809
# Iteration 55: Best score = 0.5809
# Iteration 60: Best score = 0.5809
# Iteration 65: Best score = 0.5852
# Iteration 70: Best score = 0.5852
# Iteration 75: Best score = 0.5852
# Iteration 80: Best score = 0.5852
# Iteration 85: Best score = 0.5852
# Iteration 90: Best score = 0.5852
# Iteration 95: Best score = 0.5852
# Iteration 100: Best score = 0.5852
# Iteration 105: Best score = 0.5852
# Iteration 110: Best score = 0.5852
# Iteration 115: Best score = 0.5852
# Iteration 120: Best score = 0.5852
# Iteration 125: Best score = 0.5852
# Iteration 130: Best score = 0.5852
# Iteration 135: Best score = 0.5852
# Iteration 140: Best score = 0.5852
# Iteration 145: Best score = 0.5852
# Iteration 150: Best score = 0.5852
# Iteration 155: Best score = 0.5852
# Iteration 160: Best score = 0.5852
# Iteration 165: Best score = 0.5852
# Iteration 170: Best score = 0.5852
# Iteration 175: Best score = 0.5852
# Iteration 180: Best score = 0.5852
# Iteration 185: Best score = 0.5852
# Iteration 190: Best score = 0.5852
# Iteration 195: Best score = 0.5852
# Iteration 200: Best score = 0.5852
# Best parameters: OrderedDict({'max_depth': 16, 'min_samples_leaf': 20, 'min_samples_split': 5, 'n_estimators': 10})
# Best score: 0.5852
# Optimizing XGBClassifier...
# Iteration 5: Best score = 0.5183
# Iteration 10: Best score = 0.5183
# Iteration 15: Best score = 0.5183
# Iteration 20: Best score = 0.5726
# Iteration 25: Best score = 0.5829
# Iteration 30: Best score = 0.5908
# Iteration 35: Best score = 0.5908
# Iteration 40: Best score = 0.5908
# Iteration 45: Best score = 0.5908
# Iteration 50: Best score = 0.5908
# Iteration 55: Best score = 0.5912
# Iteration 60: Best score = 0.5912
# Iteration 65: Best score = 0.5912
# Iteration 70: Best score = 0.5912
# Iteration 75: Best score = 0.5912
# Iteration 80: Best score = 0.5912
# Iteration 85: Best score = 0.5912
# Iteration 90: Best score = 0.5912
# Iteration 95: Best score = 0.5912
# Iteration 100: Best score = 0.5912
# Iteration 105: Best score = 0.5912
# Iteration 110: Best score = 0.5912
# Iteration 115: Best score = 0.5912
# Iteration 120: Best score = 0.5912
# Iteration 125: Best score = 0.5912
# Iteration 130: Best score = 0.5912
# Iteration 135: Best score = 0.5912
# Iteration 140: Best score = 0.5912
# Iteration 145: Best score = 0.5912
# Iteration 150: Best score = 0.5912
# Iteration 155: Best score = 0.5912
# Iteration 160: Best score = 0.5912
# Iteration 165: Best score = 0.5912
# Iteration 170: Best score = 0.5912
# Iteration 175: Best score = 0.5912
# Iteration 180: Best score = 0.5912
# Iteration 185: Best score = 0.5912
# Iteration 190: Best score = 0.5912
# Iteration 195: Best score = 0.5912
# Iteration 200: Best score = 0.5912
# Best parameters: OrderedDict({'colsample_bytree': 1.0, 'learning_rate': 0.6788945751630601, 'max_depth': 1, 'n_estimators': 81, 'subsample': 1.0})
# Best score: 0.5912
In [109]:
# models_opt = {}
# for name, model in models.items():
# print(f"Optimizing {name}...")
# opt = optimize_model(X, y, model, spaces[name])
# models_opt[name] = opt
# print(f"Best parameters: {opt.best_params_}")
# print(f"Best score: {opt.best_score_:.4f}")
# print()
models_opt = {}
models_opt['RandomForestRegressor'] = {
'estimator': RandomForestRegressor(**{'max_depth': 25, 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 10}),
'best_params_': {'max_depth': 25, 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 10},
'best_score_': 0.6509
}
models_opt['XGBRegressor'] = {
'estimator': XGBRegressor(**{'colsample_bytree': 0.7461282882550149, 'learning_rate': 0.9936330838999975, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.9395779652231953}),
'best_params_': {'colsample_bytree': 0.7461282882550149, 'learning_rate': 0.9936330838999975, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.9395779652231953},
'best_score_': 0.6378
}
models_opt['LGBMRegressor'] = {
'estimator': LGBMRegressor(**{'verbose': -1, 'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208}),
'best_params_': {'verbose': -1, 'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208},
'best_score_': 0.6521
}
models_opt['SVR'] = {
'estimator': SVR(**{'C': 0.1, 'epsilon': 0.38002726414738297, 'kernel': 'rbf'}),
'best_params_': {'C': 0.1, 'epsilon': 0.38002726414738297, 'kernel': 'rbf'},
'best_score_': 0.5677
}
models_opt['KNNRegressor'] = {
'estimator': KNeighborsRegressor(**{'n_neighbors': 50, 'p': 1, 'weights': 'uniform'}),
'best_params_': {'n_neighbors': 50, 'p': 1, 'weights': 'uniform'},
'best_score_': 0.5282
}
models_opt['Lasso'] = {
'estimator': Lasso(**{'alpha': 0.00039834351977457706, "max_iter": 5000}),
'best_params_': {'alpha': 0.00039834351977457706, "max_iter": 5000},
'best_score_': 0.4481
}
models_opt['Ridge'] = {
'estimator': Ridge(**{'alpha': 4.822503882256502}),
'best_params_': {'alpha': 4.822503882256502},
'best_score_': 0.4518
}
models_opt['LogisticRegression'] = {
'estimator': LogisticRegression(**{'C': 9.96900468467878, 'solver': 'lbfgs', "max_iter": 500}),
'best_params_': {'C': 9.96900468467878, 'solver': 'lbfgs', "max_iter": 500},
'best_score_': 0.4380
}
models_opt['RandomForestClassifier'] = {
'estimator': RandomForestClassifier(**{'max_depth': 16, 'min_samples_leaf': 20, 'min_samples_split': 5, 'n_estimators': 10}),
'best_params_': {'max_depth': 16, 'min_samples_leaf': 20, 'min_samples_split': 5, 'n_estimators': 10},
'best_score_': 0.5852
}
models_opt['XGBClassifier'] = {
'estimator': XGBClassifier(**{'colsample_bytree': 1.0, 'learning_rate': 0.6788945751630601, 'max_depth': 1, 'n_estimators': 81, 'subsample': 1.0}),
'best_params_': {'colsample_bytree': 1.0, 'learning_rate': 0.6788945751630601, 'max_depth': 1, 'n_estimators': 81, 'subsample': 1.0},
'best_score_': 0.5912
}
In [18]:
models = {}
for name, opt in models_opt.items():
model_class = type(opt["estimator"]) # Get the model class (e.g., RandomForestRegressor)
best_params = opt["best_params_"] # Get the best parameters from the optimization
# Add the model with the best parameters to the new dictionary
models[f"{name} opt"] = model_class(**best_params)
# Add a baseline model manually (if needed)
models['Random Forest Baseline'] = RandomForestClassifier()
In [19]:
# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)
Best mean F1 test score: 0.6521 by model: LGBMRegressor opt Min F1 test score: 0.5816, Max F1 test score: 0.7691
In [20]:
# Plot MSE results for regression models
plot_multi_model_results(results)
Bagging¶
In [110]:
# Sort the models by their best score in descending order
top_3_models = sorted(models_opt.items(), key=lambda x: x[1]["best_score_"], reverse=True)
# Filter only classifiers from the sorted models
top_3_classifiers = [(name, opt["estimator"]) for name, opt in top_3_models if isinstance(opt["estimator"], ClassifierMixin)][:3]
# Print the top 3 classifiers for verification
print("Top 3 classifiers used in VotingClassifier:")
for name, estimator in top_3_classifiers:
print(f"{name}: {type(estimator).__name__}")
# Define the voting ensemble using the top 3 classifiers
bagging_vote_ensemble = VotingClassifier(estimators=top_3_classifiers)
# Sort the models by their best score in descending order
top_models = sorted(models_opt.items(), key=lambda x: x[1]["best_score_"], reverse=True)
# Filter only regressors from the sorted models
top_2_regressors = [(name, opt["estimator"]) for name, opt in top_models if isinstance(opt["estimator"], RegressorMixin)][:2]
# Print the top 2 regressors for verification
print("Top 2 regressors used in VotingRegressor:")
for name, estimator in top_2_regressors:
print(f"{name}: {type(estimator).__name__}")
# Define the voting ensemble using the top 2 regressors
voting_regressor_ensemble = VotingRegressor(estimators=top_2_regressors)
bagging_ensemble = BaggingRegressor(
estimator=RandomForestRegressor(),
n_estimators=10,
max_samples=0.8,
max_features=0.8,
bootstrap=True,
bootstrap_features=False,
n_jobs=-1
)
models = {
'bagging_ensemble': bagging_ensemble,
'bagging_vote_ensemble': bagging_vote_ensemble,
'voting_regressor_ensemble': voting_regressor_ensemble,
'Random Forest Baseline': RandomForestClassifier(),
}
Top 3 classifiers used in VotingClassifier: XGBClassifier: XGBClassifier RandomForestClassifier: RandomForestClassifier LogisticRegression: LogisticRegression Top 2 regressors used in VotingRegressor: LGBMRegressor: LGBMRegressor RandomForestRegressor: RandomForestRegressor
In [111]:
# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)
Best mean F1 test score: 0.6358 by model: voting_regressor_ensemble Min F1 test score: 0.5475, Max F1 test score: 0.7640
In [112]:
# Plot MSE results for regression models
plot_multi_model_results(results)
Stacking¶
In [28]:
# Separate regressors and classifiers from models_opt
regressor_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], RegressorMixin)]
classifier_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], ClassifierMixin)]
# Define the meta-models
ridge_regressor = Ridge()
random_forest_regressor = RandomForestRegressor()
ridge_classifier = RidgeClassifier() # RidgeClassifier for classification tasks
random_forest_classifier = RandomForestClassifier()
# Stacking Regressor with Ridge as meta-model
stacking_regressor_ridge = StackingRegressor(
estimators=regressor_estimators,
final_estimator=ridge_regressor
)
# Stacking Regressor with RandomForest as meta-model
stacking_regressor_rf = StackingRegressor(
estimators=regressor_estimators,
final_estimator=random_forest_regressor
)
# Stacking Classifier with Ridge as meta-model
stacking_classifier_ridge = StackingClassifier(
estimators=classifier_estimators,
final_estimator=ridge_classifier
)
# Stacking Classifier with RandomForest as meta-model
stacking_classifier_rf = StackingClassifier(
estimators=classifier_estimators,
final_estimator=random_forest_classifier
)
# Stacking Classifier with Ridge as meta-model
stacking_classifier_ridge_proba = StackingClassifier(
estimators=classifier_estimators,
final_estimator=ridge_classifier,
stack_method='predict_proba'
)
# Stacking Classifier with RandomForest as meta-model
stacking_classifier_rf_proba = StackingClassifier(
estimators=classifier_estimators,
final_estimator=random_forest_classifier,
stack_method='predict_proba'
)
In [29]:
models = {
'Stacking Regressor (Ridge Meta)': stacking_regressor_ridge,
'Stacking Regressor (RandomForest Meta)': stacking_regressor_rf,
'Stacking Classifier (Ridge Meta)': stacking_classifier_ridge,
'Stacking Classifier (RandomForest Meta)': stacking_classifier_rf,
'Stacking Classifier (Ridge Meta, Proba)': stacking_classifier_ridge_proba,
'Stacking Classifier (RandomForest Meta, Proba)': stacking_classifier_rf_proba,
'Random Forest Baseline': RandomForestClassifier(),
}
# Run cross-validation for all models
results = run_multi_model_cv(X, y, models)
# Plot results for all models
plot_multi_model_results(results)
Best mean F1 test score: 0.6504 by model: Stacking Regressor (Ridge Meta) Min F1 test score: 0.5665, Max F1 test score: 0.7761
Custom Stacking¶
In [82]:
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import KFold
class CustomStackingModel(BaseEstimator, RegressorMixin):
def __init__(self, estimators, final_estimator=Ridge(), cv=20):
"""
Custom stacking model that can combine both classifiers and regressors as base learners.
Parameters:
- estimators: list of (name, estimator) tuples.
- final_estimator: the meta-model to train on the stacked outputs.
- cv: number of cross-validation folds for generating out-of-fold predictions.
"""
self.estimators = estimators
self.final_estimator = final_estimator
self.cv = cv
self.fitted_estimators_ = []
self.n_classes = 5
def fit(self, X, y):
self.fitted_estimators_ = []
# Initialize list to hold stacked features
stacked_features_list = []
n_samples = X.shape[0]
# Cross-validation setup
kf = KFold(n_splits=self.cv)
# For each estimator, generate out-of-fold predictions
for i, (name, estimator) in enumerate(self.estimators):
# Determine the shape of oof_predictions based on predict or predict_proba
if hasattr(estimator, "predict_proba"):
oof_predictions = np.zeros((n_samples, self.n_classes))
else:
oof_predictions = np.zeros((n_samples, 1))
for train_idx, valid_idx in kf.split(X):
# Use .iloc to access the rows based on integer-based indexing
X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
y_train = y.iloc[train_idx]
# Fit the estimator on the training fold
fitted_estimator = estimator.fit(X_train, y_train)
# Predict on the validation fold
if hasattr(fitted_estimator, "predict_proba"):
# Store multi-class probabilities in the oof_predictions matrix
predictions = fitted_estimator.predict_proba(X_valid)
oof_predictions[valid_idx] = predictions
else:
predictions = fitted_estimator.predict(X_valid).reshape(-1, 1)
oof_predictions[valid_idx] = predictions
# Store the out-of-fold predictions in the list of stacked features
stacked_features_list.append(oof_predictions)
# Fit the full estimator on the entire dataset (for future predictions)
self.fitted_estimators_.append(estimator.fit(X, y))
# Concatenate all stacked features (column-wise) after all estimators are fitted
stacked_features = np.hstack(stacked_features_list)
# Fit the final meta-model on the stacked features
self.final_estimator.fit(stacked_features, y)
return self
def _get_stacked_features(self, X):
# Generate predictions from the base estimators for the new data
stacked_features_list = []
for estimator in self.fitted_estimators_:
if hasattr(estimator, "predict_proba"):
# Concatenate all class probabilities for multi-class classification
stacked_features_list.append(estimator.predict_proba(X))
else:
stacked_features_list.append(estimator.predict(X).reshape(-1, 1))
# Concatenate stacked features column-wise
stacked_features = np.hstack(stacked_features_list)
return stacked_features
def predict(self, X):
# Get stacked features for the test set
stacked_features = self._get_stacked_features(X)
# Predict using the final estimator (meta-model)
return self.final_estimator.predict(stacked_features)
# Example usage of CustomStackingModel
regressor_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], RegressorMixin)]
classifier_estimators = [(name, opt["estimator"]) for name, opt in models_opt.items() if isinstance(opt["estimator"], ClassifierMixin)]
# Combine both regressors and classifiers as base learners
combined_estimators = regressor_estimators + classifier_estimators
# Define the custom stacking model with both classifiers and regressors, using Ridge as the meta-model
custom_stacking_model_ridge = CustomStackingModel(
estimators=combined_estimators,
final_estimator=Ridge({'alpha': 5.787655426374446})
)
custom_stacking_model_rf = CustomStackingModel(
estimators=combined_estimators,
final_estimator=RandomForestRegressor()
)
custom_stacking_model_lgbm = CustomStackingModel(
estimators=combined_estimators,
final_estimator= LGBMRegressor(verbose=-1)
)
custom_stacking_model_rf_class = CustomStackingModel(
estimators=combined_estimators,
final_estimator= RandomForestClassifier()
)
In [44]:
models = {
'custom_stacking_model_ridge': stacking_regressor_ridge,
'custom_stacking_model_rf': custom_stacking_model_rf,
'custom_stacking_model_rf_class': custom_stacking_model_rf_class,
'custom_stacking_model_lgbm': custom_stacking_model_lgbm,
'Random Forest Baseline': RandomForestClassifier(),
}
# Run cross-validation for all models
results = run_multi_model_cv(X, y, models)
# Plot results for all models
plot_multi_model_results(results)
Best mean F1 test score: 0.6510 by model: custom_stacking_model_ridge Min F1 test score: 0.5617, Max F1 test score: 0.7781
Eval on unseen data¶
In [83]:
data_train = pd.read_csv('module6_course_train.csv')
data_test = pd.read_csv('module6_course_test.csv')
In [84]:
y_train = data_train.pop('Score')
X_train = data_train.copy()
In [85]:
y_test = data_test.pop('Score')
X_test = data_test.copy()
In [86]:
custom_stacking_model_rf.fit(X_train, y_train)
Out[86]:
CustomStackingModel(estimators=[('RandomForestRegressor',
RandomForestRegressor(max_depth=25,
min_samples_leaf=20,
min_samples_split=20,
n_estimators=10)),
('XGBRegressor',
XGBRegressor(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7461282882550149,
device=None,
early_stopping_rounds=None,
enable...
interaction_constraints=None,
learning_rate=0.6788945751630601,
max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=1,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=81, n_jobs=None,
num_parallel_tree=None,
objective='multi:softprob', ...))],
final_estimator=RandomForestRegressor())In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CustomStackingModel(estimators=[('RandomForestRegressor',
RandomForestRegressor(max_depth=25,
min_samples_leaf=20,
min_samples_split=20,
n_estimators=10)),
('XGBRegressor',
XGBRegressor(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7461282882550149,
device=None,
early_stopping_rounds=None,
enable...
interaction_constraints=None,
learning_rate=0.6788945751630601,
max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=1,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=81, n_jobs=None,
num_parallel_tree=None,
objective='multi:softprob', ...))],
final_estimator=RandomForestRegressor())RandomForestRegressor()
RandomForestRegressor()
In [87]:
y_pred_test = custom_stacking_model_rf.predict(X_test)
y_pred_train = custom_stacking_model_rf.predict(X_train)
In [88]:
f1_train = custom_f1_score(y_train, y_pred_train)
f1_test = custom_f1_score(y_test, y_pred_test)
print(f"f1_train: {f1_train}")
print(f"f1_test: {f1_test}")
f1_train: 0.7262607298491914 f1_test: 0.7111708369916983
In [105]:
lgmb_best = LGBMRegressor(**{'verbose': -1, 'learning_rate': 0.055454445189799526, 'max_depth': 8, 'min_child_samples': 37, 'n_estimators': 159, 'num_leaves': 208})
In [106]:
lgmb_best.fit(X_train, y_train)
Out[106]:
LGBMRegressor(learning_rate=0.05545444518979953, max_depth=8,
min_child_samples=37, n_estimators=159, num_leaves=208,
verbose=-1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LGBMRegressor(learning_rate=0.05545444518979953, max_depth=8,
min_child_samples=37, n_estimators=159, num_leaves=208,
verbose=-1)In [107]:
y_pred_test = lgmb_best.predict(X_test)
y_pred_train = lgmb_best.predict(X_train)
In [108]:
f1_train = custom_f1_score(y_train, y_pred_train)
f1_test = custom_f1_score(y_test, y_pred_test)
print(f"f1_train: {f1_train}")
print(f"f1_test: {f1_test}")
f1_train: 0.7292455204969205 f1_test: 0.7086080988315853