@yvki
Created April 15, 2024 07:53
Predicting Spaceship Titanic πŸš€ survival rates with different GridSearch classifier types βš™οΈ
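# Assumes the feature matrix X, target y, and the train/test splits
# (X_train, X_test, y_train, y_test) were prepared earlier in the notebook
# from the preprocessed Spaceship Titanic data (this setup is not shown in the gist)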
# 1. Extreme Gradient Boosting (comparable to LightGBM)
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgb = XGBClassifier(random_state=22)
param_grid_xgb = {
    'n_estimators': [300],
    'max_depth': [4],
    'learning_rate': [0.1],
    'subsample': [1.0],
    'colsample_bytree': [0.8]
}
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)
print("Best Parameters (XGB):", grid_search_xgb.best_params_)
accuracy_xgb_train = grid_search_xgb.best_estimator_.score(X_train, y_train)
print("Accuracy on train set (XGB):", accuracy_xgb_train)
accuracy_xgb_test = grid_search_xgb.best_estimator_.score(X_test, y_test)
print("Accuracy on test set (XGB):", accuracy_xgb_test)
# 2. Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=22)
param_grid_rf = {
    'n_estimators': [700],
    'max_depth': [12],
    'min_samples_split': [2],
    'min_samples_leaf': [6]
}
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)
print("Best Parameters (RF):", grid_search_rf.best_params_)
accuracy_rf_train = grid_search_rf.best_estimator_.score(X_train, y_train)
print("Accuracy on train set (RF):", accuracy_rf_train)
accuracy_rf_test = grid_search_rf.best_estimator_.score(X_test, y_test)
print("Accuracy on test set (RF):", accuracy_rf_test)
# 3. Categorical Boosting
from catboost import CatBoostClassifier

# CatBoost is strict about label types, so cast the boolean target to int
y_test = y_test.astype(int)
y_train = y_train.astype(int)

cat = CatBoostClassifier(verbose=0, random_state=22)
param_grid_cat = {
    'iterations': [1000],
    'learning_rate': [0.05],
    'depth': [4]
}
grid_search_cat = GridSearchCV(estimator=cat, param_grid=param_grid_cat, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_cat.fit(X_train, y_train)
print("Best Parameters (Cat):", grid_search_cat.best_params_)
accuracy_cat_train = grid_search_cat.best_estimator_.score(X_train, y_train)
print("Accuracy on train set (Cat):", accuracy_cat_train)
accuracy_cat_test = grid_search_cat.best_estimator_.score(X_test, y_test)
print("Accuracy on test set (Cat):", accuracy_cat_test)
# 4. Voting
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Oversample both classes to 5750 samples each; note the resampled data is
# only used for the self-training step (5), while the split below is taken
# from the original X, y
smote = SMOTE(sampling_strategy={0: 5750, 1: 5750}, random_state=22)
X_resampled, y_resampled = smote.fit_resample(X, y)
print(X_resampled.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Soft voting averages predicted probabilities, weighting the boosted models
# (XGB, CatBoost) twice as heavily as RF and logistic regression
ensemble_model = VotingClassifier(estimators=[
    ('xgb', grid_search_xgb.best_estimator_),
    ('rf', grid_search_rf.best_estimator_),
    ('lgrg', grid_search_lgrg.best_estimator_),
    ('cat', grid_search_cat.best_estimator_)
], voting='soft', weights=[4, 2, 2, 4])
ensemble_model.fit(X_train, y_train)
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('accuracy_ensemble:', accuracy)
# 5. Self-training
from sklearn.semi_supervised import SelfTrainingClassifier
# Wrap the soft-voting ensemble (which exposes predict_proba) as the base
# estimator, adding the 500 most confident pseudo-labels per iteration.
# Note: only samples labelled -1 are pseudo-labelled, so on fully labelled
# data this reduces to plain supervised fitting
self_training_clf = SelfTrainingClassifier(ensemble_model, criterion='k_best', k_best=500, max_iter=8)
self_training_clf.fit(X_resampled, y_resampled)
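# A quick side-by-side of the test accuracies collected above (a convenience
# sketch using only variables already defined; not part of the original gist):
for name, acc in [('XGB', accuracy_xgb_test), ('RF', accuracy_rf_test),
                  ('Cat', accuracy_cat_test), ('Ensemble', accuracy)]:
    print(f"{name:>8}: {acc:.4f}")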