Predicting Spaceship Titanic survival rates with different GridSearch classifier types
# 1. Extreme Gradient Boosting (comparable to LightGBM)
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgb = XGBClassifier(random_state=22)
param_grid_xgb = {
    'n_estimators': [300],
    'max_depth': [4],
    'learning_rate': [0.1],
    'subsample': [1.0],
    'colsample_bytree': [0.8]
}
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)
print("Best Parameters (XGB):", grid_search_xgb.best_params_)
accuracy_xgb_train = grid_search_xgb.best_estimator_.score(X_train, y_train)
print("Accuracy on train set (XGB):", accuracy_xgb_train)
accuracy_xgb_test = grid_search_xgb.best_estimator_.score(X_test, y_test)
print("Accuracy on test set (XGB):", accuracy_xgb_test)
# 2. Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=22)
param_grid_rf = {
    'n_estimators': [700],
    'max_depth': [12],
    'min_samples_split': [2],
    'min_samples_leaf': [6]
}
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)
print("Best Parameters (RF):", grid_search_rf.best_params_)
accuracy_rf_train = grid_search_rf.best_estimator_.score(X_train, y_train)
print("Accuracy on train set (RF):", accuracy_rf_train)
accuracy_rf_test = grid_search_rf.best_estimator_.score(X_test, y_test)
print("Accuracy on test set (RF):", accuracy_rf_test)
# 3. Categorical Boosting
from catboost import CatBoostClassifier

# CatBoost expects integer class labels
y_test = y_test.astype(int)
y_train = y_train.astype(int)
cat = CatBoostClassifier(verbose=0, random_state=22)
param_grid_cat = {
    'iterations': [1000],
    'learning_rate': [0.05],
    'depth': [4]
}
grid_search_cat = GridSearchCV(estimator=cat, param_grid=param_grid_cat, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_cat.fit(X_train, y_train)
print("Best Parameters (Cat):", grid_search_cat.best_params_)
accuracy_cat_train = grid_search_cat.best_estimator_.score(X_train, y_train)
print("Accuracy on train set (Cat):", accuracy_cat_train)
accuracy_cat_test = grid_search_cat.best_estimator_.score(X_test, y_test)
print("Accuracy on test set (Cat):", accuracy_cat_test)
# 4. Voting
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Oversample both classes to 5750 samples each
smote = SMOTE(sampling_strategy={0: 5750, 1: 5750}, random_state=22)
X_resampled, y_resampled = smote.fit_resample(X, y)
print(X_resampled.shape)
# Note: this split uses the original X, y; the SMOTE-resampled data is only
# used for the self-training step below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)
# Soft-voting ensemble over the four tuned classifiers, weighted toward
# XGBoost and CatBoost
ensemble_model = VotingClassifier(estimators=[
    ('xgb', grid_search_xgb.best_estimator_),
    ('rf', grid_search_rf.best_estimator_),
    ('lgrg', grid_search_lgrg.best_estimator_),
    ('cat', grid_search_cat.best_estimator_)
], voting='soft', weights=[4, 2, 2, 4])
ensemble_model.fit(X_train, y_train)
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('accuracy_ensemble:', accuracy)
# 5. Self-training
from sklearn.semi_supervised import SelfTrainingClassifier

self_training_clf = SelfTrainingClassifier(ensemble_model, criterion='k_best', k_best=500, max_iter=8)
self_training_clf.fit(X_resampled, y_resampled)
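# Note (assumption): SelfTrainingClassifier only performs pseudo-labeling when
# y contains unlabeled samples marked as -1; on the fully labeled y_resampled
# it just fits the base ensemble once. A minimal sketch of how the unlabeled
# competition test set could be included instead (X_unlabeled is hypothetical
# and assumed to be preprocessed identically to X):
import numpy as np

X_combined = np.vstack([X_resampled, X_unlabeled])
y_combined = np.concatenate([y_resampled, np.full(len(X_unlabeled), -1)])
self_training_clf.fit(X_combined, y_combined)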