@yvki
Created April 15, 2024 07:53
Predicting Spaceship Titanic πŸš€ survival rates with different GridSearch classifier types βš™οΈ
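# Assumes the feature matrix X, target y, and the train/test splits
# (X_train, X_test, y_train, y_test) were prepared earlier in the notebook
# from the preprocessed Spaceship Titanic data (this setup is not shown in the gist)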
# 1. Extreme Gradient Boosting (comparable to LightGBM)
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgb = XGBClassifier(random_state=22)
param_grid_xgb = {
    'n_estimators': [300],
    'max_depth': [4],
    'learning_rate': [0.1],
    'subsample': [1.0],
    'colsample_bytree': [0.8]
}
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)
print("Best Parameters (XGB):", grid_search_xgb.best_params_)
accuracy_xgb_train = grid_search_xgb.best_estimator_.score(X_train, y_train)
print("Accuracy on train set (XGB):", accuracy_xgb_train)
accuracy_xgb_test = grid_search_xgb.best_estimator_.score(X_test, y_test)
print("Accuracy on test set (XGB):", accuracy_xgb_test)
# 2. Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=22)
param_grid_rf = {
    'n_estimators': [700],
    'max_depth': [12],
    'min_samples_split': [2],
    'min_samples_leaf': [6]
}
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)
print("Best Parameters (RF):", grid_search_rf.best_params_)
accuracy_rf_train = grid_search_rf.best_estimator_.score(X_train, y_train)
print("Accuracy on train set (RF):", accuracy_rf_train)
accuracy_rf_test = grid_search_rf.best_estimator_.score(X_test, y_test)
print("Accuracy on test set (RF):", accuracy_rf_test)
# 3. Categorical Boosting
from catboost import CatBoostClassifier

# CatBoost is strict about label types, so cast the boolean target to int
y_test = y_test.astype(int)
y_train = y_train.astype(int)

cat = CatBoostClassifier(verbose=0, random_state=22)
param_grid_cat = {
    'iterations': [1000],
    'learning_rate': [0.05],
    'depth': [4]
}
grid_search_cat = GridSearchCV(estimator=cat, param_grid=param_grid_cat, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_cat.fit(X_train, y_train)
print("Best Parameters (Cat):", grid_search_cat.best_params_)
accuracy_cat_train = grid_search_cat.best_estimator_.score(X_train, y_train)
print("Accuracy on train set (Cat):", accuracy_cat_train)
accuracy_cat_test = grid_search_cat.best_estimator_.score(X_test, y_test)
print("Accuracy on test set (Cat):", accuracy_cat_test)
# 4. Voting
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Oversample both classes to 5750 samples each; note the resampled data is
# only used for the self-training step (5), while the split below is taken
# from the original X, y
smote = SMOTE(sampling_strategy={0: 5750, 1: 5750}, random_state=22)
X_resampled, y_resampled = smote.fit_resample(X, y)
print(X_resampled.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Soft voting averages predicted probabilities, weighting the boosted models
# (XGB, CatBoost) twice as heavily as RF and logistic regression
ensemble_model = VotingClassifier(estimators=[
    ('xgb', grid_search_xgb.best_estimator_),
    ('rf', grid_search_rf.best_estimator_),
    ('lgrg', grid_search_lgrg.best_estimator_),
    ('cat', grid_search_cat.best_estimator_)
], voting='soft', weights=[4, 2, 2, 4])
ensemble_model.fit(X_train, y_train)
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('accuracy_ensemble:', accuracy)
# 5. Self-training
from sklearn.semi_supervised import SelfTrainingClassifier
# Wrap the soft-voting ensemble (which exposes predict_proba) as the base
# estimator, adding the 500 most confident pseudo-labels per iteration.
# Note: only samples labelled -1 are pseudo-labelled, so on fully labelled
# data this reduces to plain supervised fitting
self_training_clf = SelfTrainingClassifier(ensemble_model, criterion='k_best', k_best=500, max_iter=8)
self_training_clf.fit(X_resampled, y_resampled)
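# A quick side-by-side of the test accuracies collected above (a convenience
# sketch using only variables already defined; not part of the original gist):
for name, acc in [('XGB', accuracy_xgb_test), ('RF', accuracy_rf_test),
                  ('Cat', accuracy_cat_test), ('Ensemble', accuracy)]:
    print(f"{name:>8}: {acc:.4f}")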