Some useful ML processing functions
############ ExtraTreesRegressor, TimeSeriesSplit ############
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Model selection
etr = ExtraTreesRegressor(n_estimators=100, n_jobs=-1)

# Cross-validate with TimeSeriesSplit so the folds respect temporal order
tscv = TimeSeriesSplit(n_splits=5)

results = []
for train_index, test_index in tscv.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    etr.fit(X_train, y_train)
    y_train_pred = etr.predict(X_train)
    y_test_pred = etr.predict(X_test)

    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    results.append(rmse_test)  # keep the test-fold RMSE for later inspection

    print('RMSE train: %.3f, test: %.3f' % (rmse_train, rmse_test))
    print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))
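# --- follow-up sketch (assumption, not from the original gist) ---------------
# With the per-fold test RMSEs collected in `results`, the cross-validated
# performance can be summarised in one line:
print('Mean CV test RMSE: %.3f (+/- %.3f)' % (np.mean(results), np.std(results)))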
def rank_features(model, features):
    """
    Ranks the importance of the features used by a fitted tree-ensemble model
    and plots them as a bar graph.
    """
    # calcs
    importances = model.feature_importances_
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]  # std/indices kept for e.g. error bars

    # rank feature importances
    print(sorted(zip(map(lambda x: "{0:.3f}".format(x), importances), features),
                 reverse=True))

    # plot feature importances
    fig, ax = plt.subplots(figsize=[10, 6])
    coef = pd.Series(importances, index=features).sort_values(ascending=False)
    coef.head(len(features)).plot(kind='bar', ax=ax)
    plt.title('Feature Significance')
    plt.tight_layout()
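# --- usage sketch (assumption, not from the original gist) -------------------
# rank_features expects a fitted tree ensemble and the feature names in the
# same order as the columns of X. The names below are only placeholders:
feature_cols = ['feat_%d' % i for i in range(X.shape[1])]
rank_features(etr, feature_cols)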
def measure_model(model):
    """
    Calculates the prediction errors on the train, validation and test sets
    (expected as X_train/y_train, X_validate/y_validate, X_test/y_test in the
    enclosing scope) and plots error histograms.
    """
    er = 100  # histogram range: errors are plotted between -er and +er

    print("Running on train set")
    error_train = model.predict(X_train) - y_train
    rmse_train = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))

    print("Running on validation set")
    error_validate = model.predict(X_validate) - y_validate
    rmse_val = np.sqrt(mean_squared_error(y_validate, model.predict(X_validate)))

    print("Running on test set")
    error_test = model.predict(X_test) - y_test
    rmse_test = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

    fig, ax = plt.subplots(1, 3, figsize=[15, 5])
    p = pd.Series(error_train).hist(bins=31, range=[-er, er], ax=ax[0])
    p.set(xlabel='Errors on train set', ylabel='Frequency', title='rmse: {:.3f}'.format(rmse_train))
    p = pd.Series(error_validate).hist(bins=31, range=[-er, er], ax=ax[1])
    p.set(xlabel='Errors on validate set', title='rmse: {:.3f}'.format(rmse_val))
    p = pd.Series(error_test).hist(bins=31, range=[-er, er], ax=ax[2])
    p.set(xlabel='Errors on test set', title='rmse: {:.3f}'.format(rmse_test))
    plt.tight_layout()
    return p
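# --- usage sketch (assumption, not from the original gist) -------------------
# measure_model reads X_train/y_train, X_validate/y_validate and X_test/y_test
# from the enclosing scope, so chronological splits must exist before calling
# it. A hypothetical 60/20/20 split of the time-ordered data:
n = len(X)
X_train, X_validate, X_test = X[:int(0.6 * n)], X[int(0.6 * n):int(0.8 * n)], X[int(0.8 * n):]
y_train, y_validate, y_test = y[:int(0.6 * n)], y[int(0.6 * n):int(0.8 * n)], y[int(0.8 * n):]
etr.fit(X_train, y_train)
measure_model(etr)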
def run_gridsearch(X, y, model, param_grid, cv=5):
    """Run a grid search for the best parameters.
    input
    ----
    X          -- features training set
    y          -- targets training set
    model      -- e.g. ExtraTreesRegressor()
    param_grid -- [dict] parameter settings to test
    cv         -- cross-validation folds (or splitter), default 5
    Returns
    -------
    top_params -- [dict] from grid_search_report()
    """
    grid_search = GridSearchCV(model,
                               param_grid=param_grid,
                               cv=cv)
    grid_search.fit(X, y)

    print('Best parameters found: \n{}'.format(grid_search.best_params_))
    print(("\nGridSearchCV evaluated "
           "{:d} candidate "
           "parameter settings.").format(len(grid_search.cv_results_['params'])))

    top_params = grid_search_report(grid_search.cv_results_, 3)
    return top_params, grid_search
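# --- usage sketch (assumption, not from the original gist) -------------------
# A hypothetical parameter grid for ExtraTreesRegressor; the values are purely
# illustrative, not tuned recommendations. The call itself is shown further
# down, after grid_search_report() has been defined.
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [None, 5, 10],
              'min_samples_leaf': [1, 3, 5]}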
# report best scores
def grid_search_report(results, n_top=3):
    """Report the top n_top parameter settings, default n_top=3.
    input
    ----
    results -- cv_results_ from GridSearchCV or RandomizedSearchCV
    n_top   -- number of top-ranked models to report
    Returns
    -------
    top_params -- [dict] parameters of the best-ranked model
    """
    top_params = None
    for i in range(1, n_top + 1):
        best_models = np.flatnonzero(results['rank_test_score'] == i)
        for x in best_models:
            print("Model rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][x],
                results['std_test_score'][x]))
            print("Parameters: {0}".format(results['params'][x]))
            print("")
            if top_params is None:
                top_params = results['params'][x]
    return top_params