# Some useful ML processing functions
############ ExtraTreesRegressor, TimeSeriesSplit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# Model selection
etr = ExtraTreesRegressor(n_estimators=100, n_jobs=-1)

# Fit and predict on each TimeSeriesSplit fold (assumes X and y are numpy arrays)
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    etr.fit(X_train, y_train)
    y_train_pred = etr.predict(X_train)
    y_test_pred = etr.predict(X_test)
    # np.sqrt(mean_squared_error) is the root mean squared error (RMSE)
    print('RMSE train: %.3f, test: %.3f' % (
        np.sqrt(mean_squared_error(y_train, y_train_pred)),
        np.sqrt(mean_squared_error(y_test, y_test_pred))))
    print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))
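############ Example data setup (a hedged sketch, not part of the original gist)
# The snippet above assumes X and y already exist as numpy arrays, and the helpers
# further down assume a `feature_cols` list of column names. A minimal synthetic
# setup for trying the snippets out could look like this:
rng = np.random.RandomState(42)
n_samples, n_features = 500, 6
feature_cols = ['feat_{}'.format(i) for i in range(n_features)]
X = rng.rand(n_samples, n_features)
y = X.dot(rng.rand(n_features)) + 0.1 * rng.randn(n_samples)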
def rank_features(model, features):
    """
    Ranks the importance of the features used by a fitted tree ensemble
    and plots them as a bar graph.
    """
    # importances and their spread across the individual trees
    importances = model.feature_importances_
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    # rank feature importances
    print(sorted(zip(map(lambda x: "{0:.3f}".format(x), importances), features),
                 reverse=True))
    # plot feature importances
    fig, ax = plt.subplots(figsize=[10, 6])
    coef = pd.Series(importances, index=features).sort_values(ascending=False)
    coef.plot(kind='bar', ax=ax)
    plt.title('Feature Significance')
    plt.tight_layout()
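############ Example call for rank_features (assumed, not from the original gist)
# Uses the ExtraTreesRegressor `etr` fitted in the TimeSeriesSplit loop above and
# the `feature_cols` names from the synthetic setup; any fitted tree ensemble works.
rank_features(etr, feature_cols)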
def measure_model(model):
    """
    Calculates the prediction errors on the train, validation and test sets and
    plots them as histograms (expects X_train/y_train, X_validate/y_validate and
    X_test/y_test to be available in scope).
    """
    er = 100  # half-width of the histogram range
    print("Running on train set")
    error_train = model.predict(X_train) - y_train
    rmse_train = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    print("Running on validation set")
    error_validate = model.predict(X_validate) - y_validate
    rmse_val = np.sqrt(mean_squared_error(y_validate, model.predict(X_validate)))
    print("Running on test set")
    error_test = model.predict(X_test) - y_test
    rmse_test = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

    fig, ax = plt.subplots(1, 3, figsize=[15, 5])
    p = pd.Series(error_train).hist(bins=31, range=[-er, er], ax=ax[0])
    p.set(xlabel='Errors on train set', ylabel='Frequency', title='rmse: {:.3f}'.format(rmse_train))
    p = pd.Series(error_validate).hist(bins=31, range=[-er, er], ax=ax[1])
    p.set(xlabel='Errors on validate set', title='rmse: {:.3f}'.format(rmse_val))
    p = pd.Series(error_test).hist(bins=31, range=[-er, er], ax=ax[2])
    p.set(xlabel='Errors on test set', title='rmse: {:.3f}'.format(rmse_test))
    plt.tight_layout()
    return p
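############ Example call for measure_model (a hedged sketch)
# measure_model expects X_train/y_train, X_validate/y_validate and X_test/y_test as
# globals. Here the synthetic X, y from above are split chronologically 60/20/20;
# the split points are arbitrary assumptions for illustration.
n = len(X)
i_val, i_test = int(0.6 * n), int(0.8 * n)
X_train, y_train = X[:i_val], y[:i_val]
X_validate, y_validate = X[i_val:i_test], y[i_val:i_test]
X_test, y_test = X[i_test:], y[i_test:]
model = ExtraTreesRegressor(n_estimators=100, n_jobs=-1).fit(X_train, y_train)
measure_model(model)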
def run_gridsearch(X, y, model, param_grid, cv):
    """Run a grid search for the best parameters.
    input
    ----
    X -- features training set
    y -- targets training set
    model -- e.g. ExtraTreesRegressor()
    param_grid -- [dict] parameter settings to test
    cv -- number of cross-validation folds, or a CV splitter
    Returns
    -------
    top_params -- [dict] from grid_search_report()
    grid_search -- the fitted GridSearchCV object
    """
    grid_search = GridSearchCV(model,
                               param_grid=param_grid,
                               cv=cv)
    grid_search.fit(X, y)
    print('Fitted model with following parameters: \n{}'.format(grid_search.get_params()))
    print(("\nGridSearchCV evaluated "
           "{:d} candidate "
           "parameter settings.").format(len(grid_search.cv_results_['params'])))
    top_params = grid_search_report(grid_search.cv_results_, 3)
    return top_params, grid_search
# report best scores
def grid_search_report(results, n_top=3):
    """Report the top n_top parameter settings, default n_top=3.
    input
    ----
    results -- cv_results_ from GridSearchCV or RandomizedSearchCV
    n_top -- number of top models to report
    Returns
    -------
    top_params -- [dict] parameters of the best-ranked model found in the search
    """
    top_params = None
    for i in range(1, n_top + 1):
        best_models = np.flatnonzero(results['rank_test_score'] == i)
        for x in best_models:
            print("Model rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][x],
                results['std_test_score'][x]))
            print("Parameters: {0}".format(results['params'][x]))
            print("")
            if top_params is None:
                top_params = results['params'][x]
    return top_params