Some useful ML processing functions
############ ExtraTreesRegressor, TimeSeriesSplit ############
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Model selection
etr = ExtraTreesRegressor(n_estimators=100, n_jobs=-1)

# Cross-validate with TimeSeriesSplit so the folds respect temporal order
tscv = TimeSeriesSplit(n_splits=5)

results = []
for train_index, test_index in tscv.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    etr.fit(X_train, y_train)
    y_train_pred = etr.predict(X_train)
    y_test_pred = etr.predict(X_test)

    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    results.append(rmse_test)  # keep the test-fold RMSE for later inspection

    print('RMSE train: %.3f, test: %.3f' % (rmse_train, rmse_test))
    print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))
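# --- follow-up sketch (assumption, not from the original gist) ---------------
# With the per-fold test RMSEs collected in `results`, the cross-validated
# performance can be summarised in one line:
print('Mean CV test RMSE: %.3f (+/- %.3f)' % (np.mean(results), np.std(results)))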
def rank_features(model, features):
    """
    Ranks the importance of the features used by a fitted tree-ensemble model
    and plots them as a bar graph.
    """
    # calcs
    importances = model.feature_importances_
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]  # std/indices kept for e.g. error bars

    # rank feature importances
    print(sorted(zip(map(lambda x: "{0:.3f}".format(x), importances), features),
                 reverse=True))

    # plot feature importances
    fig, ax = plt.subplots(figsize=[10, 6])
    coef = pd.Series(importances, index=features).sort_values(ascending=False)
    coef.head(len(features)).plot(kind='bar', ax=ax)
    plt.title('Feature Significance')
    plt.tight_layout()
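# --- usage sketch (assumption, not from the original gist) -------------------
# rank_features expects a fitted tree ensemble and the feature names in the
# same order as the columns of X. The names below are only placeholders:
feature_cols = ['feat_%d' % i for i in range(X.shape[1])]
rank_features(etr, feature_cols)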
def measure_model(model):
    """
    Calculates the prediction errors on the train, validation and test sets
    (expected as X_train/y_train, X_validate/y_validate, X_test/y_test in the
    enclosing scope) and plots error histograms.
    """
    er = 100  # histogram range: errors are plotted between -er and +er

    print("Running on train set")
    error_train = model.predict(X_train) - y_train
    rmse_train = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))

    print("Running on validation set")
    error_validate = model.predict(X_validate) - y_validate
    rmse_val = np.sqrt(mean_squared_error(y_validate, model.predict(X_validate)))

    print("Running on test set")
    error_test = model.predict(X_test) - y_test
    rmse_test = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

    fig, ax = plt.subplots(1, 3, figsize=[15, 5])
    p = pd.Series(error_train).hist(bins=31, range=[-er, er], ax=ax[0])
    p.set(xlabel='Errors on train set', ylabel='Frequency', title='rmse: {:.3f}'.format(rmse_train))
    p = pd.Series(error_validate).hist(bins=31, range=[-er, er], ax=ax[1])
    p.set(xlabel='Errors on validate set', title='rmse: {:.3f}'.format(rmse_val))
    p = pd.Series(error_test).hist(bins=31, range=[-er, er], ax=ax[2])
    p.set(xlabel='Errors on test set', title='rmse: {:.3f}'.format(rmse_test))
    plt.tight_layout()
    return p
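# --- usage sketch (assumption, not from the original gist) -------------------
# measure_model reads X_train/y_train, X_validate/y_validate and X_test/y_test
# from the enclosing scope, so chronological splits must exist before calling
# it. A hypothetical 60/20/20 split of the time-ordered data:
n = len(X)
X_train, X_validate, X_test = X[:int(0.6 * n)], X[int(0.6 * n):int(0.8 * n)], X[int(0.8 * n):]
y_train, y_validate, y_test = y[:int(0.6 * n)], y[int(0.6 * n):int(0.8 * n)], y[int(0.8 * n):]
etr.fit(X_train, y_train)
measure_model(etr)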
def run_gridsearch(X, y, model, param_grid, cv=5):
    """Run a grid search for the best parameters.
    input
    ----
    X          -- features training set
    y          -- targets training set
    model      -- e.g. ExtraTreesRegressor()
    param_grid -- [dict] parameter settings to test
    cv         -- cross-validation folds (or splitter), default 5
    Returns
    -------
    top_params -- [dict] from grid_search_report()
    """
    grid_search = GridSearchCV(model,
                               param_grid=param_grid,
                               cv=cv)
    grid_search.fit(X, y)

    print('Best parameters found: \n{}'.format(grid_search.best_params_))
    print(("\nGridSearchCV evaluated "
           "{:d} candidate "
           "parameter settings.").format(len(grid_search.cv_results_['params'])))

    top_params = grid_search_report(grid_search.cv_results_, 3)
    return top_params, grid_search
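# --- usage sketch (assumption, not from the original gist) -------------------
# A hypothetical parameter grid for ExtraTreesRegressor; the values are purely
# illustrative, not tuned recommendations. The call itself is shown further
# down, after grid_search_report() has been defined.
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [None, 5, 10],
              'min_samples_leaf': [1, 3, 5]}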
# report best scores
def grid_search_report(results, n_top=3):
    """Report the top n_top parameter settings, default n_top=3.
    input
    ----
    results -- cv_results_ from GridSearchCV or RandomizedSearchCV
    n_top   -- number of top-ranked models to report
    Returns
    -------
    top_params -- [dict] parameters of the best-ranked model
    """
    top_params = None
    for i in range(1, n_top + 1):
        best_models = np.flatnonzero(results['rank_test_score'] == i)
        for x in best_models:
            print("Model rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][x],
                results['std_test_score'][x]))
            print("Parameters: {0}".format(results['params'][x]))
            print("")
            if top_params is None:
                top_params = results['params'][x]
    return top_params