agramfort · February 8, 2016 15:56
diff --git a/miss_val_bench.py b/miss_val_bench.py
 import numpy as np

 from sklearn.datasets import fetch_covtype
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import StratifiedShuffleSplit
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import Imputer
 from sklearn.model_selection import cross_val_score

 # Benchmark: score a RandomForestClassifier built with missing_values='NaN'
 # against an Imputer(median) + RandomForestClassifier pipeline while a
 # growing share of the feature matrix is replaced by NaN.

 # dataset = load_digits()
 # dataset = load_iris()
 dataset = fetch_covtype()
 X, y = dataset.data, dataset.target

 # Take only 2 classes
 # mask = y < 3
 mask = (y == 1) | (y == 2)
 X = X[mask]
 y = y[mask]

 # Subsample: keep every 80th row of the two-class data.
 X, y = X[::80].copy(), y[::80].copy()

 n_samples, n_features = X.shape

 n_estimators = 100

 # A single generator feeds both the CV splitter and the NaN masks below.
 rng = np.random.RandomState(42)

 # NOTE(review): released versions of sklearn.model_selection name this
 # argument n_splits; n_iter is kept as written -- confirm against the
 # scikit-learn build this benchmark is meant to run on.
 cv = StratifiedShuffleSplit(n_iter=1, test_size=0.3, random_state=rng)

 # print(...) with one pre-formatted string behaves the same on Python 2 and 3.
 print("The shape of the dataset is %s" % str(X.shape))
 print("The number of trees for this benchmarking is %s" % n_estimators)

 # Estimate the score on the entire dataset, with no missing values
 estimator = RandomForestClassifier(
     random_state=0, n_estimators=n_estimators, missing_values=None, n_jobs=4)
 score = cross_val_score(estimator, X, y, cv=cv).mean()
 print("Score with the entire dataset = %.2f" % score)

 baseline_score = score

 scores_missing = []
 scores_impute = []

 # Forest that is handed the NaNs directly.
 rf_missing = RandomForestClassifier(
     random_state=0, n_estimators=n_estimators, missing_values='NaN',
     n_jobs=4)
 # Reference: replace NaNs by the per-column median, then fit a forest.
 rf_impute = Pipeline([
     ("imputer", Imputer(missing_values='NaN', strategy="median", axis=0)),
     ("forest", RandomForestClassifier(random_state=0,
                                       n_estimators=n_estimators)),
 ])

 missing_fraction_range = []
 # True marks an observed entry; entries are knocked out cumulatively.
 mask = np.ones(X.shape, dtype=bool)

 for _ in range(7):
     X_missing = X.copy()
     # Knock out the entries whose fresh normal draw is at or below the value
     # ranked int(0.1 * n_samples * n_features) among all draws of this round.
     rv = rng.randn(*X.shape)
     thresh = np.sort(rv.ravel())[int(0.1 * n_samples * n_features)]
     mask &= rv > thresh
     # Rows labelled 1 are reset to fully observed, so only rows labelled 2
     # ever receive NaNs.
     mask[y == 1] = True
     missing_fraction = np.mean(~mask)
     missing_fraction_range.append(missing_fraction)
     X_missing[~mask] = np.nan
     # Both models are fitted and scored on the same train/test partition.
     # NOTE(review): cv is seeded with the shared RandomState instance, so
     # each cv.split call probably yields a different partition (between
     # iterations and versus the baseline) -- confirm this is intended.
     train, test = next(iter(cv.split(X, y)))
     score_missing = rf_missing.fit(
         X_missing[train], y[train]).score(X_missing[test], y[test])
     score_impute = rf_impute.fit(
         X_missing[train], y[train]).score(X_missing[test], y[test])
     scores_missing.append(score_missing)
     scores_impute.append(score_impute)
     # missing_fraction lies in [0, 1]; scale it so the "%" label is correct.
     print("Score RF with the %.1f %% missing = %.2f"
           % (100 * missing_fraction, score_missing))
     print("Score RF+Imp. with the %.1f %% missing = %.2f"
           % (100 * missing_fraction, score_impute))

 # Plot both score curves against the missing fraction, with the
 # no-missing-values baseline as a horizontal line.
 import matplotlib.pyplot as plt
 plt.close('all')
 plt.plot(missing_fraction_range, scores_missing, 'o-', label='RF mv')
 plt.plot(missing_fraction_range, scores_impute, 'o-', label='RF imp.')
 plt.axhline(baseline_score, label='no missing', color='k')
 plt.xlabel('Missing fraction')
 plt.ylabel('Score')
 plt.legend(loc='lower left')
 plt.show()
	import numpy as np

	from sklearn.datasets import fetch_covtype
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import StratifiedShuffleSplit
	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import Imputer
	from sklearn.model_selection import cross_val_score

	# Benchmark: score a RandomForestClassifier built with missing_values='NaN'
	# against an Imputer(median) + RandomForestClassifier pipeline while a
	# growing share of the feature matrix is replaced by NaN.

	# dataset = load_digits()
	# dataset = load_iris()
	dataset = fetch_covtype()
	X, y = dataset.data, dataset.target

	# Take only 2 classes
	# mask = y < 3
	mask = (y == 1) | (y == 2)
	X = X[mask]
	y = y[mask]

	# Subsample: keep every 80th row of the two-class data.
	X, y = X[::80].copy(), y[::80].copy()

	n_samples, n_features = X.shape

	n_estimators = 100

	# A single generator feeds both the CV splitter and the NaN masks below.
	rng = np.random.RandomState(42)

	# NOTE(review): released versions of sklearn.model_selection name this
	# argument n_splits; n_iter is kept as written -- confirm against the
	# scikit-learn build this benchmark is meant to run on.
	cv = StratifiedShuffleSplit(n_iter=1, test_size=0.3, random_state=rng)

	# print(...) with one pre-formatted string behaves the same on Python 2 and 3.
	print("The shape of the dataset is %s" % str(X.shape))
	print("The number of trees for this benchmarking is %s" % n_estimators)

	# Estimate the score on the entire dataset, with no missing values
	estimator = RandomForestClassifier(
	    random_state=0, n_estimators=n_estimators, missing_values=None, n_jobs=4)
	score = cross_val_score(estimator, X, y, cv=cv).mean()
	print("Score with the entire dataset = %.2f" % score)

	baseline_score = score

	scores_missing = []
	scores_impute = []

	# Forest that is handed the NaNs directly.
	rf_missing = RandomForestClassifier(
	    random_state=0, n_estimators=n_estimators, missing_values='NaN',
	    n_jobs=4)
	# Reference: replace NaNs by the per-column median, then fit a forest.
	rf_impute = Pipeline([
	    ("imputer", Imputer(missing_values='NaN', strategy="median", axis=0)),
	    ("forest", RandomForestClassifier(random_state=0,
	                                      n_estimators=n_estimators)),
	])

	missing_fraction_range = []
	# True marks an observed entry; entries are knocked out cumulatively.
	mask = np.ones(X.shape, dtype=bool)

	for _ in range(7):
	    X_missing = X.copy()
	    # Knock out the entries whose fresh normal draw is at or below the value
	    # ranked int(0.1 * n_samples * n_features) among all draws of this round.
	    rv = rng.randn(*X.shape)
	    thresh = np.sort(rv.ravel())[int(0.1 * n_samples * n_features)]
	    mask &= rv > thresh
	    # Rows labelled 1 are reset to fully observed, so only rows labelled 2
	    # ever receive NaNs.
	    mask[y == 1] = True
	    missing_fraction = np.mean(~mask)
	    missing_fraction_range.append(missing_fraction)
	    X_missing[~mask] = np.nan
	    # Both models are fitted and scored on the same train/test partition.
	    # NOTE(review): cv is seeded with the shared RandomState instance, so
	    # each cv.split call probably yields a different partition (between
	    # iterations and versus the baseline) -- confirm this is intended.
	    train, test = next(iter(cv.split(X, y)))
	    score_missing = rf_missing.fit(
	        X_missing[train], y[train]).score(X_missing[test], y[test])
	    score_impute = rf_impute.fit(
	        X_missing[train], y[train]).score(X_missing[test], y[test])
	    scores_missing.append(score_missing)
	    scores_impute.append(score_impute)
	    # missing_fraction lies in [0, 1]; scale it so the "%" label is correct.
	    print("Score RF with the %.1f %% missing = %.2f"
	          % (100 * missing_fraction, score_missing))
	    print("Score RF+Imp. with the %.1f %% missing = %.2f"
	          % (100 * missing_fraction, score_impute))

	# Plot both score curves against the missing fraction, with the
	# no-missing-values baseline as a horizontal line.
	import matplotlib.pyplot as plt
	plt.close('all')
	plt.plot(missing_fraction_range, scores_missing, 'o-', label='RF mv')
	plt.plot(missing_fraction_range, scores_impute, 'o-', label='RF imp.')
	plt.axhline(baseline_score, label='no missing', color='k')
	plt.xlabel('Missing fraction')
	plt.ylabel('Score')
	plt.legend(loc='lower left')
	plt.show()