Created
April 23, 2012 12:37
-
-
Save jaquesgrobler/2470665 to your computer and use it in GitHub Desktop.
scale_C compare
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
=========================================================
Comparing scalings of C for LinearSVC
=========================================================

Cross-validates ``LinearSVC`` over a grid of ``C`` values on two synthetic
datasets: an L1-penalised model on data with only 5 informative features,
and an L2-penalised model on data where every feature carries weight.

The grid search is repeated for several training-set fractions.  For each
classifier one figure is drawn with three subplots showing the CV score
against ``C`` under three rescalings of ``C`` ("No scaling",
"1/sqrt(n_samples)" and "1/n_samples"); a vertical line marks the
best-scoring ``C`` of every curve.
"""
# Parenthesised form prints the same text on Python 2 and Python 3.
print(__doc__)
# Author: Andreas Mueller <[email protected]> | |
# Jaques Grobler <[email protected]> | |
# License: BSD | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.svm import LinearSVC, SVC | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.cross_validation import ShuffleSplit | |
from sklearn.grid_search import GridSearchCV | |
from sklearn.utils import check_random_state | |
from sklearn import datasets | |
from time import time | |
if __name__ == '__main__':
    # Compare how the best cross-validated C moves with the training-set
    # size under three rescalings of C, for an L1- and an L2-penalised
    # LinearSVC.
    # NOTE(review): relies on the legacy scikit-learn API this file already
    # uses (scale_C, train_fraction, n_iterations, grid_scores_) -- confirm
    # against the installed version.
    t0 = time()
    rnd = check_random_state(1)

    # set up dataset
    n_samples = 100
    n_features = 1000

    # L1 data (only 5 informative features)
    X_1, y_1 = datasets.make_classification(n_samples=n_samples,
                                            n_features=n_features,
                                            n_informative=5,
                                            random_state=1)

    # L2 data: every feature gets the same weight; the labels are the sign
    # of the mean-centred linear response after adding noise.
    X_2 = 1 + rnd.randn(n_samples, n_features)
    coef = np.ones(n_features) * 10
    y_2 = np.dot(X_2, coef)
    y_2 += .1 * rnd.randn(n_samples) * np.std(y_2)
    y_2 = np.sign(y_2 - np.mean(y_2))

    # (classifier, grid of C values, data, labels) -- one figure each
    clf_sets = [(LinearSVC(penalty='L1', loss='L2', dual=False,
                           scale_C=False, tol=1e-7, intercept_scaling=100),
                 np.logspace(-3, -1, 30), X_1, y_1),
                (LinearSVC(penalty='L2', loss='L1', dual=True, scale_C=False,
                           tol=1e-7, intercept_scaling=100),
                 np.logspace(-4, -2.5, 20), X_2, y_2)]

    # Explicit fractions, largest first, instead of
    # np.arange(0.3, 0.6, 0.1)[::-1]: arange with a float step accumulates
    # rounding error and its length depends on how the end point rounds.
    train_fractions = (0.5, 0.4, 0.3)

    # One colour per training fraction, shared by the score curve and the
    # vertical line marking its best C.
    colors = ['b', 'g', 'r', 'c']

    for fignum, (clf, cs, X, y) in enumerate(clf_sets):
        # set up the figure for each classifier
        plt.figure(fignum, figsize=(9, 10))
        plt.clf()  # must be called; a bare `plt.clf` clears nothing

        for k, train_fraction in enumerate(train_fractions):
            n_train = n_samples * train_fraction

            cv = ShuffleSplit(n=n_samples, train_fraction=train_fraction,
                              n_iterations=50, random_state=1)
            grid = GridSearchCV(clf, refit=False, param_grid={'C': cs},
                                cv=cv)
            grid.fit(X, y)
            # Second field of every grid_scores_ entry; presumably the mean
            # CV score for that C -- TODO confirm for the sklearn in use.
            scores = [entry[1] for entry in grid.grid_scores_]
            best = np.argmax(scores)

            # (factor the C grid is divided by, label used in the title)
            scales = [(1, 'No scaling'),
                      (1. / np.sqrt(n_train), '1/sqrt(n_samples)'),
                      (1. / n_train, '1/n_samples'),
                      ]

            for subplotnum, (scaler, name) in enumerate(scales):
                plt.subplot(3, 1, subplotnum + 1)
                # Label the subplot itself: labels set before any subplot
                # exists end up on a throw-away full-figure axes.
                plt.xlabel('C')
                plt.ylabel('CV Score')
                grid_cs = cs / float(scaler)  # scale the C's
                plt.semilogx(grid_cs, scores, color=colors[k],
                             label="fraction %.2f" % train_fraction)
                plt.title('scaling=%s, penalty=%s, loss=%s'
                          % (name, clf.penalty, clf.loss))
                ymin, ymax = plt.ylim()
                # full-height marker at the best-scoring C of this curve
                plt.axvline(grid_cs[best], 0, 1, color=colors[k])
                # adjust the y-axis: lower the bottom a little, keep the top
                # (positional limits work on old and new matplotlib alike)
                plt.ylim(ymin - 0.0025, ymax)

        plt.legend(loc="lower left")

    # Single formatted string: same text as the Python 2 print statement,
    # and valid on Python 3 as well.
    print('time taken %s  seconds' % (time() - t0))
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment