Skip to content

Instantly share code, notes, and snippets.

@jaquesgrobler
Created April 23, 2012 12:37
Show Gist options
  • Save jaquesgrobler/2470665 to your computer and use it in GitHub Desktop.
Save jaquesgrobler/2470665 to your computer and use it in GitHub Desktop.
scale_C compare
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
=========================================================
Title
=========================================================
Description
"""
# TODO(review): "Title" and "Description" in the docstring are still template
# placeholders. The docstring is echoed to stdout by the statement below, so
# editing it also changes what the script prints.
print __doc__
# Author: Andreas Mueller <[email protected]>
# Jaques Grobler <[email protected]>
# License: BSD
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.utils import check_random_state
from sklearn import datasets
from time import time
if __name__ == '__main__':
    # Plot the cross-validated score of two LinearSVC configurations against
    # C, for several training-set fractions, under three rescalings of C
    # (none, by sqrt of the training-set size, by the training-set size).
    # One figure per classifier, one subplot per rescaling.
    t0 = time()
    rnd = check_random_state(1)

    # set up dataset
    n_samples = 100
    n_features = 1000

    # L1 data (only 5 informative features)
    X_1, y_1 = datasets.make_classification(n_samples=n_samples,
                                            n_features=n_features,
                                            n_informative=5,
                                            random_state=1)

    # L2 data: every feature gets the same weight (10); the labels are the
    # sign of the noisy linear response centred on its mean. The noise has a
    # standard deviation of 10% of the response's standard deviation.
    X_2 = 1 + rnd.randn(n_samples, n_features)
    coef = np.ones(n_features) * 10
    y_2 = np.dot(X_2, coef)
    y_2 += .1 * rnd.randn(n_samples) * np.std(y_2)
    y_2 = np.sign(y_2 - np.mean(y_2))

    # Each entry: (estimator, grid of C values to search, X, y).
    clf_sets = [(LinearSVC(penalty='L1', loss='L2', dual=False,
                           scale_C=False, tol=1e-7, intercept_scaling=100),
                 np.logspace(-3, -1, 30), X_1, y_1),
                (LinearSVC(penalty='L2', loss='L1', dual=True, scale_C=False,
                           tol=1e-7, intercept_scaling=100),
                 np.logspace(-4, -2.5, 20), X_2, y_2)]

    # Colours for the vertical "best C" markers, indexed by training fraction.
    # NOTE(review): presumably chosen to match matplotlib's default line
    # colour cycle so each marker matches its curve -- confirm.
    colors = ['b', 'g', 'r', 'c']

    for fignum, (clf, cs, X, y) in enumerate(clf_sets):
        # set up the plot for each regressor
        plt.figure(fignum, figsize=(9, 10))
        # Fix: the original referenced `plt.clf` without calling it, so the
        # figure was never actually cleared.
        plt.clf()

        # Largest training fraction first.
        for k, train_fraction in enumerate(np.arange(0.3, 0.6, 0.1)[::-1]):
            param_grid = dict(C=cs)
            grid = GridSearchCV(clf, refit=False, param_grid=param_grid,
                                cv=ShuffleSplit(n=n_samples,
                                                train_fraction=train_fraction,
                                                n_iterations=50,
                                                random_state=1))
            grid.fit(X, y)
            # NOTE(review): x[1] is presumably the mean validation score of
            # each grid point -- confirm against the sklearn version in use.
            scores = [x[1] for x in grid.grid_scores_]

            # (divisor applied to C, label used in the subplot title)
            scales = [(1, 'No scaling'),
                      (1. / (np.sqrt(n_samples * train_fraction)),
                       '1/sqrt(n_samples)'),
                      (1. / (n_samples * train_fraction), '1/n_samples'),
                      ]

            for subplotnum, (scaler, name) in enumerate(scales):
                plt.subplot(3, 1, subplotnum + 1)
                # Fix: label the subplot that is actually drawn on. The
                # original set the labels once per figure, before any
                # subplot existed, so they went to an implicit full-figure
                # axes instead of the plotted subplots.
                plt.xlabel('C')
                plt.ylabel('CV Score')
                grid_cs = cs / float(scaler)  # scale the C's
                plt.semilogx(grid_cs, scores, label="fraction %.2f" %
                             train_fraction)
                plt.title('scaling=%s, penalty=%s, loss=%s'
                          % (name, clf.penalty, clf.loss))
                ymin, ymax = plt.ylim()
                # Vertical line at the (scaled) C with the highest score.
                plt.axvline(grid_cs[np.argmax(scores)], 0, 1,
                            color=colors[k])
                plt.ylim(ymin=ymin - 0.0025)  # adjust the y-axis

        # NOTE(review): the nesting was reconstructed from a paste that had
        # lost its indentation; the legend is assumed to be drawn once per
        # figure, which attaches it to the current (last) subplot -- confirm.
        plt.legend(loc="lower left")

    # Single-argument parenthesised form: emits the same text as the original
    # `print 'time taken', ..., ' seconds'` statement and also parses under
    # Python 3.
    print('time taken %s  seconds' % (time() - t0))
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment