Warning: Declaration of action_plugin_tablewidth::register(&$controller) should be compatible with DokuWiki_Action_Plugin::register(Doku_Event_Handler $controller) in /s/bach/b/class/cs545/public_html/fall15/lib/plugins/tablewidth/action.php on line 93
===== model selection and cross validation in scikit-learn =====
First let's import some modules and read in some data:
In [1]: import numpy as np
In [2]: from sklearn import cross_validation
In [3]: from sklearn import svm
In [4]: from sklearn import metrics
In [5]: data=np.genfromtxt("../data/heart_scale.data", delimiter=",")
In [6]: X=data[:,1:]
In [7]: y=data[:,0]
The simplest form of model evaluation uses a validation/test set:
In [9]: X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [10]: classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
In [11]: classifier.score(X_test, y_test)
Out[11]: 0.7592592592592593
Next, let's perform cross-validation:
In [18]: cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy')
Out[18]: array([ 0.7962963 , 0.83333333, 0.88888889, 0.83333333, 0.83333333])
In [19]:
In [19]: # you can also score with other metrics, such as area under the ROC curve:
In [20]: cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc')
Out[20]: array([ 0.89166667, 0.89166667, 0.95833333, 0.87638889, 0.91388889])
In [21]:
In [21]: # you can also obtain the predictions by cross-validation and then compute the accuracy:
In [22]: y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5)
In [23]: metrics.accuracy_score(y, y_predict)
Out[23]: 0.83703703703703702
Here's an alternative way of doing cross-validation.
In [25]: # first divide the data into folds:
In [26]: cv = cross_validation.StratifiedKFold(y, 5)
In [27]: # now use these folds:
In [28]: print cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc')
[ 0.89166667 0.89166667 0.95833333 0.87638889 0.91388889]
In [29]: # you can see how examples were divided into folds by looking at the test_folds attribute:
In [30]: print cv.test_folds
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4]
In [31]: # hmm... perhaps we should shuffle things a bit...
In [32]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
In [33]: print cv.test_folds
[0 1 1 2 0 1 4 3 4 3 2 0 2 3 2 3 2 0 4 1 1 3 4 1 1 4 1 4 4 2 2 3 0 2 3 1 4
0 3 2 0 2 0 1 3 2 0 0 2 3 0 4 2 0 4 3 4 1 1 0 3 2 4 3 2 3 1 1 1 1 4 3 1 1
4 2 2 3 3 1 4 2 1 0 2 1 0 2 4 1 0 3 2 3 1 2 2 1 1 0 4 1 3 0 1 1 3 3 0 3 3
4 2 0 2 0 2 4 0 1 0 4 4 1 1 0 4 0 1 4 4 3 1 3 3 2 4 3 4 2 4 3 4 1 4 2 0 3
3 3 3 0 0 0 4 3 4 2 3 0 1 1 0 0 4 0 4 1 4 0 0 0 0 3 3 0 4 4 2 0 3 3 0 1 2
2 2 3 2 1 3 4 4 4 1 1 4 2 1 0 3 1 2 0 0 0 0 2 3 4 3 2 0 0 4 1 3 2 2 0 1 2
4 2 4 0 2 1 1 0 4 4 1 4 4 3 4 2 3 3 1 4 2 1 4 1 3 2 1 3 2 1 3 1 3 0 2 2 0
4 4 2 2 4 3 3 0 2 0 2]
In [34]: # if you run division into folds multiple times you will get a different answer:
In [35]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
In [36]: print cv.test_folds
[3 0 2 2 0 2 2 4 1 4 0 2 3 4 2 0 4 0 3 3 4 0 2 0 4 4 0 1 4 4 3 4 1 2 3 3 1
2 1 4 4 4 0 0 4 2 0 0 2 0 1 3 1 0 3 4 0 3 0 4 1 1 2 4 2 0 2 3 1 0 3 0 1 2
3 2 4 0 0 0 1 4 3 2 2 4 3 1 3 2 0 2 0 0 3 2 1 2 4 4 0 0 4 2 1 4 3 0 4 3 4
1 4 0 0 4 2 1 4 4 3 4 1 1 3 0 2 2 3 1 2 3 1 0 4 1 4 1 3 1 3 3 4 4 1 0 0 0
0 4 3 1 2 2 3 0 3 2 4 3 2 2 3 0 3 1 0 4 2 3 0 2 4 3 0 4 3 4 3 3 0 3 1 2 2
1 3 4 1 0 4 3 4 0 0 0 3 2 2 1 3 4 4 2 3 4 3 2 1 3 0 4 0 1 3 1 2 2 2 2 0 3
1 1 1 2 0 1 4 1 1 1 2 2 1 2 3 3 1 4 4 3 4 2 0 2 2 1 1 1 2 0 3 0 2 1 1 3 1
3 1 0 1 3 4 4 2 1 1 1]
In [37]: # if you want to consistently get the same division into folds:
In [38]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)
In [39]: # this sets the seed for the random number generator.
Let's do grid search for the optimal set of parameters:
In [40]: from sklearn.grid_search import GridSearchCV
In [41]: Cs = np.logspace(-2, 3, 6)
In [42]: classifier = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs) )
In [43]: classifier.fit(X, y)
Out[43]:
GridSearchCV(cv=None, error_score='raise',
estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
intercept_scaling=1, loss='squared_hinge', max_iter=1000,
multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
verbose=0),
fit_params={}, iid=True, loss_func=None, n_jobs=1,
param_grid={'C': array([ 1.00000e-02, 1.00000e-01, 1.00000e+00, 1.00000e+01,
1.00000e+02, 1.00000e+03])},
pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
verbose=0)
In [44]:
In [44]: # print the best accuracy, classifier and parameters:
In [45]: print classifier.best_score_
0.844444444444
In [46]: print classifier.best_estimator_
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
intercept_scaling=1, loss='squared_hinge', max_iter=1000,
multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
verbose=0)
In [47]: print classifier.best_params_
{'C': 1.0}
In [48]: # performing nested cross validation:
In [49]: print cross_validation.cross_val_score(classifier, X, y, cv=5)
[ 0.7962963 0.81481481 0.88888889 0.83333333 0.83333333]
In [50]: # if we want to do grid search over multiple parameters:
In [51]: param_grid = [
....: {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
....: {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
....: ]
In [52]: classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)
In [53]: print cross_validation.cross_val_score(classifier, X, y, cv=5)
[ 0.7962963 0.83333333 0.88888889 0.7962963 0.87037037]
And to make things easier for you here's the whole thing without the output:
import numpy as np
from sklearn import cross_validation
from sklearn import svm
from sklearn import metrics
# Model selection and cross-validation walkthrough on the heart dataset.
#
# NOTE: this script targets the scikit-learn API of its time.  The
# sklearn.cross_validation and sklearn.grid_search modules were deprecated in
# scikit-learn 0.18 and removed in 0.20 in favour of sklearn.model_selection.
# Every print below is a single-argument print(...) call, so the output is
# unchanged under Python 2 and the syntax is also valid under Python 3.

# load the data: column 0 holds the labels, the remaining columns the features
data = np.genfromtxt("../data/heart_scale.data", delimiter=",")
X = data[:, 1:]
y = data[:, 0]

# let's train/test an svm on the heart dataset:
# hold out 40% of the examples for testing; random_state fixes the split
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=0)
classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print(classifier.score(X_test, y_test))

# now let's use cross-validation instead:
print(cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy'))

# you can also score with other metrics, such as area under the ROC curve:
print(cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc'))

# you can also obtain the predictions by cross-validation and then compute the accuracy:
y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5)
# print the result explicitly: outside an interactive session a bare
# expression is evaluated and silently discarded
print(metrics.accuracy_score(y, y_predict))

# here's an alternative way of doing cross-validation.
# first divide the data into folds:
cv = cross_validation.StratifiedKFold(y, 5)
# now use these folds:
print(cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc'))
# you can see how examples were divided into folds by looking at the test_folds attribute:
print(cv.test_folds)

# hmm... perhaps we should shuffle things a bit...
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
print(cv.test_folds)

# if you run division into folds multiple times you will get a different answer:
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
print(cv.test_folds)

# if you want to consistently get the same division into folds:
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)
# this sets the seed for the random number generator.

# grid search
# let's perform model selection using grid search
from sklearn.grid_search import GridSearchCV

# candidate values for C: 10^-2, 10^-1, ..., 10^3
Cs = np.logspace(-2, 3, 6)
classifier = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs))
classifier.fit(X, y)

# print the best accuracy, classifier and parameters:
print(classifier.best_score_)
print(classifier.best_estimator_)
print(classifier.best_params_)

# performing nested cross validation:
# the grid search object is itself the estimator being cross-validated
print(cross_validation.cross_val_score(classifier, X, y, cv=5))

# if we want to do grid search over multiple parameters:
# each dict in the list is a separate grid to search
param_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)
print(cross_validation.cross_val_score(classifier, X, y, cv=5))