===== model selection and cross validation in scikit-learn =====

First let's import some modules and read in some data:

<code python>
In [1]: import numpy as np

In [2]: from sklearn import cross_validation

In [3]: from sklearn import svm

In [4]: from sklearn import metrics

In [5]: data = np.genfromtxt("../data/heart_scale.data", delimiter=",")

In [6]: X = data[:,1:]

In [7]: y = data[:,0]
</code>

The simplest form of model evaluation uses a validation/test set:

<code python>
In [9]: X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)

In [10]: classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)

In [11]: classifier.score(X_test, y_test)
Out[11]: 0.7592592592592593
</code>

Next, let's perform cross-validation:

<code python>
In [18]: cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy')
Out[18]: array([ 0.7962963 ,  0.83333333,  0.88888889,  0.83333333,  0.83333333])

In [19]: # you can obtain accuracy for other metrics, such as area under the roc curve:

In [20]: cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc')
Out[20]: array([ 0.89166667,  0.89166667,  0.95833333,  0.87638889,  0.91388889])

In [21]: # you can also obtain the predictions by cross-validation and then compute the accuracy:

In [22]: y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5)

In [23]: metrics.accuracy_score(y, y_predict)
Out[23]: 0.83703703703703702
</code>

Here's an alternative way of doing cross-validation:

<code python>
In [25]: # first divide the data into folds:

In [26]: cv = cross_validation.StratifiedKFold(y, 5)

In [27]: # now use these folds:

In [28]: print cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc')
[ 0.89166667  0.89166667  0.95833333  0.87638889  0.91388889]

In [29]: # you can see how examples were divided into folds by looking at the test_folds attribute:

In [30]: print cv.test_folds
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]
</code>
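If you want to convince yourself that the folds really are stratified, you can compute the fraction of positive examples in each fold. Here's a minimal sketch (not part of the session above; it assumes the heart_scale labels are +1/-1):

<code python>
# a minimal sketch: check the class balance within each fold
# (assumes y contains +1/-1 labels, as in the heart_scale data)
for k in range(5):
    fold_y = y[cv.test_folds == k]
    print k, len(fold_y), np.mean(fold_y == 1)   # fold, fold size, fraction of positives
</code>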
Hmm... perhaps we should shuffle things a bit:

<code python>
In [32]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)

In [33]: print cv.test_folds
[0 1 1 2 0 1 4 3 4 3 2 0 2 3 2 3 2 0 4 1 1 3 4 1 1 4 1 4 4 2 2 3 0 2 3 1 4 0 3 2 0 2 0 1 3 2 0 0 2 3 0 4 2 0 4 3 4 1 1 0 3 2 4 3 2 3 1 1 1 1 4 3 1 1 4 2 2 3 3 1 4 2 1 0 2 1 0 2 4 1 0 3 2 3 1 2 2 1 1 0 4 1 3 0 1 1 3 3 0 3 3 4 2 0 2 0 2 4 0 1 0 4 4 1 1 0 4 0 1 4 4 3 1 3 3 2 4 3 4 2 4 3 4 1 4 2 0 3 3 3 3 0 0 0 4 3 4 2 3 0 1 1 0 0 4 0 4 1 4 0 0 0 0 3 3 0 4 4 2 0 3 3 0 1 2 2 2 3 2 1 3 4 4 4 1 1 4 2 1 0 3 1 2 0 0 0 0 2 3 4 3 2 0 0 4 1 3 2 2 0 1 2 4 2 4 0 2 1 1 0 4 4 1 4 4 3 4 2 3 3 1 4 2 1 4 1 3 2 1 3 2 1 3 1 3 0 2 2 0 4 4 2 2 4 3 3 0 2 0 2]

In [34]: # if you run the division into folds multiple times you will get a different answer:

In [35]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)

In [36]: print cv.test_folds
[3 0 2 2 0 2 2 4 1 4 0 2 3 4 2 0 4 0 3 3 4 0 2 0 4 4 0 1 4 4 3 4 1 2 3 3 1 2 1 4 4 4 0 0 4 2 0 0 2 0 1 3 1 0 3 4 0 3 0 4 1 1 2 4 2 0 2 3 1 0 3 0 1 2 3 2 4 0 0 0 1 4 3 2 2 4 3 1 3 2 0 2 0 0 3 2 1 2 4 4 0 0 4 2 1 4 3 0 4 3 4 1 4 0 0 4 2 1 4 4 3 4 1 1 3 0 2 2 3 1 2 3 1 0 4 1 4 1 3 1 3 3 4 4 1 0 0 0 0 4 3 1 2 2 3 0 3 2 4 3 2 2 3 0 3 1 0 4 2 3 0 2 4 3 0 4 3 4 3 3 0 3 1 2 2 1 3 4 1 0 4 3 4 0 0 0 3 2 2 1 3 4 4 2 3 4 3 2 1 3 0 4 0 1 3 1 2 2 2 2 0 3 1 1 1 2 0 1 4 1 1 1 2 2 1 2 3 3 1 4 4 3 4 2 0 2 2 1 1 1 2 0 3 0 2 1 1 3 1 3 1 0 1 3 4 4 2 1 1 1]

In [37]: # if you want to consistently get the same division into folds:

In [38]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)

In [39]: # this sets the seed for the random number generator.
</code>

Let's do grid search for the optimal set of parameters:

<code python>
In [40]: from sklearn.grid_search import GridSearchCV

In [41]: Cs = np.logspace(-2, 3, 6)

In [42]: classifier = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs))

In [43]: classifier.fit(X, y)
Out[43]:
GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [44]: # print the best accuracy, classifier and parameters:

In [45]: print classifier.best_score_
0.844444444444

In [46]: print classifier.best_estimator_
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [47]: print classifier.best_params_
{'C': 1.0}

In [48]: # performing nested cross validation:

In [49]: print cross_validation.cross_val_score(classifier, X, y, cv=5)
[ 0.7962963   0.81481481  0.88888889  0.83333333  0.83333333]

In [50]: # if we want to do grid search over multiple parameters:

In [51]: param_grid = [
   ....:     {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
   ....:     {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
   ....: ]

In [52]: classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)

In [53]: print cross_validation.cross_val_score(classifier, X, y, cv=5)
[ 0.7962963   0.83333333  0.88888889  0.7962963   0.87037037]
</code>
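If you'd like to see how every parameter combination fared, not just the best one, GridSearchCV keeps the cross-validated score of each combination in its grid_scores_ attribute. Here's a short sketch (not from the session above):

<code python>
# a short sketch: inspect the cross-validated score of every parameter
# combination tried by the grid search (fit on the full data first)
classifier.fit(X, y)
for params, mean_score, scores in classifier.grid_scores_:
    print params, mean_score
</code>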
And to make things easier for you, here's the whole thing without the output:

<code python>
import numpy as np
from sklearn import cross_validation
from sklearn import svm
from sklearn import metrics

data = np.genfromtxt("../data/heart_scale.data", delimiter=",")
X = data[:,1:]
y = data[:,0]

# let's train/test an svm on the heart dataset:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print classifier.score(X_test, y_test)

# now let's use cross-validation instead:
print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy')

# you can obtain accuracy for other metrics, such as area under the roc curve:
print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc')

# you can also obtain the predictions by cross-validation and then compute the accuracy:
y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5)
print metrics.accuracy_score(y, y_predict)

# here's an alternative way of doing cross-validation.
# first divide the data into folds:
cv = cross_validation.StratifiedKFold(y, 5)
# now use these folds:
print cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc')

# you can see how examples were divided into folds by looking at the test_folds attribute:
print cv.test_folds

# hmm... perhaps we should shuffle things a bit...
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
print cv.test_folds

# if you run the division into folds multiple times you will get a different answer:
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
print cv.test_folds

# if you want to consistently get the same division into folds:
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)
# this sets the seed for the random number generator.

# grid search
# let's perform model selection using grid search
from sklearn.grid_search import GridSearchCV

Cs = np.logspace(-2, 3, 6)
classifier = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs))
classifier.fit(X, y)

# print the best accuracy, classifier and parameters:
print classifier.best_score_
print classifier.best_estimator_
print classifier.best_params_

# performing nested cross validation:
print cross_validation.cross_val_score(classifier, X, y, cv=5)

# if we want to do grid search over multiple parameters:
param_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)
print cross_validation.cross_val_score(classifier, X, y, cv=5)
</code>
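One more small addition that isn't in the session above: the cross-validated predictions returned by cross_val_predict can also be summarized with a confusion matrix rather than a single accuracy number:

<code python>
# a minimal sketch: the cross-validated predictions can also be summarized
# with a confusion matrix instead of a single accuracy score
print metrics.confusion_matrix(y, y_predict)
</code>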