Warning: Declaration of action_plugin_tablewidth::register(&$controller) should be compatible with DokuWiki_Action_Plugin::register(Doku_Event_Handler $controller) in /s/bach/b/class/cs545/public_html/fall16/lib/plugins/tablewidth/action.php on line 93
code:model_selection [CS545 fall 2016]

User Tools

Site Tools


code:model_selection

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Last revision Both sides next revision
code:model_selection [2016/08/09 10:25]
127.0.0.1 external edit
code:model_selection [2016/10/06 14:58]
asa
Line 1: Line 1:
-===== model selection ​and cross validation ​in scikit-learn ===== +===== model selection in scikit-learn =====
- +
-First let's import some modules and read in some data:+
  
 <code python> <code python>
  
-In [1]: import numpy as np 
- 
-In [2]: from sklearn import cross_validation 
- 
-In [3]: from sklearn import svm 
- 
-In [4]: from sklearn import metrics 
- 
-In [5]: data=np.genfromtxt("​../​data/​heart_scale.data",​ delimiter=","​) 
- 
-In [6]: X=data[:,​1:​] 
- 
-In [7]: y=data[:,0] 
- 
-</​code>​ 
- 
-The simplest form of model evaluation uses a validation/​test set: 
- 
-<code python> 
-In [9]: X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,​ y, test_size=0.4,​ random_state=0) 
- 
-In [10]: classifier = svm.SVC(kernel='​linear',​ C=1).fit(X_train,​ y_train) 
- 
-In [11]: classifier.score(X_test,​ y_test) 
-Out[11]: 0.7592592592592593 
- 
- 
-</​code>​ 
- 
-Next, let's perform cross-validation: 
- 
-<code python> 
- 
-In [18]: cross_validation.cross_val_score(classifier,​ X, y, cv=5, scoring='​accuracy'​) 
-Out[18]: array([ 0.7962963 ,  0.83333333, ​ 0.88888889, ​ 0.83333333, ​ 0.83333333]) 
- 
-In [19]:  
- 
-In [19]: # you can obtain accuracy for other metrics, such as area under the roc curve: 
- 
-In [20]: cross_validation.cross_val_score(classifier,​ X, y, cv=5, scoring='​roc_auc'​) 
-Out[20]: array([ 0.89166667, ​ 0.89166667, ​ 0.95833333, ​ 0.87638889, ​ 0.91388889]) 
- 
-In [21]:  
- 
-In [21]: # you can also obtain the predictions by cross-validation and then compute the accuracy: 
- 
-In [22]: y_predict = cross_validation.cross_val_predict(classifier,​ X, y, cv=5) 
- 
-In [23]: metrics.accuracy_score(y,​ y_predict) 
-Out[23]: 0.83703703703703702 
- 
-</​code>​ 
- 
-Here's an alternative way of doing cross-validation. 
- 
-<code python> 
-In [25]: # first divide the data into folds: 
- 
-In [26]: cv = cross_validation.StratifiedKFold(y,​ 5) 
- 
-In [27]: # now use these folds: 
- 
-In [28]: print cross_validation.cross_val_score(classifier,​ X, y, cv=cv, scoring='​roc_auc'​) 
-[ 0.89166667 ​ 0.89166667 ​ 0.95833333 ​ 0.87638889 ​ 0.91388889] 
- 
-In [29]: # you can see how examples were divided into folds by looking at the test_folds attribute: 
- 
-In [30]: print cv.test_folds 
-[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
- 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
- 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 
- 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
- 2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 
- 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 
- 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 
- 4 4 4 4 4 4 4 4 4 4 4] 
- 
-In [31]: # hmm... perhaps we should shuffle things a bit... 
- 
-In [32]: cv = cross_validation.StratifiedKFold(y,​ 5, shuffle=True) 
- 
-In [33]: print cv.test_folds 
-[0 1 1 2 0 1 4 3 4 3 2 0 2 3 2 3 2 0 4 1 1 3 4 1 1 4 1 4 4 2 2 3 0 2 3 1 4 
- 0 3 2 0 2 0 1 3 2 0 0 2 3 0 4 2 0 4 3 4 1 1 0 3 2 4 3 2 3 1 1 1 1 4 3 1 1 
- 4 2 2 3 3 1 4 2 1 0 2 1 0 2 4 1 0 3 2 3 1 2 2 1 1 0 4 1 3 0 1 1 3 3 0 3 3 
- 4 2 0 2 0 2 4 0 1 0 4 4 1 1 0 4 0 1 4 4 3 1 3 3 2 4 3 4 2 4 3 4 1 4 2 0 3 
- 3 3 3 0 0 0 4 3 4 2 3 0 1 1 0 0 4 0 4 1 4 0 0 0 0 3 3 0 4 4 2 0 3 3 0 1 2 
- 2 2 3 2 1 3 4 4 4 1 1 4 2 1 0 3 1 2 0 0 0 0 2 3 4 3 2 0 0 4 1 3 2 2 0 1 2 
- 4 2 4 0 2 1 1 0 4 4 1 4 4 3 4 2 3 3 1 4 2 1 4 1 3 2 1 3 2 1 3 1 3 0 2 2 0 
- 4 4 2 2 4 3 3 0 2 0 2] 
- 
-In [34]: # if you run division into folds multiple times you will get a different answer: 
- 
-In [35]: cv = cross_validation.StratifiedKFold(y,​ 5, shuffle=True) 
- 
-In [36]: print cv.test_folds 
-[3 0 2 2 0 2 2 4 1 4 0 2 3 4 2 0 4 0 3 3 4 0 2 0 4 4 0 1 4 4 3 4 1 2 3 3 1 
- 2 1 4 4 4 0 0 4 2 0 0 2 0 1 3 1 0 3 4 0 3 0 4 1 1 2 4 2 0 2 3 1 0 3 0 1 2 
- 3 2 4 0 0 0 1 4 3 2 2 4 3 1 3 2 0 2 0 0 3 2 1 2 4 4 0 0 4 2 1 4 3 0 4 3 4 
- 1 4 0 0 4 2 1 4 4 3 4 1 1 3 0 2 2 3 1 2 3 1 0 4 1 4 1 3 1 3 3 4 4 1 0 0 0 
- 0 4 3 1 2 2 3 0 3 2 4 3 2 2 3 0 3 1 0 4 2 3 0 2 4 3 0 4 3 4 3 3 0 3 1 2 2 
- 1 3 4 1 0 4 3 4 0 0 0 3 2 2 1 3 4 4 2 3 4 3 2 1 3 0 4 0 1 3 1 2 2 2 2 0 3 
- 1 1 1 2 0 1 4 1 1 1 2 2 1 2 3 3 1 4 4 3 4 2 0 2 2 1 1 1 2 0 3 0 2 1 1 3 1 
- 3 1 0 1 3 4 4 2 1 1 1] 
- 
-In [37]: # if you want to consistently get the same division into folds: 
- 
-In [38]: cv = cross_validation.StratifiedKFold(y,​ 5, shuffle=True,​ random_state=0) 
- 
-In [39]: # this sets the seed for the random number generator. 
- 
-</​code>​ 
- 
-Let's do grid search for the optimal set of parameters: 
- 
-<code python> 
-In [40]: from sklearn.grid_search import GridSearchCV 
- 
-In [41]: Cs = np.logspace(-2,​ 3, 6) 
- 
-In [42]: classifier = GridSearchCV(estimator=svm.LinearSVC(),​ param_grid=dict(C=Cs) ) 
- 
-In [43]: classifier.fit(X,​ y) 
-Out[43]: ​ 
-GridSearchCV(cv=None,​ error_score='​raise',​ 
-       ​estimator=LinearSVC(C=1.0,​ class_weight=None,​ dual=True, fit_intercept=True,​ 
-     ​intercept_scaling=1,​ loss='​squared_hinge',​ max_iter=1000,​ 
-     ​multi_class='​ovr',​ penalty='​l2',​ random_state=None,​ tol=0.0001, 
-     ​verbose=0),​ 
-       ​fit_params={},​ iid=True, loss_func=None,​ n_jobs=1, 
-       ​param_grid={'​C':​ array([ ​ 1.00000e-02, ​  ​1.00000e-01, ​  ​1.00000e+00, ​  ​1.00000e+01,​ 
-         ​1.00000e+02, ​  ​1.00000e+03])},​ 
-       ​pre_dispatch='​2*n_jobs',​ refit=True, score_func=None,​ scoring=None,​ 
-       ​verbose=0) 
- 
-In [44]:  
- 
-In [44]: # print the best accuracy, classifier and parameters: 
- 
-In [45]: print classifier.best_score_ 
-0.844444444444 
- 
-In [46]: print classifier.best_estimator_ 
-LinearSVC(C=1.0,​ class_weight=None,​ dual=True, fit_intercept=True,​ 
-     ​intercept_scaling=1,​ loss='​squared_hinge',​ max_iter=1000,​ 
-     ​multi_class='​ovr',​ penalty='​l2',​ random_state=None,​ tol=0.0001, 
-     ​verbose=0) 
- 
-In [47]: print classifier.best_params_ 
-{'​C':​ 1.0} 
- 
-In [48]: # performing nested cross validation: 
- 
-In [49]: print  cross_validation.cross_val_score(classifier,​ X, y, cv=5) 
-[ 0.7962963 ​  ​0.81481481 ​ 0.88888889 ​ 0.83333333 ​ 0.83333333] 
- 
-In [50]: # if we want to do grid search over multiple parameters: 
- 
-In [51]: param_grid = [ 
-   ​....: ​  ​{'​C':​ [1, 10, 100, 1000], '​kernel':​ ['​linear'​]},​ 
-   ​....: ​  ​{'​C':​ [1, 10, 100, 1000], '​gamma':​ [0.001, 0.0001], '​kernel':​ ['​rbf'​]},​ 
-   ​....: ​ ] 
- 
-In [52]: classifier = GridSearchCV(estimator=svm.SVC(),​ param_grid=param_grid) 
- 
-In [53]: print cross_validation.cross_val_score(classifier,​ X, y, cv=5) 
-[ 0.7962963 ​  ​0.83333333 ​ 0.88888889 ​ 0.7962963 ​  ​0.87037037] 
  
-</​code>​+"""​classifier evaluation using scikit-learn
  
-And to make things easier for you here's the whole thing without the output:+more details at: 
 +http://​scikit-learn.org/​stable/​modules/​cross_validation.html 
 +http://​scikit-learn.org/​stable/​tutorial/​statistical_inference/​model_selection.html 
 +"""​
  
-<file python model_selection.py>​ 
 import numpy as np import numpy as np
 from sklearn import cross_validation from sklearn import cross_validation
 from sklearn import svm from sklearn import svm
 from sklearn import metrics from sklearn import metrics
 +
 +# read in the heart dataset
  
 data=np.genfromtxt("​../​data/​heart_scale.data",​ delimiter=","​) data=np.genfromtxt("​../​data/​heart_scale.data",​ delimiter=","​)
Line 188: Line 22:
 y=data[:,0] y=data[:,0]
  
-# let's train/test an svm on the heart dataset:+# first let's do regular cross-validation:
  
-X_train, X_test, y_train, y_test ​= cross_validation.train_test_split(X, y, test_size=0.4, random_state=0) +cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0) 
-classifier = svm.SVC(kernel='​linear',​ C=1).fit(X_train,​ y_train) +print (cv.test_folds)
-print classifier.score(X_test,​ y_test)+
  
-# now let's use cross-validation instead: +classifier = svm.SVC(kernel='linear', C=1)
-print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy')+
  
-# you can obtain accuracy for other metrics, such as area under the roc curve: +y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv)
-print cross_validation.cross_val_score(classifier,​ X, y, cv=5, scoring='​roc_auc'​) +print(metrics.accuracy_score(y,​ y_predict))
- +
-# you can also obtain the predictions by cross-validation and then compute the accuracy: +
-y_predict = cross_validation.cross_val_predict(classifier,​ X, y, cv=5+
-metrics.accuracy_score(y,​ y_predict) +
- +
-# here's an alternative way of doing cross-validation. +
-# first divide the data into folds: +
-cv = cross_validation.StratifiedKFold(y,​ 5) +
-# now use these folds: +
-print cross_validation.cross_val_score(classifier,​ X, y, cv=cv, scoring='​roc_auc'​) +
- +
-# you can see how examples were divided into folds by looking at the test_folds attribute:​ +
-print cv.test_folds +
- +
-# hmm... perhaps we should shuffle things a bit... +
- +
-cv = cross_validation.StratifiedKFold(y,​ 5, shuffle=True) +
-print cv.test_folds +
- +
-# if you run division into folds multiple times you will get a different answer: +
-cv = cross_validation.StratifiedKFold(y,​ 5, shuffle=True) +
-print cv.test_folds +
- +
-# if you want to consistently get the same division into folds: +
-cv = cross_validation.StratifiedKFold(y,​ 5, shuffle=True,​ random_state=0) +
-# this sets the seed for the random number generator.+
  
  
Line 237: Line 43:
  
 # print the best accuracy, classifier and parameters: # print the best accuracy, classifier and parameters:
-print classifier.best_score_ +print (classifier.best_score_) 
-print classifier.best_estimator_ +print (classifier.best_estimator_) 
-print classifier.best_params_+print (classifier.best_params_)
  
 # performing nested cross validation: # performing nested cross validation:
-print  ​cross_validation.cross_val_score(classifier,​ X, y, cv=5)+ 
 +y_predict = cross_validation.cross_val_predict(classifier,​ X, y, cv=cv) 
 +print(metrics.accuracy_score(y,​ y_predict)) 
  
 # if we want to do grid search over multiple parameters: # if we want to do grid search over multiple parameters:
Line 250: Line 59:
  ]  ]
 classifier = GridSearchCV(estimator=svm.SVC(),​ param_grid=param_grid) classifier = GridSearchCV(estimator=svm.SVC(),​ param_grid=param_grid)
-print cross_validation.cross_val_score(classifier,​ X, y, cv=5)+ 
 +y_predict = cross_validation.cross_val_predict(classifier,​ X, y, cv=cv) 
 +print(metrics.accuracy_score(y,​ y_predict))
  
 </​file>​ </​file>​
  
code/model_selection.txt · Last modified: 2016/10/06 14:58 by asa