This shows you the differences between two versions of the page.
Both sides previous revision Previous revision | Next revision Both sides next revision | ||
code:model_selection [2015/10/05 15:01] asa |
code:model_selection [2015/10/05 15:06] asa |
||
---|---|---|---|
Line 175: | Line 175: | ||
</code> | </code> | ||
+ | |||
+ | And to make things easier for you, here's the whole thing without the output: | ||
+ | |||
+ | <file python model_selection.py> | ||
+ | import numpy as np | ||
+ | from sklearn import cross_validation | ||
+ | from sklearn import svm | ||
+ | from sklearn import metrics | ||
+ | |||
+ | data=np.genfromtxt("../data/heart_scale.data", delimiter=",") | ||
+ | X=data[:,1:] | ||
+ | y=data[:,0] | ||
+ | |||
+ | # let's train/test an svm on the heart dataset: | ||
+ | |||
+ | X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0) | ||
+ | classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train) | ||
+ | print classifier.score(X_test, y_test) | ||
+ | |||
+ | # now let's use cross-validation instead: | ||
+ | print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy') | ||
+ | |||
+ | # you can obtain scores for other metrics, such as the area under the ROC curve: | ||
+ | print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc') | ||
+ | |||
+ | # you can also obtain the predictions by cross-validation and then compute the accuracy: | ||
+ | y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5) | ||
+ | metrics.accuracy_score(y, y_predict) | ||
+ | |||
+ | # here's an alternative way of doing cross-validation. | ||
+ | # first divide the data into folds: | ||
+ | cv = cross_validation.StratifiedKFold(y, 5) | ||
+ | # now use these folds: | ||
+ | print cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc') | ||
+ | |||
+ | # you can see how examples were divided into folds by looking at the test_folds attribute: | ||
+ | print cv.test_folds | ||
+ | |||
+ | # hmm... perhaps we should shuffle things a bit... | ||
+ | |||
+ | cv = cross_validation.StratifiedKFold(y, 5, shuffle=True) | ||
+ | print cv.test_folds | ||
+ | |||
+ | # if you run the shuffled division into folds multiple times you will get a different assignment of examples to folds each time: | ||
+ | cv = cross_validation.StratifiedKFold(y, 5, shuffle=True) | ||
+ | print cv.test_folds | ||
+ | |||
+ | # if you want to consistently get the same division into folds: | ||
+ | cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0) | ||
+ | # this sets the seed for the random number generator. | ||
+ | |||
+ | |||
+ | # grid search | ||
+ | |||
+ | # let's perform model selection using grid search | ||
+ | |||
+ | from sklearn.grid_search import GridSearchCV | ||
+ | Cs = np.logspace(-2, 3, 6) | ||
+ | classifier = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs) ) | ||
+ | classifier.fit(X, y) | ||
+ | |||
+ | # print the best accuracy, classifier and parameters: | ||
+ | print classifier.best_score_ | ||
+ | print classifier.best_estimator_ | ||
+ | print classifier.best_params_ | ||
+ | |||
+ | # performing nested cross validation: | ||
+ | print cross_validation.cross_val_score(classifier, X, y, cv=5) | ||
+ | |||
+ | # if we want to do grid search over multiple parameters: | ||
+ | param_grid = [ | ||
+ | {'C': [1, 10, 100, 1000], 'kernel': ['linear']}, | ||
+ | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}, | ||
+ | ] | ||
+ | classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid) | ||
+ | print cross_validation.cross_val_score(classifier, X, y, cv=5) | ||
+ | |||
+ | </file> | ||