code:model_selection [CS545 fall 2016]

  
</code>
And to make things easier for you, here's the whole thing without the output:

<file python model_selection.py>
import numpy as np
from sklearn import cross_validation
from sklearn import svm
from sklearn import metrics

data = np.genfromtxt("../data/heart_scale.data", delimiter=",")
X = data[:,1:]
y = data[:,0]

# let's train/test an svm on the heart dataset:

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print classifier.score(X_test, y_test)

# now let's use cross-validation instead:
print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy')

# you can also score with other metrics, such as area under the ROC curve:
print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc')

# you can also obtain the predictions by cross-validation and then compute the accuracy:
y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5)
print metrics.accuracy_score(y, y_predict)

# here's an alternative way of doing cross-validation.
# first divide the data into folds:
cv = cross_validation.StratifiedKFold(y, 5)
# now use these folds:
print cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc')

# you can see how examples were divided into folds by looking at the test_folds attribute:
print cv.test_folds

# hmm... perhaps we should shuffle things a bit...

cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
print cv.test_folds

# if you run the division into folds multiple times you will get a different answer:
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
print cv.test_folds

# if you want to consistently get the same division into folds,
# set the seed for the random number generator:
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)


# grid search

# let's perform model selection using grid search

from sklearn.grid_search import GridSearchCV
Cs = np.logspace(-2, 3, 6)
classifier = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs))
classifier.fit(X, y)

# print the best accuracy, classifier, and parameters:
print classifier.best_score_
print classifier.best_estimator_
print classifier.best_params_

# performing nested cross-validation:
print cross_validation.cross_val_score(classifier, X, y, cv=5)

# if we want to do a grid search over multiple parameters:
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)
print cross_validation.cross_val_score(classifier, X, y, cv=5)

</file>
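A note for anyone running this with a newer scikit-learn: the ''cross_validation'' module used above was deprecated in version 0.18 in favor of ''sklearn.model_selection'' and later removed. A sketch of the same split/score/predict steps with the newer API, using a synthetic dataset from ''make_classification'' as a stand-in for the heart data (an assumption, since the data file isn't included here):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import svm, metrics

# synthetic stand-in for heart_scale.data (assumed shape: 13 features, binary labels)
X, y = make_classification(n_samples=270, n_features=13, random_state=0)

# same train/test split as before; only the import location changed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print(classifier.score(X_test, y_test))

# cross_val_score and cross_val_predict moved to model_selection with the same signature
print(cross_val_score(classifier, X, y, cv=5, scoring='accuracy'))
print(cross_val_score(classifier, X, y, cv=5, scoring='roc_auc'))

# predictions by cross-validation, then accuracy computed from them
y_predict = cross_val_predict(classifier, X, y, cv=5)
print(metrics.accuracy_score(y, y_predict))
```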
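The newer ''StratifiedKFold'' also changed shape: the number of folds is now ''n_splits'' in the constructor, the labels are passed to ''split(X, y)'' instead, and there is no ''test_folds'' attribute. A sketch (again on synthetic stand-in data) that rebuilds the old fold-membership view by hand:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn import svm

X, y = make_classification(n_samples=100, n_features=13, random_state=0)

# shuffle with a fixed seed, as in the last StratifiedKFold example above
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# reconstruct the old test_folds view: which fold each example is tested in
test_folds = np.empty(len(y), dtype=int)
for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    test_folds[test_idx] = fold
print(test_folds)

# the splitter object can still be passed to cross_val_score via cv=
classifier = svm.SVC(kernel='linear', C=1)
print(cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc'))
```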
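Likewise, ''sklearn.grid_search'' was folded into ''sklearn.model_selection''; ''GridSearchCV'' itself works as before. A sketch of the grid search and nested cross-validation steps with the newer import, on the same synthetic stand-in data:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import svm

X, y = make_classification(n_samples=100, n_features=13, random_state=0)

# grid search over C, as above, but imported from model_selection
Cs = np.logspace(-2, 3, 6)
classifier = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs))
classifier.fit(X, y)
print(classifier.best_score_)
print(classifier.best_params_)

# nested cross-validation: the grid search is re-run inside each outer fold
print(cross_val_score(classifier, X, y, cv=5))
```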
  
code/model_selection.txt · Last modified: 2016/10/06 14:58 by asa