This is an old revision of the document!
Before we do cross-validation, we will need a modified version of the perceptron class that will work with scikit-learn:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.base import BaseEstimator


class Perceptron(BaseEstimator):
    """An implementation of the perceptron algorithm.

    Note that this implementation does not include a bias term.

    Parameters
    ----------
    max_iterations : int, default=500
        Maximum number of passes over the training data.
    learning_rate : float, default=0.2
        Step size applied to every weight update.
    """

    def __init__(self, max_iterations=500, learning_rate=0.2):
        self.max_iterations = max_iterations
        self.learning_rate = learning_rate

    def fit(self, X, y):
        """Train a classifier using the perceptron training algorithm.

        After training, the attribute 'w' contains the perceptron weight
        vector and the attribute 'converged' tells whether a full pass
        over the data was completed without any update.

        Parameters
        ----------
        X : ndarray, shape (n_examples, n_features)
            Training data.
        y : ndarray, shape (n_examples,)
            Array of labels. The update rule multiplies by y[i], so the
            labels are expected to be signed (-1 / +1).

        Returns
        -------
        self : Perceptron
            The fitted estimator (scikit-learn convention).
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # One weight per *feature*; sizing the vector by the number of
        # examples makes the inner product with an example fail.
        self.w = np.zeros(X.shape[1])

        converged = False
        iterations = 0
        while not converged and iterations < self.max_iterations:
            converged = True
            for x_i, y_i in zip(X, y):
                # A non-positive margin means the example is misclassified
                # (or lies exactly on the decision boundary).
                if y_i * self.discriminant(x_i) <= 0:
                    self.w = self.w + y_i * self.learning_rate * x_i
                    converged = False
            iterations += 1

        self.converged = converged
        if converged:
            print('converged in %d iterations ' % iterations)
        return self

    def discriminant(self, x):
        """Return the inner product of the weight vector with example x."""
        return np.inner(self.w, x)

    def decision_function(self, X):
        """Return the real-valued score of every example in X.

        Exposing the raw scores lets ranking-based metrics (for example
        scoring='roc_auc') be computed from continuous values.

        Parameters
        ----------
        X : ndarray, shape (n_examples, n_features)
            Data to score.
        """
        return np.inner(self.w, X)

    def predict(self, X):
        """Make predictions using a trained linear classifier.

        Parameters
        ----------
        X : ndarray, shape (n_examples, n_features)
            Data to classify.

        Returns
        -------
        ndarray, shape (n_examples,)
            Sign of the score of each example (np.sign yields 0 for a
            score of exactly zero).
        """
        return np.sign(self.decision_function(X))
"""Classifier evaluation using scikit-learn.

More details at:
http://scikit-learn.org/stable/modules/cross_validation.html
http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html

NOTE: the old ``sklearn.cross_validation`` module was removed from
scikit-learn (0.20); its functionality lives in ``sklearn.model_selection``.
"""

import numpy as np
from sklearn import metrics
from sklearn import model_selection

import perceptron2


def fold_assignments(cv, X, y):
    """Return, for every example, the index of the fold in which it is tested.

    Stands in for the ``test_folds`` attribute of the old StratifiedKFold
    class, which the ``model_selection`` splitters do not provide.
    """
    folds = np.empty(len(y), dtype=int)
    for fold, (_, test_index) in enumerate(cv.split(X, y)):
        folds[test_index] = fold
    return folds


data = np.genfromtxt("../data/heart_scale.data", delimiter=",")
X = data[:, 1:]
y = data[:, 0]

# let's train/test a perceptron on the heart dataset:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=0)
classifier = perceptron2.Perceptron()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# let's compute the accuracy of the classifier.
# (np.where returns a tuple, so taking its len() does not count the matches;
# the mean of the boolean match array is the fraction of correct predictions.)
print(np.mean(np.equal(y_pred, y_test)))

# you can get the same result using scikit-learn:
print(metrics.accuracy_score(y_test, y_pred))

# now let's use cross-validation instead:
print(model_selection.cross_val_score(classifier, X, y, cv=5,
                                      scoring='accuracy'))

# you can obtain scores for other metrics, such as area under the roc curve:
print(model_selection.cross_val_score(classifier, X, y, cv=5,
                                      scoring='roc_auc'))

# you can also obtain the predictions by cross-validation and then compute
# the accuracy:
y_predict = model_selection.cross_val_predict(classifier, X, y, cv=5)
print(metrics.accuracy_score(y, y_predict))

# here's an alternative way of doing cross-validation.
# first define how the data is divided into folds:
cv = model_selection.StratifiedKFold(n_splits=5)
# now use these folds:
print(model_selection.cross_val_score(classifier, X, y, cv=cv,
                                      scoring='roc_auc'))

# you can see how examples were divided into folds:
print(fold_assignments(cv, X, y))

# hmm... perhaps we should shuffle things a bit...
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
print(fold_assignments(cv, X, y))

# if you run division into folds multiple times you will get a different
# answer:
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
print(fold_assignments(cv, X, y))

# if you want to consistently get the same division into folds:
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# this sets the seed for the random number generator.