CS545 fall 2016

Cross validation

Before we do cross validation, we will need a modified version of the perceptron class that will work with scikit-learn:

perceptron2.py

import numpy as np
from matplotlib import pyplot as plt
from sklearn.base import BaseEstimator
 
class Perceptron(BaseEstimator):
    """Perceptron classifier without a bias term, usable with scikit-learn.

    The decision boundary is a hyperplane through the origin because no
    bias term is learned. The class derives from ``BaseEstimator`` and its
    constructor only stores its arguments as attributes of the same name.

    Parameters
    ----------
    max_iterations : int, default 500
        Maximum number of passes over the training data.

    learning_rate : float, default 0.2
        Factor applied to every weight update.

    Attributes
    ----------
    w : ndarray, shape (n_features,)
        Weight vector; set by ``fit``.

    converged : bool
        True if the last pass of ``fit`` made no update; set by ``fit``.
    """

    def __init__(self, max_iterations=500, learning_rate=0.2):
        self.max_iterations = max_iterations
        self.learning_rate = learning_rate

    def fit(self, X, y):
        """Train the classifier using the perceptron training algorithm.

        After training the attribute ``w`` contains the weight vector and
        ``converged`` tells whether a full pass without updates was reached
        within ``max_iterations`` passes.

        Parameters
        ----------
        X : array-like, shape (n_examples, n_features)
            Training data.

        y : array-like, shape (n_examples,)
            Array of labels. The update rule multiplies labels by scores,
            so labels are expected to be +1 / -1.

        Returns
        -------
        self : Perceptron
            The fitted estimator, following the scikit-learn convention.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of examples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                'X and y must have the same number of examples '
                '(got %d and %d)' % (len(X), len(y)))

        self.w = np.zeros(X.shape[1])
        converged = False
        iterations = 0
        while not converged and iterations < self.max_iterations:
            # Assume this pass is clean; any update below resets the flag.
            converged = True
            for x_i, y_i in zip(X, y):
                # A non-positive margin means the example is misclassified
                # or lies exactly on the boundary.
                if y_i * self.decision_function(x_i) <= 0:
                    self.w = self.w + y_i * self.learning_rate * x_i
                    converged = False
            iterations += 1
        self.converged = converged
        if converged:
            print('converged in %d iterations ' % iterations)
        return self

    def decision_function(self, X):
        """Return the inner product of the weight vector with ``X``.

        ``X`` may be a single example of shape (n_features,), giving a
        scalar score, or an array of shape (n_examples, n_features),
        giving one score per example.
        """
        return np.inner(self.w, X)

    def predict(self, X):
        """Make predictions using a trained linear classifier.

        Parameters
        ----------
        X : ndarray, shape (n_examples, n_features)
            Data to classify.

        Returns
        -------
        ndarray, shape (n_examples,)
            Sign of the decision function for each example: +1 or -1, and
            0 for an example whose score is exactly zero.
        """
        return np.sign(self.decision_function(X))
 
 
if __name__ == '__main__':
    # Small demo run. The previous version called generate_separable_data(40),
    # which is neither defined nor imported in this module, so running the
    # file raised NameError. A 40-example toy problem is built inline instead.
    # Labels come from a random hyperplane through the origin, since the
    # perceptron in this module does not include a bias term.
    rng = np.random.RandomState(0)
    w = rng.uniform(-1, 1, 2)            # true separating weight vector
    X = rng.uniform(-1, 1, (40, 2))      # 40 examples, 2 features
    y = np.sign(np.inner(w, X))          # +1 / -1 labels, separable by w
    p = Perceptron()
    p.fit(X, y)

cross-validation.py

 
"""classifier evaluation using scikit-learn
 
more details at:
http://scikit-learn.org/stable/modules/cross_validation.html
http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
"""
 
import numpy as np
from sklearn import cross_validation
from sklearn import metrics
import perceptron2
 
# NOTE(review): this script uses the legacy `sklearn.cross_validation` API.
# That module was deprecated in scikit-learn 0.18 and removed in 0.20; the
# replacements live in `sklearn.model_selection`, where StratifiedKFold takes
# n_splits and folds are obtained through .split(X, y) rather than the
# test_folds attribute. Porting also means changing the file's import, so
# it is left as a follow-up.

# the first column holds the labels, the remaining columns the features
data = np.genfromtxt("../data/heart_scale.data", delimiter=",")
X = data[:, 1:]
y = data[:, 0]

# let's train/test a perceptron on the heart dataset:

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=0)
classifier = perceptron2.Perceptron()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# let's compute the accuracy of the classifier
# (the mean of a boolean array is the fraction of correct predictions;
# unlike len(...)/len(...) it cannot truncate under integer division):
print(float(np.mean(y_pred == y_test)))

# you can get the same result using scikit-learn
# (printed, because a bare expression shows nothing when run as a script):
print(metrics.accuracy_score(y_test, y_pred))

# now let's use cross-validation instead:
print(cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy'))

# you can also score with other metrics, such as area under the roc curve:
print(cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc'))

# you can also obtain the predictions by cross-validation and then compute the accuracy:
y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5)
print(metrics.accuracy_score(y, y_predict))

# here's an alternative way of doing cross-validation.
# first divide the data into folds:
cv = cross_validation.StratifiedKFold(y, 5)
# now use these folds:
print(cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc'))

# you can see how examples were divided into folds by looking at the test_folds attribute:
print(cv.test_folds)

# hmm... perhaps we should shuffle things a bit...

cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
print(cv.test_folds)

# if you run division into folds multiple times you will get a different answer:
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
print(cv.test_folds)

# if you want to consistently get the same division into folds:
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)
# this sets the seed for the random number generator.
# this sets the seed for the random number generator.

CS545 fall 2016

User Tools

Site Tools

Sidebar

Cross validation

Page Tools