code:feature_selection

Feature selection in scikit-learn

This demo uses a file of yeast gene expression data that is available here.
"""
=================================================
SVM with RFE feature selection
=================================================
 
How to perform feature selection in scikit-learn
"""
 
import numpy as np
from sklearn import svm, feature_selection, cross_validation
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE,RFECV
from sklearn.svm import SVC,LinearSVC
 
# Load the yeast data used in an earlier assignment.  The file is parsed as a
# comma-separated numeric table: column 0 holds the labels, the remaining
# columns hold the features.
data = np.genfromtxt("../data/yeast2.csv", delimiter=",")
X = data[:, 1:]
y = data[:, 0]
# Append 250 columns of Gaussian noise so that feature selection has
# something to discard.  A dedicated, seeded generator keeps the noise columns
# identical from run to run (matching the fixed random_state used for the CV
# split) without touching NumPy's global random state.
noise = np.random.RandomState(0).randn(len(y), 250)
X = np.hstack((X, noise))
 
# Create an instance of RFE (recursive feature elimination) that uses a
# linear SVM to define weights for the features (any linear classifier will
# work).  A float step in (0, 1) is the fraction of features removed at each
# iteration; elimination stops once 25 features remain.
classifier = LinearSVC()
selector = RFE(classifier, step=0.1, n_features_to_select=25)
# run feature selection:
selector = selector.fit(X, y)

# Check which features got chosen.  The function-call form of print works on
# both Python 2 and Python 3 for a single argument.
print(selector.support_)
print(selector.ranking_)

# To actually perform feature selection, reuse the selector fitted above
# rather than fitting it a second time: this avoids redundant work and
# guarantees that Xt contains exactly the features reported by support_.
Xt = selector.transform(X)
 
# The wrong way to perform cross-validation: Xt was produced by a selector
# fitted on ALL of X and y, so the held-out folds have already influenced
# which features were kept and the resulting score is optimistically biased.
cv = cross_validation.StratifiedKFold(
    y, 5, shuffle=True, random_state=0
)
biased_scores = cross_validation.cross_val_score(classifier, Xt, y, cv=cv)
print(biased_scores.mean())
 
# Now the right way: make feature selection a step of the model itself.
# cross_val_score fits the whole pipeline on the training part of each fold,
# so RFE never sees the samples it is later scored on.
classifier = LinearSVC()
selector = RFE(classifier, n_features_to_select=25, step=0.1)
rfe_svm = make_pipeline(selector, classifier)

pipeline_scores = cross_validation.cross_val_score(rfe_svm, X, y, cv=cv)
print(pipeline_scores.mean())
 
# Feature selection using a univariate filter method: every feature is scored
# on its own and the 25 best-scoring ones are kept.  y holds class labels (it
# is fed to a classifier and to a stratified split), so the ANOVA F-test for
# classification (f_classif) is the appropriate score function; f_regression
# is meant for continuous targets.
# f_regression stays in the import list only so that any later code relying
# on the name keeps working.
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
filter_selector = SelectKBest(f_classif, k=25)
classifier = LinearSVC()
# As above, selection lives inside the pipeline so it is refit on each
# training fold during cross-validation.
filter_svm = make_pipeline(filter_selector, classifier)

print(np.mean(cross_validation.cross_val_score(filter_svm, X, y, cv=cv)))