# CS545 fall 2016

CS545

Instructor
Asa Ben-Hur

code:ensembles

### Ensemble methods in scikit-learn

Let's compare some ensemble methods:

ensembles.py
"""
ensemble methods in scikit-learn
http://scikit-learn.org/stable/modules/ensemble.html

"""

import numpy as np
from sklearn import cross_validation
from sklearn import metrics
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# Load the digits data; load_digits() returns an object exposing the
# feature matrix as .data and the class labels as .target.
digits = load_digits()
X = digits.data
y = digits.target

# Corrupt the features: each entry is selected with probability 0.5
# (binomial mask) and, when selected, has uniform noise drawn from [1, 15)
# added to it -- presumably to make the classification task harder.
X = X + np.random.binomial(1, 0.5, X.shape) * np.random.uniform(1, 15, X.shape)
# Rescale so that the largest feature value is 1.
X /= X.max()

# Shuffled, stratified 5-fold split with a fixed seed; the same `cv` object
# is reused by every experiment below so that all models see the same folds.
# NOTE: sklearn.cross_validation is the pre-0.18 API (superseded by
# sklearn.model_selection); this script targets that older interface.
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)

# let's look at accuracy as a function of the number of trees:

n_estimators = [10, 20, 50, 100, 200, 500]
accuracy = []  # accuracy[i] is the cross-validated accuracy for n_estimators[i]
for estimators in n_estimators:
    print("num estimators: ", estimators)
    classifier = RandomForestClassifier(n_estimators=estimators)
    # cross_val_predict returns one prediction per example, so accuracy is
    # computed once over the whole data set rather than averaged per fold.
    y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv)
    accuracy.append(metrics.accuracy_score(y, y_predict))

# Plot cross-validated accuracy against the number of trees, using a
# logarithmic x axis and blue circle markers ('ob').
import matplotlib.pyplot as plt

axes = plt.gca()  # the current axes, i.e. the ones the pyplot helpers draw on
axes.semilogx(n_estimators, accuracy, 'ob')
axes.set_title('performance of random forests on the digits data')
axes.set_xlabel('number of estimators')
axes.set_ylabel('accuracy')
plt.show()

# Baseline for comparison: a single decision tree with no depth limit
# (max_depth=None), evaluated on the same folds as the forests above.

from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(max_depth=None)
y_predict = cross_validation.cross_val_predict(
    estimator=classifier, X=X, y=y, cv=cv)
print(metrics.accuracy_score(y_true=y, y_pred=y_predict))

# bagging: an ensemble of 200 decision trees, seeded for reproducibility

from sklearn.ensemble import BaggingClassifier

# The base estimator is passed positionally (it is the first parameter of
# BaggingClassifier), mirroring the AdaBoostClassifier call style and
# avoiding any dependence on the keyword's name.
model = BaggingClassifier(DecisionTreeClassifier(), n_estimators=200, random_state=0)
# Evaluate the bagging ensemble itself -- passing `classifier` here would
# silently re-score the single decision tree from the previous section.
y_predict = cross_validation.cross_val_predict(model, X, y, cv=cv)
print(metrics.accuracy_score(y, y_predict))

# boosting: AdaBoost using depth-3 decision trees as the base learner,
# with n_estimators=500, scored on the shared cross-validation folds.
from sklearn.ensemble import AdaBoostClassifier

classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=500,
)
y_predict = cross_validation.cross_val_predict(
    estimator=classifier, X=X, y=y, cv=cv)
print(metrics.accuracy_score(y_true=y, y_pred=y_predict))

# SVM with model selection: GridSearchCV is itself used as the estimator, so
# the hyper-parameter search is re-run on the training part of every outer
# fold and the reported accuracy comes from the held-out part.

from sklearn import svm
from sklearn.grid_search import GridSearchCV

# Two sub-grids: a linear kernel tuned over C only, and an RBF kernel tuned
# over both C and gamma.
param_grid = [
    {
        'C': [1, 10, 100],
        'kernel': ['linear'],
    },
    {
        'C': [1, 10, 100],
        'gamma': [1, 0.1, 0.01],
        'kernel': ['rbf'],
    },
]
classifier = GridSearchCV(svm.SVC(), param_grid)

y_predict = cross_validation.cross_val_predict(
    estimator=classifier, X=X, y=y, cv=cv)
print(metrics.accuracy_score(y_true=y, y_pred=y_predict))
code/ensembles.txt · Last modified: 2016/12/01 19:36 by asa