# ensemble_classifier.py
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np


class EnsembleClassifier(BaseEstimator, ClassifierMixin):
"""
Ensemble classifier for scikit-learn estimators.
Parameters
clf : `iterable`
A list of scikit-learn classifier objects.
weights : `list` (default: `None`)
If `None`, the majority rule voting will be applied to the predicted class labels.
If a list of weights (`float` or `int`) is provided, the averaged raw probabilities (via `predict_proba`)
will be used to determine the most confident class label.
"""
    def __init__(self, clfs, weights=None):
        self.clfs = clfs
        self.weights = weights
    def fit(self, X, y):
        """
        Fit the scikit-learn estimators.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Training data.
        y : list or numpy array, shape = [n_samples]
            Class labels.
        """
        for clf in self.clfs:
            clf.fit(X, y)
        return self
    def predict(self, X):
        """
        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]

        Returns
        -------
        maj : numpy array, shape = [n_samples]
            Predicted class labels by majority (or weighted) rule.
        """
        self.predictions_ = np.asarray([clf.predict(X) for clf in self.clfs])
        if self.weights:
            # Weighted soft voting: pick the class with the highest
            # averaged probability.
            avg = self.predict_proba(X)
            maj = np.argmax(avg, axis=1)
        else:
            # Hard majority voting over the individual predictions.
            # np.bincount requires non-negative integer labels.
            maj = np.asarray([
                np.argmax(np.bincount(self.predictions_[:, c].astype(int)))
                for c in range(self.predictions_.shape[1])
            ])
        return maj
    def predict_proba(self, X):
        """
        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]

        Returns
        -------
        avg : numpy array, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.
        """
        self.probas_ = [clf.predict_proba(X) for clf in self.clfs]
        avg = np.average(self.probas_, axis=0, weights=self.weights)
        return avg
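
# A minimal sketch of how the weighted soft vote behaves, on toy numbers
# (not taken from the data set used below): three classifiers, one sample,
# two classes. With weights [1, 1, 2] the weighted average favours class 0
# even though two of the three classifiers lean toward class 1.
_toy_probas = np.array([[[0.4, 0.6]], [[0.5, 0.5]], [[0.9, 0.1]]])
assert np.argmax(np.average(_toy_probas, axis=0, weights=[1, 1, 2])) == 0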
# --- Experiment: tune the ensemble weights via cross-validation ---
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
data = np.loadtxt("Data/data.csv", delimiter=",")
X = data[:, 0:8]
Y = data[:, 8]
print(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
n_feat = X_train.shape[1]
n_targets = int(y_train.max()) + 1
bayes = GaussianNB()
reg = LogisticRegression(C=1, max_iter=5000, tol=1e-08, solver='liblinear')
grad_boost = GradientBoostingClassifier(max_features='log2', loss='exponential', learning_rate=0.1, n_estimators=100, max_depth=30)
forest = RandomForestClassifier(max_features='log2', n_estimators=200, criterion='entropy', max_depth=20)
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200)
svm = SVC(C=1, kernel='linear', gamma=0.1, probability=True)
clf = EnsembleClassifier(clfs=[bayes, reg, grad_boost, forest, ada_boost, svm], weights=[1, 2, 3, 4, 4, 4])
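
# Note: scikit-learn also ships a built-in VotingClassifier covering the
# same use case; a rough soft-voting equivalent of the ensemble above
# (shown as an illustration, not part of the original experiment) would be:
voting = VotingClassifier(
    estimators=[('nb', bayes), ('lr', reg), ('gb', grad_boost),
                ('rf', forest), ('ada', ada_boost), ('svm', svm)],
    voting='soft',
    weights=[1, 2, 3, 4, 4, 4],
)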
df = pd.DataFrame(columns=('w1', 'w2', 'w3', 'w4', 'w5', 'w6', 'mean', 'std'))
i = 0
for w1 in range(1, 4):
    for w2 in range(1, 4):
        for w3 in range(1, 4):
            for w4 in range(1, 4):
                for w5 in range(1, 4):
                    for w6 in range(1, 4):
                        if len(set((w1, w2, w3, w4, w5, w6))) == 1:  # skip if all weights are equal
                            continue
                        clf = EnsembleClassifier(clfs=[bayes, reg, grad_boost, forest, ada_boost, svm],
                                                 weights=[w1, w2, w3, w4, w5, w6])
                        scores = cross_val_score(estimator=clf,
                                                 X=X_train,
                                                 y=y_train,
                                                 cv=3,
                                                 scoring='accuracy',
                                                 n_jobs=2)
                        df.loc[i] = [w1, w2, w3, w4, w5, w6, scores.mean(), scores.std()]
                        i += 1
                        print(i, w1, w2, w3, w4, w5, w6, scores.mean(), scores.std())
df = df.sort_values(by=['mean', 'std'], ascending=False)
print(df)
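
# A follow-up sketch (an addition, not part of the original script): refit
# the ensemble with the best cross-validated weights and evaluate it on the
# held-out test split created above.
best = df.iloc[0]
best_clf = EnsembleClassifier(
    clfs=[bayes, reg, grad_boost, forest, ada_boost, svm],
    weights=[int(best[w]) for w in ('w1', 'w2', 'w3', 'w4', 'w5', 'w6')])
best_clf.fit(X_train, y_train)
print("held-out accuracy:", best_clf.score(X_test, y_test))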