algo_chap8_3.py
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 30 15:47:57 2020
@author: Administrator
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Configure matplotlib so Chinese characters display correctly in plots
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
class Node(object):
    def __init__(self):
        self.feature_index = None  # index of the feature this node splits on
        self.split_point = None    # threshold value of the split
        self.deep = None           # depth of this node in the tree
        self.left_tree = None      # subtree for samples with feature value <= split_point
        self.right_tree = None     # subtree for samples with feature value > split_point
        self.leaf_class = None     # predicted class if this node is a leaf, else None
def gini(y, D):
    '''
    Compute the weighted Gini index of the sample set y.
    :param y: sample labels
    :param D: sample weights
    :return: weighted Gini index
    '''
    unique_class = np.unique(y)
    total_weight = np.sum(D)
    gini = 1
    for c in unique_class:
        gini -= (np.sum(D[y == c]) / total_weight) ** 2
    return gini
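# A quick sanity check (hypothetical values, not part of the original script):
# for y = [1, 1, -1] with uniform weights D = [1/3, 1/3, 1/3], the weighted
# Gini index is 1 - (2/3)**2 - (1/3)**2 = 4/9.
# >>> round(gini(np.array([1, 1, -1]), np.ones(3) / 3), 4)
# 0.4444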
def calcMinGiniIndex(a, y, D):
    '''
    Compute the minimum weighted Gini index of the sample set y over all
    candidate split points of feature a.
    :param a: values of a single feature
    :param y: sample labels
    :param D: sample weights
    :return: minimum Gini index and the corresponding split point
    '''
    feature = np.sort(a)
    total_weight = np.sum(D)
    # Candidate split points are the midpoints of adjacent sorted feature values
    split_points = [(feature[i] + feature[i + 1]) / 2 for i in range(feature.shape[0] - 1)]
    min_gini = float('inf')
    min_gini_point = None
    for i in split_points:
        yv1 = y[a <= i]
        yv2 = y[a > i]
        Dv1 = D[a <= i]
        Dv2 = D[a > i]
        gini_tmp = (np.sum(Dv1) * gini(yv1, Dv1) + np.sum(Dv2) * gini(yv2, Dv2)) / total_weight
        if gini_tmp < min_gini:
            min_gini = gini_tmp
            min_gini_point = i
    return min_gini, min_gini_point
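# An illustrative call (toy values, not from the original script): for
# a = [0.1, 0.3, 0.5] the candidate midpoints are [0.2, 0.4], and with
# y = [1, 1, -1] splitting at 0.4 separates the classes perfectly:
# >>> calcMinGiniIndex(np.array([0.1, 0.3, 0.5]),
# ...                  np.array([1, 1, -1]), np.ones(3) / 3)
# (0.0, 0.4)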
def chooseFeatureToSplit(X, y, D):
    '''
    Pick whichever of the two features (density, sugar content) gives the
    lower weighted Gini index.
    :param X: training features
    :param y: training labels
    :param D: sample weights
    :return: feature index, split point
    '''
    gini0, split_point0 = calcMinGiniIndex(X[:, 0], y, D)
    gini1, split_point1 = calcMinGiniIndex(X[:, 1], y, D)
    if gini0 > gini1:
        return 1, split_point1
    else:
        return 0, split_point0
def createSingleTree(X, y, D, deep=0):
    '''
    Build the base learner: a decision tree limited to depth 2, with split
    points chosen by the weighted Gini index (CART-style splitting; C4.5
    would use the gain ratio instead). Whether weighting the Gini index by
    the sample weights is correct is uncertain, but in the watermelon book
    (section 4.4, p. 87) the information gain used for missing values takes
    sample weights into account, so the same idea is borrowed here.
    :param X: training features
    :param y: training labels
    :param D: training sample weights
    :param deep: current depth of the tree
    :return: root node of the tree
    '''
    node = Node()
    node.deep = deep
    if (deep == 2) or (X.shape[0] <= 2):
        # Make this a leaf node when the depth reaches 2 or the branch holds
        # at most 2 samples; the leaf predicts the class with the larger weight
        pos_weight = np.sum(D[y == 1])
        neg_weight = np.sum(D[y == -1])
        if pos_weight > neg_weight:
            node.leaf_class = 1
        else:
            node.leaf_class = -1
        return node
    feature_index, split_point = chooseFeatureToSplit(X, y, D)
    node.feature_index = feature_index
    node.split_point = split_point
    left = X[:, feature_index] <= split_point
    right = X[:, feature_index] > split_point
    node.left_tree = createSingleTree(X[left, :], y[left], D[left], deep + 1)
    node.right_tree = createSingleTree(X[right, :], y[right], D[right], deep + 1)
    return node
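# A minimal illustration (toy values, not from the original script): three
# samples with two features yield a single-split stump, because each child
# branch then holds at most 2 samples and becomes a leaf.
# >>> Xt = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
# >>> t = createSingleTree(Xt, np.array([1, 1, -1]), np.ones(3) / 3)
# >>> t.feature_index, t.split_point, t.left_tree.leaf_class, t.right_tree.leaf_class
# (0, 0.4, 1, -1)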
def predictSingle(tree, x):
    '''
    Predict a single sample with one base learner.
    :param tree: root node of the base learner
    :param x: a single feature vector
    :return: predicted class
    '''
    if tree.leaf_class is not None:
        return tree.leaf_class
    if x[tree.feature_index] > tree.split_point:
        return predictSingle(tree.right_tree, x)
    else:
        return predictSingle(tree.left_tree, x)
def predictBase(tree, X):
    '''
    Predict all samples with one base learner.
    :param tree: root node of the base learner
    :param X: feature matrix
    :return: array of predicted classes
    '''
    result = []
    for i in range(X.shape[0]):
        result.append(predictSingle(tree, X[i, :]))
    return np.array(result)
def adaBoostTrain(X, y, tree_num=20):
    '''
    Train AdaBoost with depth-2 decision trees as base learners.
    :param X: training features
    :param y: training labels in {-1, +1}
    :param tree_num: maximum number of base learners
    :return: list of trees, list of learner weights, aggregated scores
    '''
    D = np.ones(y.shape) / y.shape[0]  # initialise sample weights uniformly
    trees = []  # all base learners
    a = []      # weight of each base learner
    agg_est = np.zeros(y.shape)
    for _ in range(tree_num):
        tree = createSingleTree(X, y, D)
        hx = predictBase(tree, X)
        err_rate = np.sum(D[hx != y])  # weighted training error
        at = np.log((1 - err_rate) / max(err_rate, 1e-16)) / 2
        agg_est += at * hx
        trees.append(tree)
        a.append(at)
        if (err_rate > 0.5) or (err_rate == 0):
            # Stop early when the error rate exceeds 0.5 or the fit is perfect
            break
        # Update each sample weight
        err_index = np.ones(y.shape)
        err_index[hx == y] = -1
        D = D * np.exp(err_index * at)
        D = D / np.sum(D)
    return trees, a, agg_est
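# The update above is the standard AdaBoost rule: with err_index equal to +1
# for misclassified samples and -1 for correct ones, D * np.exp(err_index * at)
# computes D_t(i) * exp(-alpha_t * y_i * h_t(x_i)), so misclassified samples
# are up-weighted by exp(alpha_t) and correct ones down-weighted by
# exp(-alpha_t) before renormalising D to sum to 1.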
def adaBoostPredict(X, trees, a):
    '''
    Predict with the full ensemble: sign of the alpha-weighted vote.
    '''
    agg_est = np.zeros((X.shape[0],))
    for tree, am in zip(trees, a):
        agg_est += am * predictBase(tree, X)
    result = np.ones((X.shape[0],))
    result[agg_est < 0] = -1
    return result.astype(int)
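# This implements H(x) = sign(sum_t alpha_t * h_t(x)), with ties (a zero
# aggregate score) resolved to +1. A minimal usage sketch, assuming
# hypothetical arrays X_train, y_train and X_test:
# >>> trees, a, _ = adaBoostTrain(X_train, y_train, tree_num=10)
# >>> y_pred = adaBoostPredict(X_test, trees, a)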
def pltAdaBoostDecisionBound(X_, y_, trees, a):
    '''
    Plot the training samples and the decision boundary of the ensemble.
    '''
    plt.rcParams['figure.dpi'] = 300  # resolution; must be set before the figure is created
    pos = y_ == 1
    neg = y_ == -1
    x_tmp = np.linspace(0, 1, 600)
    y_tmp = np.linspace(-0.2, 0.7, 600)
    X_tmp, Y_tmp = np.meshgrid(x_tmp, y_tmp)
    Z_ = adaBoostPredict(np.c_[X_tmp.ravel(), Y_tmp.ravel()], trees, a).reshape(X_tmp.shape)
    plt.contour(X_tmp, Y_tmp, Z_, [0], colors='red', linewidths=1)
    plt.scatter(X_[pos, 0], X_[pos, 1], label='好瓜', color='c', marker='+')
    plt.scatter(X_[neg, 0], X_[neg, 1], label='坏瓜', color='k', marker='_')
    plt.xlabel('密度')
    plt.ylabel('含糖率')
    plt.legend()
    plt.show()
if __name__ == "__main__":
    # Expected file format: space-separated columns, the first two being
    # density and sugar content, the third the 0/1 label
    data_path = r'C:\Users\Administrator\Desktop\Data\watermelon3_0a_Ch.txt'
    data = pd.read_table(data_path, delimiter=' ')
    X = data.iloc[:, :2].values
    y = data.iloc[:, 2].values
    y[y == 0] = -1  # map labels from {0, 1} to {-1, +1}
    trees, a, agg_est = adaBoostTrain(X, y)
    pltAdaBoostDecisionBound(X, y, trees, a)