# A decision-tree classifier (ID3-style) built from scratch on entropy and
# information gain.
import numpy as np
import pandas as pd
def calculate_entropy(y):
    """
    Compute the entropy of a label array.
    """
    # Collect the unique class labels appearing in y.
    class_labels = np.unique(y)
    entropy = 0
    for cls in class_labels:
        # y[y == cls] is a boolean-indexed array holding only the elements of y equal to cls.
        p = len(y[y == cls]) / len(y)
        entropy -= p * np.log2(p)
    return entropy
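
# Quick sanity check (a sketch with toy labels that are an illustrative
# assumption, not part of the original dataset): a 50/50 binary split carries
# 1 bit of entropy, while a pure node carries 0 bits.
assert np.isclose(calculate_entropy(np.array([0, 0, 1, 1])), 1.0)
assert np.isclose(calculate_entropy(np.array([1, 1, 1, 1])), 0.0)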
def calculate_information_gain(X, y, feature_index):
    """
    Compute the information gain of a single feature.
    """
    # Entropy of the full label set.
    total_entropy = calculate_entropy(y)
    # Weighted entropy after splitting on this feature.
    values, counts = np.unique(X[:, feature_index], return_counts=True)  # unique values of the feature and their counts
    weighted_entropy = 0
    for v, c in zip(values, counts):
        sub_y = y[X[:, feature_index] == v]  # labels of the samples whose feature value equals v
        # Entropy of this subset.
        entropy = calculate_entropy(sub_y)
        # Accumulate the subset entropy, weighted by the fraction of samples it covers (c / len(X)).
        weighted_entropy += (c / len(X)) * entropy
    '''
    weighted_entropy is the weighted entropy of feature feature_index: the
    weighted average of the subset entropies after partitioning on that feature.
    A decision tree picks the feature with the smallest weighted entropy,
    since that feature yields the largest information gain.
    '''
    # Information gain.
    information_gain = total_entropy - weighted_entropy
    return information_gain
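
# Sanity check on a toy dataset (the _X_toy/_y_toy names are illustrative
# assumptions): feature 0 perfectly predicts y, so its gain equals the total
# entropy (1 bit); feature 1 is uninformative, so its gain is 0.
_X_toy = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
_y_toy = np.array([0, 0, 1, 1])
assert np.isclose(calculate_information_gain(_X_toy, _y_toy, 0), 1.0)
assert np.isclose(calculate_information_gain(_X_toy, _y_toy, 1), 0.0)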
def best_feature_to_split(X, y):
    """Return the index of the feature with the largest information gain."""
    n_features = X.shape[1]
    information_gains = [calculate_information_gain(X, y, i) for i in range(n_features)]
    return np.argmax(information_gains)
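
# On the toy data above, the perfectly predictive feature 0 should win.
assert best_feature_to_split(_X_toy, _y_toy) == 0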
def build_tree(X, y, depth=0, max_depth=10):
    """Recursively build a decision tree."""
    if len(np.unique(y)) == 1 or len(y) == 0:
        # All labels are identical, or there is no data left: return a leaf.
        return np.unique(y)[0] if len(y) > 0 else None
    if depth == max_depth:
        # Maximum depth reached: return the majority class.
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]
    # Pick the best feature to split on.
    feature_index = best_feature_to_split(X, y)
    values = np.unique(X[:, feature_index])
    tree = {feature_index: {}}
    # Build one subtree per observed value of the chosen feature.
    for value in values:
        sub_X = X[X[:, feature_index] == value]
        sub_y = y[X[:, feature_index] == value]
        tree[feature_index][value] = build_tree(sub_X, sub_y, depth + 1, max_depth)
    return tree
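
# On the toy data, a single split on feature 0 separates the classes, so the
# tree is one {feature_index: {value: leaf}} level deep (a sketch, assuming
# the toy arrays defined above).
_toy_tree = build_tree(_X_toy, _y_toy)
assert _toy_tree == {0: {0: 0, 1: 1}}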
# Example data loading and model training.
# Assumes data is an already-loaded DataFrame with 251 feature columns and one label column.
data = pd.read_csv("your_dataset.csv")
X = data.iloc[:, :-1].values  # features
y = data.iloc[:, -1].values   # labels
# Build the decision tree.
decision_tree = build_tree(X, y)
# Prediction with the decision tree (a skeleton only: feature values never
# seen during training fall through to None).
def predict(tree, x):
    """Predict a single sample by walking the decision tree."""
    for feature_index, branches in tree.items():
        value = x[feature_index]
        if value in branches:
            subtree = branches[value]
            if isinstance(subtree, dict):
                return predict(subtree, x)
            else:
                return subtree
        else:
            return None  # value not seen during training
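
# Walking the toy tree: a sample with feature 0 == 1 should land on leaf 1
# (again using the illustrative toy tree, not the real dataset).
assert predict(_toy_tree, np.array([1, 0])) == 1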
if __name__ == '__main__':
    # Prediction example: classify the first sample of the dataset.
    x_new = X[0]
    print(predict(decision_tree, x_new))
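
    # A minimal hold-out evaluation sketch (an assumption about intended use;
    # the original script imported train_test_split but never called it).
    # Note: the tree branches on exact feature values, so this only behaves
    # sensibly for categorical or discretized features.
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    tree = build_tree(X_train, y_train)
    y_pred = [predict(tree, x) for x in X_test]  # may contain None for unseen values
    accuracy = np.mean([p == t for p, t in zip(y_pred, y_test)])
    print(f"hold-out accuracy: {accuracy:.3f}")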