-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTreeBagger.py
104 lines (78 loc) · 2.53 KB
/
TreeBagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import numpy as np
import pandas as pd
from splearn.DecisionTree.DecisionTree import DecisionTree
class TreeBagger:
    '''
    Bagging ensemble of DecisionTrees for targets with at most two classes.

    Every tree is fit on its own random sample of the training rows (drawn
    with replacement) and the ensemble predicts by majority vote.
    '''

    def __init__(self):
        '''
        Creates a new, untrained TreeBagger that can be trained using an
        array of DecisionTrees
        '''
        self.trees = []
        self.features = None
        self.target = None

    def train(
        self,
        features: pd.DataFrame,
        target: pd.Series,
        num_trees: int,
        seed: int,
        subset_frac: float = 0.32,
        gain="entropy",
    ) -> None:
        '''
        Trains N decision trees for bagging prediction

        Any trees left over from a previous call are discarded, so the
        ensemble holds exactly `num_trees` trees afterwards.

        Parameters:
        * features (DataFrame): Features used for training
        * target (Series): Predictions to be trained against; may contain
          at most two distinct values
        * num_trees (int): Number of trees used in the prediction
        * seed (int): Base random seed; each tree is sampled with
          `seed + <number of trees already in the ensemble>`
        * subset_frac (float): Fraction of the training rows drawn (with
          replacement) for each tree
        * gain: Type of gain used to train trees (forwarded to
          DecisionTree.train)

        Raises:
        * ValueError: If `target` holds more than two distinct classes
        '''
        uniques = np.unique(target)

        # The +1/-1 voting in predict() can only separate two labels. Zipping
        # against [0, 1] would silently drop every further class, so reject
        # such targets up front, before any state is modified.
        if len(uniques) > 2:
            raise ValueError(
                "TreeBagger supports at most two target classes, "
                f"got {len(uniques)}"
            )

        # Stage the internal variables for the machine
        self.features = features
        self.target = target
        self.gain = gain
        self.seed = seed
        self.subset_frac = subset_frac

        # Two-way lookup between the internal 0/1 codes and the labels found
        # in the target (np.unique returns them sorted)
        classes = [0, 1]
        self.binary_dict = dict(zip(classes, uniques))
        self.class_dict = dict(zip(uniques, classes))

        # Start from an empty ensemble so trees fitted on earlier data (and
        # decoded with earlier label maps) cannot leak into this model
        self.trees = []
        self.iterate(num_trees)

    def predict(
        self,
        features: pd.DataFrame
    ) -> pd.Series:
        '''
        Predicts a label for every row of `features` by majority vote

        Each tree votes -1 for the class coded 0 and +1 for the class coded
        1; a row whose votes sum to zero or more (ties included, as well as
        an ensemble without trees) is assigned the class coded 1.

        Parameters:
        * features (DataFrame): Rows to predict

        Returns:
        * Series: Predicted labels on a fresh 0..n-1 index, in row order
        '''
        votes = np.zeros(len(features))

        for tree in self.trees:
            # tree.predict is assumed to return a pandas Series of labels
            # (it is consumed through .map) - TODO confirm in DecisionTree
            encoded = tree.predict(features).map(self.class_dict)

            # Rescale the 0/1 codes to -1/+1 and add them position-wise
            votes += np.asarray((encoded - 0.5) * 2, dtype=float)

        winners = pd.Series((votes >= 0).astype(int))
        return winners.map(self.binary_dict)

    def __len__(self) -> int:
        '''
        Returns the number of trees currently in the ensemble
        '''
        return len(self.trees)

    def iterate(
        self,
        iterations
    ):
        '''
        Fits `iterations` additional trees and appends them to the ensemble

        Uses the features, target, seed, subset fraction and gain staged by
        train().

        Parameters:
        * iterations (int): Number of trees to add
        '''
        for _ in range(iterations):
            # Draw `subset_frac` of the rows with replacement. Offsetting the
            # seed by the current tree count gives every tree a different,
            # yet reproducible, sample.
            sub_feat = self.features.sample(
                frac=self.subset_frac,
                random_state=self.seed + len(self),
                replace=True,
            )

            # Pick the matching targets through the sampled index labels
            # (assumes target shares the features' index - TODO confirm)
            sub_targ = self.target[sub_feat.index]

            # Fit a tree on the sampled subset only
            tree = DecisionTree()
            tree.train(
                sub_feat,
                sub_targ,
                gain=self.gain,
            )

            # Add tree to bagger
            self.trees.append(tree)