"""Decision tree for Regression is structurally similar to that for classification but differs in how they evaluate splits and make predictions. At each node, the decision tree regressor selects
the feature and split point that minimize the sum of squared residuals between the predicted and actual values of the target variable. For choosing the best feature to spilt, concepts like
Variance reduction or Mean squared error reduction are used for regression. This process continues until a stopping criteria is met, such as reaching a maximum tree depth or no further improvement
in the residual reduction. Each leaf node represents the predicted value for regression tasks. Decision trees for regression are effective for capturing non-linear relationships in data but can
overfit, which can be mitigated using pruning techniques. In this implementation, I'll use Pre-Pruning and to demonstrate all these concepts, I have implemented the algorithm from scratch. I have
used the Red wine quality dataset from Kaggle for this implementation. So, Let's begin !!
NOTE: Most of the functions are similar to the ones in Decison tree classifier. So, here, I have provided description only for the functions that are different from the Decison tree classifier.
NOTE: The sklearn library is used just to prepare the training and testing datasets and to compare our results with the sklearn DecisionTreeRegressor.
"""
# IMPORTING NECESSARY LIBRARIES #
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
# NODE CLASS #
"""The Node class will represent each node in the decision tree. It will store the information about: (1) the 'feature' to split the tree on; (2) the 'threshold' value for splitting;
(3) the 'left' and 'right' child nodes of each node; (4) 'value' which is a predicted target value if the node is a leaf node. The default value for all will be None.
"""
class Node:
def __init__(self, feature=None, threshold=None, left=None, right=None, *,value=None ):
self.feature = feature
self.threshold = threshold
self.left = left
self.right = right
self.value = value
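# Illustrative sketch (hypothetical values, not fitted from data): an internal node that splits on
# feature index 2 at threshold 0.5, with two leaf children holding predicted target values:
#   leaf_low, leaf_high = Node(value=5.1), Node(value=6.4)
#   internal = Node(feature=2, threshold=0.5, left=leaf_low, right=leaf_high)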
# DECISION TREE REGRESSOR #
"""The Decision_Tree_Regressor class is responsible for doing the following: (1) Handle the maximum depth, minimum samples and max features for splits (Pre-pruning); (2) Compute variance
reduction to evaluate splits; (3) Identify the best feature and threshold for splitting the data (Best Split using Variance reduction); (4) Recursively grow the tree by finding the best
split; (5) Predict values for input data by traversing the tree.
"""
class Decision_Tree_Regressor:
    # min_samples_split defaults to 2 (not None) so the size check in grow_tree always has a number to compare against
    def __init__(self, max_depth=None, min_samples_split=2, max_features=None):
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.max_features = max_features
self.root_node = None
def fit(self, x, y):
if not self.max_features:
self.max_features = x.shape[1]
else:
self.max_features = min(x.shape[1], self.max_features)
self.root_node = self.grow_tree(x, y)
def grow_tree(self, x, y, depth=0):
leaf_value = self.calculate_leaf_value(y)
node = Node(value = leaf_value)
        # Check the stopping criteria for the recursion (max depth reached or too few samples to split)
        if (self.max_depth is not None and depth >= self.max_depth) or x.shape[0] < self.min_samples_split:
return node
else:
best_feature, best_threshold = self.best_split(x, y)
if best_feature is None or best_threshold is None:
return node
else:
left_child_node = np.where(x[:, best_feature] <= best_threshold)[0]
right_child_node = np.where(x[:, best_feature] > best_threshold)[0]
if len(left_child_node) == 0 or len(right_child_node) == 0:
return node
left = self.grow_tree( x[left_child_node, :], y[left_child_node], depth + 1 )
right = self.grow_tree( x[right_child_node, :], y[right_child_node], depth + 1 )
return Node(best_feature, best_threshold, left, right)
def best_split(self, x, y):
"""Find the best feature and threshold for splitting the data, using variance reduction.
:param x : feature matrix
:param y : target values
:return : best feature and best threshold value
"""
total_actual_features = x.shape[1]
if self.max_features is not None:
features = np.random.choice(total_actual_features, self.max_features, replace=False)
else:
features = range(total_actual_features)
        best_variance_reduction = -float('inf')
best_feature, best_threshold = None, None
for feature in features:
feature_values = x[:, feature]
thresholds = np.unique(feature_values)
for threshold in thresholds:
left_child_node = y[feature_values <= threshold]
right_child_node = y[feature_values > threshold]
reduced_variance = self.variance_reduction(y, left_child_node, right_child_node)
                if reduced_variance > best_variance_reduction:
                    best_variance_reduction = reduced_variance
best_feature = feature
best_threshold = threshold
return best_feature, best_threshold
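    # Quick worked example of best_split (illustrative toy values, not from the wine data):
    # with x = [[1], [2], [3]] and y = [1., 1., 5.], the candidate thresholds for feature 0 are
    # {1, 2, 3}; splitting at threshold 2 separates y into [1., 1.] and [5.], giving zero child
    # variance and the largest variance reduction, so best_split returns (0, 2).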
def calculate_leaf_value(self, y):
"""Calculate the predicted value for a leaf node (mean of target values)
:param y : target values of the current node
:return : predicted value for the leaf node
"""
return np.mean(y)
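    # e.g. (illustrative) calculate_leaf_value(np.array([5, 6, 6, 7])) returns 6.0, the mean target value.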
def variance_reduction(self, y, left_child_node, right_child_node):
"""Calculate the reduction in variance by splitting the dataset. Variance reduction evaluates
how much a split of a node decreases the variance of the target variable within the child
nodes created by the split.
Formula: VarRed = variance of parent node - weighted average variance of child nodes
:param y : target values
:param left_child_node : left child node of the parent node
:param right_child_node : right child node of the parent node
:return : reduction in variance
"""
if len(left_child_node) == 0 or len(right_child_node) == 0:
return 0
parent_variance = np.var(y)
left_variance = np.var(left_child_node)
right_variance = np.var(right_child_node)
        children_variance = (left_variance * len(left_child_node) / len(y)) + (right_variance * len(right_child_node) / len(y))
        reduced_variance = parent_variance - children_variance
return reduced_variance
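    # Worked example (illustrative numbers, not from the dataset):
    #   parent y = [3, 4, 5, 8, 9]                      -> Var(parent) = 5.36
    #   split into left y = [3, 4, 5], right y = [8, 9] -> Var(left) = 0.667, Var(right) = 0.25
    #   weighted child variance = (3/5) * 0.667 + (2/5) * 0.25 = 0.5
    #   variance reduction = 5.36 - 0.5 = 4.86, so this split would score highly.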
def make_predictions(self, x_test):
"""Predict the target values for the given data using the trained model
:param x_test : test input features
        :return : array of predicted target values
"""
return np.array( [self.traverse_tree(i, self.root_node) for i in x_test] )
def traverse_tree(self, x, node):
"""Traverse through the tree to predict the target value for a single input.
:param x : single input data point
        :param node : current node in the tree (traversal begins from the root_node)
:return : predicted target value
"""
if node.value is not None:
return node.value
if x[node.feature] <= node.threshold:
return self.traverse_tree(x, node.left)
else:
return self.traverse_tree(x, node.right)
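    # e.g. (illustrative) with the hypothetical 'internal' node sketched after the Node class above,
    # traverse_tree(x, internal) goes left and returns 5.1 whenever x[2] <= 0.5, otherwise 6.4.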
def mse(self, y_test, y_predicted):
"""Determine the Mean Squared Error (MSE) between the true and predicted values.
:param y_test : true values of the target variable
:param y_predicted : predicted values of target variable
:return : MSE of the model
"""
return np.mean(np.square(y_test - y_predicted))
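    # e.g. (illustrative) mse(np.array([3., 5.]), np.array([4., 7.])) = ((3-4)**2 + (5-7)**2) / 2 = 2.5,
    # matching sklearn's mean_squared_error for the same inputs.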
# EXAMPLE USAGE ON THE RED WINE QUALITY DATA #
if __name__ == "__main__":
data = pd.read_csv("datasets\decision-tree-regression-data.csv")
X = data.iloc[:, :-1].values
y = data['quality'].values
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# Training the decision tree model without using sklearn library
regressor = Decision_Tree_Regressor(max_depth=50, min_samples_split=3, max_features=None)
regressor.fit(x_train, y_train)
# Training the Sklearn decision tree model
sklearn_regressor = DecisionTreeRegressor(max_depth=70, min_samples_split=3, max_features=None)
sklearn_regressor.fit(x_train, y_train)
# Predictions from both the models
our_predictions = regressor.make_predictions(x_test)
print("\nOur Predictions:\n", our_predictions)
sklearn_prediction = sklearn_regressor.predict(x_test)
print("\nSklearn Predictions:\n", sklearn_prediction)
# MSE from both the models
our_mse = regressor.mse(y_test, our_predictions)
print("\nMean Squared Error from Our model:", round(our_mse, 5))
sklearn_mse = mean_squared_error(y_test, sklearn_prediction)
print("Mean Squared Error from Slearn model:", round(sklearn_mse, 5))
#----If the data has categorical features, they must be encoded before the train-test split----#
"""Identify the categorical features, then apply an encoding technique such as:
One-hot encoding (creates binary 0/1 columns, one per category)
Label Encoding (assigns an arbitrary number to each category, which the model may wrongly treat as ordered)
Ordinal Encoding (similar to label encoding, but use it only when the categories have a natural order, like low, medium, high)
"""