Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP correct CI errors relating to partial_fit #58

Merged
merged 9 commits into from
Aug 28, 2023
2 changes: 1 addition & 1 deletion sklearn/tree/_tree.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ cdef class TreeBuilder:
cdef double min_weight_leaf # Minimum weight in a leaf
cdef SIZE_t max_depth # Maximal tree depth
cdef double min_impurity_decrease # Impurity threshold for early stopping
cdef object initial_roots # Leaf nodes for streaming updates
cdef cnp.ndarray initial_roots # Leaf nodes for streaming updates

cdef unsigned char store_leaf_values # Whether to store leaf values

Expand Down
31 changes: 24 additions & 7 deletions sklearn/tree/_tree.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -220,19 +220,24 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
X_copy = {}
y_copy = {}
for i in range(X.shape[0]):
# collect depths from the node paths
depth_i = paths[i].indices.shape[0] - 1
PARENT = depth_i - 1
CHILD = depth_i

# find the ID of each leaf node and the ID of its parent node
if PARENT < 0:
parent_i = 0
else:
parent_i = paths[i].indices[PARENT]
child_i = paths[i].indices[CHILD]
left = 0
if tree.children_left[parent_i] == child_i:
left = 1
left = 1 # leaf node is left child

# organize samples by the leaf they fall into (false root)
# leaf nodes are marked by parent node and
# their relative position (left or right child)
if (parent_i, left) in false_roots:
false_roots[(parent_i, left)][0] += 1
X_copy[(parent_i, left)].append(X[i])
Expand All @@ -244,16 +249,20 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):

X_list = []
y_list = []

# reorder the samples by their (parent node ID, is-left-child) key, in descending order
for key, value in reversed(sorted(X_copy.items())):
X_list = X_list + value
y_list = y_list + y_copy[key]
cdef object X_new = np.array(X_list)
cdef cnp.ndarray y_new = np.array(y_list)

# initialize the splitter using sorted samples
cdef Splitter splitter = self.splitter
splitter.init(X_new, y_new, sample_weight, missing_values_in_feature_mask)

self.initial_roots = false_roots
# convert dict to numpy array and store value
self.initial_roots = np.array(list(false_roots.items()))

cpdef build(
self,
Expand All @@ -275,11 +284,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
cdef double min_weight_leaf = self.min_weight_leaf
cdef SIZE_t min_samples_split = self.min_samples_split
cdef double min_impurity_decrease = self.min_impurity_decrease
cdef unsigned char store_leaf_values = self.store_leaf_values
cdef cnp.ndarray initial_roots = self.initial_roots

# Initial capacity
cdef int init_capacity
cdef bint first = 0
if self.initial_roots is None:
if initial_roots is None:
# Recursive partition (without actual recursion)
splitter.init(X, y, sample_weight, missing_values_in_feature_mask)

Expand All @@ -290,6 +301,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):

tree._resize(init_capacity)
first = 1
else:
# convert numpy array back to dict
false_roots = {}
for key_value_pair in initial_roots:
false_roots[tuple(key_value_pair[0])] = key_value_pair[1]

cdef SIZE_t start = 0
cdef SIZE_t end = 0
Expand Down Expand Up @@ -318,7 +334,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):

if not first:
# push reached leaf nodes onto stack
for key, value in reversed(sorted(self.initial_roots.items())):
for key, value in reversed(sorted(false_roots.items())):
end += value[0]
update_stack.push({
"start": start,
Expand Down Expand Up @@ -471,7 +487,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
"lower_bound": left_child_min,
"upper_bound": left_child_max,
})
elif self.store_leaf_values and is_leaf:
elif store_leaf_values and is_leaf:
# copy leaf values to leaf_values array
splitter.node_samples(tree.value_samples[node_id])

Expand Down Expand Up @@ -599,7 +615,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
"lower_bound": left_child_min,
"upper_bound": left_child_max,
})
elif self.store_leaf_values and is_leaf:
elif store_leaf_values and is_leaf:
# copy leaf values to leaf_values array
splitter.node_samples(tree.value_samples[node_id])

Expand Down Expand Up @@ -712,6 +728,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
# Parameters
cdef Splitter splitter = self.splitter
cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes
cdef unsigned char store_leaf_values = self.store_leaf_values

# Recursive partition (without actual recursion)
splitter.init(X, y, sample_weight, missing_values_in_feature_mask)
Expand Down Expand Up @@ -770,7 +787,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
node.feature = _TREE_UNDEFINED
node.threshold = _TREE_UNDEFINED

if self.store_leaf_values:
if store_leaf_values:
# copy leaf values to leaf_values array
splitter.node_samples(tree.value_samples[record.node_id])
else:
Expand Down
Loading