Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP correct CI errors relating to partial_fit #58

Merged
merged 9 commits on Aug 28, 2023 (source and target branch names were not captured in this extract)
1 change: 0 additions & 1 deletion sklearn/ensemble/_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1332,7 +1332,6 @@ def partial_fit(self, X, y, sample_weight=None, classes=None):
Parallel(
n_jobs=self.n_jobs,
verbose=self.verbose,
prefer="threads",
)(
delayed(_parallel_update_trees)(
t,
Expand Down
27 changes: 19 additions & 8 deletions sklearn/tree/_tree.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
SIZE_t max_depth,
double min_impurity_decrease,
unsigned char store_leaf_values=False,
cnp.ndarray initial_roots=None,
object initial_roots=None,
):
self.splitter = splitter
self.min_samples_split = min_samples_split
Expand Down Expand Up @@ -220,19 +220,24 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
X_copy = {}
y_copy = {}
for i in range(X.shape[0]):
# collect depths from the node paths
depth_i = paths[i].indices.shape[0] - 1
PARENT = depth_i - 1
CHILD = depth_i

# find the IDs of each leaf node and of its parent node
if PARENT < 0:
parent_i = 0
else:
parent_i = paths[i].indices[PARENT]
child_i = paths[i].indices[CHILD]
left = 0
if tree.children_left[parent_i] == child_i:
left = 1
left = 1 # leaf node is left child

# organize samples by the leaf they fall into (false root)
# leaf nodes are marked by parent node and
# their relative position (left or right child)
if (parent_i, left) in false_roots:
false_roots[(parent_i, left)][0] += 1
X_copy[(parent_i, left)].append(X[i])
Expand All @@ -244,12 +249,15 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):

X_list = []
y_list = []

# reorder the samples according to parent node IDs
for key, value in reversed(sorted(X_copy.items())):
X_list = X_list + value
y_list = y_list + y_copy[key]
cdef object X_new = np.array(X_list)
cdef cnp.ndarray y_new = np.array(y_list)

# initialize the splitter using sorted samples
cdef Splitter splitter = self.splitter
splitter.init(X_new, y_new, sample_weight, missing_values_in_feature_mask)

Expand All @@ -275,11 +283,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
cdef double min_weight_leaf = self.min_weight_leaf
cdef SIZE_t min_samples_split = self.min_samples_split
cdef double min_impurity_decrease = self.min_impurity_decrease
cdef unsigned char store_leaf_values = self.store_leaf_values
initial_roots = self.initial_roots

# Initial capacity
cdef int init_capacity
cdef bint first = 0
if self.initial_roots is None:
if initial_roots is None:
# Recursive partition (without actual recursion)
splitter.init(X, y, sample_weight, missing_values_in_feature_mask)

Expand Down Expand Up @@ -318,7 +328,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):

if not first:
# push reached leaf nodes onto stack
for key, value in reversed(sorted(self.initial_roots.items())):
for key, value in reversed(sorted(initial_roots.items())):
end += value[0]
update_stack.push({
"start": start,
Expand Down Expand Up @@ -471,7 +481,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
"lower_bound": left_child_min,
"upper_bound": left_child_max,
})
elif self.store_leaf_values and is_leaf:
elif store_leaf_values and is_leaf:
# copy leaf values to leaf_values array
splitter.node_samples(tree.value_samples[node_id])

Expand Down Expand Up @@ -599,7 +609,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
"lower_bound": left_child_min,
"upper_bound": left_child_max,
})
elif self.store_leaf_values and is_leaf:
elif store_leaf_values and is_leaf:
# copy leaf values to leaf_values array
splitter.node_samples(tree.value_samples[node_id])

Expand Down Expand Up @@ -672,7 +682,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
SIZE_t max_leaf_nodes,
double min_impurity_decrease,
unsigned char store_leaf_values=False,
cnp.ndarray initial_roots=None,
object initial_roots=None,
):
self.splitter = splitter
self.min_samples_split = min_samples_split
Expand Down Expand Up @@ -712,6 +722,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
# Parameters
cdef Splitter splitter = self.splitter
cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes
cdef unsigned char store_leaf_values = self.store_leaf_values

# Recursive partition (without actual recursion)
splitter.init(X, y, sample_weight, missing_values_in_feature_mask)
Expand Down Expand Up @@ -770,7 +781,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
node.feature = _TREE_UNDEFINED
node.threshold = _TREE_UNDEFINED

if self.store_leaf_values:
if store_leaf_values:
# copy leaf values to leaf_values array
splitter.node_samples(tree.value_samples[record.node_id])
else:
Expand Down
Loading