From 8c4a7f627f31a2a635038abd2ad22af671adb559 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 5 Sep 2024 09:40:21 -0400 Subject: [PATCH 01/17] Clean up Cython files Signed-off-by: Adam Li --- treeple/tree/_classes.py | 12 ++--- treeple/tree/_oblique_splitter.pxd | 6 --- treeple/tree/_oblique_splitter.pyx | 42 +++++---------- treeple/tree/_utils.pxd | 38 ++++++++++++-- treeple/tree/_utils.pyx | 65 ++++++++++++++++++------ treeple/tree/manifold/_morf_splitter.pxd | 10 +--- treeple/tree/manifold/_morf_splitter.pyx | 14 ++--- 7 files changed, 109 insertions(+), 78 deletions(-) diff --git a/treeple/tree/_classes.py b/treeple/tree/_classes.py index 16eb6ea52..8a227c859 100644 --- a/treeple/tree/_classes.py +++ b/treeple/tree/_classes.py @@ -820,7 +820,7 @@ class ObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): tree_type = "oblique" - _parameter_constraints = { + _parameter_constraints: dict = { **DecisionTreeClassifier._parameter_constraints, "feature_combinations": [ Interval(Real, 1.0, None, closed="left"), @@ -1283,7 +1283,7 @@ class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): tree_type = "oblique" - _parameter_constraints = { + _parameter_constraints: dict = { **DecisionTreeRegressor._parameter_constraints, "feature_combinations": [ Interval(Real, 1.0, None, closed="left"), @@ -1684,7 +1684,7 @@ class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier) """ tree_type = "oblique" - _parameter_constraints = { + _parameter_constraints: dict = { **DecisionTreeClassifier._parameter_constraints, "min_patch_dims": ["array-like", None], "max_patch_dims": ["array-like", None], @@ -2166,7 +2166,7 @@ class PatchObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """ tree_type = "oblique" - _parameter_constraints = { + _parameter_constraints: dict = { **DecisionTreeRegressor._parameter_constraints, "min_patch_dims": ["array-like", None], "max_patch_dims": ["array-like", None], @@ -2669,7 
+2669,7 @@ class ExtraObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier) tree_type = "oblique" - _parameter_constraints = { + _parameter_constraints: dict = { **DecisionTreeClassifier._parameter_constraints, "feature_combinations": [ Interval(Real, 1.0, None, closed="left"), @@ -3069,7 +3069,7 @@ class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): -0.26552594, -0.00642017, -0.07108117, -0.40726765, -0.40315294]) """ - _parameter_constraints = { + _parameter_constraints: dict = { **DecisionTreeRegressor._parameter_constraints, "feature_combinations": [ Interval(Real, 1.0, None, closed="left"), diff --git a/treeple/tree/_oblique_splitter.pxd b/treeple/tree/_oblique_splitter.pxd index 124a66dd6..65ca16e14 100644 --- a/treeple/tree/_oblique_splitter.pxd +++ b/treeple/tree/_oblique_splitter.pxd @@ -83,12 +83,6 @@ cdef class BaseObliqueSplitter(Splitter): SplitRecord* split, ) except -1 nogil - cdef inline void fisher_yates_shuffle_memview( - self, - intp_t[::1] indices_to_sample, - intp_t grid_size, - uint32_t* random_state - ) noexcept nogil cdef class ObliqueSplitter(BaseObliqueSplitter): # The splitter searches in the input space for a linear combination of features and a threshold diff --git a/treeple/tree/_oblique_splitter.pyx b/treeple/tree/_oblique_splitter.pyx index ca77a30ac..0cceac664 100644 --- a/treeple/tree/_oblique_splitter.pyx +++ b/treeple/tree/_oblique_splitter.pyx @@ -11,6 +11,7 @@ from libcpp.vector cimport vector from .._lib.sklearn.tree._criterion cimport Criterion from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform +from ._utils cimport fisher_yates_shuffle cdef float64_t INFINITY = np.inf @@ -46,8 +47,12 @@ cdef class BaseObliqueSplitter(Splitter): def __setstate__(self, d): pass - cdef int node_reset(self, intp_t start, intp_t end, - float64_t* weighted_n_node_samples) except -1 nogil: + cdef int node_reset( + self, + intp_t start, + intp_t end, + float64_t* weighted_n_node_samples + ) 
except -1 nogil: """Reset splitter on node samples[start:end]. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -62,17 +67,7 @@ cdef class BaseObliqueSplitter(Splitter): weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ - - self.start = start - self.end = end - - self.criterion.init(self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples) - self.criterion.set_sample_pointers(start, end) - - weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples + Splitter.node_reset(self, start, end, weighted_n_node_samples) # Clear all projection vectors for i in range(self.max_features): @@ -102,8 +97,8 @@ cdef class BaseObliqueSplitter(Splitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]* proj_vec_weights, # weights of the vector (n_non_zeros,) + vector[intp_t]* proj_vec_indices # indices of the features (n_non_zeros,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. @@ -126,19 +121,6 @@ cdef class BaseObliqueSplitter(Splitter): feature_values[idx] = 0.0 feature_values[idx] += self.X[samples[idx], col_idx] * col_weight - cdef inline void fisher_yates_shuffle_memview( - self, - intp_t[::1] indices_to_sample, - intp_t grid_size, - uint32_t* random_state, - ) noexcept nogil: - cdef intp_t i, j - - # XXX: should this be `i` or `i+1`? for valid Fisher-Yates? 
- for i in range(0, grid_size - 1): - j = rand_int(i, grid_size, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] cdef class ObliqueSplitter(BaseObliqueSplitter): def __cinit__( @@ -257,7 +239,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): cdef intp_t grid_size = self.max_features * self.n_features # shuffle indices over the 2D grid to sample using Fisher-Yates - self.fisher_yates_shuffle_memview(indices_to_sample, grid_size, random_state) + fisher_yates_shuffle(indices_to_sample, grid_size, random_state) # sample 'n_non_zeros' in a mtry X n_features projection matrix # which consists of +/- 1's chosen at a 1/2s rate @@ -309,7 +291,7 @@ cdef class BestObliqueSplitter(ObliqueSplitter): cdef intp_t end = self.end # pointer array to store feature values to split on - cdef float32_t[::1] feature_values = self.feature_values + cdef float32_t[::1] feature_values = self.feature_values cdef intp_t max_features = self.max_features cdef intp_t min_samples_leaf = self.min_samples_leaf diff --git a/treeple/tree/_utils.pxd b/treeple/tree/_utils.pxd index c814cc166..dc6cb5b71 100644 --- a/treeple/tree/_utils.pxd +++ b/treeple/tree/_utils.pxd @@ -1,3 +1,5 @@ +from libcpp.vector cimport vector + import numpy as np cimport numpy as cnp @@ -7,15 +9,41 @@ cnp.import_array() from .._lib.sklearn.tree._splitter cimport SplitRecord from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t +ctypedef fused vector_or_memview: + vector[intp_t] + intp_t[::1] + intp_t[:] + + +cdef inline void fisher_yates_shuffle( + vector_or_memview indices_to_sample, + intp_t grid_size, + uint32_t* random_state, +) noexcept nogil -cdef int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil + +cdef int rand_weighted_binary( + float64_t p0, + uint32_t* random_state +) noexcept nogil cpdef unravel_index( - intp_t index, cnp.ndarray[intp_t, ndim=1] shape + intp_t index, + cnp.ndarray[intp_t, 
ndim=1] shape ) -cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape) +cpdef ravel_multi_index( + intp_t[:] coords, + const intp_t[:] shape +) -cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil +cdef void unravel_index_cython( + intp_t index, + const intp_t[:] shape, + vector_or_memview coords +) noexcept nogil -cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil +cdef intp_t ravel_multi_index_cython( + vector_or_memview coords, + const intp_t[:] shape +) noexcept nogil diff --git a/treeple/tree/_utils.pyx b/treeple/tree/_utils.pyx index 197b82ecf..7ce48977b 100644 --- a/treeple/tree/_utils.pyx +++ b/treeple/tree/_utils.pyx @@ -11,10 +11,40 @@ cimport numpy as cnp cnp.import_array() -from .._lib.sklearn.tree._utils cimport rand_uniform +from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform -cdef inline int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil: +cdef inline void fisher_yates_shuffle( + vector_or_memview indices_to_sample, + intp_t grid_size, + uint32_t* random_state, +) noexcept nogil: + """Shuffle the indices in place using the Fisher-Yates algorithm. + Parameters + ---------- + indices_to_sample : A C++ vector or 1D memoryview + The indices to shuffle. + grid_size : intp_t + The size of the grid to shuffle. This is explicitly passed in + to support the templated `vector_or_memview` type, which allows + for both C++ vectors and Cython memoryviews. Getting the length + of both types uses different API. + random_state : uint32_t* + The random state. + """ + cdef intp_t i, j + + # XXX: should this be `i` or `i+1`? for valid Fisher-Yates? 
+ for i in range(0, grid_size - 1): + j = rand_int(i, grid_size, random_state) + indices_to_sample[j], indices_to_sample[i] = \ + indices_to_sample[i], indices_to_sample[j] + + +cdef inline int rand_weighted_binary( + float64_t p0, + uint32_t* random_state +) noexcept nogil: """Sample from integers 0 and 1 with different probabilities. Parameters @@ -54,7 +84,9 @@ cpdef unravel_index( index = np.intp(index) shape = np.array(shape) coords = np.empty(shape.shape[0], dtype=np.intp) - unravel_index_cython(index, shape, coords) + cdef const intp_t[:] shape_memview = shape + cdef intp_t[:] coords_memview = coords + unravel_index_cython(index, shape_memview, coords_memview) return coords @@ -83,7 +115,11 @@ cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape): return ravel_multi_index_cython(coords, shape) -cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil: +cdef inline void unravel_index_cython( + intp_t index, + const intp_t[:] shape, + vector_or_memview coords +) noexcept nogil: """Converts a flat index into a tuple of coordinate arrays. Parameters @@ -92,13 +128,9 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co The flat index to be converted. shape : numpy.ndarray[intp_t, ndim=1] The shape of the array into which the flat index should be converted. - coords : numpy.ndarray[intp_t, ndim=1] - A preinitialized memoryview array of coordinate arrays to be converted. - - Returns - ------- - numpy.ndarray[intp_t, ndim=1] - An array of coordinate arrays, with each coordinate array having the same shape as the input `shape`. + coords : intp_t[:] or vector[intp_t] + A preinitialized array of coordinates to store the result of the + unraveled `index`. 
""" cdef intp_t ndim = shape.shape[0] cdef intp_t j, size @@ -109,13 +141,16 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co index //= size -cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil: - """Converts a tuple of coordinate arrays into a flat index. +cdef inline intp_t ravel_multi_index_cython( + vector_or_memview coords, + const intp_t[:] shape +) noexcept nogil: + """Converts a tuple of coordinate arrays into a flat index in the vectorized dimension. Parameters ---------- - coords : numpy.ndarray[intp_t, ndim=1] - An array of coordinate arrays to be converted. + coords : intp_t[:] or vector[intp_t] + An array of coordinates to be converted and vectorized into a single flat index. shape : numpy.ndarray[intp_t, ndim=1] The shape of the array into which the coordinates should be converted. diff --git a/treeple/tree/manifold/_morf_splitter.pxd b/treeple/tree/manifold/_morf_splitter.pxd index a0a61a4de..2b65fd3ba 100644 --- a/treeple/tree/manifold/_morf_splitter.pxd +++ b/treeple/tree/manifold/_morf_splitter.pxd @@ -32,14 +32,6 @@ cdef class PatchSplitter(BestObliqueSplitter): # an input data vector. The input data is vectorized, so `data_height` and # `data_width` are used to determine the vectorized indices corresponding to # (x,y) coordinates in the original un-vectorized data. 
- - cdef public intp_t max_patch_height # Maximum height of the patch to sample - cdef public intp_t max_patch_width # Maximum width of the patch to sample - cdef public intp_t min_patch_height # Minimum height of the patch to sample - cdef public intp_t min_patch_width # Minimum width of the patch to sample - cdef public intp_t data_height # Height of the input data - cdef public intp_t data_width # Width of the input data - cdef public intp_t ndim # The number of dimensions of the input data cdef const intp_t[:] data_dims # The dimensions of the input data @@ -56,7 +48,7 @@ cdef class PatchSplitter(BestObliqueSplitter): cdef intp_t[::1] _index_data_buffer cdef intp_t[::1] _index_patch_buffer - cdef intp_t[:] patch_dims_buff # A buffer to store the dimensions of the sampled patch + cdef intp_t[:] patch_sampled_size # A buffer to store the dimensions of the sampled patch cdef intp_t[:] unraveled_patch_point # A buffer to store the unraveled patch point # All oblique splitters (i.e. non-axis aligned splitters) require a diff --git a/treeple/tree/manifold/_morf_splitter.pyx b/treeple/tree/manifold/_morf_splitter.pyx index d6c8d0121..f1eaf2918 100644 --- a/treeple/tree/manifold/_morf_splitter.pyx +++ b/treeple/tree/manifold/_morf_splitter.pyx @@ -151,7 +151,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.data_dims = data_dims # create a buffer for storing the patch dimensions sampled per projection matrix - self.patch_dims_buff = np.zeros(data_dims.shape[0], dtype=np.intp) + self.patch_sampled_size = np.zeros(data_dims.shape[0], dtype=np.intp) self.unraveled_patch_point = np.zeros(data_dims.shape[0], dtype=np.intp) # store the min and max patch dimension constraints @@ -237,7 +237,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): top_left_patch_seed = rand_int(0, delta_patch_dim, random_state) # write to buffer - self.patch_dims_buff[idx] = patch_dim + self.patch_sampled_size[idx] = patch_dim patch_size *= patch_dim elif self.boundary == 
"wrap": # add circular boundary conditions @@ -251,7 +251,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # resample the patch dimension due to padding patch_dim = min(patch_dim, min(dim+1, self.data_dims[idx] + patch_dim - dim - 1)) - self.patch_dims_buff[idx] = patch_dim + self.patch_sampled_size[idx] = patch_dim patch_size *= patch_dim # TODO: make this work @@ -283,7 +283,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): cdef intp_t top_left_patch_seed # size of the sampled patch, which is just the size of the n-dim patch - # (\prod_i self.patch_dims_buff[i]) + # (\prod_i self.patch_sampled_size[i]) cdef intp_t patch_size for proj_i in range(0, max_features): @@ -299,7 +299,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): proj_i, patch_size, top_left_patch_seed, - self.patch_dims_buff + self.patch_sampled_size ) cdef void sample_proj_vec( @@ -389,7 +389,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): if not self.dim_contiguous[idx]: row_index += ( (self.unraveled_patch_point[idx] // other_dims_offset) % - self.patch_dims_buff[idx] + self.patch_sampled_size[idx] ) * other_dims_offset other_dims_offset //= self.data_dims[idx] @@ -445,7 +445,7 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): """A class to expose a Python interface for testing.""" cpdef sample_top_left_seed_cpdef(self): top_left_patch_seed, patch_size = self.sample_top_left_seed() - patch_dims = np.array(self.patch_dims_buff, dtype=np.intp) + patch_dims = np.array(self.patch_sampled_size, dtype=np.intp) return top_left_patch_seed, patch_size, patch_dims cpdef sample_projection_vector( From 1c46a5b8abe5aa43d996d29d961370a0486e5b26 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 5 Sep 2024 10:14:24 -0400 Subject: [PATCH 02/17] Try again Signed-off-by: Adam Li --- treeple/meson.build | 1 + 1 file changed, 1 insertion(+) diff --git a/treeple/meson.build b/treeple/meson.build index 3d1715dbe..f72b36b51 100644 --- a/treeple/meson.build +++ 
b/treeple/meson.build @@ -106,6 +106,7 @@ scikit_learn_cython_args = [ # Needed for cython imports across subpackages, e.g. cluster pyx that # cimports metrics pxd '--include-dir', meson.global_build_root(), + '--embedsignature', 'True', ] cython_c_args += scikit_learn_cython_args From cbc79aad1d07cf2dde29851ad224e91731746fce Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 5 Sep 2024 10:22:04 -0400 Subject: [PATCH 03/17] Adding submodule update Signed-off-by: Adam Li --- treeple/_lib/sklearn_fork | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index ac5cb8abd..600187a53 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit ac5cb8abd5c9b425c3c02a2be1d91296adf643a3 +Subproject commit 600187a53a8c1bee0b7092d69adda9064e3c0dbc From 84826e28afecc9de80bb7be790d12125797ba1bb Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 5 Sep 2024 10:25:10 -0400 Subject: [PATCH 04/17] Fixed Signed-off-by: Adam Li --- treeple/meson.build | 2 +- treeple/tree/_utils.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/treeple/meson.build b/treeple/meson.build index f72b36b51..4801d0536 100644 --- a/treeple/meson.build +++ b/treeple/meson.build @@ -103,10 +103,10 @@ scikit_learn_cython_args = [ '-X language_level=3', '-X boundscheck=' + boundscheck, '-X wraparound=False', '-X initializedcheck=False', '-X nonecheck=False', '-X cdivision=True', '-X profile=False', + '-X embedsignature=True', # Needed for cython imports across subpackages, e.g. 
cluster pyx that # cimports metrics pxd '--include-dir', meson.global_build_root(), - '--embedsignature', 'True', ] cython_c_args += scikit_learn_cython_args diff --git a/treeple/tree/_utils.pxd b/treeple/tree/_utils.pxd index dc6cb5b71..ba2707791 100644 --- a/treeple/tree/_utils.pxd +++ b/treeple/tree/_utils.pxd @@ -15,7 +15,7 @@ ctypedef fused vector_or_memview: intp_t[:] -cdef inline void fisher_yates_shuffle( +cdef void fisher_yates_shuffle( vector_or_memview indices_to_sample, intp_t grid_size, uint32_t* random_state, From a3df2d957d179b8874ee524a2f26a170db42952b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 15:54:33 -0400 Subject: [PATCH 05/17] Try with new commit Signed-off-by: Adam Li --- treeple/_lib/meson.build | 63 ++++++++++++++++++++------------------- treeple/_lib/sklearn_fork | 2 +- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/treeple/_lib/meson.build b/treeple/_lib/meson.build index 5dd37c868..47648036c 100644 --- a/treeple/_lib/meson.build +++ b/treeple/_lib/meson.build @@ -60,37 +60,38 @@ endforeach # TODO: Can remove if included in scikit-learn eventually # install tree/ submodule -extensions = [ - '_quad_tree', -] +# extensions = [ +# '_quad_tree', +# ] -foreach ext: extensions - py.extension_module( - ext, - ['./sklearn/neighbors/' + ext + '.pyx'], - c_args: c_args, - dependencies: [np_dep], - cython_args: cython_c_args, - override_options : ['optimization=3', 'cython_language=cpp'], - install: true, - subdir: 'treeple/_lib/sklearn/neighbors/', - ) -endforeach +# foreach ext: extensions +# py.extension_module( +# ext, +# ['./sklearn/neighbors/' + ext + '.pyx'], +# c_args: c_args, +# dependencies: [np_dep], +# cython_args: cython_c_args, +# override_options : ['optimization=3', 'cython_language=cpp'], +# install: true, +# subdir: 'treeple/_lib/sklearn/neighbors/', +# ) +# endforeach -# install tree/ submodule -extensions = [ - '_typedefs', - '_random', -] +# # install tree/ submodule +# extensions = [ +# 
'_typedefs', +# '_random', +# ] + +# foreach ext: extensions +# py.extension_module(ext, +# ['./sklearn/utils/' + ext + '.pyx'], +# c_args: c_args, +# dependencies: [np_dep], +# cython_args: cython_c_args, +# override_options : ['optimization=3', 'cython_language=cpp'], +# install: true, +# subdir: 'treeple/_lib/sklearn/utils/', +# ) +# endforeach -foreach ext: extensions - py.extension_module(ext, - ['./sklearn/utils/' + ext + '.pyx'], - c_args: c_args, - dependencies: [np_dep], - cython_args: cython_c_args, - override_options : ['optimization=3', 'cython_language=cpp'], - install: true, - subdir: 'treeple/_lib/sklearn/utils/', - ) -endforeach diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index 600187a53..ee4b9b777 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 600187a53a8c1bee0b7092d69adda9064e3c0dbc +Subproject commit ee4b9b777600a1c4da322c4f703b665037d97a3c From d9de2aeffeaa41f45ec0d1b2fcf698a1ba0d375c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 16:07:50 -0400 Subject: [PATCH 06/17] Update submodule Signed-off-by: Adam Li --- treeple/__init__.py | 3 +- treeple/_lib/meson.build | 113 ++++++++++++++++++++++---------------- treeple/_lib/sklearn_fork | 2 +- 3 files changed, 68 insertions(+), 50 deletions(-) diff --git a/treeple/__init__.py b/treeple/__init__.py index 2a70afefe..a90b095d4 100644 --- a/treeple/__init__.py +++ b/treeple/__init__.py @@ -64,7 +64,8 @@ msg = """Error importing treeple: you cannot import treeple while being in treeple source directory; please exit the treeple source tree first and relaunch your Python interpreter.""" - raise ImportError(msg) from e + raise Exception(e) + # raise ImportError(msg) from e __all__ = [ "_lib", diff --git a/treeple/_lib/meson.build b/treeple/_lib/meson.build index 47648036c..e8c0fb1fc 100644 --- a/treeple/_lib/meson.build +++ b/treeple/_lib/meson.build @@ -3,60 +3,76 @@ if not fs.exists('sklearn') error('Missing the 
`sklearn` fork submodule! Run `git submodule update --init` to fix this.') endif +tempita = files('./sklearn/_build_utils/tempita.py') + +# Write file in Meson build dir to be able to figure out from Python code +# whether scikit-learn was built with Meson. Adapted from pandas +# _version_meson.py. +option('enable_custom_target', type: 'boolean', value: false, description: 'Enable custom target') + +custom_target('write_built_with_meson_file', + output: '_built_with_meson.py', + command: [ + py, '-c', 'with open("./treeple/_lib/sklearn/_built_with_meson.py", "w") as f: f.write("")' + ], + install: true, + install_dir: py.get_install_dir() / 'sklearn' +) + # install tree/ submodule -tree_extension_metadata = { - '_tree': - {'sources': ['./sklearn/tree/' + '_tree.pyx'], - 'override_options': ['cython_language=cpp', 'optimization=3']}, - '_partitioner': - {'sources': ['./sklearn/tree/' + '_partitioner.pyx'], - 'override_options': ['cython_language=cpp', 'optimization=3']}, - '_splitter': - {'sources': ['./sklearn/tree/' + '_splitter.pyx'], - 'override_options': ['cython_language=cpp', 'optimization=3']}, - '_criterion': - {'sources': ['./sklearn/tree/' + '_criterion.pyx'], - 'override_options': ['cython_language=cpp', 'optimization=3']}, - '_utils': - {'sources': ['./sklearn/tree/' + '_utils.pyx'], - 'override_options': ['cython_language=cpp', 'optimization=3']}, -} +# tree_extension_metadata = { +# '_tree': +# {'sources': ['./sklearn/tree/' + '_tree.pyx'], +# 'override_options': ['cython_language=cpp', 'optimization=3']}, +# '_partitioner': +# {'sources': ['./sklearn/tree/' + '_partitioner.pyx'], +# 'override_options': ['cython_language=cpp', 'optimization=3']}, +# '_splitter': +# {'sources': ['./sklearn/tree/' + '_splitter.pyx'], +# 'override_options': ['cython_language=cpp', 'optimization=3']}, +# '_criterion': +# {'sources': ['./sklearn/tree/' + '_criterion.pyx'], +# 'override_options': ['cython_language=cpp', 'optimization=3']}, +# '_utils': +# {'sources': 
['./sklearn/tree/' + '_utils.pyx'], +# 'override_options': ['cython_language=cpp', 'optimization=3']}, +# } -foreach ext_name, ext_dict : tree_extension_metadata - py.extension_module( - ext_name, - ext_dict.get('sources'), - dependencies: [np_dep], - override_options : ext_dict.get('override_options', []), - cython_args: cython_c_args, - subdir: 'treeple/_lib/sklearn/tree/', - install: true - ) -endforeach +# foreach ext_name, ext_dict : tree_extension_metadata +# py.extension_module( +# ext_name, +# ext_dict.get('sources'), +# dependencies: [np_dep], +# override_options : ext_dict.get('override_options', []), +# cython_args: cython_c_args, +# subdir: 'treeple/_lib/sklearn/tree/', +# install: true +# ) +# endforeach -python_sources = [ - './sklearn/tree/__init__.py', - './sklearn/tree/_classes.py', - './sklearn/tree/_export.py', - './sklearn/tree/_reingold_tilford.py', -] +# python_sources = [ +# './sklearn/tree/__init__.py', +# './sklearn/tree/_classes.py', +# './sklearn/tree/_export.py', +# './sklearn/tree/_reingold_tilford.py', +# ] -py.install_sources( - python_sources, - subdir: 'treeple/_lib/sklearn/tree' # Folder relative to site-packages to install to -) +# py.install_sources( +# python_sources, +# subdir: 'treeple/_lib/sklearn/tree' # Folder relative to site-packages to install to +# ) -# install ensemble/ submodule -python_sources = [ - '_forest.py', -] -foreach py_source: python_sources - py.install_sources( - './sklearn/ensemble/' + py_source, - subdir: 'treeple/_lib/sklearn/ensemble' - ) -endforeach +# # install ensemble/ submodule +# python_sources = [ +# '_forest.py', +# ] +# foreach py_source: python_sources +# py.install_sources( +# './sklearn/ensemble/' + py_source, +# subdir: 'treeple/_lib/sklearn/ensemble' +# ) +# endforeach # TODO: Can remove if included in scikit-learn eventually # install tree/ submodule @@ -95,3 +111,4 @@ endforeach # ) # endforeach +subdir('sklearn') \ No newline at end of file diff --git a/treeple/_lib/sklearn_fork 
b/treeple/_lib/sklearn_fork index ee4b9b777..d3788bfa4 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit ee4b9b777600a1c4da322c4f703b665037d97a3c +Subproject commit d3788bfa41df61ecba8d1281ae175e74f9558dda From 500906b6338e0eaed1de9fb17d30e5dbfac07be1 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 16:10:55 -0400 Subject: [PATCH 07/17] Update submodule Signed-off-by: Adam Li --- treeple/_lib/meson_options.txt | 1 + treeple/_lib/sklearn_fork | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 treeple/_lib/meson_options.txt diff --git a/treeple/_lib/meson_options.txt b/treeple/_lib/meson_options.txt new file mode 100644 index 000000000..7bce7ab3d --- /dev/null +++ b/treeple/_lib/meson_options.txt @@ -0,0 +1 @@ +option('enable_custom_target', type: 'boolean', value: false, description: 'Enable custom target') diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index d3788bfa4..8f32f299b 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit d3788bfa41df61ecba8d1281ae175e74f9558dda +Subproject commit 8f32f299ba28d276bc031f6b185006bd0a52a9cd From 315d1c2c1a434e1376c3914b98374563043bfa3b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 16:16:46 -0400 Subject: [PATCH 08/17] Update submodule Signed-off-by: Adam Li --- treeple/_lib/meson.build | 20 ++++++++++---------- treeple/_lib/sklearn_fork | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/treeple/_lib/meson.build b/treeple/_lib/meson.build index e8c0fb1fc..57f325d9a 100644 --- a/treeple/_lib/meson.build +++ b/treeple/_lib/meson.build @@ -8,16 +8,15 @@ tempita = files('./sklearn/_build_utils/tempita.py') # Write file in Meson build dir to be able to figure out from Python code # whether scikit-learn was built with Meson. Adapted from pandas # _version_meson.py. 
-option('enable_custom_target', type: 'boolean', value: false, description: 'Enable custom target') -custom_target('write_built_with_meson_file', - output: '_built_with_meson.py', - command: [ - py, '-c', 'with open("./treeple/_lib/sklearn/_built_with_meson.py", "w") as f: f.write("")' - ], - install: true, - install_dir: py.get_install_dir() / 'sklearn' -) +# custom_target('write_built_with_meson_file', +# output: '_built_with_meson.py', +# command: [ +# py, '-c', 'with open("./treeple/_lib/sklearn/_built_with_meson.py", "w") as f: f.write("")' +# ], +# install: true, +# install_dir: py.get_install_dir() / 'sklearn' +# ) # install tree/ submodule # tree_extension_metadata = { @@ -111,4 +110,5 @@ custom_target('write_built_with_meson_file', # ) # endforeach -subdir('sklearn') \ No newline at end of file +# subdir('sklearn') +subproject('sklearn', default_options: ['enable_custom_target=false']) diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index 8f32f299b..8c6be9f30 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 8f32f299ba28d276bc031f6b185006bd0a52a9cd +Subproject commit 8c6be9f3024f3519cfa2159e2db2c5125c3e9e56 From f69d1a812a8d023f2fc56a726ba169f50168e177 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sun, 8 Sep 2024 21:01:24 -0400 Subject: [PATCH 09/17] Fix submodule Signed-off-by: Adam Li --- meson.build | 2 +- treeple/__init__.py | 1 + treeple/_lib/meson.build | 39 +++++++++++++++++----------------- treeple/_lib/meson_options.txt | 1 - treeple/_lib/sklearn_fork | 2 +- treeple/tree/__init__.py | 12 +++++------ 6 files changed, 29 insertions(+), 28 deletions(-) delete mode 100644 treeple/_lib/meson_options.txt diff --git a/meson.build b/meson.build index 26f909dea..07ec4c9c2 100644 --- a/meson.build +++ b/meson.build @@ -8,7 +8,7 @@ project( license: 'PolyForm Noncommercial 1.0.0', meson_version: '>= 1.1.0', default_options: [ - 'c_std=c99', + 'c_std=c11', 'cpp_std=c++14', ], ) diff --git 
a/treeple/__init__.py b/treeple/__init__.py index a90b095d4..dafad7deb 100644 --- a/treeple/__init__.py +++ b/treeple/__init__.py @@ -22,6 +22,7 @@ # https://github.com/ContinuumIO/anaconda-issues/issues/11294 os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE") + try: # This variable is injected in the __builtins__ by the build # process. It is used to enable importing subpackages of sklearn when diff --git a/treeple/_lib/meson.build b/treeple/_lib/meson.build index 57f325d9a..7593c2614 100644 --- a/treeple/_lib/meson.build +++ b/treeple/_lib/meson.build @@ -3,21 +3,6 @@ if not fs.exists('sklearn') error('Missing the `sklearn` fork submodule! Run `git submodule update --init` to fix this.') endif -tempita = files('./sklearn/_build_utils/tempita.py') - -# Write file in Meson build dir to be able to figure out from Python code -# whether scikit-learn was built with Meson. Adapted from pandas -# _version_meson.py. - -# custom_target('write_built_with_meson_file', -# output: '_built_with_meson.py', -# command: [ -# py, '-c', 'with open("./treeple/_lib/sklearn/_built_with_meson.py", "w") as f: f.write("")' -# ], -# install: true, -# install_dir: py.get_install_dir() / 'sklearn' -# ) - # install tree/ submodule # tree_extension_metadata = { # '_tree': @@ -73,8 +58,8 @@ tempita = files('./sklearn/_build_utils/tempita.py') # ) # endforeach -# TODO: Can remove if included in scikit-learn eventually -# install tree/ submodule +# # TODO: Can remove if included in scikit-learn eventually +# # install tree/ submodule # extensions = [ # '_quad_tree', # ] @@ -110,5 +95,21 @@ tempita = files('./sklearn/_build_utils/tempita.py') # ) # endforeach -# subdir('sklearn') -subproject('sklearn', default_options: ['enable_custom_target=false']) + +python_sources = [ + '__init__.py', +] + +py.install_sources( + python_sources, + subdir: 'treeple/_lib' # Folder relative to site-packages to install to +) + +tempita = files('./sklearn/_build_utils/tempita.py') + +# Copy all the .py files to the 
install dir, rather than using +# py.install_sources and needing to list them explicitely one by one +# install_subdir('sklearn', install_dir: py.get_install_dir()) +install_subdir('sklearn', install_dir: join_paths(py.get_install_dir(), 'treeple/_lib')) + +subdir('sklearn') diff --git a/treeple/_lib/meson_options.txt b/treeple/_lib/meson_options.txt deleted file mode 100644 index 7bce7ab3d..000000000 --- a/treeple/_lib/meson_options.txt +++ /dev/null @@ -1 +0,0 @@ -option('enable_custom_target', type: 'boolean', value: false, description: 'Enable custom target') diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index 8c6be9f30..5b074dd38 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 8c6be9f3024f3519cfa2159e2db2c5125c3e9e56 +Subproject commit 5b074dd386af2791c57c556c89a65528e62a3c15 diff --git a/treeple/tree/__init__.py b/treeple/tree/__init__.py index 797338ac3..0af9f713f 100644 --- a/treeple/tree/__init__.py +++ b/treeple/tree/__init__.py @@ -1,9 +1,9 @@ -from .._lib.sklearn.tree import ( - DecisionTreeClassifier, - DecisionTreeRegressor, - ExtraTreeClassifier, - ExtraTreeRegressor, -) +# from .._lib.sklearn.tree import ( +# DecisionTreeClassifier, +# DecisionTreeRegressor, +# ExtraTreeClassifier, +# ExtraTreeRegressor, +# ) from ._classes import ( ExtraObliqueDecisionTreeClassifier, ExtraObliqueDecisionTreeRegressor, From be2b655f258edb10c9c3685af7b7233116fd5598 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sun, 8 Sep 2024 21:06:44 -0400 Subject: [PATCH 10/17] New submodule-s --- treeple/_lib/meson.build | 196 +++++++++++++++++++------------------- treeple/_lib/sklearn_fork | 2 +- 2 files changed, 99 insertions(+), 99 deletions(-) diff --git a/treeple/_lib/meson.build b/treeple/_lib/meson.build index 7593c2614..ae83cf4a5 100644 --- a/treeple/_lib/meson.build +++ b/treeple/_lib/meson.build @@ -4,112 +4,112 @@ if not fs.exists('sklearn') endif # install tree/ submodule -# 
tree_extension_metadata = { -# '_tree': -# {'sources': ['./sklearn/tree/' + '_tree.pyx'], -# 'override_options': ['cython_language=cpp', 'optimization=3']}, -# '_partitioner': -# {'sources': ['./sklearn/tree/' + '_partitioner.pyx'], -# 'override_options': ['cython_language=cpp', 'optimization=3']}, -# '_splitter': -# {'sources': ['./sklearn/tree/' + '_splitter.pyx'], -# 'override_options': ['cython_language=cpp', 'optimization=3']}, -# '_criterion': -# {'sources': ['./sklearn/tree/' + '_criterion.pyx'], -# 'override_options': ['cython_language=cpp', 'optimization=3']}, -# '_utils': -# {'sources': ['./sklearn/tree/' + '_utils.pyx'], -# 'override_options': ['cython_language=cpp', 'optimization=3']}, -# } - - -# foreach ext_name, ext_dict : tree_extension_metadata -# py.extension_module( -# ext_name, -# ext_dict.get('sources'), -# dependencies: [np_dep], -# override_options : ext_dict.get('override_options', []), -# cython_args: cython_c_args, -# subdir: 'treeple/_lib/sklearn/tree/', -# install: true -# ) -# endforeach +tree_extension_metadata = { + '_tree': + {'sources': ['./sklearn/tree/' + '_tree.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_partitioner': + {'sources': ['./sklearn/tree/' + '_partitioner.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_splitter': + {'sources': ['./sklearn/tree/' + '_splitter.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_criterion': + {'sources': ['./sklearn/tree/' + '_criterion.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_utils': + {'sources': ['./sklearn/tree/' + '_utils.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, +} + + +foreach ext_name, ext_dict : tree_extension_metadata + py.extension_module( + ext_name, + ext_dict.get('sources'), + dependencies: [np_dep], + override_options : ext_dict.get('override_options', []), + cython_args: cython_c_args, + subdir: 
'treeple/_lib/sklearn/tree/', + install: true + ) +endforeach -# python_sources = [ -# './sklearn/tree/__init__.py', -# './sklearn/tree/_classes.py', -# './sklearn/tree/_export.py', -# './sklearn/tree/_reingold_tilford.py', -# ] +python_sources = [ + './sklearn/tree/__init__.py', + './sklearn/tree/_classes.py', + './sklearn/tree/_export.py', + './sklearn/tree/_reingold_tilford.py', +] -# py.install_sources( -# python_sources, -# subdir: 'treeple/_lib/sklearn/tree' # Folder relative to site-packages to install to -# ) +py.install_sources( + python_sources, + subdir: 'treeple/_lib/sklearn/tree' # Folder relative to site-packages to install to +) -# # install ensemble/ submodule -# python_sources = [ -# '_forest.py', -# ] -# foreach py_source: python_sources -# py.install_sources( -# './sklearn/ensemble/' + py_source, -# subdir: 'treeple/_lib/sklearn/ensemble' -# ) -# endforeach - -# # TODO: Can remove if included in scikit-learn eventually -# # install tree/ submodule -# extensions = [ -# '_quad_tree', -# ] +# install ensemble/ submodule +python_sources = [ + '_forest.py', +] +foreach py_source: python_sources + py.install_sources( + './sklearn/ensemble/' + py_source, + subdir: 'treeple/_lib/sklearn/ensemble' + ) +endforeach + +# TODO: Can remove if included in scikit-learn eventually +# install tree/ submodule +extensions = [ + '_quad_tree', +] -# foreach ext: extensions -# py.extension_module( -# ext, -# ['./sklearn/neighbors/' + ext + '.pyx'], -# c_args: c_args, -# dependencies: [np_dep], -# cython_args: cython_c_args, -# override_options : ['optimization=3', 'cython_language=cpp'], -# install: true, -# subdir: 'treeple/_lib/sklearn/neighbors/', -# ) -# endforeach - -# # install tree/ submodule -# extensions = [ -# '_typedefs', -# '_random', -# ] +foreach ext: extensions + py.extension_module( + ext, + ['./sklearn/neighbors/' + ext + '.pyx'], + c_args: c_args, + dependencies: [np_dep], + cython_args: cython_c_args, + override_options : ['optimization=3', 
'cython_language=cpp'], + install: true, + subdir: 'treeple/_lib/sklearn/neighbors/', + ) +endforeach -# foreach ext: extensions -# py.extension_module(ext, -# ['./sklearn/utils/' + ext + '.pyx'], -# c_args: c_args, -# dependencies: [np_dep], -# cython_args: cython_c_args, -# override_options : ['optimization=3', 'cython_language=cpp'], -# install: true, -# subdir: 'treeple/_lib/sklearn/utils/', -# ) -# endforeach +# install tree/ submodule +extensions = [ + '_typedefs', + '_random', +] +foreach ext: extensions + py.extension_module(ext, + ['./sklearn/utils/' + ext + '.pyx'], + c_args: c_args, + dependencies: [np_dep], + cython_args: cython_c_args, + override_options : ['optimization=3', 'cython_language=cpp'], + install: true, + subdir: 'treeple/_lib/sklearn/utils/', + ) +endforeach -python_sources = [ - '__init__.py', -] -py.install_sources( - python_sources, - subdir: 'treeple/_lib' # Folder relative to site-packages to install to -) +# python_sources = [ +# '__init__.py', +# ] + +# py.install_sources( +# python_sources, +# subdir: 'treeple/_lib' # Folder relative to site-packages to install to +# ) -tempita = files('./sklearn/_build_utils/tempita.py') +# tempita = files('./sklearn/_build_utils/tempita.py') -# Copy all the .py files to the install dir, rather than using -# py.install_sources and needing to list them explicitely one by one -# install_subdir('sklearn', install_dir: py.get_install_dir()) -install_subdir('sklearn', install_dir: join_paths(py.get_install_dir(), 'treeple/_lib')) +# # Copy all the .py files to the install dir, rather than using +# # py.install_sources and needing to list them explicitely one by one +# # install_subdir('sklearn', install_dir: py.get_install_dir()) +# install_subdir('sklearn', install_dir: join_paths(py.get_install_dir(), 'treeple/_lib')) -subdir('sklearn') +# subdir('sklearn') diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index 5b074dd38..80959211c 160000 --- a/treeple/_lib/sklearn_fork +++ 
b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 5b074dd386af2791c57c556c89a65528e62a3c15 +Subproject commit 80959211c228bc50e928ffefe30ff2457d7814e9 From cd793cd7740471ccc64e2dd383670e060e0da58a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sun, 8 Sep 2024 21:28:58 -0400 Subject: [PATCH 11/17] Update submodule Signed-off-by: Adam Li --- treeple/_lib/sklearn_fork | 2 +- treeple/tree/__init__.py | 12 ++++++------ treeple/tree/_classes.py | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index 80959211c..960b58955 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 80959211c228bc50e928ffefe30ff2457d7814e9 +Subproject commit 960b589554982b2d08404186bf57a4de83862e80 diff --git a/treeple/tree/__init__.py b/treeple/tree/__init__.py index 0af9f713f..797338ac3 100644 --- a/treeple/tree/__init__.py +++ b/treeple/tree/__init__.py @@ -1,9 +1,9 @@ -# from .._lib.sklearn.tree import ( -# DecisionTreeClassifier, -# DecisionTreeRegressor, -# ExtraTreeClassifier, -# ExtraTreeRegressor, -# ) +from .._lib.sklearn.tree import ( + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) from ._classes import ( ExtraObliqueDecisionTreeClassifier, ExtraObliqueDecisionTreeRegressor, diff --git a/treeple/tree/_classes.py b/treeple/tree/_classes.py index 8a227c859..44cacc88b 100644 --- a/treeple/tree/_classes.py +++ b/treeple/tree/_classes.py @@ -8,7 +8,7 @@ from sklearn.cluster import AgglomerativeClustering from sklearn.utils import check_random_state from sklearn.utils._param_validation import Interval -from sklearn.utils.validation import check_is_fitted +from sklearn.utils.validation import check_is_fitted, validate_data from .._lib.sklearn.tree import ( BaseDecisionTree, @@ -216,7 +216,7 @@ def fit(self, X, y=None, sample_weight=None, check_input=True): if check_input: # TODO: allow X to be sparse 
check_X_params = dict(dtype=DTYPE) # , accept_sparse="csc" - X = self._validate_data(X, validate_separately=(check_X_params)) + X = validate_data(self, X, validate_separately=(check_X_params)) if issparse(X): X.sort_indices() @@ -1798,8 +1798,8 @@ def _build_tree( self.feature_combinations_ = 1 if self.feature_weight is not None: - self.feature_weight = self._validate_data( - self.feature_weight, ensure_2d=True, dtype=DTYPE + self.feature_weight = validate_data( + self, self.feature_weight, ensure_2d=True, dtype=DTYPE ) if self.feature_weight.shape != X.shape: raise ValueError( @@ -2277,8 +2277,8 @@ def _build_tree( self.feature_combinations_ = 1 if self.feature_weight is not None: - self.feature_weight = self._validate_data( - self.feature_weight, ensure_2d=True, dtype=DTYPE + self.feature_weight = validate_data( + self, self.feature_weight, ensure_2d=True, dtype=DTYPE ) if self.feature_weight.shape != X.shape: raise ValueError( From 406abfdc76197f7667a1313d8ad293a98a792268 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 10:37:54 -0400 Subject: [PATCH 12/17] Update submodule Signed-off-by: Adam Li --- treeple/_lib/sklearn_fork | 2 +- treeple/ensemble/_honest_forest.py | 8 +++- treeple/ensemble/_unsupervised_forest.py | 10 +++- treeple/tree/_classes.py | 58 +++++++++++++++++++++--- treeple/tree/_neighbors.py | 4 -- 5 files changed, 67 insertions(+), 15 deletions(-) diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index 960b58955..4551602a6 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 960b589554982b2d08404186bf57a4de83862e80 +Subproject commit 4551602a68b5410dbf67b13f5acbdc64705b0c62 diff --git a/treeple/ensemble/_honest_forest.py b/treeple/ensemble/_honest_forest.py index 96c010625..447371b37 100644 --- a/treeple/ensemble/_honest_forest.py +++ b/treeple/ensemble/_honest_forest.py @@ -720,8 +720,12 @@ def oob_samples_(self): oob_samples.append(_oob_samples) return oob_samples - 
def _more_tags(self): - return {"multioutput": False} + def __sklearn_tags__(self): + # XXX: nans should be supportable in HRF + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_output = False + tags.input_tags.allow_nan = False + return tags def decision_path(self, X): """ diff --git a/treeple/ensemble/_unsupervised_forest.py b/treeple/ensemble/_unsupervised_forest.py index a66c330af..d97c87510 100644 --- a/treeple/ensemble/_unsupervised_forest.py +++ b/treeple/ensemble/_unsupervised_forest.py @@ -21,7 +21,12 @@ ) from sklearn.metrics import calinski_harabasz_score from sklearn.utils.parallel import Parallel, delayed -from sklearn.utils.validation import _check_sample_weight, check_is_fitted, check_random_state +from sklearn.utils.validation import ( + _check_sample_weight, + check_is_fitted, + check_random_state, + validate_data, +) from .._lib.sklearn.ensemble._forest import BaseForest from .._lib.sklearn.tree._tree import DTYPE @@ -88,7 +93,8 @@ def fit(self, X, y=None, sample_weight=None): self._validate_params() # Validate or convert input data - X = self._validate_data( + X = validate_data( + self, X, dtype=DTYPE, # accept_sparse="csc", ) diff --git a/treeple/tree/_classes.py b/treeple/tree/_classes.py index 44cacc88b..aa93d4c08 100644 --- a/treeple/tree/_classes.py +++ b/treeple/tree/_classes.py @@ -378,6 +378,13 @@ def _assign_labels(self, affinity_matrix): predict_labels = cluster.fit_predict(affinity_matrix) return predict_labels + def __sklearn_tags__(self): + # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values + # However, for MORF it is not supported + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + class UnsupervisedObliqueDecisionTree(UnsupervisedDecisionTree): """Unsupervised oblique decision tree. 
@@ -577,6 +584,13 @@ def _build_tree( builder.build(self.tree_, X, sample_weight) return self + def __sklearn_tags__(self): + # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values + # However, for MORF it is not supported + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + class ObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): """An oblique decision tree classifier. @@ -1070,6 +1084,13 @@ def _update_tree(self, X, y, sample_weight): self._prune_tree() return self + def __sklearn_tags__(self): + # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values + # However, for MORF it is not supported + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """An oblique decision tree Regressor. @@ -1450,6 +1471,13 @@ def _build_tree( builder.build(self.tree_, X, y, sample_weight, None) return self + def __sklearn_tags__(self): + # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values + # However, for MORF it is not supported + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): """A oblique decision tree classifier that operates over patches of data. 
@@ -1927,11 +1955,13 @@ def _build_tree( return self - def _more_tags(self): + def __sklearn_tags__(self): # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values # However, for MORF it is not supported - allow_nan = False - return {"multilabel": True, "allow_nan": allow_nan} + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + tags.input_tags.allow_nan = False + return tags @property def _inheritable_fitted_attribute(self): @@ -2407,11 +2437,13 @@ def _build_tree( return self - def _more_tags(self): + def __sklearn_tags__(self): # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values # However, for MORF it is not supported - allow_nan = False - return {"multilabel": True, "allow_nan": allow_nan} + tags = super().__sklearn_tags__() + tags.regressor_tags.multi_label = True + tags.input_tags.allow_nan = False + return tags class ExtraObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): @@ -2846,6 +2878,13 @@ def _inheritable_fitted_attribute(self): "feature_combinations_", ] + def __sklearn_tags__(self): + # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values + # However, for MORF it is not supported + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """An oblique decision tree Regressor. 
@@ -3237,3 +3276,10 @@ def _build_tree( builder.build(self.tree_, X, y, sample_weight) return self + + def __sklearn_tags__(self): + # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values + # However, for MORF it is not supported + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags diff --git a/treeple/tree/_neighbors.py b/treeple/tree/_neighbors.py index 94f2c8f18..93d8ff1a0 100644 --- a/treeple/tree/_neighbors.py +++ b/treeple/tree/_neighbors.py @@ -64,7 +64,3 @@ def compute_similarity_matrix(self, X): The similarity matrix among the samples. """ return compute_forest_similarity_matrix(self, X) - - def _more_tags(self): - # XXX: no treeple estimators support NaNs as of now - return {"allow_nan": False} From 483b318ef9d42cbc091a56a8425075c0f30aa3c2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 10:41:13 -0400 Subject: [PATCH 13/17] Update submodule Signed-off-by: Adam Li --- treeple/_lib/sklearn_fork | 2 +- treeple/ensemble/_unsupervised_forest.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index 4551602a6..dd58597a0 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 4551602a68b5410dbf67b13f5acbdc64705b0c62 +Subproject commit dd58597a04ed339654b801669f9aa13e87555b18 diff --git a/treeple/ensemble/_unsupervised_forest.py b/treeple/ensemble/_unsupervised_forest.py index d97c87510..980c1ebbd 100644 --- a/treeple/ensemble/_unsupervised_forest.py +++ b/treeple/ensemble/_unsupervised_forest.py @@ -90,8 +90,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns the instance itself. 
""" - self._validate_params() - # Validate or convert input data X = validate_data( self, From 3b0426c684a8d5501849734b658eadf858e611df Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 10:48:14 -0400 Subject: [PATCH 14/17] Update submodule Signed-off-by: Adam Li --- treeple/_lib/sklearn_fork | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index dd58597a0..e4b9728cb 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit dd58597a04ed339654b801669f9aa13e87555b18 +Subproject commit e4b9728cb8667d0a40ed0c6c45f0414811f5f1f8 From 04d661bdc2957425a3cb707d1e976315655e21de Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 11:08:04 -0400 Subject: [PATCH 15/17] Fix Signed-off-by: Adam Li --- treeple/neighbors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/treeple/neighbors.py b/treeple/neighbors.py index 473b4363f..c95c2968c 100644 --- a/treeple/neighbors.py +++ b/treeple/neighbors.py @@ -5,7 +5,7 @@ from sklearn.base import BaseEstimator, MetaEstimatorMixin from sklearn.exceptions import NotFittedError from sklearn.neighbors import NearestNeighbors -from sklearn.utils.validation import check_is_fitted +from sklearn.utils.validation import check_is_fitted, validate_data from treeple.tree._neighbors import _compute_distance_matrix, compute_forest_similarity_matrix @@ -56,7 +56,7 @@ def fit(self, X, y=None): self : object Fitted estimator. 
""" - X, y = self._validate_data(X, y, accept_sparse="csc") + X, y = validate_data(self, X, y, accept_sparse="csc") self.estimator_ = copy(self.estimator) try: From 72217416bd2f3d8ca604fab088045a654067a3d4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 11:12:22 -0400 Subject: [PATCH 16/17] Update spin to 0.12 Signed-off-by: Adam Li --- .spin/cmds.py | 5 +++++ build_requirements.txt | 2 +- pyproject.toml | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.spin/cmds.py b/.spin/cmds.py index 7a80393d0..b5631b0e6 100644 --- a/.spin/cmds.py +++ b/.spin/cmds.py @@ -5,6 +5,7 @@ import click from spin import util from spin.cmds import meson +from spin.cmds.meson import build_dir_option def get_git_revision_hash(submodule) -> str: @@ -145,14 +146,18 @@ def setup_submodule(forcesubmodule=False): @click.option( "--forcesubmodule", is_flag=True, help="Force submodule pull.", envvar="FORCE_SUBMODULE" ) +@build_dir_option @click.pass_context def build( ctx, + *, meson_args, jobs=None, clean=False, verbose=False, gcov=False, + quiet=False, + build_dir=None, forcesubmodule=False, ): """Build treeple using submodules. 
diff --git a/build_requirements.txt b/build_requirements.txt index 95bc6c98e..ec63cfb3b 100644 --- a/build_requirements.txt +++ b/build_requirements.txt @@ -8,5 +8,5 @@ click rich-click doit pydevtool -spin +spin>=0.12 build diff --git a/pyproject.toml b/pyproject.toml index 596d2408b..c0a50d95a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ build = [ 'twine', 'meson', 'meson-python', - 'spin', + 'spin>=0.12', 'doit', 'scikit-learn>=1.5.0', 'Cython>=3.0.10', From e0dc4a51af22ad1eeb03bc4426442fe2278c7d2b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 11:52:26 -0400 Subject: [PATCH 17/17] Fix unit tests Signed-off-by: Adam Li --- treeple/neighbors.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/treeple/neighbors.py b/treeple/neighbors.py index c95c2968c..b16e732f9 100644 --- a/treeple/neighbors.py +++ b/treeple/neighbors.py @@ -7,6 +7,7 @@ from sklearn.neighbors import NearestNeighbors from sklearn.utils.validation import check_is_fitted, validate_data +from treeple.tree import DecisionTreeClassifier from treeple.tree._neighbors import _compute_distance_matrix, compute_forest_similarity_matrix @@ -31,13 +32,19 @@ class NearestNeighborsMetaEstimator(BaseEstimator, MetaEstimatorMixin): The number of parallel jobs to run for neighbors, by default None. """ - def __init__(self, estimator, n_neighbors=5, radius=1.0, algorithm="auto", n_jobs=None): + def __init__(self, estimator=None, n_neighbors=5, radius=1.0, algorithm="auto", n_jobs=None): self.estimator = estimator self.n_neighbors = n_neighbors self.algorithm = algorithm self.radius = radius self.n_jobs = n_jobs + def get_estimator(self): + """Return a copy of ``estimator``, or a default decision tree when it is None.""" + if self.estimator is None: + return DecisionTreeClassifier(random_state=0) + return copy(self.estimator) + def fit(self, X, y=None): """Fit the nearest neighbors estimator from the training dataset.
@@ -58,7 +65,7 @@ def fit(self, X, y=None): """ X, y = validate_data(self, X, y, accept_sparse="csc") - self.estimator_ = copy(self.estimator) + self.estimator_ = self.get_estimator() try: check_is_fitted(self.estimator_) except NotFittedError: