diff --git a/treeple/tree/_classes.py b/treeple/tree/_classes.py
index 16eb6ea5..8a227c85 100644
--- a/treeple/tree/_classes.py
+++ b/treeple/tree/_classes.py
@@ -820,7 +820,7 @@ class ObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):
 
     tree_type = "oblique"
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeClassifier._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
@@ -1283,7 +1283,7 @@ class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
 
     tree_type = "oblique"
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeRegressor._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
@@ -1684,7 +1684,7 @@ class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier)
     """
 
     tree_type = "oblique"
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeClassifier._parameter_constraints,
         "min_patch_dims": ["array-like", None],
         "max_patch_dims": ["array-like", None],
@@ -2166,7 +2166,7 @@ class PatchObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
     """
 
     tree_type = "oblique"
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeRegressor._parameter_constraints,
         "min_patch_dims": ["array-like", None],
         "max_patch_dims": ["array-like", None],
@@ -2669,7 +2669,7 @@ class ExtraObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier)
 
     tree_type = "oblique"
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeClassifier._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
@@ -3069,7 +3069,7 @@ class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
           -0.26552594, -0.00642017, -0.07108117, -0.40726765, -0.40315294])
     """
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeRegressor._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
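
The `_parameter_constraints` dicts annotated above follow scikit-learn's declarative parameter-validation convention: each key maps to a list of accepted specifications, and an entry such as `Interval(Real, 1.0, None, closed="left")` accepts any real value >= 1.0 for `feature_combinations`. A minimal sketch of that mechanism using scikit-learn's private `sklearn.utils._param_validation` helpers (internal API that may move between versions; the constraint list here is illustrative, not the full list from `_classes.py`):

    from numbers import Real

    from sklearn.utils._param_validation import Interval, validate_parameter_constraints

    # Illustrative constraint: feature_combinations must be a real number >= 1.0, or None.
    constraints = {"feature_combinations": [Interval(Real, 1.0, None, closed="left"), None]}

    # Accepted: 1.5 lies in the half-open interval [1.0, inf).
    validate_parameter_constraints(constraints, {"feature_combinations": 1.5}, caller_name="demo")

    # Rejected: 0.5 falls below the closed left bound, raising InvalidParameterError.
    validate_parameter_constraints(constraints, {"feature_combinations": 0.5}, caller_name="demo")
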
diff --git a/treeple/tree/_oblique_splitter.pxd b/treeple/tree/_oblique_splitter.pxd
index 124a66dd..65ca16e1 100644
--- a/treeple/tree/_oblique_splitter.pxd
+++ b/treeple/tree/_oblique_splitter.pxd
@@ -83,12 +83,6 @@ cdef class BaseObliqueSplitter(Splitter):
         SplitRecord* split,
     ) except -1 nogil
 
-    cdef inline void fisher_yates_shuffle_memview(
-        self,
-        intp_t[::1] indices_to_sample,
-        intp_t grid_size,
-        uint32_t* random_state
-    ) noexcept nogil
 
 cdef class ObliqueSplitter(BaseObliqueSplitter):
     # The splitter searches in the input space for a linear combination of features and a threshold
diff --git a/treeple/tree/_oblique_splitter.pyx b/treeple/tree/_oblique_splitter.pyx
index ca77a30a..0cceac66 100644
--- a/treeple/tree/_oblique_splitter.pyx
+++ b/treeple/tree/_oblique_splitter.pyx
@@ -11,6 +11,7 @@ from libcpp.vector cimport vector
 
 from .._lib.sklearn.tree._criterion cimport Criterion
 from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform
+from ._utils cimport fisher_yates_shuffle
 
 cdef float64_t INFINITY = np.inf
 
@@ -46,8 +47,12 @@ cdef class BaseObliqueSplitter(Splitter):
     def __setstate__(self, d):
         pass
 
-    cdef int node_reset(self, intp_t start, intp_t end,
-                        float64_t* weighted_n_node_samples) except -1 nogil:
+    cdef int node_reset(
+        self,
+        intp_t start,
+        intp_t end,
+        float64_t* weighted_n_node_samples
+    ) except -1 nogil:
         """Reset splitter on node samples[start:end].
 
         Returns -1 in case of failure to allocate memory (and raise MemoryError)
@@ -62,17 +67,7 @@ cdef class BaseObliqueSplitter(Splitter):
         weighted_n_node_samples : ndarray, dtype=float64_t pointer
             The total weight of those samples
         """
-
-        self.start = start
-        self.end = end
-
-        self.criterion.init(self.y,
-                            self.sample_weight,
-                            self.weighted_n_samples,
-                            self.samples)
-        self.criterion.set_sample_pointers(start, end)
-
-        weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples
+        Splitter.node_reset(self, start, end, weighted_n_node_samples)
 
         # Clear all projection vectors
         for i in range(self.max_features):
@@ -102,8 +97,8 @@ cdef class BaseObliqueSplitter(Splitter):
         intp_t end,
         const intp_t[:] samples,
         float32_t[:] feature_values,
-        vector[float32_t]* proj_vec_weights,  # weights of the vector (max_features,)
-        vector[intp_t]* proj_vec_indices  # indices of the features (max_features,)
+        vector[float32_t]* proj_vec_weights,  # weights of the vector (n_non_zeros,)
+        vector[intp_t]* proj_vec_indices  # indices of the features (n_non_zeros,)
     ) noexcept nogil:
         """Compute the feature values for the samples[start:end] range.
 
@@ -126,19 +121,6 @@ cdef class BaseObliqueSplitter(Splitter):
             feature_values[idx] = 0.0
             feature_values[idx] += self.X[samples[idx], col_idx] * col_weight
 
-    cdef inline void fisher_yates_shuffle_memview(
-        self,
-        intp_t[::1] indices_to_sample,
-        intp_t grid_size,
-        uint32_t* random_state,
-    ) noexcept nogil:
-        cdef intp_t i, j
-
-        # XXX: should this be `i` or `i+1`? for valid Fisher-Yates?
-        for i in range(0, grid_size - 1):
-            j = rand_int(i, grid_size, random_state)
-            indices_to_sample[j], indices_to_sample[i] = \
-                indices_to_sample[i], indices_to_sample[j]
 
 cdef class ObliqueSplitter(BaseObliqueSplitter):
     def __cinit__(
@@ -257,7 +239,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter):
         cdef intp_t grid_size = self.max_features * self.n_features
 
         # shuffle indices over the 2D grid to sample using Fisher-Yates
-        self.fisher_yates_shuffle_memview(indices_to_sample, grid_size, random_state)
+        fisher_yates_shuffle(indices_to_sample, grid_size, random_state)
 
         # sample 'n_non_zeros' in a mtry X n_features projection matrix
         # which consists of +/- 1's chosen at a 1/2s rate
@@ -309,7 +291,7 @@ cdef class BestObliqueSplitter(ObliqueSplitter):
         cdef intp_t end = self.end
 
         # pointer array to store feature values to split on
-        cdef float32_t[::1] feature_values = self.feature_values
+        cdef float32_t[::1] feature_values = self.feature_values
         cdef intp_t max_features = self.max_features
         cdef intp_t min_samples_leaf = self.min_samples_leaf
 
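
The shared `fisher_yates_shuffle` helper cimported above (its Cython body is added in the `_utils.pyx` hunk below) is a standard Fisher-Yates (Knuth) shuffle: each position `i` is swapped with a uniformly drawn `j` in `[i, grid_size)`, and stopping the loop at `grid_size - 2` is valid because the last element has no remaining positions to trade with. A pure-Python sketch of the same loop, with `random.randrange` standing in for the Cython `rand_int`:

    import random

    def fisher_yates_shuffle(indices, grid_size, rng=random):
        """Shuffle indices[:grid_size] in place, mirroring the Cython loop."""
        for i in range(grid_size - 1):
            # rand_int(i, grid_size, random_state) draws uniformly from [i, grid_size).
            j = rng.randrange(i, grid_size)
            indices[i], indices[j] = indices[j], indices[i]

    indices = list(range(10))
    fisher_yates_shuffle(indices, len(indices))
    print(indices)  # a uniformly random permutation of 0..9
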
diff --git a/treeple/tree/_utils.pxd b/treeple/tree/_utils.pxd
index c814cc16..dc6cb5b7 100644
--- a/treeple/tree/_utils.pxd
+++ b/treeple/tree/_utils.pxd
@@ -1,3 +1,5 @@
+from libcpp.vector cimport vector
+
 import numpy as np
 
 cimport numpy as cnp
@@ -7,15 +9,41 @@ cnp.import_array()
 from .._lib.sklearn.tree._splitter cimport SplitRecord
 from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t
 
+ctypedef fused vector_or_memview:
+    vector[intp_t]
+    intp_t[::1]
+    intp_t[:]
+
+
+cdef inline void fisher_yates_shuffle(
+    vector_or_memview indices_to_sample,
+    intp_t grid_size,
+    uint32_t* random_state,
+) noexcept nogil
 
-cdef int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil
+
+cdef int rand_weighted_binary(
+    float64_t p0,
+    uint32_t* random_state
+) noexcept nogil
 
 cpdef unravel_index(
-    intp_t index, cnp.ndarray[intp_t, ndim=1] shape
+    intp_t index,
+    cnp.ndarray[intp_t, ndim=1] shape
 )
 
-cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape)
+cpdef ravel_multi_index(
+    intp_t[:] coords,
+    const intp_t[:] shape
+)
 
-cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil
+cdef void unravel_index_cython(
+    intp_t index,
+    const intp_t[:] shape,
+    vector_or_memview coords
+) noexcept nogil
 
-cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil
+cdef intp_t ravel_multi_index_cython(
+    vector_or_memview coords,
+    const intp_t[:] shape
+) noexcept nogil
diff --git a/treeple/tree/_utils.pyx b/treeple/tree/_utils.pyx
index 197b82ec..7ce48977 100644
--- a/treeple/tree/_utils.pyx
+++ b/treeple/tree/_utils.pyx
@@ -11,10 +11,40 @@ cimport numpy as cnp
 
 cnp.import_array()
 
-from .._lib.sklearn.tree._utils cimport rand_uniform
+from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform
 
 
-cdef inline int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil:
+cdef inline void fisher_yates_shuffle(
+    vector_or_memview indices_to_sample,
+    intp_t grid_size,
+    uint32_t* random_state,
+) noexcept nogil:
+    """Shuffle the indices in place using the Fisher-Yates algorithm.
+    Parameters
+    ----------
+    indices_to_sample : A C++ vector or 1D memoryview
+        The indices to shuffle.
+    grid_size : intp_t
+        The size of the grid to shuffle. This is explicitly passed in
+        to support the templated `vector_or_memview` type, which allows
+        for both C++ vectors and Cython memoryviews. Getting the length
+        of both types uses a different API.
+    random_state : uint32_t*
+        The random state.
+    """
+    cdef intp_t i, j
+
+    # XXX: should this be `i` or `i+1`? for valid Fisher-Yates?
+    for i in range(0, grid_size - 1):
+        j = rand_int(i, grid_size, random_state)
+        indices_to_sample[j], indices_to_sample[i] = \
+            indices_to_sample[i], indices_to_sample[j]
+
+
+cdef inline int rand_weighted_binary(
+    float64_t p0,
+    uint32_t* random_state
+) noexcept nogil:
     """Sample from integers 0 and 1 with different probabilities.
 
     Parameters
@@ -54,7 +84,9 @@ cpdef unravel_index(
     index = np.intp(index)
     shape = np.array(shape)
    coords = np.empty(shape.shape[0], dtype=np.intp)
-    unravel_index_cython(index, shape, coords)
+    cdef const intp_t[:] shape_memview = shape
+    cdef intp_t[:] coords_memview = coords
+    unravel_index_cython(index, shape_memview, coords_memview)
     return coords
 
 
@@ -83,7 +115,11 @@ cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape):
     return ravel_multi_index_cython(coords, shape)
 
 
-cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil:
+cdef inline void unravel_index_cython(
+    intp_t index,
+    const intp_t[:] shape,
+    vector_or_memview coords
+) noexcept nogil:
     """Converts a flat index into a tuple of coordinate arrays.
 
     Parameters
@@ -92,13 +128,9 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co
         The flat index to be converted.
     shape : numpy.ndarray[intp_t, ndim=1]
         The shape of the array into which the flat index should be converted.
-    coords : numpy.ndarray[intp_t, ndim=1]
-        A preinitialized memoryview array of coordinate arrays to be converted.
-
-    Returns
-    -------
-    numpy.ndarray[intp_t, ndim=1]
-        An array of coordinate arrays, with each coordinate array having the same shape as the input `shape`.
+    coords : intp_t[:] or vector[intp_t]
+        A preinitialized array of coordinates to store the result of the
+        unraveled `index`.
     """
     cdef intp_t ndim = shape.shape[0]
     cdef intp_t j, size
@@ -109,13 +141,16 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co
         index //= size
 
 
-cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil:
-    """Converts a tuple of coordinate arrays into a flat index.
+cdef inline intp_t ravel_multi_index_cython(
+    vector_or_memview coords,
+    const intp_t[:] shape
+) noexcept nogil:
+    """Converts a tuple of coordinate arrays into a flat index in the vectorized dimension.
 
     Parameters
     ----------
-    coords : numpy.ndarray[intp_t, ndim=1]
-        An array of coordinate arrays to be converted.
+    coords : intp_t[:] or vector[intp_t]
+        An array of coordinates to be converted into a single flat index.
     shape : numpy.ndarray[intp_t, ndim=1]
         The shape of the array into which the coordinates should be converted.
 
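
The `unravel_index_cython` / `ravel_multi_index_cython` pair above mirrors NumPy's `np.unravel_index` and `np.ravel_multi_index` for C-ordered arrays, with the fused `vector_or_memview` type letting callers pass either a C++ vector or a typed memoryview for `coords`. A quick NumPy round-trip showing the intended semantics (illustrative only, not the PR's test code):

    import numpy as np

    shape = np.array([5, 4, 3], dtype=np.intp)

    # Flat index 37 in a C-ordered (5, 4, 3) array sits at coordinates (3, 0, 1),
    # since 37 = 3 * 12 + 0 * 3 + 1.
    coords = np.array(np.unravel_index(37, tuple(shape)), dtype=np.intp)
    print(coords)  # [3 0 1]

    # Ravelling the coordinates recovers the original flat index.
    flat = np.ravel_multi_index(tuple(coords), tuple(shape))
    print(flat)  # 37
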
""" cdef intp_t ndim = shape.shape[0] cdef intp_t j, size @@ -109,13 +141,16 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co index //= size -cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil: - """Converts a tuple of coordinate arrays into a flat index. +cdef inline intp_t ravel_multi_index_cython( + vector_or_memview coords, + const intp_t[:] shape +) noexcept nogil: + """Converts a tuple of coordinate arrays into a flat index in the vectorized dimension. Parameters ---------- - coords : numpy.ndarray[intp_t, ndim=1] - An array of coordinate arrays to be converted. + coords : intp_t[:] or vector[intp_t] + An array of coordinates to be converted and vectorized into a sinlg shape : numpy.ndarray[intp_t, ndim=1] The shape of the array into which the coordinates should be converted. diff --git a/treeple/tree/manifold/_morf_splitter.pxd b/treeple/tree/manifold/_morf_splitter.pxd index a0a61a4d..2b65fd3b 100644 --- a/treeple/tree/manifold/_morf_splitter.pxd +++ b/treeple/tree/manifold/_morf_splitter.pxd @@ -32,14 +32,6 @@ cdef class PatchSplitter(BestObliqueSplitter): # an input data vector. The input data is vectorized, so `data_height` and # `data_width` are used to determine the vectorized indices corresponding to # (x,y) coordinates in the original un-vectorized data. - - cdef public intp_t max_patch_height # Maximum height of the patch to sample - cdef public intp_t max_patch_width # Maximum width of the patch to sample - cdef public intp_t min_patch_height # Minimum height of the patch to sample - cdef public intp_t min_patch_width # Minimum width of the patch to sample - cdef public intp_t data_height # Height of the input data - cdef public intp_t data_width # Width of the input data - cdef public intp_t ndim # The number of dimensions of the input data cdef const intp_t[:] data_dims # The dimensions of the input data @@ -56,7 +48,7 @@ cdef class PatchSplitter(BestObliqueSplitter): cdef intp_t[::1] _index_data_buffer cdef intp_t[::1] _index_patch_buffer - cdef intp_t[:] patch_dims_buff # A buffer to store the dimensions of the sampled patch + cdef intp_t[:] patch_sampled_size # A buffer to store the dimensions of the sampled patch cdef intp_t[:] unraveled_patch_point # A buffer to store the unraveled patch point # All oblique splitters (i.e. 
diff --git a/treeple/tree/manifold/_morf_splitter.pyx b/treeple/tree/manifold/_morf_splitter.pyx
index d6c8d012..f1eaf291 100644
--- a/treeple/tree/manifold/_morf_splitter.pyx
+++ b/treeple/tree/manifold/_morf_splitter.pyx
@@ -151,7 +151,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
         self.data_dims = data_dims
 
         # create a buffer for storing the patch dimensions sampled per projection matrix
-        self.patch_dims_buff = np.zeros(data_dims.shape[0], dtype=np.intp)
+        self.patch_sampled_size = np.zeros(data_dims.shape[0], dtype=np.intp)
         self.unraveled_patch_point = np.zeros(data_dims.shape[0], dtype=np.intp)
 
         # store the min and max patch dimension constraints
@@ -237,7 +237,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
                 top_left_patch_seed = rand_int(0, delta_patch_dim, random_state)
 
                 # write to buffer
-                self.patch_dims_buff[idx] = patch_dim
+                self.patch_sampled_size[idx] = patch_dim
                 patch_size *= patch_dim
             elif self.boundary == "wrap":
                 # add circular boundary conditions
@@ -251,7 +251,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
 
                 # resample the patch dimension due to padding
                 patch_dim = min(patch_dim, min(dim+1, self.data_dims[idx] + patch_dim - dim - 1))
-                self.patch_dims_buff[idx] = patch_dim
+                self.patch_sampled_size[idx] = patch_dim
                 patch_size *= patch_dim
 
                 # TODO: make this work
@@ -283,7 +283,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
         cdef intp_t top_left_patch_seed
 
         # size of the sampled patch, which is just the size of the n-dim patch
-        # (\prod_i self.patch_dims_buff[i])
+        # (\prod_i self.patch_sampled_size[i])
         cdef intp_t patch_size
 
         for proj_i in range(0, max_features):
@@ -299,7 +299,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
                 proj_i,
                 patch_size,
                 top_left_patch_seed,
-                self.patch_dims_buff
+                self.patch_sampled_size
            )
 
     cdef void sample_proj_vec(
@@ -389,7 +389,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
                 if not self.dim_contiguous[idx]:
                     row_index += (
                         (self.unraveled_patch_point[idx] // other_dims_offset) %
-                        self.patch_dims_buff[idx]
+                        self.patch_sampled_size[idx]
                     ) * other_dims_offset
                 other_dims_offset //= self.data_dims[idx]
 
@@ -445,7 +445,7 @@ cdef class BestPatchSplitterTester(BestPatchSplitter):
     """A class to expose a Python interface for testing."""
     cpdef sample_top_left_seed_cpdef(self):
         top_left_patch_seed, patch_size = self.sample_top_left_seed()
-        patch_dims = np.array(self.patch_dims_buff, dtype=np.intp)
+        patch_dims = np.array(self.patch_sampled_size, dtype=np.intp)
         return top_left_patch_seed, patch_size, patch_dims
 
     cpdef sample_projection_vector(