diff --git a/treeple/tree/_oblique_splitter.pxd b/treeple/tree/_oblique_splitter.pxd index 124a66dd..65ca16e1 100644 --- a/treeple/tree/_oblique_splitter.pxd +++ b/treeple/tree/_oblique_splitter.pxd @@ -83,12 +83,6 @@ cdef class BaseObliqueSplitter(Splitter): SplitRecord* split, ) except -1 nogil - cdef inline void fisher_yates_shuffle_memview( - self, - intp_t[::1] indices_to_sample, - intp_t grid_size, - uint32_t* random_state - ) noexcept nogil cdef class ObliqueSplitter(BaseObliqueSplitter): # The splitter searches in the input space for a linear combination of features and a threshold diff --git a/treeple/tree/_oblique_splitter.pyx b/treeple/tree/_oblique_splitter.pyx index cb221643..78451b74 100644 --- a/treeple/tree/_oblique_splitter.pyx +++ b/treeple/tree/_oblique_splitter.pyx @@ -11,6 +11,7 @@ from libcpp.vector cimport vector from .._lib.sklearn.tree._criterion cimport Criterion from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform +from ._utils cimport fisher_yates_shuffle cdef float64_t INFINITY = np.inf @@ -46,8 +47,12 @@ cdef class BaseObliqueSplitter(Splitter): def __setstate__(self, d): pass - cdef int node_reset(self, intp_t start, intp_t end, - float64_t* weighted_n_node_samples) except -1 nogil: + cdef int node_reset( + self, + intp_t start, + intp_t end, + float64_t* weighted_n_node_samples + ) except -1 nogil: """Reset splitter on node samples[start:end]. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -62,17 +67,7 @@ cdef class BaseObliqueSplitter(Splitter): weighted_n_node_samples : ndarray, dtype=float64_t pointer The total weight of those samples """ - - self.start = start - self.end = end - - self.criterion.init(self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples) - self.criterion.set_sample_pointers(start, end) - - weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples + Splitter.node_reset(self, start, end, weighted_n_node_samples) # Clear all projection vectors for i in range(self.max_features): @@ -102,8 +97,8 @@ cdef class BaseObliqueSplitter(Splitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]* proj_vec_weights, # weights of the vector for this projection (n_non_zeros',) + vector[intp_t]* proj_vec_indices # indices of the features for this projection (n_non_zeros',) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. @@ -126,20 +121,6 @@ cdef class BaseObliqueSplitter(Splitter): feature_values[idx] = 0.0 feature_values[idx] += self.X[samples[idx], col_idx] * col_weight - cdef inline void fisher_yates_shuffle_memview( - self, - intp_t[::1] indices_to_sample, - intp_t grid_size, - uint32_t* random_state, - ) noexcept nogil: - cdef intp_t i, j - - # XXX: should this be `i` or `i+1`? for valid Fisher-Yates? 
- for i in range(0, grid_size - 1): - j = rand_int(i, grid_size, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] - cdef class ObliqueSplitter(BaseObliqueSplitter): def __cinit__( self, @@ -220,6 +201,43 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): # self.feature_weights = np.ones((self.n_features,), dtype=float32_t) / self.n_features return 0 + cdef void sample_proj_vec( + self, + vector[float32_t]& proj_vec_weights, + vector[intp_t]& proj_vec_indices + ) noexcept nogil: + cdef intp_t n_features = self.n_features + cdef intp_t n_non_zeros = self.n_non_zeros + cdef uint32_t* random_state = &self.rand_r_state + + cdef intp_t i, feat_i, proj_i, rand_vec_index + cdef float32_t weight + + # construct an array to sample from mTry x n_features set of indices + cdef intp_t[::1] indices_to_sample = self.indices_to_sample + cdef intp_t grid_size = self.max_features * self.n_features + + # shuffle indices over the 2D grid to sample using Fisher-Yates + fisher_yates_shuffle(indices_to_sample, grid_size, random_state) + + # sample 'n_non_zeros' in a mtry X n_features projection matrix + # which consists of +/- 1's chosen at a 1/2s rate + for i in range(0, n_non_zeros): + # get the next index from the shuffled index array + rand_vec_index = indices_to_sample[i] + + # get the projection index (i.e. row of the projection matrix) and + # feature index (i.e. column of the projection matrix) + proj_i = rand_vec_index // n_features + feat_i = rand_vec_index % n_features + + # sample a random weight + weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 + + proj_vec_indices[proj_i].push_back(feat_i) # Store index of nonzero + proj_vec_weights[proj_i].push_back(weight) # Store weight of nonzero + + cdef void sample_proj_mat( self, vector[vector[float32_t]]& proj_mat_weights, @@ -257,7 +275,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): cdef intp_t grid_size = self.max_features * self.n_features # shuffle indices over the 2D grid to sample using Fisher-Yates - self.fisher_yates_shuffle_memview(indices_to_sample, grid_size, random_state) + fisher_yates_shuffle(indices_to_sample, grid_size, random_state) # sample 'n_non_zeros' in a mtry X n_features projection matrix # which consists of +/- 1's chosen at a 1/2s rate @@ -340,6 +358,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # XXX: 'feature' is not actually used in oblique split records # Just indicates which split was sampled current_split.feature = feat_i + + # sample the projection vector current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] diff --git a/treeple/tree/_utils.pxd b/treeple/tree/_utils.pxd index a436c059..8a595751 100644 --- a/treeple/tree/_utils.pxd +++ b/treeple/tree/_utils.pxd @@ -4,9 +4,23 @@ cimport numpy as cnp cnp.import_array() +from libcpp.vector cimport vector + from .._lib.sklearn.tree._splitter cimport SplitRecord from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t +ctypedef fused vector_or_memview: + vector[intp_t] + intp_t[::1] + intp_t[:] + + +cdef inline void fisher_yates_shuffle( + vector_or_memview indices_to_sample, + intp_t grid_size, + uint32_t* random_state, +) noexcept nogil + cdef intp_t rand_weighted_binary( float64_t p0, @@ -22,10 +36,22 @@ cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape) cdef void unravel_index_cython( intp_t index, const intp_t[:] shape, - const intp_t[:] coords + vector_or_memview coords 
) noexcept nogil

cdef intp_t ravel_multi_index_cython(
-    const intp_t[:] coords,
+    vector_or_memview coords,
     const intp_t[:] shape
) noexcept nogil
+
+cdef void compute_vectorized_indices_from_cartesian(
+    intp_t base_index,
+    vector[vector[intp_t]]& index_arrays,
+    const intp_t[:] data_dims,
+    vector[intp_t]& result
+) noexcept nogil
+
+cdef memoryview[float32_t, ndim=3] init_2dmemoryview(
+    cnp.ndarray array,
+    const intp_t[:] data_dims
+)
diff --git a/treeple/tree/_utils.pyx b/treeple/tree/_utils.pyx
index 28c03867..786ff70a 100644
--- a/treeple/tree/_utils.pyx
+++ b/treeple/tree/_utils.pyx
@@ -13,7 +13,35 @@ cimport numpy as cnp

 cnp.import_array()

-from .._lib.sklearn.tree._utils cimport rand_uniform
+from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform
+
+
+cdef inline void fisher_yates_shuffle(
+    vector_or_memview indices_to_sample,
+    intp_t grid_size,
+    uint32_t* random_state,
+) noexcept nogil:
+    """Shuffle the indices in place using the Fisher-Yates algorithm.
+
+    Parameters
+    ----------
+    indices_to_sample : A C++ vector or 1D memoryview
+        The indices to shuffle.
+    grid_size : intp_t
+        The size of the grid to shuffle. This is explicitly passed in
+        to support the templated `vector_or_memview` type, which allows
+        for both C++ vectors and Cython memoryviews. Getting the length
+        of the two types requires different APIs.
+    random_state : uint32_t*
+        The random state.
+    """
+    cdef intp_t i, j
+
+    # This is the forward variant of Fisher-Yates: at step i, j is drawn
+    # uniformly from [i, grid_size), which includes i itself, so every
+    # permutation is equally likely. The final element never needs an
+    # explicit swap, hence the loop stops at grid_size - 1.
+    for i in range(0, grid_size - 1):
+        j = rand_int(i, grid_size, random_state)
+        indices_to_sample[j], indices_to_sample[i] = \
+            indices_to_sample[i], indices_to_sample[j]


 cdef inline intp_t rand_weighted_binary(
@@ -36,62 +64,11 @@ cdef inline intp_t rand_weighted_binary(
     else:
         return 1

-cpdef unravel_index(
-    intp_t index,
-    cnp.ndarray[intp_t, ndim=1] shape
-):
-    """Converts a flat index or array of flat indices into a tuple of coordinate arrays.
-
-    Purely used for testing purposes.
-
-    Parameters
-    ----------
-    index : intp_t
-        A flat index.
-    shape : numpy.ndarray[intp_t, ndim=1]
-        The shape of the array into which the flat indices should be converted.
-
-    Returns
-    -------
-    numpy.ndarray[intp_t, ndim=1]
-        A coordinate array having the same shape as the input `shape`.
-    """
-    index = np.intp(index)
-    shape = np.array(shape)
-    coords = np.empty(shape.shape[0], dtype=np.intp)
-    unravel_index_cython(index, shape, coords)
-    return coords
-
-
-cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape):
-    """Converts a tuple of coordinate arrays into a flat index.
-
-    Purely used for testing purposes.
-
-    Parameters
-    ----------
-    coords : numpy.ndarray[intp_t, ndim=1]
-        An array of coordinate arrays to be converted.
-    shape : numpy.ndarray[intp_t, ndim=1]
-        The shape of the array into which the coordinates should be converted.
-
-    Returns
-    -------
-    intp_t
-        The resulting flat index.
-
-    Raises
-    ------
-    ValueError
-        If the input `coords` have invalid indices.
-    """
-    return ravel_multi_index_cython(coords, shape)
-

 cdef inline void unravel_index_cython(
     intp_t index,
     const intp_t[:] shape,
-    const intp_t[:] coords
+    vector_or_memview coords
 ) noexcept nogil:
     """Converts a flat index into a tuple of coordinate arrays.

     Parameters
     ----------
     index : intp_t
         The flat index to be converted.
     shape : numpy.ndarray[intp_t, ndim=1]
         The shape of the array into which the flat index should be converted.
-    coords : numpy.ndarray[intp_t, ndim=1]
-        A preinitialized memoryview array of coordinate arrays to be converted.
-
-    Returns
-    -------
-    numpy.ndarray[intp_t, ndim=1]
-        An array of coordinate arrays, with each coordinate array having the same shape as the input `shape`.
+    coords : intp_t[:] or vector[intp_t]
+        A preinitialized array of coordinates to store the result of the
+        unraveled `index`.
     """
     cdef intp_t ndim = shape.shape[0]
     cdef intp_t j, size

@@ -119,15 +92,15 @@


 cdef inline intp_t ravel_multi_index_cython(
-    const intp_t[:] coords,
+    vector_or_memview coords,
     const intp_t[:] shape
 ) noexcept nogil:
     """Converts a tuple of coordinate arrays into a flat index in the vectorized dimension.

     Parameters
     ----------
-    coords : numpy.ndarray[intp_t, ndim=1]
-        An array of coordinate arrays to be converted.
+    coords : intp_t[:] or vector[intp_t]
+        An array of coordinates to be converted and vectorized into a single
+        flat index.
     shape : numpy.ndarray[intp_t, ndim=1]
         The shape of the array into which the coordinates should be converted.

@@ -159,6 +132,55 @@ cdef inline intp_t ravel_multi_index_cython(
     return flat_index


+cdef inline void compute_vectorized_indices_from_cartesian(
+    intp_t base_index,
+    vector[vector[intp_t]]& index_arrays,
+    const intp_t[:] data_dims,
+    vector[intp_t]& result
+) noexcept nogil:
+    """Compute the vectorized indices for all index sets in the Cartesian product of the index arrays.
+
+    Parameters
+    ----------
+    base_index : intp_t
+        The initial vectorized index (the top-left point of the patch).
+    index_arrays : vector[vector[intp_t]]
+        A vector of vectors, where each inner vector holds the indices for that dimension.
+    data_dims : const intp_t[:]
+        The dimensions of the data.
+    result : vector[intp_t]
+        The C++ vector in which to store the resulting vectorized indices.
+    """
+    cdef int n_dims = len(index_arrays)
+    cdef vector[intp_t] indices = vector[intp_t](n_dims)
+    cdef intp_t vectorized_index
+    cdef intp_t i
+    cdef intp_t n_products = 1
+
+    # Calculate the total number of Cartesian products
+    for i in range(n_dims):
+        n_products *= len(index_arrays[i])
+
+    # Initialize the current indices for each dimension
+    cdef vector[intp_t] current_indices = vector[intp_t](n_dims)
+    for i in range(n_dims):
+        current_indices[i] = 0
+
+    # Iterate over Cartesian products and compute vectorized indices
+    for _ in range(n_products):
+        # Compute the vectorized index for the current combination
+        for i in range(n_dims):
+            indices[i] = index_arrays[i][current_indices[i]]
+        vectorized_index = ravel_multi_index_cython(
+            indices, data_dims
+        )
+        result.push_back(vectorized_index)
+
+        # Move to the next combination
+        for i in range(n_dims - 1, -1, -1):
+            current_indices[i] += 1
+            if current_indices[i] < len(index_arrays[i]):
+                break
+            current_indices[i] = 0
+            if i == 0:
+                return  # Finished all combinations
+
+
 cdef vector[vector[intp_t]] cartesian_cython(
     vector[vector[intp_t]] sequences
 ) noexcept nogil:
@@ -176,4 +198,78 @@ cdef vector[vector[intp_t]] cartesian_cython(


 cpdef cartesian_python(vector[vector[intp_t]]& sequences):
-    return cartesian_cython(sequences)
\ No newline at end of file
+    return cartesian_cython(sequences)
+
+
+cdef memoryview[float32_t, ndim=3] init_2dmemoryview(
+    cnp.ndarray array,
+    const intp_t[:] data_dims
+):
+    cdef intp_t n_samples = array.shape[0]
+    cdef intp_t ndim = data_dims.shape[0]
+    assert ndim == 2, "Currently only 2D images are accepted."
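+
+    # each feature row of length data_dims[0] * data_dims[1] is interpreted in
+    # C (row-major) order when it is reshaped into a (height, width) image below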
+ + # Reshape the array into (n_samples, *data_dims) + cdef cnp.ndarray reshaped_array = array.reshape((n_samples, data_dims[0], data_dims[1])) + + # Create a memoryview with the appropriate number of dimensions + cdef memoryview[float32_t, ndim=3] view = reshaped_array + return view + +#################################################################################################### +# CPDEF functions for testing purposes +#################################################################################################### + +cpdef unravel_index( + intp_t index, + cnp.ndarray[intp_t, ndim=1] shape +): + """Converts a flat index or array of flat indices into a tuple of coordinate arrays. + + Purely used for testing purposes. + + Parameters + ---------- + index : intp_t + A flat index. + shape : numpy.ndarray[intp_t, ndim=1] + The shape of the array into which the flat indices should be converted. + + Returns + ------- + numpy.ndarray[intp_t, ndim=1] + A coordinate array having the same shape as the input `shape`. + """ + index = np.intp(index) + shape = np.array(shape, dtype=np.intp) + coords = np.empty(shape.shape[0], dtype=np.intp) + + cdef const intp_t[:] shape_memview = shape + cdef intp_t[:] coords_memview = coords + unravel_index_cython(index, shape_memview, coords_memview) + return coords + + +cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape): + """Converts a tuple of coordinate arrays into a flat index. + + Purely used for testing purposes. + + Parameters + ---------- + coords : numpy.ndarray[intp_t, ndim=1] + An array of coordinate arrays to be converted. + shape : numpy.ndarray[intp_t, ndim=1] + The shape of the array into which the coordinates should be converted. + + Returns + ------- + intp_t + The resulting flat index. + + Raises + ------ + ValueError + If the input `coords` have invalid indices. + """ + return ravel_multi_index_cython(coords, shape) diff --git a/treeple/tree/manifold/_morf_splitter.pxd b/treeple/tree/manifold/_morf_splitter.pxd index a9618dfe..164b9c6f 100644 --- a/treeple/tree/manifold/_morf_splitter.pxd +++ b/treeple/tree/manifold/_morf_splitter.pxd @@ -33,19 +33,15 @@ cdef class PatchSplitter(BestObliqueSplitter): # `data_width` are used to determine the vectorized indices corresponding to # (x,y) coordinates in the original un-vectorized data. 
-    cdef public intp_t max_patch_height               # Maximum height of the patch to sample
-    cdef public intp_t max_patch_width                # Maximum width of the patch to sample
-    cdef public intp_t min_patch_height               # Minimum height of the patch to sample
-    cdef public intp_t min_patch_width                # Minimum width of the patch to sample
-    cdef public intp_t data_height                    # Height of the input data
-    cdef public intp_t data_width                     # Width of the input data
     cdef public intp_t ndim                           # The number of dimensions of the input data
+
+    cdef const intp_t[:] data_dims                    # The dimensions of the input data
+    cdef const intp_t[:] min_patch_dims               # The minimum size of the patch to sample in each dimension
+    cdef const intp_t[:] max_patch_dims               # The maximum size of the patch to sample in each dimension
+    cdef const uint8_t[:] dim_contiguous              # A boolean array indicating whether each dimension is contiguous

-    cdef const intp_t[:] data_dims                    # The dimensions of the input data
-    cdef const intp_t[:] min_patch_dims               # The minimum size of the patch to sample in each dimension
-    cdef const intp_t[:] max_patch_dims               # The maximum size of the patch to sample in each dimension
-    cdef const uint8_t[:] dim_contiguous              # A boolean array indicating whether each dimension is contiguous
+    # TODO: assumes all oblique splitters only work with dense data
+    cdef public memoryview X_reshaped
+    cdef memoryview patch_nd_indices
+
+    # per-dimension pools of indices used to sample discontiguous dimensions
+    # via Fisher-Yates shuffling; filled in `__cinit__` and reused across draws
+    cdef vector[vector[intp_t]] random_indices

     # TODO: check if this works and is necessary for discontiguous data
     # cdef intp_t[:] stride_offsets  # The stride offsets for each dimension

@@ -56,13 +52,13 @@ cdef class PatchSplitter(BestObliqueSplitter):

     cdef intp_t[::1] _index_data_buffer
     cdef intp_t[::1] _index_patch_buffer

-    cdef intp_t[:] patch_dims_buff                # A buffer to store the dimensions of the sampled patch
+    cdef intp_t[:] patch_sampled_size             # A buffer to store the dimensions of the sampled patch
     cdef intp_t[:] unraveled_patch_point          # A buffer to store the unraveled patch point

     # All oblique splitters (i.e. non-axis aligned splitters) require a
     # function to sample a projection matrix that is applied to the feature matrix
     # to quickly obtain the sampled projections for candidate splits.
-    cdef (intp_t, intp_t) sample_top_left_seed(
+    cdef intp_t sample_top_left_seed(
         self
     ) noexcept nogil

diff --git a/treeple/tree/manifold/_morf_splitter.pyx b/treeple/tree/manifold/_morf_splitter.pyx
index 1e91ad8f..f335999c 100644
--- a/treeple/tree/manifold/_morf_splitter.pyx
+++ b/treeple/tree/manifold/_morf_splitter.pyx
@@ -5,6 +5,8 @@
 # cython: wraparound=False
 # cython: initializedcheck=False

+cimport numpy as cnp
+
 import numpy as np

 from cython.operator cimport dereference as deref
@@ -12,7 +14,24 @@ from libcpp.vector cimport vector

 from ..._lib.sklearn.tree._criterion cimport Criterion
 from ..._lib.sklearn.tree._utils cimport rand_int
-from .._utils cimport ravel_multi_index_cython, unravel_index_cython
+from ..._lib.sklearn.tree._tree cimport ParentInfo
+from .._sklearn_splitter cimport sort
+from .._utils cimport (
+    compute_vectorized_indices_from_cartesian,
+    fisher_yates_shuffle,
+    init_2dmemoryview,
+    ravel_multi_index_cython,
+    unravel_index_cython,
+)
+
+cdef float64_t INFINITY = np.inf
+
+# Mitigate precision differences between 32 bit and 64 bit
+cdef float32_t FEATURE_THRESHOLD = 1e-7
+
+cdef inline void _init_split(ObliqueSplitRecord* self, intp_t start_pos) noexcept nogil:
+    self.impurity_left = INFINITY
+    self.impurity_right = INFINITY
+    self.pos = start_pos
+    self.feature = 0
+    self.threshold = 0.
+ self.improvement = -INFINITY + self.missing_go_to_left = False + self.n_missing = 0 cdef class PatchSplitter(BestObliqueSplitter): @@ -33,46 +52,11 @@ cdef class PatchSplitter(BestObliqueSplitter): const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, ) except -1: + # store a view to the (n_samples, n_features) dataset BestObliqueSplitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) - return 0 - - cdef int node_reset( - self, - intp_t start, - intp_t end, - float64_t* weighted_n_node_samples - ) except -1 nogil: - """Reset splitter on node samples[start:end]. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - start : intp_t - The index of the first sample to consider - end : intp_t - The index of the last sample to consider - weighted_n_node_samples : ndarray, dtype=float64_t pointer - The total weight of those samples - """ - - self.start = start - self.end = end - - self.criterion.init(self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples) - self.criterion.set_sample_pointers(start, end) - - weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples - - # Clear all projection vectors - for i in range(self.max_features): - self.proj_mat_weights[i].clear() - self.proj_mat_indices[i].clear() + # store a reshaped view of (n_samples, height, width) dataset + self.X_reshaped = init_2dmemoryview(X, self.data_dims) return 0 cdef void sample_proj_mat( @@ -87,32 +71,11 @@ cdef class PatchSplitter(BestObliqueSplitter): """ pass - cdef (intp_t, intp_t) sample_top_left_seed(self) noexcept nogil: + cdef intp_t sample_top_left_seed(self) noexcept nogil: pass -cdef class BaseDensePatchSplitter(PatchSplitter): - cdef int init( - self, - object X, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - const uint8_t[::1] missing_values_in_feature_mask, - # const int32_t[:] n_categories - ) except -1: - """Initialize the splitter - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. 
-        """
-        # Call parent init
-        PatchSplitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
-
-        self.X = X
-        return 0
-
-
-cdef class BestPatchSplitter(BaseDensePatchSplitter):
+cdef class BestPatchSplitter(PatchSplitter):
     def __cinit__(
         self,
         Criterion criterion,
@@ -151,7 +114,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
         self.data_dims = data_dims

         # create a buffer for storing the patch dimensions sampled per projection matrix
-        self.patch_dims_buff = np.zeros(data_dims.shape[0], dtype=np.intp)
+        self.patch_sampled_size = np.zeros(data_dims.shape[0], dtype=np.intp)
         self.unraveled_patch_point = np.zeros(data_dims.shape[0], dtype=np.intp)

         # store the min and max patch dimension constraints
@@ -159,6 +122,19 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
         self.max_patch_dims = max_patch_dims
         self.dim_contiguous = dim_contiguous

+        # create a buffer to store the n-dimensional indices of the patch;
+        # row `dim` holds up to `max_patch_dims[dim]` indices for that dimension
+        self.patch_nd_indices = np.zeros((self.ndim, np.max(self.max_patch_dims)), dtype=np.intp)
+
+        # store random indices for discontiguous sampling if necessary
+        # these are initialized here to allow fisher-yates shuffling without having
+        # to re-allocate memory
+        self.random_indices = vector[vector[intp_t]](self.ndim)
+        for idx in range(self.ndim):
+            if not self.dim_contiguous[idx]:
+                self.random_indices[idx].reserve(self.max_patch_dims[idx])
+                for jdx in range(self.max_patch_dims[idx]):
+                    self.random_indices[idx].push_back(jdx)
+
         # initialize a buffer to allow for Fisher-Yates
         self._index_patch_buffer = np.zeros(np.max(self.max_patch_dims), dtype=np.intp)
         self._index_data_buffer = np.zeros(np.max(self.data_dims), dtype=np.intp)
@@ -192,23 +168,167 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
             self.feature_weight.base if self.feature_weight is not None else None,
         ), self.__getstate__())

-    cdef inline (intp_t, intp_t) sample_top_left_seed(
+    cdef int node_split(
+        self,
+        ParentInfo* parent_record,
+        SplitRecord* split,
+    ) except -1 nogil:
+        """Find the best split on node samples[start:end].
+
+        Returns -1 in case of failure to allocate memory (and raise MemoryError)
+        or 0 otherwise.
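+
+        Each candidate projection vector is drawn via ``sample_proj_vec``
+        just before its feature values are computed over the node samples.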
+        """
+        # typecast the pointer to an ObliqueSplitRecord
+        cdef ObliqueSplitRecord* oblique_split = <ObliqueSplitRecord*>(split)
+
+        # Draw random splits and pick the best
+        cdef intp_t[::1] samples = self.samples
+        cdef intp_t start = self.start
+        cdef intp_t end = self.end
+
+        # pointer array to store feature values to split on
+        cdef float32_t[::1] feature_values = self.feature_values
+        cdef intp_t max_features = self.max_features
+        cdef intp_t min_samples_leaf = self.min_samples_leaf
+
+        # keep track of the split record for the current node and the best split
+        # found among the sampled projection vectors
+        cdef ObliqueSplitRecord best_split, current_split
+        cdef float64_t current_proxy_improvement = -INFINITY
+        cdef float64_t best_proxy_improvement = -INFINITY
+
+        cdef float64_t impurity = parent_record.impurity
+
+        cdef intp_t feat_i, p, j    # index over computed features and start/end
+        cdef intp_t partition_end
+        cdef float32_t temp_d       # to compute a projection feature value
+
+        # instantiate the split records
+        _init_split(&best_split, end)
+
+        # For every vector in the projection matrix
+        for feat_i in range(max_features):
+            # Projection vector has no nonzeros
+            if self.proj_mat_weights[feat_i].empty():
+                continue
+
+            # XXX: 'feature' is not actually used in oblique split records
+            # Just indicates which split was sampled
+            current_split.feature = feat_i
+
+            # sample the projection vector
+            self.sample_proj_vec(self.proj_mat_weights[feat_i], self.proj_mat_indices[feat_i])
+            current_split.proj_vec_weights = &self.proj_mat_weights[feat_i]
+            current_split.proj_vec_indices = &self.proj_mat_indices[feat_i]
+
+            # Compute linear combination of features and then
+            # sort samples according to the feature values.
+            self.compute_features_over_samples(
+                start,
+                end,
+                samples,
+                feature_values,
+                &self.proj_mat_weights[feat_i],
+                &self.proj_mat_indices[feat_i]
+            )
+
+            # Sort the samples
+            sort(&feature_values[start], &samples[start], end - start)
+
+            # Evaluate all splits
+            self.criterion.reset()
+            p = start
+            while p < end:
+                while (p + 1 < end and feature_values[p + 1] <= feature_values[p] + FEATURE_THRESHOLD):
+                    p += 1
+
+                p += 1
+
+                if p < end:
+                    current_split.pos = p
+
+                    # Reject if min_samples_leaf is not guaranteed
+                    if (((current_split.pos - start) < min_samples_leaf) or
+                            ((end - current_split.pos) < min_samples_leaf)):
+                        continue
+
+                    self.criterion.update(current_split.pos)
+                    # Reject if min_weight_leaf is not satisfied
+                    if self.check_postsplit_conditions() == 1:
+                        continue
+
+                    current_proxy_improvement = \
+                        self.criterion.proxy_impurity_improvement()
+
+                    if current_proxy_improvement > best_proxy_improvement:
+                        best_proxy_improvement = current_proxy_improvement
+                        # sum of halves is used to avoid infinite value
+                        current_split.threshold = feature_values[p - 1] / 2.0 + feature_values[p] / 2.0
+
+                        if (
+                            (current_split.threshold == feature_values[p]) or
+                            (current_split.threshold == INFINITY) or
+                            (current_split.threshold == -INFINITY)
+                        ):
+                            current_split.threshold = feature_values[p - 1]
+
+                        best_split = current_split  # copy
+
+        # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end]
+        if best_split.pos < end:
+            partition_end = end
+            p = start
+
+            while p < partition_end:
+                # Account for projection vector
+                temp_d = 0.0
+                for j in range(best_split.proj_vec_indices.size()):
+                    temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\
+                              deref(best_split.proj_vec_weights)[j]
+
+                if temp_d <= best_split.threshold:
+                    p += 1
+
+                else:
+                    partition_end -= 1
+                    samples[p], samples[partition_end] = \
+                        samples[partition_end], samples[p]
+
+            self.criterion.reset()
+            self.criterion.update(best_split.pos)
+            self.criterion.children_impurity(&best_split.impurity_left,
+                                             &best_split.impurity_right)
+            best_split.improvement = self.criterion.impurity_improvement(
+                impurity, best_split.impurity_left, best_split.impurity_right)
+
+        # Return values
+        deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices
+        deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights
+        deref(oblique_split).feature = best_split.feature
+        deref(oblique_split).pos = best_split.pos
+        deref(oblique_split).threshold = best_split.threshold
+        deref(oblique_split).improvement = best_split.improvement
+        deref(oblique_split).impurity_left = best_split.impurity_left
+        deref(oblique_split).impurity_right = best_split.impurity_right
+
+        # XXX: Fix when we can track constants
+        parent_record.n_constant_features = 0
+        return 0
+
+    cdef inline intp_t sample_top_left_seed(
         self
     ) noexcept nogil:
-        """Sample the top-left seed for the n-dim patch.
+        """Sample the top-left seed and the patch size for the n-dim patch.

         Returns
         -------
         top_left_seed : intp_t
             The top-left seed vectorized (i.e. raveled) for the n-dim patch.
-        patch_size : intp_t
-            The total size of the n-dim patch (i.e. the volume).
         """
         # now get the top-left seed that is used to then determine the top-left
         # position in patch
         # compute top-left seed for the multi-dimensional patch
         cdef intp_t top_left_patch_seed
-        cdef intp_t patch_size = 1

         cdef uint32_t* random_state = &self.rand_r_state
@@ -240,8 +360,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
             top_left_patch_seed = rand_int(0, delta_patch_dim, random_state)

             # write to buffer
-            self.patch_dims_buff[idx] = patch_dim
-            patch_size *= patch_dim
+            self.patch_sampled_size[idx] = patch_dim
         elif self.boundary == "wrap":
             # add circular boundary conditions
             delta_patch_dim = self.data_dims[idx] + 2 * (patch_dim - 1)
@@ -254,20 +373,94 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
             # resample the patch dimension due to padding
             patch_dim = min(patch_dim, min(dim+1, self.data_dims[idx] + patch_dim - dim - 1))

-            self.patch_dims_buff[idx] = patch_dim
-            patch_size *= patch_dim
+            self.patch_sampled_size[idx] = patch_dim

             # TODO: make this work
             # Convert the top-left-seed value to it's appropriate index in the full image.
             top_left_patch_seed = max(0, dim - patch_dim + 1)

-            unraveled_patch_point[idx] = top_left_patch_seed
+            self.unraveled_patch_point[idx] = top_left_patch_seed
+
+        top_left_patch_seed = ravel_multi_index_cython(
+            self.unraveled_patch_point,
+            self.data_dims
+        )
+        return top_left_patch_seed
+
+    cdef inline intp_t sample_top_left_seed_v2(
+        self
+    ) noexcept nogil:
+        """Sample the top-left seed and the patch size for the n-dim patch.
+
+        Returns
+        -------
+        top_left_seed : intp_t
+            The top-left seed vectorized (i.e. raveled) for the n-dim patch.
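+
+        Notes
+        -----
+        As a side effect, the sampled patch size for each dimension is written
+        to ``self.patch_sampled_size`` and the per-dimension patch indices are
+        written to ``self.patch_nd_indices``.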
+        """
+        # the top-left position in the patch for each dimension
+        cdef intp_t top_left_patch_seed
+
+        cdef uint32_t* random_state = &self.rand_r_state
+
+        # define parameters for the random patch
+        cdef intp_t patch_dim
+        cdef intp_t delta_patch_dim
+        cdef intp_t dim, jdx
+
+        # pre-allocated buffer to store the unraveled patch top left seed point
+        cdef intp_t[:] unraveled_patch_point = self.unraveled_patch_point
+
+        for dim in range(self.ndim):
+            # compute random patch width and height
+            # Note: By constraining max patch height/width to be at least the min
+            # patch height/width this ensures that the minimum value of
+            # patch_height and patch_width is 1
+            patch_dim = rand_int(
+                self.min_patch_dims[dim],
+                self.max_patch_dims[dim] + 1,
+                random_state
+            )
+
+            if not self.dim_contiguous[dim]:
+                # Fisher-Yates shuffle the discontiguous dimension, so we can sample
+                # the indices of the patch in that dimension without replacement
+                fisher_yates_shuffle(self.random_indices[dim], self.random_indices[dim].size(), random_state)
+
+            # sample the top-left index and patch size for this dimension based on boundary effects
+            if self.boundary is None:
+                # compute the difference between the image dimensions and the current
+                # random patch dimensions for sampling
+                delta_patch_dim = (self.data_dims[dim] - patch_dim) + 1
+                top_left_patch_seed = rand_int(0, delta_patch_dim, random_state)
+
+                # write to buffer
+                self.patch_sampled_size[dim] = patch_dim
+            elif self.boundary == "wrap":
+                # TODO: wrap boundary conditions are not supported here yet
+                pass
+
+            # now add the relevant indices for each element of the patch
+            for jdx in range(patch_dim):
+                if self.dim_contiguous[dim]:
+                    self.patch_nd_indices[dim][jdx] = top_left_patch_seed + jdx
+                else:
+                    # if discontiguous, we will perform random sampling
+                    self.patch_nd_indices[dim][jdx] = self.random_indices[dim][jdx]
+
+            unraveled_patch_point[dim] = top_left_patch_seed
+
+        # get the vectorized index of the top-left seed
+        top_left_patch_seed = ravel_multi_index_cython(
+            unraveled_patch_point,
+            self.data_dims
+        )
+        return top_left_patch_seed

     cdef void sample_proj_mat(
         self,
@@ -285,24 +478,19 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
         # and top-left seed
         cdef intp_t top_left_patch_seed

-        # size of the sampled patch, which is just the size of the n-dim patch
-        # (\prod_i self.patch_dims_buff[i])
-        cdef intp_t patch_size
-
         for proj_i in range(0, max_features):
             # now get the top-left seed that is used to then determine the top-left
             # position in patch
             # compute top-left seed for the multi-dimensional patch
-            top_left_patch_seed, patch_size = self.sample_top_left_seed()
+            top_left_patch_seed = self.sample_top_left_seed()

             # sample a projection vector given the top-left seed point in n-dimensional space
             self.sample_proj_vec(
                 proj_mat_weights,
                 proj_mat_indices,
                 proj_i,
-                patch_size,
                 top_left_patch_seed,
-                self.patch_dims_buff
+                self.patch_sampled_size
             )

     cdef void sample_proj_vec(
         self,
@@ -310,7 +498,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
         vector[vector[float32_t]]& proj_mat_weights,
         vector[vector[intp_t]]& proj_mat_indices,
         intp_t proj_i,
-        intp_t patch_size,
         intp_t top_left_patch_seed,
         const intp_t[:] patch_dims,
     ) noexcept nogil:

         # iterates over the size of the patch
         cdef intp_t patch_idx
+        cdef intp_t i
+
+        # size of the sampled patch, which is just the size of the n-dim patch
+        # (\prod_i self.patch_sampled_size[i])
+        cdef intp_t patch_size = 1
+        for i in range(0, self.ndim):
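+            # accumulate the product of the sampled patch dimensions (the patch volume)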
patch_size *= patch_dims[i] + # stores how many patches we have iterated so far cdef intp_t vectorized_patch_offset cdef intp_t vectorized_point_offset @@ -333,7 +528,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): cdef intp_t other_dims_offset cdef intp_t row_index - cdef intp_t i cdef intp_t num_rows = self.data_dims[0] if self._discontiguous: # fill with values 0, 1, ..., dimension - 1 @@ -392,7 +586,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): if not self.dim_contiguous[idx]: row_index += ( (self.unraveled_patch_point[idx] // other_dims_offset) % - self.patch_dims_buff[idx] + self.patch_sampled_size[idx] ) * other_dims_offset other_dims_offset //= self.data_dims[idx] @@ -404,6 +598,166 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): proj_mat_indices[proj_i].push_back(vectorized_point) proj_mat_weights[proj_i].push_back(weight) + cdef void sample_proj_vec_v2( + self, + vector[vector[float32_t]]& proj_mat_weights, + vector[vector[intp_t]]& proj_mat_indices, + vector[intp_t]& top_left_seed, + intp_t proj_i, + const intp_t[:] patch_dims, + ) noexcept nogil: + cdef uint32_t* random_state = &self.rand_r_state + + # iterates over the size of the patch + cdef intp_t patch_idx + + # stores how many patches we have iterated so far + cdef intp_t vectorized_patch_offset + cdef intp_t vectorized_point_offset + cdef intp_t vectorized_point + + cdef intp_t dim_idx + + # weights are default to 1 + cdef float32_t weight = 1. + + # XXX: still unsure if it works yet + # XXX: THIS ONLY WORKS FOR THE FIRST DIMENSION THAT IS DISCONTIGUOUS. + cdef intp_t other_dims_offset + cdef intp_t row_index + + cdef intp_t i + cdef intp_t num_rows = self.data_dims[0] + + # iterate over the patch and store the vectorized index + cdef vector[intp_t] index_arr = vector[intp_t](self.ndim) + for dim in range(top_left_seed.size()): + index_arr[dim] = top_left_seed[dim] + + # fisher_yates_shuffle(dim_index_arr, patch_dims[dim], random_state) + + # iterate over the indices of the patch, and get the resulting + # raveled index at each patch-point, resulting in now a 1D vector + # of indices representing each patch-point. + while True: + # Note: we iterate over the last axis first assuming that data + # is stored in a C-contiguous array. + for dim in range(self.ndim - 1, -1, -1): + if self.dim_contiguous[dim]: + index_arr[dim] += 1 + + # check we have not reached the boundaries of the patch. 
If we have not, + # we can break out of the loop and continue iterating in this dimension + if index_arr[dim] < top_left_seed[dim] + patch_dims[dim]: + break + + # if we have reached the boundary, we reset the index and continue + index_arr[dim] = top_left_seed[dim] + else: + random_index = random_indices[dim][r_idx] + # dimension is assumed discontiguous, and thus is randomly chosen + index_arr[dim] = rand_int(0, self.data_dims[dim], random_state) + + discontiguous_index += 1 + + if r_idx < patch_dims[dim]: + idx[dim] = random_indices[dim][r_idx] + r_idx += 1 + break + + r_idx = 0 + # if we have reached the boundary, we reset the index and continue + index_arr[dim] = top_left_seed[dim] + + # if self.dim_contiguous[dim] and + # break + # elif not self.dim_contiguous[dim] and index > patch_dims[dim]: + # break + else: + break + + # # get the vectorized version of this index + # # dim_idx = ravel_multi_index_cython( + # # index_arr, + # # self.data_dims + # # ) + + # proj_mat_indices[proj_i].push_back(vectorized_point) + # proj_mat_weights[proj_i].push_back(weight) + + # idx[dim] = start[dim] + # TODO: iterate correctly over multi-dim array + # for idx in range(len(self.data_dims[dim])): + # unvectorized_idx[dim] = top_left_seed[dim] + idx + # else: + # unvectorized_idx[dim] = top_left_seed[dim] + rand_int(0, patch_dims[dim], random_state) + + # # get the vectorized version of this index + # dim_idx = ravel_multi_index_cython( + # unvectorized_idx, + # self.data_dims + # ) + + # proj_mat_indices[proj_i].push_back(vectorized_point) + # proj_mat_weights[proj_i].push_back(weight) + + + # for patch_idx in range(patch_size): + # # keep track of which dimensions of the patch we have iterated over + # vectorized_patch_offset = 1 + + # # Once the vectorized top-left-seed is unraveled, you can add the patch + # # points in the array structure and compute their vectorized (unraveled) + # # points, which are added to the projection vector + # unravel_index_cython(top_left_patch_seed, self.data_dims, self.unraveled_patch_point) + + # for dim_idx in range(self.ndim): + # # compute the offset from the top-left patch seed based on: + # # 1. the current patch index + # # 2. the patch dimension indexed by `dim_idx` + # # 3. 
and the vectorized patch dimensions that we have seen so far + # # the `vectorized_point_offset` is the offset from the top-left vectorized seed for this dimension + # vectorized_point_offset = (patch_idx // (vectorized_patch_offset)) % patch_dims[dim_idx] + + # # then we compute the actual point in the original data shape + # self.unraveled_patch_point[dim_idx] = self.unraveled_patch_point[dim_idx] + vectorized_point_offset + # vectorized_patch_offset *= patch_dims[dim_idx] + + # # if any dimensions are discontiguous, we want to migrate the entire axis a fixed amount + # # based on the shuffling + # if self._discontiguous is True: + # for dim_idx in range(self.ndim): + # if self.dim_contiguous[dim_idx] is True: + # continue + + # # determine the "row" we are currently on + # # other_dims_offset = 1 + # # for idx in range(dim_idx + 1, self.ndim): + # # other_dims_offset *= self.data_dims[idx] + # # row_index = self.unraveled_patch_point[dim_idx] % other_dims_offset + # # determine the "row" we are currently on + # other_dims_offset = 1 + # for idx in range(dim_idx + 1, self.ndim): + # if not self.dim_contiguous[idx]: + # other_dims_offset *= self.data_dims[idx] + + # row_index = 0 + # for idx in range(dim_idx + 1, self.ndim): + # if not self.dim_contiguous[idx]: + # row_index += ( + # (self.unraveled_patch_point[idx] // other_dims_offset) % + # self.patch_sampled_size[idx] + # ) * other_dims_offset + # other_dims_offset //= self.data_dims[idx] + + # # assign random row index now + # self.unraveled_patch_point[dim_idx] = self._index_patch_buffer[row_index] + + # # ravel the patch point into the original data dimensions + # vectorized_point = ravel_multi_index_cython(self.unraveled_patch_point, self.data_dims) + # proj_mat_indices[proj_i].push_back(vectorized_point) + # proj_mat_weights[proj_i].push_back(weight) + cdef void compute_features_over_samples( self, intp_t start, @@ -411,18 +765,62 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): const intp_t[:] samples, float32_t[:] feature_values, vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[intp_t]* proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + + Parameters + ---------- + start : int + The start index of the samples. + end : int + The end index of the samples. + samples : array-like, shape (n_samples,) + The indices of the samples. + feature_values : array-like, shape (n_samples,) + The pre-allocated buffer to store the feature values. + proj_vec_weights : array-like, shape (max_features,) + The weights of the projection vector. + proj_vec_indices : array-like, shape (max_features,) + The indices of the projection vector. This will only store the vectorized + index of the top-left-seed. 
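+
+        Notes
+        -----
+        ``proj_vec_indices`` is assumed to hold exactly one entry (the raveled
+        top-left seed); the remaining patch indices are reconstructed from
+        ``self.patch_nd_indices`` via ``compute_vectorized_indices_from_cartesian``.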
""" cdef intp_t idx, jdx # initialize feature weight to normalize across patch cdef float32_t patch_weight + if proj_vec_indices.size() != 1: + with gil: + raise ValueError("proj_vec_indices should only have one element corresponding to the top left seed.") + + # patch dimensions + cdef intp_t[:] patch_dims = self.patch_sampled_size + cdef vector[intp_t] top_left_seed = vector[intp_t](self.ndim) + + cdef intp_t volume_of_patch = 1 + # Calculate the total number of Cartesian products + for i in range(self.ndim): + volume_of_patch *= len(self.patch_nd_indices[i]) + + # create a buffer to store the raveled indices of the patch, which will be used + # to compute the relevant feature values + cdef vector[intp_t] raveled_patch_indices = vector[intp_t](volume_of_patch) + for jdx in range(0, proj_vec_indices.size()): + # get the index of the top-left patch + top_left_patch_index = deref(proj_vec_indices)[jdx] + + # compute the raveled index of all the points in the patch + compute_vectorized_indices_from_cartesian( + top_left_patch_index, + self.patch_nd_indices, + self.data_dims, + raveled_patch_indices + ) + # Compute linear combination of features and then # sort samples according to the feature values. for idx in range(start, end): @@ -430,15 +828,17 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # initialize the feature value to 0 feature_values[idx] = 0 - for jdx in range(0, proj_vec_indices.size()): + for kdx in range(raveled_patch_indices.size()): + feature_index = raveled_patch_indices[kdx] + feature_values[idx] += self.X[ - samples[idx], deref(proj_vec_indices)[jdx] + samples[idx], feature_index ] * deref(proj_vec_weights)[jdx] - if self.feature_weight is not None: - # gets the feature weight for this specific column from X - # the default of feature_weights[i] is (1/n_features) for all i - patch_weight += self.feature_weight[samples[idx], deref(proj_vec_indices)[jdx]] + if self.feature_weight is not None: + # gets the feature weight for this specific column from X + # the default of feature_weights[i] is (1/n_features) for all i + patch_weight += self.feature_weight[samples[idx], deref(proj_vec_indices)[jdx]] if self.feature_weight is not None: feature_values[idx] /= patch_weight @@ -448,7 +848,7 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): """A class to expose a Python interface for testing.""" cpdef sample_top_left_seed_cpdef(self): top_left_patch_seed, patch_size = self.sample_top_left_seed() - patch_dims = np.array(self.patch_dims_buff, dtype=np.intp) + patch_dims = np.array(self.patch_sampled_size, dtype=np.intp) return top_left_patch_seed, patch_size, patch_dims cpdef sample_projection_vector( @@ -467,7 +867,6 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): proj_mat_weights, proj_mat_indices, proj_i, - patch_size, top_left_patch_seed, patch_dims ) diff --git a/treeple/tree/tests/meson.build b/treeple/tree/tests/meson.build index b0ccb9b5..cbdd38f7 100644 --- a/treeple/tree/tests/meson.build +++ b/treeple/tree/tests/meson.build @@ -8,6 +8,7 @@ python_sources = [ 'test_unsupervised_tree.py', 'test_multiview.py', 'test_kernel.py', + 'test_manifold.py', ] py.install_sources( diff --git a/treeple/tree/tests/test_manifold.py b/treeple/tree/tests/test_manifold.py new file mode 100644 index 00000000..f352cc1e --- /dev/null +++ b/treeple/tree/tests/test_manifold.py @@ -0,0 +1,89 @@ +import joblib +import numpy as np +import pytest +from sklearn import datasets +from numpy.testing import assert_almost_equal, assert_array_equal +from sklearn.base import 
is_classifier +from sklearn.datasets import load_iris, make_blobs +from sklearn.tree._tree import TREE_LEAF + +from treeple.tree import ( + ExtraObliqueDecisionTreeClassifier, + ExtraObliqueDecisionTreeRegressor, + ObliqueDecisionTreeClassifier, + ObliqueDecisionTreeRegressor, + PatchObliqueDecisionTreeClassifier, + PatchObliqueDecisionTreeRegressor, + UnsupervisedDecisionTree, + UnsupervisedObliqueDecisionTree, +) + +ALL_TREES = [ + ExtraObliqueDecisionTreeRegressor, + ObliqueDecisionTreeRegressor, + PatchObliqueDecisionTreeRegressor, + ExtraObliqueDecisionTreeClassifier, + ObliqueDecisionTreeClassifier, + PatchObliqueDecisionTreeClassifier, + UnsupervisedDecisionTree, + UnsupervisedObliqueDecisionTree, +] + +rng = np.random.RandomState(1) + +# load digits dataset and randomly permute it +digits = datasets.load_digits() +perm = rng.permutation(digits.target.size) +digits.data = digits.data[perm] +digits.target = digits.target[perm] + + + +def test_splitters(): + """Test that splitters are picklable.""" + X, y = digits.data, digits.target + X = X.astype(np.float32) + sample_weight = np.ones(len(y), dtype=np.float64).squeeze() + y = y.reshape(-1, 1).astype(np.float64) + missing_values_in_feature_mask = None + + print(X.shape, y.shape) + from treeple._lib.sklearn.tree._criterion import Gini + from treeple.tree.manifold._morf_splitter import BestPatchSplitterTester + + criterion = Gini(1, np.array((0, 1), dtype=np.intp)) + max_features = 6 + min_samples_leaf = 1 + min_weight_leaf = 0.0 + monotonic_cst = np.array([1, 1, 1, 1, 1, 1], dtype=np.int8) + random_state = np.random.RandomState(100) + boundary = None + feature_weight = None + min_patch_dims = np.array((1, 1), dtype=np.intp) + max_patch_dims = np.array((3, 1), dtype=np.intp) + dim_contiguous = np.array((True, True)) + data_dims = np.array((8, 8), dtype=np.intp) + + feature_combinations = 1.5 + + splitter = BestPatchSplitterTester( + criterion, + max_features, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + feature_combinations, + min_patch_dims, + max_patch_dims, + dim_contiguous, + data_dims, + boundary, + feature_weight, + ) + splitter.init_test( + X, + y, + sample_weight, + ) + assert_array_equal(splitter.X_reshaped.reshape(-1, 64), X)
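+
+
+def test_ravel_unravel_round_trip():
+    """Round-trip the raveling helpers used by the patch splitter.
+
+    A minimal sanity check, assuming the cpdef testing helpers
+    ``unravel_index`` and ``ravel_multi_index`` remain exposed from
+    ``treeple.tree._utils``; both should mirror NumPy's behaviour.
+    """
+    from treeple.tree._utils import ravel_multi_index, unravel_index
+
+    shape = np.array((8, 8), dtype=np.intp)
+    for index in [0, 1, 7, 8, 63]:
+        coords = unravel_index(index, shape)
+        # must agree with NumPy's unraveling of the same flat index
+        assert_array_equal(coords, np.unravel_index(index, shape))
+        # raveling the coordinates must recover the original flat index
+        assert ravel_multi_index(coords, shape) == index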