Clean up Cython files

Signed-off-by: Adam Li <adam2392@gmail.com>
neurodata · Sep 5, 2024 · 8c4a7f6 · 8c4a7f6
1 parent ea67d06
commit 8c4a7f6
Show file tree

Hide file tree

Showing 7 changed files with 109 additions and 78 deletions.
diff --git a/treeple/tree/_classes.py b/treeple/tree/_classes.py
@@ -820,7 +820,7 @@ class ObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):
 
     tree_type = "oblique"
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeClassifier._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
@@ -1283,7 +1283,7 @@ class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
 
     tree_type = "oblique"
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeRegressor._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
@@ -1684,7 +1684,7 @@ class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier)
     """
 
     tree_type = "oblique"
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeClassifier._parameter_constraints,
         "min_patch_dims": ["array-like", None],
         "max_patch_dims": ["array-like", None],
@@ -2166,7 +2166,7 @@ class PatchObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
     """
 
     tree_type = "oblique"
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeRegressor._parameter_constraints,
         "min_patch_dims": ["array-like", None],
         "max_patch_dims": ["array-like", None],
@@ -2669,7 +2669,7 @@ class ExtraObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier)
 
     tree_type = "oblique"
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeClassifier._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
@@ -3069,7 +3069,7 @@ class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
         -0.26552594, -0.00642017, -0.07108117, -0.40726765, -0.40315294])
     """
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeRegressor._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),

diff --git a/treeple/tree/_oblique_splitter.pxd b/treeple/tree/_oblique_splitter.pxd
@@ -83,12 +83,6 @@ cdef class BaseObliqueSplitter(Splitter):
         SplitRecord* split,
     ) except -1 nogil
 
-    cdef inline void fisher_yates_shuffle_memview(
-        self,
-        intp_t[::1] indices_to_sample,
-        intp_t grid_size,
-        uint32_t* random_state
-    ) noexcept nogil
 
 cdef class ObliqueSplitter(BaseObliqueSplitter):
     # The splitter searches in the input space for a linear combination of features and a threshold

diff --git a/treeple/tree/_oblique_splitter.pyx b/treeple/tree/_oblique_splitter.pyx
@@ -11,6 +11,7 @@ from libcpp.vector cimport vector
 
 from .._lib.sklearn.tree._criterion cimport Criterion
 from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform
+from ._utils cimport fisher_yates_shuffle
 
 
 cdef float64_t INFINITY = np.inf
@@ -46,8 +47,12 @@ cdef class BaseObliqueSplitter(Splitter):
     def __setstate__(self, d):
         pass
 
-    cdef int node_reset(self, intp_t start, intp_t end,
-                        float64_t* weighted_n_node_samples) except -1 nogil:
+    cdef int node_reset(
+        self,
+        intp_t start,
+        intp_t end,
+        float64_t* weighted_n_node_samples
+    ) except -1 nogil:
         """Reset splitter on node samples[start:end].
 
         Returns -1 in case of failure to allocate memory (and raise MemoryError)
@@ -62,17 +67,7 @@ cdef class BaseObliqueSplitter(Splitter):
         weighted_n_node_samples : ndarray, dtype=float64_t pointer
             The total weight of those samples
         """
-
-        self.start = start
-        self.end = end
-
-        self.criterion.init(self.y,
-                            self.sample_weight,
-                            self.weighted_n_samples,
-                            self.samples)
-        self.criterion.set_sample_pointers(start, end)
-
-        weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples
+        Splitter.node_reset(self, start, end, weighted_n_node_samples)
 
         # Clear all projection vectors
         for i in range(self.max_features):
@@ -102,8 +97,8 @@ cdef class BaseObliqueSplitter(Splitter):
         intp_t end,
         const intp_t[:] samples,
         float32_t[:] feature_values,
-        vector[float32_t]* proj_vec_weights,  # weights of the vector (max_features,)
-        vector[intp_t]* proj_vec_indices    # indices of the features (max_features,)
+        vector[float32_t]* proj_vec_weights,  # weights of the vector (n_non_zeros,)
+        vector[intp_t]* proj_vec_indices    # indices of the features (n_non_zeros,)
     ) noexcept nogil:
         """Compute the feature values for the samples[start:end] range.
 
@@ -126,19 +121,6 @@ cdef class BaseObliqueSplitter(Splitter):
                     feature_values[idx] = 0.0
                 feature_values[idx] += self.X[samples[idx], col_idx] * col_weight
 
-    cdef inline void fisher_yates_shuffle_memview(
-        self,
-        intp_t[::1] indices_to_sample,
-        intp_t grid_size,
-        uint32_t* random_state,
-    ) noexcept nogil:
-        cdef intp_t i, j
-
-        # XXX: should this be `i` or `i+1`? for valid Fisher-Yates?
-        for i in range(0, grid_size - 1):
-            j = rand_int(i, grid_size, random_state)
-            indices_to_sample[j], indices_to_sample[i] = \
-                indices_to_sample[i], indices_to_sample[j]
 
 cdef class ObliqueSplitter(BaseObliqueSplitter):
     def __cinit__(
@@ -257,7 +239,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter):
         cdef intp_t grid_size = self.max_features * self.n_features
 
         # shuffle indices over the 2D grid to sample using Fisher-Yates
-        self.fisher_yates_shuffle_memview(indices_to_sample, grid_size, random_state)
+        fisher_yates_shuffle(indices_to_sample, grid_size, random_state)
 
         # sample 'n_non_zeros' in a mtry X n_features projection matrix
         # which consists of +/- 1's chosen at a 1/2s rate
@@ -309,7 +291,7 @@ cdef class BestObliqueSplitter(ObliqueSplitter):
         cdef intp_t end = self.end
 
         # pointer array to store feature values to split on
-        cdef float32_t[::1]  feature_values = self.feature_values
+        cdef float32_t[::1] feature_values = self.feature_values
         cdef intp_t max_features = self.max_features
         cdef intp_t min_samples_leaf = self.min_samples_leaf
 

diff --git a/treeple/tree/_utils.pxd b/treeple/tree/_utils.pxd
@@ -1,3 +1,5 @@
+from libcpp.vector cimport vector
+
 import numpy as np
 
 cimport numpy as cnp
@@ -7,15 +9,41 @@ cnp.import_array()
 from .._lib.sklearn.tree._splitter cimport SplitRecord
 from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t
 
+ctypedef fused vector_or_memview:
+    vector[intp_t]
+    intp_t[::1]
+    intp_t[:]
+
+
+cdef inline void fisher_yates_shuffle(
+    vector_or_memview indices_to_sample,
+    intp_t grid_size,
+    uint32_t* random_state,
+) noexcept nogil
 
-cdef int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil
+
+cdef int rand_weighted_binary(
+    float64_t p0,
+    uint32_t* random_state
+) noexcept nogil
 
 cpdef unravel_index(
-    intp_t index, cnp.ndarray[intp_t, ndim=1] shape
+    intp_t index,
+    cnp.ndarray[intp_t, ndim=1] shape
 )
 
-cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape)
+cpdef ravel_multi_index(
+    intp_t[:] coords,
+    const intp_t[:] shape
+)
 
-cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil
+cdef void unravel_index_cython(
+    intp_t index,
+    const intp_t[:] shape,
+    vector_or_memview coords
+) noexcept nogil
 
-cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil
+cdef intp_t ravel_multi_index_cython(
+    vector_or_memview coords,
+    const intp_t[:] shape
+) noexcept nogil
diff --git a/treeple/tree/_utils.pyx b/treeple/tree/_utils.pyx
@@ -11,10 +11,40 @@ cimport numpy as cnp
 
 cnp.import_array()
 
-from .._lib.sklearn.tree._utils cimport rand_uniform
+from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform
 
 
-cdef inline int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil:
+cdef inline void fisher_yates_shuffle(
+    vector_or_memview indices_to_sample,
+    intp_t grid_size,
+    uint32_t* random_state,
+) noexcept nogil:
+    """Shuffle the indices in place using the Fisher-Yates algorithm.
+    Parameters
+    ----------
+    indices_to_sample : A C++ vector or 1D memoryview
+        The indices to shuffle.
+    grid_size : intp_t
+        The size of the grid to shuffle. This is explicitly passed in
+        to support the templated `vector_or_memview` type, which allows
+        for both C++ vectors and Cython memoryviews. Getting the length
+        of the two types uses different APIs.
+    random_state : uint32_t*
+        The random state.
+    """
+    cdef intp_t i, j
+
+    # NOTE: `j` starts at `i` (not `i+1`) as Fisher-Yates requires; assumes `rand_int` samples [low, high).
+    for i in range(0, grid_size - 1):
+        j = rand_int(i, grid_size, random_state)
+        indices_to_sample[j], indices_to_sample[i] = \
+            indices_to_sample[i], indices_to_sample[j]
+
+
+cdef inline int rand_weighted_binary(
+    float64_t p0,
+    uint32_t* random_state
+) noexcept nogil:
     """Sample from integers 0 and 1 with different probabilities.
 
     Parameters
@@ -54,7 +84,9 @@ cpdef unravel_index(
     index = np.intp(index)
     shape = np.array(shape)
     coords = np.empty(shape.shape[0], dtype=np.intp)
-    unravel_index_cython(index, shape, coords)
+    cdef const intp_t[:] shape_memview = shape
+    cdef intp_t[:] coords_memview = coords
+    unravel_index_cython(index, shape_memview, coords_memview)
     return coords
 
 
@@ -83,7 +115,11 @@ cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape):
     return ravel_multi_index_cython(coords, shape)
 
 
-cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil:
+cdef inline void unravel_index_cython(
+    intp_t index,
+    const intp_t[:] shape,
+    vector_or_memview coords
+) noexcept nogil:
     """Converts a flat index into a tuple of coordinate arrays.
 
     Parameters
@@ -92,13 +128,9 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co
         The flat index to be converted.
     shape : numpy.ndarray[intp_t, ndim=1]
         The shape of the array into which the flat index should be converted.
-    coords : numpy.ndarray[intp_t, ndim=1]
-        A preinitialized memoryview array of coordinate arrays to be converted.
-
-    Returns
-    -------
-    numpy.ndarray[intp_t, ndim=1]
-        An array of coordinate arrays, with each coordinate array having the same shape as the input `shape`.
+    coords : intp_t[:] or vector[intp_t]
+        A preinitialized array of coordinates to store the result of the
+        unraveled `index`.
     """
     cdef intp_t ndim = shape.shape[0]
     cdef intp_t j, size
@@ -109,13 +141,16 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co
         index //= size
 
 
-cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil:
-    """Converts a tuple of coordinate arrays into a flat index.
+cdef inline intp_t ravel_multi_index_cython(
+    vector_or_memview coords,
+    const intp_t[:] shape
+) noexcept nogil:
+    """Converts a tuple of coordinate arrays into a flat index in the vectorized dimension.
 
     Parameters
     ----------
-    coords : numpy.ndarray[intp_t, ndim=1]
-        An array of coordinate arrays to be converted.
+    coords : intp_t[:] or vector[intp_t]
+        An array of coordinates to be converted and vectorized into a single flat index.
     shape : numpy.ndarray[intp_t, ndim=1]
         The shape of the array into which the coordinates should be converted.
 

diff --git a/treeple/tree/manifold/_morf_splitter.pxd b/treeple/tree/manifold/_morf_splitter.pxd
@@ -32,14 +32,6 @@ cdef class PatchSplitter(BestObliqueSplitter):
     # an input data vector. The input data is vectorized, so `data_height` and
     # `data_width` are used to determine the vectorized indices corresponding to
     # (x,y) coordinates in the original un-vectorized data.
-
-    cdef public intp_t max_patch_height                 # Maximum height of the patch to sample
-    cdef public intp_t max_patch_width                  # Maximum width of the patch to sample
-    cdef public intp_t min_patch_height                 # Minimum height of the patch to sample
-    cdef public intp_t min_patch_width                  # Minimum width of the patch to sample
-    cdef public intp_t data_height                      # Height of the input data
-    cdef public intp_t data_width                       # Width of the input data
-
     cdef public intp_t ndim                       # The number of dimensions of the input data
 
     cdef const intp_t[:] data_dims                      # The dimensions of the input data
@@ -56,7 +48,7 @@ cdef class PatchSplitter(BestObliqueSplitter):
 
     cdef intp_t[::1] _index_data_buffer
     cdef intp_t[::1] _index_patch_buffer
-    cdef intp_t[:] patch_dims_buff                # A buffer to store the dimensions of the sampled patch
+    cdef intp_t[:] patch_sampled_size                # A buffer to store the dimensions of the sampled patch
     cdef intp_t[:] unraveled_patch_point          # A buffer to store the unraveled patch point
 
     # All oblique splitters (i.e. non-axis aligned splitters) require a