Skip to content

Commit

Permalink
Clean up Cython files
Browse files Browse the repository at this point in the history
Signed-off-by: Adam Li <adam2392@gmail.com>
  • Loading branch information
adam2392 committed Sep 5, 2024
1 parent ea67d06 commit 8c4a7f6
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 78 deletions.
12 changes: 6 additions & 6 deletions treeple/tree/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ class ObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):

tree_type = "oblique"

_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeClassifier._parameter_constraints,
"feature_combinations": [
Interval(Real, 1.0, None, closed="left"),
Expand Down Expand Up @@ -1283,7 +1283,7 @@ class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):

tree_type = "oblique"

_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeRegressor._parameter_constraints,
"feature_combinations": [
Interval(Real, 1.0, None, closed="left"),
Expand Down Expand Up @@ -1684,7 +1684,7 @@ class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier)
"""

tree_type = "oblique"
_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeClassifier._parameter_constraints,
"min_patch_dims": ["array-like", None],
"max_patch_dims": ["array-like", None],
Expand Down Expand Up @@ -2166,7 +2166,7 @@ class PatchObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
"""

tree_type = "oblique"
_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeRegressor._parameter_constraints,
"min_patch_dims": ["array-like", None],
"max_patch_dims": ["array-like", None],
Expand Down Expand Up @@ -2669,7 +2669,7 @@ class ExtraObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier)

tree_type = "oblique"

_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeClassifier._parameter_constraints,
"feature_combinations": [
Interval(Real, 1.0, None, closed="left"),
Expand Down Expand Up @@ -3069,7 +3069,7 @@ class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
-0.26552594, -0.00642017, -0.07108117, -0.40726765, -0.40315294])
"""

_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeRegressor._parameter_constraints,
"feature_combinations": [
Interval(Real, 1.0, None, closed="left"),
Expand Down
6 changes: 0 additions & 6 deletions treeple/tree/_oblique_splitter.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,6 @@ cdef class BaseObliqueSplitter(Splitter):
SplitRecord* split,
) except -1 nogil

cdef inline void fisher_yates_shuffle_memview(
self,
intp_t[::1] indices_to_sample,
intp_t grid_size,
uint32_t* random_state
) noexcept nogil

cdef class ObliqueSplitter(BaseObliqueSplitter):
# The splitter searches in the input space for a linear combination of features and a threshold
Expand Down
42 changes: 12 additions & 30 deletions treeple/tree/_oblique_splitter.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ from libcpp.vector cimport vector

from .._lib.sklearn.tree._criterion cimport Criterion
from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform
from ._utils cimport fisher_yates_shuffle


cdef float64_t INFINITY = np.inf
Expand Down Expand Up @@ -46,8 +47,12 @@ cdef class BaseObliqueSplitter(Splitter):
def __setstate__(self, d):
pass

cdef int node_reset(self, intp_t start, intp_t end,
float64_t* weighted_n_node_samples) except -1 nogil:
cdef int node_reset(
self,
intp_t start,
intp_t end,
float64_t* weighted_n_node_samples
) except -1 nogil:
"""Reset splitter on node samples[start:end].
Returns -1 in case of failure to allocate memory (and raise MemoryError)
Expand All @@ -62,17 +67,7 @@ cdef class BaseObliqueSplitter(Splitter):
weighted_n_node_samples : ndarray, dtype=float64_t pointer
The total weight of those samples
"""

self.start = start
self.end = end

self.criterion.init(self.y,
self.sample_weight,
self.weighted_n_samples,
self.samples)
self.criterion.set_sample_pointers(start, end)

weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples
Splitter.node_reset(self, start, end, weighted_n_node_samples)

# Clear all projection vectors
for i in range(self.max_features):
Expand Down Expand Up @@ -102,8 +97,8 @@ cdef class BaseObliqueSplitter(Splitter):
intp_t end,
const intp_t[:] samples,
float32_t[:] feature_values,
vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,)
vector[intp_t]* proj_vec_indices # indices of the features (max_features,)
vector[float32_t]* proj_vec_weights, # weights of the vector (n_non_zeros,)
vector[intp_t]* proj_vec_indices # indices of the features (n_non_zeros,)
) noexcept nogil:
"""Compute the feature values for the samples[start:end] range.
Expand All @@ -126,19 +121,6 @@ cdef class BaseObliqueSplitter(Splitter):
feature_values[idx] = 0.0
feature_values[idx] += self.X[samples[idx], col_idx] * col_weight

cdef inline void fisher_yates_shuffle_memview(
self,
intp_t[::1] indices_to_sample,
intp_t grid_size,
uint32_t* random_state,
) noexcept nogil:
cdef intp_t i, j

# XXX: should this be `i` or `i+1`? for valid Fisher-Yates?
for i in range(0, grid_size - 1):
j = rand_int(i, grid_size, random_state)
indices_to_sample[j], indices_to_sample[i] = \
indices_to_sample[i], indices_to_sample[j]

cdef class ObliqueSplitter(BaseObliqueSplitter):
def __cinit__(
Expand Down Expand Up @@ -257,7 +239,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter):
cdef intp_t grid_size = self.max_features * self.n_features

# shuffle indices over the 2D grid to sample using Fisher-Yates
self.fisher_yates_shuffle_memview(indices_to_sample, grid_size, random_state)
fisher_yates_shuffle(indices_to_sample, grid_size, random_state)

# sample 'n_non_zeros' in a mtry X n_features projection matrix
# which consists of +/- 1's chosen at a 1/2s rate
Expand Down Expand Up @@ -309,7 +291,7 @@ cdef class BestObliqueSplitter(ObliqueSplitter):
cdef intp_t end = self.end

# pointer array to store feature values to split on
cdef float32_t[::1] feature_values = self.feature_values
cdef float32_t[::1] feature_values = self.feature_values
cdef intp_t max_features = self.max_features
cdef intp_t min_samples_leaf = self.min_samples_leaf

Expand Down
38 changes: 33 additions & 5 deletions treeple/tree/_utils.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from libcpp.vector cimport vector

import numpy as np

cimport numpy as cnp
Expand All @@ -7,15 +9,41 @@ cnp.import_array()
from .._lib.sklearn.tree._splitter cimport SplitRecord
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t

ctypedef fused vector_or_memview:
vector[intp_t]
intp_t[::1]
intp_t[:]


cdef inline void fisher_yates_shuffle(
vector_or_memview indices_to_sample,
intp_t grid_size,
uint32_t* random_state,
) noexcept nogil

cdef int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil

cdef int rand_weighted_binary(
float64_t p0,
uint32_t* random_state
) noexcept nogil

cpdef unravel_index(
intp_t index, cnp.ndarray[intp_t, ndim=1] shape
intp_t index,
cnp.ndarray[intp_t, ndim=1] shape
)

cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape)
cpdef ravel_multi_index(
intp_t[:] coords,
const intp_t[:] shape
)

cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil
cdef void unravel_index_cython(
intp_t index,
const intp_t[:] shape,
vector_or_memview coords
) noexcept nogil

cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil
cdef intp_t ravel_multi_index_cython(
vector_or_memview coords,
const intp_t[:] shape
) noexcept nogil
65 changes: 50 additions & 15 deletions treeple/tree/_utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,40 @@ cimport numpy as cnp

cnp.import_array()

from .._lib.sklearn.tree._utils cimport rand_uniform
from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform


cdef inline int rand_weighted_binary(float64_t p0, uint32_t* random_state) noexcept nogil:
cdef inline void fisher_yates_shuffle(
vector_or_memview indices_to_sample,
intp_t grid_size,
uint32_t* random_state,
) noexcept nogil:
"""Shuffle the indices in place using the Fisher-Yates algorithm.
Parameters
----------
indices_to_sample : A C++ vector or 1D memoryview
The indices to shuffle.
grid_size : intp_t
The size of the grid to shuffle. This is explicitly passed in
to support the templated `vector_or_memview` type, which allows
for both C++ vectors and Cython memoryviews. Getting the length
of the two types uses different APIs.
random_state : uint32_t*
The random state.
"""
cdef intp_t i, j

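# XXX: should the lower bound be `i` or `i+1` for a valid Fisher-Yates shuffle?
# NOTE(review): `i` is correct provided `rand_int(low, high, ...)` samples from
# the half-open range [low, high) - confirm; `i+1` would instead give Sattolo's
# algorithm, which only produces cyclic permutations.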
for i in range(0, grid_size - 1):
j = rand_int(i, grid_size, random_state)
indices_to_sample[j], indices_to_sample[i] = \
indices_to_sample[i], indices_to_sample[j]


cdef inline int rand_weighted_binary(
float64_t p0,
uint32_t* random_state
) noexcept nogil:
"""Sample from integers 0 and 1 with different probabilities.
Parameters
Expand Down Expand Up @@ -54,7 +84,9 @@ cpdef unravel_index(
index = np.intp(index)
shape = np.array(shape)
coords = np.empty(shape.shape[0], dtype=np.intp)
unravel_index_cython(index, shape, coords)
cdef const intp_t[:] shape_memview = shape
cdef intp_t[:] coords_memview = coords
unravel_index_cython(index, shape_memview, coords_memview)
return coords


Expand Down Expand Up @@ -83,7 +115,11 @@ cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape):
return ravel_multi_index_cython(coords, shape)


cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil:
cdef inline void unravel_index_cython(
intp_t index,
const intp_t[:] shape,
vector_or_memview coords
) noexcept nogil:
"""Converts a flat index into a tuple of coordinate arrays.
Parameters
Expand All @@ -92,13 +128,9 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co
The flat index to be converted.
shape : numpy.ndarray[intp_t, ndim=1]
The shape of the array into which the flat index should be converted.
coords : numpy.ndarray[intp_t, ndim=1]
A preinitialized memoryview array of coordinate arrays to be converted.
Returns
-------
numpy.ndarray[intp_t, ndim=1]
An array of coordinate arrays, with each coordinate array having the same shape as the input `shape`.
coords : intp_t[:] or vector[intp_t]
A preinitialized array of coordinates to store the result of the
unraveled `index`.
"""
cdef intp_t ndim = shape.shape[0]
cdef intp_t j, size
Expand All @@ -109,13 +141,16 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co
index //= size


cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil:
"""Converts a tuple of coordinate arrays into a flat index.
cdef inline intp_t ravel_multi_index_cython(
vector_or_memview coords,
const intp_t[:] shape
) noexcept nogil:
"""Converts a tuple of coordinate arrays into a flat index in the vectorized dimension.
Parameters
----------
coords : numpy.ndarray[intp_t, ndim=1]
An array of coordinate arrays to be converted.
coords : intp_t[:] or vector[intp_t]
An array of coordinates to be converted and vectorized into a single flat index.
shape : numpy.ndarray[intp_t, ndim=1]
The shape of the array into which the coordinates should be converted.
Expand Down
10 changes: 1 addition & 9 deletions treeple/tree/manifold/_morf_splitter.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,6 @@ cdef class PatchSplitter(BestObliqueSplitter):
# an input data vector. The input data is vectorized, so `data_height` and
# `data_width` are used to determine the vectorized indices corresponding to
# (x,y) coordinates in the original un-vectorized data.

cdef public intp_t max_patch_height # Maximum height of the patch to sample
cdef public intp_t max_patch_width # Maximum width of the patch to sample
cdef public intp_t min_patch_height # Minimum height of the patch to sample
cdef public intp_t min_patch_width # Minimum width of the patch to sample
cdef public intp_t data_height # Height of the input data
cdef public intp_t data_width # Width of the input data

cdef public intp_t ndim # The number of dimensions of the input data

cdef const intp_t[:] data_dims # The dimensions of the input data
Expand All @@ -56,7 +48,7 @@ cdef class PatchSplitter(BestObliqueSplitter):

cdef intp_t[::1] _index_data_buffer
cdef intp_t[::1] _index_patch_buffer
cdef intp_t[:] patch_dims_buff # A buffer to store the dimensions of the sampled patch
cdef intp_t[:] patch_sampled_size # A buffer to store the dimensions of the sampled patch
cdef intp_t[:] unraveled_patch_point # A buffer to store the unraveled patch point

# All oblique splitters (i.e. non-axis aligned splitters) require a
Expand Down
Loading

0 comments on commit 8c4a7f6

Please sign in to comment.