Skip to content

Commit

Permalink
MAINT Clean up Cython files (#321)
Browse files Browse the repository at this point in the history
* Clean up Cython files in oblique and morf splitter
* Migrate `self._validate_data` to `validate_data` in scikit-learn developer API
* Update spin to v0.12+
* Update C standard from c99 to c11 (the C++ standard stays at c++14)

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
  • Loading branch information
adam2392 authored Sep 9, 2024
1 parent ea67d06 commit 7e9dc22
Show file tree
Hide file tree
Showing 19 changed files with 224 additions and 109 deletions.
5 changes: 5 additions & 0 deletions .spin/cmds.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import click
from spin import util
from spin.cmds import meson
from spin.cmds.meson import build_dir_option


def get_git_revision_hash(submodule) -> str:
Expand Down Expand Up @@ -145,14 +146,18 @@ def setup_submodule(forcesubmodule=False):
@click.option(
"--forcesubmodule", is_flag=True, help="Force submodule pull.", envvar="FORCE_SUBMODULE"
)
@build_dir_option
@click.pass_context
def build(
ctx,
*,
meson_args,
jobs=None,
clean=False,
verbose=False,
gcov=False,
quiet=False,
build_dir=None,
forcesubmodule=False,
):
"""Build treeple using submodules.
Expand Down
2 changes: 1 addition & 1 deletion build_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ click
rich-click
doit
pydevtool
spin
spin>=0.12
build
2 changes: 1 addition & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ project(
license: 'PolyForm Noncommercial 1.0.0',
meson_version: '>= 1.1.0',
default_options: [
'c_std=c99',
'c_std=c11',
'cpp_std=c++14',
],
)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ build = [
'twine',
'meson',
'meson-python',
'spin',
'spin>=0.12',
'doit',
'scikit-learn>=1.5.0',
'Cython>=3.0.10',
Expand Down
4 changes: 3 additions & 1 deletion treeple/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")


try:
# This variable is injected in the __builtins__ by the build
# process. It is used to enable importing subpackages of sklearn when
Expand Down Expand Up @@ -64,7 +65,8 @@
msg = """Error importing treeple: you cannot import treeple while
being in treeple source directory; please exit the treeple source
tree first and relaunch your Python interpreter."""
raise ImportError(msg) from e
raise Exception(e)
# raise ImportError(msg) from e

__all__ = [
"_lib",
Expand Down
19 changes: 19 additions & 0 deletions treeple/_lib/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,22 @@ foreach ext: extensions
subdir: 'treeple/_lib/sklearn/utils/',
)
endforeach


# python_sources = [
# '__init__.py',
# ]

# py.install_sources(
# python_sources,
# subdir: 'treeple/_lib' # Folder relative to site-packages to install to
# )

# tempita = files('./sklearn/_build_utils/tempita.py')

# # Copy all the .py files to the install dir, rather than using
# # py.install_sources and needing to list them explicitly one by one
# # install_subdir('sklearn', install_dir: py.get_install_dir())
# install_subdir('sklearn', install_dir: join_paths(py.get_install_dir(), 'treeple/_lib'))

# subdir('sklearn')
2 changes: 1 addition & 1 deletion treeple/_lib/sklearn_fork
Submodule sklearn_fork updated 216 files
8 changes: 6 additions & 2 deletions treeple/ensemble/_honest_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,8 +720,12 @@ def oob_samples_(self):
oob_samples.append(_oob_samples)
return oob_samples

def _more_tags(self):
return {"multioutput": False}
def __sklearn_tags__(self):
    """Return the parent's estimator tags with two capabilities switched off.

    Returns
    -------
    tags : object
        The tags object produced by ``super().__sklearn_tags__()``, with
        ``classifier_tags.multi_output`` and ``input_tags.allow_nan`` both
        set to ``False``.
    """
    estimator_tags = super().__sklearn_tags__()
    estimator_tags.classifier_tags.multi_output = False
    # XXX: NaN inputs should be supportable in HRF; advertised as
    # unsupported for now.
    estimator_tags.input_tags.allow_nan = False
    return estimator_tags

def decision_path(self, X):
"""
Expand Down
12 changes: 8 additions & 4 deletions treeple/ensemble/_unsupervised_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@
)
from sklearn.metrics import calinski_harabasz_score
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import _check_sample_weight, check_is_fitted, check_random_state
from sklearn.utils.validation import (
_check_sample_weight,
check_is_fitted,
check_random_state,
validate_data,
)

from .._lib.sklearn.ensemble._forest import BaseForest
from .._lib.sklearn.tree._tree import DTYPE
Expand Down Expand Up @@ -85,10 +90,9 @@ def fit(self, X, y=None, sample_weight=None):
self : object
Returns the instance itself.
"""
self._validate_params()

# Validate or convert input data
X = self._validate_data(
X = validate_data(
self,
X,
dtype=DTYPE, # accept_sparse="csc",
)
Expand Down
1 change: 1 addition & 0 deletions treeple/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ scikit_learn_cython_args = [
'-X language_level=3', '-X boundscheck=' + boundscheck, '-X wraparound=False',
'-X initializedcheck=False', '-X nonecheck=False', '-X cdivision=True',
'-X profile=False',
'-X embedsignature=True',
# Needed for cython imports across subpackages, e.g. cluster pyx that
# cimports metrics pxd
'--include-dir', meson.global_build_root(),
Expand Down
15 changes: 11 additions & 4 deletions treeple/neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
from sklearn.base import BaseEstimator, MetaEstimatorMixin
from sklearn.exceptions import NotFittedError
from sklearn.neighbors import NearestNeighbors
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import check_is_fitted, validate_data

from treeple.tree import DecisionTreeClassifier
from treeple.tree._neighbors import _compute_distance_matrix, compute_forest_similarity_matrix


Expand All @@ -31,13 +32,19 @@ class NearestNeighborsMetaEstimator(BaseEstimator, MetaEstimatorMixin):
The number of parallel jobs to run for neighbors, by default None.
"""

def __init__(self, estimator, n_neighbors=5, radius=1.0, algorithm="auto", n_jobs=None):
def __init__(self, estimator=None, n_neighbors=5, radius=1.0, algorithm="auto", n_jobs=None):
self.estimator = estimator
self.n_neighbors = n_neighbors
self.algorithm = algorithm
self.radius = radius
self.n_jobs = n_jobs

def get_estimator(self):
    """Return the estimator instance that ``fit`` should work with.

    Returns
    -------
    estimator : object
        A copy (made with ``copy``) of ``self.estimator`` when one was
        supplied to the constructor; otherwise a default
        ``DecisionTreeClassifier(random_state=0)``.
    """
    # Fall back to the default tree only when no estimator was supplied.
    # Testing ``is not None`` here would discard the user's estimator and
    # hand ``copy(None)`` to ``fit`` in the default case.
    if self.estimator is None:
        return DecisionTreeClassifier(random_state=0)
    return copy(self.estimator)

def fit(self, X, y=None):
"""Fit the nearest neighbors estimator from the training dataset.
Expand All @@ -56,9 +63,9 @@ def fit(self, X, y=None):
self : object
Fitted estimator.
"""
X, y = self._validate_data(X, y, accept_sparse="csc")
X, y = validate_data(self, X, y, accept_sparse="csc")

self.estimator_ = copy(self.estimator)
self.estimator_ = self.get_estimator()
try:
check_is_fitted(self.estimator_)
except NotFittedError:
Expand Down
82 changes: 64 additions & 18 deletions treeple/tree/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sklearn.cluster import AgglomerativeClustering
from sklearn.utils import check_random_state
from sklearn.utils._param_validation import Interval
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import check_is_fitted, validate_data

from .._lib.sklearn.tree import (
BaseDecisionTree,
Expand Down Expand Up @@ -216,7 +216,7 @@ def fit(self, X, y=None, sample_weight=None, check_input=True):
if check_input:
# TODO: allow X to be sparse
check_X_params = dict(dtype=DTYPE) # , accept_sparse="csc"
X = self._validate_data(X, validate_separately=(check_X_params))
X = validate_data(self, X, validate_separately=(check_X_params))
if issparse(X):
X.sort_indices()

Expand Down Expand Up @@ -378,6 +378,13 @@ def _assign_labels(self, affinity_matrix):
predict_labels = cluster.fit_predict(affinity_matrix)
return predict_labels

def __sklearn_tags__(self):
    """Return the parent's estimator tags with NaN input support turned off.

    Returns
    -------
    tags : object
        The tags object produced by ``super().__sklearn_tags__()`` with
        ``input_tags.allow_nan`` set to ``False``.
    """
    estimator_tags = super().__sklearn_tags__()
    # XXX: NaNs should be supportable in SPORF by just using RF-like splits
    # on missing values. However, for MORF it is not supported.
    estimator_tags.input_tags.allow_nan = False
    return estimator_tags


class UnsupervisedObliqueDecisionTree(UnsupervisedDecisionTree):
"""Unsupervised oblique decision tree.
Expand Down Expand Up @@ -577,6 +584,13 @@ def _build_tree(
builder.build(self.tree_, X, sample_weight)
return self

def __sklearn_tags__(self):
    """Return the inherited estimator tags, flagged as not accepting NaN input.

    Returns
    -------
    tags : object
        Result of ``super().__sklearn_tags__()`` after setting
        ``input_tags.allow_nan`` to ``False``.
    """
    inherited_tags = super().__sklearn_tags__()
    # XXX: NaNs should be supportable in SPORF by just using RF-like splits
    # on missing values. However, for MORF it is not supported.
    inherited_tags.input_tags.allow_nan = False
    return inherited_tags


class ObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):
"""An oblique decision tree classifier.
Expand Down Expand Up @@ -820,7 +834,7 @@ class ObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):

tree_type = "oblique"

_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeClassifier._parameter_constraints,
"feature_combinations": [
Interval(Real, 1.0, None, closed="left"),
Expand Down Expand Up @@ -1070,6 +1084,13 @@ def _update_tree(self, X, y, sample_weight):
self._prune_tree()
return self

def __sklearn_tags__(self):
    """Return the parent's estimator tags with NaN input support turned off.

    Returns
    -------
    tags : object
        The tags object produced by ``super().__sklearn_tags__()`` with
        ``input_tags.allow_nan`` set to ``False``.
    """
    estimator_tags = super().__sklearn_tags__()
    # XXX: NaNs should be supportable in SPORF by just using RF-like splits
    # on missing values. However, for MORF it is not supported.
    estimator_tags.input_tags.allow_nan = False
    return estimator_tags


class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
"""An oblique decision tree Regressor.
Expand Down Expand Up @@ -1283,7 +1304,7 @@ class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):

tree_type = "oblique"

_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeRegressor._parameter_constraints,
"feature_combinations": [
Interval(Real, 1.0, None, closed="left"),
Expand Down Expand Up @@ -1450,6 +1471,13 @@ def _build_tree(
builder.build(self.tree_, X, y, sample_weight, None)
return self

def __sklearn_tags__(self):
    """Return the inherited estimator tags, flagged as not accepting NaN input.

    Returns
    -------
    tags : object
        Result of ``super().__sklearn_tags__()`` after setting
        ``input_tags.allow_nan`` to ``False``.
    """
    inherited_tags = super().__sklearn_tags__()
    # XXX: NaNs should be supportable in SPORF by just using RF-like splits
    # on missing values. However, for MORF it is not supported.
    inherited_tags.input_tags.allow_nan = False
    return inherited_tags


class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):
"""A oblique decision tree classifier that operates over patches of data.
Expand Down Expand Up @@ -1684,7 +1712,7 @@ class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier)
"""

tree_type = "oblique"
_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeClassifier._parameter_constraints,
"min_patch_dims": ["array-like", None],
"max_patch_dims": ["array-like", None],
Expand Down Expand Up @@ -1798,8 +1826,8 @@ def _build_tree(
self.feature_combinations_ = 1

if self.feature_weight is not None:
self.feature_weight = self._validate_data(
self.feature_weight, ensure_2d=True, dtype=DTYPE
self.feature_weight = validate_data(
self, self.feature_weight, ensure_2d=True, dtype=DTYPE
)
if self.feature_weight.shape != X.shape:
raise ValueError(
Expand Down Expand Up @@ -1927,11 +1955,13 @@ def _build_tree(

return self

def _more_tags(self):
def __sklearn_tags__(self):
# XXX: nans should be supportable in SPORF by just using RF-like splits on missing values
# However, for MORF it is not supported
allow_nan = False
return {"multilabel": True, "allow_nan": allow_nan}
tags = super().__sklearn_tags__()
tags.classifier_tags.multi_label = True
tags.input_tags.allow_nan = False
return tags

@property
def _inheritable_fitted_attribute(self):
Expand Down Expand Up @@ -2166,7 +2196,7 @@ class PatchObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
"""

tree_type = "oblique"
_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeRegressor._parameter_constraints,
"min_patch_dims": ["array-like", None],
"max_patch_dims": ["array-like", None],
Expand Down Expand Up @@ -2277,8 +2307,8 @@ def _build_tree(
self.feature_combinations_ = 1

if self.feature_weight is not None:
self.feature_weight = self._validate_data(
self.feature_weight, ensure_2d=True, dtype=DTYPE
self.feature_weight = validate_data(
self, self.feature_weight, ensure_2d=True, dtype=DTYPE
)
if self.feature_weight.shape != X.shape:
raise ValueError(
Expand Down Expand Up @@ -2407,11 +2437,13 @@ def _build_tree(

return self

def _more_tags(self):
def __sklearn_tags__(self):
# XXX: nans should be supportable in SPORF by just using RF-like splits on missing values
# However, for MORF it is not supported
allow_nan = False
return {"multilabel": True, "allow_nan": allow_nan}
tags = super().__sklearn_tags__()
tags.regressor_tags.multi_label = True
tags.input_tags.allow_nan = False
return tags


class ExtraObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):
Expand Down Expand Up @@ -2669,7 +2701,7 @@ class ExtraObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier)

tree_type = "oblique"

_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeClassifier._parameter_constraints,
"feature_combinations": [
Interval(Real, 1.0, None, closed="left"),
Expand Down Expand Up @@ -2846,6 +2878,13 @@ def _inheritable_fitted_attribute(self):
"feature_combinations_",
]

def __sklearn_tags__(self):
    """Return the parent's estimator tags with NaN input support turned off.

    Returns
    -------
    tags : object
        The tags object produced by ``super().__sklearn_tags__()`` with
        ``input_tags.allow_nan`` set to ``False``.
    """
    estimator_tags = super().__sklearn_tags__()
    # XXX: NaNs should be supportable in SPORF by just using RF-like splits
    # on missing values. However, for MORF it is not supported.
    estimator_tags.input_tags.allow_nan = False
    return estimator_tags


class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
"""An oblique decision tree Regressor.
Expand Down Expand Up @@ -3069,7 +3108,7 @@ class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
-0.26552594, -0.00642017, -0.07108117, -0.40726765, -0.40315294])
"""

_parameter_constraints = {
_parameter_constraints: dict = {
**DecisionTreeRegressor._parameter_constraints,
"feature_combinations": [
Interval(Real, 1.0, None, closed="left"),
Expand Down Expand Up @@ -3237,3 +3276,10 @@ def _build_tree(
builder.build(self.tree_, X, y, sample_weight)

return self

def __sklearn_tags__(self):
    """Return the inherited estimator tags, flagged as not accepting NaN input.

    Returns
    -------
    tags : object
        Result of ``super().__sklearn_tags__()`` after setting
        ``input_tags.allow_nan`` to ``False``.
    """
    inherited_tags = super().__sklearn_tags__()
    # XXX: NaNs should be supportable in SPORF by just using RF-like splits
    # on missing values. However, for MORF it is not supported.
    inherited_tags.input_tags.allow_nan = False
    return inherited_tags
Loading

0 comments on commit 7e9dc22

Please sign in to comment.