From 7a062332a22ac95e25907ce3b612e040c01c2cc6 Mon Sep 17 00:00:00 2001 From: j-ittner Date: Mon, 15 May 2023 20:39:12 +0200 Subject: [PATCH 01/22] BUILD: change flake8 repo to GitHub --- .pre-commit-config.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7b19cd7f1..dc0cd9cb5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,14 +10,16 @@ repos: - id: black language_version: python3 - - repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.0 + - repo: https://github.com/pycqa/flake8 + rev: 5.0.4 hooks: - id: flake8 name: flake8 entry: flake8 --config tox.ini - language: python_venv - additional_dependencies: [ flake8-comprehensions, flake8-import-order ] + language: python + language_version: python39 + additional_dependencies: + - flake8-comprehensions ~= 3.10 types: [ python ] - repo: https://github.com/pre-commit/pre-commit-hooks From 011d472eb781c8749e28589cca23779e1b65eb0c Mon Sep 17 00:00:00 2001 From: j-ittner Date: Mon, 15 May 2023 21:28:51 +0200 Subject: [PATCH 02/22] BUILD: bump flake8 to ~=5.0 --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9b858e013..2f52a485a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -76,7 +76,7 @@ stages: versionSpec: '3.7.*' displayName: 'use Python 3.7' - script: | - python -m pip install flake8==3.9.0 flake8-comprehensions flake8-import-order + python -m pip install flake8~=5.0 flake8-comprehensions~=3.10 python -m flake8 --config tox.ini -v . displayName: 'Run flake8' From b481df057226ecbc45f108e93863c6ccb4ff3d68 Mon Sep 17 00:00:00 2001 From: j-ittner Date: Mon, 15 May 2023 21:40:29 +0200 Subject: [PATCH 03/22] BUILD: disallow numpy >=1.24, which is incompatible with shap <=0.40 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ab020ed43..03469d806 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ requires = [ # direct requirements of gamma-facet "gamma-pytools ~=1.2,>=1.2.1", "matplotlib ~=3.0,<3.6a", - "numpy >=1.17,<2a", + "numpy >=1.17,<1.24a", "packaging >=20", "pandas >=0.24,<2a", "scipy ~=1.2,<1.9a", @@ -94,7 +94,7 @@ typing-extensions = "<4.2" # direct requirements of gamma-facet gamma-pytools = "~=1.2,>=1.2.1" matplotlib = "~=3.5.2" -numpy = ">=1.22,<2a" +numpy = ">=1.22,<1.24a" packaging = ">=20.9" pandas = "~=1.4" python = "~=3.8" From 1f0881ea096d546df093f927b19fb2ad5b1e0935 Mon Sep 17 00:00:00 2001 From: j-ittner Date: Wed, 17 May 2023 10:49:40 +0200 Subject: [PATCH 04/22] BUILD: update version to 2.0.1 --- src/facet/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/facet/__init__.py b/src/facet/__init__.py index e59d00761..1843b247b 100644 --- a/src/facet/__init__.py +++ b/src/facet/__init__.py @@ -6,7 +6,7 @@ """ -__version__ = "2.0.0" +__version__ = "2.0.1" __logo__ = ( r""" From 1bd64edaffab95f200d813335114177e21aba9b8 Mon Sep 17 00:00:00 2001 From: Jan Ittner Date: Wed, 24 May 2023 07:17:05 +0200 Subject: [PATCH 05/22] BUILD: use mamba for faster conda builds (#365) --- azure-pipelines.yml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4fe6aab88..4ee65fe20 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -243,7 +243,15 @@ stages: - script: dir $(Build.SourcesDirectory) - script: | - conda install -y -c anaconda conda-build~=3.21 conda-verify~=3.4 toml~=0.10 flit~=3.6 packaging~=20.9 + # install micromamba + curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba + export MAMBA_ROOT_PREFIX=~/micromamba + eval "$(./bin/micromamba shell hook -s posix)" + + # create and activate a build environment, then install the tools we need + micromamba create -n build + micromamba activate build + micromamba install -y -c conda-forge boa~=0.14 toml~=0.10 flit~=3.6 packaging~=20.9 displayName: 'Install conda-build, flit, toml' condition: eq(variables['BUILD_SYSTEM'], 'conda') @@ -261,7 +269,11 @@ stages: targetType: 'inline' script: | set -eux - if [ "$BUILD_SYSTEM" = "conda" ] ; then eval "$(conda shell.bash hook)" ; fi + if [ "$BUILD_SYSTEM" = "conda" ] ; then + export MAMBA_ROOT_PREFIX=~/micromamba + eval "$(./bin/micromamba shell hook -s posix)" + micromamba activate build + fi export RUN_PACKAGE_VERSION_TEST=$(project_name) cd $(Build.SourcesDirectory)/$(project_root) @@ -348,7 +360,11 @@ stages: targetType: 'inline' script: | set -eux - if [ "$BUILD_SYSTEM" = "conda" ] ; then eval "$(conda shell.bash hook)" ; fi + if [ "$BUILD_SYSTEM" = "conda" ] ; then + export MAMBA_ROOT_PREFIX=~/micromamba + eval "$(./bin/micromamba shell hook -s posix)" + micromamba activate build + fi export RUN_PACKAGE_VERSION_TEST=$(project_name) cd $(Build.SourcesDirectory)/$(project_root) From ea41a47c326ecdcaf591f4a51f3dc83f98f7cbec Mon Sep 17 00:00:00 2001 From: j-ittner Date: Wed, 24 May 2023 18:26:27 +0200 Subject: [PATCH 06/22] BUILD: simplify development environment --- environment.yml | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/environment.yml b/environment.yml index 2ffe73f95..501f6843a 100644 --- a/environment.yml +++ b/environment.yml @@ -4,12 +4,11 @@ channels: - bcg_gamma dependencies: # run - - boruta_py ~= 0.3 - gamma-pytools ~= 2.1 - joblib ~= 1.2 - lightgbm ~= 3.3 - matplotlib ~= 3.7 - - numpy ~= 1.23 + - numpy ~= 1.24 - pandas ~= 2.0 - python ~= 3.9 - scikit-learn ~= 1.2.0 @@ -17,22 +16,13 @@ dependencies: - shap ~= 0.41 - sklearndf ~= 2.2 - typing_extensions ~= 4.3 - # build/test - - conda-build ~= 3.23.3 - - conda-verify ~= 3.1.1 - - docutils ~= 0.17.1 - - flit ~= 3.8.0 - - jinja2 ~= 2.11.3 - - markupsafe ~= 2.0.1 # markupsafe 2.1 breaks support for jinja2 - - m2r ~= 0.3.1 - - pluggy ~= 0.13.1 - - pre-commit ~= 2.21.0 - - pytest ~= 7.2.1 - - pytest-cov ~= 2.12.1 - - pyyaml ~= 5.4.1 - - toml ~= 0.10.2 - - tox ~= 3.27.1 - - yaml ~= 0.2.5 + # additional packages for notebooks etc. + - pip ~= 23.0 + - pip: + - arfs ~= 1.1 + # test + - pytest ~= 7.2.1 + - pytest-cov ~= 2.12.1 # sphinx - nbsphinx ~= 0.8.9 - sphinx ~= 4.5.0 From b3893932d595c4915cf87a8f30c6d7b0fdb856db Mon Sep 17 00:00:00 2001 From: j-ittner Date: Thu, 25 May 2023 10:34:49 +0200 Subject: [PATCH 07/22] FIX: install micromamba for nightly builds --- azure-pipelines.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4ee65fe20..a0a123338 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -342,7 +342,15 @@ stages: - script: dir $(Build.SourcesDirectory) - script: | - conda install -y -c anaconda conda-build~=3.21 conda-verify~=3.4 toml~=0.10 flit~=3.6 packaging~=20.9 + # install micromamba + curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba + export MAMBA_ROOT_PREFIX=~/micromamba + eval "$(./bin/micromamba shell hook -s posix)" + + # create and activate a build environment, then install the tools we need + micromamba create -n build + micromamba activate build + micromamba install -y -c conda-forge boa~=0.14 toml~=0.10 flit~=3.6 packaging~=20.9 displayName: 'Install conda-build, flit, toml' condition: eq(variables['BUILD_SYSTEM'], 'conda') From 6749064167fd8b3b8fbfa115fe41a0c8d5732e7f Mon Sep 17 00:00:00 2001 From: Jan Ittner Date: Wed, 5 Jul 2023 06:28:00 +0200 Subject: [PATCH 08/22] FIX: refer to arg model (not pipeline) in LearnerInspector exceptions (#367) --- src/facet/inspection/_learner_inspector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/facet/inspection/_learner_inspector.py b/src/facet/inspection/_learner_inspector.py index 28412f231..500c9a345 100644 --- a/src/facet/inspection/_learner_inspector.py +++ b/src/facet/inspection/_learner_inspector.py @@ -102,7 +102,7 @@ def __init__( """ if not model.is_fitted: - raise ValueError("arg pipeline must be fitted") + raise ValueError("arg model must be fitted") final_estimator: T_SupervisedLearnerDF = model.final_estimator if is_classifier(final_estimator): @@ -119,7 +119,7 @@ def __init__( ) elif not is_regressor(final_estimator): raise TypeError( - "learner in arg pipeline must be a classifier or a regressor," + "learner in arg model must be a classifier or a regressor," f"but is a {type(final_estimator).__name__}" ) From b9f6b73e7eca0d568e8ceb7c376f851cae1dc073 Mon Sep 17 00:00:00 2001 From: Jan Ittner Date: Wed, 5 Jul 2023 13:37:13 +0200 Subject: [PATCH 09/22] API: support simple (non-pipeline) learners in LearnerInspector (#368) * API: support simple (non-pipeline) learners in LearnerInspector * API: raise a TypeError if arg model is an unexpected type * TEST: test LearnerInspector with a simple classifier * FIX: correctly handle simple learners throughout LearnerInspector * DOC: improve parameter documentation of LearnerInspector --- RELEASE_NOTES.rst | 8 ++++ src/facet/inspection/_learner_inspector.py | 51 +++++++++++++++++----- test/test/facet/test_inspection.py | 9 ++-- 3 files changed, 51 insertions(+), 17 deletions(-) diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 4944dccb8..4798cd01f 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -18,6 +18,14 @@ visualizations. FACET 2.0 requires :mod:`pytools` |nbsp| 2.0 and :mod:`sklearndf` |nbsp| 2.2, and is now fully type-checked by |mypy|. +2.0.1 +~~~~~ + +- API: class :class:`.LearnerInspector` now supports inspecting individual regressors + and classifiers; it is no longer necessary to wrap them into a + :class:`.RegressorPipelineDF` or :class:`.ClassifierPipelineDF` instance with empty + preprocessing + 2.0.0 ~~~~~ diff --git a/src/facet/inspection/_learner_inspector.py b/src/facet/inspection/_learner_inspector.py index 500c9a345..7fb496fad 100644 --- a/src/facet/inspection/_learner_inspector.py +++ b/src/facet/inspection/_learner_inspector.py @@ -76,6 +76,12 @@ class LearnerInspector( #: The factory instance used to create the explainer for the learner. explainer_factory: ExplainerFactory[NativeSupervisedLearner] + #: The learner being inspected. + #: + #: If the model is a pipeline, this is the final estimator in the pipeline; + #: otherwise, it is the model itself. + learner: SupervisedLearnerDF + # defined in superclass, repeated here for Sphinx: model: T_SupervisedLearnerDF shap_interaction: bool @@ -86,7 +92,7 @@ class LearnerInspector( def __init__( self, - model: SupervisedLearnerPipelineDF[T_SupervisedLearnerDF], + model: T_SupervisedLearnerDF, *, explainer_factory: Optional[ExplainerFactory[NativeSupervisedLearner]] = None, shap_interaction: bool = True, @@ -96,7 +102,11 @@ def __init__( verbose: Optional[int] = None, ) -> None: """ - :param model: the learner pipeline to inspect + :param model: the learner or learner pipeline to inspect (typically, one of + a :class:`~sklearndf.pipeline.ClassifierPipelineDF`, + :class:`~sklearndf.pipeline.RegressorPipelineDF`, + :class:`~sklearndf.classification.ClassifierDF`, or + :class:`~sklearndf.regression.RegressorDF`) :param explainer_factory: optional function that creates a shap Explainer (default: ``TreeExplainerFactory``) """ @@ -104,10 +114,22 @@ def __init__( if not model.is_fitted: raise ValueError("arg model must be fitted") - final_estimator: T_SupervisedLearnerDF = model.final_estimator - if is_classifier(final_estimator): + learner: SupervisedLearnerDF + + if isinstance(model, SupervisedLearnerPipelineDF): + learner = model.final_estimator + elif isinstance(model, SupervisedLearnerDF): + learner = model + else: + raise TypeError( + "arg model must be a SupervisedLearnerPipelineDF or a " + f"SupervisedLearnerDF, but is a {type(model).__name__}" + ) + self.learner = learner + + if is_classifier(learner): try: - n_outputs = final_estimator.n_outputs_ + n_outputs = learner.n_outputs_ except AttributeError: pass else: @@ -115,12 +137,12 @@ def __init__( raise ValueError( "only single-target classifiers (binary or multi-class) are " "supported, but the given classifier has been fitted on " - f"multiple targets: {', '.join(final_estimator.output_names_)}" + f"multiple targets: {', '.join(learner.output_names_)}" ) - elif not is_regressor(final_estimator): + elif not is_regressor(learner): raise TypeError( "learner in arg model must be a classifier or a regressor," - f"but is a {type(final_estimator).__name__}" + f"but is a {type(learner).__name__}" ) if explainer_factory: @@ -162,14 +184,19 @@ def feature_names(self) -> List[str]: """[see superclass]""" return cast( List[str], - self.model.final_estimator.feature_names_in_.to_list(), + self.learner.feature_names_in_.to_list(), ) def preprocess_features( self, features: Union[pd.DataFrame, pd.Series] ) -> pd.DataFrame: """[see superclass]""" - return self.model.preprocess(features) + if self.model is self.learner: + # we have a simple learner: no preprocessing needed + return features + else: + # we have a pipeline: preprocess features + return self.model.preprocess(features) @property def shap_calculator(self) -> LearnerShapCalculator[Any]: @@ -178,10 +205,10 @@ def shap_calculator(self) -> LearnerShapCalculator[Any]: if self._shap_calculator is not None: return self._shap_calculator - learner: SupervisedLearnerDF = self.model.final_estimator + learner: SupervisedLearnerDF = self.learner shap_calculator_params: Dict[str, Any] = dict( - model=self.model.final_estimator.native_estimator, + model=self.learner.native_estimator, interaction_values=self.shap_interaction, explainer_factory=self.explainer_factory, n_jobs=self.n_jobs, diff --git a/test/test/facet/test_inspection.py b/test/test/facet/test_inspection.py index 23fab0e45..04c0c6ec6 100644 --- a/test/test/facet/test_inspection.py +++ b/test/test/facet/test_inspection.py @@ -161,7 +161,6 @@ def test_binary_classifier_ranking( ClassifierPipelineDF[RandomForestClassifierDF], GridSearchCV ] ) -> None: - expected_learner_scores = [0.938, 0.936, 0.936, 0.929] ranking = iris_classifier_selector_binary.summary_report() @@ -185,7 +184,6 @@ def test_model_inspection_classifier_binary( iris_sample_binary: Sample, n_jobs: int, ) -> None: - model_inspector = LearnerInspector( model=iris_classifier_binary, shap_interaction=False, @@ -261,7 +259,9 @@ def test_model_inspection_classifier_binary_single_shap_output(n_jobs: int) -> N # noinspection DuplicatedCode def test_model_inspection_classifier_multi_class( - iris_inspector_multi_class: LearnerInspector[RandomForestClassifierDF], + iris_inspector_multi_class: LearnerInspector[ + ClassifierPipelineDF[RandomForestClassifierDF] + ], ) -> None: iris_classifier = iris_inspector_multi_class.model iris_sample = iris_inspector_multi_class.sample_ @@ -382,7 +382,6 @@ def test_model_inspection_classifier_multi_class( def _validate_shap_values_against_predictions( shap_values: pd.DataFrame, model: ClassifierDF, sample: Sample ) -> None: - # calculate the matching predictions, so we can check if the SHAP values add up # correctly predicted_probabilities: pd.DataFrame = model.predict_proba(sample.features) @@ -447,7 +446,7 @@ def test_model_inspection_classifier_interaction( warnings.filterwarnings("ignore", message="You are accessing a training score") model_inspector = LearnerInspector( - model=iris_classifier_binary, + model=iris_classifier_binary.final_estimator, explainer_factory=TreeExplainerFactory( feature_perturbation="tree_path_dependent", uses_background_dataset=True ), From 229d32390a7b9a9fc220ad5ddcdf2897ab973154 Mon Sep 17 00:00:00 2001 From: j-ittner Date: Wed, 5 Jul 2023 15:36:09 +0200 Subject: [PATCH 10/22] BUILD: change version to 2.1rc0 --- src/facet/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/facet/__init__.py b/src/facet/__init__.py index 1843b247b..6fa5211f5 100644 --- a/src/facet/__init__.py +++ b/src/facet/__init__.py @@ -6,7 +6,7 @@ """ -__version__ = "2.0.1" +__version__ = "2.1rc0" __logo__ = ( r""" From b9489d826b47053cfd1ec22ae6d89266707175dd Mon Sep 17 00:00:00 2001 From: j-ittner Date: Wed, 5 Jul 2023 15:43:51 +0200 Subject: [PATCH 11/22] DOC: update release notes --- RELEASE_NOTES.rst | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 4798cd01f..477230c01 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -1,4 +1,4 @@ -Release Notes +\Release Notes ============= .. |mypy| replace:: :external+mypy:doc:`mypy ` @@ -6,6 +6,25 @@ Release Notes .. |nbsp| unicode:: 0xA0 :trim: +FACET 2.1 +--------- + +FACET 2.1 introduces the :class:`.NativeLearnerInspector` for inspecting native +*scikit-learn* models and pipelines. + +We still recommend using *sklearndf* models and learner pipelines and FACET's +:class:`.LearnerSelector` for hyperparameter tuning; however the new +:class:`.NativeLearnerInspector` can be useful for inspecting models that have been +trained using *scikit-learn* directly. + +2.1.0 +~~~~~ + +- API: new :class:`.NativeLearnerInspector` class for inspecting native *scikit-learn* + regressors, classifiers, and pipelines with a regressor or classifier as the final + estimator + + FACET 2.0 --------- From 5fe3110b010fff98c8944e317325553227ed853c Mon Sep 17 00:00:00 2001 From: j-ittner Date: Wed, 5 Jul 2023 15:48:27 +0200 Subject: [PATCH 12/22] REFACTOR: reformat code with newest `black` --- src/facet/explanation/_explanation.py | 1 + src/facet/inspection/shap/_shap.py | 1 - src/facet/inspection/shap/sklearn/_sklearn.py | 2 -- src/facet/selection/_parameters.py | 3 --- src/facet/validation/_validation.py | 2 -- 5 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/facet/explanation/_explanation.py b/src/facet/explanation/_explanation.py index 9da485a88..25116f6cb 100644 --- a/src/facet/explanation/_explanation.py +++ b/src/facet/explanation/_explanation.py @@ -344,6 +344,7 @@ def to_expression(self) -> Expression: # Exact explainer factory # + # noinspection PyPep8Naming class _ExactExplainer( shap.explainers.Exact, # type: ignore diff --git a/src/facet/inspection/shap/_shap.py b/src/facet/inspection/shap/_shap.py index dfd3a4ac5..671821a85 100644 --- a/src/facet/inspection/shap/_shap.py +++ b/src/facet/inspection/shap/_shap.py @@ -290,7 +290,6 @@ def _reset_fit(self) -> None: self.output_names_ = None def _make_explainer(self, features: pd.DataFrame) -> BaseExplainer: - # prepare the background dataset background_dataset: Optional[pd.DataFrame] diff --git a/src/facet/inspection/shap/sklearn/_sklearn.py b/src/facet/inspection/shap/sklearn/_sklearn.py index 80b5aaa02..ea6f1b2fa 100644 --- a/src/facet/inspection/shap/sklearn/_sklearn.py +++ b/src/facet/inspection/shap/sklearn/_sklearn.py @@ -241,7 +241,6 @@ def _convert_shap_tensors_to_list( shap_tensors: Union[npt.NDArray[np.float_], List[npt.NDArray[np.float_]]], n_outputs: int, ) -> List[npt.NDArray[np.float_]]: - if n_outputs == 1 and isinstance(shap_tensors, list) and len(shap_tensors) == 2: # in the binary classification case, we will proceed with SHAP values # for class 0 only, since values for class 1 will just be the same @@ -273,7 +272,6 @@ def _convert_shap_to_df( observation_idx: pd.Index, feature_idx: pd.Index, ) -> List[pd.DataFrame]: - if self.interaction_values: # return a list of data frame [(obs x features) x features], # one for each of the outputs diff --git a/src/facet/selection/_parameters.py b/src/facet/selection/_parameters.py index 778d0e02d..a872aefce 100644 --- a/src/facet/selection/_parameters.py +++ b/src/facet/selection/_parameters.py @@ -185,7 +185,6 @@ def get_parameters(self, prefix: Optional[str] = None) -> ParameterDict: } def _validate_parameter(self, name: str, value: ParameterSet) -> None: - if name not in self._params: raise AttributeError( f"unknown parameter name for " @@ -222,7 +221,6 @@ def __dir__(self) -> Iterable[str]: def __getattr__(self, key: str) -> Any: if not key.startswith("_"): - result: Union[ParameterSpace[Any], ParameterSet, None] result = self._children.get(key, None) @@ -241,7 +239,6 @@ def __iter__(self) -> Iterator[Tuple[List[str], ParameterSet]]: def _iter_parameters( self, path_prefix: List[str] ) -> Iterator[Tuple[List[str], ParameterSet]]: - yield from ( ([*path_prefix, name], value) for name, value in self._values.items() ) diff --git a/src/facet/validation/_validation.py b/src/facet/validation/_validation.py index ffe927e60..4c45fef83 100644 --- a/src/facet/validation/_validation.py +++ b/src/facet/validation/_validation.py @@ -115,7 +115,6 @@ def split( test: npt.NDArray[np.int_] = indices[test_mask] # make sure test is not empty, else sample another train set if len(test) > 0: - yield train, test break @@ -254,7 +253,6 @@ def _select_train_indices( random_state: np.random.RandomState, y: Union[npt.NDArray[Any], pd.Series, pd.DataFrame, None], ) -> npt.NDArray[np.int_]: - mean_block_size = self.mean_block_size if mean_block_size < 1: # if mean block size was set as a percentage, calculate the actual mean From bfa5da709b3977a715520a7db41a6de6134935c0 Mon Sep 17 00:00:00 2001 From: Jan Ittner Date: Mon, 10 Jul 2023 08:49:29 +0200 Subject: [PATCH 13/22] API: add support for inspecting native scikit-learn learners and learner pipelines (#369) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * API: preserve row order in ShapCalculator output * TEST: suppress numba debug messages * API: add class NativeLearnerInspector for native scikit-learn learners * REFACTOR: pull learner inspector initializer up to base class * REFACTOR: remove obsolete LearnerInspector.shap_calculator() * REFACTOR: remove duplicated shap_calculator method (#370) --------- Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com> --- src/facet/inspection/_learner_inspector.py | 346 +++++++++++++++++---- src/facet/inspection/shap/_shap.py | 2 +- test/test/conftest.py | 3 +- test/test/facet/test_inspection.py | 101 +++++- 4 files changed, 384 insertions(+), 68 deletions(-) diff --git a/src/facet/inspection/_learner_inspector.py b/src/facet/inspection/_learner_inspector.py index 7fb496fad..a3f0e6818 100644 --- a/src/facet/inspection/_learner_inspector.py +++ b/src/facet/inspection/_learner_inspector.py @@ -3,10 +3,12 @@ """ import logging import re +from abc import ABCMeta, abstractmethod from typing import Any, Dict, Generic, List, Optional, TypeVar, Union, cast import pandas as pd -from sklearn.base import is_classifier, is_regressor +from sklearn.base import BaseEstimator, is_classifier, is_regressor +from sklearn.pipeline import Pipeline from pytools.api import AllTracker, inheritdoc, subsdoc from sklearndf import SupervisedLearnerDF @@ -26,6 +28,7 @@ __all__ = [ "LearnerInspector", + "NativeLearnerInspector", ] @@ -34,7 +37,9 @@ # T_SupervisedLearnerDF = TypeVar("T_SupervisedLearnerDF", bound=SupervisedLearnerDF) - +T_SupervisedLearner = TypeVar( + "T_SupervisedLearner", bound=Union[NativeSupervisedLearner, Pipeline] +) # # Ensure all symbols introduced below are included in __all__ @@ -61,8 +66,8 @@ replacement="Explain a regressor or classifier based on SHAP", ) @inheritdoc(match="""[see superclass]""") -class LearnerInspector( - ModelInspector[T_SupervisedLearnerDF], Generic[T_SupervisedLearnerDF] +class _BaseLearnerInspector( + ModelInspector[T_SupervisedLearner], Generic[T_SupervisedLearner], metaclass=ABCMeta ): """[see superclass]""" @@ -76,23 +81,17 @@ class LearnerInspector( #: The factory instance used to create the explainer for the learner. explainer_factory: ExplainerFactory[NativeSupervisedLearner] - #: The learner being inspected. - #: - #: If the model is a pipeline, this is the final estimator in the pipeline; - #: otherwise, it is the model itself. - learner: SupervisedLearnerDF + #: the supervised learner to inspect; this is either identical with + #: :attr:`model`, or the final estimator of :attr:`model` if :attr:`model` + #: is a pipeline + learner: NativeSupervisedLearner - # defined in superclass, repeated here for Sphinx: - model: T_SupervisedLearnerDF - shap_interaction: bool - n_jobs: Optional[int] - shared_memory: Optional[bool] - pre_dispatch: Optional[Union[str, int]] - verbose: Optional[int] + # the SHAP calculator used by this inspector + _shap_calculator: Optional[LearnerShapCalculator[Any]] def __init__( self, - model: T_SupervisedLearnerDF, + model: T_SupervisedLearner, *, explainer_factory: Optional[ExplainerFactory[NativeSupervisedLearner]] = None, shap_interaction: bool = True, @@ -102,33 +101,20 @@ def __init__( verbose: Optional[int] = None, ) -> None: """ - :param model: the learner or learner pipeline to inspect (typically, one of - a :class:`~sklearndf.pipeline.ClassifierPipelineDF`, - :class:`~sklearndf.pipeline.RegressorPipelineDF`, - :class:`~sklearndf.classification.ClassifierDF`, or - :class:`~sklearndf.regression.RegressorDF`) + :param model: the learner or learner pipeline to inspect :param explainer_factory: optional function that creates a shap Explainer (default: ``TreeExplainerFactory``) """ - if not model.is_fitted: + fitted = self._is_model_fitted(model) + if not fitted: raise ValueError("arg model must be fitted") - learner: SupervisedLearnerDF - - if isinstance(model, SupervisedLearnerPipelineDF): - learner = model.final_estimator - elif isinstance(model, SupervisedLearnerDF): - learner = model - else: - raise TypeError( - "arg model must be a SupervisedLearnerPipelineDF or a " - f"SupervisedLearnerDF, but is a {type(model).__name__}" - ) - self.learner = learner + learner = self._get_learner(model) if is_classifier(learner): try: + # noinspection PyUnresolvedReferences n_outputs = learner.n_outputs_ except AttributeError: pass @@ -137,7 +123,7 @@ def __init__( raise ValueError( "only single-target classifiers (binary or multi-class) are " "supported, but the given classifier has been fitted on " - f"multiple targets: {', '.join(learner.output_names_)}" + f"multiple targets: {', '.join(model.output_names_)}" ) elif not is_regressor(learner): raise TypeError( @@ -173,31 +159,31 @@ def __init__( ) self.explainer_factory = explainer_factory + self.learner = learner self._shap_calculator: Optional[LearnerShapCalculator[Any]] = None __init__.__doc__ = str(__init__.__doc__) + re.sub( r"(?m)^\s*:param model:\s+.*$", "", str(ModelInspector.__init__.__doc__) ) + @property + @abstractmethod + def native_learner(self) -> NativeSupervisedLearner: + """ + The native learner to inspect. + """ + @property def feature_names(self) -> List[str]: """[see superclass]""" + # noinspection PyUnresolvedReferences return cast( List[str], - self.learner.feature_names_in_.to_list(), + # feature_names_in_ is a pandas index (sklearndf) or an ndarray (sklearn); + # we convert it to a list + self.learner.feature_names_in_.tolist(), ) - def preprocess_features( - self, features: Union[pd.DataFrame, pd.Series] - ) -> pd.DataFrame: - """[see superclass]""" - if self.model is self.learner: - # we have a simple learner: no preprocessing needed - return features - else: - # we have a pipeline: preprocess features - return self.model.preprocess(features) - @property def shap_calculator(self) -> LearnerShapCalculator[Any]: """[see superclass]""" @@ -205,10 +191,10 @@ def shap_calculator(self) -> LearnerShapCalculator[Any]: if self._shap_calculator is not None: return self._shap_calculator - learner: SupervisedLearnerDF = self.learner + native_learner = self.native_learner shap_calculator_params: Dict[str, Any] = dict( - model=self.learner.native_estimator, + model=native_learner, interaction_values=self.shap_interaction, explainer_factory=self.explainer_factory, n_jobs=self.n_jobs, @@ -218,15 +204,271 @@ def shap_calculator(self) -> LearnerShapCalculator[Any]: ) shap_calculator: LearnerShapCalculator[Any] - if is_classifier(learner): + if is_classifier(native_learner): shap_calculator = ClassifierShapCalculator(**shap_calculator_params) else: shap_calculator = RegressorShapCalculator( - **shap_calculator_params, output_names=learner.output_names_ + **shap_calculator_params, output_names=self._learner_output_names ) self._shap_calculator = shap_calculator return shap_calculator + @property + @abstractmethod + def _learner_output_names(self) -> List[str]: + """ + The names of the outputs of the learner. + """ + pass + + @staticmethod + @abstractmethod + def _is_model_fitted(model: T_SupervisedLearner) -> bool: + # return True if the model is fitted, False otherwise + pass + + @staticmethod + @abstractmethod + def _get_learner(model: T_SupervisedLearner) -> NativeSupervisedLearner: + # get the learner class from the model, which may be a pipeline + # that includes additional preprocessing steps + pass + + +@inheritdoc(match="""[see superclass]""") +class LearnerInspector( + _BaseLearnerInspector[T_SupervisedLearnerDF], Generic[T_SupervisedLearnerDF] +): + """[see superclass]""" + + # defined in superclass, repeated here for Sphinx: + model: T_SupervisedLearnerDF + shap_interaction: bool + n_jobs: Optional[int] + shared_memory: Optional[bool] + pre_dispatch: Optional[Union[str, int]] + verbose: Optional[int] + explainer_factory: ExplainerFactory[NativeSupervisedLearner] + learner: SupervisedLearnerDF + + @subsdoc( + pattern=r"(?m)^(\s*:param model:\s+.*)$", + replacement=r"""\1 (typically, one of + a :class:`~sklearndf.pipeline.ClassifierPipelineDF`, + :class:`~sklearndf.pipeline.RegressorPipelineDF`, + :class:`~sklearndf.classification.ClassifierDF`, or + :class:`~sklearndf.regression.RegressorDF`)""", + using=_BaseLearnerInspector.__init__, + ) + def __init__( + self, + model: T_SupervisedLearnerDF, + *, + explainer_factory: Optional[ExplainerFactory[NativeSupervisedLearner]] = None, + shap_interaction: bool = True, + n_jobs: Optional[int] = None, + shared_memory: Optional[bool] = None, + pre_dispatch: Optional[Union[str, int]] = None, + verbose: Optional[int] = None, + ) -> None: + super().__init__( + model=model, + explainer_factory=explainer_factory, + shap_interaction=shap_interaction, + n_jobs=n_jobs, + shared_memory=shared_memory, + pre_dispatch=pre_dispatch, + verbose=verbose, + ) + + @property + def native_learner(self) -> NativeSupervisedLearner: + """[see superclass]""" + return cast(NativeSupervisedLearner, self.learner.native_estimator) + + @property + def _learner_output_names(self) -> List[str]: + """[see superclass]""" + return self.learner.output_names_ + + def preprocess_features( + self, features: Union[pd.DataFrame, pd.Series] + ) -> pd.DataFrame: + """[see superclass]""" + if self.model is self.learner: + # we have a simple learner: no preprocessing needed + return features + else: + # we have a pipeline: preprocess features + return self.model.preprocess(features) + + @staticmethod + def _is_model_fitted(model: T_SupervisedLearnerDF) -> bool: + return model.is_fitted + + @staticmethod + def _get_learner(model: T_SupervisedLearnerDF) -> SupervisedLearnerDF: + if isinstance(model, SupervisedLearnerPipelineDF): + return cast(SupervisedLearnerDF, model.final_estimator) + elif isinstance(model, SupervisedLearnerDF): + return model + else: + raise TypeError( + "arg model must be a SupervisedLearnerPipelineDF or a " + f"SupervisedLearnerDF, but is a {type(model).__name__}" + ) + + +@inheritdoc(match="""[see superclass]""") +class NativeLearnerInspector( + _BaseLearnerInspector[T_SupervisedLearner], Generic[T_SupervisedLearner] +): + """[see superclass]""" + + #: The default explainer factory used by this inspector. + #: This is a tree explainer using the tree_path_dependent method for + #: feature perturbation, so we can calculate SHAP interaction values. + DEFAULT_EXPLAINER_FACTORY = TreeExplainerFactory( + feature_perturbation="tree_path_dependent", uses_background_dataset=False + ) + + # defined in superclass, repeated here for Sphinx: + model: T_SupervisedLearner + shap_interaction: bool + n_jobs: Optional[int] + shared_memory: Optional[bool] + pre_dispatch: Optional[Union[str, int]] + verbose: Optional[int] + explainer_factory: ExplainerFactory[NativeSupervisedLearner] + learner: NativeSupervisedLearner + + @subsdoc( + pattern=r"(?m)^(\s*:param model:\s+.*)$", + replacement=r"""\1 (either a scikit-learn :class:`~sklearn.pipeline.Pipeline`, + or a regressor or classifier that implements the scikit-learn API)""", + using=_BaseLearnerInspector.__init__, + ) + def __init__( + self, + model: T_SupervisedLearner, + *, + explainer_factory: Optional[ExplainerFactory[NativeSupervisedLearner]] = None, + shap_interaction: bool = True, + n_jobs: Optional[int] = None, + shared_memory: Optional[bool] = None, + pre_dispatch: Optional[Union[str, int]] = None, + verbose: Optional[int] = None, + ) -> None: + super().__init__( + model=model, + explainer_factory=explainer_factory, + shap_interaction=shap_interaction, + n_jobs=n_jobs, + shared_memory=shared_memory, + pre_dispatch=pre_dispatch, + verbose=verbose, + ) + + @property + def native_learner(self) -> NativeSupervisedLearner: + return self.learner + + @property + def _learner_output_names(self) -> List[str]: + # we try to get the number of outputs from the learner; if that fails, + # we assume that the learner was fitted on a single target + n_outputs = getattr(self.learner, "n_outputs_", 1) + if n_outputs == 1: + return ["y"] + else: + return [f"y_{i}" for i in range(n_outputs)] + + def preprocess_features( + self, features: Union[pd.DataFrame, pd.Series] + ) -> pd.DataFrame: + """[see superclass]""" + if self.learner is self.model: + # we have a single learner: do not preprocess + return features + else: + # we have a pipeline: preprocessing is the first part of the pipeline + preprocessing = self.model[:-1] + return pd.DataFrame( + preprocessing.transform(features), + index=features.index, + columns=preprocessing.get_feature_names_out(), + ) + + @staticmethod + def _is_model_fitted(model: T_SupervisedLearner) -> bool: + return is_fitted(model) + + @staticmethod + def _get_learner(model: T_SupervisedLearner) -> NativeSupervisedLearner: + if isinstance(model, Pipeline): + try: + return model[-1] + except IndexError: + raise ValueError("arg model is an empty pipeline") + else: + return model + __tracker.validate() + + +# +# Private auxiliary methods +# + + +def is_fitted(estimator: BaseEstimator) -> bool: + """ + Check if the estimator is fitted. + + :param estimator: a scikit-learn estimator instance + :return: ``True`` if the estimator is fitted; ``False`` otherwise + """ + + if not isinstance(estimator, BaseEstimator): + raise TypeError( + "arg estimator must be a scikit-learn estimator, but is a " + f"{type(estimator).__name__}" + ) + + # get all properties of the estimator (instances of class ``property``) + fitted_properties = { + name + for name, value in vars(type(estimator)).items() + if ( + # we're only interested in properties that scikit-learn + # sets when fitting a learner + name.endswith("_") + and not name.startswith("_") + and isinstance(value, property) + ) + } + + # get all attributes ending with an underscore - these are only set as an estimator + # is fitted + fitted_attributes = [ + name + for name in vars(estimator) + if name not in fitted_properties + and name.endswith("_") + and not name.startswith("_") + ] + + if fitted_attributes: + # we have at least one fitted attribute: the estimator is fitted + return True + + # ensure that at least one of the fitted properties is defined + for p in fitted_properties: + if hasattr(estimator, p): + return True + + # the estimator has no fitted attributes and no fitted properties: + # it is not fitted + return False diff --git a/src/facet/inspection/shap/_shap.py b/src/facet/inspection/shap/_shap.py index 671821a85..4a91b6763 100644 --- a/src/facet/inspection/shap/_shap.py +++ b/src/facet/inspection/shap/_shap.py @@ -205,7 +205,7 @@ def shap_values(self) -> pd.DataFrame: assert self.shap_ is not None, ASSERTION__CALCULATOR_IS_FITTED if self.interaction_values: - return self.shap_.groupby(level=0).sum() + return self.shap_.groupby(level=0, sort=False).sum() else: return self.shap_ diff --git a/test/test/conftest.py b/test/test/conftest.py index bbfcb15b8..cbeb1826c 100644 --- a/test/test/conftest.py +++ b/test/test/conftest.py @@ -41,8 +41,9 @@ # print the FACET logo print(facet.__logo__) -# disable SHAP debugging messages +# disable 3rd party debugging messages logging.getLogger("shap").setLevel(logging.WARNING) +logging.getLogger("numba").setLevel(logging.WARNING) # configure pandas text output diff --git a/test/test/facet/test_inspection.py b/test/test/facet/test_inspection.py index 04c0c6ec6..c2af4c603 100644 --- a/test/test/facet/test_inspection.py +++ b/test/test/facet/test_inspection.py @@ -4,15 +4,19 @@ import logging import platform import warnings -from typing import Any, Dict, List, Optional, Type, TypeVar, cast +from typing import Any, Dict, List, Optional, Set, Type, TypeVar, Union, cast import numpy as np import pandas as pd import pytest from numpy.testing import assert_allclose +from pandas._testing import assert_index_equal from pandas.testing import assert_frame_equal, assert_series_equal +from sklearn.base import BaseEstimator from sklearn.datasets import make_classification +from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline from pytools.data import LinkageTree, Matrix from pytools.viz.dendrogram import DendrogramDrawer, DendrogramReportStyle @@ -33,7 +37,7 @@ TreeExplainerFactory, ) from facet.explanation.base import ExplainerFactory -from facet.inspection import FunctionInspector, LearnerInspector +from facet.inspection import FunctionInspector, LearnerInspector, NativeLearnerInspector from facet.selection import LearnerSelector # noinspection PyMissingOrEmptyDocstring @@ -80,20 +84,56 @@ def test_regressor_selector( (PermutationExplainerFactory, {}), ], ) +@pytest.mark.parametrize( # type: ignore + argnames="native", + argvalues=(False, True), +) def test_model_inspection( explainer_factory_cls: Type[ExplainerFactory[LGBMRegressorDF]], explainer_factory_args: Dict[str, Any], best_lgbm_model: RegressorPipelineDF[LGBMRegressorDF], sample: Sample, n_jobs: int, + native: bool, ) -> None: # test the ModelInspector with the given explainer factory: - inspector = LearnerInspector( - model=best_lgbm_model, - explainer_factory=explainer_factory_cls(**explainer_factory_args), - n_jobs=n_jobs, - ).fit(sample) + explainer_factory: ExplainerFactory[LGBMRegressorDF] = explainer_factory_cls( + **explainer_factory_args + ) + + inspector: Union[ + LearnerInspector[RegressorPipelineDF[LGBMRegressorDF]], + NativeLearnerInspector[Pipeline], + ] + + if native: + assert ( + best_lgbm_model.preprocessing is not None + ), "preprocessing step must be defined" + # noinspection PyTypeChecker + inspector = NativeLearnerInspector( + model=( + # create and fit a native pipeline from the regressor pipeline + Pipeline( + steps=[ + ( + "preprocessing", + best_lgbm_model.preprocessing.native_estimator, + ), + ("regressor", best_lgbm_model.regressor.native_estimator), + ] + ).fit(X=sample.features, y=sample.target) + ), + explainer_factory=explainer_factory, + n_jobs=n_jobs, + ).fit(sample) + else: + inspector = LearnerInspector( + model=best_lgbm_model, + explainer_factory=explainer_factory, + n_jobs=n_jobs, + ).fit(sample) shap_values: pd.DataFrame = inspector.shap_values() @@ -106,9 +146,22 @@ def test_model_inspection( assert shap_values.columns.names == [Sample.IDX_FEATURE] # column index - assert set(shap_values.columns) == set( - inspector.model.final_estimator.feature_names_in_ - ) + regressor: BaseEstimator + if native: + regressor = inspector.model[-1] + else: + regressor = inspector.model.regressor + + regressor_feature_names: Set[str] + if native: + regressor_feature_names = set(inspector.model[:-1].get_feature_names_out()) + else: + regressor_feature_names = set(regressor.feature_names_in_) + + assert set(shap_values.columns) == set(regressor_feature_names) + + # check that the row order has been preserved + assert_index_equal(shap_values.index, sample.index) # check that the SHAP values add up to the predictions shap_totals = shap_values.sum(axis=1) @@ -121,6 +174,7 @@ def test_model_inspection( assert ( round((shap_minus_pred - shap_minus_pred.mean()).abs().mean(), 12) == 0.0 ), "predictions matching total SHAP" + # validate the linkage tree of the resulting inspector # if the inspector supports interaction values, test the redundancy linkage @@ -438,23 +492,42 @@ def _check_probabilities( # noinspection DuplicatedCode +@pytest.mark.parametrize( # type: ignore + argnames="native", + argvalues=(False, True), +) def test_model_inspection_classifier_interaction( iris_classifier_binary: ClassifierPipelineDF[RandomForestClassifierDF], iris_sample_binary: Sample, n_jobs: int, + native: bool, ) -> None: warnings.filterwarnings("ignore", message="You are accessing a training score") - model_inspector = LearnerInspector( - model=iris_classifier_binary.final_estimator, + cls_inspector: Type[ + Union[ + LearnerInspector[RandomForestClassifierDF], + NativeLearnerInspector[RandomForestClassifier], + ] + ] + learner: Union[RandomForestClassifierDF, RandomForestClassifier] + if native: + cls_inspector = NativeLearnerInspector[RandomForestClassifier] + learner = iris_classifier_binary.final_estimator.native_estimator + else: + cls_inspector = LearnerInspector[RandomForestClassifierDF] + learner = iris_classifier_binary.final_estimator + + model_inspector = cls_inspector( + model=learner, explainer_factory=TreeExplainerFactory( feature_perturbation="tree_path_dependent", uses_background_dataset=True ), n_jobs=n_jobs, ).fit(iris_sample_binary) - model_inspector_no_interaction = LearnerInspector( - model=iris_classifier_binary, + model_inspector_no_interaction = cls_inspector( + model=learner, shap_interaction=False, explainer_factory=TreeExplainerFactory( feature_perturbation="tree_path_dependent", uses_background_dataset=True From b67f376fe536c9993cff14fa887e44cdab9691e7 Mon Sep 17 00:00:00 2001 From: j-ittner Date: Mon, 10 Jul 2023 08:56:05 +0200 Subject: [PATCH 14/22] REFACTOR: reformat code with newest `black` --- src/facet/explanation/_explanation.py | 1 + src/facet/inspection/shap/_shap.py | 1 - src/facet/inspection/shap/sklearn/_sklearn.py | 2 -- src/facet/selection/_parameters.py | 3 --- src/facet/validation/_validation.py | 2 -- test/test/facet/test_partition.py | 3 --- test/test/facet/test_selection.py | 1 - test/test/facet/test_simulation.py | 5 ----- test/test/facet/test_validation.py | 1 - 9 files changed, 1 insertion(+), 18 deletions(-) diff --git a/src/facet/explanation/_explanation.py b/src/facet/explanation/_explanation.py index 9da485a88..25116f6cb 100644 --- a/src/facet/explanation/_explanation.py +++ b/src/facet/explanation/_explanation.py @@ -344,6 +344,7 @@ def to_expression(self) -> Expression: # Exact explainer factory # + # noinspection PyPep8Naming class _ExactExplainer( shap.explainers.Exact, # type: ignore diff --git a/src/facet/inspection/shap/_shap.py b/src/facet/inspection/shap/_shap.py index dfd3a4ac5..671821a85 100644 --- a/src/facet/inspection/shap/_shap.py +++ b/src/facet/inspection/shap/_shap.py @@ -290,7 +290,6 @@ def _reset_fit(self) -> None: self.output_names_ = None def _make_explainer(self, features: pd.DataFrame) -> BaseExplainer: - # prepare the background dataset background_dataset: Optional[pd.DataFrame] diff --git a/src/facet/inspection/shap/sklearn/_sklearn.py b/src/facet/inspection/shap/sklearn/_sklearn.py index 80b5aaa02..ea6f1b2fa 100644 --- a/src/facet/inspection/shap/sklearn/_sklearn.py +++ b/src/facet/inspection/shap/sklearn/_sklearn.py @@ -241,7 +241,6 @@ def _convert_shap_tensors_to_list( shap_tensors: Union[npt.NDArray[np.float_], List[npt.NDArray[np.float_]]], n_outputs: int, ) -> List[npt.NDArray[np.float_]]: - if n_outputs == 1 and isinstance(shap_tensors, list) and len(shap_tensors) == 2: # in the binary classification case, we will proceed with SHAP values # for class 0 only, since values for class 1 will just be the same @@ -273,7 +272,6 @@ def _convert_shap_to_df( observation_idx: pd.Index, feature_idx: pd.Index, ) -> List[pd.DataFrame]: - if self.interaction_values: # return a list of data frame [(obs x features) x features], # one for each of the outputs diff --git a/src/facet/selection/_parameters.py b/src/facet/selection/_parameters.py index 778d0e02d..a872aefce 100644 --- a/src/facet/selection/_parameters.py +++ b/src/facet/selection/_parameters.py @@ -185,7 +185,6 @@ def get_parameters(self, prefix: Optional[str] = None) -> ParameterDict: } def _validate_parameter(self, name: str, value: ParameterSet) -> None: - if name not in self._params: raise AttributeError( f"unknown parameter name for " @@ -222,7 +221,6 @@ def __dir__(self) -> Iterable[str]: def __getattr__(self, key: str) -> Any: if not key.startswith("_"): - result: Union[ParameterSpace[Any], ParameterSet, None] result = self._children.get(key, None) @@ -241,7 +239,6 @@ def __iter__(self) -> Iterator[Tuple[List[str], ParameterSet]]: def _iter_parameters( self, path_prefix: List[str] ) -> Iterator[Tuple[List[str], ParameterSet]]: - yield from ( ([*path_prefix, name], value) for name, value in self._values.items() ) diff --git a/src/facet/validation/_validation.py b/src/facet/validation/_validation.py index ffe927e60..4c45fef83 100644 --- a/src/facet/validation/_validation.py +++ b/src/facet/validation/_validation.py @@ -115,7 +115,6 @@ def split( test: npt.NDArray[np.int_] = indices[test_mask] # make sure test is not empty, else sample another train set if len(test) > 0: - yield train, test break @@ -254,7 +253,6 @@ def _select_train_indices( random_state: np.random.RandomState, y: Union[npt.NDArray[Any], pd.Series, pd.DataFrame, None], ) -> npt.NDArray[np.int_]: - mean_block_size = self.mean_block_size if mean_block_size < 1: # if mean block size was set as a percentage, calculate the actual mean diff --git a/test/test/facet/test_partition.py b/test/test/facet/test_partition.py index cd2cf0113..2f47345c6 100644 --- a/test/test/facet/test_partition.py +++ b/test/test/facet/test_partition.py @@ -18,7 +18,6 @@ def test_discrete_partitioning() -> None: np.random.seed(42) for _ in range(10): - values = np.random.randint( low=0, high=10000, size=np.random.randint(low=100, high=200) ) @@ -51,7 +50,6 @@ def test_continuous_partitioning() -> None: np.random.seed(42) for _ in range(10): - values = np.random.normal( loc=3.0, scale=8.0, size=np.random.randint(low=2000, high=4000) ) @@ -100,7 +98,6 @@ def test_category_partitioning() -> None: def test_partition_with_invalid_values() -> None: - arr_empty = np.array([]) arr_single = np.array([1]) arr_multi = np.array([1, 1, 1, 10, 1]) diff --git a/test/test/facet/test_selection.py b/test/test/facet/test_selection.py index e28143fe3..e54b8c930 100644 --- a/test/test/facet/test_selection.py +++ b/test/test/facet/test_selection.py @@ -40,7 +40,6 @@ def test_learner_selector( sample: Sample, n_jobs: int, ) -> None: - expected_scores = [ 0.669, 0.649, diff --git a/test/test/facet/test_simulation.py b/test/test/facet/test_simulation.py index a68b3d2a4..701969cf9 100644 --- a/test/test/facet/test_simulation.py +++ b/test/test/facet/test_simulation.py @@ -72,7 +72,6 @@ def uplift_simulator( def test_univariate_target_simulation( target_simulator: UnivariateTargetSimulator, ) -> None: - parameterized_feature = "HouseAge" partitioner = ContinuousRangePartitioner(max_partitions=10) @@ -135,7 +134,6 @@ def test_univariate_target_simulation( def test_univariate_target_subsample_simulation_80( model: RegressorPipelineDF[LGBMRegressorDF], subsample: Sample, n_jobs: int ) -> None: - parameterized_feature = "HouseAge" partitioner = ContinuousRangePartitioner(max_partitions=10) @@ -201,7 +199,6 @@ def test_univariate_target_subsample_simulation_80( def test_univariate_uplift_subsample_simulation_95( model: RegressorPipelineDF[LGBMRegressorDF], subsample: Sample, n_jobs: int ) -> None: - parameterized_feature = "HouseAge" partitioner = ContinuousRangePartitioner(max_partitions=10) @@ -266,7 +263,6 @@ def test_univariate_uplift_subsample_simulation_95( def test_univariate_uplift_simulation( uplift_simulator: UnivariateUpliftSimulator, ) -> None: - parameterized_feature = "HouseAge" partitioner = ContinuousRangePartitioner(max_partitions=10) @@ -329,7 +325,6 @@ def test_univariate_uplift_simulation( def test_univariate_uplift_subsample_simulation( model: RegressorPipelineDF[LGBMRegressorDF], subsample: Sample, n_jobs: int ) -> None: - parameterized_feature = "HouseAge" partitioner = ContinuousRangePartitioner(max_partitions=10) diff --git a/test/test/facet/test_validation.py b/test/test/facet/test_validation.py index a183eae00..51bde3d7b 100644 --- a/test/test/facet/test_validation.py +++ b/test/test/facet/test_validation.py @@ -30,7 +30,6 @@ def test_bootstrap_cv_init() -> None: def test_get_train_test_splits_as_indices() -> None: - n_test_splits = 200 test_x = np.arange(0, 1000, 1) From 1833b3ceb35e697cdc78111e7aeec648f07893c9 Mon Sep 17 00:00:00 2001 From: Jan Ittner Date: Mon, 10 Jul 2023 17:39:08 +0200 Subject: [PATCH 15/22] FIX: test native learner inspectors w/o preprocessing for sklearn 1.0.x (#372) * REFACTOR: reformat code with newest `black` * BUILD: drop support for scikit-learn 0.x * FIX: suppress numpy warnings when patching to support legacy types * FIX: test native learner inspectors w/o preprocessing for sklearn 1.0.x --- pyproject.toml | 5 ++- src/facet/explanation/base/_base.py | 12 +++--- test/test/facet/test_inspection.py | 62 ++++++++++++++++++----------- test/test/facet/test_partition.py | 3 -- test/test/facet/test_selection.py | 1 - test/test/facet/test_simulation.py | 5 --- test/test/facet/test_validation.py | 1 - 7 files changed, 47 insertions(+), 42 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f410a9c60..cf3c4b122 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ requires = [ "pandas >=1.0", "scipy ~=1.2", "shap >=0.39", + "scikit-learn ~=1.0", "sklearndf ~=2.2", "typing_extensions ~=4.0", ] @@ -85,7 +86,7 @@ typing_extensions = "~=4.0.0" # additional minimum requirements of sklearndf boruta = "~=0.3.0" lightgbm = "~=3.0.0" -scikit-learn = "~=0.24.2" +scikit-learn = "~=1.0.2" xgboost = "~=1.5" # additional minimum requirements of gamma-pytools joblib = "~=0.14.1" @@ -109,7 +110,7 @@ typing_extensions = "~=4.3" # additional maximum requirements of sklearndf boruta = "~=0.3" lightgbm = "~=3.3" -scikit-learn = "~=1.1" +scikit-learn = "~=1.2" xgboost = "~=1.5" # additional maximum requirements of gamma-pytools joblib = "~=1.1" diff --git a/src/facet/explanation/base/_base.py b/src/facet/explanation/base/_base.py index 2b8a0017d..f5a0ca180 100644 --- a/src/facet/explanation/base/_base.py +++ b/src/facet/explanation/base/_base.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +from packaging.version import Version from shap import Explainer, Explanation from pytools.api import AllTracker @@ -26,14 +27,13 @@ # shap relies on the np.bool, np.int, and np.float types, which were deprecated in # numpy 1.20 and removed in numpy 1.24. # -# We check if the types are defined and, if not, define them as an alias -# for the corresponding type with a trailing underscore. +# We define these types as an alias for the corresponding type with a trailing +# underscore. - -for __attr in ("bool", "int", "float"): - if not hasattr(np, __attr): +if Version(np.__version__) >= Version("1.20"): + for __attr in ("bool", "int", "float"): setattr(np, __attr, getattr(np, f"{__attr}_")) -del __attr + del __attr # diff --git a/test/test/facet/test_inspection.py b/test/test/facet/test_inspection.py index c2af4c603..5c2985f46 100644 --- a/test/test/facet/test_inspection.py +++ b/test/test/facet/test_inspection.py @@ -12,7 +12,6 @@ from numpy.testing import assert_allclose from pandas._testing import assert_index_equal from pandas.testing import assert_frame_equal, assert_series_equal -from sklearn.base import BaseEstimator from sklearn.datasets import make_classification from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import GridSearchCV @@ -107,23 +106,49 @@ def test_model_inspection( NativeLearnerInspector[Pipeline], ] + regressor_feature_names: Set[str] # column index names + if native: assert ( best_lgbm_model.preprocessing is not None ), "preprocessing step must be defined" + + regressor = best_lgbm_model.regressor.native_estimator + + if __sklearn_version__ < __sklearn_1_1__: + # scikit-learn 1.0.x does not support output feature names in simple + # imputers, so we cannot use this for preprocessing + log.warning( + f"scikit-learn {__sklearn_version__} does not support output " + "feature names in simple imputers, so we will test the native learner " + "inspector without preprocessing" + ) + assert ( + sample.features.notna().all().all() + ), "observations must not contain missing values" + model = regressor.fit(X=sample.features, y=sample.target) + regressor_feature_names = set(sample.feature_names) + + else: + # scikit-learn 1.1.x supports output feature names in simple imputers, + # so we can use this for preprocessing + model = Pipeline( + # create a native pipeline from the regressor pipeline + steps=[ + ( + "preprocessing", + best_lgbm_model.preprocessing.native_estimator, + ), + ("regressor", regressor), + ] + ).fit(X=sample.features, y=sample.target) + regressor_feature_names = set(model[:-1].get_feature_names_out()) + # noinspection PyTypeChecker inspector = NativeLearnerInspector( model=( - # create and fit a native pipeline from the regressor pipeline - Pipeline( - steps=[ - ( - "preprocessing", - best_lgbm_model.preprocessing.native_estimator, - ), - ("regressor", best_lgbm_model.regressor.native_estimator), - ] - ).fit(X=sample.features, y=sample.target) + # fit the model on the sample + model.fit(X=sample.features, y=sample.target) ), explainer_factory=explainer_factory, n_jobs=n_jobs, @@ -134,6 +159,7 @@ def test_model_inspection( explainer_factory=explainer_factory, n_jobs=n_jobs, ).fit(sample) + regressor_feature_names = set(best_lgbm_model.regressor.feature_names_in_) shap_values: pd.DataFrame = inspector.shap_values() @@ -145,19 +171,7 @@ def test_model_inspection( assert shap_values.index.names == [Sample.IDX_OBSERVATION] assert shap_values.columns.names == [Sample.IDX_FEATURE] - # column index - regressor: BaseEstimator - if native: - regressor = inspector.model[-1] - else: - regressor = inspector.model.regressor - - regressor_feature_names: Set[str] - if native: - regressor_feature_names = set(inspector.model[:-1].get_feature_names_out()) - else: - regressor_feature_names = set(regressor.feature_names_in_) - + # check that the column names are the same as the feature names assert set(shap_values.columns) == set(regressor_feature_names) # check that the row order has been preserved diff --git a/test/test/facet/test_partition.py b/test/test/facet/test_partition.py index cd2cf0113..2f47345c6 100644 --- a/test/test/facet/test_partition.py +++ b/test/test/facet/test_partition.py @@ -18,7 +18,6 @@ def test_discrete_partitioning() -> None: np.random.seed(42) for _ in range(10): - values = np.random.randint( low=0, high=10000, size=np.random.randint(low=100, high=200) ) @@ -51,7 +50,6 @@ def test_continuous_partitioning() -> None: np.random.seed(42) for _ in range(10): - values = np.random.normal( loc=3.0, scale=8.0, size=np.random.randint(low=2000, high=4000) ) @@ -100,7 +98,6 @@ def test_category_partitioning() -> None: def test_partition_with_invalid_values() -> None: - arr_empty = np.array([]) arr_single = np.array([1]) arr_multi = np.array([1, 1, 1, 10, 1]) diff --git a/test/test/facet/test_selection.py b/test/test/facet/test_selection.py index e28143fe3..e54b8c930 100644 --- a/test/test/facet/test_selection.py +++ b/test/test/facet/test_selection.py @@ -40,7 +40,6 @@ def test_learner_selector( sample: Sample, n_jobs: int, ) -> None: - expected_scores = [ 0.669, 0.649, diff --git a/test/test/facet/test_simulation.py b/test/test/facet/test_simulation.py index a68b3d2a4..701969cf9 100644 --- a/test/test/facet/test_simulation.py +++ b/test/test/facet/test_simulation.py @@ -72,7 +72,6 @@ def uplift_simulator( def test_univariate_target_simulation( target_simulator: UnivariateTargetSimulator, ) -> None: - parameterized_feature = "HouseAge" partitioner = ContinuousRangePartitioner(max_partitions=10) @@ -135,7 +134,6 @@ def test_univariate_target_simulation( def test_univariate_target_subsample_simulation_80( model: RegressorPipelineDF[LGBMRegressorDF], subsample: Sample, n_jobs: int ) -> None: - parameterized_feature = "HouseAge" partitioner = ContinuousRangePartitioner(max_partitions=10) @@ -201,7 +199,6 @@ def test_univariate_target_subsample_simulation_80( def test_univariate_uplift_subsample_simulation_95( model: RegressorPipelineDF[LGBMRegressorDF], subsample: Sample, n_jobs: int ) -> None: - parameterized_feature = "HouseAge" partitioner = ContinuousRangePartitioner(max_partitions=10) @@ -266,7 +263,6 @@ def test_univariate_uplift_subsample_simulation_95( def test_univariate_uplift_simulation( uplift_simulator: UnivariateUpliftSimulator, ) -> None: - parameterized_feature = "HouseAge" partitioner = ContinuousRangePartitioner(max_partitions=10) @@ -329,7 +325,6 @@ def test_univariate_uplift_simulation( def test_univariate_uplift_subsample_simulation( model: RegressorPipelineDF[LGBMRegressorDF], subsample: Sample, n_jobs: int ) -> None: - parameterized_feature = "HouseAge" partitioner = ContinuousRangePartitioner(max_partitions=10) diff --git a/test/test/facet/test_validation.py b/test/test/facet/test_validation.py index a183eae00..51bde3d7b 100644 --- a/test/test/facet/test_validation.py +++ b/test/test/facet/test_validation.py @@ -30,7 +30,6 @@ def test_bootstrap_cv_init() -> None: def test_get_train_test_splits_as_indices() -> None: - n_test_splits = 200 test_x = np.arange(0, 1000, 1) From c075e10a218f68e7055c8d2f6e927b93032cdcff Mon Sep 17 00:00:00 2001 From: j-ittner Date: Mon, 10 Jul 2023 23:31:59 +0200 Subject: [PATCH 16/22] BUILD: update min and max matrix test package dependencies --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cf3c4b122..31cca6c31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,13 +87,13 @@ typing_extensions = "~=4.0.0" boruta = "~=0.3.0" lightgbm = "~=3.0.0" scikit-learn = "~=1.0.2" -xgboost = "~=1.5" +xgboost = "~=1.5.0" # additional minimum requirements of gamma-pytools joblib = "~=0.14.1" typing_inspect = "~=0.4.0" # additional minimum requirements of shap ipython = "==7.0" -numba = "~=0.55" # required to support numpy 1.21 +numba = "~=0.55.2" # required to support numpy 1.21 [build.matrix.max] # direct requirements of gamma-facet @@ -117,7 +117,7 @@ joblib = "~=1.1" typing_inspect = "~=0.7" # additional maximum requirements of shap ipython = ">=7" -numba = ">=0.55.2" # required to support numpy 1.22 +numba = "~=0.56" [tool.black] # quiet = "True" From 921b385066e7b659d102168e6c9282e5a1e489d1 Mon Sep 17 00:00:00 2001 From: j-ittner Date: Mon, 10 Jul 2023 23:59:12 +0200 Subject: [PATCH 17/22] BUILD: require zipp<3.16 for min test with Python 3.7 --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 31cca6c31..90b9a64fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,8 @@ typing_inspect = "~=0.4.0" # additional minimum requirements of shap ipython = "==7.0" numba = "~=0.55.2" # required to support numpy 1.21 +# additional minimum requirements +zipp = "<3.16" # required to support python 3.7 [build.matrix.max] # direct requirements of gamma-facet From e880e6fbdfbf5078dc96092a24f59bb34801a1b6 Mon Sep 17 00:00:00 2001 From: j-ittner Date: Tue, 11 Jul 2023 00:08:59 +0200 Subject: [PATCH 18/22] TEST: only re-fit the model where needed --- test/test/facet/test_inspection.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test/test/facet/test_inspection.py b/test/test/facet/test_inspection.py index 5c2985f46..f59a8eb6e 100644 --- a/test/test/facet/test_inspection.py +++ b/test/test/facet/test_inspection.py @@ -126,7 +126,7 @@ def test_model_inspection( assert ( sample.features.notna().all().all() ), "observations must not contain missing values" - model = regressor.fit(X=sample.features, y=sample.target) + model = regressor regressor_feature_names = set(sample.feature_names) else: @@ -146,10 +146,7 @@ def test_model_inspection( # noinspection PyTypeChecker inspector = NativeLearnerInspector( - model=( - # fit the model on the sample - model.fit(X=sample.features, y=sample.target) - ), + model=model, explainer_factory=explainer_factory, n_jobs=n_jobs, ).fit(sample) From 03fc727913252f52f4554876bcf43999c81a52a3 Mon Sep 17 00:00:00 2001 From: j-ittner Date: Tue, 11 Jul 2023 10:04:27 +0200 Subject: [PATCH 19/22] BUILD: require zipp<3.16 for min conda test with Python 3.7 --- condabuild/meta.yaml | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/condabuild/meta.yaml b/condabuild/meta.yaml index 89b991af3..eb09a179e 100644 --- a/condabuild/meta.yaml +++ b/condabuild/meta.yaml @@ -47,6 +47,8 @@ test: # additional requirements of shap - ipython {{ environ.get('FACET_V_IPYTHON', '[False]') }} - numba {{ environ.get('FACET_V_NUMBA', '[False]') }} + # additional requirements for testing + - zipp {{ environ.get('FACET_V_ZIPP', '[False]') }} commands: - conda list - python -c 'import facet; diff --git a/pyproject.toml b/pyproject.toml index 90b9a64fb..2837b66a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,7 +94,7 @@ typing_inspect = "~=0.4.0" # additional minimum requirements of shap ipython = "==7.0" numba = "~=0.55.2" # required to support numpy 1.21 -# additional minimum requirements +# additional requirements for testing zipp = "<3.16" # required to support python 3.7 [build.matrix.max] From 4ee81f600ae19bc5cddc308a1f5093bcd85b8027 Mon Sep 17 00:00:00 2001 From: j-ittner Date: Tue, 11 Jul 2023 14:57:25 +0200 Subject: [PATCH 20/22] FIX: check properties of superclasses in is_fitted() --- src/facet/inspection/_learner_inspector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/facet/inspection/_learner_inspector.py b/src/facet/inspection/_learner_inspector.py index a3f0e6818..65bf9b4aa 100644 --- a/src/facet/inspection/_learner_inspector.py +++ b/src/facet/inspection/_learner_inspector.py @@ -440,7 +440,10 @@ def is_fitted(estimator: BaseEstimator) -> bool: # get all properties of the estimator (instances of class ``property``) fitted_properties = { name - for name, value in vars(type(estimator)).items() + for cls in reversed(type(estimator).mro()) + # traverse the class hierarchy in reverse order, so that we add the + # properties of the most specific class last + for name, value in vars(cls).items() if ( # we're only interested in properties that scikit-learn # sets when fitting a learner From 118a8e301580b457297ebfdc07e394e75341364c Mon Sep 17 00:00:00 2001 From: j-ittner Date: Tue, 11 Jul 2023 15:42:36 +0200 Subject: [PATCH 21/22] DOC: tweak docstrings for learner inspectors --- src/facet/inspection/_learner_inspector.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/facet/inspection/_learner_inspector.py b/src/facet/inspection/_learner_inspector.py index 65bf9b4aa..24adf589a 100644 --- a/src/facet/inspection/_learner_inspector.py +++ b/src/facet/inspection/_learner_inspector.py @@ -61,10 +61,6 @@ ), replacement="\n\n", ) -@subsdoc( - pattern="Explain a model based on SHAP", - replacement="Explain a regressor or classifier based on SHAP", -) @inheritdoc(match="""[see superclass]""") class _BaseLearnerInspector( ModelInspector[T_SupervisedLearner], Generic[T_SupervisedLearner], metaclass=ABCMeta @@ -236,6 +232,10 @@ def _get_learner(model: T_SupervisedLearner) -> NativeSupervisedLearner: pass +@subsdoc( + pattern=r"Explain a model", + replacement=r"Explain an :mod:`sklearndf` regressor or classifier", +) @inheritdoc(match="""[see superclass]""") class LearnerInspector( _BaseLearnerInspector[T_SupervisedLearnerDF], Generic[T_SupervisedLearnerDF] @@ -320,6 +320,10 @@ def _get_learner(model: T_SupervisedLearnerDF) -> SupervisedLearnerDF: ) +@subsdoc( + pattern=r"Explain a model", + replacement=r"Explain a native scikit-learn regressor or classifier", +) @inheritdoc(match="""[see superclass]""") class NativeLearnerInspector( _BaseLearnerInspector[T_SupervisedLearner], Generic[T_SupervisedLearner] From e9efa29b2661e57d8468882980dddd333df317b3 Mon Sep 17 00:00:00 2001 From: j-ittner Date: Tue, 11 Jul 2023 15:56:46 +0200 Subject: [PATCH 22/22] DOC: tweak release notes --- RELEASE_NOTES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 477230c01..8bdd58081 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -12,7 +12,7 @@ FACET 2.1 FACET 2.1 introduces the :class:`.NativeLearnerInspector` for inspecting native *scikit-learn* models and pipelines. -We still recommend using *sklearndf* models and learner pipelines and FACET's +We still recommend using :mod:`sklearndf` models and learner pipelines along with FACET's :class:`.LearnerSelector` for hyperparameter tuning; however the new :class:`.NativeLearnerInspector` can be useful for inspecting models that have been trained using *scikit-learn* directly.