From 66f2716160de065065fec2948ba7bcb89d1baf6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Kope=C4=87?= <3338226+mkopec87@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:26:23 +0100 Subject: [PATCH] - align requirements.txt with pyproject.toml - remove calls to np.string_ not existing in numpy >= 2.0.0 - remove calls to pd._testing.makeMixedDataFrame not existing in new pandas versions - fix install and test commands in documentation for developers - replace np.mean with column-wise version - drop pandas dependency constraint <2 --- docs/source/developing.rst | 4 ++-- popmon/analysis/profiling/profiles.py | 4 +--- popmon/analysis/profiling/pull_calculator.py | 7 ++++++- pyproject.toml | 2 +- requirements.txt | 5 +++-- .../popmon/analysis/profiling/test_apply_func.py | 16 +++++++++++----- tests/popmon/analysis/test_hist_numpy.py | 10 +++++----- tests/popmon/conftest.py | 12 ++++++++++++ tests/popmon/hist/test_histogram.py | 3 ++- 9 files changed, 43 insertions(+), 20 deletions(-) diff --git a/docs/source/developing.rst b/docs/source/developing.rst index 15dc7fc2..99d8eb2e 100644 --- a/docs/source/developing.rst +++ b/docs/source/developing.rst @@ -27,8 +27,8 @@ For this you'll need to install our test requirements: .. code-block:: bash cd popmon/ - pip install -r requirements-test.txt - python setup.py test + pip install -e .[test] + pytest That's it! 
diff --git a/popmon/analysis/profiling/profiles.py b/popmon/analysis/profiling/profiles.py index 4dbe2aad..747abd3c 100644 --- a/popmon/analysis/profiling/profiles.py +++ b/popmon/analysis/profiling/profiles.py @@ -186,9 +186,7 @@ def replace(bl): if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0: return np.nan if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]): - if not np.all( - [isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels] - ): + if not np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels]): return np.nan # all strings from hereon n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum() diff --git a/popmon/analysis/profiling/pull_calculator.py b/popmon/analysis/profiling/pull_calculator.py index b1d0e0a6..f6606162 100644 --- a/popmon/analysis/profiling/pull_calculator.py +++ b/popmon/analysis/profiling/pull_calculator.py @@ -208,6 +208,11 @@ def transform(self, datastore): class ReferencePullCalculator(PullCalculator): """Pull calculation based on reference mean and standard deviations""" + @staticmethod + def mean(x): + """Column-wise mean version.""" + return np.mean(x, axis=0) + def __init__( self, reference_key, @@ -233,7 +238,7 @@ def __init__( :param kwargs: (dict, optional): residual kwargs passed on to mean and std functions """ super().__init__( - np.mean, + ReferencePullCalculator.mean, np.std, reference_key, assign_to_key, diff --git a/pyproject.toml b/pyproject.toml index 70bf35d2..4730b9cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ authors = [{name = "ING Analytics Wholesale Banking", email = "wbaa@ing.com"}] license = {type = "MIT", file = "LICENSE"} dependencies = [ "numpy>=1.18.0", - "pandas>=0.25.1,<2", + "pandas>=0.25.1", "scipy>=1.5.2", "histogrammar>=1.0.32", "phik", diff --git a/requirements.txt b/requirements.txt index 08a9d220..a5d5b128 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,6 @@ tqdm 
plotly>=5.8.0 joblib>=0.14.0 htmlmin -pydantic -typing_extensions +pydantic>=2 +pydantic-settings +typing_extensions \ No newline at end of file diff --git a/tests/popmon/analysis/profiling/test_apply_func.py b/tests/popmon/analysis/profiling/test_apply_func.py index 556a0e7d..d4108aa5 100644 --- a/tests/popmon/analysis/profiling/test_apply_func.py +++ b/tests/popmon/analysis/profiling/test_apply_func.py @@ -1,3 +1,4 @@ + import numpy as np import pandas as pd import pytest @@ -13,6 +14,11 @@ from popmon.base import Pipeline +def mean(x): + """Column-wise mean version.""" + return np.mean(x, axis=0) + + def get_test_data(): df = pd.DataFrame() df["a"] = np.arange(100) @@ -25,7 +31,7 @@ def test_pull(): module1 = ApplyFunc(apply_to_key="to_profile") module1.add_apply_func(np.std, suffix="_std", entire=True) - module1.add_apply_func(np.mean, suffix="_mean", entire=True) + module1.add_apply_func(mean, suffix="_mean", entire=True) module2 = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"]) module2.add_apply_func( @@ -57,7 +63,7 @@ def func(x): ) module.add_apply_func(np.std, entire=True) - module.add_apply_func(np.mean, entire=True) + module.add_apply_func(mean, entire=True) module.add_apply_func(func) datastore = module.transform(datastore) @@ -77,7 +83,7 @@ def test_variance_comparer(): apply_to_key="to_profile", features=["the_feature", "dummy_feature"] ) module1.add_apply_func(np.std, suffix="_std", entire=True) - module1.add_apply_func(np.mean, suffix="_mean", entire=True) + module1.add_apply_func(mean, suffix="_mean", entire=True) module2 = ApplyFunc( apply_to_key="to_profile", features=["the_feature", "dummy_feature"] @@ -171,7 +177,7 @@ def test_apply_func(): apply_funcs = [ {"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True}, - {"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, + {"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, ] d = apply_func( @@ -195,7 
+201,7 @@ def test_apply_func_array(): apply_funcs = [ {"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True}, - {"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, + {"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, ] f, p = apply_func_array( diff --git a/tests/popmon/analysis/test_hist_numpy.py b/tests/popmon/analysis/test_hist_numpy.py index d33da9de..477b27c1 100644 --- a/tests/popmon/analysis/test_hist_numpy.py +++ b/tests/popmon/analysis/test_hist_numpy.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd import pytest +from conftest import make_mixed_dataframe from popmon.analysis.hist_numpy import ( assert_similar_hists, @@ -30,7 +31,7 @@ def get_test_histograms1(): """Get set 1 of test histograms""" # dummy dataset with mixed types # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(to_ns) df["boolT"] = True df["boolF"] = False @@ -55,8 +56,7 @@ def get_test_histograms1(): def get_test_histograms2(): """Get set 2 of test histograms""" # dummy dataset with mixed types - # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() # building 1d-, 2d-histogram (iteratively) hist1 = hg.Categorize(unit("C")) @@ -351,7 +351,7 @@ def test_check_similar_hists(): """ # dummy dataset with mixed types # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(to_ns) # building 1d-, 2d-, and 3d-histogram (iteratively) @@ -391,7 +391,7 @@ def test_assert_similar_hists(): """ # dummy dataset with mixed types # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(to_ns) # building 1d-, 2d-, and 3d-histogram (iteratively) diff --git 
a/tests/popmon/conftest.py b/tests/popmon/conftest.py index b6b50b8b..dba9cd90 100644 --- a/tests/popmon/conftest.py +++ b/tests/popmon/conftest.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd import pytest +from pandas.core.indexes.datetimes import bdate_range from popmon import resources @@ -88,3 +89,14 @@ def pytest_configure(): df = pd.read_csv(resources.data(CSV_FILE)) df["date"] = pd.to_datetime(df["date"]) pytest.test_df = df + + +def make_mixed_dataframe() -> pd.DataFrame: + return pd.DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } + ) diff --git a/tests/popmon/hist/test_histogram.py b/tests/popmon/hist/test_histogram.py index 18d83e17..721bff07 100644 --- a/tests/popmon/hist/test_histogram.py +++ b/tests/popmon/hist/test_histogram.py @@ -1,6 +1,7 @@ import histogrammar as hg import numpy as np import pandas as pd +from conftest import make_mixed_dataframe from popmon.hist.hist_utils import ( is_numeric, @@ -15,7 +16,7 @@ def get_test_data(): - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(lambda x: pd.to_datetime(x).value) return df