From 66f2716160de065065fec2948ba7bcb89d1baf6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Kope=C4=87?=
 <3338226+mkopec87@users.noreply.github.com>
Date: Fri, 6 Dec 2024 16:26:23 +0100
Subject: [PATCH] Update dependencies and fix numpy/pandas compatibility

- align requirements.txt with pyproject.toml
- remove calls to np.string_, which does not exist in numpy >= 2.0.0
- remove calls to pd._testing.makeMixedDataFrame, which does not exist in
  newer pandas versions
- fix install and test commands in the developer documentation
- replace np.mean with a column-wise version
- drop the pandas dependency constraint <2

---
 docs/source/developing.rst                       |  4 ++--
 popmon/analysis/profiling/profiles.py            |  4 +---
 popmon/analysis/profiling/pull_calculator.py     |  7 ++++++-
 pyproject.toml                                   |  2 +-
 requirements.txt                                 |  5 +++--
 .../popmon/analysis/profiling/test_apply_func.py | 16 +++++++++++-----
 tests/popmon/analysis/test_hist_numpy.py         | 10 +++++-----
 tests/popmon/conftest.py                         | 12 ++++++++++++
 tests/popmon/hist/test_histogram.py              |  3 ++-
 9 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/docs/source/developing.rst b/docs/source/developing.rst
index 15dc7fc2..99d8eb2e 100644
--- a/docs/source/developing.rst
+++ b/docs/source/developing.rst
@@ -27,8 +27,8 @@ For this you'll need to install our test requirements:
 .. code-block:: bash
 
   cd popmon/
-  pip install -r requirements-test.txt
-  python setup.py test
+  pip install -e ".[test]"
+  pytest
 
 That's it!
 
diff --git a/popmon/analysis/profiling/profiles.py b/popmon/analysis/profiling/profiles.py
index 4dbe2aad..747abd3c 100644
--- a/popmon/analysis/profiling/profiles.py
+++ b/popmon/analysis/profiling/profiles.py
@@ -186,9 +186,7 @@ def replace(bl):
     if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0:
         return np.nan
     if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]):
-        if not np.all(
-            [isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels]
-        ):
+        if not np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels]):
             return np.nan
         # all strings from hereon
         n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum()
diff --git a/popmon/analysis/profiling/pull_calculator.py b/popmon/analysis/profiling/pull_calculator.py
index b1d0e0a6..f6606162 100644
--- a/popmon/analysis/profiling/pull_calculator.py
+++ b/popmon/analysis/profiling/pull_calculator.py
@@ -208,6 +208,11 @@ def transform(self, datastore):
 class ReferencePullCalculator(PullCalculator):
     """Pull calculation based on reference mean and standard deviations"""
 
+    @staticmethod
+    def mean(x):
+        """Return the mean of ``x`` along axis 0 (column-wise for 2-D input)."""
+        return np.mean(x, axis=0)
+
     def __init__(
         self,
         reference_key,
@@ -233,7 +238,7 @@ def __init__(
         :param kwargs: (dict, optional): residual kwargs passed on to mean and std functions
         """
         super().__init__(
-            np.mean,
+            ReferencePullCalculator.mean,
             np.std,
             reference_key,
             assign_to_key,
diff --git a/pyproject.toml b/pyproject.toml
index 70bf35d2..4730b9cb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,7 @@ authors = [{name = "ING Analytics Wholesale Banking", email = "wbaa@ing.com"}]
 license = {type = "MIT", file = "LICENSE"}
 dependencies = [
     "numpy>=1.18.0",
-    "pandas>=0.25.1,<2",
+    "pandas>=0.25.1",
     "scipy>=1.5.2",
     "histogrammar>=1.0.32",
     "phik",
diff --git a/requirements.txt b/requirements.txt
index 08a9d220..a5d5b128 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,5 +8,6 @@ tqdm
 plotly>=5.8.0
 joblib>=0.14.0
 htmlmin
-pydantic
-typing_extensions
+pydantic>=2
+pydantic-settings
+typing_extensions
diff --git a/tests/popmon/analysis/profiling/test_apply_func.py b/tests/popmon/analysis/profiling/test_apply_func.py
index 556a0e7d..d4108aa5 100644
--- a/tests/popmon/analysis/profiling/test_apply_func.py
+++ b/tests/popmon/analysis/profiling/test_apply_func.py
@@ -1,3 +1,4 @@
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -13,6 +14,11 @@
 from popmon.base import Pipeline
 
 
+def mean(x):
+    """Return the mean of ``x`` along axis 0 (column-wise for 2-D input)."""
+    return np.mean(x, axis=0)
+
+
 def get_test_data():
     df = pd.DataFrame()
     df["a"] = np.arange(100)
@@ -25,7 +31,7 @@ def test_pull():
 
     module1 = ApplyFunc(apply_to_key="to_profile")
     module1.add_apply_func(np.std, suffix="_std", entire=True)
-    module1.add_apply_func(np.mean, suffix="_mean", entire=True)
+    module1.add_apply_func(mean, suffix="_mean", entire=True)
 
     module2 = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"])
     module2.add_apply_func(
@@ -57,7 +63,7 @@ def func(x):
     )
 
     module.add_apply_func(np.std, entire=True)
-    module.add_apply_func(np.mean, entire=True)
+    module.add_apply_func(mean, entire=True)
     module.add_apply_func(func)
 
     datastore = module.transform(datastore)
@@ -77,7 +83,7 @@ def test_variance_comparer():
         apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
     )
     module1.add_apply_func(np.std, suffix="_std", entire=True)
-    module1.add_apply_func(np.mean, suffix="_mean", entire=True)
+    module1.add_apply_func(mean, suffix="_mean", entire=True)
 
     module2 = ApplyFunc(
         apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
@@ -171,7 +177,7 @@ def test_apply_func():
 
     apply_funcs = [
         {"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
-        {"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
+        {"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
     ]
 
     d = apply_func(
@@ -195,7 +201,7 @@ def test_apply_func_array():
 
     apply_funcs = [
         {"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
-        {"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
+        {"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
     ]
 
     f, p = apply_func_array(
diff --git a/tests/popmon/analysis/test_hist_numpy.py b/tests/popmon/analysis/test_hist_numpy.py
index d33da9de..477b27c1 100644
--- a/tests/popmon/analysis/test_hist_numpy.py
+++ b/tests/popmon/analysis/test_hist_numpy.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from conftest import make_mixed_dataframe
 
 from popmon.analysis.hist_numpy import (
     assert_similar_hists,
@@ -30,7 +31,7 @@ def get_test_histograms1():
     """Get set 1 of test histograms"""
     # dummy dataset with mixed types
     # convert timestamp (col D) to nanosec since 1970-1-1
-    df = pd._testing.makeMixedDataFrame()
+    df = make_mixed_dataframe()
     df["date"] = df["D"].apply(to_ns)
     df["boolT"] = True
     df["boolF"] = False
@@ -55,8 +56,7 @@ def get_test_histograms1():
 def get_test_histograms2():
     """Get set 2 of test histograms"""
     # dummy dataset with mixed types
-    # convert timestamp (col D) to nanosec since 1970-1-1
-    df = pd._testing.makeMixedDataFrame()
+    df = make_mixed_dataframe()
 
     # building 1d-, 2d-histogram (iteratively)
     hist1 = hg.Categorize(unit("C"))
@@ -351,7 +351,7 @@ def test_check_similar_hists():
     """
     # dummy dataset with mixed types
     # convert timestamp (col D) to nanosec since 1970-1-1
-    df = pd._testing.makeMixedDataFrame()
+    df = make_mixed_dataframe()
     df["date"] = df["D"].apply(to_ns)
 
     # building 1d-, 2d-, and 3d-histogram (iteratively)
@@ -391,7 +391,7 @@ def test_assert_similar_hists():
     """
     # dummy dataset with mixed types
     # convert timestamp (col D) to nanosec since 1970-1-1
-    df = pd._testing.makeMixedDataFrame()
+    df = make_mixed_dataframe()
     df["date"] = df["D"].apply(to_ns)
 
     # building 1d-, 2d-, and 3d-histogram (iteratively)
diff --git a/tests/popmon/conftest.py b/tests/popmon/conftest.py
index b6b50b8b..dba9cd90 100644
--- a/tests/popmon/conftest.py
+++ b/tests/popmon/conftest.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from pandas import bdate_range
 
 from popmon import resources
 
@@ -88,3 +89,14 @@ def pytest_configure():
     df = pd.read_csv(resources.data(CSV_FILE))
     df["date"] = pd.to_datetime(df["date"])
     pytest.test_df = df
+
+
+def make_mixed_dataframe() -> pd.DataFrame:  # replaces pd._testing.makeMixedDataFrame
+    return pd.DataFrame(
+        {
+            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
+            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
+            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
+            "D": bdate_range("1/1/2009", periods=5),  # five consecutive business days
+        }
+    )
diff --git a/tests/popmon/hist/test_histogram.py b/tests/popmon/hist/test_histogram.py
index 18d83e17..721bff07 100644
--- a/tests/popmon/hist/test_histogram.py
+++ b/tests/popmon/hist/test_histogram.py
@@ -1,6 +1,7 @@
 import histogrammar as hg
 import numpy as np
 import pandas as pd
+from conftest import make_mixed_dataframe
 
 from popmon.hist.hist_utils import (
     is_numeric,
@@ -15,7 +16,7 @@
 
 
 def get_test_data():
-    df = pd._testing.makeMixedDataFrame()
+    df = make_mixed_dataframe()
     df["date"] = df["D"].apply(lambda x: pd.to_datetime(x).value)
     return df