Skip to content

Commit

Permalink
chore: add compatibility to pandas>=2,numpy>=2
Browse files Browse the repository at this point in the history
- align requirements.txt with pyproject.toml
- remove calls to np.string_ not existing in numpy >= 2.0.0
- remove calls to pd._testing.makeMixedDataFrame not existing in new pandas versions
- fix install and test commands in documentation for developers
- replace np.mean with column-wise version
- drop pandas dependency constraint <2
- require Python 3.9 in pyproject.toml
- add PySpark 3.5.3 to test pipeline matrix
- update test pipeline matrix: exclude Python 3.8, include Python 3.12
- add test notebook output to .gitignore
- switch to importlib from pkg_resources
- install project dependencies after pyspark in spark build tests
  • Loading branch information
mkopec87 committed Dec 16, 2024
1 parent ac79d21 commit cfc85d1
Show file tree
Hide file tree
Showing 14 changed files with 102 additions and 47 deletions.
20 changes: 11 additions & 9 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
python: ['3.8', '3.9', '3.10', '3.11']
python: ['3.9', '3.10', '3.11', '3.12']
runs-on: ${{ matrix.os }}

steps:
Expand Down Expand Up @@ -40,15 +40,15 @@ jobs:
strategy:
matrix:
include:
# - SPARK_VERSION: "2.4.8"
# HADOOP_VERSION: "2.7"
# JAVA_VERSION: "8"
# python: "3.7"
# os: ubuntu-latest
- SPARK_VERSION: "3.3.2"
HADOOP_VERSION: "3"
JAVA_VERSION: "11"
python: "3.8"
python: "3.9"
os: ubuntu-latest
- SPARK_VERSION: "3.5.3"
HADOOP_VERSION: "3"
JAVA_VERSION: "11"
python: "3.9"
os: ubuntu-latest
runs-on: ${{ matrix.os }}
name: ${{ matrix.os }}, Spark ${{ matrix.SPARK_VERSION}}, Python ${{ matrix.python }}
Expand All @@ -67,10 +67,9 @@ jobs:
/home/runner/work/spark.tgz
~/.cache/pip
key: ${{ runner.os }}-spark-${{ matrix.SPARK_VERSION }}-hadoop${{ matrix.HADOOP_VERSION }}-java${{ matrix.JAVA_VERSION }}-${{ hashFiles('**/pyproject.toml') }}
- name: Install dependencies
- name: Install pip and setuptools
run: |
python -m pip install --upgrade pip setuptools
pip install -e .[test]
- name: Download spark
if: steps.cache-spark.outputs.cache-hit != 'true'
env:
Expand All @@ -93,6 +92,9 @@ jobs:
# https://github.com/python-poetry/poetry/issues/6792
pip3 install "pypandoc<1.8"
pip install "pyspark==${SPARK_VERSION}"
- name: Install project dependencies
run: |
pip install -e .[test]
- name: Test with pytest (spark-specific)
env:
BUILD_DIR: "/home/runner/work/" #${{ github.workspace }}
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,5 @@ docs/build
# Developer's playground
/playground/
.ruff_cache/

notebooks/report.html
4 changes: 2 additions & 2 deletions docs/source/developing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ For this you'll need to install our test requirements:
.. code-block:: bash
cd popmon/
pip install -r requirements-test.txt
python setup.py test
pip install -e .[test]
pytest
That's it!

Expand Down
4 changes: 1 addition & 3 deletions popmon/analysis/profiling/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,7 @@ def replace(bl):
if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0:
return np.nan
if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]):
if not np.all(
[isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels]
):
if not np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels]):
return np.nan
# all strings from hereon
n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum()
Expand Down
7 changes: 6 additions & 1 deletion popmon/analysis/profiling/pull_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,11 @@ def transform(self, datastore):
class ReferencePullCalculator(PullCalculator):
"""Pull calculation based on reference mean and standard deviations"""

@staticmethod
def mean(x):
    """Return the column-wise mean of ``x``, i.e. the mean taken along axis 0."""
    column_means = np.mean(x, axis=0)
    return column_means

def __init__(
self,
reference_key,
Expand All @@ -233,7 +238,7 @@ def __init__(
:param kwargs: (dict, optional): residual kwargs passed on to mean and std functions
"""
super().__init__(
np.mean,
ReferencePullCalculator.mean,
np.std,
reference_key,
assign_to_key,
Expand Down
18 changes: 18 additions & 0 deletions popmon/notebooks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright (c) 2023 ING Analytics Wholesale Banking
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 changes: 10 additions & 17 deletions popmon/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,37 +20,30 @@

# Resources lookup file for popmon
import atexit
import json
import pathlib
from contextlib import ExitStack
from importlib import resources

from jinja2 import Environment, FileSystemLoader
from pkg_resources import resource_filename

import popmon
from popmon import notebooks, test_data, visualization

# data files that are shipped with popmon.
_DATA = {
_.name: _
for _ in pathlib.Path(resource_filename(popmon.__name__, "test_data")).glob("*")
}
_DATA = {_.name: _ for _ in resources.files(test_data).iterdir()}

# Tutorial notebooks
_NOTEBOOK = {
_.name: _
for _ in pathlib.Path(resource_filename(popmon.__name__, "notebooks")).glob(
"*.ipynb"
)
p.name: p for p in resources.files(notebooks).iterdir() if p.suffix == ".ipynb"
}

# Resource types
_RESOURCES = {"data": _DATA, "notebook": _NOTEBOOK}

# Environment for visualization templates' directory
_TEMPLATES_ENV = Environment(
loader=FileSystemLoader(
resource_filename(popmon.__name__, "visualization/templates")
),
autoescape=True,
)
# ``resources.as_file`` may extract the templates to a temporary location
# (e.g. when the package is imported from a zip archive) and removes that
# location again as soon as its context exits. The loader looks templates up
# on disk when they are requested, which happens after this module has been
# imported, so the context is kept open on an ExitStack that is only closed
# at interpreter shutdown.
_TEMPLATES_STACK = ExitStack()
atexit.register(_TEMPLATES_STACK.close)
ref = resources.files(visualization) / "templates"
templates_dir_path = _TEMPLATES_STACK.enter_context(resources.as_file(ref))
_TEMPLATES_ENV = Environment(
    loader=FileSystemLoader(templates_dir_path),
    autoescape=True,
)
_TEMPLATES_ENV.filters["fmt_metric"] = lambda x: x.replace("_", " ")


Expand Down
18 changes: 18 additions & 0 deletions popmon/test_data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright (c) 2023 ING Analytics Wholesale Banking
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ keywords = [
"ipython"
]
readme = "README.rst"
requires-python = ">=3.7"
requires-python = ">=3.9"
authors = [{name = "ING Analytics Wholesale Banking", email = "wbaa@ing.com"}]
license = {type = "MIT", file = "LICENSE"}
dependencies = [
"numpy>=1.18.0",
"pandas>=0.25.1,<2",
"pandas>=0.25.1",
"scipy>=1.5.2",
"histogrammar>=1.0.32",
"phik",
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ tqdm
plotly>=5.8.0
joblib>=0.14.0
htmlmin
pydantic
typing_extensions
pydantic>=2
pydantic-settings
typing_extensions
15 changes: 10 additions & 5 deletions tests/popmon/analysis/profiling/test_apply_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
from popmon.base import Pipeline


def mean(x):
    """Column-wise mean: average ``x`` along axis 0."""
    result = np.mean(x, axis=0)
    return result


def get_test_data():
df = pd.DataFrame()
df["a"] = np.arange(100)
Expand All @@ -25,7 +30,7 @@ def test_pull():

module1 = ApplyFunc(apply_to_key="to_profile")
module1.add_apply_func(np.std, suffix="_std", entire=True)
module1.add_apply_func(np.mean, suffix="_mean", entire=True)
module1.add_apply_func(mean, suffix="_mean", entire=True)

module2 = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"])
module2.add_apply_func(
Expand Down Expand Up @@ -57,7 +62,7 @@ def func(x):
)

module.add_apply_func(np.std, entire=True)
module.add_apply_func(np.mean, entire=True)
module.add_apply_func(mean, entire=True)
module.add_apply_func(func)

datastore = module.transform(datastore)
Expand All @@ -77,7 +82,7 @@ def test_variance_comparer():
apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
)
module1.add_apply_func(np.std, suffix="_std", entire=True)
module1.add_apply_func(np.mean, suffix="_mean", entire=True)
module1.add_apply_func(mean, suffix="_mean", entire=True)

module2 = ApplyFunc(
apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
Expand Down Expand Up @@ -171,7 +176,7 @@ def test_apply_func():

apply_funcs = [
{"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
]

d = apply_func(
Expand All @@ -195,7 +200,7 @@ def test_apply_func_array():

apply_funcs = [
{"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
]

f, p = apply_func_array(
Expand Down
10 changes: 5 additions & 5 deletions tests/popmon/analysis/test_hist_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
import pandas as pd
import pytest
from conftest import make_mixed_dataframe

from popmon.analysis.hist_numpy import (
assert_similar_hists,
Expand Down Expand Up @@ -30,7 +31,7 @@ def get_test_histograms1():
"""Get set 1 of test histograms"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)
df["boolT"] = True
df["boolF"] = False
Expand All @@ -55,8 +56,7 @@ def get_test_histograms1():
def get_test_histograms2():
"""Get set 2 of test histograms"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()

# building 1d-, 2d-histogram (iteratively)
hist1 = hg.Categorize(unit("C"))
Expand Down Expand Up @@ -351,7 +351,7 @@ def test_check_similar_hists():
"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)

# building 1d-, 2d-, and 3d-histogram (iteratively)
Expand Down Expand Up @@ -391,7 +391,7 @@ def test_assert_similar_hists():
"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)

# building 1d-, 2d-, and 3d-histogram (iteratively)
Expand Down
12 changes: 12 additions & 0 deletions tests/popmon/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas.core.indexes.datetimes import bdate_range

from popmon import resources

Expand Down Expand Up @@ -88,3 +89,14 @@ def pytest_configure():
df = pd.read_csv(resources.data(CSV_FILE))
df["date"] = pd.to_datetime(df["date"])
pytest.test_df = df


def make_mixed_dataframe() -> pd.DataFrame:
    """Build a small five-row DataFrame with mixed column types for tests.

    Stands in for ``pd._testing.makeMixedDataFrame``, which does not exist in
    newer pandas versions.

    :return: DataFrame with float columns ``A`` and ``B``, string column ``C``
        and datetime column ``D`` holding five consecutive business days
        starting at 2009-01-01.
    """
    return pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            # use the public ``pd.bdate_range`` instead of importing it from
            # the private ``pandas.core.indexes.datetimes`` module
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )
3 changes: 2 additions & 1 deletion tests/popmon/hist/test_histogram.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import histogrammar as hg
import numpy as np
import pandas as pd
from conftest import make_mixed_dataframe

from popmon.hist.hist_utils import (
is_numeric,
Expand All @@ -15,7 +16,7 @@


def get_test_data():
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(lambda x: pd.to_datetime(x).value)
return df

Expand Down

0 comments on commit cfc85d1

Please sign in to comment.