From 008e010b17a13b7b0578761c4a24272e26075108 Mon Sep 17 00:00:00 2001
From: Michael Van de Steene
Date: Thu, 22 Feb 2024 13:06:04 +0100
Subject: [PATCH 1/2] Change default for incomplete chunks to `keep`

---
 nannyml/chunk.py  | 8 ++++----
 nannyml/config.py | 6 ------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/nannyml/chunk.py b/nannyml/chunk.py
index f986448fb..4fe6ced07 100644
--- a/nannyml/chunk.py
+++ b/nannyml/chunk.py
@@ -331,14 +331,14 @@ class SizeBasedChunker(Chunker):
 
     """
 
-    def __init__(self, chunk_size: int, incomplete: str = 'append', timestamp_column_name: Optional[str] = None):
+    def __init__(self, chunk_size: int, incomplete: str = 'keep', timestamp_column_name: Optional[str] = None):
         """Create a new SizeBasedChunker.
 
         Parameters
         ----------
         chunk_size: int
             The preferred size of the resulting Chunks, i.e. the number of observations in each Chunk.
-        incomplete: str, default='append'
+        incomplete: str, default='keep'
             Choose how to handle any leftover observations that don't make up a full Chunk.
             The following options are available:
 
@@ -429,7 +429,7 @@ class CountBasedChunker(Chunker):
 
     """
 
-    def __init__(self, chunk_number: int, incomplete: str = 'append', timestamp_column_name: Optional[str] = None):
+    def __init__(self, chunk_number: int, incomplete: str = 'keep', timestamp_column_name: Optional[str] = None):
         """Creates a new CountBasedChunker.
 
         It will calculate the amount of observations per chunk based on the given chunk count.
@@ -450,7 +450,7 @@ def __init__(self, chunk_number: int, incomplete: str = 'append', timestamp_colu
 
             - ``'append'``: append leftover observations to the last complete Chunk (overfilling it)
 
-        Defaults to ``'append'``.
+        Defaults to ``'keep'``.
 
         Returns
         -------
diff --git a/nannyml/config.py b/nannyml/config.py
index adc43b456..2e8e4743c 100644
--- a/nannyml/config.py
+++ b/nannyml/config.py
@@ -40,12 +40,6 @@ class WriterConfig(BaseModel):
     write_args: Optional[Dict[str, Any]]
 
 
-class ChunkerConfig(BaseModel):
-    chunk_size: Optional[int]
-    chunk_period: Optional[str]
-    chunk_count: Optional[int]
-
-
 class IntervalSchedulingConfig(BaseModel):
     weeks: Optional[int]
     days: Optional[int]

From c21824c9852f7513d3464c24476a4ab5444fbde5 Mon Sep 17 00:00:00 2001
From: Niels Nuyttens
Date: Sun, 25 Feb 2024 01:44:52 +0100
Subject: [PATCH 2/2] Fix failing tests due to 'keep' default for Size based chunker

---
 nannyml/sampling_error/summary_stats.py | 14 +++----
 tests/drift/test_drift.py               |  4 +-
 tests/drift/test_multiv_pca.py          |  4 +-
 .../CBPE/test_cbpe_metrics.py           | 34 +++++++++++------
 tests/stats/test_std.py                 | 37 ++++++++-----------
 tests/test_chunk.py                     |  6 +--
 6 files changed, 52 insertions(+), 47 deletions(-)

diff --git a/nannyml/sampling_error/summary_stats.py b/nannyml/sampling_error/summary_stats.py
index 19558fca3..e8e71e74e 100644
--- a/nannyml/sampling_error/summary_stats.py
+++ b/nannyml/sampling_error/summary_stats.py
@@ -2,15 +2,16 @@
 #
 #  License: Apache Software License 2.0
 
+from logging import getLogger
 from typing import Tuple
 
 import numpy as np
 import pandas as pd
 from scipy.stats import gaussian_kde, moment
-from logging import getLogger
 
 logger = getLogger(__name__)
 
+
 def summary_stats_std_sampling_error_components(col: pd.Series) -> Tuple:
     """
     Calculate sampling error components for Summary Stats Standard Deviation
@@ -54,12 +55,11 @@ def summary_stats_std_sampling_error(sampling_error_components, col) -> float:
     _mu4 = sampling_error_components[1]
     _size = col.shape[0]
 
-    err_var_parenthesis_part = (_mu4 - ((_size - 3) * (_std**4) / (_size - 1)))
-    if not (
-        np.isfinite(err_var_parenthesis_part) and
-        err_var_parenthesis_part >= 0
-    ):
-        logger.debug("Summary Stats sampling error calculation imputed to nan because of non finite positive parenthesis factor.")
+    err_var_parenthesis_part = _mu4 - ((_size - 3) * (_std**4) / (_size - 1))
+    if not (np.isfinite(err_var_parenthesis_part) and err_var_parenthesis_part >= 0):
+        logger.debug(
+            "Summary Stats sampling error calculation imputed to nan because of non finite positive parenthesis factor."
+        )
         return np.nan
     err_var = np.sqrt((1 / _size) * err_var_parenthesis_part)
     return (1 / (2 * _std)) * err_var
diff --git a/tests/drift/test_drift.py b/tests/drift/test_drift.py
index 845f08fa4..64bca5ad6 100644
--- a/tests/drift/test_drift.py
+++ b/tests/drift/test_drift.py
@@ -453,11 +453,11 @@ def test_statistical_drift_calculator_deals_with_missing_class_labels(sample_dri
     [
         (
             {'chunk_size': 5000},
-            [0.004968, 0.004833, 0.01186, 0.242068],
+            [0.004968, 0.004833, 0.01186, 0.243595, 0.210516],
         ),
         (
             {'chunk_size': 5000, 'timestamp_column_name': 'timestamp'},
-            [0.004968, 0.004833, 0.01186, 0.242068],
+            [0.004968, 0.004833, 0.01186, 0.243595, 0.210516],
         ),
         (
             {'chunk_number': 5},
diff --git a/tests/drift/test_multiv_pca.py b/tests/drift/test_multiv_pca.py
index f2ccfe07e..63e0cf35a 100644
--- a/tests/drift/test_multiv_pca.py
+++ b/tests/drift/test_multiv_pca.py
@@ -292,11 +292,11 @@ def test_data_reconstruction_drift_calculator_numeric_results(sample_drift_data)
     [
         (
             {'chunk_size': 5000},
-            [0.7998744001719177, 0.8020996183121666, 0.8043000024523013, 0.73631],
+            [0.79987, 0.80210, 0.80430, 0.73552, 0.76087],
         ),
         (
             {'chunk_size': 5000, 'timestamp_column_name': 'timestamp'},
-            [0.7998744001719177, 0.8020996183121666, 0.8043000024523013, 0.73631],
+            [0.79987, 0.80210, 0.80430, 0.73552, 0.76087],
         ),
         (
             {'chunk_number': 5},
diff --git a/tests/performance_estimation/CBPE/test_cbpe_metrics.py b/tests/performance_estimation/CBPE/test_cbpe_metrics.py
index d2ea3d82a..d0456e27e 100644
--- a/tests/performance_estimation/CBPE/test_cbpe_metrics.py
+++ b/tests/performance_estimation/CBPE/test_cbpe_metrics.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import pytest
 
-from nannyml.chunk import DefaultChunker
+from nannyml.chunk import DefaultChunker, SizeBasedChunker
 from nannyml.datasets import (
     load_synthetic_binary_classification_dataset,
     load_synthetic_multiclass_classification_dataset,
@@ -24,7 +24,7 @@
     [
         (
             {
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
                 'normalize_confusion_matrix': None,
                 'business_value_matrix': [[2, -5], [-10, 10]],
                 'normalize_business_value': None,
@@ -48,7 +48,7 @@
         ),
         (
             {
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
                 'normalize_confusion_matrix': None,
                 'business_value_matrix': [[2, -5], [-10, 10]],
                 'normalize_business_value': 'per_prediction',
@@ -71,7 +71,11 @@
         ),
         (
-            {'chunk_size': 20000, 'normalize_confusion_matrix': 'all', 'business_value_matrix': [[-1, 4], [8, -8]]},
+            {
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
+                'normalize_confusion_matrix': 'all',
+                'business_value_matrix': [[-1, 4], [8, -8]],
+            },
             pd.DataFrame(
                 {
                     'key': ['[0:19999]', '[20000:49999]'],
@@ -90,7 +94,11 @@
         ),
         (
-            {'chunk_size': 20000, 'normalize_confusion_matrix': 'true', 'business_value_matrix': [[-1, 4], [8, -8]]},
+            {
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
+                'normalize_confusion_matrix': 'true',
+                'business_value_matrix': [[-1, 4], [8, -8]],
+            },
             pd.DataFrame(
                 {
                     'key': ['[0:19999]', '[20000:49999]'],
@@ -109,7 +117,11 @@
         ),
         (
-            {'chunk_size': 20000, 'normalize_confusion_matrix': 'pred', 'business_value_matrix': [[-1, 4], [8, -8]]},
+            {
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
+                'normalize_confusion_matrix': 'pred',
+                'business_value_matrix': [[-1, 4], [8, -8]],
+            },
             pd.DataFrame(
                 {
                     'key': ['[0:19999]', '[20000:49999]'],
@@ -129,7 +141,7 @@
         ),
         (
             {
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
                 'normalize_confusion_matrix': None,
                 'timestamp_column_name': 'timestamp',
                 'business_value_matrix': [[-1, 4], [8, -8]],
@@ -153,7 +165,7 @@
         ),
         (
             {
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
                 'normalize_confusion_matrix': 'all',
                 'timestamp_column_name': 'timestamp',
                 'business_value_matrix': [[-1, 4], [8, -8]],
@@ -177,7 +189,7 @@
         ),
         (
             {
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
                 'normalize_confusion_matrix': 'all',
                 'timestamp_column_name': 'timestamp',
                 'business_value_matrix': [[2, -5], [-10, 10]],
                 'normalize_business_value': 'per_prediction',
@@ -202,7 +214,7 @@
         ),
         (
             {
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
                 'normalize_confusion_matrix': 'true',
                 'timestamp_column_name': 'timestamp',
                 'business_value_matrix': [[-1, 4], [8, -8]],
@@ -226,7 +238,7 @@
         ),
         (
             {
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
                 'normalize_confusion_matrix': 'pred',
                 'timestamp_column_name': 'timestamp',
                 'business_value_matrix': [[-1, 4], [8, -8]],
diff --git a/tests/stats/test_std.py b/tests/stats/test_std.py
index d6481b32d..525868b00 100644
--- a/tests/stats/test_std.py
+++ b/tests/stats/test_std.py
@@ -5,14 +5,13 @@
 
 """Tests for Drift package."""
 
-import pytest
-import pandas as pd
 import numpy as np
+import pandas as pd
+import pytest
 
-
+from nannyml.chunk import SizeBasedChunker
 from nannyml.datasets import load_synthetic_car_loan_dataset
 from nannyml.stats import SummaryStatsStdCalculator
-from nannyml.chunk import SizeBasedChunker
 
 # @pytest.fixture(scope="module")
 # def status_sum_result() -> Result:
@@ -43,30 +42,24 @@ def test_stats_std_calculator_with_default_params_chunk_size_one():  # noqa: D10
     reference, analysis, _ = load_synthetic_car_loan_dataset()
 
     chunker = SizeBasedChunker(chunk_size=5_000, incomplete='keep')
-    calc = SummaryStatsStdCalculator(
-        column_names=['car_value'],
-        chunker=chunker
-    ).fit(reference)
+    calc = SummaryStatsStdCalculator(column_names=['car_value'], chunker=chunker).fit(reference)
     result = calc.calculate(data=analysis.head(5_001))
     expected = pd.DataFrame(
         {
             ('chunk', 'key'): ['[0:4999]', '[5000:5000]'],
-            ('chunk', 'chunk_index'): [0,1],
-            ('chunk', 'start_index'): [0,5000],
-            ('chunk', 'end_index'): [4999,5000],
-            ('chunk', 'start_date'): [None,None],
-            ('chunk', 'end_date'): [None,None],
-            ('chunk', 'period'): ['analysis','analysis'],
-            ('car_value', 'value'): [20614.8926,np.nan],
-            ('car_value', 'sampling_error'): [271.9917,np.nan],
-            ('car_value', 'upper_confidence_boundary'): [21430.8679,np.nan],
-            ('car_value', 'lower_confidence_boundary'): [19798.9174,np.nan],
+            ('chunk', 'chunk_index'): [0, 1],
+            ('chunk', 'start_index'): [0, 5000],
+            ('chunk', 'end_index'): [4999, 5000],
+            ('chunk', 'start_date'): [None, None],
+            ('chunk', 'end_date'): [None, None],
+            ('chunk', 'period'): ['analysis', 'analysis'],
+            ('car_value', 'value'): [20614.8926, np.nan],
+            ('car_value', 'sampling_error'): [271.9917, np.nan],
+            ('car_value', 'upper_confidence_boundary'): [21430.8679, np.nan],
+            ('car_value', 'lower_confidence_boundary'): [19798.9174, np.nan],
             ('car_value', 'upper_threshold'): [20978.5658, 20978.5658],
             ('car_value', 'lower_threshold'): [19816.9091, 19816.9091],
             ('car_value', 'alert'): [False, True],
         }
     )
-    pd.testing.assert_frame_equal(
-        expected,
-        result.filter(period='analysis').to_df().round(4)
-    )
+    pd.testing.assert_frame_equal(expected, result.filter(period='analysis').to_df().round(4))
diff --git a/tests/test_chunk.py b/tests/test_chunk.py
index 54bf8714b..0c741372b 100644
--- a/tests/test_chunk.py
+++ b/tests/test_chunk.py
@@ -241,12 +241,12 @@ def test_size_based_chunker_returns_chunks_of_required_size(sample_chunk_data):
     chunker = SizeBasedChunker(chunk_size=chunk_size)
     sut = chunker.split(sample_chunk_data)
     assert len(sut[0]) == chunk_size
-    assert len(sut) == math.ceil(sample_chunk_data.shape[0] / chunk_size) - 1
+    assert len(sut) == math.ceil(sample_chunk_data.shape[0] / chunk_size)
 
 
 def test_size_based_chunker_returns_last_chunk_that_is_partially_filled(sample_chunk_data):  # noqa: D103
     chunk_size = 3333
-    expected_last_chunk_size = chunk_size + sample_chunk_data.shape[0] % chunk_size
+    expected_last_chunk_size = sample_chunk_data.shape[0] % chunk_size
     chunker = SizeBasedChunker(chunk_size)
     sut = chunker.split(sample_chunk_data)
     assert len(sut[-1]) == expected_last_chunk_size
@@ -304,7 +304,7 @@ def test_size_based_chunker_uses_observations_to_set_chunk_date_boundaries(sampl
 
 def test_size_based_chunker_assigns_observation_range_to_chunk_keys(sample_chunk_data):  # noqa: D103
     chunk_size = 1500
-    last_chunk_start = ((sample_chunk_data.shape[0] // chunk_size) - 1) * chunk_size
+    last_chunk_start = (sample_chunk_data.shape[0] // chunk_size) * chunk_size
     last_chunk_end = sample_chunk_data.shape[0] - 1
     chunker = SizeBasedChunker(chunk_size=chunk_size)
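
A minimal sketch of the default-behaviour change this series introduces. The toy DataFrame and chunk sizes below are illustrative only, not taken from the test suite; the size arithmetic mirrors the assertions in the updated tests/test_chunk.py:

    import math

    import pandas as pd

    from nannyml.chunk import SizeBasedChunker

    # 10_000 rows split into chunks of 3_000: three full chunks plus 1_000 leftover rows.
    data = pd.DataFrame({'feature': range(10_000)})

    # New default, incomplete='keep': the leftover rows form a final, partially filled chunk.
    chunks = SizeBasedChunker(chunk_size=3_000).split(data)
    assert len(chunks) == math.ceil(len(data) / 3_000)  # 4 chunks
    assert len(chunks[-1]) == len(data) % 3_000  # last chunk holds 1_000 rows

    # Previous default, incomplete='append': the leftover rows overfill the last complete chunk.
    chunks = SizeBasedChunker(chunk_size=3_000, incomplete='append').split(data)
    assert len(chunks) == len(data) // 3_000  # 3 chunks
    assert len(chunks[-1]) == 3_000 + len(data) % 3_000  # last chunk holds 4_000 rows

Passing incomplete='append' explicitly, as the updated CBPE tests do, preserves the old behaviour for callers that depend on it.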