Change default for incomplete chunks to keep #367

Merged: 2 commits, Feb 25, 2024
8 changes: 4 additions & 4 deletions nannyml/chunk.py
@@ -331,14 +331,14 @@ class SizeBasedChunker(Chunker):

"""

-    def __init__(self, chunk_size: int, incomplete: str = 'append', timestamp_column_name: Optional[str] = None):
+    def __init__(self, chunk_size: int, incomplete: str = 'keep', timestamp_column_name: Optional[str] = None):
"""Create a new SizeBasedChunker.

Parameters
----------
chunk_size: int
The preferred size of the resulting Chunks, i.e. the number of observations in each Chunk.
-        incomplete: str, default='append'
+        incomplete: str, default='keep'
Choose how to handle any leftover observations that don't make up a full Chunk.
The following options are available:

@@ -429,7 +429,7 @@ class CountBasedChunker(Chunker):

"""

-    def __init__(self, chunk_number: int, incomplete: str = 'append', timestamp_column_name: Optional[str] = None):
+    def __init__(self, chunk_number: int, incomplete: str = 'keep', timestamp_column_name: Optional[str] = None):
"""Creates a new CountBasedChunker.

It will calculate the amount of observations per chunk based on the given chunk count.
@@ -450,7 +450,7 @@ def __init__(self, chunk_number: int, incomplete: str = 'append', timestamp_colu

- ``'append'``: append leftover observations to the last complete Chunk (overfilling it)

-        Defaults to ``'append'``.
+        Defaults to ``'keep'``.

Returns
-------
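For illustration, a minimal sketch of what the default change does. The 12,500-row frame is hypothetical; `SizeBasedChunker` and its `split` method are used exactly as in the tests further down:

```python
import pandas as pd

from nannyml.chunk import SizeBasedChunker

df = pd.DataFrame({'x': range(12_500)})  # hypothetical: 2 full chunks plus 2,500 leftover rows

# Old default: leftover rows were appended to the last complete chunk, overfilling it.
append_chunks = SizeBasedChunker(chunk_size=5_000, incomplete='append').split(df)
assert [len(c) for c in append_chunks] == [5_000, 7_500]

# New default: leftover rows are kept as a trailing partial chunk.
keep_chunks = SizeBasedChunker(chunk_size=5_000).split(df)  # incomplete='keep'
assert [len(c) for c in keep_chunks] == [5_000, 5_000, 2_500]
```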
6 changes: 0 additions & 6 deletions nannyml/config.py
@@ -40,12 +40,6 @@ class WriterConfig(BaseModel):
write_args: Optional[Dict[str, Any]]


-class ChunkerConfig(BaseModel):
-    chunk_size: Optional[int]
-    chunk_period: Optional[str]
-    chunk_count: Optional[int]


class IntervalSchedulingConfig(BaseModel):
weeks: Optional[int]
days: Optional[int]
14 changes: 7 additions & 7 deletions nannyml/sampling_error/summary_stats.py
@@ -2,15 +2,16 @@
#
# License: Apache Software License 2.0

+from logging import getLogger
from typing import Tuple

import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde, moment
-from logging import getLogger

logger = getLogger(__name__)


def summary_stats_std_sampling_error_components(col: pd.Series) -> Tuple:
"""
Calculate sampling error components for Summary Stats Standard Deviation
@@ -54,12 +55,11 @@ def summary_stats_std_sampling_error(sampling_error_components, col) -> float:
    _mu4 = sampling_error_components[1]
    _size = col.shape[0]

-    err_var_parenthesis_part = (_mu4 - ((_size - 3) * (_std**4) / (_size - 1)))
-    if not (
-        np.isfinite(err_var_parenthesis_part) and
-        err_var_parenthesis_part >= 0
-    ):
-        logger.debug("Summary Stats sampling error calculation imputed to nan because of non finite positive parenthesis factor.")
+    err_var_parenthesis_part = _mu4 - ((_size - 3) * (_std**4) / (_size - 1))
+    if not (np.isfinite(err_var_parenthesis_part) and err_var_parenthesis_part >= 0):
+        logger.debug(
+            "Summary Stats sampling error calculation imputed to nan because of non finite positive parenthesis factor."
+        )
        return np.nan
    err_var = np.sqrt((1 / _size) * err_var_parenthesis_part)
    return (1 / (2 * _std)) * err_var
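The guard matters because the parenthesised factor can go negative or non-finite for small chunks. A standalone sketch of the guarded computation, with `std` and `mu4` standing in for the fitted components (hypothetical helper, not the library API):

```python
import numpy as np

def std_sampling_error(std: float, mu4: float, size: int) -> float:
    """Sketch of the guarded computation above (hypothetical helper)."""
    std, mu4 = np.float64(std), np.float64(mu4)  # numpy semantics: x / 0 -> inf, not an exception
    part = mu4 - ((size - 3) * std**4 / (size - 1))
    # Tiny chunks (size == 1 divides by zero) or heavy-tailed samples can make
    # this factor non-finite or negative, so the error is imputed to NaN.
    if not (np.isfinite(part) and part >= 0):
        return np.nan
    return (1 / (2 * std)) * np.sqrt(part / size)

std_sampling_error(271.99, 1.0e9, 1)  # -> nan, as for the one-row chunk in the test below
```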
4 changes: 2 additions & 2 deletions tests/drift/test_drift.py
@@ -453,11 +453,11 @@ def test_statistical_drift_calculator_deals_with_missing_class_labels(sample_dri
[
(
{'chunk_size': 5000},
-            [0.004968, 0.004833, 0.01186, 0.242068],
+            [0.004968, 0.004833, 0.01186, 0.243595, 0.210516],
),
(
{'chunk_size': 5000, 'timestamp_column_name': 'timestamp'},
-            [0.004968, 0.004833, 0.01186, 0.242068],
+            [0.004968, 0.004833, 0.01186, 0.243595, 0.210516],
),
(
{'chunk_number': 5},
4 changes: 2 additions & 2 deletions tests/drift/test_multiv_pca.py
@@ -292,11 +292,11 @@ def test_data_reconstruction_drift_calculator_numeric_results(sample_drift_data)
[
(
{'chunk_size': 5000},
-            [0.7998744001719177, 0.8020996183121666, 0.8043000024523013, 0.73631],
+            [0.79987, 0.80210, 0.80430, 0.73552, 0.76087],
),
(
{'chunk_size': 5000, 'timestamp_column_name': 'timestamp'},
-            [0.7998744001719177, 0.8020996183121666, 0.8043000024523013, 0.73631],
+            [0.79987, 0.80210, 0.80430, 0.73552, 0.76087],
),
(
{'chunk_number': 5},
34 changes: 23 additions & 11 deletions tests/performance_estimation/CBPE/test_cbpe_metrics.py
@@ -1,7 +1,7 @@
import pandas as pd
import pytest

-from nannyml.chunk import DefaultChunker
+from nannyml.chunk import DefaultChunker, SizeBasedChunker
from nannyml.datasets import (
load_synthetic_binary_classification_dataset,
load_synthetic_multiclass_classification_dataset,
@@ -24,7 +24,7 @@
[
(
{
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': None,
'business_value_matrix': [[2, -5], [-10, 10]],
'normalize_business_value': None,
@@ -48,7 +48,7 @@
),
(
{
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': None,
'business_value_matrix': [[2, -5], [-10, 10]],
'normalize_business_value': 'per_prediction',
@@ -71,7 +71,11 @@
),
),
(
-            {'chunk_size': 20000, 'normalize_confusion_matrix': 'all', 'business_value_matrix': [[-1, 4], [8, -8]]},
+            {
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
+                'normalize_confusion_matrix': 'all',
+                'business_value_matrix': [[-1, 4], [8, -8]],
+            },
pd.DataFrame(
{
'key': ['[0:19999]', '[20000:49999]'],
@@ -90,7 +94,11 @@
),
),
(
-            {'chunk_size': 20000, 'normalize_confusion_matrix': 'true', 'business_value_matrix': [[-1, 4], [8, -8]]},
+            {
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
+                'normalize_confusion_matrix': 'true',
+                'business_value_matrix': [[-1, 4], [8, -8]],
+            },
pd.DataFrame(
{
'key': ['[0:19999]', '[20000:49999]'],
@@ -109,7 +117,11 @@
),
),
(
-            {'chunk_size': 20000, 'normalize_confusion_matrix': 'pred', 'business_value_matrix': [[-1, 4], [8, -8]]},
+            {
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
+                'normalize_confusion_matrix': 'pred',
+                'business_value_matrix': [[-1, 4], [8, -8]],
+            },
pd.DataFrame(
{
'key': ['[0:19999]', '[20000:49999]'],
@@ -129,7 +141,7 @@
),
(
{
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': None,
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
@@ -153,7 +165,7 @@
),
(
{
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'all',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
@@ -177,7 +189,7 @@
),
(
{
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'all',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[2, -5], [-10, 10]],
@@ -202,7 +214,7 @@
),
(
{
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'true',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
@@ -226,7 +238,7 @@
),
(
{
-                'chunk_size': 20000,
+                'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'pred',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
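These tests now pin `incomplete='append'` explicitly so that expected values computed under the old default stay valid. User code can pin the old behaviour the same way; a sketch, assuming the column names of the synthetic binary classification dataset:

```python
from nannyml import CBPE
from nannyml.chunk import SizeBasedChunker

# Passing an explicit chunker overrides the new 'keep' default.
estimator = CBPE(
    y_true='work_home_actual',    # assumed dataset column names
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    problem_type='classification_binary',
    metrics=['roc_auc'],
    chunker=SizeBasedChunker(chunk_size=20000, incomplete='append'),
)
```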
37 changes: 15 additions & 22 deletions tests/stats/test_std.py
@@ -5,14 +5,13 @@

"""Tests for Drift package."""

-import pytest
-import pandas as pd
import numpy as np
+import pandas as pd
+import pytest


+from nannyml.chunk import SizeBasedChunker
from nannyml.datasets import load_synthetic_car_loan_dataset
from nannyml.stats import SummaryStatsStdCalculator
-from nannyml.chunk import SizeBasedChunker

# @pytest.fixture(scope="module")
# def status_sum_result() -> Result:
@@ -43,30 +42,24 @@ def test_stats_std_calculator_with_default_params_chunk_size_one(): # noqa: D10
reference, analysis, _ = load_synthetic_car_loan_dataset()

chunker = SizeBasedChunker(chunk_size=5_000, incomplete='keep')
-    calc = SummaryStatsStdCalculator(
-        column_names=['car_value'],
-        chunker=chunker
-    ).fit(reference)
+    calc = SummaryStatsStdCalculator(column_names=['car_value'], chunker=chunker).fit(reference)
    result = calc.calculate(data=analysis.head(5_001))
    expected = pd.DataFrame(
        {
            ('chunk', 'key'): ['[0:4999]', '[5000:5000]'],
-            ('chunk', 'chunk_index'): [0,1],
-            ('chunk', 'start_index'): [0,5000],
-            ('chunk', 'end_index'): [4999,5000],
-            ('chunk', 'start_date'): [None,None],
-            ('chunk', 'end_date'): [None,None],
-            ('chunk', 'period'): ['analysis','analysis'],
-            ('car_value', 'value'): [20614.8926,np.nan],
-            ('car_value', 'sampling_error'): [271.9917,np.nan],
-            ('car_value', 'upper_confidence_boundary'): [21430.8679,np.nan],
-            ('car_value', 'lower_confidence_boundary'): [19798.9174,np.nan],
+            ('chunk', 'chunk_index'): [0, 1],
+            ('chunk', 'start_index'): [0, 5000],
+            ('chunk', 'end_index'): [4999, 5000],
+            ('chunk', 'start_date'): [None, None],
+            ('chunk', 'end_date'): [None, None],
+            ('chunk', 'period'): ['analysis', 'analysis'],
+            ('car_value', 'value'): [20614.8926, np.nan],
+            ('car_value', 'sampling_error'): [271.9917, np.nan],
+            ('car_value', 'upper_confidence_boundary'): [21430.8679, np.nan],
+            ('car_value', 'lower_confidence_boundary'): [19798.9174, np.nan],
            ('car_value', 'upper_threshold'): [20978.5658, 20978.5658],
            ('car_value', 'lower_threshold'): [19816.9091, 19816.9091],
            ('car_value', 'alert'): [False, True],
        }
    )
-    pd.testing.assert_frame_equal(
-        expected,
-        result.filter(period='analysis').to_df().round(4)
-    )
+    pd.testing.assert_frame_equal(expected, result.filter(period='analysis').to_df().round(4))
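Why the trailing `[5000:5000]` chunk reports NaN: it holds a single row, whose sample standard deviation is undefined; the expected frame then flags that NaN row as an alert, presumably because a NaN value cannot be shown to lie within the thresholds. A quick illustration:

```python
import pandas as pd

# pandas uses ddof=1 by default: the std of one observation is NaN,
# which is what the one-row chunk above reports.
print(pd.Series([20_614.0]).std())  # -> nan
```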
6 changes: 3 additions & 3 deletions tests/test_chunk.py
@@ -241,12 +241,12 @@ def test_size_based_chunker_returns_chunks_of_required_size(sample_chunk_data):
chunker = SizeBasedChunker(chunk_size=chunk_size)
sut = chunker.split(sample_chunk_data)
assert len(sut[0]) == chunk_size
-    assert len(sut) == math.ceil(sample_chunk_data.shape[0] / chunk_size) - 1
+    assert len(sut) == math.ceil(sample_chunk_data.shape[0] / chunk_size)


def test_size_based_chunker_returns_last_chunk_that_is_partially_filled(sample_chunk_data): # noqa: D103
chunk_size = 3333
-    expected_last_chunk_size = chunk_size + sample_chunk_data.shape[0] % chunk_size
+    expected_last_chunk_size = sample_chunk_data.shape[0] % chunk_size
chunker = SizeBasedChunker(chunk_size)
sut = chunker.split(sample_chunk_data)
assert len(sut[-1]) == expected_last_chunk_size
@@ -304,7 +304,7 @@ def test_size_based_chunker_uses_observations_to_set_chunk_date_boundaries(sampl

def test_size_based_chunker_assigns_observation_range_to_chunk_keys(sample_chunk_data): # noqa: D103
chunk_size = 1500
-    last_chunk_start = ((sample_chunk_data.shape[0] // chunk_size) - 1) * chunk_size
+    last_chunk_start = (sample_chunk_data.shape[0] // chunk_size) * chunk_size
last_chunk_end = sample_chunk_data.shape[0] - 1

chunker = SizeBasedChunker(chunk_size=chunk_size)
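The updated assertions follow directly from the 'keep' arithmetic: the chunk count rounds up, the last chunk holds the remainder, and the final key starts at the last full-chunk boundary. An illustrative check with a hypothetical row count:

```python
import math

# Under incomplete='keep' (the new default), leftover rows form their own
# trailing partial chunk rather than overfilling the last complete one.
n, chunk_size = 20_236, 3_333                     # hypothetical row count
assert math.ceil(n / chunk_size) == 7             # six full chunks plus one partial
assert n % chunk_size == 238                      # size of the partial chunk
assert (n // chunk_size) * chunk_size == 19_998   # start index of the partial chunk
```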