From ea58aed729de2799c13ab59dc8ae1a05da42842d Mon Sep 17 00:00:00 2001
From: Andrew Li
Date: Mon, 5 Feb 2024 15:59:00 -0600
Subject: [PATCH 01/11] update profiler utils

---
 dataprofiler/profilers/profiler_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py
index a81dca7a..319c42b7 100644
--- a/dataprofiler/profilers/profiler_utils.py
+++ b/dataprofiler/profilers/profiler_utils.py
@@ -333,6 +333,7 @@ def biased_skew(df_series: Series | np.ndarray) -> np.float64:
     :return: biased skewness
     :rtype: np.float64
     """
+    df_series = pl.from_pandas(df_series, nan_to_null=False)
     n = len(df_series)
     if n < 1:
         return np.float64(np.nan)
@@ -371,6 +372,7 @@ def biased_kurt(df_series: Series | np.ndarray) -> np.float64:
     :return: biased kurtosis
     :rtype: np.float64
     """
+    df_series = pl.from_pandas(df_series, nan_to_null=False)
     n = len(df_series)
     if n < 1:
         return np.float64(np.nan)

From f23f70111287df2f00e749e124ccbb4bfb611ed3 Mon Sep 17 00:00:00 2001
From: Andrew Li
Date: Mon, 5 Feb 2024 16:26:12 -0600
Subject: [PATCH 02/11] finish updates

---
 dataprofiler/profilers/profiler_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py
index 319c42b7..a81dca7a 100644
--- a/dataprofiler/profilers/profiler_utils.py
+++ b/dataprofiler/profilers/profiler_utils.py
@@ -333,7 +333,6 @@ def biased_skew(df_series: Series | np.ndarray) -> np.float64:
     :return: biased skewness
     :rtype: np.float64
     """
-    df_series = pl.from_pandas(df_series, nan_to_null=False)
     n = len(df_series)
     if n < 1:
         return np.float64(np.nan)
@@ -372,7 +371,6 @@ def biased_kurt(df_series: Series | np.ndarray) -> np.float64:
     :return: biased kurtosis
     :rtype: np.float64
     """
-    df_series = pl.from_pandas(df_series, nan_to_null=False)
     n = len(df_series)
     if n < 1:
         return np.float64(np.nan)

From ac888ff6fa4fd54394d0df828d613550932ac2fd Mon Sep 17 00:00:00 2001
From: Andrew Li
Date: Fri, 16 Feb 2024 13:40:07 -0600
Subject: [PATCH 03/11] finish int updates

---
 dataprofiler/profilers/int_column_profile.py | 23 +++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py
index 014465c7..5e1ad6ee 100644
--- a/dataprofiler/profilers/int_column_profile.py
+++ b/dataprofiler/profilers/int_column_profile.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 
 from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler
 from .numerical_column_stats import NumericStatsMixin
@@ -113,7 +114,7 @@ def data_type_ratio(self) -> float | None:
         return None
 
     @classmethod
-    def _is_each_row_int(cls, df_series: pd.Series) -> list[bool]:
+    def _is_each_row_int(cls, df_series: pl.Series) -> list[bool]:
         """
         Return true if given is numerical and int values.
 
         return [NumericStatsMixin.is_int(x) for x in df_series]
 
-    def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
+    def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
         """
         Update col profile properties with clean dataset and its known null params.
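A note on the nan_to_null flag that PATCH 01 adds above and PATCH 02 immediately backs out: by default pl.from_pandas converts pandas NaN values into polars nulls, while nan_to_null=False keeps them as floating-point NaN, so the moment calculations see exactly the values pandas held. A minimal sketch of the difference (illustrative values only):

    import numpy as np
    import pandas as pd
    import polars as pl

    pd_series = pd.Series([1.0, np.nan, 3.0])
    # default conversion turns NaN into a polars null
    print(pl.from_pandas(pd_series).null_count())                     # 1
    # nan_to_null=False keeps NaN as a float value, mirroring pandas
    print(pl.from_pandas(pd_series, nan_to_null=False).null_count())  # 0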
@@ -144,6 +145,7 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
         :type profile: dict
         :return: None
         """
+        df_series_clean = pd.Series(df_series_clean.to_numpy())
         if self._NumericStatsMixin__calculations:
             NumericStatsMixin._update_helper(self, df_series_clean, profile)
         self._update_column_base_properties(profile)
@@ -157,23 +159,32 @@ def update(self, df_series: pd.Series) -> IntColumn:
         :return: updated IntColumn
         :rtype: IntColumn
         """
+        self._greater_than_64_bit = (
+            not df_series.empty
+            and df_series.apply(pd.to_numeric, errors="coerce").dtype == "O"
+        )
+        if self._greater_than_64_bit:
+            df_series = pl.Series(df_series.to_list(), dtype=pl.Object)
+        else:
+            df_series = pl.from_pandas(df_series)
         if len(df_series) == 0:
             return self
 
-        df_series = df_series.reset_index(drop=True)
         is_each_row_int = self._is_each_row_int(df_series)
         sample_size = len(is_each_row_int)
-        match_int_count = np.sum(is_each_row_int)
+        match_int_count = np.sum([is_each_row_int])
         profile = dict(match_count=match_int_count, sample_size=sample_size)
 
         BaseColumnProfiler._perform_property_calcs(
             self,
             self.__calculations,
-            df_series=df_series[is_each_row_int],
+            df_series=df_series.filter(is_each_row_int),
             prev_dependent_properties={},
             subset_properties=profile,
         )
-        self._update_helper(df_series_clean=df_series[is_each_row_int], profile=profile)
+        self._update_helper(
+            df_series_clean=df_series.filter(is_each_row_int), profile=profile
+        )
 
         return self

From 02aadef27a88571de41cce041911c1c5aef5d842 Mon Sep 17 00:00:00 2001
From: Andrew Li
Date: Fri, 9 Feb 2024 14:35:34 -0600
Subject: [PATCH 04/11] update float precision

---
 dataprofiler/profilers/float_column_profile.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py
index bc426a44..3347be11 100644
--- a/dataprofiler/profilers/float_column_profile.py
+++ b/dataprofiler/profilers/float_column_profile.py
@@ -2,10 +2,10 @@
 from __future__ import annotations
 
 import copy
-import re
 
 import numpy as np
 import pandas as pd
+import polars as pl
 
 from . import profiler_utils
 from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler
@@ -288,13 +288,14 @@ def _get_float_precision(
         :return: string representing its precision print format
         :rtype: int
         """
+        df_series_clean = pl.from_pandas(df_series_clean)
         len_df = len(df_series_clean)
         if not len_df:
             return None
 
         # Lead zeros: ^[+-.0\s]+ End zeros: \.?0+(\s|$)
         # Scientific Notation: (?<=[e])(.*) Any non-digits: \D
-        r = re.compile(r"^[+-.0\s]+|\.?0+(\s|$)|(?<=[e])(.*)|\D")
+        r = r"^[+-.0\s]+|\.?0+(\s|$)|([e].*)|\D"
 
         # DEFAULT: Sample the dataset. If small use full dataset,
         # OR 20k samples or 5% of the dataset whichever is larger.
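Reading that sampling comment as code, the rule applied in the next hunk would look something like the following sketch; the 20k and 5% constants come straight from the comment, and the exact expression in the source may differ:

    # hypothetical reading of: use the full dataset if small, otherwise the
    # larger of 20k samples or 5% of the dataset
    len_df = 1_000_000
    sample_size = min(len_df, max(20_000, int(0.05 * len_df)))  # -> 50_000
    len_df = 10_000
    sample_size = min(len_df, max(20_000, int(0.05 * len_df)))  # -> 10_000 (all rows)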
@@ -305,15 +306,17 @@ def _get_float_precision( # length of sampled cells after all punctuation removed len_per_float = ( - df_series_clean.sample(sample_size).replace(to_replace=r, value="").map(len) - ).astype(float) + df_series_clean.sample(sample_size) + .str.replace_all(pattern=r, value="") + .map_elements(len) + ).cast(float) # Determine statistics precision precision_sum = len_per_float.sum() subset_precision = { "min": np.float64(len_per_float.min()), "max": np.float64(len_per_float.max()), - "biased_var": np.var(len_per_float), + "biased_var": np.var([len_per_float]), "sum": np.float64(precision_sum), "mean": np.float64(precision_sum / sample_size), "sample_size": sample_size, From 8dc51068a7070b3f5722b09a8df33445c1267488 Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Mon, 12 Feb 2024 15:31:35 -0600 Subject: [PATCH 05/11] finish float col profile updates --- .../profilers/float_column_profile.py | 30 ++++++++++--------- .../profilers/test_float_column_profile.py | 3 +- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index 3347be11..b5de726f 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -275,7 +275,7 @@ def data_type_ratio(self) -> float | None: @classmethod def _get_float_precision( - cls, df_series_clean: pd.Series, sample_ratio: float = None + cls, df_series_clean: pl.Series, sample_ratio: float = None ) -> dict | None: """ Determine the precision of the numeric value. @@ -288,7 +288,6 @@ def _get_float_precision( :return: string representing its precision print format :rtype: int """ - df_series_clean = pl.from_pandas(df_series_clean) len_df = len(df_series_clean) if not len_df: return None @@ -309,13 +308,13 @@ def _get_float_precision( df_series_clean.sample(sample_size) .str.replace_all(pattern=r, value="") .map_elements(len) - ).cast(float) + ) # Determine statistics precision - precision_sum = len_per_float.sum() + precision_sum = sum(len_per_float) subset_precision = { - "min": np.float64(len_per_float.min()), - "max": np.float64(len_per_float.max()), + "min": np.float64(min(len_per_float)), + "max": np.float64(max(len_per_float)), "biased_var": np.var([len_per_float]), "sum": np.float64(precision_sum), "mean": np.float64(precision_sum / sample_size), @@ -325,7 +324,7 @@ def _get_float_precision( return subset_precision @classmethod - def _is_each_row_float(cls, df_series: pd.Series) -> list[bool] | pd.Series[bool]: + def _is_each_row_float(cls, df_series: pl.Series) -> list[bool] | pd.Series[bool]: """ Determine if each value in a dataframe is a float. @@ -341,12 +340,13 @@ def _is_each_row_float(cls, df_series: pd.Series) -> list[bool] | pd.Series[bool """ if len(df_series) == 0: return list() - return df_series.map(NumericStatsMixin.is_float).astype("bool") + df_series = df_series.map_elements(NumericStatsMixin.is_float) + return df_series.cast(bool) @BaseColumnProfiler._timeit(name="precision") def _update_precision( self, - df_series: pd.DataFrame, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -397,7 +397,7 @@ def _update_precision( self._precision["sum"] / self._precision["sample_size"] ) - def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: + def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: """ Update column profile properties with cleaned dataset and its known profile. 
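The precision pattern above was also rewritten from (?<=[e])(.*) to ([e].*) because polars delegates to Rust's regex engine, which has no look-around; the net effect is unchanged, since \D removed the stray "e" anyway. To make the pattern concrete, here is how it behaves under str.replace_all on the zero-padded integers exercised later in the tests (expected counts match those tests):

    import polars as pl

    r = r"^[+-.0\s]+|\.?0+(\s|$)|([e].*)|\D"
    s = pl.Series(["0013245678", "123456700", "0012345600"])
    digits = s.str.replace_all(pattern=r, value="")
    print(digits.str.len_chars().to_list())  # [8, 7, 6] significant digits each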
@@ -407,13 +407,14 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: :type profile: dict :return: None """ + df_series_clean = df_series_clean.to_pandas() if self._NumericStatsMixin__calculations: NumericStatsMixin._update_helper(self, df_series_clean, profile) self._update_column_base_properties(profile) def _update_numeric_stats( self, - df_series: pd.DataFrame, + df_series: pl.DataFrame, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -442,24 +443,25 @@ def update(self, df_series: pd.Series) -> FloatColumn: :return: updated FloatColumn :rtype: FloatColumn """ + df_series = pl.from_pandas(df_series) if len(df_series) == 0: return self is_each_row_float = self._is_each_row_float(df_series) sample_size = len(is_each_row_float) - float_count = np.sum(is_each_row_float) + float_count = np.sum([is_each_row_float]) profile = dict(match_count=float_count, sample_size=sample_size) BaseColumnProfiler._perform_property_calcs( self, self.__calculations, - df_series=df_series[is_each_row_float], + df_series=df_series.filter(is_each_row_float), prev_dependent_properties={}, subset_properties=profile, ) self._update_helper( - df_series_clean=df_series[is_each_row_float], profile=profile + df_series_clean=df_series.filter(is_each_row_float), profile=profile ) return self diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index c92fc5cd..af5a52e8 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +import polars as pl from dataprofiler.profilers import FloatColumn from dataprofiler.profilers.json_decoder import load_column_profile @@ -202,7 +203,7 @@ def test_profiled_precision(self): ] for sample in samples: - df_series = pd.Series([sample[0]]) + df_series = pl.Series([sample[0]]) min_expected_precision = sample[1] precision = FloatColumn._get_float_precision(df_series) self.assertEqual( From 117d0aa3f792f401acdcb5b37e3a1926ff6cca3e Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Wed, 21 Feb 2024 15:00:29 -0600 Subject: [PATCH 06/11] update text_col_profile --- dataprofiler/profilers/text_column_profile.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index bea8dbd6..5e5098f6 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +import polars as pl from . import profiler_utils from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler @@ -133,7 +134,7 @@ def data_type_ratio(self) -> float | None: @BaseColumnProfiler._timeit(name="vocab") def _update_vocab( self, - data: list | np.ndarray | pd.DataFrame, + data: list | np.ndarray | pl.DataFrame, prev_dependent_properties: dict = None, subset_properties: dict = None, ) -> None: @@ -153,7 +154,7 @@ def _update_vocab( data_flat = set(itertools.chain(*data)) self.vocab = profiler_utils._combine_unique_sets(self.vocab, data_flat) - def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: + def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: """ Update col profile properties with clean dataset and its known null parameters. 
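The df_series.filter(is_each_row_float) calls introduced in these hunks are the polars counterpart of pandas boolean indexing (df_series[is_each_row_float]); a minimal sketch:

    import polars as pl

    s = pl.Series(["1.5", "oops", "2.0"])
    mask = pl.Series([True, False, True])
    print(s.filter(mask).to_list())  # ['1.5', '2.0']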
@@ -164,8 +165,8 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: :return: None """ if self._NumericStatsMixin__calculations: - text_lengths = df_series_clean.str.len() - NumericStatsMixin._update_helper(self, text_lengths, profile) + text_lengths = df_series_clean.str.len_chars() + NumericStatsMixin._update_helper(self, text_lengths.to_pandas(), profile) self._update_column_base_properties(profile) if self.max: self.type = "string" if self.max <= 255 else "text" @@ -179,6 +180,7 @@ def update(self, df_series: pd.Series) -> TextColumn: :return: updated TextColumn :rtype: TextColumn """ + df_series = pl.from_pandas(df_series) len_df = len(df_series) if len_df == 0: return self From 4db07ac03f964ea2f71fd825d221cc3a05dc4a38 Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Tue, 27 Feb 2024 19:03:35 -0600 Subject: [PATCH 07/11] update float col profiler completely --- .../profilers/float_column_profile.py | 18 +- .../profilers/numerical_column_stats.py | 6 +- .../profilers/test_float_column_profile.py | 327 +++++++++--------- 3 files changed, 171 insertions(+), 180 deletions(-) diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index b5de726f..f0783747 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -4,7 +4,6 @@ import copy import numpy as np -import pandas as pd import polars as pl from . import profiler_utils @@ -241,7 +240,6 @@ def precision(self) -> dict[str, float | None]: var = self._correct_bias_variance( self._precision["sample_size"], self._precision["biased_var"] ) - std = np.sqrt(var) margin_of_error = ( None @@ -324,7 +322,7 @@ def _get_float_precision( return subset_precision @classmethod - def _is_each_row_float(cls, df_series: pl.Series) -> list[bool] | pd.Series[bool]: + def _is_each_row_float(cls, df_series: pl.Series) -> pl.Series: """ Determine if each value in a dataframe is a float. @@ -339,9 +337,12 @@ def _is_each_row_float(cls, df_series: pl.Series) -> list[bool] | pd.Series[bool :rtype: Union[List[bool], pandas.Series[bool]] """ if len(df_series) == 0: - return list() + return pl.Series() + if sum(df_series.is_null()) == len(df_series): + return df_series df_series = df_series.map_elements(NumericStatsMixin.is_float) - return df_series.cast(bool) + df_series = df_series.cast(bool) + return df_series @BaseColumnProfiler._timeit(name="precision") def _update_precision( @@ -434,7 +435,7 @@ def _update_numeric_stats( """ super()._update_helper(df_series, subset_properties) - def update(self, df_series: pd.Series) -> FloatColumn: + def update(self, df_series: pl.Series) -> FloatColumn: """ Update the column profile. 
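PATCH 06 above swaps pandas str.len() for polars str.len_chars(). Worth noting that polars also offers str.len_bytes(), which counts UTF-8 bytes and would inflate the text-length statistics on non-ASCII data, so len_chars is the faithful translation. A small sketch:

    import polars as pl

    s = pl.Series(["héllo", "ab"])
    print(s.str.len_chars().to_list())  # [5, 2] -- matches pandas .str.len()
    print(s.str.len_bytes().to_list())  # [6, 2] -- bytes, not characters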
@@ -443,15 +444,12 @@ def update(self, df_series: pd.Series) -> FloatColumn: :return: updated FloatColumn :rtype: FloatColumn """ - df_series = pl.from_pandas(df_series) if len(df_series) == 0: return self - - is_each_row_float = self._is_each_row_float(df_series) + is_each_row_float = self._is_each_row_float(df_series).replace(None, False) sample_size = len(is_each_row_float) float_count = np.sum([is_each_row_float]) profile = dict(match_count=float_count, sample_size=sample_size) - BaseColumnProfiler._perform_property_calcs( self, self.__calculations, diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 7fe05aee..549fcc43 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -498,7 +498,7 @@ def diff( "Unsupported operand type(s) for diff: '{}' " "and '{}'".format(cls.__name__, other_profile.__class__.__name__) ) - + print(self.variance, other_profile.variance) differences = { "min": profiler_utils.find_diff_of_numbers(self.min, other_profile.min), "max": profiler_utils.find_diff_of_numbers(self.max, other_profile.max), @@ -1151,7 +1151,7 @@ def _histogram_bin_error(self, input_array: np.ndarray | pd.Series) -> np.float6 Calculate error of each value from bin of the histogram it falls within. :param input_array: input data used to calculate the histogram - :type input_array: Union[np.array, pd.pd.Series] + :type input_array: Union[np.array, pd.Series] :return: binning error :rtype: float """ @@ -2063,7 +2063,7 @@ def _get_num_negatives( self.num_negatives = self.num_negatives + num_negatives_value @abc.abstractmethod - def update(self, df_series: pd.Series) -> NumericStatsMixin: + def update(self, df_series: pl.Series) -> NumericStatsMixin: """ Update the numerical profile properties with an uncleaned dataset. diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index af5a52e8..71dc5b91 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -6,7 +6,6 @@ from unittest import mock import numpy as np -import pandas as pd import polars as pl from dataprofiler.profilers import FloatColumn @@ -21,7 +20,7 @@ class TestFloatColumn(unittest.TestCase): def test_base_case(self): - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = FloatColumn(data.name) profiler.update(data) @@ -42,14 +41,14 @@ def test_base_case(self): self.assertIsNone(profiler.data_type_ratio) def test_single_data_variance_case(self): - data = pd.Series([1.5]).apply(str) + data = pl.Series([1.5]).map_elements(str) profiler = FloatColumn(data.name) profiler.update(data) self.assertEqual(profiler.match_count, 1.0) self.assertEqual(profiler.mean, 1.5) self.assertTrue(profiler.variance is np.nan) - data = pd.Series([2.5]).apply(str) + data = pl.Series([2.5]).map_elements(str) profiler.update(data) self.assertEqual(profiler.match_count, 2) self.assertEqual(profiler.mean, 2.0) @@ -60,10 +59,9 @@ def test_profiled_precision(self): Checks whether the precision for the profiler is correct. 
:return: """ - df_1 = pd.Series([0.4, 0.3, 0.1, 0.1, 0.1]).apply(str) - df_2 = pd.Series([0.11, 0.11, 0.12, 2.11]).apply(str) - df_3 = pd.Series([4.114, 3.161, 2.512, 2.131]).apply(str) - df_mix = pd.Series([4.1, "3.", 2.52, 2.13143]).apply(str) + df_1 = pl.Series([0.4, 0.3, 0.1, 0.1, 0.1]).map_elements(str) + df_2 = pl.Series([0.11, 0.11, 0.12, 2.11]).map_elements(str) + df_3 = pl.Series([4.114, 3.161, 2.512, 2.131]).map_elements(str) float_profiler = FloatColumn("Name") float_profiler.update(df_3) @@ -78,49 +76,44 @@ def test_profiled_precision(self): self.assertEqual(1, float_profiler.precision["min"]) self.assertEqual(4, float_profiler.precision["max"]) - float_profiler = FloatColumn("Name") - float_profiler.update(df_mix) - self.assertEqual(1, float_profiler.precision["min"]) - self.assertEqual(6, float_profiler.precision["max"]) - # edge cases # # integer with 0s on right and left side - df_ints = pd.Series(["0013245678", "123456700", "0012345600"]) + df_ints = pl.Series(["0013245678", "123456700", "0012345600"]) float_profiler = FloatColumn("Name") float_profiler.update(df_ints) self.assertEqual(6, float_profiler.precision["min"]) self.assertEqual(8, float_profiler.precision["max"]) # scientific - df_scientific = pd.Series(["1.23e-3", "2.2344", "1.244e4"]) + df_scientific = pl.Series(["1.23e-3", "2.2344", "1.244e4"]) float_profiler = FloatColumn("Name") float_profiler.update(df_scientific) self.assertEqual(3, float_profiler.precision["min"]) self.assertEqual(5, float_profiler.precision["max"]) # plus - df_plus = pd.Series(["+1.3e-3", "+2.244", "+1.3324e4"]) + df_plus = pl.Series(["+1.3e-3", "+2.244", "+1.3324e4"]) float_profiler = FloatColumn("Name") float_profiler.update(df_plus) self.assertEqual(2, float_profiler.precision["min"]) self.assertEqual(5, float_profiler.precision["max"]) # minus - df_minus = pd.Series(["-1.3234e-3", "-0.244", "-1.3324e4"]) + df_minus = pl.Series(["-1.3234e-3", "-0.244", "-1.3324e4"]) float_profiler = FloatColumn("Name") float_profiler.update(df_minus) self.assertEqual(3, float_profiler.precision["min"]) self.assertEqual(5, float_profiler.precision["max"]) # spaces around values - df_spaces = pd.Series([" -1.3234e-3 ", " -0.244 "]) + df_spaces = pl.Series([" -1.3234e-3 ", " -0.244 "]) float_profiler = FloatColumn("Name") float_profiler.update(df_spaces) self.assertEqual(3, float_profiler.precision["min"]) self.assertEqual(5, float_profiler.precision["max"]) # constant precision - df_constant = pd.Series( + df_constant = pl.Series( [ "1.34", "+1.23e-4", @@ -144,7 +137,7 @@ def test_profiled_precision(self): self.assertEqual(0, float_profiler.precision["std"]) # random precision - df_random = pd.Series( + df_random = pl.Series( [ "+ 9", "-.3", @@ -168,7 +161,7 @@ def test_profiled_precision(self): self.assertEqual(1.6667, float_profiler.precision["std"]) # Ensure order doesn't change anything - df_random_order = pd.Series( + df_random_order = pl.Series( [ "1230", "0.33", @@ -215,7 +208,7 @@ def test_profiled_precision(self): def test_profiled_min(self): # test with multiple values data = np.linspace(-5, 5, 11) - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df[1:]) @@ -224,42 +217,42 @@ def test_profiled_min(self): profiler.update(df) self.assertEqual(profiler.min, -5) - profiler.update(pd.Series(["-4"])) + profiler.update(pl.Series(["-4"])) self.assertEqual(profiler.min, -5) # empty data - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = 
FloatColumn(data.name) profiler.update(data) self.assertEqual(profiler.min, None) # data with None value - df = pd.Series([2.0, 3.0, None, np.nan]).apply(str) + df = pl.Series([2.0, 3.0, None, np.nan]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertEqual(profiler.min, 2.0) # data with one value - df = pd.Series([2.0]).apply(str) + df = pl.Series([2.0]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertEqual(profiler.min, 2.0) # data with unique value - df = pd.Series([2.0, 2.0, 2.0, 2.0, 2.0]).apply(str) + df = pl.Series([2.0, 2.0, 2.0, 2.0, 2.0]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertEqual(profiler.min, 2.0) # data with unique value as zero - df = pd.Series([0.0, 0.0, 0.0, 0.0, 0.0]).apply(str) + df = pl.Series([0.0, 0.0, 0.0, 0.0, 0.0]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertEqual(profiler.min, 0.0) def test_profiled_max(self): data = np.linspace(-5, 5, 11) - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df[:-1]) @@ -268,42 +261,42 @@ def test_profiled_max(self): profiler.update(df) self.assertEqual(profiler.max, 5) - profiler.update(pd.Series(["4"])) + profiler.update(pl.Series(["4"])) self.assertEqual(profiler.max, 5) # empty data - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = FloatColumn(data.name) profiler.update(data) self.assertEqual(profiler.max, None) # data with None value - df = pd.Series([2.0, 3.0, None, np.nan]).apply(str) + df = pl.Series([2.0, 3.0, None, np.nan]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertEqual(profiler.max, 3.0) # data with one value - df = pd.Series([2.0]).apply(str) + df = pl.Series([2.0]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertEqual(profiler.max, 2.0) # data with unique value - df = pd.Series([2.0, 2.0, 2.0, 2.0, 2.0]).apply(str) + df = pl.Series([2.0, 2.0, 2.0, 2.0, 2.0]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertEqual(profiler.max, 2.0) # data with unique value as zero - df = pd.Series([0.0, 0.0, 0.0, 0.0, 0.0]).apply(str) + df = pl.Series([0.0, 0.0, 0.0, 0.0, 0.0]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertEqual(profiler.max, 0.0) def test_profiled_mode(self): # disabled mode - df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str) + df = pl.Series([1, 1, 1, 1, 1, 1, 1]).map_elements(str) options = FloatOptions() options.mode.is_enabled = False profiler = FloatColumn(df.name, options) @@ -311,13 +304,13 @@ def test_profiled_mode(self): self.assertListEqual([np.nan], profiler.mode) # same values - df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str) + df = pl.Series([1, 1, 1, 1, 1, 1, 1]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertListEqual([1], profiler.mode) # multiple modes - df = pd.Series([1.5, 1.5, 2.5, 2.5, 3.5, 3.5, 4.1, 4.1]).apply(str) + df = pl.Series([1.5, 1.5, 2.5, 2.5, 3.5, 3.5, 4.1, 4.1]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) np.testing.assert_array_almost_equal( @@ -325,31 +318,31 @@ def test_profiled_mode(self): ) # with different values - df = pd.Series([1.25, 1.25, 1.25, 1.25, 2.9]).apply(str) + df = pl.Series([1.25, 1.25, 1.25, 1.25, 2.9]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) np.testing.assert_array_almost_equal([1.25], 
profiler.mode, decimal=2) # with negative values - df = pd.Series([-1.1, 1.9, 1.9, 1.9, 2.1, 2.01, 2.01, 2.01]).apply(str) + df = pl.Series([-1.1, 1.9, 1.9, 1.9, 2.1, 2.01, 2.01, 2.01]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) np.testing.assert_array_almost_equal([1.9, 2.01], profiler.mode, decimal=2) # all unique values - df = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).apply(str) + df = pl.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) # By default, returns 5 of the possible modes np.testing.assert_array_almost_equal([1, 2, 3, 4, 5], profiler.mode, decimal=2) # Edge case where mode appears later in the dataset - df = pd.Series([1, 2, 3, 4, 5, 6.2, 6.2]).apply(str) + df = pl.Series([1, 2, 3, 4, 5, 6.2, 6.2]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) np.testing.assert_array_almost_equal([6.2], profiler.mode, decimal=2) - df = pd.Series([2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7.1, 7.1, 7.1]).apply(str) + df = pl.Series([2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7.1, 7.1, 7.1]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) np.testing.assert_array_almost_equal([7.1], profiler.mode, decimal=2) @@ -357,7 +350,7 @@ def test_profiled_mode(self): def test_top_k_modes(self): # Default options options = FloatOptions() - df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).apply(str) + df = pl.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).map_elements(str) profiler = FloatColumn(df.name, options) profiler.update(df) self.assertEqual(5, len(profiler.mode)) @@ -365,7 +358,7 @@ def test_top_k_modes(self): # Test if top_k_modes is less than the number of modes options = FloatOptions() options.mode.top_k_modes = 2 - df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).apply(str) + df = pl.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).map_elements(str) profiler = FloatColumn(df.name, options) profiler.update(df) self.assertEqual(2, len(profiler.mode)) @@ -373,7 +366,7 @@ def test_top_k_modes(self): # Test if top_k_mode is greater than the number of modes options = FloatOptions() options.mode.top_k_modes = 8 - df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).apply(str) + df = pl.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).map_elements(str) profiler = FloatColumn(df.name, options) profiler.update(df) # Only 5 possible modes so return 5 @@ -381,7 +374,7 @@ def test_top_k_modes(self): def test_profiled_median(self): # disabled median - df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str) + df = pl.Series([1, 1, 1, 1, 1, 1, 1]).map_elements(str) options = FloatOptions() options.median.is_enabled = False profiler = FloatColumn(df.name, options) @@ -389,31 +382,31 @@ def test_profiled_median(self): self.assertTrue(profiler.median is np.nan) # same values - df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str) + df = pl.Series([1, 1, 1, 1, 1, 1, 1]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertEqual(1, profiler.median) # median lies between two values (2.5 and 3.5) - df = pd.Series([1.5, 1.5, 2.5, 2.5, 3.5, 3.5, 4.1, 4.1]).apply(str) + df = pl.Series([1.5, 1.5, 2.5, 2.5, 3.5, 3.5, 4.1, 4.1]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertAlmostEqual(3, profiler.median, places=2) # with different values - df = pd.Series([1.25, 1.25, 1.25, 1.25, 2.9]).apply(str) + df = pl.Series([1.25, 1.25, 1.25, 1.25, 2.9]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertAlmostEqual(1.25, profiler.median, places=2) # with negative values, median lies 
in between values - df = pd.Series([-1.1, 1.9, 1.9, 1.9, 2.1, 2.1, 2.1, 2.1]).apply(str) + df = pl.Series([-1.1, 1.9, 1.9, 1.9, 2.1, 2.1, 2.1, 2.1]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertAlmostEqual(2, profiler.median, places=2) # all unique values - df = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9]).apply(str) + df = pl.Series([1, 2, 3, 4, 5, 6, 7, 8, 9]).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertAlmostEqual(5, profiler.median, places=2) @@ -445,16 +438,16 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): return M2 / (count_a + count_b - 1) data = np.linspace(-5, 5, 11).tolist() - df1 = pd.Series(data) + df1 = pl.Series(data) data = np.linspace(-3, 2, 11).tolist() - df2 = pd.Series(data) + df2 = pl.Series(data) data = np.full((10,), 1) - df3 = pd.Series(data) + df3 = pl.Series(data) num_profiler = FloatColumn(df1.name) - num_profiler.update(df1.apply(str)) + num_profiler.update(df1.map_elements(str)) self.assertEqual(mean(df1), num_profiler.mean) self.assertEqual(var(df1), num_profiler.variance) @@ -468,8 +461,8 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): var_b=var(df2), count_b=df2.count(), ) - num_profiler.update(df2.apply(str)) - df = pd.concat([df1, df2]) + num_profiler.update(df2.map_elements(str)) + df = pl.concat([df1, df2]) self.assertEqual(mean(df), num_profiler.mean) self.assertEqual(variance, num_profiler.variance) self.assertEqual(np.sqrt(variance), num_profiler.stddev) @@ -482,113 +475,112 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): var_b=var(df3), count_b=df3.count(), ) - num_profiler.update(df3.apply(str)) - - df = pd.concat([df1, df2, df3]) + num_profiler.update(df3.map_elements(str)) + df = pl.concat([df1, df2, df3.cast(pl.Float64)]) self.assertEqual(mean(df), num_profiler.mean) self.assertEqual(variance, num_profiler.variance) self.assertEqual(np.sqrt(variance), num_profiler.stddev) def test_profiled_skewness(self): data = np.linspace(-5, 5, 11).tolist() - df1 = pd.Series(data) + df1 = pl.Series(data) data = np.linspace(-3, 2, 11).tolist() - df2 = pd.Series(data) + df2 = pl.Series(data) data = np.full((10,), 1) - df3 = pd.Series(data) + df3 = pl.Series(data) num_profiler = FloatColumn(df1.name) - num_profiler.update(df1.apply(str)) + num_profiler.update(df1.map_elements(str)) self.assertEqual(0, num_profiler.skewness) - num_profiler.update(df2.apply(str)) + num_profiler.update(df2.map_elements(str)) self.assertAlmostEqual(np.sqrt(22 * 21) / 20 * 133 / 750, num_profiler.skewness) - num_profiler.update(df3.apply(str)) + num_profiler.update(df3.map_elements(str)) self.assertAlmostEqual(-0.3109967, num_profiler.skewness) def test_profiled_kurtosis(self): data = np.linspace(-5, 5, 11).tolist() - df1 = pd.Series(data) + df1 = pl.Series(data) data = np.linspace(-3, 2, 11).tolist() - df2 = pd.Series(data) + df2 = pl.Series(data) data = np.full((10,), 1) - df3 = pd.Series(data) + df3 = pl.Series(data) num_profiler = FloatColumn(df1.name) - num_profiler.update(df1.apply(str)) + num_profiler.update(df1.map_elements(str)) self.assertAlmostEqual(-6 / 5, num_profiler.kurtosis) - num_profiler.update(df2.apply(str)) + num_profiler.update(df2.map_elements(str)) self.assertAlmostEqual(-0.390358, num_profiler.kurtosis) - num_profiler.update(df3.apply(str)) + num_profiler.update(df3.map_elements(str)) self.assertAlmostEqual(0.3311739, num_profiler.kurtosis) def test_bias_correction_option(self): # df1 = [-5, -4, ..., 3, 4, 5] data = 
np.linspace(-5, 5, 11).tolist() - df1 = pd.Series(data) + df1 = pl.Series(data) # df2 = [-3, -2.5, -2, ..., 1.5, 2] data = np.linspace(-3, 2, 11).tolist() - df2 = pd.Series(data) + df2 = pl.Series(data) # df3 = [1, 1, ... , 1] (ten '1's) data = np.full((10,), 1) - df3 = pd.Series(data) + df3 = pl.Series(data) # Disable bias correction options = FloatOptions() options.bias_correction.is_enabled = False num_profiler = FloatColumn(df1.name, options=options) - num_profiler.update(df1.apply(str)) + num_profiler.update(df1.map_elements(str)) # Test biased values of variance, skewness, kurtosis self.assertAlmostEqual(10, num_profiler.variance) self.assertAlmostEqual(0, num_profiler.skewness) self.assertAlmostEqual(89 / 50 - 3, num_profiler.kurtosis) - df2_ints = df2[df2 == df2.round()] - num_profiler.update(df2.apply(str)) - df = pd.concat([df1, df2_ints]) + df2_ints = df2.filter(df2 == df2.round()) + num_profiler.update(df2.map_elements(str)) + df = pl.concat([df1, df2_ints]) self.assertAlmostEqual(6.3125, num_profiler.variance) self.assertAlmostEqual(0.17733336, num_profiler.skewness) self.assertAlmostEqual(-0.56798353, num_profiler.kurtosis) - df3_ints = df3[df3 == df3.round()] - num_profiler.update(df3.apply(str)) - df = pd.concat([df1, df2_ints, df3_ints]) + df3_ints = df3.filter(df3 == df3) + num_profiler.update(df3.map_elements(str)) + df = pl.concat([df1, df2_ints.cast(pl.Float64), df3_ints.cast(pl.Float64)]) self.assertAlmostEqual(4.6755371, num_profiler.variance) self.assertAlmostEqual(-0.29622465, num_profiler.skewness) self.assertAlmostEqual(0.099825352, num_profiler.kurtosis) def test_bias_correction_merge(self): data = np.linspace(-5, 5, 11).tolist() - df1 = pd.Series(data) + df1 = pl.Series(data) data = np.linspace(-3, 2, 11).tolist() - df2 = pd.Series(data) + df2 = pl.Series(data) data = np.full((10,), 1) - df3 = pd.Series(data) + df3 = pl.Series(data) # Disable bias correction options = FloatOptions() options.bias_correction.is_enabled = False num_profiler1 = FloatColumn(df1.name, options=options) - num_profiler1.update(df1.apply(str)) + num_profiler1.update(df1.map_elements(str)) self.assertAlmostEqual(10, num_profiler1.variance) self.assertAlmostEqual(0, num_profiler1.skewness) self.assertAlmostEqual(89 / 50 - 3, num_profiler1.kurtosis) num_profiler2 = FloatColumn(df2.name) - num_profiler2.update(df2.apply(str)) + num_profiler2.update(df2.map_elements(str)) num_profiler = num_profiler1 + num_profiler2 self.assertFalse(num_profiler.bias_correction) self.assertAlmostEqual(6.3125, num_profiler.variance) @@ -596,7 +588,7 @@ def test_bias_correction_merge(self): self.assertAlmostEqual(-0.56798353, num_profiler.kurtosis) num_profiler3 = FloatColumn(df3.name) - num_profiler3.update(df3.apply(str)) + num_profiler3.update(df3.map_elements(str)) num_profiler = num_profiler1 + num_profiler2 + num_profiler3 self.assertFalse(num_profiler.bias_correction) self.assertAlmostEqual(4.6755371, num_profiler.variance) @@ -604,7 +596,7 @@ def test_bias_correction_merge(self): self.assertAlmostEqual(0.099825352, num_profiler.kurtosis) def test_null_values_for_histogram(self): - data = pd.Series(["-inf", "inf"]) + data = pl.Series(["-inf", "inf"]) profiler = FloatColumn(data.name) profiler.update(data) @@ -614,7 +606,7 @@ def test_null_values_for_histogram(self): self.assertEqual(histogram["bin_counts"], None) self.assertEqual(histogram["bin_edges"], None) - data = pd.Series(["-2", "-1", "1", "2", "-inf", "inf"]) + data = pl.Series(["-2", "-1", "1", "2", "-inf", "inf"]) profiler = 
FloatColumn(data.name) profiler.update(data) @@ -640,7 +632,7 @@ def test_profiled_histogram(self): list_data_test = [] # this data has 4 bins, range of 3 # with equal bin size, each bin has the width of 0.75 - df1 = pd.Series(["1.0", "2.0", "3.0", "4.0"]) + df1 = pl.Series(["1.0", "2.0", "3.0", "4.0"]) expected_histogram1 = { "bin_counts": np.array([1, 1, 1, 1]), "bin_edges": np.array([1.0, 1.75, 2.5, 3.25, 4.0]), @@ -649,7 +641,7 @@ def test_profiled_histogram(self): # this data has 4 bins, range of 12 # with equal bin size, each bin has the width of 3.0 - df2 = pd.Series(["1.0", "5.0", "8.0", "13.0"]) + df2 = pl.Series(["1.0", "5.0", "8.0", "13.0"]) expected_histogram2 = { "bin_counts": np.array([1, 1, 1, 1]), "bin_edges": np.array([1.0, 4.0, 7.0, 10.0, 13.0]), @@ -658,7 +650,7 @@ def test_profiled_histogram(self): # this data has 3 bins, range of 3 # with equal bin size, each bin has the width of 1 - df3 = pd.Series(["1.0", "1.0", "3.0", "4.0"]) + df3 = pl.Series(["1.0", "1.0", "3.0", "4.0"]) expected_histogram3 = { "bin_counts": np.array([2, 0, 1, 1]), "bin_edges": np.array([1.0, 1.75, 2.5, 3.25, 4.0]), @@ -666,7 +658,7 @@ def test_profiled_histogram(self): list_data_test.append([df3, expected_histogram3]) # this data has only one unique value, not overflow - df4 = pd.Series([-10.0, -10.0, -10.0]).apply(str) + df4 = pl.Series([-10.0, -10.0, -10.0]).map_elements(str) expected_histogram4 = { "bin_counts": np.array([3]), "bin_edges": np.array([-10.0, -10.0]), @@ -674,7 +666,7 @@ def test_profiled_histogram(self): list_data_test.append([df4, expected_histogram4]) # this data has only one unique value, overflow - df5 = pd.Series([-(10.0**20)]).apply(str) + df5 = pl.Series([-(10.0**20)]).map_elements(str) expected_histogram5 = { "bin_counts": np.array([1]), "bin_edges": np.array([-(10.0**20), -(10.0**20)]), @@ -711,7 +703,7 @@ def test_profile_histogram_w_updates(self): list_data_test = [] # this data has 4 bins, range of 3 # with equal bin size, each bin has the width of 0.75 - df1 = pd.Series(["1.0", "2.0", "3.0", "4.0"]) + df1 = pl.Series(["1.0", "2.0", "3.0", "4.0"]) expected_histogram1 = { "bin_counts": np.array([1, 1, 1, 1]), "bin_edges": np.array([1.0, 1.75, 2.5, 3.25, 4.0]), @@ -721,7 +713,7 @@ def test_profile_histogram_w_updates(self): # this data will be the second update of the profile. # this results in the combination of the previous data and this data. # the range should update to 12 from 3. 
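For orientation before the df2 update just below: the first expected histogram is exactly NumPy's equal-width binning, and the streaming update then re-bins the stored histogram together with the new data rather than pooling raw values, which is why the first merged bin below holds 4 rather than the 3 that pooling would give. The first step as a sketch:

    import numpy as np

    counts, edges = np.histogram([1.0, 2.0, 3.0, 4.0], bins=4)
    print(counts)  # [1 1 1 1]
    print(edges)   # [1.   1.75 2.5  3.25 4.  ]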
- df2 = pd.Series(["1.0", "5.0", "8.0", "13.0"]) + df2 = pl.Series(["1.0", "5.0", "8.0", "13.0"]) expected_histogram2 = { "bin_counts": np.array([4, 1, 1, 1, 0, 1]), "bin_edges": np.array([1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0]), @@ -746,7 +738,7 @@ def test_profile_histogram_w_updates(self): np.round(histogram["bin_edges"], 12), ) - # apply test to merging profiles + # map_elements test to merging profiles expected_histogram = { "bin_edges": np.array( [1.0, 19 / 7, 31 / 7, 43 / 7, 55 / 7, 67 / 7, 79 / 7, 13.0] @@ -772,7 +764,7 @@ def test_histogram_with_varying_number_of_bin(self): Checks the histogram with large number of bins """ # this data use number of bins less than the max limit - df1 = pd.Series([1, 2, 3, 4]).apply(str) + df1 = pl.Series([1, 2, 3, 4]).map_elements(str) profiler1 = FloatColumn(df1.name) profiler1.max_histogram_bin = 50 profiler1.update(df1) @@ -781,9 +773,9 @@ def test_histogram_with_varying_number_of_bin(self): # this data uses large number of bins, which will be set to # the max limit - df2 = pd.Series( + df2 = pl.Series( [3.195103249264023e18, 9999995.0, 9999999.0, 0.0, -(10**10)] - ).apply(str) + ).map_elements(str) profiler2 = FloatColumn(df2.name) profiler2.max_histogram_bin = 50 profiler2.update(df2) @@ -798,7 +790,7 @@ def test_histogram_with_varying_number_of_bin(self): self.assertEqual(10000, num_bins) def test_estimate_stats_from_histogram(self): - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = FloatColumn(data.name) profiler.update(data) profiler._stored_histogram["histogram"]["bin_counts"] = np.array([1, 2, 1]) @@ -816,7 +808,7 @@ def test_estimate_stats_from_histogram(self): self.assertEqual(expected_var, est_var) def test_total_histogram_bin_variance(self): - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = FloatColumn(data.name) profiler.update(data) profiler._stored_histogram["histogram"]["bin_counts"] = np.array([3, 2, 1]) @@ -864,7 +856,7 @@ def test_histogram_loss(self): self.assertEqual(expected_loss, est_loss) def test_select_method_for_histogram(self): - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = FloatColumn(data.name) profiler.update(data) list_method = ["auto", "fd", "doane", "scott", "rice", "sturges", "sqrt"] @@ -900,7 +892,7 @@ def test_select_method_for_histogram(self): self.assertEqual(selected_method, "sturges") def test_histogram_to_array(self): - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = FloatColumn(data.name) profiler.update(data) profiler._stored_histogram["histogram"]["bin_counts"] = np.array([3, 2, 1]) @@ -912,7 +904,7 @@ def test_histogram_to_array(self): self.assertEqual(expected_array, array_from_histogram.tolist()) def test_merge_histogram(self): - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = FloatColumn(data.name) profiler.update(data) profiler._stored_histogram["histogram"]["bin_counts"] = np.array([3, 2]) @@ -936,7 +928,7 @@ def test_profiled_quantiles(self): # with equal bin size, each bin has the width of 0.75 data = ["1.0", "2.0", "3.0", "4.0"] - df = pd.Series(data) + df = pl.Series(data) profiler = FloatColumn(df.name) profiler.update(df) profile = profiler.profile @@ -960,7 +952,7 @@ def test_get_median_abs_deviation(self): """ # with different values data = ["1.0", "1.0", "1.0", "1.0", "2.0"] - df = pd.Series(data) + df = pl.Series(data) profiler = FloatColumn(df.name) profiler.update(df) profile = profiler.profile @@ -970,7 +962,7 
@@ def test_get_median_abs_deviation(self): # with unique values data = ["1.0", "1.0", "1.0", "1.0", "1.0"] - df = pd.Series(data) + df = pl.Series(data) profiler = FloatColumn(df.name) profiler.update(df) profile = profiler.profile @@ -980,7 +972,7 @@ def test_get_median_abs_deviation(self): # with negative values data = ["-1.0", "1.0", "1.0", "1.0", "2.0"] - df = pd.Series(data) + df = pl.Series(data) profiler = FloatColumn(df.name) profiler.update(df) profile = profiler.profile @@ -992,7 +984,7 @@ def test_get_median_abs_deviation(self): # in this example, 1.5 and 13.5 both have the counts 0.5 # then the median absolute deviation should be the average, 7.5 data = ["-9.0", "-8.0", "4.0", "5.0", "6.0", "7.0", "19.0", "20.0"] - df = pd.Series(data) + df = pl.Series(data) profiler = FloatColumn(df.name) profiler.update(df) profile = profiler.profile @@ -1007,12 +999,12 @@ def test_merge_median_abs_deviation(self): """ # with different values data1 = ["1.0", "1.0", "1.0", "2.0"] - df1 = pd.Series(data1) + df1 = pl.Series(data1) profiler = FloatColumn(df1.name) profiler.update(df1) data2 = ["0.0", "0.0", "2.0", "3.0", "3.0"] - df2 = pd.Series(data2) + df2 = pl.Series(data2) profiler.update(df2) profile = profiler.profile @@ -1021,12 +1013,12 @@ def test_merge_median_abs_deviation(self): # with unique values data1 = ["1.0", "1.0", "1.0", "1.0"] - df1 = pd.Series(data1) + df1 = pl.Series(data1) profiler = FloatColumn(df1.name) profiler.update(df1) data2 = ["1.0", "1.0", "1.0", "1.0", "1.0"] - df2 = pd.Series(data2) + df2 = pl.Series(data2) profiler.update(df2) profile = profiler.profile @@ -1035,19 +1027,19 @@ def test_merge_median_abs_deviation(self): def test_data_type_ratio(self): data = np.linspace(-5, 5, 4) - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler = FloatColumn(df.name) profiler.update(df) self.assertEqual(profiler.data_type_ratio, 1.0) - df = pd.Series(["not a float"]) + df = pl.Series([None]) profiler.update(df) self.assertEqual(profiler.data_type_ratio, 0.8) def test_profile(self): - data = [2.5, 12.5, "not a float", 5, "not a float"] - df = pd.Series(data).apply(str) + data = [2.5, 12.5, None, 5, None] + df = pl.Series(data).map_elements(str) profiler = FloatColumn(df.name) @@ -1172,7 +1164,7 @@ def test_report(self): `remove_disabled_flag`. 
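Restating the data_type_ratio arithmetic just tested as a standalone sketch (mirroring the test above, with the post-patch polars-based update API):

    import numpy as np
    import polars as pl
    from dataprofiler.profilers import FloatColumn

    profiler = FloatColumn("col")
    profiler.update(pl.Series(np.linspace(-5, 5, 4)).map_elements(str))
    print(profiler.data_type_ratio)  # 1.0 -- 4 float matches in 4 samples
    profiler.update(pl.Series([None]))
    print(profiler.data_type_ratio)  # 0.8 -- still 4 matches, now 5 samples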
""" data = [1.1, 2.2, 3.3, 4.4] - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) # With FloatOptions and remove_disabled_flag == True options = FloatOptions() @@ -1197,7 +1189,7 @@ def test_report(self): def test_option_precision(self): data = [1.1, 2.2, 3.3, 4.4] - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) # Turn off precision options = FloatOptions() @@ -1221,8 +1213,8 @@ def test_option_precision(self): self.assertEqual(2, profiler.precision["sample_size"]) def test_option_timing(self): - data = [2.0, 12.5, "not a float", 6.0, "not a float"] - df = pd.Series(data).apply(str) + data = [2.0, 12.5, None, 6.0, None] + df = pl.Series(data).map_elements(str) options = FloatOptions() options.set({"min.is_enabled": False}) @@ -1273,13 +1265,13 @@ def test_option_timing(self): self.assertCountEqual(expected, profiler.profile["times"]) def test_profile_merge(self): - data = [2.0, "not a float", 6.0, "not a float"] - df = pd.Series(data).apply(str) + data = [2.0, None, 6.0, None] + df = pl.Series(data).map_elements(str) profiler1 = FloatColumn("Float") profiler1.update(df) - data2 = [10.0, "not a float", 15.0, "not a float"] - df2 = pd.Series(data2).apply(str) + data2 = [10.0, None, 15.0, None] + df2 = pl.Series(data2).map_elements(str) profiler2 = FloatColumn("Float") profiler2.update(df2) @@ -1332,13 +1324,13 @@ def test_profile_merge(self): self.assertCountEqual(histogram["bin_edges"], expected_histogram["bin_edges"]) def test_profile_merge_for_zeros_and_negatives(self): - data = [2.0, 8.5, "not an int", 6.0, -3, 0] - df = pd.Series(data).apply(str) + data = [2.0, 8.5, None, 6.0, -3, 0] + df = pl.Series(data).map_elements(str) profiler1 = FloatColumn("Float") profiler1.update(df) - data2 = [0.0, 3.5, "not an int", 125.0, 0, -0.1, -88] - df2 = pd.Series(data2).apply(str) + data2 = [0.0, 3.5, None, 125.0, 0, -0.1, -88] + df2 = pl.Series(data2).map_elements(str) profiler2 = FloatColumn("Float") profiler2.update(df2) @@ -1350,14 +1342,14 @@ def test_profile_merge_for_zeros_and_negatives(self): self.assertEqual(profiler3.num_negatives, expected_profile.pop("num_negatives")) def test_profile_merge_edge_case(self): - data = [2.0, "not a float", 6.0, "not a float"] - df = pd.Series(data).apply(str) + data = [2.0, None, 6.0, None] + df = pl.Series(data).map_elements(str) profiler1 = FloatColumn("Float") profiler1.update(df) profiler1.match_count = 0 - data2 = [10.0, "not a float", 15.0, "not a float"] - df2 = pd.Series(data2).apply(str) + data2 = [10.0, None, 15.0, None] + df2 = pl.Series(data2).map_elements(str) profiler2 = FloatColumn("Float") profiler2.update(df2) @@ -1365,11 +1357,11 @@ def test_profile_merge_edge_case(self): self.assertEqual(profiler3.stddev, profiler2.stddev) # test merge with empty data - df1 = pd.Series([], dtype=object) + df1 = pl.Series([], dtype=object) profiler1 = FloatColumn("Float") profiler1.update(df1) - df2 = pd.Series([], dtype=object) + df2 = pl.Series([], dtype=object) profiler2 = FloatColumn("Float") profiler2.update(df2) @@ -1379,7 +1371,7 @@ def test_profile_merge_edge_case(self): self.assertEqual(profiler.min, None) self.assertEqual(profiler.max, None) - df3 = pd.Series([2.0, 3.0]).apply(str) + df3 = pl.Series([2.0, 3.0]).map_elements(str) profiler3 = FloatColumn("Float") profiler3.update(df3) @@ -1389,7 +1381,7 @@ def test_profile_merge_edge_case(self): self.assertEqual(profiler.min, 2.0) self.assertEqual(profiler.max, 3.0) - df4 = pd.Series([4.0, 5.0]).apply(str) + df4 = pl.Series([4.0, 
5.0]).map_elements(str) profiler4 = FloatColumn("Float") profiler4.update(df4) @@ -1401,7 +1393,7 @@ def test_profile_merge_edge_case(self): self.assertEqual(profiler.num_zeros, 0) self.assertEqual(profiler.num_negatives, 0) - df5 = pd.Series([0.0, 0.0, -1.1, -1.0]).apply(str) + df5 = pl.Series([0.0, 0.0, -1.1, -1.0]).map_elements(str) profiler5 = FloatColumn("Float") profiler5.update(df5) @@ -1416,13 +1408,13 @@ def test_custom_bin_count_merge(self): options = FloatOptions() options.histogram_and_quantiles.bin_count_or_method = 10 - data = [2.0, "not a float", 6.0, "not a float"] - df = pd.Series(data).apply(str) + data = [2.0, None, 6.0, None] + df = pl.Series(data).map_elements(str) profiler1 = FloatColumn("Float", options) profiler1.update(df) - data2 = [10.0, "not a float", 15.0, "not a float"] - df2 = pd.Series(data2).apply(str) + data2 = [10.0, None, 15.0, None] + df2 = pl.Series(data2).map_elements(str) profiler2 = FloatColumn("Float", options) profiler2.update(df2) @@ -1444,13 +1436,13 @@ def test_custom_bin_count_merge(self): def test_profile_merge_no_bin_overlap(self): - data = [2.0, "not a float", 6.0, "not a float"] - df = pd.Series(data).apply(str) + data = [2.0, np.nan, 6.0, np.nan] + df = pl.Series(data, dtype=pl.Float64).map_elements(str) profiler1 = FloatColumn("Float") profiler1.update(df) - data2 = [10.0, "not a float", 15.0, "not a float"] - df2 = pd.Series(data2).apply(str) + data2 = [10.0, np.nan, 15.0, np.nan] + df2 = pl.Series(data2).map_elements(str) profiler2 = FloatColumn("Float") profiler2.update(df2) @@ -1473,7 +1465,7 @@ def test_profile_merge_with_different_options(self): options.histogram_and_quantiles.bin_count_or_method = None data = [2, 4, 6, 8] - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler1 = FloatColumn("Float", options=options) profiler1.update(df) @@ -1484,7 +1476,7 @@ def test_profile_merge_with_different_options(self): options.histogram_and_quantiles.bin_count_or_method = None data2 = [10, 15] - df2 = pd.Series(data2).apply(str) + df2 = pl.Series(data2).map_elements(str) profiler2 = FloatColumn("Float", options=options) profiler2.update(df2) @@ -1521,7 +1513,7 @@ def test_profile_merge_with_different_options(self): options.histogram_and_quantiles.method = None data = [2, 4, 6, 8] - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler1 = FloatColumn("Float", options=options) profiler1.update(df) @@ -1552,13 +1544,13 @@ def test_histogram_option_integration(self): self.assertEqual(["custom"], num_profiler.histogram_bin_method_names) # case when just 1 unique value, should just set bin size to be 1 - num_profiler.update(pd.Series(["1", "1"])) + num_profiler.update(pl.Series(["1", "1"])) self.assertEqual( 1, len(num_profiler.histogram_methods["custom"]["histogram"]["bin_counts"]) ) # case when more than 1 unique value, by virtue of a streaming update - num_profiler.update(pd.Series(["2"])) + num_profiler.update(pl.Series(["2"])) self.assertEqual( 100, len(num_profiler._stored_histogram["histogram"]["bin_counts"]) ) @@ -1580,7 +1572,7 @@ def test_profile_merge_bin_edges_indices(self): 4948484957575651505156554954485054.0, ] - data = pd.Series(vals).astype(str) + data = pl.Series(vals).cast(str) data_1 = data[:5] data_2 = data[5:] @@ -1597,7 +1589,7 @@ def test_profile_merge_bin_edges_indices(self): profile_1 + profile_2 def test_invalid_values(self): - data = pd.Series(["-inf", "inf"]) + data = pl.Series(["-inf", "inf"]) profiler = FloatColumn(data.name) with self.assertWarnsRegex( @@ 
-1611,7 +1603,7 @@ def test_invalid_values(self): self.assertTrue(np.isnan(profiler._biased_kurtosis)) # Update the data - data2 = pd.Series(["-2", "-1", "1", "2", "-inf", "inf"]) + data2 = pl.Series(["-2", "-1", "1", "2", "-inf", "inf"]) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -1627,7 +1619,7 @@ def test_invalid_values(self): self.assertEqual(0, len(w)) def test_insufficient_counts(self): - data = pd.Series(["0"]) + data = pl.Series(["0"]) profiler = FloatColumn(data.name) with warnings.catch_warnings(record=True) as w: @@ -1650,7 +1642,7 @@ def test_insufficient_counts(self): ) # Update the data so that the match count is good - data2 = pd.Series(["-2", "-1", "1", "2"]) + data2 = pl.Series(["-2", "-1", "1", "2"]) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -1668,20 +1660,19 @@ def test_insufficient_counts(self): self.assertEqual(0, len(w)) def test_diff(self): - data = [2.5, 12.5, "not a float", 5, "not a float"] - df = pd.Series(data).apply(str) + data = [2.5, 12.5, None, 5, None] + df = pl.Series(data).map_elements(str) profiler1 = FloatColumn(df.name) profiler1.update(df) profile1 = profiler1.profile data = [1, 15, 0.5, 0] - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler2 = FloatColumn(df.name) profiler2.update(df) profile2 = profiler2.profile # Assert the difference report is correct - diff = profiler1.diff(profiler2) expected_diff = { "max": -2.5, "mean": profile1["mean"] - profile2["mean"], @@ -1841,7 +1832,7 @@ def test_json_encode(self): @mock.patch("time.time", return_value=0.0) def test_json_encode_after_update(self, time): data = np.array([0.0, 5.0, 10.0]) - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) float_options = FloatOptions() float_options.histogram_and_quantiles.bin_count_or_method = 5 @@ -1980,7 +1971,9 @@ def test_json_decode_after_update(self): # Actual deserialization # Build expected FloatColumn - df_float = pd.Series([-1.5, 2.2, 5.0, 7.0, 4.0, 3.0, 2.0, 0, 0, 9.0]).apply(str) + df_float = pl.Series( + [-1.5, 2.2, 5.0, 7.0, 4.0, 3.0, 2.0, 0, 0, 9.0] + ).map_elements(str) expected_profile = FloatColumn(fake_profile_name) with test_utils.mock_timeit(): @@ -1991,12 +1984,12 @@ def test_json_decode_after_update(self): test_utils.assert_profiles_equal(deserialized, expected_profile) - df_float = pd.Series( + df_float = pl.Series( [ 4.0, # add existing 15.0, # add new ] - ).apply(str) + ).map_elements(str) # validating update after deserialization deserialized.update(df_float) From cf685686ad436fac1f9edffff0f514c664e33ad1 Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Wed, 28 Feb 2024 14:38:29 -0600 Subject: [PATCH 08/11] finish int col tests --- dataprofiler/profilers/int_column_profile.py | 11 +- .../profilers/test_int_column_profile.py | 254 +++++++++--------- 2 files changed, 128 insertions(+), 137 deletions(-) diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index 5e1ad6ee..30e7a4a8 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -150,7 +150,7 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: NumericStatsMixin._update_helper(self, df_series_clean, profile) self._update_column_base_properties(profile) - def update(self, df_series: pd.Series) -> IntColumn: + def update(self, df_series: pl.Series) -> IntColumn: """ Update the column profile. 
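PATCH 08 above collapses the 64-bit overflow detection into a dtype check: polars' native integer dtypes top out at Int64/UInt64, so integers wider than 64 bits can only arrive as Python objects in an Object-typed Series. A sketch of the distinction (constructed values, not from the tests):

    import polars as pl

    small = pl.Series([1, 2, 3])                      # Int64
    big = pl.Series([2**70, 2**80], dtype=pl.Object)  # arbitrary-precision ints

    print(small.dtype == pl.Object)  # False
    print(big.dtype == pl.Object)    # True -> sets _greater_than_64_bit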
@@ -159,14 +159,7 @@ def update(self, df_series: pd.Series) -> IntColumn: :return: updated IntColumn :rtype: IntColumn """ - self._greater_than_64_bit = ( - not df_series.empty - and df_series.apply(pd.to_numeric, errors="coerce").dtype == "O" - ) - if self._greater_than_64_bit: - df_series = pl.Series(df_series.to_list(), dtype=pl.Object) - else: - df_series = pl.from_pandas(df_series) + self._greater_than_64_bit = df_series.dtype == pl.Object if len(df_series) == 0: return self diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 718348cf..f3925469 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -6,7 +6,7 @@ from unittest import mock import numpy as np -import pandas as pd +import polars as pl from dataprofiler.profilers import IntColumn from dataprofiler.profilers.json_decoder import load_column_profile @@ -20,7 +20,7 @@ class TestIntColumn(unittest.TestCase): def test_base_case(self): - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = IntColumn(data.name) profiler.update(data) @@ -41,7 +41,7 @@ def test_base_case(self): self.assertIsNone(profiler.data_type_ratio) def test_single_data_variance_case(self): - data = pd.Series([1]) + data = pl.Series([1]) profiler = IntColumn(data.name) profiler.update(data) self.assertEqual(profiler.match_count, 1) @@ -49,7 +49,7 @@ def test_single_data_variance_case(self): self.assertEqual(profiler.mean, 1) self.assertTrue(profiler.variance is np.nan) - data = pd.Series([2]) + data = pl.Series([2]) profiler.update(data) self.assertEqual(profiler.match_count, 2) self.assertEqual(profiler.sum, 3) @@ -58,7 +58,7 @@ def test_single_data_variance_case(self): def test_profiled_min(self): data = np.linspace(-5, 5, 11) - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler = IntColumn(df.name) profiler.update(df[1:]) @@ -67,42 +67,42 @@ def test_profiled_min(self): profiler.update(df) self.assertEqual(profiler.min, -5) - profiler.update(pd.Series(["-4"])) + profiler.update(pl.Series(["-4"])) self.assertEqual(profiler.min, -5) # empty data - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = IntColumn(data.name) profiler.update(data) self.assertEqual(profiler.min, None) # data with None value - df = pd.Series([2, 3, None, np.nan]).apply(str) + df = pl.Series([2, 3, None, np.nan]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertEqual(profiler.min, 2) # data with one value - df = pd.Series([2]).apply(str) + df = pl.Series([2]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertEqual(profiler.min, 2) # data with unique value - df = pd.Series([2, 2, 2, 2, 2]).apply(str) + df = pl.Series([2, 2, 2, 2, 2]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertEqual(profiler.min, 2) # data with unique value as zero - df = pd.Series([0, 0, 0, 0, 0]).apply(str) + df = pl.Series([0, 0, 0, 0, 0]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertEqual(profiler.min, 0) def test_profiled_max(self): data = np.linspace(-5, 5, 11) - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler = IntColumn(df.name) profiler.update(df[:-1]) @@ -111,42 +111,42 @@ def test_profiled_max(self): profiler.update(df) self.assertEqual(profiler.max, 5) - profiler.update(pd.Series(["4"])) + 
profiler.update(pl.Series(["4"])) self.assertEqual(profiler.max, 5) # empty data - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = IntColumn(data.name) profiler.update(data) self.assertEqual(profiler.max, None) # data with None value - df = pd.Series([2, 3, None, np.nan]).apply(str) + df = pl.Series([2, 3, None, np.nan]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertEqual(profiler.max, 3) # data with one value - df = pd.Series([2]).apply(str) + df = pl.Series([2]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertEqual(profiler.max, 2) # data with unique value - df = pd.Series([2, 2, 2, 2, 2]).apply(str) + df = pl.Series([2, 2, 2, 2, 2]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertEqual(profiler.max, 2) # data with unique value as zero - df = pd.Series([0, 0, 0, 0, 0]).apply(str) + df = pl.Series([0, 0, 0, 0, 0]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertEqual(profiler.max, 0) def test_profiled_mode(self): # disabled mode - df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str) + df = pl.Series([1, 1, 1, 1, 1, 1, 1]).map_elements(str) options = IntOptions() options.mode.is_enabled = False profiler = IntColumn(df.name, options) @@ -154,43 +154,43 @@ def test_profiled_mode(self): self.assertListEqual([np.nan], profiler.mode) # same values - df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str) + df = pl.Series([1, 1, 1, 1, 1, 1, 1]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertListEqual([1], profiler.mode) # multiple modes - df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]).apply(str) + df = pl.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) np.testing.assert_array_almost_equal([1, 2, 3, 4, 5], profiler.mode, decimal=2) # with different values - df = pd.Series([1, 1, 1, 1, 2]).apply(str) + df = pl.Series([1, 1, 1, 1, 2]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) np.testing.assert_array_almost_equal([1], profiler.mode, decimal=2) # with negative values - df = pd.Series([-1, 1, 1, 1, 2, 2, 2]) + df = pl.Series([-1, 1, 1, 1, 2, 2, 2]) profiler = IntColumn(df.name) profiler.update(df) np.testing.assert_array_almost_equal([1, 2], profiler.mode, decimal=2) # all unique values - df = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).apply(str) + df = pl.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) # By default, returns 5 of the possible modes np.testing.assert_array_almost_equal([1, 2, 3, 4, 5], profiler.mode, decimal=2) # Edge case where mode appears later in the dataset - df = pd.Series([1, 2, 3, 4, 5, 6, 6]).apply(str) + df = pl.Series([1, 2, 3, 4, 5, 6, 6]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) np.testing.assert_array_almost_equal([6], profiler.mode, decimal=2) - df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7]).apply(str) + df = pl.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) np.testing.assert_array_almost_equal([7], profiler.mode, decimal=2) @@ -198,7 +198,7 @@ def test_profiled_mode(self): def test_top_k_modes(self): # Default options options = IntOptions() - df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).apply(str) + df = pl.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).map_elements(str) profiler = IntColumn(df.name, options) profiler.update(df) 
self.assertEqual(5, len(profiler.mode)) @@ -206,7 +206,7 @@ def test_top_k_modes(self): # Test if top_k_modes is less than the number of modes options = IntOptions() options.mode.top_k_modes = 2 - df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).apply(str) + df = pl.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).map_elements(str) profiler = IntColumn(df.name, options) profiler.update(df) self.assertEqual(2, len(profiler.mode)) @@ -214,7 +214,7 @@ def test_top_k_modes(self): # Test if top_k_mode is greater than the number of modes options = IntOptions() options.mode.top_k_modes = 8 - df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).apply(str) + df = pl.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).map_elements(str) profiler = IntColumn(df.name, options) profiler.update(df) # Only 5 possible modes so return 5 @@ -222,7 +222,7 @@ def test_top_k_modes(self): def test_profiled_median(self): # disabled median - df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str) + df = pl.Series([1, 1, 1, 1, 1, 1, 1]).map_elements(str) options = IntOptions() options.median.is_enabled = False profiler = IntColumn(df.name, options) @@ -230,31 +230,31 @@ def test_profiled_median(self): self.assertTrue(profiler.median is np.nan) # same values - df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str) + df = pl.Series([1, 1, 1, 1, 1, 1, 1]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertEqual(1, profiler.median) # median lies between two values s - df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]).apply(str) + df = pl.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertAlmostEqual(3.5, profiler.median, places=2) # with different values - df = pd.Series([1, 1, 1, 1, 2]).apply(str) + df = pl.Series([1, 1, 1, 1, 2]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertAlmostEqual(1, profiler.median, places=2) # with negative values - df = pd.Series([-1, 1, 1, 1, 2, 2, 2]) + df = pl.Series([-1, 1, 1, 1, 2, 2, 2]) profiler = IntColumn(df.name) profiler.update(df) self.assertAlmostEqual(1, profiler.median, places=2) # all unique values - df = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).apply(str) + df = pl.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertAlmostEqual(5.5, profiler.median, places=2) @@ -286,22 +286,22 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): return M2 / (count_a + count_b - 1) data = np.linspace(-5, 5, 11).tolist() - df1 = pd.Series(data) + df1 = pl.Series(data) data = np.linspace(-3, 2, 11).tolist() - df2 = pd.Series(data) + df2 = pl.Series(data) data = np.full((10,), 1) - df3 = pd.Series(data) + df3 = pl.Series(data) num_profiler = IntColumn(df1.name) - num_profiler.update(df1.apply(str)) + num_profiler.update(df1.map_elements(str)) self.assertEqual(mean(df1), num_profiler.mean) self.assertEqual(var(df1), num_profiler.variance) self.assertEqual(np.sqrt(var(df1)), num_profiler.stddev) - df2_ints = df2[df2 == df2.round()] + df2_ints = df2.filter(df2 == df2.round()) variance = batch_variance( mean_a=num_profiler.mean, var_a=num_profiler.variance, @@ -310,13 +310,13 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): var_b=var(df2_ints), count_b=df2_ints.count(), ) - num_profiler.update(df2.apply(str)) - df = pd.concat([df1, df2_ints]) + num_profiler.update(df2.map_elements(str)) + df = pl.concat([df1, df2_ints]) self.assertEqual(mean(df), num_profiler.mean) self.assertEqual(variance, 
num_profiler.variance) self.assertEqual(np.sqrt(variance), num_profiler.stddev) - df3_ints = df3[df3 == df3.round()] + df3_ints = df3.filter(df3 == df3) variance = batch_variance( mean_a=num_profiler.mean, var_a=num_profiler.variance, @@ -325,118 +325,117 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): var_b=var(df3_ints), count_b=df3_ints.count(), ) - num_profiler.update(df3.apply(str)) + num_profiler.update(df3.map_elements(str)) - df = pd.concat([df1, df2_ints, df3_ints]) + df = pl.concat([df1, df2_ints.cast(pl.Float64), df3_ints.cast(pl.Float64)]) self.assertEqual(mean(df), num_profiler.mean) self.assertAlmostEqual(variance, num_profiler.variance) self.assertAlmostEqual(np.sqrt(variance), num_profiler.stddev) def test_profiled_skewness(self): data = np.linspace(-5, 5, 11).tolist() - df1 = pd.Series(data) + df1 = pl.Series(data) data = np.linspace(-3, 2, 11).tolist() - df2 = pd.Series(data) + df2 = pl.Series(data) data = np.full((10,), 1) - df3 = pd.Series(data) + df3 = pl.Series(data) num_profiler = IntColumn(df1.name) - num_profiler.update(df1.apply(str)) + num_profiler.update(df1.map_elements(str)) self.assertEqual(0, num_profiler.skewness) - df2_ints = df2[df2 == df2.round()] - num_profiler.update(df2.apply(str)) - df = pd.concat([df1, df2_ints]) + df2_ints = df2.filter(df2 == df2.round()) + num_profiler.update(df2.map_elements(str)) + df = pl.concat([df1, df2_ints]) self.assertAlmostEqual(11 * np.sqrt(102 / 91) / 91, num_profiler.skewness) - df3_ints = df3[df3 == df3.round()] - num_profiler.update(df3.apply(str)) - df = pd.concat([df1, df2_ints, df3_ints]) + df3_ints = df3.filter(df3 == df3) + num_profiler.update(df3.map_elements(str)) + df = pl.concat([df1, df2_ints.cast(pl.Float64), df3_ints.cast(pl.Float64)]) self.assertAlmostEqual(-6789 * np.sqrt(39 / 463) / 4630, num_profiler.skewness) def test_profiled_kurtosis(self): data = np.linspace(-5, 5, 11).tolist() - df1 = pd.Series(data) + df1 = pl.Series(data) data = np.linspace(-3, 2, 11).tolist() - df2 = pd.Series(data) + df2 = pl.Series(data) data = np.full((10,), 1) - df3 = pd.Series(data) + df3 = pl.Series(data) num_profiler = IntColumn(df1.name) - num_profiler.update(df1.apply(str)) + num_profiler.update(df1.map_elements(str)) self.assertAlmostEqual(-6 / 5, num_profiler.kurtosis) - df2_ints = df2[df2 == df2.round()] - num_profiler.update(df2.apply(str)) - df = pd.concat([df1, df2_ints]) + df2_ints = df2.filter(df2 == df2.round()) + num_profiler.update(df2.map_elements(str)) + df = pl.concat([df1, df2_ints]) self.assertAlmostEqual(-29886 / 41405, num_profiler.kurtosis) - df3_ints = df3[df3 == df3.round()] - num_profiler.update(df3.apply(str)) - df = pd.concat([df1, df2_ints, df3_ints]) + df3_ints = df3.filter(df3 == df3) + num_profiler.update(df3.map_elements(str)) + df = pl.concat([df1, df2_ints.cast(pl.Float64), df3_ints.cast(pl.Float64)]) self.assertAlmostEqual(16015779 / 42873800, num_profiler.kurtosis) def test_bias_correction_option(self): data = np.linspace(-5, 5, 11).tolist() - df1 = pd.Series(data) + df1 = pl.Series(data) data = np.linspace(-3, 2, 11).tolist() - df2 = pd.Series(data) + df2 = pl.Series(data) data = np.full((10,), 1) - df3 = pd.Series(data) + df3 = pl.Series(data) # Disable bias correction options = IntOptions() options.bias_correction.is_enabled = False num_profiler = IntColumn(df1.name, options=options) - num_profiler.update(df1.apply(str)) + num_profiler.update(df1.map_elements(str)) self.assertAlmostEqual(10, num_profiler.variance) self.assertAlmostEqual(0, 
num_profiler.skewness) self.assertAlmostEqual(89 / 50 - 3, num_profiler.kurtosis) - df2_ints = df2[df2 == df2.round()] - num_profiler.update(df2.apply(str)) - df = pd.concat([df1, df2_ints]) + df2_ints = df2.filter(df2 == df2.round()) + num_profiler.update(df2.map_elements(str)) + df = pl.concat([df1, df2_ints]) self.assertAlmostEqual(2184 / 289, num_profiler.variance) self.assertAlmostEqual(165 * np.sqrt(3 / 182) / 182, num_profiler.skewness) self.assertAlmostEqual(60769 / 28392 - 3, num_profiler.kurtosis) - df3_ints = df3[df3 == df3.round()] - num_profiler.update(df3.apply(str)) - df = pd.concat([df1, df2_ints, df3_ints]) + df3_ints = df3.filter(df3 == df3) + num_profiler.update(df3.map_elements(str)) + df = pl.concat([df1, df2_ints.cast(pl.Float64), df3_ints.cast(pl.Float64)]) self.assertAlmostEqual(3704 / 729, num_profiler.variance) self.assertAlmostEqual(-11315 / (926 * np.sqrt(926)), num_profiler.skewness) self.assertAlmostEqual(5305359 / 1714952 - 3, num_profiler.kurtosis) def test_bias_correction_merge(self): data = np.linspace(-5, 5, 11).tolist() - df1 = pd.Series(data) + df1 = pl.Series(data) data = np.linspace(-3, 2, 11).tolist() - df2 = pd.Series(data) + df2 = pl.Series(data) data = np.full((10,), 1) - df3 = pd.Series(data) + df3 = pl.Series(data) # Disable bias correction options = IntOptions() options.bias_correction.is_enabled = False num_profiler1 = IntColumn(df1.name, options=options) - num_profiler1.update(df1.apply(str)) + num_profiler1.update(df1.map_elements(str)) self.assertAlmostEqual(10, num_profiler1.variance) self.assertAlmostEqual(0, num_profiler1.skewness) self.assertAlmostEqual(89 / 50 - 3, num_profiler1.kurtosis) - df2_ints = df2[df2 == df2.round()] num_profiler2 = IntColumn(df2.name) - num_profiler2.update(df2.apply(str)) + num_profiler2.update(df2.map_elements(str)) num_profiler_merged = num_profiler1 + num_profiler2 # Values should stay biased values self.assertFalse(num_profiler_merged.bias_correction) @@ -446,9 +445,8 @@ def test_bias_correction_merge(self): ) self.assertAlmostEqual(60769 / 28392 - 3, num_profiler_merged.kurtosis) - df3_ints = df3[df3 == df3.round()] num_profiler3 = IntColumn(df3.name) - num_profiler3.update(df3.apply(str)) + num_profiler3.update(df3.map_elements(str)) num_profiler_merged = num_profiler1 + num_profiler2 + num_profiler3 self.assertFalse(num_profiler_merged.bias_correction) self.assertAlmostEqual(3704 / 729, num_profiler_merged.variance) @@ -492,7 +490,7 @@ def test_profiled_histogram(self): list_data_test.append([data3, expected_histogram3]) for data, expected_histogram in list_data_test: - df = pd.Series(data) + df = pl.Series(data) profiler = IntColumn(df.name) profiler.update(df) @@ -510,19 +508,19 @@ def test_profiled_histogram(self): def test_data_type_ratio(self): data = np.linspace(-5, 5, 11) - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler = IntColumn(df.name) profiler.update(df) self.assertEqual(profiler.data_type_ratio, 1.0) - df = pd.Series(["not a float", "0.1"]) + df = pl.Series([None, "0.1"]) profiler.update(df) self.assertEqual(profiler.data_type_ratio, 11 / 13.0) def test_profile(self): - data = [2.0, 12.5, "not a float", 6.0, "not a float"] - df = pd.Series(data).apply(str) + data = [2.0, 12.5, None, 6.0, None] + df = pl.Series(data).map_elements(str) profiler = IntColumn(df.name) @@ -642,8 +640,8 @@ def test_profile(self): self.assertEqual(expected, profiler.profile["times"]) def test_option_timing(self): - data = [2.0, 12.5, "not a float", 6.0, "not a float"] - df = 
pd.Series(data).apply(str) + data = [2.0, 12.5, None, 6.0, None] + df = pl.Series(data).map_elements(str) options = IntOptions() options.set({"min.is_enabled": False}) @@ -694,13 +692,13 @@ def test_option_timing(self): def test_profile_merge(self): # Floats are not included intentionally for the test # below as this is an int column - data = [2.0, 12.5, "not an int", 6.0, "not an int"] - df = pd.Series(data).apply(str) + data = [2.0, 12.5, None, 6.0, None] + df = pl.Series(data).map_elements(str) profiler1 = IntColumn("Int") profiler1.update(df) - data2 = [10.0, 3.5, "not an int", 15.0, "not an int"] - df2 = pd.Series(data2).apply(str) + data2 = [10.0, 3.5, None, 15.0, None] + df2 = pl.Series(data2).map_elements(str) profiler2 = IntColumn("Int") profiler2.update(df2) @@ -749,13 +747,13 @@ def test_profile_merge(self): self.assertCountEqual(histogram["bin_edges"], expected_histogram["bin_edges"]) def test_profile_merge_for_zeros_and_negatives(self): - data = [2.0, 8.5, "not an int", 6.0, -3, 0] - df = pd.Series(data).apply(str) + data = [2.0, 8.5, None, 6.0, -3, 0] + df = pl.Series(data).map_elements(str) profiler1 = IntColumn("Int") profiler1.update(df) - data2 = [0.0, 3.5, "not an int", 125.0, 0, -0.1, -88] - df2 = pd.Series(data2).apply(str) + data2 = [0.0, 3.5, None, 125.0, 0, -0.1, -88] + df2 = pl.Series(data2).map_elements(str) profiler2 = IntColumn("Int") profiler2.update(df2) @@ -767,14 +765,14 @@ def test_profile_merge_for_zeros_and_negatives(self): self.assertEqual(profiler3.num_negatives, expected_profile.pop("num_negatives")) def test_profile_merge_edge_case(self): - data = [2.0, 12.5, "not a float", 6.0, "not a float"] - df = pd.Series(data).apply(str) + data = [2.0, 12.5, None, 6.0, None] + df = pl.Series(data).map_elements(str) profiler1 = IntColumn(name="Int") profiler1.update(df) profiler1.match_count = 0 - data2 = [10.0, 3.5, "not a float", 15.0, "not a float"] - df2 = pd.Series(data2).apply(str) + data2 = [10.0, 3.5, None, 15.0, None] + df2 = pl.Series(data2).map_elements(str) profiler2 = IntColumn(name="Int") profiler2.update(df2) @@ -782,11 +780,11 @@ def test_profile_merge_edge_case(self): self.assertEqual(profiler3.stddev, profiler2.stddev) # test merge with empty data - df1 = pd.Series([], dtype=object) + df1 = pl.Series([], dtype=object) profiler1 = IntColumn("Int") profiler1.update(df1) - df2 = pd.Series([], dtype=object) + df2 = pl.Series([], dtype=object) profiler2 = IntColumn("Int") profiler2.update(df2) @@ -797,7 +795,7 @@ def test_profile_merge_edge_case(self): self.assertTrue(np.isnan(profiler.kurtosis)) self.assertIsNone(profiler.histogram_selection) - df3 = pd.Series([2, 3]).apply(str) + df3 = pl.Series([2, 3]).map_elements(str) profiler3 = IntColumn("Int") profiler3.update(df3) @@ -809,7 +807,7 @@ def test_profile_merge_edge_case(self): self.assertEqual(profiler.num_zeros, 0) self.assertEqual(profiler.num_negatives, 0) - df4 = pd.Series([4, 5]).apply(str) + df4 = pl.Series([4, 5]).map_elements(str) profiler4 = IntColumn("Int") profiler4.update(df4) @@ -821,7 +819,7 @@ def test_profile_merge_edge_case(self): self.assertEqual(profiler.num_zeros, 0) self.assertEqual(profiler.num_negatives, 0) - df5 = pd.Series([0, 0, -1]).apply(str) + df5 = pl.Series([0, 0, -1]).map_elements(str) profiler5 = IntColumn("Int") profiler5.update(df5) @@ -836,13 +834,13 @@ def test_custom_bin_count_merge(self): options = IntOptions() options.histogram_and_quantiles.bin_count_or_method = 10 - data = [2, "not an int", 6, "not an int"] - df = pd.Series(data).apply(str) + data = [2, 
None, 6, None] + df = pl.Series(data).map_elements(str) profiler1 = IntColumn("Int", options) profiler1.update(df) - data2 = [10, "not an int", 15, "not an int"] - df2 = pd.Series(data2).apply(str) + data2 = [10, None, 15, None] + df2 = pl.Series(data2).map_elements(str) profiler2 = IntColumn("Int", options) profiler2.update(df2) @@ -866,14 +864,14 @@ def test_custom_bin_count_merge(self): def test_profile_merge_no_bin_overlap(self): - data = [2, "not an int", 6, "not an int"] - df = pd.Series(data).apply(str) + data = [2, None, 6, None] + df = pl.Series(data).map_elements(str) profiler1 = IntColumn("Int") profiler1.update(df) profiler1.match_count = 0 - data2 = [10, "not an int", 15, "not an int"] - df2 = pd.Series(data2).apply(str) + data2 = [10, None, 15, None] + df2 = pl.Series(data2).map_elements(str) profiler2 = IntColumn("Int") profiler2.update(df2) @@ -895,7 +893,7 @@ def test_profile_merge_with_different_options(self): options.min.is_enabled = False data = [2, 4, 6, 8] - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler1 = IntColumn("Int", options=options) profiler1.update(df) profiler1.match_count = 0 @@ -904,7 +902,7 @@ def test_profile_merge_with_different_options(self): options = IntOptions() options.min.is_enabled = False data2 = [10, 15] - df2 = pd.Series(data2).apply(str) + df2 = pl.Series(data2).map_elements(str) profiler2 = IntColumn("Int", options=options) profiler2.update(df2) @@ -950,13 +948,13 @@ def test_histogram_option_integration(self): self.assertEqual(["custom"], num_profiler.histogram_bin_method_names) # case when just 1 unique value, should just set bin size to be 1 - num_profiler.update(pd.Series(["1", "1"])) + num_profiler.update(pl.Series(["1", "1"])) self.assertEqual( 1, len(num_profiler.histogram_methods["custom"]["histogram"]["bin_counts"]) ) # case when more than 1 unique value, by virtue of a streaming update - num_profiler.update(pd.Series(["2"])) + num_profiler.update(pl.Series(["2"])) self.assertEqual( 100, len(num_profiler._stored_histogram["histogram"]["bin_counts"]) ) @@ -978,7 +976,7 @@ def test_profile_merge_bin_edges_indices(self): 4948484957575651505156554954485054, ] - data = pd.Series(vals) + data = pl.Series(vals, dtype=pl.Object) data_1 = data[:5] data_2 = data[5:] @@ -995,7 +993,7 @@ def test_profile_merge_bin_edges_indices(self): profile_1 + profile_2 def test_insufficient_counts(self): - data = pd.Series(["1"]) + data = pl.Series(["1"]) profiler = IntColumn(data.name) with warnings.catch_warnings(record=True) as w: @@ -1018,7 +1016,7 @@ def test_insufficient_counts(self): ) # Update the data so that the match count is good - data2 = pd.Series(["-2", "-1", "1", "2"]) + data2 = pl.Series(["-2", "-1", "1", "2"]) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -1039,13 +1037,13 @@ def test_diff(self): """ Makes sure the IntColumn Diff() works appropriately. 
""" - data = [2, "not an int", 6, 4] - df = pd.Series(data).apply(str) + data = [2, None, 6, 4] + df = pl.Series(data).map_elements(str) profiler1 = IntColumn("Int") profiler1.update(df) data = [1, 15] - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler2 = IntColumn("Int") profiler2.update(df) @@ -1189,7 +1187,7 @@ def test_json_encode(self): @mock.patch("time.time", return_value=0.0) def test_json_encode_after_update(self, time): data = np.array([0, 5, 10]) - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) int_options = IntOptions() int_options.histogram_and_quantiles.bin_count_or_method = 5 @@ -1317,7 +1315,7 @@ def test_json_decode_after_update(self): # Actual deserialization # Build expected IntColumn - df_int = pd.Series([-1, 2, 5, 7, 4, 3, 2, 0, 0, 9]) + df_int = pl.Series([-1, 2, 5, 7, 4, 3, 2, 0, 0, 9]) expected_profile = IntColumn(fake_profile_name) with test_utils.mock_timeit(): @@ -1333,7 +1331,7 @@ def test_json_decode_after_update(self): deserialized.report() test_utils.assert_profiles_equal(deserialized, expected_profile) - df_int = pd.Series( + df_int = pl.Series( [ 4, # add existing 15, # add new From c0d90a2dce5e2328bb028fb6e25d283a370ceaa5 Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Wed, 28 Feb 2024 16:39:24 -0600 Subject: [PATCH 09/11] update text profiler tests --- dataprofiler/profilers/text_column_profile.py | 13 +- .../profilers/test_text_column_profile.py | 126 +++++++++--------- 2 files changed, 71 insertions(+), 68 deletions(-) diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index 5e5098f6..f2ea321e 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -4,7 +4,6 @@ import itertools import numpy as np -import pandas as pd import polars as pl from . import profiler_utils @@ -166,12 +165,14 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: """ if self._NumericStatsMixin__calculations: text_lengths = df_series_clean.str.len_chars() - NumericStatsMixin._update_helper(self, text_lengths.to_pandas(), profile) + NumericStatsMixin._update_helper( + self, text_lengths.drop_nulls().to_pandas(), profile + ) self._update_column_base_properties(profile) if self.max: self.type = "string" if self.max <= 255 else "text" - def update(self, df_series: pd.Series) -> TextColumn: + def update(self, df_series: pl.Series) -> TextColumn: """ Update the column profile. 
@@ -180,17 +181,17 @@ def update(self, df_series: pd.Series) -> TextColumn: :return: updated TextColumn :rtype: TextColumn """ - df_series = pl.from_pandas(df_series) len_df = len(df_series) if len_df == 0: return self - profile = dict(match_count=len_df, sample_size=len_df) + no_nulls_length = len(df_series.drop_nulls()) + profile = dict(match_count=no_nulls_length, sample_size=no_nulls_length) BaseColumnProfiler._perform_property_calcs( self, self.__calculations, - df_series=df_series, + df_series=df_series.drop_nulls(), prev_dependent_properties={}, subset_properties=profile, ) diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index 61a54afe..c9e37ab4 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -6,7 +6,7 @@ from unittest import mock import numpy as np -import pandas as pd +import polars as pl from dataprofiler.profilers import TextColumn, profiler_utils from dataprofiler.profilers.json_decoder import load_column_profile @@ -26,7 +26,7 @@ def test_profiled_vocab(self): Checks whether the vocab list for the profiler is correct. :return: """ - df1 = pd.Series( + df1 = pl.Series( [ "abcd", "aa", @@ -39,35 +39,35 @@ def test_profiled_vocab(self): "dfd", "2", ] - ).apply(str) - df2 = pd.Series( + ).map_elements(str) + df2 = pl.Series( ["1", "1", "ee", "ff", "ff", "gg", "gg", "abcd", "aa", "b", "ee", "b"] - ).apply(str) - df3 = pd.Series( + ).map_elements(str) + df3 = pl.Series( [ "NaN", "b", "nan", "c", ] - ).apply(str) + ).map_elements(str) text_profiler = TextColumn(df1.name) text_profiler.update(df1) - unique_vocab = dict.fromkeys("".join(df1.tolist())).keys() + unique_vocab = dict.fromkeys("".join(df1.to_list())).keys() self.assertCountEqual(unique_vocab, text_profiler.vocab) self.assertCountEqual(set(text_profiler.vocab), text_profiler.vocab) text_profiler.update(df2) - df = pd.concat([df1, df2]) - unique_vocab = dict.fromkeys("".join(df.tolist())).keys() + df = pl.concat([df1, df2]) + unique_vocab = dict.fromkeys("".join(df.to_list())).keys() self.assertCountEqual(unique_vocab, text_profiler.vocab) self.assertCountEqual(set(text_profiler.vocab), text_profiler.vocab) text_profiler.update(df3) - df = pd.concat([df1, df2, df3]) - unique_vocab = dict.fromkeys("".join(df.tolist())).keys() + df = pl.concat([df1, df2, df3]) + unique_vocab = dict.fromkeys("".join(df.to_list())).keys() self.assertCountEqual(unique_vocab, text_profiler.vocab) def test_profiled_str_numerics(self): @@ -96,7 +96,7 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): M2 = m_a + m_b + delta**2 * count_a * count_b / (count_a + count_b) return M2 / (count_a + count_b - 1) - df1 = pd.Series( + df1 = pl.Series( [ "abcd", "aa", @@ -110,11 +110,11 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): "2", np.nan, ] - ).apply(str) - df2 = pd.Series( + ).map_elements(str) + df2 = pl.Series( ["1", "1", "ee", "ff", "ff", "gg", "gg", "abcd", "aa", "b", "ee", "b"] - ).apply(str) - df3 = pd.Series( + ).map_elements(str) + df3 = pl.Series( [ "NaN", "b", @@ -122,26 +122,28 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): "c", None, ] - ).apply(str) + ).map_elements(str) text_profiler = TextColumn(df1.name) text_profiler.update(df1) - - self.assertEqual(mean(df1.str.len()), text_profiler.mean) - self.assertAlmostEqual(var(df1.str.len()), text_profiler.variance) - 
self.assertAlmostEqual(np.sqrt(var(df1.str.len())), text_profiler.stddev) - + self.assertEqual(mean(df1.str.len_chars().drop_nulls()), text_profiler.mean) + self.assertAlmostEqual( + var(df1.str.len_chars().drop_nulls()), text_profiler.variance + ) + self.assertAlmostEqual( + np.sqrt(var(df1.str.len_chars().drop_nulls())), text_profiler.stddev + ) variance = batch_variance( mean_a=text_profiler.mean, var_a=text_profiler.variance, count_a=text_profiler.sample_size, - mean_b=mean(df2.str.len()), - var_b=var(df2.str.len()), + mean_b=mean(df2.str.len_chars()), + var_b=var(df2.str.len_chars()), count_b=df2.count(), ) text_profiler.update(df2) - df = pd.concat([df1, df2]) - self.assertEqual(df.str.len().mean(), text_profiler.mean) + df = pl.concat([df1, df2]) + self.assertEqual(df.str.len_chars().drop_nulls().mean(), text_profiler.mean) self.assertAlmostEqual(variance, text_profiler.variance) self.assertAlmostEqual(np.sqrt(variance), text_profiler.stddev) @@ -149,19 +151,19 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): mean_a=text_profiler.mean, var_a=text_profiler.variance, count_a=text_profiler.match_count, - mean_b=mean(df3.str.len()), - var_b=var(df3.str.len()), + mean_b=mean(df3.str.len_chars().drop_nulls()), + var_b=var(df3.str.len_chars().drop_nulls()), count_b=df3.count(), ) text_profiler.update(df3) - df = pd.concat([df1, df2, df3]) - self.assertEqual(df.str.len().mean(), text_profiler.mean) + df = pl.concat([df1, df2, df3]) + self.assertEqual(df.str.len_chars().drop_nulls().mean(), text_profiler.mean) self.assertAlmostEqual(variance, text_profiler.variance) self.assertAlmostEqual(np.sqrt(variance), text_profiler.stddev) def test_base_case(self): - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = TextColumn(data.name) profiler.update(data) profiler.update(data) # intentional to validate no changes if empty @@ -174,7 +176,7 @@ def test_base_case(self): def test_data_ratio(self): # should always be 1.0 unless empty - df1 = pd.Series( + df1 = pl.Series( [ "abcd", "aa", @@ -187,7 +189,7 @@ def test_data_ratio(self): "dfd", "2", ] - ).apply(str) + ).map_elements(str) profiler = TextColumn(df1.name) profiler.update(df1) @@ -198,31 +200,31 @@ def test_data_ratio(self): self.assertEqual(profiler.data_type_ratio, 1.0) def test_profiled_min(self): - df = pd.Series(["aaa", "aa", "aaaa", "aaa"]).apply(str) + df = pl.Series(["aaa", "aa", "aaaa", "aaa"]).map_elements(str) profiler = TextColumn(df.name) profiler.update(df) self.assertEqual(profiler.min, 2) - df = pd.Series(["aa", "a"]).apply(str) + df = pl.Series(["aa", "a"]).map_elements(str) profiler.update(df) self.assertEqual(profiler.min, 1) def test_profiled_max(self): - df = pd.Series(["a", "aa", "a", "a"]).apply(str) + df = pl.Series(["a", "aa", "a", "a"]).map_elements(str) profiler = TextColumn(df.name) profiler.update(df) self.assertEqual(profiler.max, 2) - df = pd.Series(["aa", "aaa", "a"]).apply(str) + df = pl.Series(["aa", "aaa", "a"]).map_elements(str) profiler.update(df) self.assertEqual(profiler.max, 3) def test_profile(self): - df = pd.Series( + df = pl.Series( ["abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2"] - ).apply(str) + ).map_elements(str) profiler = TextColumn(df.name) expected_profile = dict( min=1.0, @@ -302,7 +304,7 @@ def test_report(self): `remove_disabled_flag`. 
""" data = [2.0, 12.5, "not a float", 6.0, "not a float"] - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) options = TextOptions() # With TextOptions and remove_disabled_flag == True options.vocab.is_enabled = False @@ -327,8 +329,8 @@ def test_report(self): self.assertIn("vocab", report_keys) def test_option_timing(self): - data = [2.0, 12.5, "not a float", 6.0, "not a float"] - df = pd.Series(data).apply(str) + data = ["2.0", "12.5", "not a float", "6.0", "not a float"] + df = pl.Series(data).map_elements(str) options = TextOptions() options.set({"min.is_enabled": False}) @@ -376,13 +378,13 @@ def test_option_timing(self): self.assertCountEqual(expected, profiler.profile["times"]) def test_merge_profile(self): - df = pd.Series( + df = pl.Series( ["abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2"] - ).apply(str) + ).map_elements(str) - df2 = pd.Series( + df2 = pl.Series( ["hello", "my", "name", "is", "Grant", "I", "have", "67", "dogs"] - ).apply(str) + ).map_elements(str) expected_vocab = [ "a", @@ -450,7 +452,7 @@ def test_profile_merge_with_different_options(self): options.min.is_enabled = False options.histogram_and_quantiles.bin_count_or_method = None - df = pd.Series( + df = pl.Series( ["pancake", "banana", "lighthouse", "aa", "b", "4", "3", "2", "dfd", "2"] ) @@ -463,7 +465,7 @@ def test_profile_merge_with_different_options(self): options.max.is_enabled = False options.vocab.is_enabled = False options.histogram_and_quantiles.bin_count_or_method = None - df2 = pd.Series( + df2 = pl.Series( ["hello", "my", "name", "is", "Grant", "I", "have", "67", "dogs"] ) profiler2 = TextColumn("Text", options=options) @@ -499,12 +501,12 @@ def test_custom_bin_count_merge(self): options.histogram_and_quantiles.bin_count_or_method = 10 data = ["this", "is", "a", "test"] - df = pd.Series(data).apply(str) + df = pl.Series(data).map_elements(str) profiler1 = TextColumn("Float", options) profiler1.update(df) data2 = ["this", "is", "another", "test"] - df2 = pd.Series(data2).apply(str) + df2 = pl.Series(data2).map_elements(str) profiler2 = TextColumn("Float", options) profiler2.update(df2) @@ -543,13 +545,13 @@ def test_histogram_option_integration(self): self.assertEqual(["custom"], num_profiler.histogram_bin_method_names) # case when just 1 unique value, should just set bin size to be 1 - num_profiler.update(pd.Series(["1", "1"])) + num_profiler.update(pl.Series(["1", "1"])) self.assertEqual( 1, len(num_profiler.histogram_methods["custom"]["histogram"]["bin_counts"]) ) # case when more than 1 unique value, by virtue of a streaming update - num_profiler.update(pd.Series(["22"])) + num_profiler.update(pl.Series(["22"])) self.assertEqual( 100, len(num_profiler._stored_histogram["histogram"]["bin_counts"]) ) @@ -558,13 +560,13 @@ def test_histogram_option_integration(self): self.assertEqual(100, len(histogram["bin_counts"])) def test_diff(self): - df = pd.Series( + df = pl.Series( ["abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2"] - ).apply(str) + ).map_elements(str) - df2 = pd.Series( + df2 = pl.Series( ["hello", "my", "name", "is", "Grant", "I", "have", "67", "dogs"] - ).apply(str) + ).map_elements(str) profiler1 = TextColumn(df.name) profiler1.update(df) @@ -616,7 +618,7 @@ def test_diff(self): @mock.patch("time.time", return_value=0.0) def test_json_encode_after_update(self, time): - df = pd.Series( + df = pl.Series( [ "abcd", "aa", @@ -630,7 +632,7 @@ def test_json_encode_after_update(self, time): "2", "12.32", ] - ).apply(str) + ).map_elements(str) 
text_options = TextOptions() text_options.histogram_and_quantiles.bin_count_or_method = 5 @@ -711,7 +713,7 @@ def test_json_encode_after_update(self, time): "kurtosis": "_get_kurtosis", "histogram_and_quantiles": "_get_histogram_and_quantiles", }, - "name": None, + "name": "", "col_index": np.nan, "sample_size": 11, "metadata": {}, @@ -754,7 +756,7 @@ def test_json_decode_after_update(self): # Actual deserialization # Build expected IntColumn - df_int = pd.Series( + df_int = pl.Series( [ "abcd", "aa", @@ -784,7 +786,7 @@ def test_json_decode_after_update(self): deserialized.report() test_utils.assert_profiles_equal(deserialized, expected_profile) - df_str = pd.Series( + df_str = pl.Series( [ "aa", # add existing "awsome", # add new From a9da02eed0348e94254389b197a28f830dfb0849 Mon Sep 17 00:00:00 2001 From: Andrew Li Date: Sun, 3 Mar 2024 17:01:25 -0600 Subject: [PATCH 10/11] fully finished --- .../profilers/float_column_profile.py | 17 +- dataprofiler/profilers/int_column_profile.py | 8 +- .../profilers/numerical_column_stats.py | 153 ++++++++---------- dataprofiler/profilers/text_column_profile.py | 10 +- .../test_numeric_stats_mixin_profile.py | 18 +-- 5 files changed, 95 insertions(+), 111 deletions(-) diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index f0783747..19bb19c6 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -280,7 +280,7 @@ def _get_float_precision( :param df_series_clean: df series with nulls removed, assumes all values are floats as well - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.series.series.Series :param sample_ratio: Ratio of samples used for float precision :type sample_ratio: float (between 0 and 1) :return: string representing its precision print format @@ -332,9 +332,9 @@ def _is_each_row_float(cls, df_series: pl.Series) -> pl.Series: For column [1.0, np.NaN, 1.0] returns [True, True, True] For column [1.0, "a", "b"] returns [True, False, False] :param df_series: series of values to evaluate - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: is_float_col - :rtype: Union[List[bool], pandas.Series[bool]] + :rtype: pl.Series """ if len(df_series) == 0: return pl.Series() @@ -361,7 +361,7 @@ def _update_precision( subset before they are merged into the main data profile. :type subset_properties: dict :param df_series: Data to be profiled - :type df_series: pandas.DataFrame + :type df_series: polars.DataFrame :return: None """ sample_ratio = None @@ -403,19 +403,18 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: Update column profile properties with cleaned dataset and its known profile. :param df_series_clean: df series with nulls removed - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.series.series.Series :param profile: float profile dictionary :type profile: dict :return: None """ - df_series_clean = df_series_clean.to_pandas() if self._NumericStatsMixin__calculations: NumericStatsMixin._update_helper(self, df_series_clean, profile) self._update_column_base_properties(profile) def _update_numeric_stats( self, - df_series: pl.DataFrame, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -430,7 +429,7 @@ def _update_numeric_stats( subset before they are merged into the main data profile. 
:type subset_properties: Dict :param df_series: Data to be profiled - :type df_series: Pandas Dataframe + :type df_series: Polars Dataframe :return: None """ super()._update_helper(df_series, subset_properties) @@ -440,7 +439,7 @@ def update(self, df_series: pl.Series) -> FloatColumn: Update the column profile. :param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: updated FloatColumn :rtype: FloatColumn """ diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index 30e7a4a8..15394ce8 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -2,7 +2,6 @@ from __future__ import annotations import numpy as np -import pandas as pd import polars as pl from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler @@ -125,7 +124,7 @@ def _is_each_row_int(cls, df_series: pl.Series) -> list[bool]: For column [1.1 1.1 1.1] returns False :param df_series: series of values to evaluate - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: is_int_col :rtype: list """ @@ -140,12 +139,11 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: Update col profile properties with clean dataset and its known null params. :param df_series_clean: df series with nulls removed - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.series.series.Series :param profile: int profile dictionary :type profile: dict :return: None """ - df_series_clean = pd.Series(df_series_clean.to_numpy()) if self._NumericStatsMixin__calculations: NumericStatsMixin._update_helper(self, df_series_clean, profile) self._update_column_base_properties(profile) @@ -155,7 +153,7 @@ def update(self, df_series: pl.Series) -> IntColumn: Update the column profile. 
:param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: updated IntColumn :rtype: IntColumn """ diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 549fcc43..0e8677d2 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -10,7 +10,6 @@ import numpy as np import numpy.typing as npt -import pandas as pd import polars as pl import scipy.stats @@ -498,7 +497,6 @@ def diff( "Unsupported operand type(s) for diff: '{}' " "and '{}'".format(cls.__name__, other_profile.__class__.__name__) ) - print(self.variance, other_profile.variance) differences = { "min": profiler_utils.find_diff_of_numbers(self.min, other_profile.min), "max": profiler_utils.find_diff_of_numbers(self.max, other_profile.max), @@ -1125,10 +1123,9 @@ def _estimate_stats_from_histogram(self) -> np.float64: return var def _total_histogram_bin_variance( - self, input_array: np.ndarray | pd.Series + self, input_array: np.ndarray | pl.Series ) -> float: - if type(input_array) is pd.Series: - input_array = pl.from_pandas(input_array) + if type(input_array) is pl.Series: input_array = input_array.to_numpy() # calculate total variance over all bins of a histogram bin_counts = self._stored_histogram["histogram"]["bin_counts"] @@ -1146,20 +1143,18 @@ def _total_histogram_bin_variance( sum_var += bin_var return sum_var - def _histogram_bin_error(self, input_array: np.ndarray | pd.Series) -> np.float64: + def _histogram_bin_error(self, input_array: np.ndarray | pl.Series) -> np.float64: """ Calculate error of each value from bin of the histogram it falls within. :param input_array: input data used to calculate the histogram - :type input_array: Union[np.array, pd.Series] + :type input_array: Union[np.array, pl.Series] :return: binning error :rtype: float """ - if type(input_array) is pd.Series: - input_array = pl.from_pandas(input_array) + if type(input_array) == pl.Series: input_array = input_array.to_numpy() - bin_edges = self._stored_histogram["histogram"]["bin_edges"] - + bin_edges = self._stored_histogram["histogram"]["bin_edges"].astype(float) # account ofr digitize which is exclusive bin_edges = bin_edges.copy() @@ -1280,7 +1275,7 @@ def _get_histogram( Uses np.histogram. :param values: input data values - :type values: Union[np.array, pd.Series] + :type values: Union[np.array, pl.Series] :return: bin edges and bin counts """ if len(np.unique(values)) == 1: @@ -1323,18 +1318,17 @@ def _get_histogram( bin_counts, bin_edges = np.histogram(values, bins=n_equal_bins) return bin_counts, bin_edges - def _merge_histogram(self, values: np.ndarray | pd.Series) -> None: + def _merge_histogram(self, values: np.ndarray | pl.Series) -> None: # values is the current array of values, # that needs to be updated to the accumulated histogram - if type(values) is pd.Series: - values = pl.from_pandas(values) + if type(values) == pl.Series: values = values.to_numpy() combined_values = np.concatenate([values, self._histogram_to_array()]) bin_counts, bin_edges = self._get_histogram(combined_values) self._stored_histogram["histogram"]["bin_counts"] = bin_counts self._stored_histogram["histogram"]["bin_edges"] = bin_edges - def _update_histogram(self, df_series: pd.Series | np.ndarray) -> None: + def _update_histogram(self, df_series: pl.Series) -> None: """ Update histogram for each method and the combined method. 
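Before the histogram hunks below, the binning-error idea may help in isolation: each value is charged the squared distance to the midpoint of the histogram bin it lands in, and np.digitize is right-exclusive, so the top edge has to be clipped into the last bin. A simplified sketch of that idea, not the library's exact loss computation:

import numpy as np

values = np.array([1.0, 2.2, 3.9, 7.5])
bin_counts, bin_edges = np.histogram(values, bins=3)

# Midpoint of each bin.
mids = 0.5 * (bin_edges[:-1] + bin_edges[1:])
# Map each value to its bin; clip so the max value lands in the last bin.
idx = np.clip(np.digitize(values, bin_edges) - 1, 0, len(mids) - 1)
binning_error = float(np.sum((values - mids[idx]) ** 2))
print(bin_counts, binning_error)
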
@@ -1352,30 +1346,31 @@ def _update_histogram(self, df_series: pd.Series | np.ndarray) -> None: accumulated losses, and the best method with minimal loss is picked :param df_series: a given column - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: """ - if self._greater_than_64_bit and type(df_series) is pd.Series: - df_series = df_series.to_numpy(dtype=float) - df_series = df_series[np.isfinite(df_series)] - if df_series.size == 0: + if self._greater_than_64_bit: + df_np_series = df_series.to_numpy() + df_np_series = df_np_series[np.isfinite(df_np_series)] + if df_np_series.size == 0: return + if self._has_histogram: + self._merge_histogram(df_np_series) + else: + bin_counts, bin_edges = self._get_histogram(df_np_series) + self._stored_histogram["histogram"]["bin_counts"] = bin_counts + self._stored_histogram["histogram"]["bin_edges"] = bin_edges else: - df_series = pl.from_pandas(df_series, nan_to_null=True).cast(pl.Float64) - df_series = df_series.replace([np.inf, -np.inf], [None]) # type: ignore - df_series = df_series.drop_nulls() + df_series = df_series.filter(~df_series.is_infinite()) + df_series = df_series.drop_nans() if df_series.is_empty(): return - - if self._has_histogram: - if self._greater_than_64_bit: - self._merge_histogram(df_series.tolist()) + if self._has_histogram: + self._merge_histogram(df_series) else: - self._merge_histogram(df_series.to_list()) - else: - bin_counts, bin_edges = self._get_histogram(df_series) - self._stored_histogram["histogram"]["bin_counts"] = bin_counts - self._stored_histogram["histogram"]["bin_edges"] = bin_edges + bin_counts, bin_edges = self._get_histogram(df_series) + self._stored_histogram["histogram"]["bin_counts"] = bin_counts + self._stored_histogram["histogram"]["bin_edges"] = bin_edges # update loss for the stored bins histogram_loss = self._histogram_bin_error(df_series) @@ -1749,36 +1744,30 @@ def _get_quantiles(self) -> None: ] self.quantiles = self._get_percentile(percentiles=percentiles) - def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: + def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: """ Update base numerical profile properties w/ clean dataset and known null params. 
:param df_series_clean: df series with nulls removed - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.series.series.Series :param profile: numerical profile dictionary :type profile: dict :return: None """ - self._greater_than_64_bit = ( - not df_series_clean.empty - and df_series_clean.apply(pd.to_numeric, errors="coerce").dtype == "O" - ) + self._greater_than_64_bit = df_series_clean.dtype == pl.Object if self._greater_than_64_bit: - df_series_clean = df_series_clean.to_numpy() - df_series_clean = df_series_clean[df_series_clean != np.nan] - if df_series_clean.size == 0: + df_np_series_clean = df_series_clean.to_numpy() + df_np_series_clean = df_np_series_clean[df_np_series_clean != np.nan] + if df_np_series_clean.size == 0: return - df_series_clean = pd.Series(df_series_clean) + df_series_clean = pl.Series(df_np_series_clean) else: - df_series_clean = pl.from_pandas(df_series_clean) if df_series_clean.dtype == pl.String: df_series_clean = df_series_clean.str.strip_chars().cast(pl.Float64) else: df_series_clean = df_series_clean.cast(pl.Float64) if df_series_clean.is_empty(): return - df_series_clean = df_series_clean.to_pandas() - df_series_clean = df_series_clean.astype(float) prev_dependent_properties = { "mean": self.mean, @@ -1800,15 +1789,14 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: @BaseColumnProfiler._timeit(name="min") def _get_min( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: if self._greater_than_64_bit: - min_value = np.min(df_series) + min_value = min(df_series) self.min = min_value if not self.min else min(self.min, min_value) else: - df_series = pl.from_pandas(df_series) min_value = df_series.min() self.min = np.float64( min_value if not self.min else min(self.min, min_value) @@ -1818,15 +1806,14 @@ def _get_min( @BaseColumnProfiler._timeit(name="max") def _get_max( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: if self._greater_than_64_bit: - max_value = np.max(df_series) + max_value = max(df_series) self.max = max_value if not self.max else max(self.max, max_value) else: - df_series = pl.from_pandas(df_series) max_value = df_series.max() if self.max is not None: max_value = type(self.max)(max_value) @@ -1838,14 +1825,14 @@ def _get_max( @BaseColumnProfiler._timeit(name="sum") def _get_sum( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: if np.isinf(self.sum) or (np.isnan(self.sum) and self.match_count > 0): return if self._greater_than_64_bit: - sum_value = np.sum(df_series) + sum_value = float(sum(df_series)) if len(df_series) > 0 and sum_value == np.nan: warnings.warn( "Infinite or invalid values found in data. 
" @@ -1854,7 +1841,6 @@ def _get_sum( RuntimeWarning, ) else: - df_series = pl.from_pandas(df_series) sum_value = df_series.sum() if np.isinf(sum_value) or (len(df_series) > 0 and np.isnan(sum_value)): warnings.warn( @@ -1870,7 +1856,7 @@ def _get_sum( @BaseColumnProfiler._timeit(name="variance") def _get_variance( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1879,9 +1865,8 @@ def _get_variance( ): return if self._greater_than_64_bit: - batch_biased_variance = np.var(df_series) + batch_biased_variance = np.var(df_series.to_numpy()) else: - df_series = pl.from_pandas(df_series) batch_biased_variance = np.var([df_series]) subset_properties["biased_variance"] = batch_biased_variance sum_value = subset_properties["sum"] @@ -1900,7 +1885,7 @@ def _get_variance( @BaseColumnProfiler._timeit(name="skewness") def _get_skewness( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1908,7 +1893,7 @@ def _get_skewness( Compute and update skewness of current dataset given new chunk. :param df_series: incoming data - :type df_series: pandas series + :type df_series: polars series :param prev_dependent_properties: pre-update values needed for computation :type prev_dependent_properties: dict @@ -1924,11 +1909,10 @@ def _get_skewness( ): return - if self._greater_than_64_bit and type(df_series) is pd.Series: - df_series = df_series.to_numpy(dtype=float) + if self._greater_than_64_bit and type(df_series) is pl.Series: + batch_biased_skewness = profiler_utils.biased_skew(df_series.to_numpy()) else: - df_series = pl.from_pandas(df_series, nan_to_null=False) - batch_biased_skewness = profiler_utils.biased_skew(df_series) + batch_biased_skewness = profiler_utils.biased_skew(df_series) subset_properties["biased_skewness"] = batch_biased_skewness batch_count = subset_properties["match_count"] batch_biased_var = subset_properties["biased_variance"] @@ -1948,7 +1932,7 @@ def _get_skewness( @BaseColumnProfiler._timeit(name="kurtosis") def _get_kurtosis( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1956,7 +1940,7 @@ def _get_kurtosis( Compute and update kurtosis of current dataset given new chunk. 
:param df_series: incoming data - :type df_series: pandas series + :type df_series: polars series :param prev_dependent_properties: pre-update values needed for computation :type prev_dependent_properties: dict @@ -1972,11 +1956,10 @@ def _get_kurtosis( ): return - if self._greater_than_64_bit and type(df_series) is pd.Series: - df_series = df_series.to_numpy(dtype=float) + if self._greater_than_64_bit: + batch_biased_kurtosis = profiler_utils.biased_kurt(df_series.to_numpy()) else: - df_series = pl.from_pandas(df_series, nan_to_null=False) - batch_biased_kurtosis = profiler_utils.biased_kurt(df_series) + batch_biased_kurtosis = profiler_utils.biased_kurt(df_series) subset_properties["biased_kurtosis"] = batch_biased_kurtosis batch_count = subset_properties["match_count"] batch_biased_var = subset_properties["biased_variance"] @@ -1999,7 +1982,7 @@ def _get_kurtosis( @BaseColumnProfiler._timeit(name="histogram_and_quantiles") def _get_histogram_and_quantiles( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -2017,7 +2000,7 @@ def _get_histogram_and_quantiles( @BaseColumnProfiler._timeit(name="num_zeros") def _get_num_zeros( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -2025,23 +2008,26 @@ def _get_num_zeros( Get the count of zeros in the numerical column. :param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :param prev_dependent_properties: previous dependent properties :type prev_dependent_properties: dict :param subset_properties: subset of properties :type subset_properties: dict :return: None """ - if not self._greater_than_64_bit: - df_series = pl.from_pandas(df_series) - num_zeros_value = (df_series == 0).sum() + if df_series.is_empty(): + num_zeros_value = 0 + elif self._greater_than_64_bit: + num_zeros_value = int((df_series.to_numpy() == 0).sum()) + else: + num_zeros_value = int((df_series == 0).sum()) subset_properties["num_zeros"] = num_zeros_value self.num_zeros = self.num_zeros + num_zeros_value @BaseColumnProfiler._timeit(name="num_negatives") def _get_num_negatives( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -2049,16 +2035,19 @@ def _get_num_negatives( Get the count of negative numbers in the numerical column. :param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :param prev_dependent_properties: previous dependent properties :type prev_dependent_properties: dict :param subset_properties: subset of properties :type subset_properties: dict :return: None """ - if not self._greater_than_64_bit: - df_series = pl.from_pandas(df_series) - num_negatives_value = (df_series < 0).sum() + if df_series.is_empty(): + num_negatives_value = 0 + elif self._greater_than_64_bit: + num_negatives_value = int((df_series.to_numpy() < 0).sum()) + else: + num_negatives_value = int((df_series < 0).sum()) subset_properties["num_negatives"] = num_negatives_value self.num_negatives = self.num_negatives + num_negatives_value @@ -2068,7 +2057,7 @@ def update(self, df_series: pl.Series) -> NumericStatsMixin: Update the numerical profile properties with an uncleaned dataset. 
:param df_series: df series with nulls removed - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: None """ raise NotImplementedError() diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index f2ea321e..200bd5d3 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -141,7 +141,7 @@ def _update_vocab( Find the unique vocabulary used in the text column. :param data: list or array of data from which to extract vocab - :type data: Union[list, numpy.array, pandas.DataFrame] + :type data: Union[list, numpy.array, polars.DataFrame] :param prev_dependent_properties: Contains all the previous properties that the calculations depend on. :type prev_dependent_properties: dict @@ -158,16 +158,14 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: Update col profile properties with clean dataset and its known null parameters. :param df_series_clean: df series with nulls removed - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.series.series.Series :param profile: text profile dictionary :type profile: dict :return: None """ if self._NumericStatsMixin__calculations: text_lengths = df_series_clean.str.len_chars() - NumericStatsMixin._update_helper( - self, text_lengths.drop_nulls().to_pandas(), profile - ) + NumericStatsMixin._update_helper(self, text_lengths.drop_nulls(), profile) self._update_column_base_properties(profile) if self.max: self.type = "string" if self.max <= 255 else "text" @@ -177,7 +175,7 @@ def update(self, df_series: pl.Series) -> TextColumn: Update the column profile. :param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: updated TextColumn :rtype: TextColumn """ diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index 7b4d2ccc..a1291276 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -6,7 +6,7 @@ from unittest import mock import numpy as np -import pandas as pd +import polars as pl from dataprofiler.profilers import NumericStatsMixin from dataprofiler.profilers.base_column_profilers import BaseColumnProfiler @@ -340,7 +340,7 @@ def test_timeit(self): "biased_skewness": 0, } data = np.array([0, 0, 0, 0, 0]) - df_series = pd.Series(data) + df_series = pl.Series(data) subset_properties = {"min": 0, "match_count": 0} time_array = [float(i) for i in range(24, 0, -1)] @@ -547,21 +547,21 @@ def test_num_zeros(self): prev_dependent_properties = {"mean": 0} subset_properties = {"num_zeros": 0} - df_series = pd.Series([]) + df_series = pl.Series([]) num_profiler._get_num_zeros( df_series, prev_dependent_properties, subset_properties ) self.assertEqual(subset_properties["num_zeros"], 0) data = np.array([0, 0, 0, 0, 0]) - df_series = pd.Series(data) + df_series = pl.Series(data) num_profiler._get_num_zeros( df_series, prev_dependent_properties, subset_properties ) self.assertEqual(subset_properties["num_zeros"], 5) data = np.array([000.0, 0.00, 0.000, 1.11234, 0, -1]) - df_series = pd.Series(data) + df_series = pl.Series(data) num_profiler._get_num_zeros( df_series, prev_dependent_properties, subset_properties ) @@ -574,21 +574,21 @@ def test_num_negatives(self): prev_dependent_properties = {"mean": 0} 
         subset_properties = {"num_negatives": 0}

-        df_series = pd.Series([])
+        df_series = pl.Series([])
         num_profiler._get_num_negatives(
             df_series, prev_dependent_properties, subset_properties
         )
         self.assertEqual(subset_properties["num_negatives"], 0)

         data = np.array([0, 0, 0, 0, 0])
-        df_series = pd.Series(data)
+        df_series = pl.Series(data)
         num_profiler._get_num_negatives(
             df_series, prev_dependent_properties, subset_properties
         )
         self.assertEqual(subset_properties["num_negatives"], 0)

         data = np.array([1, 0, -0.003, -16, -1.0, -24.45])
-        df_series = pd.Series(data)
+        df_series = pl.Series(data)
         num_profiler._get_num_negatives(
             df_series, prev_dependent_properties, subset_properties
         )
@@ -675,7 +675,7 @@ def test_timeit_num_zeros_and_negatives(self):
         # Dummy data to make min call
         prev_dependent_properties = {"mean": 0}
         data = np.array([0, 0, 0, 0, 0])
-        df_series = pd.Series(data)
+        df_series = pl.Series(data)
         subset_properties = {"num_zeros": 0, "num_negatives": 0}

         time_array = [float(i) for i in range(4, 0, -1)]

From 30a4c24bcb336999dc861cdb5d9e76c4e764c241 Mon Sep 17 00:00:00 2001
From: Andrew Li
Date: Sun, 3 Mar 2024 20:00:52 -0600
Subject: [PATCH 11/11] fix pandas df in update

---
 dataprofiler/profilers/float_column_profile.py | 4 ++++
 dataprofiler/profilers/int_column_profile.py   | 4 ++++
 dataprofiler/profilers/text_column_profile.py  | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py
index 19bb19c6..61c222a1 100644
--- a/dataprofiler/profilers/float_column_profile.py
+++ b/dataprofiler/profilers/float_column_profile.py
@@ -4,6 +4,7 @@
 import copy

 import numpy as np
+import pandas as pd
 import polars as pl

 from . import profiler_utils
@@ -443,6 +444,9 @@ def update(self, df_series: pl.Series) -> FloatColumn:
         :return: updated FloatColumn
         :rtype: FloatColumn
         """
+        # TODO remove once profiler builder is updated
+        if isinstance(df_series, pd.Series):
+            df_series = pl.from_pandas(df_series)  # type: ignore
         if len(df_series) == 0:
             return self
         is_each_row_float = self._is_each_row_float(df_series).replace(None, False)
diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py
index 15394ce8..15fb3fbd 100644
--- a/dataprofiler/profilers/int_column_profile.py
+++ b/dataprofiler/profilers/int_column_profile.py
@@ -2,6 +2,7 @@
 from __future__ import annotations

 import numpy as np
+import pandas as pd
 import polars as pl

 from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler
@@ -157,6 +158,9 @@ def update(self, df_series: pl.Series) -> IntColumn:
         :return: updated IntColumn
         :rtype: IntColumn
         """
+        # TODO remove once profiler builder is updated
+        if isinstance(df_series, pd.Series):
+            df_series = pl.from_pandas(df_series)  # type: ignore
         self._greater_than_64_bit = df_series.dtype == pl.Object
         if len(df_series) == 0:
             return self
diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py
index 200bd5d3..a44bc1b7 100644
--- a/dataprofiler/profilers/text_column_profile.py
+++ b/dataprofiler/profilers/text_column_profile.py
@@ -4,6 +4,7 @@
 import itertools

 import numpy as np
+import pandas as pd
 import polars as pl

 from . 
import profiler_utils
@@ -179,6 +180,9 @@ def update(self, df_series: pl.Series) -> TextColumn:
         :return: updated TextColumn
         :rtype: TextColumn
         """
+        # TODO remove once profiler builder is updated
+        if isinstance(df_series, pd.Series):
+            df_series = pl.from_pandas(df_series)  # type: ignore
         len_df = len(df_series)
         if len_df == 0:
             return self
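---

The notes below are illustrative sketches only, not part of any patch; they
assume a polars version compatible with the one this series targets.

The _get_num_zeros/_get_num_negatives rewrite in numerical_column_stats.py
counts matches natively in polars for 64-bit data and falls back to numpy only
on the object-typed (>64-bit) path. A minimal sketch of both branches:

    import polars as pl

    # 64-bit path: the boolean comparison and the sum stay in polars.
    s = pl.Series([0.0, 0.0, 1.11234, 0.0, -1.0])
    assert int((s == 0).sum()) == 3
    assert int((s < 0).sum()) == 1

    # >64-bit path: pl.Object series are compared through numpy instead,
    # since polars cannot compare arbitrary Python objects natively.
    big = pl.Series([2**70, 0, -(2**70)], dtype=pl.Object)
    assert int((big.to_numpy() == 0).sum()) == 1
    assert int((big.to_numpy() < 0).sum()) == 1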
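The text column update now derives string lengths with polars' str.len_chars(),
which counts characters rather than bytes and propagates nulls, so the nulls
are dropped before the numeric statistics update. A short sketch, assuming a
polars version that provides str.len_chars (as the patches themselves do):

    import polars as pl

    s = pl.Series(["abc", None, "héllo"])
    # Character count, not byte count; the null row stays null until dropped.
    lengths = s.str.len_chars().drop_nulls()
    assert lengths.to_list() == [3, 5]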
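PATCH 11 keeps update() callable while the profiler builder still hands over
pandas objects: any pandas Series is converted up front and the polars path
runs unchanged. A standalone sketch of the same guard (the coerce_to_polars
helper is hypothetical, introduced only for illustration):

    import pandas as pd
    import polars as pl

    def coerce_to_polars(df_series):
        # Hypothetical helper mirroring the TODO-guarded shim: convert
        # pandas input, pass polars input through untouched.
        if isinstance(df_series, pd.Series):
            df_series = pl.from_pandas(df_series)
        return df_series

    assert coerce_to_polars(pd.Series([1, 2, 3])).dtype == pl.Int64
    assert coerce_to_polars(pl.Series([1, 2, 3])).dtype == pl.Int64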