From 1287027773b0210f91d47fd6d33696dd4e13fad2 Mon Sep 17 00:00:00 2001
From: Taylor Turner
Date: Tue, 5 Mar 2024 16:03:46 -0500
Subject: [PATCH 1/6] Staging into `main` from `dev` (#1106)

* add downloads tile (#1085)

* Hot fix json bug (#1105)

* update

* update
---
 README.md                                     |  1 +
 dataprofiler/profilers/json_encoder.py        |  3 +-
 .../tests/labelers/test_labeler_utils.py      |  3 +-
 .../profilers/test_datetime_column_profile.py | 41 +++++++++++++++++++
 4 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3ba4ee51b..1df9a2ea3 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/DataProfiler)
 ![GitHub](https://img.shields.io/github/license/CapitalOne/DataProfiler)
 ![GitHub last commit](https://img.shields.io/github/last-commit/CapitalOne/DataProfiler)
+[![Downloads](https://static.pepy.tech/badge/dataprofiler)](https://pepy.tech/project/dataprofiler)

diff --git a/dataprofiler/profilers/json_encoder.py b/dataprofiler/profilers/json_encoder.py
index 4e12eb649..cf0227f64 100644
--- a/dataprofiler/profilers/json_encoder.py
+++ b/dataprofiler/profilers/json_encoder.py
@@ -1,6 +1,7 @@
 """Contains ProfilerEncoder class."""
 import json
+from datetime import datetime

 import numpy as np
 import pandas as pd
@@ -52,7 +53,7 @@ def default(self, to_serialize):
             return int(to_serialize)
         elif isinstance(to_serialize, np.ndarray):
             return to_serialize.tolist()
-        elif isinstance(to_serialize, pd.Timestamp):
+        elif isinstance(to_serialize, (pd.Timestamp, datetime)):
             return to_serialize.isoformat()
         elif isinstance(to_serialize, BaseDataLabeler):
             # TODO: This does not allow the user to serialize a model if it is loaded
diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py
index dcfb75020..f59a43e3f 100644
--- a/dataprofiler/tests/labelers/test_labeler_utils.py
+++ b/dataprofiler/tests/labelers/test_labeler_utils.py
@@ -235,8 +235,9 @@ def test_verbose(self):
         self.assertIn("f1-score ", log_output)
         self.assertIn("F1 Score: ", log_output)

+    @mock.patch("dataprofiler.labelers.labeler_utils.classification_report")
     @mock.patch("pandas.DataFrame")
-    def test_save_conf_mat(self, mock_dataframe):
+    def test_save_conf_mat(self, mock_dataframe, mock_report):
         # ideally mock out the actual contents written to file, but
         # would be difficult to get this completely worked out.
diff --git a/dataprofiler/tests/profilers/test_datetime_column_profile.py b/dataprofiler/tests/profilers/test_datetime_column_profile.py
index c00ac8e0d..dca3a8773 100644
--- a/dataprofiler/tests/profilers/test_datetime_column_profile.py
+++ b/dataprofiler/tests/profilers/test_datetime_column_profile.py
@@ -501,6 +501,47 @@ def test_json_encode_after_update(self):

         self.assertEqual(serialized, expected)

+    def test_json_encode_datetime(self):
+        data = ["1209214"]
+        df = pd.Series(data)
+        profiler = DateTimeColumn("0")
+
+        expected_date_formats = [
+            "%Y-%m-%d %H:%M:%S",
+            "%b %d, %Y",
+            "%m/%d/%y %H:%M",
+        ]
+        with patch.object(
+            profiler, "_combine_unique_sets", return_value=expected_date_formats
+        ):
+            with patch("time.time", return_value=0.0):
+                profiler.update(df)
+
+        serialized = json.dumps(profiler, cls=ProfileEncoder)
+
+        expected = json.dumps(
+            {
+                "class": "DateTimeColumn",
+                "data": {
+                    "name": "0",
+                    "col_index": np.nan,
+                    "sample_size": 1,
+                    "metadata": {},
+                    "times": defaultdict(float, {"datetime": 0.0}),
+                    "thread_safe": True,
+                    "match_count": 1,
+                    "date_formats": expected_date_formats,
+                    "min": "1209214",
+                    "max": "1209214",
+                    "_dt_obj_min": "9214-01-20T00:00:00",
+                    "_dt_obj_max": "9214-01-20T00:00:00",
+                    "_DateTimeColumn__calculations": dict(),
+                },
+            }
+        )
+
+        self.assertEqual(serialized, expected)
+
     def test_json_decode(self):
         fake_profile_name = None
         expected_profile = DateTimeColumn(fake_profile_name)
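[Note on PATCH 1/6] The encoder change above means plain `datetime.datetime` values now serialize the same way `pd.Timestamp` already did. A minimal sketch of the resulting behavior, using a stand-in encoder class rather than the library's `ProfileEncoder` (the sample value is illustrative only):

    import json
    from datetime import datetime

    import pandas as pd

    class SketchEncoder(json.JSONEncoder):
        """Stand-in mirroring ProfileEncoder's patched datetime branch."""

        def default(self, to_serialize):
            # Both pandas Timestamps and plain datetimes become ISO-8601
            # strings, matching the widened isinstance check in the patch.
            if isinstance(to_serialize, (pd.Timestamp, datetime)):
                return to_serialize.isoformat()
            return super().default(to_serialize)

    print(json.dumps({"_dt_obj_min": datetime(9214, 1, 20)}, cls=SketchEncoder))
    # {"_dt_obj_min": "9214-01-20T00:00:00"}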
From f8b3e5dbd4b76f0ecc291911ace9e8e21cf1ecb1 Mon Sep 17 00:00:00 2001
From: Taylor Turner
Date: Wed, 6 Mar 2024 08:02:24 -0500
Subject: [PATCH 2/6] update version (#1107)

---
 dataprofiler/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataprofiler/version.py b/dataprofiler/version.py
index 39f0a3c95..86e35b807 100644
--- a/dataprofiler/version.py
+++ b/dataprofiler/version.py
@@ -2,7 +2,7 @@

 MAJOR = 0
 MINOR = 10
-MICRO = 8
+MICRO = 9
 POST = None  # otherwise None

 VERSION = "%d.%d.%d" % (MAJOR, MINOR, MICRO)

From 8269a1f06cb03cd20538adae82666f2ded71a4ae Mon Sep 17 00:00:00 2001
From: Andrew <64439232+atl1502@users.noreply.github.com>
Date: Wed, 24 Jan 2024 14:20:31 -0600
Subject: [PATCH 3/6] add polars to requirements (#1087)

* add polars to requirements

* Update requirements.txt

Co-authored-by: Taylor Turner

---------

Co-authored-by: Taylor Turner
---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index a45dc34ae..cc77e04c6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ HLL>=2.0.3
 datasketches>=4.1.0
 packaging>=23.0
 boto3>=1.28.61
+polars>=0.20.5

From 144145297c41bacd8c6f09b53f6d0c8f0ec363c2 Mon Sep 17 00:00:00 2001
From: Andrew <64439232+atl1502@users.noreply.github.com>
Date: Tue, 30 Jan 2024 09:16:59 -0600
Subject: [PATCH 4/6] update precommit env (#1088)

---
 .pre-commit-config.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 203e62b1f..ad7163e8a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -65,6 +65,7 @@ repos:
           HLL>=2.0.3,
           datasketches>=4.1.0,
           boto3>=1.28.61,
+          polars>=0.20.5,

           # requirements-dev.txt
           check-manifest>=0.48,
@@ -111,7 +112,7 @@ repos:
         additional_dependencies: ['h5py', 'wheel', 'future', 'numpy', 'pandas',
             'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
             'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
-            'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3']
+            'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3', 'polars']

 # Pyupgrade - standardize and modernize Python syntax for newer versions of the language
 - repo: https://github.com/asottile/pyupgrade
   rev: v3.3.0
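[Note on PATCHES 3/6 and 5/6] These patches pin `polars>=0.20.5` and then route the numeric profiler through it. The recurring conversion pattern in PATCH 5/6 below is pandas to polars to numpy, with NaN mapped to null so `drop_nulls()` strips it before numpy sees the data. A minimal sketch of that pattern (the sample values are illustrative, not from the patch):

    import numpy as np
    import pandas as pd
    import polars as pl

    # Convert pandas input to polars, turning NaN into null, then hand
    # numpy a clean float array: the shape of the change in PATCH 5/6.
    pd_series = pd.Series([1.0, 2.5, np.nan, 4.0])
    pl_series = pl.from_pandas(pd_series, nan_to_null=True).cast(pl.Float64)
    clean = pl_series.drop_nulls().to_numpy()
    print(clean)          # [1.  2.5 4. ]
    print(np.var(clean))  # 1.5, the biased variance over non-null values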
From 3a8e931eb6f108a77f7d228e78050ed4feb26a25 Mon Sep 17 00:00:00 2001
From: Andrew <64439232+atl1502@users.noreply.github.com>
Date: Mon, 12 Feb 2024 08:27:35 -0600
Subject: [PATCH 5/6] Numerical column stats update (#1089)

* partial update to numerical_column_stats

* update with full polars replacement

* reduce redundant if statement

* fix histogram warning

* remove unneeded casting
---
 .../profilers/numerical_column_stats.py       | 147 +++++++++++++-----
 .../profilers/test_float_column_profile.py    |   2 +
 .../profilers/test_int_column_profile.py      |   2 +
 .../test_numeric_stats_mixin_profile.py       |   1 +
 .../profilers/test_text_column_profile.py     |   1 +
 5 files changed, 112 insertions(+), 41 deletions(-)

diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
index fa0666a66..74c24e213 100644
--- a/dataprofiler/profilers/numerical_column_stats.py
+++ b/dataprofiler/profilers/numerical_column_stats.py
@@ -11,6 +11,7 @@
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
+import polars as pl
 import scipy.stats

 from . import float_column_profile, histogram_utils, profiler_utils
@@ -83,6 +84,7 @@ def __init__(self, options: NumericalOptions = None) -> None:
         self.num_zeros: int | np.int64 = np.int64(0)
         self.num_negatives: int | np.int64 = np.int64(0)
         self._num_quantiles: int = 1000  # By default, we use 1000 quantiles
+        self._greater_than_64_bit: bool = False

         if options:
             self.bias_correction = options.bias_correction.is_enabled
@@ -1125,10 +1127,12 @@ def _estimate_stats_from_histogram(self) -> np.float64:
     def _total_histogram_bin_variance(
         self, input_array: np.ndarray | pd.Series
     ) -> float:
+        if type(input_array) is pd.Series:
+            input_array = pl.from_pandas(input_array)
+            input_array = input_array.to_numpy()
         # calculate total variance over all bins of a histogram
         bin_counts = self._stored_histogram["histogram"]["bin_counts"]
         bin_edges = self._stored_histogram["histogram"]["bin_edges"]
-
         # account for digitize which is exclusive
         bin_edges = bin_edges.copy()
         bin_edges[-1] += 1e-3
@@ -1151,6 +1155,9 @@ def _histogram_bin_error(self, input_array: np.ndarray | pd.Series) -> np.float64:
         :return: binning error
         :rtype: float
         """
+        if type(input_array) is pd.Series:
+            input_array = pl.from_pandas(input_array)
+            input_array = input_array.to_numpy()
         bin_edges = self._stored_histogram["histogram"]["bin_edges"]

         # account for digitize which is exclusive
@@ -1265,7 +1272,7 @@ def _histogram_to_array(self) -> np.ndarray:
         return array_flatten

     def _get_histogram(
-        self, values: np.ndarray | pd.Series
+        self, values: np.ndarray | pl.Series
     ) -> tuple[np.ndarray, np.ndarray]:
         """
         Calculate stored histogram the suggested bin counts for each histogram method.
@@ -1278,10 +1285,7 @@
         """
         if len(np.unique(values)) == 1:
             bin_counts = np.array([len(values)])
-            if isinstance(values, (np.ndarray, list)):
-                unique_value = values[0]
-            else:
-                unique_value = values.iloc[0]
+            unique_value = values[0]
             bin_edges = np.array([unique_value, unique_value])
             for bin_method in self.histogram_bin_method_names:
                 self.histogram_methods[bin_method]["histogram"][
@@ -1322,12 +1326,15 @@
     def _merge_histogram(self, values: np.ndarray | pd.Series) -> None:
         # values is the current array of values,
         # that needs to be updated to the accumulated histogram
+        if type(values) is pd.Series:
+            values = pl.from_pandas(values)
+            values = values.to_numpy()
         combined_values = np.concatenate([values, self._histogram_to_array()])
         bin_counts, bin_edges = self._get_histogram(combined_values)
         self._stored_histogram["histogram"]["bin_counts"] = bin_counts
         self._stored_histogram["histogram"]["bin_edges"] = bin_edges

-    def _update_histogram(self, df_series: pd.Series) -> None:
+    def _update_histogram(self, df_series: pd.Series | np.ndarray) -> None:
         """
         Update histogram for each method and the combined method.
@@ -1348,12 +1355,23 @@ def _update_histogram(self, df_series: pd.Series) -> None:
         :type df_series: pandas.core.series.Series
         :return:
         """
-        df_series = df_series.replace([np.inf, -np.inf], np.nan).dropna()
-        if df_series.empty:
-            return
+        if self._greater_than_64_bit and type(df_series) is pd.Series:
+            df_series = df_series.to_numpy(dtype=float)
+            df_series = df_series[np.isfinite(df_series)]
+            if df_series.size == 0:
+                return
+        else:
+            df_series = pl.from_pandas(df_series, nan_to_null=True).cast(pl.Float64)
+            df_series = df_series.replace([np.inf, -np.inf], [None])  # type: ignore
+            df_series = df_series.drop_nulls()
+            if df_series.is_empty():
+                return

         if self._has_histogram:
-            self._merge_histogram(df_series.tolist())
+            if self._greater_than_64_bit:
+                self._merge_histogram(df_series.tolist())
+            else:
+                self._merge_histogram(df_series.to_list())
         else:
             bin_counts, bin_edges = self._get_histogram(df_series)
             self._stored_histogram["histogram"]["bin_counts"] = bin_counts
             self._stored_histogram["histogram"]["bin_edges"] = bin_edges
@@ -1741,8 +1759,26 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
         :type profile: dict
         :return: None
         """
-        if df_series_clean.empty:
-            return
+        self._greater_than_64_bit = (
+            not df_series_clean.empty
+            and df_series_clean.apply(pd.to_numeric, errors="coerce").dtype == "O"
+        )
+        if self._greater_than_64_bit:
+            df_series_clean = df_series_clean.to_numpy()
+            df_series_clean = df_series_clean[df_series_clean != np.nan]
+            if df_series_clean.size == 0:
+                return
+            df_series_clean = pd.Series(df_series_clean)
+        else:
+            df_series_clean = pl.from_pandas(df_series_clean)
+            if df_series_clean.dtype == pl.String:
+                df_series_clean = df_series_clean.str.strip_chars().cast(pl.Float64)
+            else:
+                df_series_clean = df_series_clean.cast(pl.Float64)
+            if df_series_clean.is_empty():
+                return
+            df_series_clean = df_series_clean.to_pandas()
+            df_series_clean = df_series_clean.astype(float)

         prev_dependent_properties = {
             "mean": self.mean,
@@ -1751,7 +1787,6 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
             "biased_kurtosis": self._biased_kurtosis,
         }
         subset_properties = copy.deepcopy(profile)
-        df_series_clean = df_series_clean.astype(float)
         super()._perform_property_calcs(  # type: ignore
             self.__calculations,
             df_series=df_series_clean,
@@ -1765,43 +1800,69 @@
     @BaseColumnProfiler._timeit(name="min")
     def _get_min(
         self,
-        df_series: pd.Series,
+        df_series: pd.Series | np.ndarray,
         prev_dependent_properties: dict,
         subset_properties: dict,
     ) -> None:
-        min_value = df_series.min()
-        self.min = min_value if not self.min else min(self.min, min_value)
+        if self._greater_than_64_bit:
+            min_value = np.min(df_series)
+            self.min = min_value if not self.min else min(self.min, min_value)
+        else:
+            df_series = pl.from_pandas(df_series)
+            min_value = df_series.min()
+            self.min = np.float64(
+                min_value if not self.min else min(self.min, min_value)
+            )
         subset_properties["min"] = min_value

     @BaseColumnProfiler._timeit(name="max")
     def _get_max(
         self,
-        df_series: pd.Series,
+        df_series: pd.Series | np.ndarray,
         prev_dependent_properties: dict,
         subset_properties: dict,
     ) -> None:
-        max_value = df_series.max()
-        self.max = max_value if not self.max else max(self.max, max_value)
+        if self._greater_than_64_bit:
+            max_value = np.max(df_series)
+            self.max = max_value if not self.max else max(self.max, max_value)
+        else:
+            df_series = pl.from_pandas(df_series)
+            max_value = df_series.max()
+            if self.max is not None:
+                max_value = type(self.max)(max_value)
+            self.max = np.float64(
+                max_value if not self.max else max(self.max, max_value)
+            )
         subset_properties["max"] = max_value

     @BaseColumnProfiler._timeit(name="sum")
     def _get_sum(
         self,
-        df_series: pd.Series,
+        df_series: pd.Series | np.ndarray,
         prev_dependent_properties: dict,
         subset_properties: dict,
     ) -> None:
         if np.isinf(self.sum) or (np.isnan(self.sum) and self.match_count > 0):
             return
-
-        sum_value = df_series.sum()
-        if np.isinf(sum_value) or (len(df_series) > 0 and np.isnan(sum_value)):
-            warnings.warn(
-                "Infinite or invalid values found in data. "
-                "Future statistics (mean, variance, skewness, kurtosis) "
-                "will not be computed.",
-                RuntimeWarning,
-            )
+        if self._greater_than_64_bit:
+            sum_value = np.sum(df_series)
+            if len(df_series) > 0 and sum_value == np.nan:
+                warnings.warn(
+                    "Infinite or invalid values found in data. "
+                    "Future statistics (mean, variance, skewness, kurtosis) "
+                    "will not be computed.",
+                    RuntimeWarning,
+                )
+        else:
+            df_series = pl.from_pandas(df_series)
+            sum_value = df_series.sum()
+            if np.isinf(sum_value) or (len(df_series) > 0 and np.isnan(sum_value)):
+                warnings.warn(
+                    "Infinite or invalid values found in data. "
+                    "Future statistics (mean, variance, skewness, kurtosis) "
+                    "will not be computed.",
+                    RuntimeWarning,
+                )
         subset_properties["sum"] = sum_value
         self.sum = self.sum + sum_value
@@ -1809,7 +1870,7 @@ def _get_sum(
     @BaseColumnProfiler._timeit(name="variance")
     def _get_variance(
         self,
-        df_series: pd.Series,
+        df_series: pd.Series | np.ndarray,
         prev_dependent_properties: dict,
         subset_properties: dict,
     ) -> None:
@@ -1817,11 +1878,11 @@ def _get_variance(
             np.isnan(self._biased_variance) and self.match_count > 0
         ):
             return
-
-        # Suppress any numpy warnings as we have a custom warning for invalid
-        # or infinite data already
-        with np.errstate(all="ignore"):
-            batch_biased_variance = np.var(df_series)  # Obtains biased variance
+        if self._greater_than_64_bit:
+            batch_biased_variance = np.var(df_series)
+        else:
+            df_series = pl.from_pandas(df_series)
+            batch_biased_variance = np.var([df_series])
         subset_properties["biased_variance"] = batch_biased_variance
         sum_value = subset_properties["sum"]
         batch_count = subset_properties["match_count"]
@@ -1839,7 +1900,7 @@ def _get_variance(
     @BaseColumnProfiler._timeit(name="skewness")
     def _get_skewness(
         self,
-        df_series: pd.Series,
+        df_series: pd.Series | np.ndarray,
         prev_dependent_properties: dict,
         subset_properties: dict,
     ) -> None:
@@ -1883,7 +1944,7 @@ def _get_skewness(
     @BaseColumnProfiler._timeit(name="kurtosis")
     def _get_kurtosis(
         self,
-        df_series: pd.Series,
+        df_series: pd.Series | np.ndarray,
         prev_dependent_properties: dict,
         subset_properties: dict,
     ) -> None:
@@ -1930,7 +1991,7 @@ def _get_kurtosis(
     @BaseColumnProfiler._timeit(name="histogram_and_quantiles")
     def _get_histogram_and_quantiles(
         self,
-        df_series: pd.Series,
+        df_series: pd.Series | np.ndarray,
         prev_dependent_properties: dict,
         subset_properties: dict,
     ) -> None:
@@ -1948,7 +2009,7 @@ def _get_histogram_and_quantiles(
     @BaseColumnProfiler._timeit(name="num_zeros")
     def _get_num_zeros(
         self,
-        df_series: pd.Series,
+        df_series: pd.Series | np.ndarray,
         prev_dependent_properties: dict,
         subset_properties: dict,
     ) -> None:
@@ -1963,6 +2024,8 @@ def _get_num_zeros(
         :type subset_properties: dict
         :return: None
         """
+        if not self._greater_than_64_bit:
+            df_series = pl.from_pandas(df_series)
         num_zeros_value = (df_series == 0).sum()
         subset_properties["num_zeros"] = num_zeros_value
         self.num_zeros = self.num_zeros + num_zeros_value
@@ -1970,7 +2033,7 @@ def _get_num_zeros(
     @BaseColumnProfiler._timeit(name="num_negatives")
     def _get_num_negatives(
         self,
-        df_series: pd.Series,
+        df_series: pd.Series | np.ndarray,
         prev_dependent_properties: dict,
         subset_properties: dict,
     ) -> None:
@@ -1985,6 +2048,8 @@ def _get_num_negatives(
         :type subset_properties: dict
         :return: None
         """
+        if not self._greater_than_64_bit:
+            df_series = pl.from_pandas(df_series)
         num_negatives_value = (df_series < 0).sum()
         subset_properties["num_negatives"] = num_negatives_value
         self.num_negatives = self.num_negatives + num_negatives_value
diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py
index d79fdd641..c92fc5cd8 100644
--- a/dataprofiler/tests/profilers/test_float_column_profile.py
+++ b/dataprofiler/tests/profilers/test_float_column_profile.py
@@ -1792,6 +1792,7 @@ def test_json_encode(self):
             "num_zeros": 0,
             "num_negatives": 0,
             "_num_quantiles": 1000,
+            "_greater_than_64_bit": False,
             "histogram_methods": expected_historam_methods,
             "_stored_histogram": {
                 "total_loss": 0.0,
@@ -1890,6 +1891,7 @@ def test_json_encode_after_update(self, time):
             "num_zeros": 1,
             "num_negatives": 0,
             "_num_quantiles": 4,
+            "_greater_than_64_bit": False,
             "histogram_methods": expected_historam_methods,
             "_stored_histogram": {
                 "total_loss": 2.0,
diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py
index 961b33c8c..718348cf8 100644
--- a/dataprofiler/tests/profilers/test_int_column_profile.py
+++ b/dataprofiler/tests/profilers/test_int_column_profile.py
@@ -1151,6 +1151,7 @@ def test_json_encode(self):
             "num_zeros": 0,
             "num_negatives": 0,
             "_num_quantiles": 1000,
+            "_greater_than_64_bit": False,
             "histogram_methods": expected_historam_methods,
             "_stored_histogram": {
                 "total_loss": 0.0,
@@ -1233,6 +1234,7 @@ def test_json_encode_after_update(self, time):
             "num_zeros": 1,
             "num_negatives": 0,
             "_num_quantiles": 1000,
+            "_greater_than_64_bit": False,
             "histogram_methods": {
                 "custom": {
                     "total_loss": 0.0,
diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
index e112781ab..7b4d2ccc3 100644
--- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
+++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
@@ -1234,6 +1234,7 @@ def test_json_encode(self):
             "num_zeros": 0,
             "num_negatives": 0,
             "_num_quantiles": 1000,
+            "_greater_than_64_bit": False,
             "histogram_methods": expected_historam_methods,
             "_stored_histogram": {
                 "total_loss": 0.0,
diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py
index 12fb1d27b..61a54afe9 100644
--- a/dataprofiler/tests/profilers/test_text_column_profile.py
+++ b/dataprofiler/tests/profilers/test_text_column_profile.py
@@ -671,6 +671,7 @@ def test_json_encode_after_update(self, time):
             "num_zeros": 0,
             "num_negatives": 0,
             "_num_quantiles": 1000,
+            "_greater_than_64_bit": False,
             "histogram_methods": {
                 "custom": {
                     "total_loss": 0.0,
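[Note on PATCH 5/6] The new `_greater_than_64_bit` flag exists because integers beyond the int64 range only survive coercion as Python objects, so the polars fast path (which casts to `pl.Float64`) is skipped for them in favor of a numpy fallback. A small sketch of the detection idea used in `_update_helper` (the sample values are illustrative):

    import pandas as pd

    # Values past int64's ~9.2e18 ceiling stay object dtype after
    # elementwise pd.to_numeric, which is exactly what the profiler checks.
    big = pd.Series([2**70, 2**80])
    print(big.apply(pd.to_numeric, errors="coerce").dtype)    # object
    small = pd.Series([1, 2, 3])
    print(small.apply(pd.to_numeric, errors="coerce").dtype)  # int64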
From 1766ea6b2928835c8c7f94871434cf73e4ef13d2 Mon Sep 17 00:00:00 2001
From: Andrew <64439232+atl1502@users.noreply.github.com>
Date: Wed, 21 Feb 2024 14:50:37 -0600
Subject: [PATCH 6/6] Profiler utils update (#1092)

* update profiler utils

* finish updates
---
 dataprofiler/profilers/numerical_column_stats.py |  8 ++++++++
 dataprofiler/profilers/profiler_utils.py         | 10 +++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
index 74c24e213..7fe05aeea 100644
--- a/dataprofiler/profilers/numerical_column_stats.py
+++ b/dataprofiler/profilers/numerical_column_stats.py
@@ -1924,6 +1924,10 @@ def _get_skewness(
         ):
             return

+        if self._greater_than_64_bit and type(df_series) is pd.Series:
+            df_series = df_series.to_numpy(dtype=float)
+        else:
+            df_series = pl.from_pandas(df_series, nan_to_null=False)
         batch_biased_skewness = profiler_utils.biased_skew(df_series)
         subset_properties["biased_skewness"] = batch_biased_skewness
         batch_count = subset_properties["match_count"]
@@ -1968,6 +1972,10 @@ def _get_kurtosis(
         ):
             return

+        if self._greater_than_64_bit and type(df_series) is pd.Series:
+            df_series = df_series.to_numpy(dtype=float)
+        else:
+            df_series = pl.from_pandas(df_series, nan_to_null=False)
         batch_biased_kurtosis = profiler_utils.biased_kurt(df_series)
         subset_properties["biased_kurtosis"] = batch_biased_kurtosis
         batch_count = subset_properties["match_count"]
diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py
index e38e1b041..a81dca7a5 100644
--- a/dataprofiler/profilers/profiler_utils.py
+++ b/dataprofiler/profilers/profiler_utils.py
@@ -26,9 +26,11 @@
 )

 import numpy as np
+import polars as pl
 import psutil
 import scipy
-from pandas import DataFrame, Series
+from pandas import DataFrame
+from polars import Series

 from ..labelers.data_labelers import DataLabeler
@@ -320,7 +322,7 @@ def add_nested_dictionaries(first_dict: dict, second_dict: dict) -> dict:
     return merged_dict


-def biased_skew(df_series: Series) -> np.float64:
+def biased_skew(df_series: Series | np.ndarray) -> np.float64:
     """
     Calculate the biased estimator for skewness of the given data.

@@ -358,7 +360,7 @@
     return skew


-def biased_kurt(df_series: Series) -> np.float64:
+def biased_kurt(df_series: Series | np.ndarray) -> np.float64:
     """
     Calculate the biased estimator for kurtosis of the given data.

@@ -675,6 +677,8 @@ def get_memory_size(data: list | np.ndarray | DataFrame, unit: str = "M") -> float:
     :type unit: string
     :return: memory size of the input data
     """
+    if type(data) is DataFrame:
+        data = pl.from_pandas(data)
     unit_map: dict = collections.defaultdict(B=0, K=1, M=2, G=3)
     if unit not in unit_map:
         raise ValueError(