diff --git a/dataprofiler/profilers/order_column_profile.py b/dataprofiler/profilers/order_column_profile.py index 30826232..c01ee124 100644 --- a/dataprofiler/profilers/order_column_profile.py +++ b/dataprofiler/profilers/order_column_profile.py @@ -4,8 +4,9 @@ from abc import abstractmethod from typing import Protocol, Type, TypeVar, cast -import numpy as np -from pandas import DataFrame, Series +import pandas as pd +import polars as pl +from polars import Series from . import profiler_utils from .base_column_profilers import BaseColumnProfiler @@ -24,7 +25,7 @@ def __lt__(self: CT, other: CT) -> bool: CT = TypeVar("CT", bound=Comparable) # bc type in class attr causing issues, need to alias -AliasFloatType = Type[np.float64] +AliasFloatType = Type[pl.Float64] AliasStrType = Type[str] @@ -51,9 +52,9 @@ def __init__(self, name: str | None, options: OrderOptions = None) -> None: "OrderColumn parameter 'options' must be of type" " OrderOptions." ) self.order: str | None = None - self._last_value: np.float64 | float | str | None = None - self._first_value: np.float64 | float | str | None = None - self._data_store_type: AliasStrType | AliasFloatType = np.float64 + self._last_value: pl.Float64 | float | str | None = None + self._first_value: pl.Float64 | float | str | None = None + self._data_store_type: AliasStrType | AliasFloatType = pl.Float64 self._piecewise: bool | None = False self.__calculations: dict = {} self._filter_properties_w_options(self.__calculations, options) @@ -161,15 +162,15 @@ def _merge_order( :type first_value1: Float | String :type last_value1: Float | String :type piecewise1: Boolean - :type data_store_type1: Type[str] | Type[np.float64] + :type data_store_type1: Type[str] | Type[pl.Float64] :type order2: String :type first_value2: Float | String :type last_value2: Float | String - :type data_store_type2: Type[str] | Type[np.float64] + :type data_store_type2: Type[str] | Type[pl.Float64] :type piecewise2: Boolean :return: order, first_value, 
last_value, piecewise, merged_data_store_type :rtype: String, Float | String, Float | String, Boolean, Type[str] - | Type[np.float64] + | Type[pl.Float64] """ # Return either order if one is None if not order1: @@ -177,7 +178,7 @@ def _merge_order( elif not order2: return order1, first_value1, last_value1, piecewise1, data_store_type1 - merged_data_store_type: AliasStrType | AliasFloatType = np.float64 + merged_data_store_type: AliasStrType | AliasFloatType = pl.Float64 if data_store_type1 is str or data_store_type2 is str: first_value1 = cast(CT, str(first_value1)) last_value1 = cast(CT, str(last_value1)) @@ -329,13 +330,13 @@ def load_from_dict(cls, data, config: dict | None = None): """ # This is an ambiguous call to super classes. data["_data_store_type"] = ( - str if data["_data_store_type"] == "str" else np.float64 + str if data["_data_store_type"] == "str" else pl.Float64 ) profile = super().load_from_dict(data) try: - if profile.sample_size and profile._data_store_type is np.float64: - profile._first_value = np.float64(profile._first_value) - profile._last_value = np.float64(profile._last_value) + if profile.sample_size and profile._data_store_type is pl.Float64: + profile._first_value = profile._first_value + profile._last_value = profile._last_value except ValueError: profile._first_value = data["_first_value"] profile._last_value = data["_last_value"] @@ -379,23 +380,23 @@ def _get_data_order( Additionally, return the first and last value of the series. 
:param df_series: a given column - :type df_series: pandas.core.series.Series + :type df_series: polars.Series :param data_store_type: type of value for first_value and last_value - :type data_store_type: Type[str] | Type[np.float64] + :type data_store_type: Type[str] | Type[pl.Float64] :return: order, first_value, last_value, data_store_type - :rtype: String, Float, Float, type, Type[str] | Type[np.float64] + :rtype: String, Float, Float, type, Type[str] | Type[pl.Float64] """ try: if data_store_type is not str: - df_series = df_series.astype(float) - except ValueError: + df_series = df_series.cast(pl.Float64) + except pl.exceptions.ComputeError: data_store_type = str order = None - last_value = df_series.iloc[0] - first_value = df_series.iloc[0] + last_value = df_series[0] + first_value = df_series[0] - for value in df_series.values: + for value in df_series: if value < last_value and order == "ascending": order = "random" break @@ -414,7 +415,7 @@ def _get_data_order( def _update_order( self, - df_series: DataFrame, + df_series: Series, prev_dependent_properties: dict = None, subset_properties: dict = None, ) -> None: @@ -427,7 +428,7 @@ def _update_order( order information. :param df_series: Data to be profiled - :type df_series: pandas.DataFrame + :type df_series: polars.Series :param prev_dependent_properties: Contains all the previous properties that the calculations depend on. :type prev_dependent_properties: dict @@ -466,7 +467,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None: Update col profile properties with clean dataset and its known null parameters. :param df_series_clean: df series with nulls removed - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.Series :param profile: ordered profile :type profile: dict :return: None @@ -478,10 +479,13 @@ def update(self, df_series: Series) -> OrderColumn: Update the column profile. 
:param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.Series :return: updated OrderColumn :rtype: OrderColumn """ + # TODO remove once the profiler builder is updated + if type(df_series) == pd.Series: + df_series = pl.from_pandas(df_series) # type: ignore if len(df_series) == 0: return self diff --git a/dataprofiler/tests/profilers/test_order_column_profile.py b/dataprofiler/tests/profilers/test_order_column_profile.py index aefb0288..7bd55b2a 100644 --- a/dataprofiler/tests/profilers/test_order_column_profile.py +++ b/dataprofiler/tests/profilers/test_order_column_profile.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import polars as pl from dataprofiler.profilers import OrderColumn from dataprofiler.profilers.json_decoder import load_column_profile @@ -22,7 +23,7 @@ class TestOrderColumn(unittest.TestCase): @staticmethod def _update_order(data): - df = pd.Series(data).apply(str) + df = pl.Series(data).cast(str) profiler = OrderColumn(df.name) profiler.update(df) @@ -30,7 +31,7 @@ def _update_order(data): return profiler.order def test_base_case(self): - data = pd.Series([], dtype=object) + data = pl.Series([], dtype=object) profiler = OrderColumn(data.name) profiler.update(data) @@ -75,31 +76,31 @@ def test_random(self): def test_batch_updates(self): data = ["a", "a", "a"] - df = pd.Series(data) + df = pl.Series(data) profiler = OrderColumn(df.name) profiler.update(df) self.assertEqual(profiler.order, "constant value") data = ["a", "b", "c"] - df = pd.Series(data) + df = pl.Series(data) profiler.update(df) self.assertEqual(profiler.order, "ascending") # previous was ascending, should stay ascending bc now receiving const data = ["c", "c", "c"] - df = pd.Series(data) + df = pl.Series(data) profiler.update(df) self.assertEqual(profiler.order, "ascending") # previous was ascending, should be random now receiving descending data = ["c", "b", "a"] - df = pd.Series(data) + df = pl.Series(data) profiler.update(df) 
self.assertEqual(profiler.order, "random") def test_profile(self): data = [1] - df = pd.Series(data).apply(str) + df = pl.Series(data).cast(str) profiler = OrderColumn(df.name) @@ -114,7 +115,7 @@ def test_profile(self): def test_report(self): data = [1] - df = pd.Series(data).apply(str) + df = pl.Series(data).cast(str) profile = OrderColumn(df.name) @@ -126,58 +127,58 @@ def test_report(self): def test_profile_merge(self): data = [1, 2, 3, 4, 5, 6] - df = pd.Series(data).apply(str) + df = pl.Series(data).cast(str) profiler = OrderColumn("placeholder_name") profiler.update(df) data2 = [7, 8, 9, 10] - df2 = pd.Series(data2).apply(str) + df2 = pl.Series(data2).cast(str) profiler2 = OrderColumn("placeholder_name") profiler2.update(df2) data3 = [2, 3, 4] - df3 = pd.Series(data3).apply(str) + df3 = pl.Series(data3).cast(str) profiler3 = OrderColumn("placeholder_name") profiler3.update(df3) data4 = [3, 3, 3, 3] - df4 = pd.Series(data4).apply(str) + df4 = pl.Series(data4).cast(str) profiler4 = OrderColumn("placeholder_name") profiler4.update(df4) data5 = [4, 2, 3, 1, 5] - df5 = pd.Series(data5).apply(str) + df5 = pl.Series(data5).cast(str) profiler5 = OrderColumn("placeholder_name") profiler5.update(df5) data6 = [10, 9, 8, 7] - df6 = pd.Series(data6).apply(str) + df6 = pl.Series(data6).cast(str) profiler6 = OrderColumn("placeholder_name") profiler6.update(df6) data7 = [3, 3, 3] - df7 = pd.Series(data7).apply(str) + df7 = pl.Series(data7).cast(str) profiler7 = OrderColumn("placeholder_name") profiler7.update(df7) data8 = [7, 7, 7, 7, 7, 7, 7] - df8 = pd.Series(data8).apply(str) + df8 = pl.Series(data8).cast(str) profiler8 = OrderColumn("placeholder_name") profiler8.update(df8) data9 = [7, 6, 5, 4, 3] - df9 = pd.Series(data9).apply(str) + df9 = pl.Series(data9).cast(str) profiler9 = OrderColumn("placeholder_name") profiler9.update(df9) data10 = [1, 5, 6] - df10 = pd.Series(data10).apply(str) + df10 = pl.Series(data10).cast(str) profiler10 = OrderColumn("placeholder_name") 
profiler10.update(df10) profiler10._piecewise = True - data11 = pd.Series([], dtype=object) - df11 = pd.Series(data11).apply(str) + data11 = pl.Series([], dtype=object) + df11 = pl.Series(data11).cast(str) profiler11 = OrderColumn("placeholder_name") profiler11.update(df11) @@ -322,7 +323,7 @@ def test_merge_timing(self): def test_random_order_prevents_update_from_occuring(self, mock_get_data_order): mock_get_data_order.return_value = ["random", 1, 2, str] data = ["a", "b", "ab"] - df = pd.Series(data).apply(str) + df = pl.Series(data).cast(str) # Assert the order is random profiler = OrderColumn(df.name) @@ -342,12 +343,12 @@ def test_order_column_with_wrong_options(self): def test_diff(self): data = [1, 2, 3, 4, 5, 6] - df = pd.Series(data).apply(str) + df = pl.Series(data).cast(str) profiler = OrderColumn("placeholder_name") profiler.update(df) data2 = [7, 8, 9, 10] - df2 = pd.Series(data2).apply(str) + df2 = pl.Series(data2).cast(str) profiler2 = OrderColumn("placeholder_name") profiler2.update(df2) @@ -355,7 +356,7 @@ def test_diff(self): self.assertEqual("unchanged", diff["order"]) data3 = [4, 2, 3, 1, 5] - df3 = pd.Series(data3).apply(str) + df3 = pl.Series(data3).cast(str) profiler3 = OrderColumn("placeholder_name") profiler3.update(df3) @@ -373,7 +374,7 @@ def test_json_encode(self): "order": None, "_last_value": None, "_first_value": None, - "_data_store_type": "float64", + "_data_store_type": "Float64", "_piecewise": False, "_OrderColumn__calculations": dict(), "name": "0", @@ -391,7 +392,7 @@ def test_json_encode(self): def test_json_encode_after_update(self): profile = OrderColumn("0") - df_order = pd.Series(["za", "z", "c", "a"]) + df_order = pl.Series(["za", "z", "c", "a"]) with patch("time.time", side_effect=lambda: 0.0): profile.update(df_order) @@ -431,7 +432,7 @@ def test_json_decode_after_update_str(self): fake_profile_name = "Fake profile name" # Build expected orderColumn - df_order = pd.Series(["za", "z", "c", "c"]) + df_order = 
pl.Series(["za", "z", "c", "c"]) expected_profile = OrderColumn(fake_profile_name) with utils.mock_timeit(): @@ -444,7 +445,7 @@ def test_json_decode_after_update_str(self): # Adding data to update that is in descending order # (consistent with previous data) - df_order = pd.Series( + df_order = pl.Series( [ "c", # add existing "a", # add new @@ -460,7 +461,7 @@ def test_json_decode_after_update_str(self): # Adding data to update that is in random order # (not consistent with previous data) - df_order = pd.Series( + df_order = pl.Series( [ "c", # add existing "zza", # add new @@ -476,7 +477,7 @@ def test_json_decode_after_update_num(self): fake_profile_name = "Fake profile name" # Build expected orderColumn - df_order = pd.Series(["1", "4", "6"]) + df_order = pl.Series(["1", "4", "6"]) expected_profile = OrderColumn(fake_profile_name) with utils.mock_timeit(): @@ -488,7 +489,7 @@ def test_json_decode_after_update_num(self): # Adding data to update that is in descending order # (consistent with previous data) - df_order = pd.Series( + df_order = pl.Series( [ "6", # add existing "9", # add new @@ -503,7 +504,7 @@ def test_json_decode_after_update_num(self): # Adding data to update that is in random order # (not consistent with previous data) - df_order = pd.Series( + df_order = pl.Series( [ "3", # add existing "1", # add new