Commit

update order col
atl1502 committed Mar 24, 2024
1 parent c176d53 commit 16575a2
Showing 2 changed files with 59 additions and 58 deletions.
52 changes: 26 additions & 26 deletions dataprofiler/profilers/order_column_profile.py
@@ -4,8 +4,8 @@
from abc import abstractmethod
from typing import Protocol, Type, TypeVar, cast

- import numpy as np
- from pandas import DataFrame, Series
+ import polars as pl
+ from polars import Series

from . import profiler_utils
from .base_column_profilers import BaseColumnProfiler
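The import swap above replaces the pandas/NumPy stack with polars throughout this module. A minimal, illustrative comparison of the two Series types (not part of the commit):

```python
import pandas as pd
import polars as pl

# The same column of raw values in each library's Series type.
pd_series = pd.Series(["1", "2", "3"])
pl_series = pl.Series(["1", "2", "3"])

print(pd_series.dtype)  # object
print(pl_series.dtype)  # String (called Utf8 in older polars releases)
```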
@@ -24,7 +24,7 @@ def __lt__(self: CT, other: CT) -> bool:
CT = TypeVar("CT", bound=Comparable)

# bc type in class attr causing issues, need to alias
- AliasFloatType = Type[np.float64]
+ AliasFloatType = Type[pl.Float64]
AliasStrType = Type[str]
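Note that pl.Float64 is a polars DataType class rather than a callable scalar type like np.float64, so the alias now serves purely as a type marker. A small sketch of the difference (illustrative only, not part of the commit):

```python
import numpy as np
import polars as pl

print(np.float64("3.5"))  # 3.5 -- np.float64 doubles as a converter
print(pl.Float64)         # Float64 -- a polars DataType class, not a converter

# Identity checks against the marker still work the same way.
data_store_type = pl.Float64
print(data_store_type is pl.Float64)  # True
```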


@@ -51,9 +51,9 @@ def __init__(self, name: str | None, options: OrderOptions = None) -> None:
"OrderColumn parameter 'options' must be of type" " OrderOptions."
)
self.order: str | None = None
- self._last_value: np.float64 | float | str | None = None
- self._first_value: np.float64 | float | str | None = None
- self._data_store_type: AliasStrType | AliasFloatType = np.float64
+ self._last_value: pl.Float64 | float | str | None = None
+ self._first_value: pl.Float64 | float | str | None = None
+ self._data_store_type: AliasStrType | AliasFloatType = pl.Float64
self._piecewise: bool | None = False
self.__calculations: dict = {}
self._filter_properties_w_options(self.__calculations, options)
@@ -161,23 +161,23 @@ def _merge_order(
:type first_value1: Float | String
:type last_value1: Float | String
:type piecewise1: Boolean
- :type data_store_type1: Type[str] | Type[np.float64]
+ :type data_store_type1: Type[str] | Type[pl.Float64]
:type order2: String
:type first_value2: Float | String
:type last_value2: Float | String
- :type data_store_type2: Type[str] | Type[np.float64]
+ :type data_store_type2: Type[str] | Type[pl.Float64]
:type piecewise2: Boolean
:return: order, first_value, last_value, piecewise, merged_data_store_type
:rtype: String, Float | String, Float | String, Boolean, Type[str]
-     | Type[np.float64]
+     | Type[pl.Float64]
"""
# Return either order if one is None
if not order1:
return order2, first_value2, last_value2, piecewise2, data_store_type2
elif not order2:
return order1, first_value1, last_value1, piecewise1, data_store_type1

- merged_data_store_type: AliasStrType | AliasFloatType = np.float64
+ merged_data_store_type: AliasStrType | AliasFloatType = pl.Float64
if data_store_type1 is str or data_store_type2 is str:
first_value1 = cast(CT, str(first_value1))
last_value1 = cast(CT, str(last_value1))
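A condensed sketch of the merge rule in the hunk above: if either profile stored its endpoints as strings, the merged profile falls back to string storage; otherwise it keeps the new pl.Float64 marker. The helper name below is illustrative, not part of the library API:

```python
import polars as pl

def merge_store_type(data_store_type1, data_store_type2):
    """Pick the common endpoint storage type for two merged order profiles."""
    if data_store_type1 is str or data_store_type2 is str:
        return str
    return pl.Float64

print(merge_store_type(pl.Float64, str))         # <class 'str'>
print(merge_store_type(pl.Float64, pl.Float64))  # Float64
```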
@@ -329,13 +329,13 @@ def load_from_dict(cls, data, config: dict | None = None):
"""
# This is an ambiguous call to super classes.
data["_data_store_type"] = (
- str if data["_data_store_type"] == "str" else np.float64
+ str if data["_data_store_type"] == "str" else pl.Float64
)
profile = super().load_from_dict(data)
try:
- if profile.sample_size and profile._data_store_type is np.float64:
-     profile._first_value = np.float64(profile._first_value)
-     profile._last_value = np.float64(profile._last_value)
+ if profile.sample_size and profile._data_store_type is pl.Float64:
+     profile._first_value = profile._first_value
+     profile._last_value = profile._last_value
except ValueError:
profile._first_value = data["_first_value"]
profile._last_value = data["_last_value"]
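For deserialization, the stored marker is mapped back from its string form ("str" versus the dtype's name, which for polars is capitalized "Float64"). A rough sketch of that mapping, assuming the encoder writes the type name (the helper below is illustrative, not the library's code):

```python
import numpy as np
import polars as pl

# Class names explain the fixture change later in the tests:
print(np.float64.__name__)  # float64
print(pl.Float64.__name__)  # Float64

def decode_data_store_type(serialized: str):
    """Illustrative decoder: map the serialized marker back to a type."""
    return str if serialized == "str" else pl.Float64

print(decode_data_store_type("str"))      # <class 'str'>
print(decode_data_store_type("Float64"))  # Float64
```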
@@ -379,23 +379,23 @@ def _get_data_order(
Additionally, return the first and last value of the series.
:param df_series: a given column
- :type df_series: pandas.core.series.Series
+ :type df_series: polars.series.series.Series
:param data_store_type: type of value for first_value and last_value
- :type data_store_type: Type[str] | Type[np.float64]
+ :type data_store_type: Type[str] | Type[pl.Float64]
:return: order, first_value, last_value, data_store_type
- :rtype: String, Float, Float, type, Type[str] | Type[np.float64]
+ :rtype: String, Float, Float, type, Type[str] | Type[pl.Float64]
"""
try:
if data_store_type is not str:
-     df_series = df_series.astype(float)
- except ValueError:
+     df_series = df_series.cast(pl.Float64)
+ except pl.exceptions.ComputeError:
data_store_type = str

order = None
- last_value = df_series.iloc[0]
- first_value = df_series.iloc[0]
+ last_value = df_series[0]
+ first_value = df_series[0]

- for value in df_series.values:
+ for value in df_series:
if value < last_value and order == "ascending":
order = "random"
break
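The hunk above swaps each pandas idiom for its polars counterpart: Series.cast() instead of astype(), a polars exception instead of ValueError when the numeric cast fails, plain indexing instead of .iloc[0], and direct iteration instead of .values. A self-contained re-creation of that pattern (illustrative, not the library code):

```python
import polars as pl

def data_order(values):
    """Detect whether a column is ascending, descending, constant, or random."""
    s = pl.Series(values)
    try:
        s = s.cast(pl.Float64)  # replaces the old pandas astype(float)
    except (pl.exceptions.ComputeError, pl.exceptions.InvalidOperationError):
        # The commit catches ComputeError; newer polars releases may raise
        # InvalidOperationError for a failed str -> float cast instead.
        pass

    order = None
    last_value = s[0]           # plain indexing replaces .iloc[0]
    for value in s:             # iterate the Series directly instead of .values
        if value < last_value and order == "ascending":
            return "random"
        elif value > last_value and order == "descending":
            return "random"
        elif value > last_value:
            order = "ascending"
        elif value < last_value:
            order = "descending"
        last_value = value
    return order or "constant value"

print(data_order(["1", "3", "9"]))   # ascending
print(data_order(["c", "b", "a"]))   # descending
```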
@@ -414,7 +414,7 @@ def _get_data_order(

def _update_order(
self,
- df_series: DataFrame,
+ df_series: Series,
prev_dependent_properties: dict = None,
subset_properties: dict = None,
) -> None:
@@ -427,7 +427,7 @@ def _update_order(
order information.
:param df_series: Data to be profiled
- :type df_series: pandas.DataFrame
+ :type df_series: polars.DataFrame
:param prev_dependent_properties: Contains all the previous properties
that the calculations depend on.
:type prev_dependent_properties: dict
@@ -466,7 +466,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
Update col profile properties with clean dataset and its known null parameters.
:param df_series_clean: df series with nulls removed
- :type df_series_clean: pandas.core.series.Series
+ :type df_series_clean: polars.series.series.Series
:param profile: ordered profile
:type profile: dict
:return: None
@@ -478,7 +478,7 @@ def update(self, df_series: Series) -> OrderColumn:
Update the column profile.
:param df_series: df series
- :type df_series: pandas.core.series.Series
+ :type df_series: polars.series.series.Series
:return: updated OrderColumn
:rtype: OrderColumn
"""
65 changes: 33 additions & 32 deletions dataprofiler/tests/profilers/test_order_column_profile.py
@@ -6,6 +6,7 @@

import numpy as np
import pandas as pd
+ import polars as pl

from dataprofiler.profilers import OrderColumn
from dataprofiler.profilers.json_decoder import load_column_profile
@@ -22,15 +23,15 @@
class TestOrderColumn(unittest.TestCase):
@staticmethod
def _update_order(data):
- df = pd.Series(data).apply(str)
+ df = pl.Series(data).cast(str)

profiler = OrderColumn(df.name)
profiler.update(df)

return profiler.order

def test_base_case(self):
- data = pd.Series([], dtype=object)
+ data = pl.Series([], dtype=object)
profiler = OrderColumn(data.name)
profiler.update(data)
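The test fixtures change from pandas' element-wise apply(str) to polars' vectorised cast(str); both produce the same string values. A quick equivalence check (illustrative, not part of the commit):

```python
import pandas as pd
import polars as pl

data = [1, 2, 3]

pd_fixture = pd.Series(data).apply(str)   # old fixture: element-wise str()
pl_fixture = pl.Series(data).cast(str)    # new fixture: vectorised cast to string

print(pd_fixture.tolist())   # ['1', '2', '3']
print(pl_fixture.to_list())  # ['1', '2', '3']
```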

@@ -75,31 +76,31 @@ def test_random(self):

def test_batch_updates(self):
data = ["a", "a", "a"]
- df = pd.Series(data)
+ df = pl.Series(data)
profiler = OrderColumn(df.name)
profiler.update(df)
self.assertEqual(profiler.order, "constant value")

data = ["a", "b", "c"]
- df = pd.Series(data)
+ df = pl.Series(data)
profiler.update(df)
self.assertEqual(profiler.order, "ascending")

# previous was ascending, should stay ascending bc now receiving const
data = ["c", "c", "c"]
- df = pd.Series(data)
+ df = pl.Series(data)
profiler.update(df)
self.assertEqual(profiler.order, "ascending")

# previous was ascending, should be random now receiving descending
data = ["c", "b", "a"]
- df = pd.Series(data)
+ df = pl.Series(data)
profiler.update(df)
self.assertEqual(profiler.order, "random")
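A usage sketch of the batch-update behaviour exercised above, assuming a dataprofiler build that includes this commit (so OrderColumn.update accepts a polars Series); the column name is made up for illustration:

```python
import polars as pl
from dataprofiler.profilers import OrderColumn

profiler = OrderColumn("example_col")
profiler.update(pl.Series(["a", "a", "a"]))
print(profiler.order)  # constant value

profiler.update(pl.Series(["a", "b", "c"]))
print(profiler.order)  # ascending
```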

def test_profile(self):
data = [1]
- df = pd.Series(data).apply(str)
+ df = pl.Series(data).cast(str)

profiler = OrderColumn(df.name)

@@ -114,7 +115,7 @@

def test_report(self):
data = [1]
- df = pd.Series(data).apply(str)
+ df = pl.Series(data).cast(str)

profile = OrderColumn(df.name)

@@ -126,58 +127,58 @@

def test_profile_merge(self):
data = [1, 2, 3, 4, 5, 6]
- df = pd.Series(data).apply(str)
+ df = pl.Series(data).cast(str)
profiler = OrderColumn("placeholder_name")
profiler.update(df)

data2 = [7, 8, 9, 10]
- df2 = pd.Series(data2).apply(str)
+ df2 = pl.Series(data2).cast(str)
profiler2 = OrderColumn("placeholder_name")
profiler2.update(df2)

data3 = [2, 3, 4]
- df3 = pd.Series(data3).apply(str)
+ df3 = pl.Series(data3).cast(str)
profiler3 = OrderColumn("placeholder_name")
profiler3.update(df3)

data4 = [3, 3, 3, 3]
- df4 = pd.Series(data4).apply(str)
+ df4 = pl.Series(data4).cast(str)
profiler4 = OrderColumn("placeholder_name")
profiler4.update(df4)

data5 = [4, 2, 3, 1, 5]
- df5 = pd.Series(data5).apply(str)
+ df5 = pl.Series(data5).cast(str)
profiler5 = OrderColumn("placeholder_name")
profiler5.update(df5)

data6 = [10, 9, 8, 7]
- df6 = pd.Series(data6).apply(str)
+ df6 = pl.Series(data6).cast(str)
profiler6 = OrderColumn("placeholder_name")
profiler6.update(df6)

data7 = [3, 3, 3]
- df7 = pd.Series(data7).apply(str)
+ df7 = pl.Series(data7).cast(str)
profiler7 = OrderColumn("placeholder_name")
profiler7.update(df7)

data8 = [7, 7, 7, 7, 7, 7, 7]
- df8 = pd.Series(data8).apply(str)
+ df8 = pl.Series(data8).cast(str)
profiler8 = OrderColumn("placeholder_name")
profiler8.update(df8)

data9 = [7, 6, 5, 4, 3]
- df9 = pd.Series(data9).apply(str)
+ df9 = pl.Series(data9).cast(str)
profiler9 = OrderColumn("placeholder_name")
profiler9.update(df9)

data10 = [1, 5, 6]
- df10 = pd.Series(data10).apply(str)
+ df10 = pl.Series(data10).cast(str)
profiler10 = OrderColumn("placeholder_name")
profiler10.update(df10)
profiler10._piecewise = True

- data11 = pd.Series([], dtype=object)
- df11 = pd.Series(data11).apply(str)
+ data11 = pl.Series([], dtype=object)
+ df11 = pl.Series(data11).cast(str)
profiler11 = OrderColumn("placeholder_name")
profiler11.update(df11)

@@ -322,7 +323,7 @@ def test_merge_timing(self):
def test_random_order_prevents_update_from_occuring(self, mock_get_data_order):
mock_get_data_order.return_value = ["random", 1, 2, str]
data = ["a", "b", "ab"]
- df = pd.Series(data).apply(str)
+ df = pl.Series(data).cast(str)

# Assert the order is random
profiler = OrderColumn(df.name)
@@ -342,20 +343,20 @@ def test_order_column_with_wrong_options(self):

def test_diff(self):
data = [1, 2, 3, 4, 5, 6]
- df = pd.Series(data).apply(str)
+ df = pl.Series(data).cast(str)
profiler = OrderColumn("placeholder_name")
profiler.update(df)

data2 = [7, 8, 9, 10]
- df2 = pd.Series(data2).apply(str)
+ df2 = pl.Series(data2).cast(str)
profiler2 = OrderColumn("placeholder_name")
profiler2.update(df2)

diff = profiler.diff(profiler2)
self.assertEqual("unchanged", diff["order"])

data3 = [4, 2, 3, 1, 5]
- df3 = pd.Series(data3).apply(str)
+ df3 = pl.Series(data3).cast(str)
profiler3 = OrderColumn("placeholder_name")
profiler3.update(df3)

@@ -373,7 +374,7 @@ def test_json_encode(self):
"order": None,
"_last_value": None,
"_first_value": None,
- "_data_store_type": "float64",
+ "_data_store_type": "Float64",
"_piecewise": False,
"_OrderColumn__calculations": dict(),
"name": "0",
@@ -391,7 +392,7 @@
def test_json_encode_after_update(self):
profile = OrderColumn("0")

- df_order = pd.Series(["za", "z", "c", "a"])
+ df_order = pl.Series(["za", "z", "c", "a"])
with patch("time.time", side_effect=lambda: 0.0):
profile.update(df_order)

@@ -431,7 +432,7 @@ def test_json_decode_after_update_str(self):
fake_profile_name = "Fake profile name"

# Build expected orderColumn
- df_order = pd.Series(["za", "z", "c", "c"])
+ df_order = pl.Series(["za", "z", "c", "c"])
expected_profile = OrderColumn(fake_profile_name)

with utils.mock_timeit():
@@ -444,7 +445,7 @@

# Adding data to update that is in descending order
# (consistent with previous data)
- df_order = pd.Series(
+ df_order = pl.Series(
[
"c", # add existing
"a", # add new
@@ -460,7 +461,7 @@

# Adding data to update that is in random order
# (not consistent with previous data)
- df_order = pd.Series(
+ df_order = pl.Series(
[
"c", # add existing
"zza", # add new
@@ -476,7 +477,7 @@ def test_json_decode_after_update_num(self):
fake_profile_name = "Fake profile name"

# Build expected orderColumn
- df_order = pd.Series(["1", "4", "6"])
+ df_order = pl.Series(["1", "4", "6"])
expected_profile = OrderColumn(fake_profile_name)

with utils.mock_timeit():
@@ -488,7 +489,7 @@

# Adding data to update that is in descending order
# (consistent with previous data)
- df_order = pd.Series(
+ df_order = pl.Series(
[
"6", # add existing
"9", # add new
@@ -503,7 +504,7 @@

# Adding data to update that is in random order
# (not consistent with previous data)
- df_order = pd.Series(
+ df_order = pl.Series(
[
"3", # add existing
"1", # add new
