Skip to content

Commit

Permalink
update order col (capitalone#1123)
Browse files Browse the repository at this point in the history
* update order col

* fix pandas df in update

* update types in comments
  • Loading branch information
atl1502 authored and abajpai15 committed Apr 15, 2024
1 parent def1285 commit 5110544
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 58 deletions.
56 changes: 30 additions & 26 deletions dataprofiler/profilers/order_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
from abc import abstractmethod
from typing import Protocol, Type, TypeVar, cast

import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import polars as pl
from polars import Series

from . import profiler_utils
from .base_column_profilers import BaseColumnProfiler
Expand All @@ -24,7 +25,7 @@ def __lt__(self: CT, other: CT) -> bool:
CT = TypeVar("CT", bound=Comparable)

# bc type in class attr causing issues, need to alias
AliasFloatType = Type[np.float64]
AliasFloatType = Type[pl.Float64]
AliasStrType = Type[str]


Expand All @@ -51,9 +52,9 @@ def __init__(self, name: str | None, options: OrderOptions = None) -> None:
"OrderColumn parameter 'options' must be of type" " OrderOptions."
)
self.order: str | None = None
self._last_value: np.float64 | float | str | None = None
self._first_value: np.float64 | float | str | None = None
self._data_store_type: AliasStrType | AliasFloatType = np.float64
self._last_value: pl.Float64 | float | str | None = None
self._first_value: pl.Float64 | float | str | None = None
self._data_store_type: AliasStrType | AliasFloatType = pl.Float64
self._piecewise: bool | None = False
self.__calculations: dict = {}
self._filter_properties_w_options(self.__calculations, options)
Expand Down Expand Up @@ -161,23 +162,23 @@ def _merge_order(
:type first_value1: Float | String
:type last_value1: Float | String
:type piecewise1: Boolean
:type data_store_type1: Type[str] | Type[np.float64]
:type data_store_type1: Type[str] | Type[pl.Float64]
:type order2: String
:type first_value2: Float | String
:type last_value2: Float | String
:type data_store_type2: Type[str] | Type[np.float64]
:type data_store_type2: Type[str] | Type[pl.Float64]
:type piecewise2: Boolean
:return: order, first_value, last_value, piecewise, merged_data_store_type
:rtype: String, Float | String, Float | String, Boolean, Type[str]
| Type[np.float64]
| Type[pl.Float64]
"""
# Return either order if one is None
if not order1:
return order2, first_value2, last_value2, piecewise2, data_store_type2
elif not order2:
return order1, first_value1, last_value1, piecewise1, data_store_type1

merged_data_store_type: AliasStrType | AliasFloatType = np.float64
merged_data_store_type: AliasStrType | AliasFloatType = pl.Float64
if data_store_type1 is str or data_store_type2 is str:
first_value1 = cast(CT, str(first_value1))
last_value1 = cast(CT, str(last_value1))
Expand Down Expand Up @@ -329,13 +330,13 @@ def load_from_dict(cls, data, config: dict | None = None):
"""
# This is an ambiguous call to super classes.
data["_data_store_type"] = (
str if data["_data_store_type"] == "str" else np.float64
str if data["_data_store_type"] == "str" else pl.Float64
)
profile = super().load_from_dict(data)
try:
if profile.sample_size and profile._data_store_type is np.float64:
profile._first_value = np.float64(profile._first_value)
profile._last_value = np.float64(profile._last_value)
if profile.sample_size and profile._data_store_type is pl.Float64:
profile._first_value = profile._first_value
profile._last_value = profile._last_value
except ValueError:
profile._first_value = data["_first_value"]
profile._last_value = data["_last_value"]
Expand Down Expand Up @@ -379,23 +380,23 @@ def _get_data_order(
Additionally, return the first and last value of the series.
:param df_series: a given column
:type df_series: pandas.core.series.Series
:type df_series: polars.Series
:param data_store_type: type of value for first_value and last_value
:type data_store_type: Type[str] | Type[np.float64]
:type data_store_type: Type[str] | Type[pl.Float64]
:return: order, first_value, last_value, data_store_type
:rtype: String, Float, Float, type, Type[str] | Type[np.float64]
:rtype: String, Float, Float, type, Type[str] | Type[pl.Float64]
"""
try:
if data_store_type is not str:
df_series = df_series.astype(float)
except ValueError:
df_series = df_series.cast(pl.Float64)
except pl.exceptions.ComputeError:
data_store_type = str

order = None
last_value = df_series.iloc[0]
first_value = df_series.iloc[0]
last_value = df_series[0]
first_value = df_series[0]

for value in df_series.values:
for value in df_series:
if value < last_value and order == "ascending":
order = "random"
break
Expand All @@ -414,7 +415,7 @@ def _get_data_order(

def _update_order(
self,
df_series: DataFrame,
df_series: Series,
prev_dependent_properties: dict = None,
subset_properties: dict = None,
) -> None:
Expand All @@ -427,7 +428,7 @@ def _update_order(
order information.
:param df_series: Data to be profiled
:type df_series: pandas.DataFrame
:type df_series: polars.Series
:param prev_dependent_properties: Contains all the previous properties
that the calculations depend on.
:type prev_dependent_properties: dict
Expand Down Expand Up @@ -466,7 +467,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
Update col profile properties with clean dataset and its known null parameters.
:param df_series_clean: df series with nulls removed
:type df_series_clean: pandas.core.series.Series
:type df_series_clean: polars.Series
:param profile: ordered profile
:type profile: dict
:return: None
Expand All @@ -478,10 +479,13 @@ def update(self, df_series: Series) -> OrderColumn:
Update the column profile.
:param df_series: df series
:type df_series: pandas.core.series.Series
:type df_series: polars.Series
:return: updated OrderColumn
:rtype: OrderColumn
"""
# TODO remove once profiler builder is updated
if type(df_series) == pd.Series:
df_series = pl.from_pandas(df_series) # type: ignore
if len(df_series) == 0:
return self

Expand Down
Loading

0 comments on commit 5110544

Please sign in to comment.