Skip to content

Commit

Permalink
Profiler utils update (#1092)
Browse files (browse the repository at this point in the history)
* update profiler utils

* finish updates
  • Loading branch information
atl1502 committed Apr 15, 2024
1 parent 8e294f7 commit 61b8737
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 3 deletions.
8 changes: 8 additions & 0 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -1924,6 +1924,10 @@ def _get_skewness(
):
return

if self._greater_than_64_bit and type(df_series) is pd.Series:
df_series = df_series.to_numpy(dtype=float)
else:
df_series = pl.from_pandas(df_series, nan_to_null=False)
batch_biased_skewness = profiler_utils.biased_skew(df_series)
subset_properties["biased_skewness"] = batch_biased_skewness
batch_count = subset_properties["match_count"]
Expand Down Expand Up @@ -1968,6 +1972,10 @@ def _get_kurtosis(
):
return

if self._greater_than_64_bit and type(df_series) is pd.Series:
df_series = df_series.to_numpy(dtype=float)
else:
df_series = pl.from_pandas(df_series, nan_to_null=False)
batch_biased_kurtosis = profiler_utils.biased_kurt(df_series)
subset_properties["biased_kurtosis"] = batch_biased_kurtosis
batch_count = subset_properties["match_count"]
Expand Down
10 changes: 7 additions & 3 deletions dataprofiler/profilers/profiler_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@
)

import numpy as np
import polars as pl
import psutil
import scipy
- from pandas import DataFrame, Series
+ from pandas import DataFrame
+ from polars import Series

from ..labelers.data_labelers import DataLabeler

Expand Down Expand Up @@ -320,7 +322,7 @@ def add_nested_dictionaries(first_dict: dict, second_dict: dict) -> dict:
return merged_dict


- def biased_skew(df_series: Series) -> np.float64:
+ def biased_skew(df_series: Series | np.ndarray) -> np.float64:
"""
Calculate the biased estimator for skewness of the given data.
Expand Down Expand Up @@ -358,7 +360,7 @@ def biased_skew(df_series: Series) -> np.float64:
return skew


- def biased_kurt(df_series: Series) -> np.float64:
+ def biased_kurt(df_series: Series | np.ndarray) -> np.float64:
"""
Calculate the biased estimator for kurtosis of the given data.
Expand Down Expand Up @@ -675,6 +677,8 @@ def get_memory_size(data: list | np.ndarray | DataFrame, unit: str = "M") -> flo
:type unit: string
:return: memory size of the input data
"""
if type(data) is DataFrame:
data = pl.from_pandas(data)
unit_map: dict = collections.defaultdict(B=0, K=1, M=2, G=3)
if unit not in unit_map:
raise ValueError(
Expand Down

0 comments on commit 61b8737

Please sign in to comment.