Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Numeric updates #1103

Merged
merged 11 commits into from
Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 36 additions & 30 deletions dataprofiler/profilers/float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
from __future__ import annotations

import copy
import re

import numpy as np
import pandas as pd
import polars as pl

from . import profiler_utils
from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler
Expand Down Expand Up @@ -241,7 +241,6 @@ def precision(self) -> dict[str, float | None]:
var = self._correct_bias_variance(
self._precision["sample_size"], self._precision["biased_var"]
)

std = np.sqrt(var)
margin_of_error = (
None
Expand Down Expand Up @@ -275,14 +274,14 @@ def data_type_ratio(self) -> float | None:

@classmethod
def _get_float_precision(
cls, df_series_clean: pd.Series, sample_ratio: float = None
cls, df_series_clean: pl.Series, sample_ratio: float = None
) -> dict | None:
"""
Determine the precision of the numeric value.

:param df_series_clean: df series with nulls removed, assumes all values
are floats as well
:type df_series_clean: pandas.core.series.Series
:type df_series_clean: polars.series.series.Series
:param sample_ratio: Ratio of samples used for float precision
:type sample_ratio: float (between 0 and 1)
:return: dict of precision statistics for the sampled values, or None
Expand All @@ -294,7 +293,7 @@ def _get_float_precision(

# Lead zeros: ^[+-.0\s]+ End zeros: \.?0+(\s|$)
# Scientific Notation: ([e].*) Any non-digits: \D
r = re.compile(r"^[+-.0\s]+|\.?0+(\s|$)|(?<=[e])(.*)|\D")
r = r"^[+-.0\s]+|\.?0+(\s|$)|([e].*)|\D"
taylorfturner marked this conversation as resolved.
Show resolved Hide resolved

# DEFAULT: Sample the dataset. If small use full dataset,
# OR 20k samples or 5% of the dataset, whichever is larger.
Expand All @@ -305,15 +304,17 @@ def _get_float_precision(

# length of sampled cells after all punctuation removed
len_per_float = (
df_series_clean.sample(sample_size).replace(to_replace=r, value="").map(len)
).astype(float)
df_series_clean.sample(sample_size)
.str.replace_all(pattern=r, value="")
.map_elements(len)
)

# Determine statistics precision
precision_sum = len_per_float.sum()
precision_sum = sum(len_per_float)
subset_precision = {
"min": np.float64(len_per_float.min()),
"max": np.float64(len_per_float.max()),
"biased_var": np.var(len_per_float),
"min": np.float64(min(len_per_float)),
"max": np.float64(max(len_per_float)),
"biased_var": np.var([len_per_float]),
"sum": np.float64(precision_sum),
"mean": np.float64(precision_sum / sample_size),
"sample_size": sample_size,
Expand All @@ -322,7 +323,7 @@ def _get_float_precision(
return subset_precision

@classmethod
def _is_each_row_float(cls, df_series: pd.Series) -> list[bool] | pd.Series[bool]:
def _is_each_row_float(cls, df_series: pl.Series) -> pl.Series:
"""
Determine if each value in a dataframe is a float.

Expand All @@ -332,18 +333,22 @@ def _is_each_row_float(cls, df_series: pd.Series) -> list[bool] | pd.Series[bool
For column [1.0, np.NaN, 1.0] returns [True, True, True]
For column [1.0, "a", "b"] returns [True, False, False]
:param df_series: series of values to evaluate
:type df_series: pandas.core.series.Series
:type df_series: polars.series.series.Series
:return: is_float_col
:rtype: Union[List[bool], pandas.Series[bool]]
:rtype: pl.Series
"""
if len(df_series) == 0:
return list()
return df_series.map(NumericStatsMixin.is_float).astype("bool")
return pl.Series()
if sum(df_series.is_null()) == len(df_series):
return df_series
df_series = df_series.map_elements(NumericStatsMixin.is_float)
df_series = df_series.cast(bool)
return df_series
taylorfturner marked this conversation as resolved.
Show resolved Hide resolved

@BaseColumnProfiler._timeit(name="precision")
def _update_precision(
self,
df_series: pd.DataFrame,
df_series: pl.Series,
prev_dependent_properties: dict,
subset_properties: dict,
) -> None:
Expand All @@ -357,7 +362,7 @@ def _update_precision(
subset before they are merged into the main data profile.
:type subset_properties: dict
:param df_series: Data to be profiled
:type df_series: pandas.DataFrame
:type df_series: polars.series.series.Series
:return: None
"""
sample_ratio = None
Expand Down Expand Up @@ -394,12 +399,12 @@ def _update_precision(
self._precision["sum"] / self._precision["sample_size"]
)

def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
"""
Update column profile properties with cleaned dataset and its known profile.

:param df_series_clean: df series with nulls removed
:type df_series_clean: pandas.core.series.Series
:type df_series_clean: polars.series.series.Series
:param profile: float profile dictionary
:type profile: dict
:return: None
Expand All @@ -410,7 +415,7 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:

def _update_numeric_stats(
self,
df_series: pd.DataFrame,
df_series: pl.Series,
prev_dependent_properties: dict,
subset_properties: dict,
) -> None:
Expand All @@ -425,38 +430,39 @@ def _update_numeric_stats(
subset before they are merged into the main data profile.
:type subset_properties: Dict
:param df_series: Data to be profiled
:type df_series: Pandas Dataframe
:type df_series: polars.series.series.Series
:return: None
"""
super()._update_helper(df_series, subset_properties)

def update(self, df_series: pd.Series) -> FloatColumn:
def update(self, df_series: pl.Series) -> FloatColumn:
"""
Update the column profile.

:param df_series: df series
:type df_series: pandas.core.series.Series
:type df_series: polars.series.series.Series
:return: updated FloatColumn
:rtype: FloatColumn
"""
# TODO remove once profiler builder is updated
if type(df_series) == pd.Series:
df_series = pl.from_pandas(df_series) # type: ignore
if len(df_series) == 0:
return self

is_each_row_float = self._is_each_row_float(df_series)
is_each_row_float = self._is_each_row_float(df_series).replace(None, False)
sample_size = len(is_each_row_float)
float_count = np.sum(is_each_row_float)
float_count = np.sum([is_each_row_float])
profile = dict(match_count=float_count, sample_size=sample_size)

BaseColumnProfiler._perform_property_calcs(
self,
self.__calculations,
df_series=df_series[is_each_row_float],
df_series=df_series.filter(is_each_row_float),
prev_dependent_properties={},
subset_properties=profile,
)

self._update_helper(
df_series_clean=df_series[is_each_row_float], profile=profile
df_series_clean=df_series.filter(is_each_row_float), profile=profile
)

return self
26 changes: 16 additions & 10 deletions dataprofiler/profilers/int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
import polars as pl

from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler
from .numerical_column_stats import NumericStatsMixin
Expand Down Expand Up @@ -113,7 +114,7 @@ def data_type_ratio(self) -> float | None:
return None

@classmethod
def _is_each_row_int(cls, df_series: pd.Series) -> list[bool]:
def _is_each_row_int(cls, df_series: pl.Series) -> list[bool]:
"""
Return true if given is numerical and int values.

Expand All @@ -124,7 +125,7 @@ def _is_each_row_int(cls, df_series: pd.Series) -> list[bool]:
For column [1.1 1.1 1.1] returns False

:param df_series: series of values to evaluate
:type df_series: pandas.core.series.Series
:type df_series: polars.series.series.Series
:return: is_int_col
:rtype: list
"""
Expand All @@ -134,12 +135,12 @@ def _is_each_row_int(cls, df_series: pd.Series) -> list[bool]:

return [NumericStatsMixin.is_int(x) for x in df_series]

def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
"""
Update col profile properties with clean dataset and its known null params.

:param df_series_clean: df series with nulls removed
:type df_series_clean: pandas.core.series.Series
:type df_series_clean: polars.series.series.Series
:param profile: int profile dictionary
:type profile: dict
:return: None
Expand All @@ -148,32 +149,37 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
NumericStatsMixin._update_helper(self, df_series_clean, profile)
self._update_column_base_properties(profile)

def update(self, df_series: pd.Series) -> IntColumn:
def update(self, df_series: pl.Series) -> IntColumn:
"""
Update the column profile.

:param df_series: df series
:type df_series: pandas.core.series.Series
:type df_series: polars.series.series.Series
:return: updated IntColumn
:rtype: IntColumn
"""
# TODO remove once profiler builder is updated
if type(df_series) == pd.Series:
df_series = pl.from_pandas(df_series) # type: ignore
self._greater_than_64_bit = df_series.dtype == pl.Object
if len(df_series) == 0:
return self

df_series = df_series.reset_index(drop=True)
is_each_row_int = self._is_each_row_int(df_series)
sample_size = len(is_each_row_int)
match_int_count = np.sum(is_each_row_int)
match_int_count = np.sum([is_each_row_int])
profile = dict(match_count=match_int_count, sample_size=sample_size)

BaseColumnProfiler._perform_property_calcs(
self,
self.__calculations,
df_series=df_series[is_each_row_int],
df_series=df_series.filter(is_each_row_int),
prev_dependent_properties={},
subset_properties=profile,
)

self._update_helper(df_series_clean=df_series[is_each_row_int], profile=profile)
self._update_helper(
df_series_clean=df_series.filter(is_each_row_int), profile=profile
)

return self
Loading