From 78d458f2bb1ea315b62cc743a1ac0c2cbf54118d Mon Sep 17 00:00:00 2001 From: Philipp Wundrack Date: Mon, 14 Oct 2024 12:25:06 +0200 Subject: [PATCH] replace threshold with "how" option to make NA filtering more intuitive --- .../backend/pandas_preprocessing.py | 14 +++++++++++--- .../pandas_preprocessing/routes.py | 3 ++- .../pandas_preprocessing/schemas.py | 13 +++++++------ .../templates/pd_preprocessing_step_2.html | 6 +++--- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/backend/pandas_preprocessing.py b/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/backend/pandas_preprocessing.py index 3ed7cc411..603c66328 100644 --- a/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/backend/pandas_preprocessing.py +++ b/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/backend/pandas_preprocessing.py @@ -63,6 +63,14 @@ def get(self): return self.value +class HowEnum(Enum): + any = "any" + all = "all" + + def get(self): + return self.value + + class PositionEnum(Enum): front = "front" end = "end" @@ -105,7 +113,7 @@ def process_subset(subset: Optional[str], is_int: bool = False) -> Optional[List def drop_missing_value( df: DataFrame, axis: int = 0, - threshold: int = no_default, + how: str = "any", subset: str = None, **kwargs, ) -> DataFrame: @@ -115,13 +123,13 @@ def drop_missing_value( returns the new dataframe. :param df: DataFrame :param axis: int - :param threshold: int + :param how: HowEnum :param subset: str containing the columns or rows separated by commas :return: DataFrame """ subset = process_subset(subset, is_int=(axis == 1)) - df.dropna(axis=axis, thresh=threshold, subset=subset, inplace=True) + df.dropna(axis=axis, how=how, subset=subset, inplace=True) return df diff --git a/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/routes.py b/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/routes.py index 33b943c3b..5b84ddeaf 100644 --- a/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/routes.py +++ b/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/routes.py @@ -41,6 +41,7 @@ ) from . import PDPreprocessing, PDPreprocessing_BLP +from .backend.pandas_preprocessing import HowEnum from .schemas import ( FirstInputParametersSchema, SecondInputParametersSchema, @@ -244,7 +245,7 @@ def render(self, data: Mapping, db_id: int, step_id: int, errors: dict): fields = schema.fields # define default values default_values = { - fields["threshold"].data_key: 0, + fields["how"].data_key: HowEnum.any, } # overwrite default values with other values if possible diff --git a/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/schemas.py b/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/schemas.py index f9cb8cb5f..e30f44a87 100644 --- a/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/schemas.py +++ b/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/schemas.py @@ -28,6 +28,7 @@ KeepEnum, PositionEnum, CaseEnum, + HowEnum, ) from celery.utils.log import get_task_logger @@ -69,7 +70,7 @@ def make_input_params(self, data, **kwargs) -> FirstInputParameters: class SecondInputParameters: preprocessing_enum: PreprocessingEnum axis: AxisEnum - threshold: int + how: HowEnum subset: str fill_value: str keep: KeepEnum @@ -119,15 +120,15 @@ class SecondInputParametersSchema(FrontendFormBaseSchema): "input_type": "select", }, ) - threshold = ma.fields.Integer( + how = EnumField( + HowEnum, required=True, allow_none=False, metadata={ - "label": "Threshold", - "description": "Requires that many non-NA values. Cannot be combined with how. If left empty, then all values may not be NA.", - "input_type": "number", + "label": "How", + "description": "Select when a row / column is dropped. Any: if any NA is present, it will be dropped. All: if all values are NA, it will be dropped.", + "input_type": "select", }, - validate=validate.Range(min=0, min_inclusive=True), ) subset = ma.fields.String( required=False, diff --git a/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/templates/pd_preprocessing_step_2.html b/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/templates/pd_preprocessing_step_2.html index 1aa871ce3..17a9b7a6c 100644 --- a/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/templates/pd_preprocessing_step_2.html +++ b/stable_plugins/classical_ml/data_preparation/pandas_preprocessing/templates/pd_preprocessing_step_2.html @@ -87,7 +87,7 @@ const column_val = document.getElementById("column"); const axis_dis = axis_val.parentNode.parentNode; - const threshold_dis = document.getElementById("threshold").parentNode.parentNode; + const how_dis = document.getElementById("how").parentNode.parentNode; const subset_dis = subset_val.parentNode.parentNode; const fill_value_dis = document.getElementById("fill_value").parentNode.parentNode; const keep_dis = document.getElementById("keep").parentNode.parentNode; @@ -207,7 +207,7 @@ function preprocessing_enum_change() { axis_dis.style.display = "none"; - threshold_dis.style.display = "none"; + how_dis.style.display = "none"; subset_dis.style.display = "none"; fill_value_dis.style.display = "none"; keep_dis.style.display = "none"; @@ -225,7 +225,7 @@ if (preprocessing_enum_value.value === "drop_na") { axis_dis.style.display = "block"; - threshold_dis.style.display = "block"; + how_dis.style.display = "block"; subset_dis.style.display = "block"; change_parameter(subset_dis, null, "Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include."); axis_change();