Clean up internal column logic in _run_classifier_helper function (NVIDIA#457)

sarahyurick · web-flow · commit 694970a31adf · 2025-01-03T14:58:52.000-08:00
* first commit

Signed-off-by: Sarah Yurick &lt;sarahyurick@gmail.com&gt;

* working code and pytest

Signed-off-by: Sarah Yurick &lt;sarahyurick@gmail.com&gt;

---------

Signed-off-by: Sarah Yurick &lt;sarahyurick@gmail.com&gt;
diff --git a/nemo_curator/classifiers/base.py b/nemo_curator/classifiers/base.py
@@ -121,44 +121,29 @@ def _run_classifier_helper(
     prob_col: str = None,
 ) -> "dask_cudf.DataFrame":
 
-    keep_prob = prob_col is not None
-    prob_internal_col = "_prob"
-    # TODO: Make crossfit handle this cleanly
-    pred_internal_col = "labels"
-    df["sliced_text"] = df[text_field].str.slice(0, max_chars)
+    if prob_col:
+        df[prob_col] = 0
+    else:
+        prob_col = "_prob"
+
     columns_to_keep_list = df.columns.to_list()
-    columns_to_keep_list.remove("sliced_text")
 
     classifier_pipe = op.Sequential(
-        op.Tokenizer(model, cols=["sliced_text"], tokenizer_type="default"),
+        op.Tokenizer(
+            model, cols=[text_field], tokenizer_type="default", max_chars=max_chars
+        ),
         op.Predictor(
             model,
             sorted_data_loader=True,
             batch_size=batch_size,
-            pred_output_col=prob_internal_col,
+            pred_output_col=prob_col,
         ),
+        op.Labeler(labels, cols=[prob_col], suffix=label_col),
         repartition=df.npartitions,
         keep_cols=columns_to_keep_list,
     )
     df = classifier_pipe(df)
 
-    # TODO: Make crossfit handle this cleanly
-    # to prevent the labeler from dropping the prob_internal_col
-    # and combine it into a single step
-    labeling_pipe = op.Sequential(
-        op.Labeler(labels, cols=[prob_internal_col]),
-        keep_cols=columns_to_keep_list + [prob_internal_col],
-    )
-    df = labeling_pipe(df)
-
-    if keep_prob:
-        df = df.rename(
-            columns={prob_internal_col: prob_col, pred_internal_col: label_col},
-        )
-    else:
-        df = df.rename(columns={pred_internal_col: label_col})
-        df = df.drop(columns=[prob_internal_col])
-
     return df
 
 
diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-
 import pytest
 from distributed import Client
 
@@ -48,24 +46,35 @@ def domain_dataset():
 
 
 @pytest.mark.gpu
-def test_domain_classifier(gpu_client, domain_dataset):
+@pytest.mark.parametrize("keep_prob", [True, False])
+def test_domain_classifier(gpu_client, domain_dataset, keep_prob):
     from nemo_curator.classifiers import DomainClassifier
 
-    classifier = DomainClassifier()
-    result_dataset = classifier(dataset=domain_dataset)
-    result_pred = result_dataset.df.compute()["domain_pred"]
+    if keep_prob:
+        prob_column = "domain_prob"
+    else:
+        prob_column = None
 
-    expected_pred = cudf.Series(
-        [
-            "Computers_and_Electronics",
-            "Finance",
-            "Health",
-            "Jobs_and_Education",
-            "Travel_and_Transportation",
-        ]
-    )
+    classifier = DomainClassifier(prob_column=prob_column)
+    result_dataset = classifier(dataset=domain_dataset)
 
-    assert result_pred.equals(expected_pred)
+    if keep_prob:
+        result_df = result_dataset.df.compute()
+        assert "domain_prob" in result_df.columns
+    else:
+        result_pred = result_dataset.df.compute()["domain_pred"]
+
+        expected_pred = cudf.Series(
+            [
+                "Computers_and_Electronics",
+                "Finance",
+                "Health",
+                "Jobs_and_Education",
+                "Travel_and_Transportation",
+            ]
+        )
+
+        assert result_pred.equals(expected_pred)
 
 
 @pytest.mark.gpu