NerDL Optimizations python side

DevinTDHa · DevinTDHa · commit 631b35013cc0 · 2025-11-24T12:47:16.000+01:00
diff --git a/python/sparknlp/annotator/ner/ner_dl.py b/python/sparknlp/annotator/ner/ner_dl.py
@@ -238,6 +238,14 @@ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams):
                             "Whether to check F1 Micro-average or F1 Macro-average as a final metric for the best model.",
                             TypeConverters.toString)
 
+    prefetchBatches = Param(Params._dummy(), "prefetchBatches",
+                            "Number of batches to prefetch while training using memory optimizer. Has no effect if memory optimizer is disabled.",
+                            TypeConverters.toInt)  # __init__ passes prefetchBatches=0, presumably as the default
+
+    optimizePartitioning = Param(Params._dummy(), "optimizePartitioning",
+                                 "Whether to repartition the dataset before training for optimal performance. Has no effect if memory optimizer is disabled.",
+                                 TypeConverters.toBoolean)  # __init__ passes optimizePartitioning=True, presumably as the default
+
     def setConfigProtoBytes(self, b):
         """Sets configProto from tensorflow, serialized into byte array.
 
@@ -377,6 +385,28 @@ def setBestModelMetric(self, value):
         """
         return self._set(bestModelMetric=value)
 
+    def setPrefetchBatches(self, value):
+        """Sets the number of batches to prefetch while training with the memory optimizer.
+        Has no effect if the memory optimizer is disabled.
+
+        Parameters
+        ----------
+        value : int
+            Number of batches to prefetch; stored in the ``prefetchBatches`` param
+        """
+        return self._set(prefetchBatches=value)
+
+    def setOptimizePartitioning(self, value):
+        """Sets whether to repartition the dataset before training for optimal performance.
+        Has no effect if the memory optimizer is disabled.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to optimize partitioning; stored in the ``optimizePartitioning`` param
+        """
+        return self._set(optimizePartitioning=value)
+
     def _create_model(self, java_model):
         return NerDLModel(java_model=java_model)
 
@@ -400,7 +430,9 @@ def __init__(self):
             enableOutputLogs=False,
             enableMemoryOptimizer=False,
             useBestModel=False,
-            bestModelMetric="f1_micro"
+            bestModelMetric="f1_micro",
+            prefetchBatches=0,
+            optimizePartitioning=True
         )
 
 
diff --git a/python/test/annotator/ner/ner_dl_approach_test.py b/python/test/annotator/ner/ner_dl_approach_test.py
@@ -0,0 +1,59 @@
+#  Copyright 2017-2025 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from test.util import SparkSessionForTest
+
+
+@pytest.mark.fast
+class NerDLApproachTestSpec(unittest.TestCase):  # setter round-trip checks for NerDLApproach params
+    def setUp(self):
+        self.spark = SparkSessionForTest.spark  # not read again here; presumably ensures the session is up - confirm
+
+    def test_setters(self):  # each value set below must come back unchanged from extractParamMap()
+        ner_approach = (
+            NerDLApproach()
+            .setLr(0.01)
+            .setPo(0.005)
+            .setBatchSize(16)
+            .setDropout(0.01)
+            .setGraphFolder("graph_folder")
+            .setConfigProtoBytes([])
+            .setUseContrib(False)
+            .setEnableMemoryOptimizer(True)
+            .setIncludeConfidence(True)
+            .setIncludeAllConfidenceScores(True)
+            .setUseBestModel(True)
+            .setPrefetchBatches(20)
+            .setOptimizePartitioning(False)  # False, since __init__ supplies True and a no-op setter would go unnoticed
+        )
+
+        # The param map also carries defaults, so a setter is only proven by a value that differs from its default
+        param_map = ner_approach.extractParamMap()
+        self.assertEqual(param_map[ner_approach.lr], 0.01)
+        self.assertEqual(param_map[ner_approach.po], 0.005)
+        self.assertEqual(param_map[ner_approach.batchSize], 16)
+        self.assertEqual(param_map[ner_approach.dropout], 0.01)
+        self.assertEqual(param_map[ner_approach.graphFolder], "graph_folder")
+        self.assertEqual(param_map[ner_approach.configProtoBytes], [])
+        self.assertEqual(param_map[ner_approach.useContrib], False)
+        self.assertEqual(param_map[ner_approach.enableMemoryOptimizer], True)
+        self.assertEqual(param_map[ner_approach.includeConfidence], True)
+        self.assertEqual(param_map[ner_approach.includeAllConfidenceScores], True)
+        self.assertEqual(param_map[ner_approach.useBestModel], True)
+        self.assertEqual(param_map[ner_approach.prefetchBatches], 20)
+        self.assertEqual(param_map[ner_approach.optimizePartitioning], False)