Fix creating charlist for tokenizer

rvankoert · Sep 1, 2024 · e5d59e6
1 parent 8b6f3f4
commit e5d59e6
Showing 1 changed file with 1 addition and 6 deletions.
diff --git a/src/data/manager.py b/src/data/manager.py
@@ -220,16 +220,14 @@ def _create_data(self,
         # Log the faulty lines and flaw counts
         if faulty_lines:
             logging.warning("Faulty lines for %s:", partition_name)
+            # Sort the faulty lines by flaw
             for line, flaw in faulty_lines.items():
                 logging.warning("%s: %s", flaw, line.strip())
 
             logging.warning("Flaw counts for %s:", partition_name)
             for flaw, count in flaw_counts.items():
                 logging.warning("%s: %d", flaw, count)
 
-        # Update the character set with any new characters found
-        characters.update(set("".join(labels)))
-
         logging.info("Created data for %s with %d samples",
                      partition_name, len(partitions))
 
@@ -291,9 +289,6 @@ def _process_line(self,
             ground_truth = normalize_text(ground_truth,
                                           self.config['normalization_file'])
 
-        # Add characters to the set for tokenizer
-        characters.update(ground_truth)
-
         # Check for unsupported characters in the ground truth
         if not self._is_valid_ground_truth(ground_truth, partition_name,
                                            characters):