Skip to content

Commit

Permalink
Fix creating charlist for tokenizer
Browse files (browse the repository at this point in the history)
  • Loading branch information
TimKoornstra committed Sep 1, 2024
1 parent 8b6f3f4 commit e5d59e6
Showing 1 changed file with 1 addition and 6 deletions.
7 changes: 1 addition & 6 deletions src/data/manager.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -220,16 +220,14 @@ def _create_data(self,
# Log the faulty lines and flaw counts
if faulty_lines:
logging.warning("Faulty lines for %s:", partition_name)
# Sort the faulty lines by flaw
for line, flaw in faulty_lines.items():
logging.warning("%s: %s", flaw, line.strip())

logging.warning("Flaw counts for %s:", partition_name)
for flaw, count in flaw_counts.items():
logging.warning("%s: %d", flaw, count)

# Update the character set with any new characters found
characters.update(set("".join(labels)))

logging.info("Created data for %s with %d samples",
partition_name, len(partitions))

Expand Down Expand Up @@ -291,9 +289,6 @@ def _process_line(self,
ground_truth = normalize_text(ground_truth,
self.config['normalization_file'])

# Add characters to the set for tokenizer
characters.update(ground_truth)

# Check for unsupported characters in the ground truth
if not self._is_valid_ground_truth(ground_truth, partition_name,
characters):
Expand Down

0 comments on commit e5d59e6

Please sign in to comment.