From 3aeb2945a134ba3943bec34ca52fb855a3ccb477 Mon Sep 17 00:00:00 2001 From: Rohan M Date: Sat, 5 Oct 2024 20:59:56 +1000 Subject: [PATCH 1/3] Issue 494: Implement --filter parameter which takes a glob --- silnlp/common/normalize_extracts.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/silnlp/common/normalize_extracts.py b/silnlp/common/normalize_extracts.py index 41f9cac4..d5f9b5cf 100644 --- a/silnlp/common/normalize_extracts.py +++ b/silnlp/common/normalize_extracts.py @@ -32,15 +32,17 @@ The optional `--overwrite` flag will bypass this. By default the script uses the logging configuration inherited from the parent packages (which should log at INFO level). -You can change the logging level with the optional `--log_level LOG_LEVEL` which accepts values like: +You can change the logging level with the optional `--log-level LOG_LEVEL` which accepts values like: "DEBUG", "INFO", "WARNING/WARN", "ERROR" and "CRITICAL". """ import argparse import logging +import os from dataclasses import dataclass +from glob import glob from pathlib import Path from typing import List, Optional @@ -63,8 +65,15 @@ def get_files_to_normalize(input_dir: Path, filter: Optional[str]) -> List[Path] that aren't normalized. If the filter is defined, then further filtering of those candidates is performed. """ - # TODO - return [] + if filter is None: + logger.debug(f"Searching files in input dir: '{input_dir}'") + matching_filenames = [os.path.join(input_dir, f) for f in os.listdir(input_dir)] + else: + logger.debug(f"Searching files in input dir: '{input_dir}' that satisfy glob '{filter}'") + matching_filenames = glob(os.path.join(input_dir, filter), recursive=False) + + matching_paths: List[Path] = [Path(f) for f in matching_filenames] + return [path for path in matching_paths if path.is_file() and path.suffix == ".txt" and not str(path).endswith("norm.txt")] def normalized_path(output_dir: Path, input_path: Path) -> Path: From 82ed36fe874b9dbe061d85a67d91dcd80787a2d7 Mon Sep 17 00:00:00 2001 From: Rohan M Date: Sat, 5 Oct 2024 22:03:07 +1000 Subject: [PATCH 2/3] Issue 494: Implement loading and saving of files --- silnlp/common/normalize_extracts.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/silnlp/common/normalize_extracts.py b/silnlp/common/normalize_extracts.py index d5f9b5cf..43abd420 100644 --- a/silnlp/common/normalize_extracts.py +++ b/silnlp/common/normalize_extracts.py @@ -81,8 +81,12 @@ def normalized_path(output_dir: Path, input_path: Path) -> Path: Uses the input path to generate corresponding output path with "norm" in the name. e.g. extract.all.txt -> extract.all.norm.txt """ - # TODO - return Path("TODO") + input_filename = input_path.parts[-1] + output_filename_parts = input_filename.split(".")[0:-1] + output_filename_parts.append("norm") + output_filename_parts.append("txt") + output_filename = ".".join(output_filename_parts) + return output_dir / output_filename def normalize(extract_sentence: str) -> str: @@ -94,13 +98,15 @@ def normalize(extract_sentence: str) -> str: def load_extract_file(path: Path) -> List[str]: - # TODO - return [] + with open(path, "r", encoding="UTF-8") as file: + return [line.rstrip() for line in file] -def write_extract_file(path: Path, lines: List[str]) -> None: - # TODO - return +def write_extract_file(path: Path, sentences: List[str]) -> None: + logger.debug(f"Writing {len(sentences)} sentences to file: {path}") + with open(path, "w", encoding="utf-8") as f: + for sentence in sentences: + f.write(f"{sentence}\n") def run(cli_input: CliInput) -> None: @@ -134,6 +140,7 @@ def run(cli_input: CliInput) -> None: ) input_lines: List[str] = load_extract_file(input_path) + logger.debug(f"Found {len(input_lines)} lines in file") normalized_lines: List[str] = [normalize(extract_sentence) for extract_sentence in input_lines] write_extract_file(output_path, normalized_lines) logger.debug(f"Finished processing {input_path}") From ee852d81349c842a5dce5e2ae153f8b3d884ee67 Mon Sep 17 00:00:00 2001 From: Rohan M Date: Sat, 5 Oct 2024 22:04:07 +1000 Subject: [PATCH 3/3] Issue 494: Fix bug where existing files were written over --- silnlp/common/normalize_extracts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/silnlp/common/normalize_extracts.py b/silnlp/common/normalize_extracts.py index 43abd420..1b41c921 100644 --- a/silnlp/common/normalize_extracts.py +++ b/silnlp/common/normalize_extracts.py @@ -138,6 +138,7 @@ def run(cli_input: CliInput) -> None: f"Outpath '{output_path}' already exists. Skipping input {input_path}. " + "You can use the --overwrite flag to write over existing files." ) + continue input_lines: List[str] = load_extract_file(input_path) logger.debug(f"Found {len(input_lines)} lines in file")