Issue 494: add file io for normalize script #549

Merged 3 commits on Oct 10, 2024
37 changes: 27 additions & 10 deletions silnlp/common/normalize_extracts.py
@@ -32,15 +32,17 @@
The optional `--overwrite` flag will bypass this.

By default the script uses the logging configuration inherited from the parent packages (which should log at INFO level).
-You can change the logging level with the optional `--log_level LOG_LEVEL` which accepts values like:
+You can change the logging level with the optional `--log-level LOG_LEVEL` which accepts values like:
"DEBUG", "INFO", "WARNING/WARN", "ERROR" and "CRITICAL".
"""

import argparse
import logging
+import os

from dataclasses import dataclass

+from glob import glob
from pathlib import Path
from typing import List, Optional

@@ -63,17 +65,28 @@ def get_files_to_normalize(input_dir: Path, filter: Optional[str]) -> List[Path]
    that aren't normalized.
    If the filter is defined, then further filtering of those candidates is performed.
    """
-    # TODO
-    return []
+    if filter is None:
+        logger.debug(f"Searching files in input dir: '{input_dir}'")
+        matching_filenames = [os.path.join(input_dir, f) for f in os.listdir(input_dir)]
+    else:
+        logger.debug(f"Searching files in input dir: '{input_dir}' that satisfy glob '{filter}'")
+        matching_filenames = glob(os.path.join(input_dir, filter), recursive=False)
+
+    matching_paths: List[Path] = [Path(f) for f in matching_filenames]
+    return [path for path in matching_paths if path.is_file() and path.suffix == ".txt" and not str(path).endswith("norm.txt")]


def normalized_path(output_dir: Path, input_path: Path) -> Path:
    """
    Uses the input path to generate corresponding output path with "norm" in the name.
    e.g. extract.all.txt -> extract.all.norm.txt
    """
-    # TODO
-    return Path("TODO")
+    input_filename = input_path.parts[-1]
+    output_filename_parts = input_filename.split(".")[0:-1]
+    output_filename_parts.append("norm")
+    output_filename_parts.append("txt")
+    output_filename = ".".join(output_filename_parts)
+    return output_dir / output_filename


def normalize(extract_sentence: str) -> str:
@@ -85,13 +98,15 @@ def normalize(extract_sentence: str) -> str:


def load_extract_file(path: Path) -> List[str]:
-    # TODO
-    return []
+    with open(path, "r", encoding="UTF-8") as file:
+        return [line.rstrip() for line in file]


-def write_extract_file(path: Path, lines: List[str]) -> None:
-    # TODO
-    return
+def write_extract_file(path: Path, sentences: List[str]) -> None:
+    logger.debug(f"Writing {len(sentences)} sentences to file: {path}")
+    with open(path, "w", encoding="utf-8") as f:
+        for sentence in sentences:
+            f.write(f"{sentence}\n")


def run(cli_input: CliInput) -> None:
@@ -123,8 +138,10 @@ def run(cli_input: CliInput) -> None:
f"Outpath '{output_path}' already exists. Skipping input {input_path}. "
+ "You can use the --overwrite flag to write over existing files."
)
continue

input_lines: List[str] = load_extract_file(input_path)
logger.debug(f"Found {len(input_lines)} lines in file")
normalized_lines: List[str] = [normalize(extract_sentence) for extract_sentence in input_lines]
write_extract_file(output_path, normalized_lines)
logger.debug(f"Finished processing {input_path}")
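For illustration only (not part of the diff), here is a minimal sketch of how the new normalized_path helper maps an input extract file to its output name, following the docstring example above; the "in" and "out" directory names are made up for this sketch:

# Hypothetical usage sketch, assuming silnlp is installed and importable.
from pathlib import Path
from silnlp.common.normalize_extracts import normalized_path

# "in" and "out" are placeholder directories for the example.
result = normalized_path(Path("out"), Path("in/extract.all.txt"))
print(result)  # out/extract.all.norm.txt

The helper drops the final ".txt" suffix, appends "norm", re-adds "txt", and joins the result onto the output directory, which matches the "extract.all.txt -> extract.all.norm.txt" example in the PR's docstring.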