sillsdev · rminsil · Sep 19, 2024 · Sep 5, 2024 · Sep 12, 2024 · Sep 12, 2024
diff --git a/silnlp/common/extract_xri.py b/silnlp/common/extract_xri.py
@@ -8,19 +8,27 @@
                                              input   source    target     dataset
                                              file    iso code  iso code   descriptor
 
-Each execution puts the extract files into a unique directory in `out` with name generated from script inputs and the current time:
+The output directory can be explicitly set using the `-output` arg, e.g.
+
+    $ python -m silnlp.common.extract_xri data.tsv swa ngq XRI-2024-08-14 -output /tmp/test
+
+Note that the script will write over any existing extract files in that directory.
+
+If the output directory isn't set, a unique directory is created within the SIL_NLP_ENV.mt_corpora_dir (see common/environment.py).
+The unique directory is constructed using the input arguments and current date time,
+e.g. `swa-ngq-XRI-2024-08-14-20240822-180428` in the example above.
+
+Files of the form `*.all.txt` and `*.(train/dev/test).txt` are created in the output directory for source and target languages.
+The prefix of the filename is built from the cli inputs. For the example above these files would be created:
 
-    out
-    └── swa-ngq-XRI-2024-08-14-20240822-180428
         ├── swa-XRI-2024-08-14.all.txt     ┓ complete extract files (no filtering)
         ├── ngq-XRI-2024-08-14.all.txt     ┛
         ├── swa-XRI-2024-08-14.train.txt   ┓ extract files filtered to training data
         ├── ngq-XRI-2024-08-14.train.txt   ┛
-        ├── swa-XRI-2024-08-14.dev.txt     ┓ extract files filtered to dev/validation data
-        ├── ngq-XRI-2024-08-14.dev.txt     ┛
+        ├── swa-XRI-2024-08-14.val.txt     ┓ extract files filtered to dev/validation data
+        ├── ngq-XRI-2024-08-14.val.txt     ┛
         ├── swa-XRI-2024-08-14.test.txt    ┓ extract files filtered to test data
         └── ngq-XRI-2024-08-14.test.txt    ┛
-    (Note that a subsequent PR is going to rework the output location to follow SIL team conventions)
 
 Run with --help for more details.
 
@@ -35,7 +43,7 @@
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import List
+from typing import List, Optional
 
 
 class Split(Enum):
@@ -58,6 +66,7 @@ class CliInput:
     source_iso: str
     target_iso: str
     dataset_descriptor: str
+    output: Optional[str]
 
 
 def load_sentence_pairs(input_file_path: str) -> List[SentencePair]:
@@ -94,21 +103,26 @@ def get_column_index(column_name: str) -> int:
     ]
 
 
-def write_output_file(filename: str, sentences: List[str]) -> None:
-    with open(filename, "w", encoding="utf-8") as f:
+def write_output_file(filepath: Path, sentences: List[str]) -> None:
+    with open(filepath, "w", encoding="utf-8") as f:
         for sentence in sentences:
             f.write(f"{sentence}{os.linesep}")
 
 
 def create_extract_files(cli_input: CliInput, sentence_pairs: List[SentencePair]) -> None:
-    unique_output_dir = (
-        f"{cli_input.source_iso}-{cli_input.target_iso}-{cli_input.dataset_descriptor}-{time.strftime('%Y%m%d-%H%M%S')}"
-    )
-    Path(os.path.join("out", unique_output_dir)).mkdir(parents=True, exist_ok=True)
+    if cli_input.output is None:
+        unique_dir = f"{cli_input.source_iso}-{cli_input.target_iso}-{cli_input.dataset_descriptor}-{time.strftime('%Y%m%d-%H%M%S')}"
+        from ..common.environment import SIL_NLP_ENV
+        output_dir = SIL_NLP_ENV.mt_corpora_dir / unique_dir
+    else:
+        output_dir = Path(cli_input.output)
+
+    print(f"Outputting to directory: {output_dir}")
+    output_dir.mkdir(parents=True, exist_ok=True)
 
     def create_source_target_files(sub_sentence_pairs: List[SentencePair], suffix: str) -> None:
-        def build_output_path(iso: str) -> str:
-            return os.path.join("out", unique_output_dir, f"{iso}-{cli_input.dataset_descriptor}.{suffix}.txt")
+        def build_output_path(iso: str) -> Path:
+            return output_dir / f"{iso}-{cli_input.dataset_descriptor}.{suffix}.txt"
 
         source_filename = build_output_path(iso=cli_input.source_iso)
         source_sentences = [sentence.source for sentence in sub_sentence_pairs]
@@ -147,13 +161,17 @@ def main() -> None:
     parser.add_argument("source_iso", help="The ISO 693-3 code for the source/LWC language", type=str)
     parser.add_argument("target_iso", help="The ISO 693-3 code for the target/vernacular language", type=str)
     parser.add_argument("dataset", help="A descriptor of the dataset to be used in the output filename", type=str)
+    parser.add_argument(
+        "-output", help="Optional path to the output directory where extract files are generated", type=str
+    )
     args = parser.parse_args()
 
     cli_input = CliInput(
         input_file_path=args.input_file,
         source_iso=args.source_iso,
         target_iso=args.target_iso,
         dataset_descriptor=args.dataset,
+        output=args.output,
     )
     run(cli_input)