Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 473: change how output directory is generated for extract_xri script #507

Merged
merged 4 commits into from
Sep 19, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 33 additions & 15 deletions silnlp/common/extract_xri.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,27 @@
input source target dataset
file iso code iso code descriptor

Each execution puts the extract files into a unique directory in `out` with name generated from script inputs and the current time:
The output directory can be explicitly set using the `-output` arg, e.g.

$ python -m silnlp.common.extract_xri data.tsv swa ngq XRI-2024-08-14 -output /tmp/test

Note that the script will overwrite any existing extract files in that directory.

If the output directory isn't set, a unique directory is created within the SIL_NLP_ENV.mt_corpora_dir (see common/environment.py).
The unique directory name is constructed from the input arguments and the current date and time,
e.g. `swa-ngq-XRI-2024-08-14-20240822-180428` in the example above.

Files of the form `*.all.txt` and `*.(train/val/test).txt` are created in the output directory for source and target languages.
The prefix of each filename is built from the CLI inputs. For the example above these files would be created:

out
└── swa-ngq-XRI-2024-08-14-20240822-180428
├── swa-XRI-2024-08-14.all.txt ┓ complete extract files (no filtering)
├── ngq-XRI-2024-08-14.all.txt ┛
├── swa-XRI-2024-08-14.train.txt ┓ extract files filtered to training data
├── ngq-XRI-2024-08-14.train.txt ┛
├── swa-XRI-2024-08-14.dev.txt ┓ extract files filtered to dev/validation data
├── ngq-XRI-2024-08-14.dev.txt ┛
├── swa-XRI-2024-08-14.val.txt ┓ extract files filtered to dev/validation data
├── ngq-XRI-2024-08-14.val.txt ┛
├── swa-XRI-2024-08-14.test.txt ┓ extract files filtered to test data
└── ngq-XRI-2024-08-14.test.txt ┛
(Note that a subsequent PR is going to rework the output location to follow SIL team conventions)

Run with --help for more details.

Expand All @@ -35,7 +43,7 @@
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import List
from typing import List, Optional


class Split(Enum):
Expand All @@ -58,6 +66,7 @@ class CliInput:
source_iso: str
target_iso: str
dataset_descriptor: str
output: Optional[str]


def load_sentence_pairs(input_file_path: str) -> List[SentencePair]:
Expand Down Expand Up @@ -94,21 +103,26 @@ def get_column_index(column_name: str) -> int:
]


def write_output_file(filepath: Path, sentences: List[str]) -> None:
    """Write each sentence to `filepath` on its own line, encoded as UTF-8.

    Any existing file at `filepath` is overwritten. An empty `sentences`
    list produces an empty file.

    Args:
        filepath: Location of the file to write (a `Path` or path string).
        sentences: The lines to write, without trailing newlines.
    """
    with open(filepath, "w", encoding="utf-8") as f:
        # The file is opened in text mode, which already translates "\n" into
        # the platform line separator. Writing os.linesep here would be
        # translated a second time and produce "\r\r\n" on Windows.
        f.writelines(f"{sentence}\n" for sentence in sentences)


def create_extract_files(cli_input: CliInput, sentence_pairs: List[SentencePair]) -> None:
unique_output_dir = (
f"{cli_input.source_iso}-{cli_input.target_iso}-{cli_input.dataset_descriptor}-{time.strftime('%Y%m%d-%H%M%S')}"
)
Path(os.path.join("out", unique_output_dir)).mkdir(parents=True, exist_ok=True)
if cli_input.output is None:
unique_dir = f"{cli_input.source_iso}-{cli_input.target_iso}-{cli_input.dataset_descriptor}-{time.strftime('%Y%m%d-%H%M%S')}"
from ..common.environment import SIL_NLP_ENV
output_dir = SIL_NLP_ENV.mt_corpora_dir / unique_dir
else:
output_dir = Path(cli_input.output)

print(f"Outputting to directory: {output_dir}")
output_dir.mkdir(parents=True, exist_ok=True)

def create_source_target_files(sub_sentence_pairs: List[SentencePair], suffix: str) -> None:
def build_output_path(iso: str) -> str:
return os.path.join("out", unique_output_dir, f"{iso}-{cli_input.dataset_descriptor}.{suffix}.txt")
def build_output_path(iso: str) -> Path:
return output_dir / f"{iso}-{cli_input.dataset_descriptor}.{suffix}.txt"

source_filename = build_output_path(iso=cli_input.source_iso)
source_sentences = [sentence.source for sentence in sub_sentence_pairs]
Expand Down Expand Up @@ -147,13 +161,17 @@ def main() -> None:
parser.add_argument("source_iso", help="The ISO 693-3 code for the source/LWC language", type=str)
parser.add_argument("target_iso", help="The ISO 693-3 code for the target/vernacular language", type=str)
parser.add_argument("dataset", help="A descriptor of the dataset to be used in the output filename", type=str)
parser.add_argument(
"-output", help="Optional path to the output directory where extract files are generated", type=str
)
args = parser.parse_args()

cli_input = CliInput(
input_file_path=args.input_file,
source_iso=args.source_iso,
target_iso=args.target_iso,
dataset_descriptor=args.dataset,
output=args.output,
)
run(cli_input)

Expand Down