Issue 494: add file io for normalize script #549

Merged 3 commits on Oct 10, 2024
37 changes: 27 additions & 10 deletions silnlp/common/normalize_extracts.py
@@ -32,15 +32,17 @@
The optional `--overwrite` flag will bypass this.

By default the script uses the logging configuration inherited from the parent packages (which should log at INFO level).
-You can change the logging level with the optional `--log_level LOG_LEVEL` which accepts values like:
+You can change the logging level with the optional `--log-level LOG_LEVEL` which accepts values like:
"DEBUG", "INFO", "WARNING/WARN", "ERROR" and "CRITICAL".
"""

import argparse
import logging
+import os

from dataclasses import dataclass

+from glob import glob
from pathlib import Path
from typing import List, Optional

@@ -63,17 +65,28 @@ def get_files_to_normalize(input_dir: Path, filter: Optional[str]) -> List[Path]
    that aren't normalized.
    If the filter is defined, then further filtering of those candidates is performed.
    """
-    # TODO
-    return []
+    if filter is None:
+        logger.debug(f"Searching files in input dir: '{input_dir}'")
+        matching_filenames = [os.path.join(input_dir, f) for f in os.listdir(input_dir)]
+    else:
+        logger.debug(f"Searching files in input dir: '{input_dir}' that satisfy glob '{filter}'")
+        matching_filenames = glob(os.path.join(input_dir, filter), recursive=False)
+
+    matching_paths: List[Path] = [Path(f) for f in matching_filenames]
+    return [path for path in matching_paths if path.is_file() and path.suffix == ".txt" and not str(path).endswith("norm.txt")]


def normalized_path(output_dir: Path, input_path: Path) -> Path:
    """
    Uses the input path to generate corresponding output path with "norm" in the name.
    e.g. extract.all.txt -> extract.all.norm.txt
    """
-    # TODO
-    return Path("TODO")
+    input_filename = input_path.parts[-1]
+    output_filename_parts = input_filename.split(".")[0:-1]
+    output_filename_parts.append("norm")
+    output_filename_parts.append("txt")
+    output_filename = ".".join(output_filename_parts)
+    return output_dir / output_filename


def normalize(extract_sentence: str) -> str:
@@ -85,13 +98,15 @@ def normalize(extract_sentence: str) -> str:


def load_extract_file(path: Path) -> List[str]:
-    # TODO
-    return []
+    with open(path, "r", encoding="UTF-8") as file:
+        return [line.rstrip() for line in file]


-def write_extract_file(path: Path, lines: List[str]) -> None:
-    # TODO
-    return
+def write_extract_file(path: Path, sentences: List[str]) -> None:
+    logger.debug(f"Writing {len(sentences)} sentences to file: {path}")
+    with open(path, "w", encoding="utf-8") as f:
+        for sentence in sentences:
+            f.write(f"{sentence}\n")


def run(cli_input: CliInput) -> None:
@@ -123,8 +138,10 @@ def run(cli_input: CliInput) -> None:
f"Outpath '{output_path}' already exists. Skipping input {input_path}. "
+ "You can use the --overwrite flag to write over existing files."
)
continue

input_lines: List[str] = load_extract_file(input_path)
logger.debug(f"Found {len(input_lines)} lines in file")
normalized_lines: List[str] = [normalize(extract_sentence) for extract_sentence in input_lines]
write_extract_file(output_path, normalized_lines)
logger.debug(f"Finished processing {input_path}")
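For illustration only (not part of the diff), here is a minimal sketch of how the new normalized_path helper maps an input extract file to its output name, following the docstring example above; the "in" and "out" directory names are made up for this sketch:

# Hypothetical usage sketch, assuming silnlp is installed and importable.
from pathlib import Path
from silnlp.common.normalize_extracts import normalized_path

# "in" and "out" are placeholder directories for the example.
result = normalized_path(Path("out"), Path("in/extract.all.txt"))
print(result)  # out/extract.all.norm.txt

The helper drops the final ".txt" suffix, appends "norm", re-adds "txt", and joins the result onto the output directory, which matches the "extract.all.txt -> extract.all.norm.txt" example in the PR's docstring.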