From e0de7620070da5d4ae108172ac33b5030fca2855 Mon Sep 17 00:00:00 2001
From: David Baines
Date: Wed, 18 Jun 2025 10:07:51 +0100
Subject: [PATCH 1/4] WIP: Stashing changes before find_by_iso branching

---
 debug.log                            |   2 +
 silnlp/common/bulk_extract_local.py  | 143 ++++++++++++++++
 silnlp/common/check_books.py         |   3 +-
 silnlp/common/combine_scores_save.py | 116 +++++++++++++
 silnlp/common/find_by_iso2.py        | 244 +++++++++++++++++++++++++++
 silnlp/common/usfm_utils.py          |  12 +-
 6 files changed, 513 insertions(+), 7 deletions(-)
 create mode 100644 debug.log
 create mode 100644 silnlp/common/bulk_extract_local.py
 create mode 100644 silnlp/common/combine_scores_save.py
 create mode 100644 silnlp/common/find_by_iso2.py

diff --git a/debug.log b/debug.log
new file mode 100644
index 00000000..93e4b7ba
--- /dev/null
+++ b/debug.log
@@ -0,0 +1,2 @@
+[0529/225757.623:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
+[0529/225806.839:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
diff --git a/silnlp/common/bulk_extract_local.py b/silnlp/common/bulk_extract_local.py
new file mode 100644
index 00000000..34270a28
--- /dev/null
+++ b/silnlp/common/bulk_extract_local.py
@@ -0,0 +1,143 @@
+import argparse
+import logging
+from pathlib import Path
+from typing import List
+import sys
+
+from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef, get_books
+
+from .corpus import count_lines
+from .paratext import check_versification, extract_project, extract_term_renderings
+from machine.corpora import FileParatextProjectSettingsParser
+from ..common.environment import SIL_NLP_ENV
+
+LOGGER = logging.getLogger(__package__ + ".bulk_extract_local")
+SETTINGS_FILENAME = "Settings.xml"
+
+
+def parse_settings(project):
+    settings_file_path = project / SETTINGS_FILENAME
+    if not settings_file_path.is_file():
+        LOGGER.warning(f"Warning: {SETTINGS_FILENAME} not found.")
+        return
+
+    try:
+        parser = FileParatextProjectSettingsParser(str(project))
+        project_settings = parser.parse()
+
+        # project_settings.name
+        # project_settings.full_name
+        # if project_settings.encoding:
+        #     self.setting_encoding = getattr(project_settings.encoding, 'name', str(project_settings.encoding))
+
+        # if project_settings.versification:
+        #     setting_versification = getattr(project_settings.versification, 'name', str(project_settings.versification))
+
+        # project_settings.file_name_prefix
+        # project_settings.file_name_form
+        # project_settings.file_name_suffix
+        # project_settings.biblical_terms_list_type
+        # project_settings.biblical_terms_project_name
+        # project_settings.biblical_terms_file_name
+        # project_settings.language_code
+
+    except Exception as e:
+        print(f"Error parsing {SETTINGS_FILENAME}: {e}")
+        return None
+
+    return project_settings
+
+def get_expected_verse_count(project: Path, include: List[str], exclude: List[str]) -> int:
+    include_books_set = get_books(include) if len(include) > 0 else None
+    exclude_books_set = get_books(exclude) if len(exclude) > 0 else None
+    project_settings = parse_settings(project)
+
+    if project_settings.versification:
+        setting_versification = getattr(project_settings.versification, 'name', str(project_settings.versification))
+        print(f"Found versification {setting_versification} in {SETTINGS_FILENAME} for {project}")
+
+    def filter_lines(verse_ref_str: str) -> bool:
+        if include_books_set is None and exclude_books_set is None:
+            return True
+
+        vref = VerseRef.from_string(verse_ref_str.strip(), setting_versification)
+        if exclude_books_set is not None and vref.book_num in exclude_books_set:
+            return False
+
+        if include_books_set is not None and vref.book_num in include_books_set:
+            return True
+
+        return include_books_set is None
+
+    return count_lines(SIL_NLP_ENV.assets_dir / "vref.txt", filter_lines)
+
+
+def has_settings_file(project_folder: Path) -> bool:
+    return (project_folder / SETTINGS_FILENAME).is_file() or (project_folder / SETTINGS_FILENAME.lower()).is_file()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Extracts text corpora from Paratext projects")
+    parser.add_argument("input", type=str, help="The input folder.")
+    parser.add_argument("output", type=str, help="The output corpus folder.")
+    parser.add_argument("--terms", type=str, required=True, help="The output terms folder.")
+    parser.add_argument(
+        "--include", metavar="books", nargs="+", default=[], help="The books to include; e.g., 'NT', 'OT', 'GEN'"
+    )
+    parser.add_argument(
+        "--exclude", metavar="books", nargs="+", default=[], help="The books to exclude; e.g., 'NT', 'OT', 'GEN'"
+    )
+    parser.add_argument("--markers", default=False, action="store_true", help="Include USFM markers")
+    parser.add_argument("--lemmas", default=False, action="store_true", help="Extract lemmas if available")
+    parser.add_argument("--project-vrefs", default=False, action="store_true", help="Extract project verse refs")
+
+    args = parser.parse_args()
+
+    input_path = Path(args.input)
+    output_path = Path(args.output)
+    terms_path = Path(args.terms)
+
+    if not input_path.is_dir():
+        print(f"Error: Projects folder not found: {args.input}")
+        sys.exit(1)
+
+    if not output_path.is_dir():
+        print(f"Error: Output folder not found: {args.output}")
+        sys.exit(1)
+
+    if not terms_path.is_dir():
+        print(f"Error: Output terms folder not found: {args.terms}")
+        sys.exit(1)
+
+    # Which folders have a Settings.xml file we can find?
+    projects = [folder for folder in input_path.glob("*") if folder.is_dir() and has_settings_file(folder)]
+
+    # Process the projects that have data and tell the user.
+    if len(projects) > 0:
+        for project in projects:
+            LOGGER.info(f"Extracting {project} to {output_path}")
+            expected_verse_count = get_expected_verse_count(project, args.include, args.exclude)
+
+            check_versification(project)
+            corpus_filename, verse_count = extract_project(
+                project,
+                output_path,
+                args.include,
+                args.exclude,
+                args.markers,
+                args.lemmas,
+                args.project_vrefs,
+            )
+
+            # check if the number of lines in the file is correct (the same as vref.txt)
+            LOGGER.info(f"# of Verses: {verse_count}")
+            if verse_count != expected_verse_count:
+                LOGGER.error(f"The number of verses is {verse_count}, but should be {expected_verse_count}.")
+            terms_count = extract_term_renderings(project, corpus_filename, terms_path)
+            LOGGER.info(f"# of Terms: {terms_count}")
+        LOGGER.info("Done.")
+    else:
+        LOGGER.warning(f"Couldn't find any data to process for any project in {input_path}.")
+
+if __name__ == "__main__":
+    main()
diff --git a/silnlp/common/check_books.py b/silnlp/common/check_books.py
index b069c5bf..485bdb01 100644
--- a/silnlp/common/check_books.py
+++ b/silnlp/common/check_books.py
@@ -50,7 +50,8 @@ def parse_book(project_dir: str, book: str):
     settings = FileParatextProjectSettingsParser(project_dir).parse()
 
     book_path = Path(project_dir) / settings.get_book_file_name(book)
-
+    LOGGER.info(f"Attempting to parse {book} from {book_path}.")
+
     if not book_path.is_file():
         raise RuntimeError(f"Can't find file {book_path} for book {book}")
diff --git a/silnlp/common/combine_scores_save.py b/silnlp/common/combine_scores_save.py
new file mode 100644
index 00000000..098846a5
--- /dev/null
+++ b/silnlp/common/combine_scores_save.py
@@ -0,0 +1,116 @@
+import argparse
+import csv
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+import pandas as pd
+
+from ..common.environment import SIL_NLP_ENV
+
+
+def check_for_lock_file(folder: Path, filename: str, file_type: str):
+    """Check for lock files and ask the user to close them then exit."""
+
+    if file_type[0] == ".":
+        file_type = file_type[1:]
+
+    if file_type.lower() == "csv":
+        lockfile = folder / f".~lock.{filename}.{file_type}#"
+    elif file_type.lower() == "xlsx":
+        lockfile = folder / f"~${filename}.{file_type}"
+
+    if lockfile.is_file():
+        print(f"Found lock file: {lockfile}")
+        print(f"Please close {filename}.{file_type} in folder {folder} OR delete the lock file and try again.")
+        sys.exit()
+
+
+def aggregate_csv(folder_path):
+    # Dictionary to store rows by header type
+    data_by_header = defaultdict(list)
+
+    # Iterate over all CSV files in the folder and its subfolders
+    for csv_file in folder_path.rglob("*/scores-*.csv"):
+        series = csv_file.parts[-3]  # Extract series folder name
+        experiment = csv_file.parts[-2]  # Extract experiment folder name
+        steps = csv_file.stem.split("-")[-1]  # Extract steps from file name
+
+        # Read the CSV file and add new columns
+        with open(csv_file, "r") as f:
+            reader = csv.reader(f)
+            rows = list(reader)
+            header = tuple(rows[0])  # Use tuple to make it hashable
+
+            # Add columns to the beginning of each row
+            if header not in data_by_header:
+                data_by_header[header].append(["Series", "Experiment", "Steps"] + list(header))
+            for row in rows[1:]:
+                data_by_header[header].append([series, experiment, steps] + row)
+
+    return data_by_header
+
+
+def write_to_csv(data_by_header, folder, output_filename):
+
+    output_file = folder / f"{output_filename}.csv"
+    with open(output_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        for header, rows in data_by_header.items():
+            writer.writerows(rows)
+            writer.writerow([])  # Add a blank row to separate different types
+        # Write the folder path to the last line of the CSV file
+        writer.writerow([folder])
+    print(f"Wrote scores to {output_file}")
+
+
+def write_to_excel(data_by_header, folder, output_filename):
+    output_file = folder / f"{output_filename}.xlsx"
+    with pd.ExcelWriter(output_file) as writer:
+        for i, (header, rows) in enumerate(data_by_header.items()):
+            # Create a DataFrame for the current header
+            df = pd.DataFrame(rows[1:], columns=rows[0])
+            # Convert columns to appropriate data types
+            df = df.apply(pd.to_numeric, errors="ignore")
+            # Generate a unique sheet name
+            sheet_name = f"Table_{i + 1}"
+            # Write the DataFrame to the Excel file
+            df.to_excel(writer, sheet_name=sheet_name, index=False)
+    print(f"Wrote scores to {output_file}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Aggregate CSV files in a folder.")
+    parser.add_argument("folder", type=Path, help="Path to the folder containing CSV files.")
+    parser.add_argument(
+        "--output_filename",
+        type=str,
+        default="scores",
+        help="Filename suffix without the '.csv' or '.xlsx'. \
+        The folder name is added as a prefix to make it easier to distinguish scores files in search results.",
+    )
+    args = parser.parse_args()
+
+    folder = Path(args.folder)
+
+    # Use only the folder's name (not its full path) as the filename prefix.
+    csv_filename = f"{folder.name}_{args.output_filename}"
+    excel_filename = f"{folder.name}_{args.output_filename}"
+
+    if not folder.is_dir():
+        folder = Path(SIL_NLP_ENV.mt_experiments_dir) / args.folder
+
+    # Check for lock files and ask the user to close them.
+    check_for_lock_file(folder, csv_filename, "csv")
+    check_for_lock_file(folder, excel_filename, "xlsx")
+
+    data = aggregate_csv(folder)
+
+    # Write the aggregated data to a new CSV file
+    write_to_csv(data, folder, csv_filename)
+
+    # Write the aggregated data to an Excel file
+    write_to_excel(data, folder, excel_filename)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/silnlp/common/find_by_iso2.py b/silnlp/common/find_by_iso2.py
new file mode 100644
index 00000000..7e4c3c2a
--- /dev/null
+++ b/silnlp/common/find_by_iso2.py
@@ -0,0 +1,244 @@
+import argparse
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List, Set, Tuple, Union
+import sys
+import yaml
+
+from .environment import SIL_NLP_ENV
+from .iso_info import NLLB_ISO_SET, ALT_ISO
+
+IsoCode = str
+IsoCodeList = List[IsoCode]
+IsoCodeSet = Set[IsoCode]
+
+LANGUAGE_FAMILY_FILE = SIL_NLP_ENV.assets_dir / "languageFamilies.json"
+
+def is_file_pattern(input_str: str) -> bool:
+    """Check if the input string contains a hyphen, indicating it's a filename pattern."""
+    return '-' in input_str
+
+def split_input_list(input_list: List[str]) -> Tuple[List[str], List[str]]:
+    """Split input list into ISO codes and file patterns."""
+    iso_codes = []
+    files = []
+    for item in input_list:
+        if is_file_pattern(item):
+            files.append(item)
+        else:
+            iso_codes.append(item)
+    return iso_codes, files
+
+def get_stem_name(file_path: Path) -> str:
+    """Get the stem name without path or extension."""
+    return file_path.stem
+
+
+def load_language_data(file_path: Path) -> Tuple[Dict, Dict, Dict]:
+    try:
+        with open(file_path, "r", encoding="utf-8") as file:
+            raw_data = json.load(file)
+    except FileNotFoundError:
+        logging.error(f"File not found: {file_path}")
+        return {}, {}, {}
+    except json.JSONDecodeError:
+        logging.error(f"Error decoding JSON from file: {file_path}")
+        return {}, {}, {}
+
+    language_data = {}
+    country_data = {}
+    family_data = {}
+
+    for lang in raw_data:
+        iso = lang["isoCode"]
+        country = lang["langCountry"]
+        family = lang["languageFamily"]
+
+        language_data[iso] = {
+            "Name": lang["language"],
+            "Country": country,
+            "Family": family,
+        }
+
+        country_data.setdefault(country, []).append(iso)
+        family_data.setdefault(family, []).append(iso)
+
+    return language_data, country_data, family_data
+
+
+def find_related_isocodes(
+    iso_codes: IsoCodeList, language_data: Dict, country_data: Dict, family_data: Dict
+) -> IsoCodeList:
+    iso_set = set(iso_codes)
+
+    for iso_code in iso_codes:
+        if iso_code in language_data:
+            lang_info = language_data[iso_code]
+# logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}")
+
+            iso_set.update(country_data.get(lang_info["Country"], []))
+            iso_set.update(family_data.get(lang_info["Family"], []))
+
+    return sorted(iso_set)
+
+
+def get_files_by_iso(isocodes: IsoCodeList, scripture_dir: Path) -> List[Path]:
+    return [
+        file for file in scripture_dir.glob('*.txt')
+        if any(file.stem.startswith(isocode + '-') for isocode in isocodes)
+    ]
+
+def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict[Path, Path], Dict[Path, Path]]:
+    existing_projects = {}
+    missing_projects = {}
+
+    for file in files:
+        project = projects_dir / file.stem.split("-")[1]
+        if project.is_dir():
+            existing_projects[file] = project
+        else:
+            missing_projects[file] = project
+
+    return existing_projects, missing_projects
+
+
+def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
+    return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code}
+
+def resolve_config_path(config_folder: Path) -> Path:
+    """Resolve config folder path relative to experiments directory if not absolute."""
+    if not config_folder.is_absolute():
+        return SIL_NLP_ENV.mt_experiments_dir / config_folder
+    return config_folder
+
+def create_alignment_config(source_files: List[Path], target_files: List[str]) -> dict:
+    """Create the alignment configuration dictionary."""
+    config = {
+        'data': {
+            'aligner': 'fast_align',
+            'corpus_pairs': [{
+                'type': 'train',
+                'src': [get_stem_name(f) for f in source_files],
+                'trg': target_files,
+                'mapping': 'many_to_many',
+                'test_size': 0,
+                'val_size': 0
+            }],
+            'tokenize': False
+        }
+    }
+    return config
+
+def write_or_print_config(config: dict, config_folder: Path = None):
+    """Write config to file or print to terminal."""
+    if config_folder:
+        config_folder = Path(config_folder)
+        if not config_folder.is_absolute():
+            config_folder = SIL_NLP_ENV.mt_experiments_dir / config_folder
+        config_folder.mkdir(parents=True, exist_ok=True)
+        config_path = config_folder / 'config.yml'
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+        return str(config_path)
+    else:
+        return yaml.dump(config, default_flow_style=False, sort_keys=False)
+
+def main():
+    parser = argparse.ArgumentParser(description="Find related ISO language codes and create alignment config.")
+    parser.add_argument("inputs", nargs="+",
+                        help="ISO codes or file patterns (e.g., 'fra' or 'en-NIV')")
+    parser.add_argument("--scripture-dir", type=Path,
+                        default=Path(SIL_NLP_ENV.mt_scripture_dir),
+                        help="Directory containing scripture files")
+    parser.add_argument("--all-related", action='store_true',
+                        help="List all related scriptures without filtering to those that are part of NLLB")
+    parser.add_argument("--no-related", action='store_true',
+                        help="Only list scriptures in the specified languages and not in related languages")
+    parser.add_argument("--output", type=Path, help="Output to the specified file.")
+    parser.add_argument("--target-files", nargs="+",
+                        help="List of target files in format <iso>-<project>")
+    parser.add_argument("--config-folder", type=Path,
+                        help="Folder to write the config.yml file (absolute or relative to mt_experiments_dir)")
+
+    args = parser.parse_args()
+
+    # Setup logging
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.INFO)
+    formatter = logging.Formatter('%(message)s')
+
+    if args.output:
+        file_handler = logging.FileHandler(args.output)
+        file_handler.setFormatter(formatter)
+        logger.addHandler(file_handler)
+    else:
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
+
+    # Split inputs into ISO codes and file patterns
+    iso_codes, file_patterns = split_input_list(args.inputs)
+
+    source_files = []
+    if iso_codes:
+        # Load language data and process ISO codes
+        language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE)
+        if not language_data:
+            logging.error("Failed to load language data.")
+            return
+
+        iso_codes = get_equivalent_isocodes(iso_codes)
+
+        if args.no_related:
+            codes_to_find = list(iso_codes)
+            logger.info(f"\nConsidering only the specified iso codes and their equivalents: {codes_to_find}")
+        else:
+            codes_to_find = find_related_isocodes(list(iso_codes), language_data, country_data, family_data)
+            logger.info(f"\nFound {len(codes_to_find)} related languages:\n{codes_to_find}.")
+
+        if not args.all_related:
+            codes_to_find = [iso for iso in codes_to_find if iso in NLLB_ISO_SET]
+            logger.info(f"\nFound {len(codes_to_find)} specified or related languages in NLLB:\n{codes_to_find}")
+        else:
+            logger.info(f"\nFound {len(codes_to_find)} specified or related languages:\n{codes_to_find}")
+
+        # Get all possible codes and find matching files
+        all_possible_codes = get_equivalent_isocodes(codes_to_find)
+        source_files.extend(get_files_by_iso(all_possible_codes, args.scripture_dir))
+
+    # Add files from file patterns
+    if file_patterns:
+        pattern_files = [args.scripture_dir / f"{pattern}.txt" for pattern in file_patterns]
+        existing_files = [f for f in pattern_files if f.exists()]
+        source_files.extend(existing_files)
+        if len(existing_files) < len(pattern_files):
+            missing = set(file_patterns) - set(get_stem_name(f) for f in existing_files)
+            logger.warning(f"Could not find these files: {missing}")
+
+    if not source_files:
+        logger.error("\nCouldn't find any Scripture files.")
+        return
+
+    # Use target files from command line or file patterns from inputs
+    target_files = args.target_files if args.target_files else file_patterns
+
+    # Create and output configuration
+    config = create_alignment_config(source_files, target_files)
+    result = write_or_print_config(config, args.config_folder)
+
+    if args.config_folder:
+        logger.info(f"\nCreated alignment configuration in: {result}")
+    else:
+        logger.info("\nAlignment configuration:")
+        logger.info(result)
+
+    logger.info(f"\nSource files found: {len(source_files)}")
+    for file in source_files:
+        logger.info(f"  - {get_stem_name(file)}")
+    logger.info(f"\nTarget files: {len(target_files)}")
+    for file in target_files:
+        logger.info(f"  - {file}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/silnlp/common/usfm_utils.py b/silnlp/common/usfm_utils.py
index b715b6ec..849547da 100644
--- a/silnlp/common/usfm_utils.py
+++ b/silnlp/common/usfm_utils.py
@@ -6,13 +6,13 @@ def main() -> None:
     """
     Print out all paragraph and character markers for a book
-    To use set book, fpath, and out_path. fpath should be a path to a book in a Paratext project
+    To use set book, fpath, and marker_file. fpath should be a path to a book in a Paratext project
     """
 
-    book = "MAT"
-    fpath = Path("")
-    out_path = Path("")
-    sentences_file = Path("")
+    book = "PRO"
+    fpath = Path(r"M:/Paratext/projects/NIV11/20PRONIV11.SFM")
+    marker_file = Path(r"E:/Work/Draft Quality Improvements/NIV_PRO_paragraphs.txt")
+    sentences_file = Path(r"E:/Work/Draft Quality Improvements/NIV_PRO_sentences.txt")
 
     settings = FileParatextProjectSettingsParser(fpath.parent).parse()
     file_text = UsfmFileText(
@@ -52,7 +52,7 @@ def main() -> None:
         elif tok.type in [UsfmTokenType.PARAGRAPH, UsfmTokenType.CHARACTER, UsfmTokenType.END]:
             usfm_markers[-1].append(tok.marker)
 
-    with out_path.open("w", encoding=settings.encoding) as f:
+    with marker_file.open("w", encoding=settings.encoding) as f:
         for ref, markers in zip(vrefs, usfm_markers):
             f.write(f"{ref} {markers}\n")

From f9886f6feced7d70bfe988247c231fbb7c3cbdec Mon Sep 17 00:00:00 2001
From: David Baines
Date: Thu, 13 Nov 2025 14:12:29 +0000
Subject: [PATCH 2/4] Add filters to find_by_iso

---
 silnlp/common/find_by_iso.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py
index a47effa8..7287f120 100644
--- a/silnlp/common/find_by_iso.py
+++ b/silnlp/common/find_by_iso.py
@@ -3,8 +3,10 @@
 import logging
 from pathlib import Path
 from typing import Dict, List, Set, Tuple, Union
+import regex as re
 import sys
 
+
 from .environment import SIL_NLP_ENV
 from .iso_info import NLLB_ISO_SET, ALT_ISO
 
@@ -85,6 +87,21 @@ def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict
 def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
     return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code}
 
+def filter_files(files: List[Path], excluded_patterns:List[str]) -> List[Path]:
+    filtered = []
+    date_pattern = re.compile(r'_\d{4}_\d{1,2}_\d{1,2}|_\d{1,2}_\d{1,2}_\d{4}')
+
+    for file in files:
+        parts = file.stem.split('-', 1)
+        if len(parts) != 2: continue
+        iso, name = parts
+        if date_pattern.search(name): continue
+        if len(iso) not in (2, 3): continue
+        if any(pattern.lower() in name.lower() for pattern in excluded_patterns): continue
+        if file.is_file() and file.stat().st_size < 100_000: continue
+        filtered.append(file)
+    return filtered
+
 def main():
     parser = argparse.ArgumentParser(description="Find related ISO language codes.")
     parser.add_argument("iso_codes", nargs="+", help="ISO codes to find related languages for")
@@ -151,7 +168,13 @@ def main():
 
     # Find files matching the codes
     files = get_files_by_iso(all_possible_codes, scripture_dir)
-    existing_projects, missing_projects = split_files_by_projects(files, projects_dir)
+
+    # Filter out AI and XRI files, and others.
+    excluded_patterns = ['XRI', '600M', '3.3B', '1.3B', 'words', 'name', 'clean', 'transcription','matthew', 'mark', 'mrk','luk']
+    filtered_files = filter_files(files, excluded_patterns)
+    print(f"There are {len(files)} files and {len(files)-len(filtered_files)} were filtered out.")
+
+    existing_projects, missing_projects = split_files_by_projects(filtered_files, projects_dir)
 
     # Display results
     if existing_projects:
@@ -163,8 +186,8 @@ def main():
         logger.info(f"\nThese {len(missing_projects)} files don't have a corresponding project folder:")
         for file, _ in missing_projects.items():
             logger.info(f"{file.stem}")
-    logger.info(f"\nAll the files:")
-    for file in files:
+    logger.info(f"\nFiltered files:")
+    for file in filtered_files:
         logger.info(f"  - {file.stem}")
 
     if not files:

From 8c6c07ec34ef050d8d2e1aa4351148b59afa46a7 Mon Sep 17 00:00:00 2001
From: David Baines
Date: Tue, 18 Nov 2025 14:17:51 +0000
Subject: [PATCH 3/4] Remove spurious files from commit

---
 debug.log                           |   2 -
 silnlp/common/bulk_extract_local.py | 143 ----------------
 silnlp/common/find_by_iso.py        |   5 +-
 silnlp/common/find_by_iso2.py       | 244 ----------------------------
 silnlp/common/usfm_utils.py         |  66 --------
 5 files changed, 1 insertion(+), 459 deletions(-)
 delete mode 100644 debug.log
 delete mode 100644 silnlp/common/bulk_extract_local.py
 delete mode 100644 silnlp/common/find_by_iso2.py
 delete mode 100644 silnlp/common/usfm_utils.py

diff --git a/debug.log b/debug.log
deleted file mode 100644
index 93e4b7ba..00000000
--- a/debug.log
+++ /dev/null
@@ -1,2 +0,0 @@
-[0529/225757.623:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
-[0529/225806.839:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
diff --git a/silnlp/common/bulk_extract_local.py b/silnlp/common/bulk_extract_local.py
deleted file mode 100644
index 34270a28..00000000
--- a/silnlp/common/bulk_extract_local.py
+++ /dev/null
@@ -1,143 +0,0 @@
-import argparse
-import logging
-from pathlib import Path
-from typing import List
-import sys
-
-from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef, get_books
-
-from .corpus import count_lines
-from .paratext import check_versification, extract_project, extract_term_renderings
-from machine.corpora import FileParatextProjectSettingsParser
-from ..common.environment import SIL_NLP_ENV
-
-LOGGER = logging.getLogger(__package__ + ".bulk_extract_local")
-SETTINGS_FILENAME = "Settings.xml"
-
-
-def parse_settings(project):
-    settings_file_path = project / SETTINGS_FILENAME
-    if not settings_file_path.is_file():
-        LOGGER.warning(f"Warning: {SETTINGS_FILENAME} not found.")
-        return
-
-    try:
-        parser = FileParatextProjectSettingsParser(str(project))
-        project_settings = parser.parse()
-
-        # project_settings.name
-        # project_settings.full_name
-        # if project_settings.encoding:
-        #     self.setting_encoding = getattr(project_settings.encoding, 'name', str(project_settings.encoding))
-
-        # if project_settings.versification:
-        #     setting_versification = getattr(project_settings.versification, 'name', str(project_settings.versification))
-
-        # project_settings.file_name_prefix
-        # project_settings.file_name_form
-        # project_settings.file_name_suffix
-        # project_settings.biblical_terms_list_type
-        # project_settings.biblical_terms_project_name
-        # project_settings.biblical_terms_file_name
-        # project_settings.language_code
-
-    except Exception as e:
-        print(f"Error parsing {SETTINGS_FILENAME}: {e}")
-        return None
-
-    return project_settings
-
-def get_expected_verse_count(project: Path, include: List[str], exclude: List[str]) -> int:
-    include_books_set = get_books(include) if len(include) > 0 else None
-    exclude_books_set = get_books(exclude) if len(exclude) > 0 else None
-    project_settings = parse_settings(project)
-
-    if project_settings.versification:
-        setting_versification = getattr(project_settings.versification, 'name', str(project_settings.versification))
-        print(f"Found versification {setting_versification} in {SETTINGS_FILENAME} for {project}")
-
-    def filter_lines(verse_ref_str: str) -> bool:
-        if include_books_set is None and exclude_books_set is None:
-            return True
-
-        vref = VerseRef.from_string(verse_ref_str.strip(), setting_versification)
-        if exclude_books_set is not None and vref.book_num in exclude_books_set:
-            return False
-
-        if include_books_set is not None and vref.book_num in include_books_set:
-            return True
-
-        return include_books_set is None
-
-    return count_lines(SIL_NLP_ENV.assets_dir / "vref.txt", filter_lines)
-
-
-def has_settings_file(project_folder: Path) -> bool:
-    return (project_folder / SETTINGS_FILENAME).is_file() or (project_folder / SETTINGS_FILENAME.lower()).is_file()
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(description="Extracts text corpora from Paratext projects")
-    parser.add_argument("input", type=str, help="The input folder.")
-    parser.add_argument("output", type=str, help="The output corpus folder.")
-    parser.add_argument("--terms", type=str, required=True, help="The output terms folder.")
-    parser.add_argument(
-        "--include", metavar="books", nargs="+", default=[], help="The books to include; e.g., 'NT', 'OT', 'GEN'"
-    )
-    parser.add_argument(
-        "--exclude", metavar="books", nargs="+", default=[], help="The books to exclude; e.g., 'NT', 'OT', 'GEN'"
-    )
-    parser.add_argument("--markers", default=False, action="store_true", help="Include USFM markers")
-    parser.add_argument("--lemmas", default=False, action="store_true", help="Extract lemmas if available")
-    parser.add_argument("--project-vrefs", default=False, action="store_true", help="Extract project verse refs")
-
-    args = parser.parse_args()
-
-    input_path = Path(args.input)
-    output_path = Path(args.output)
-    terms_path = Path(args.terms)
-
-    if not input_path.is_dir():
-        print(f"Error: Projects folder not found: {args.input}")
-        sys.exit(1)
-
-    if not output_path.is_dir():
-        print(f"Error: Output folder not found: {args.output}")
-        sys.exit(1)
-
-    if not terms_path.is_dir():
-        print(f"Error: Output terms folder not found: {args.terms}")
-        sys.exit(1)
-
-    # Which folders have a Settings.xml file we can find?
-    projects = [folder for folder in input_path.glob("*") if folder.is_dir() and has_settings_file(folder)]
-
-    # Process the projects that have data and tell the user.
-    if len(projects) > 0:
-        for project in projects:
-            LOGGER.info(f"Extracting {project} to {output_path}")
-            expected_verse_count = get_expected_verse_count(project, args.include, args.exclude)
-
-            check_versification(project)
-            corpus_filename, verse_count = extract_project(
-                project,
-                output_path,
-                args.include,
-                args.exclude,
-                args.markers,
-                args.lemmas,
-                args.project_vrefs,
-            )
-
-            # check if the number of lines in the file is correct (the same as vref.txt)
-            LOGGER.info(f"# of Verses: {verse_count}")
-            if verse_count != expected_verse_count:
-                LOGGER.error(f"The number of verses is {verse_count}, but should be {expected_verse_count}.")
-            terms_count = extract_term_renderings(project, corpus_filename, terms_path)
-            LOGGER.info(f"# of Terms: {terms_count}")
-        LOGGER.info("Done.")
-    else:
-        LOGGER.warning(f"Couldn't find any data to process for any project in {input_path}.")
-
-if __name__ == "__main__":
-    main()
diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py
index 7287f120..f2363755 100644
--- a/silnlp/common/find_by_iso.py
+++ b/silnlp/common/find_by_iso.py
@@ -2,7 +2,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Dict, List, Set, Tuple, Union
+from typing import Dict, List, Set, Tuple
 import regex as re
 import sys
 
@@ -56,8 +56,6 @@ def find_related_isocodes(
     for iso_code in iso_codes:
         if iso_code in language_data:
             lang_info = language_data[iso_code]
-# logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}")
-
             iso_set.update(country_data.get(lang_info["Country"], []))
             iso_set.update(family_data.get(lang_info["Family"], []))
 
@@ -114,7 +112,6 @@ def main():
 
     # Create a custom logger
     logger = logging.getLogger(__name__)
-    #logger.basicConfig()
 
     # Set the global logging level
     logger.setLevel(logging.INFO)
diff --git a/silnlp/common/find_by_iso2.py b/silnlp/common/find_by_iso2.py
deleted file mode 100644
index 7e4c3c2a..00000000
--- a/silnlp/common/find_by_iso2.py
+++ /dev/null
@@ -1,244 +0,0 @@
-import argparse
-import json
-import logging
-from pathlib import Path
-from typing import Dict, List, Set, Tuple, Union
-import sys
-import yaml
-
-from .environment import SIL_NLP_ENV
-from .iso_info import NLLB_ISO_SET, ALT_ISO
-
-IsoCode = str
-IsoCodeList = List[IsoCode]
-IsoCodeSet = Set[IsoCode]
-
-LANGUAGE_FAMILY_FILE = SIL_NLP_ENV.assets_dir / "languageFamilies.json"
-
-def is_file_pattern(input_str: str) -> bool:
-    """Check if the input string contains a hyphen, indicating it's a filename pattern."""
-    return '-' in input_str
-
-def split_input_list(input_list: List[str]) -> Tuple[List[str], List[str]]:
-    """Split input list into ISO codes and file patterns."""
-    iso_codes = []
-    files = []
-    for item in input_list:
-        if is_file_pattern(item):
-            files.append(item)
-        else:
-            iso_codes.append(item)
-    return iso_codes, files
-
-def get_stem_name(file_path: Path) -> str:
-    """Get the stem name without path or extension."""
-    return file_path.stem
-
-
-def load_language_data(file_path: Path) -> Tuple[Dict, Dict, Dict]:
-    try:
-        with open(file_path, "r", encoding="utf-8") as file:
-            raw_data = json.load(file)
-    except FileNotFoundError:
-        logging.error(f"File not found: {file_path}")
-        return {}, {}, {}
-    except json.JSONDecodeError:
-        logging.error(f"Error decoding JSON from file: {file_path}")
-        return {}, {}, {}
-
-    language_data = {}
-    country_data = {}
-    family_data = {}
-
-    for lang in raw_data:
-        iso = lang["isoCode"]
-        country = lang["langCountry"]
-        family = lang["languageFamily"]
-
-        language_data[iso] = {
-            "Name": lang["language"],
-            "Country": country,
-            "Family": family,
-        }
-
-        country_data.setdefault(country, []).append(iso)
-        family_data.setdefault(family, []).append(iso)
-
-    return language_data, country_data, family_data
-
-
-def find_related_isocodes(
-    iso_codes: IsoCodeList, language_data: Dict, country_data: Dict, family_data: Dict
-) -> IsoCodeList:
-    iso_set = set(iso_codes)
-
-    for iso_code in iso_codes:
-        if iso_code in language_data:
-            lang_info = language_data[iso_code]
-# logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}")
-
-            iso_set.update(country_data.get(lang_info["Country"], []))
-            iso_set.update(family_data.get(lang_info["Family"], []))
-
-    return sorted(iso_set)
-
-
-def get_files_by_iso(isocodes: IsoCodeList, scripture_dir: Path) -> List[Path]:
-    return [
-        file for file in scripture_dir.glob('*.txt')
-        if any(file.stem.startswith(isocode + '-') for isocode in isocodes)
-    ]
-
-def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict[Path, Path], Dict[Path, Path]]:
-    existing_projects = {}
-    missing_projects = {}
-
-    for file in files:
-        project = projects_dir / file.stem.split("-")[1]
-        if project.is_dir():
-            existing_projects[file] = project
-        else:
-            missing_projects[file] = project
-
-    return existing_projects, missing_projects
-
-
-def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
-    return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code}
-
-def resolve_config_path(config_folder: Path) -> Path:
-    """Resolve config folder path relative to experiments directory if not absolute."""
-    if not config_folder.is_absolute():
-        return SIL_NLP_ENV.mt_experiments_dir / config_folder
-    return config_folder
-
-def create_alignment_config(source_files: List[Path], target_files: List[str]) -> dict:
-    """Create the alignment configuration dictionary."""
-    config = {
-        'data': {
-            'aligner': 'fast_align',
-            'corpus_pairs': [{
-                'type': 'train',
-                'src': [get_stem_name(f) for f in source_files],
-                'trg': target_files,
-                'mapping': 'many_to_many',
-                'test_size': 0,
-                'val_size': 0
-            }],
-            'tokenize': False
-        }
-    }
-    return config
-
-def write_or_print_config(config: dict, config_folder: Path = None):
-    """Write config to file or print to terminal."""
-    if config_folder:
-        config_folder = Path(config_folder)
-        if not config_folder.is_absolute():
-            config_folder = SIL_NLP_ENV.mt_experiments_dir / config_folder
-        config_folder.mkdir(parents=True, exist_ok=True)
-        config_path = config_folder / 'config.yml'
-        with open(config_path, 'w') as f:
-            yaml.dump(config, f, default_flow_style=False, sort_keys=False)
-        return str(config_path)
-    else:
-        return yaml.dump(config, default_flow_style=False, sort_keys=False)
-
-def main():
-    parser = argparse.ArgumentParser(description="Find related ISO language codes and create alignment config.")
-    parser.add_argument("inputs", nargs="+",
-                        help="ISO codes or file patterns (e.g., 'fra' or 'en-NIV')")
-    parser.add_argument("--scripture-dir", type=Path,
-                        default=Path(SIL_NLP_ENV.mt_scripture_dir),
-                        help="Directory containing scripture files")
-    parser.add_argument("--all-related", action='store_true',
-                        help="List all related scriptures without filtering to those that are part of NLLB")
-    parser.add_argument("--no-related", action='store_true',
-                        help="Only list scriptures in the specified languages and not in related languages")
-    parser.add_argument("--output", type=Path, help="Output to the specified file.")
-    parser.add_argument("--target-files", nargs="+",
-                        help="List of target files in format <iso>-<project>")
-    parser.add_argument("--config-folder", type=Path,
-                        help="Folder to write the config.yml file (absolute or relative to mt_experiments_dir)")
-
-    args = parser.parse_args()
-
-    # Setup logging
-    logger = logging.getLogger(__name__)
-    logger.setLevel(logging.INFO)
-    formatter = logging.Formatter('%(message)s')
-
-    if args.output:
-        file_handler = logging.FileHandler(args.output)
-        file_handler.setFormatter(formatter)
-        logger.addHandler(file_handler)
-    else:
-        console_handler = logging.StreamHandler(sys.stdout)
-        console_handler.setFormatter(formatter)
-        logger.addHandler(console_handler)
-
-    # Split inputs into ISO codes and file patterns
-    iso_codes, file_patterns = split_input_list(args.inputs)
-
-    source_files = []
-    if iso_codes:
-        # Load language data and process ISO codes
-        language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE)
-        if not language_data:
-            logging.error("Failed to load language data.")
-            return
-
-        iso_codes = get_equivalent_isocodes(iso_codes)
-
-        if args.no_related:
-            codes_to_find = list(iso_codes)
-            logger.info(f"\nConsidering only the specified iso codes and their equivalents: {codes_to_find}")
-        else:
-            codes_to_find = find_related_isocodes(list(iso_codes), language_data, country_data, family_data)
-            logger.info(f"\nFound {len(codes_to_find)} related languages:\n{codes_to_find}.")
-
-        if not args.all_related:
-            codes_to_find = [iso for iso in codes_to_find if iso in NLLB_ISO_SET]
-            logger.info(f"\nFound {len(codes_to_find)} specified or related languages in NLLB:\n{codes_to_find}")
-        else:
-            logger.info(f"\nFound {len(codes_to_find)} specified or related languages:\n{codes_to_find}")
-
-        # Get all possible codes and find matching files
-        all_possible_codes = get_equivalent_isocodes(codes_to_find)
-        source_files.extend(get_files_by_iso(all_possible_codes, args.scripture_dir))
-
-    # Add files from file patterns
-    if file_patterns:
-        pattern_files = [args.scripture_dir / f"{pattern}.txt" for pattern in file_patterns]
-        existing_files = [f for f in pattern_files if f.exists()]
-        source_files.extend(existing_files)
-        if len(existing_files) < len(pattern_files):
-            missing = set(file_patterns) - set(get_stem_name(f) for f in existing_files)
-            logger.warning(f"Could not find these files: {missing}")
-
-    if not source_files:
-        logger.error("\nCouldn't find any Scripture files.")
-        return
-
-    # Use target files from command line or file patterns from inputs
-    target_files = args.target_files if args.target_files else file_patterns
-
-    # Create and output configuration
-    config = create_alignment_config(source_files, target_files)
-    result = write_or_print_config(config, args.config_folder)
-
-    if args.config_folder:
-        logger.info(f"\nCreated alignment configuration in: {result}")
-    else:
-        logger.info("\nAlignment configuration:")
-        logger.info(result)
-
-    logger.info(f"\nSource files found: {len(source_files)}")
-    for file in source_files:
-        logger.info(f"  - {get_stem_name(file)}")
-    logger.info(f"\nTarget files: {len(target_files)}")
-    for file in target_files:
-        logger.info(f"  - {file}")
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/silnlp/common/usfm_utils.py b/silnlp/common/usfm_utils.py
deleted file mode 100644
index a29d58aa..00000000
--- a/silnlp/common/usfm_utils.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from pathlib import Path
-
-from machine.corpora import FileParatextProjectSettingsParser, UsfmFileText, UsfmTokenizer, UsfmTokenType
-
-# Marker "type" is as defined by the UsfmTokenType given to tokens by the UsfmTokenizer,
-# which mostly aligns with a marker's StyleType in the USFM stylesheet
-CHARACTER_TYPE_EMBEDS = ["fig", "fm", "jmp", "rq", "va", "vp", "xt", "xtSee", "xtSeeAlso"]
-PARAGRAPH_TYPE_EMBEDS = ["lit", "r", "rem"]
-NON_NOTE_TYPE_EMBEDS = CHARACTER_TYPE_EMBEDS + PARAGRAPH_TYPE_EMBEDS
-
-
-def main() -> None:
-    """
-    Print out all paragraph and character markers for a book
-    To use set book, fpath, and marker_file. fpath should be a path to a book in a Paratext project
-    """
-
-    book = "PRO"
-    fpath = Path(r"M:/Paratext/projects/NIV11/20PRONIV11.SFM")
-    marker_file = Path(r"E:/Work/Draft Quality Improvements/NIV_PRO_paragraphs.txt")
-    sentences_file = Path(r"E:/Work/Draft Quality Improvements/NIV_PRO_sentences.txt")
-
-    settings = FileParatextProjectSettingsParser(fpath.parent).parse()
-    file_text = UsfmFileText(
-        settings.stylesheet,
-        settings.encoding,
-        book,
-        fpath,
-        settings.versification,
-        include_markers=True,
-        include_all_text=True,
-        project=settings.name,
-    )
-
-    vrefs = []
-    usfm_markers = []
-    usfm_tokenizer = UsfmTokenizer(settings.stylesheet)
-    with sentences_file.open("w", encoding=settings.encoding) as f:
-        for sent in file_text:
-            f.write(f"{sent}\n")
-            if len(sent.ref.path) > 0 and sent.ref.path[-1].name in PARAGRAPH_TYPE_EMBEDS:
-                continue
-
-            vrefs.append(sent.ref)
-            usfm_markers.append([])
-            usfm_toks = usfm_tokenizer.tokenize(sent.text.strip())
-
-            ignore_scope = None
-            for tok in usfm_toks:
-                if ignore_scope is not None:
-                    if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
-                        ignore_scope = None
-                elif tok.type == UsfmTokenType.NOTE or (
-                    tok.type == UsfmTokenType.CHARACTER and tok.marker in CHARACTER_TYPE_EMBEDS
-                ):
-                    ignore_scope = tok
-                elif tok.type in [UsfmTokenType.PARAGRAPH, UsfmTokenType.CHARACTER, UsfmTokenType.END]:
-                    usfm_markers[-1].append(tok.marker)
-
-    with marker_file.open("w", encoding=settings.encoding) as f:
-        for ref, markers in zip(vrefs, usfm_markers):
-            f.write(f"{ref} {markers}\n")
-
-
-if __name__ == "__main__":
-    main()

From d32e059ca83f647780e9cb0170f0adfea7d3fd8f Mon Sep 17 00:00:00 2001
From: David Baines
Date: Wed, 19 Nov 2025 12:06:41 +0000
Subject: [PATCH 4/4] Keep files created today

---
 silnlp/common/find_by_iso.py | 103 ++++++++++++++++++++++++-----------
 1 file changed, 72 insertions(+), 31 deletions(-)

diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py
index f2363755..a91177ef 100644
--- a/silnlp/common/find_by_iso.py
+++ b/silnlp/common/find_by_iso.py
@@ -1,14 +1,15 @@
 import argparse
 import json
 import logging
+import sys
+from datetime import datetime
 from pathlib import Path
 from typing import Dict, List, Set, Tuple
-import regex as re
-import sys
+
+import regex as re
 
 from .environment import SIL_NLP_ENV
-from .iso_info import NLLB_ISO_SET, ALT_ISO
+from .iso_info import ALT_ISO, NLLB_ISO_SET
 
 IsoCode = str
 IsoCodeList = List[IsoCode]
@@ -16,6 +17,7 @@
 
 LANGUAGE_FAMILY_FILE = SIL_NLP_ENV.assets_dir / "languageFamilies.json"
 
+
 def load_language_data(file_path: Path) -> Tuple[Dict, Dict, Dict]:
     try:
         with open(file_path, "r", encoding="utf-8") as file:
@@ -64,10 +66,10 @@ def find_related_isocodes(
 
 def get_files_by_iso(isocodes: IsoCodeList, scripture_dir: Path) -> List[Path]:
     return [
-        file for file in scripture_dir.glob('*.txt')
-        if any(file.stem.startswith(isocode + '-') for isocode in isocodes)
+        file for file in scripture_dir.glob("*.txt") if any(file.stem.startswith(isocode + "-") for isocode in isocodes)
     ]
 
+
 def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict[Path, Path], Dict[Path, Path]]:
     existing_projects = {}
     missing_projects = {}
@@ -85,27 +87,53 @@ def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict
 def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
     return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code}
 
-def filter_files(files: List[Path], excluded_patterns:List[str]) -> List[Path]:
+
+def filter_files(files: List[Path], excluded_patterns: List[str]) -> List[Path]:
     filtered = []
-    date_pattern = re.compile(r'_\d{4}_\d{1,2}_\d{1,2}|_\d{1,2}_\d{1,2}_\d{4}')
+
+    today = datetime.now()
+    today_pattern = re.compile(f"{today.strftime('_%Y_%m_%d')}|{today.strftime('_%d_%m_%Y')}")
+    date_pattern = re.compile(r"_\d{4}_\d{1,2}_\d{1,2}|_\d{1,2}_\d{1,2}_\d{4}")
 
     for file in files:
-        parts = file.stem.split('-', 1)
-        if len(parts) != 2: continue
+        parts = file.stem.split("-", 1)
+        if len(parts) != 2:
+            continue
         iso, name = parts
-        if date_pattern.search(name): continue
-        if len(iso) not in (2, 3): continue
-        if any(pattern.lower() in name.lower() for pattern in excluded_patterns): continue
-        if file.is_file() and file.stat().st_size < 100_000: continue
+        if today_pattern.search(name):
+            filtered.append(file)
+            continue
+        if date_pattern.search(name):
+            continue
+        if len(iso) not in (2, 3):
+            continue
+        if any(pattern.lower() in name.lower() for pattern in excluded_patterns):
+            continue
+        if file.is_file() and file.stat().st_size < 100_000:
+            continue
         filtered.append(file)
     return filtered
 
+
 def main():
     parser = argparse.ArgumentParser(description="Find related ISO language codes.")
     parser.add_argument("iso_codes", nargs="+", help="ISO codes to find related languages for")
-    parser.add_argument("--scripture-dir", type=Path, default=Path(SIL_NLP_ENV.mt_scripture_dir), help="Directory containing scripture files")
-    parser.add_argument("--all-related", action='store_true', help="List all related scriptures without filtering to those that are part of NLLB")
-    parser.add_argument("--no-related", action='store_true', help="Only list scriptures in the specified languages and not in related languages")
+    parser.add_argument(
+        "--scripture-dir",
+        type=Path,
+        default=Path(SIL_NLP_ENV.mt_scripture_dir),
+        help="Directory containing scripture files",
+    )
+    parser.add_argument(
+        "--all-related",
+        action="store_true",
+        help="List all related scriptures without filtering to those that are part of NLLB",
+    )
+    parser.add_argument(
+        "--no-related",
+        action="store_true",
+        help="Only list scriptures in the specified languages and not in related languages",
+    )
     parser.add_argument("--output", type=Path, help="Output to the specified file.")
 
     args = parser.parse_args()
@@ -114,9 +142,9 @@ def main():
     logger = logging.getLogger(__name__)
 
     # Set the global logging level
-    logger.setLevel(logging.INFO) 
-    
-    formatter = logging.Formatter('%(message)s')
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter("%(message)s")
 
     if args.output:
         # Create handler for the file output.
@@ -129,7 +157,6 @@ def main():
         console_handler.setFormatter(formatter)
         logger.addHandler(console_handler)
 
-
     language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE)
     projects_dir = SIL_NLP_ENV.pt_projects_dir
     scripture_dir = Path(args.scripture_dir)
@@ -137,22 +164,22 @@ def main():
     if not language_data:
         logging.error("Failed to load language data.")
        return
-    
+
     # Get equivalent ISO codes for input
     iso_codes = get_equivalent_isocodes(args.iso_codes)
-    
+
     if args.no_related:
-        
+
         # Option 2: No files in related languages, only equivalent ISO codes
         codes_to_find = list(iso_codes)
         logger.info(f"\nConsidering only the specified iso codes and their equivalents. {codes_to_find}")
-    
+
     else:
         # Find related ISO codes
         codes_to_find = find_related_isocodes(list(iso_codes), language_data, country_data, family_data)
         logger.info(f"\nFound {len(codes_to_find)} related languages:\n{codes_to_find}.")
 
-    if not args.all_related:    
+    if not args.all_related:
         # Option 3 (default): Filter to NLLB languages
         codes_to_find = [iso for iso in codes_to_find if iso in NLLB_ISO_SET]
         logger.info(f"\nFound {len(codes_to_find)} specified or related languages in NLLB:\n{codes_to_find}")
@@ -162,15 +189,28 @@ def main():
 
     # Get all possible 2 and 3 letter codes for the related languages
     all_possible_codes = get_equivalent_isocodes(codes_to_find)
-    
+
     # Find files matching the codes
     files = get_files_by_iso(all_possible_codes, scripture_dir)
-    
+
     # Filter out AI and XRI files, and others.
-    excluded_patterns = ['XRI', '600M', '3.3B', '1.3B', 'words', 'name', 'clean', 'transcription','matthew', 'mark', 'mrk','luk']
+    excluded_patterns = [
+        "XRI",
+        "600M",
+        "3.3B",
+        "1.3B",
+        "words",
+        "name",
+        "clean",
+        "transcription",
+        "matthew",
+        "mark",
+        "mrk",
+        "luk",
+    ]
     filtered_files = filter_files(files, excluded_patterns)
     print(f"There are {len(files)} files and {len(files)-len(filtered_files)} were filtered out.")
-    
+
     existing_projects, missing_projects = split_files_by_projects(filtered_files, projects_dir)
 
     # Display results
@@ -183,12 +223,13 @@ def main():
         logger.info(f"\nThese {len(missing_projects)} files don't have a corresponding project folder:")
         for file, _ in missing_projects.items():
             logger.info(f"{file.stem}")
-    logger.info(f"\nFiltered files:")
+    logger.info("\nFiltered files:")
     for file in filtered_files:
         logger.info(f"  - {file.stem}")
 
     if not files:
         logger.info("\nCouldn't find any Scripture files in these languages.")
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()