From e0de7620070da5d4ae108172ac33b5030fca2855 Mon Sep 17 00:00:00 2001
From: David Baines
Date: Wed, 18 Jun 2025 10:07:51 +0100
Subject: [PATCH 1/4] WIP: Stashing changes before find_by_iso branching

---
 debug.log                            |   2 +
 silnlp/common/bulk_extract_local.py  | 143 ++++++++++++++++
 silnlp/common/check_books.py         |   3 +-
 silnlp/common/combine_scores_save.py | 116 +++++++++++++
 silnlp/common/find_by_iso2.py        | 244 +++++++++++++++++++++++++++
 silnlp/common/usfm_utils.py          |  12 +-
 6 files changed, 513 insertions(+), 7 deletions(-)
 create mode 100644 debug.log
 create mode 100644 silnlp/common/bulk_extract_local.py
 create mode 100644 silnlp/common/combine_scores_save.py
 create mode 100644 silnlp/common/find_by_iso2.py

diff --git a/debug.log b/debug.log
new file mode 100644
index 00000000..93e4b7ba
--- /dev/null
+++ b/debug.log
@@ -0,0 +1,2 @@
+[0529/225757.623:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
+[0529/225806.839:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
diff --git a/silnlp/common/bulk_extract_local.py b/silnlp/common/bulk_extract_local.py
new file mode 100644
index 00000000..34270a28
--- /dev/null
+++ b/silnlp/common/bulk_extract_local.py
@@ -0,0 +1,143 @@
+import argparse
+import logging
+from pathlib import Path
+from typing import List
+import sys
+
+from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef, get_books
+
+from .corpus import count_lines
+from .paratext import check_versification, extract_project, extract_term_renderings
+from machine.corpora import FileParatextProjectSettingsParser
+from ..common.environment import SIL_NLP_ENV
+
+LOGGER = logging.getLogger(__package__ + ".bulk_extract_local")
+SETTINGS_FILENAME = "Settings.xml"
+
+
+def parse_settings(project):
+    settings_file_path = project / SETTINGS_FILENAME
+    if not settings_file_path.is_file():
+        LOGGER.warning(f"Warning: {SETTINGS_FILENAME} not found.")
+        return
+
+    try:
+        parser = FileParatextProjectSettingsParser(str(project))
+        project_settings = parser.parse()
+
+        # project_settings.name
+        # project_settings.full_name
+        # if project_settings.encoding:
+        #     self.setting_encoding = getattr(project_settings.encoding, 'name', str(project_settings.encoding))
+
+        # if project_settings.versification:
+        #     setting_versification = getattr(project_settings.versification, 'name', str(project_settings.versification))
+
+        # project_settings.file_name_prefix
+        # project_settings.file_name_form
+        # project_settings.file_name_suffix
+        # project_settings.biblical_terms_list_type
+        # project_settings.biblical_terms_project_name
+        # project_settings.biblical_terms_file_name
+        # project_settings.language_code
+
+    except Exception as e:
+        print(f"Error parsing {SETTINGS_FILENAME}: {e}")
+        return None
+
+    return project_settings
+
+def get_expected_verse_count(project: Path, include: List[str], exclude: List[str]) -> int:
+    include_books_set = get_books(include) if len(include) > 0 else None
+    exclude_books_set = get_books(exclude) if len(exclude) > 0 else None
+    project_settings = parse_settings(project)
+
+    if project_settings.versification:
+        setting_versification = getattr(project_settings.versification, 'name', str(project_settings.versification))
+        print(f"Found versification {setting_versification} in {SETTINGS_FILENAME} for {project}")
+
+    def filter_lines(verse_ref_str: str) -> bool:
+        if include_books_set is None and exclude_books_set is None:
+            return True
+
+        vref = VerseRef.from_string(verse_ref_str.strip(), setting_versification)
+        if exclude_books_set is not None and vref.book_num in exclude_books_set:
+            return False
+
+        if include_books_set is not None and vref.book_num in include_books_set:
+            return True
+
+        return include_books_set is None
+
+    return count_lines(SIL_NLP_ENV.assets_dir / "vref.txt", filter_lines)
+
+
+def has_settings_file(project_folder: Path) -> bool:
+    return (project_folder / SETTINGS_FILENAME).is_file() or (project_folder / SETTINGS_FILENAME.lower()).is_file()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Extracts text corpora from Paratext projects")
+    parser.add_argument("input", type=str, help="The input folder.")
+    parser.add_argument("output", type=str, help="The output corpus folder.")
+    parser.add_argument("--terms", type=str, required=True, help="The output terms folder.")
+    parser.add_argument(
+        "--include", metavar="books", nargs="+", default=[], help="The books to include; e.g., 'NT', 'OT', 'GEN'"
+    )
+    parser.add_argument(
+        "--exclude", metavar="books", nargs="+", default=[], help="The books to exclude; e.g., 'NT', 'OT', 'GEN'"
+    )
+    parser.add_argument("--markers", default=False, action="store_true", help="Include USFM markers")
+    parser.add_argument("--lemmas", default=False, action="store_true", help="Extract lemmas if available")
+    parser.add_argument("--project-vrefs", default=False, action="store_true", help="Extract project verse refs")
+
+    args = parser.parse_args()
+
+    input_path = Path(args.input)
+    output_path = Path(args.output)
+    terms_path = Path(args.terms)
+
+    if not input_path.is_dir():
+        print(f"Error: Projects folder not found: {args.input}")
+        sys.exit(1)
+
+    if not output_path.is_dir():
+        print(f"Error: Output folder not found: {args.output}")
+        sys.exit(1)
+
+    if not terms_path.is_dir():
+        print(f"Error: Output terms folder not found: {args.terms}")
+        sys.exit(1)
+
+    # Which folders have a Settings.xml file we can find?
+    projects = [folder for folder in input_path.glob("*") if folder.is_dir() and has_settings_file(folder)]
+
+    # Process the projects that have data and tell the user.
+    if len(projects) > 0:
+        for project in projects:
+            LOGGER.info(f"Extracting {project} to {output_path}")
+            expected_verse_count = get_expected_verse_count(project, args.include, args.exclude)
+
+            check_versification(project)
+            corpus_filename, verse_count = extract_project(
+                project,
+                output_path,
+                args.include,
+                args.exclude,
+                args.markers,
+                args.lemmas,
+                args.project_vrefs,
+            )
+
+            # check if the number of lines in the file is correct (the same as vref.txt)
+            LOGGER.info(f"# of Verses: {verse_count}")
+            if verse_count != expected_verse_count:
+                LOGGER.error(f"The number of verses is {verse_count}, but should be {expected_verse_count}.")
+            terms_count = extract_term_renderings(project, corpus_filename, terms_path)
+            LOGGER.info(f"# of Terms: {terms_count}")
+        LOGGER.info("Done.")
+    else:
+        LOGGER.warning(f"Couldn't find any data to process for any project in {input_path}.")
+
+if __name__ == "__main__":
+    main()
diff --git a/silnlp/common/check_books.py b/silnlp/common/check_books.py
index b069c5bf..485bdb01 100644
--- a/silnlp/common/check_books.py
+++ b/silnlp/common/check_books.py
@@ -50,7 +50,8 @@ def parse_book(project_dir: str, book: str):
     settings = FileParatextProjectSettingsParser(project_dir).parse()
 
     book_path = Path(project_dir) / settings.get_book_file_name(book)
-
+    LOGGER.info(f"Attempting to parse {book} from {book_path}.")
+
     if not book_path.is_file():
         raise RuntimeError(f"Can't find file {book_path} for book {book}")
diff --git a/silnlp/common/combine_scores_save.py b/silnlp/common/combine_scores_save.py
new file mode 100644
index 00000000..098846a5
--- /dev/null
+++ b/silnlp/common/combine_scores_save.py
@@ -0,0 +1,116 @@
+import argparse
+import csv
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+import pandas as pd
+
+from ..common.environment import SIL_NLP_ENV
+
+
+def check_for_lock_file(folder: Path, filename: str, file_type: str):
+    """Check for lock files and ask the user to close them then exit."""
+
+    if file_type[0] == ".":
+        file_type = file_type[1:]
+
+    if file_type.lower() == "csv":
+        lockfile = folder / f".~lock.{filename}.{file_type}#"
+    elif file_type.lower() == "xlsx":
+        lockfile = folder / f"~${filename}.{file_type}"
+
+    if lockfile.is_file():
+        print(f"Found lock file: {lockfile}")
+        print(f"Please close {filename}.{file_type} in folder {folder} OR delete the lock file and try again.")
+        sys.exit()
+
+
+def aggregate_csv(folder_path):
+    # Dictionary to store rows by header type
+    data_by_header = defaultdict(list)
+
+    # Iterate over all CSV files in the folder and its subfolders
+    for csv_file in folder_path.rglob("*/scores-*.csv"):
+        series = csv_file.parts[-3]  # Extract series folder name
+        experiment = csv_file.parts[-2]  # Extract experiment folder name
+        steps = csv_file.stem.split("-")[-1]  # Extract steps from file name
+
+        # Read the CSV file and add new columns
+        with open(csv_file, "r") as f:
+            reader = csv.reader(f)
+            rows = list(reader)
+            header = tuple(rows[0])  # Use tuple to make it hashable
+
+            # Add columns to the beginning of each row
+            if header not in data_by_header:
+                data_by_header[header].append(["Series", "Experiment", "Steps"] + list(header))
+            for row in rows[1:]:
+                data_by_header[header].append([series, experiment, steps] + row)
+
+    return data_by_header
+
+
+def write_to_csv(data_by_header, folder, output_filename):
+
+    output_file = folder / f"{output_filename}.csv"
+    with open(output_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        for header, rows in data_by_header.items():
+            writer.writerows(rows)
+            writer.writerow([])  # Add a blank row to separate different types
+        # Write the folder path to the last line of the CSV file
+        writer.writerow([folder])
+    print(f"Wrote scores to {output_file}")
+
+
+def write_to_excel(data_by_header, folder, output_filename):
+    output_file = folder / f"{output_filename}.xlsx"
+    with pd.ExcelWriter(output_file) as writer:
+        for i, (header, rows) in enumerate(data_by_header.items()):
+            # Create a DataFrame for the current header
+            df = pd.DataFrame(rows[1:], columns=rows[0])
+            # Convert columns to appropriate data types
+            df = df.apply(pd.to_numeric, errors="ignore")
+            # Generate a unique sheet name
+            sheet_name = f"Table_{i + 1}"
+            # Write the DataFrame to the Excel file
+            df.to_excel(writer, sheet_name=sheet_name, index=False)
+    print(f"Wrote scores to {output_file}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Aggregate CSV files in a folder.")
+    parser.add_argument("folder", type=Path, help="Path to the folder containing CSV files.")
+    parser.add_argument(
+        "--output_filename",
+        type=str,
+        default="scores",
+        help="Filename suffix without the '.csv' or '.xlsx'. \
+        The folder name is added as a prefix to make it easier to distinguish scores files in search results.",
+    )
+    args = parser.parse_args()
+
+    folder = Path(args.folder)
+
+    # Use only the folder's name (not its full path) as the filename prefix.
+    csv_filename = f"{folder.name}_{args.output_filename}"
+    excel_filename = f"{folder.name}_{args.output_filename}"
+
+    if not folder.is_dir():
+        folder = Path(SIL_NLP_ENV.mt_experiments_dir) / args.folder
+
+    # Check for lock files and ask the user to close them.
+    check_for_lock_file(folder, csv_filename, "csv")
+    check_for_lock_file(folder, excel_filename, "xlsx")
+
+    data = aggregate_csv(folder)
+
+    # Write the aggregated data to a new CSV file
+    write_to_csv(data, folder, csv_filename)
+
+    # Write the aggregated data to an Excel file
+    write_to_excel(data, folder, excel_filename)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/silnlp/common/find_by_iso2.py b/silnlp/common/find_by_iso2.py
new file mode 100644
index 00000000..7e4c3c2a
--- /dev/null
+++ b/silnlp/common/find_by_iso2.py
@@ -0,0 +1,244 @@
+import argparse
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List, Set, Tuple, Union
+import sys
+import yaml
+
+from .environment import SIL_NLP_ENV
+from .iso_info import NLLB_ISO_SET, ALT_ISO
+
+IsoCode = str
+IsoCodeList = List[IsoCode]
+IsoCodeSet = Set[IsoCode]
+
+LANGUAGE_FAMILY_FILE = SIL_NLP_ENV.assets_dir / "languageFamilies.json"
+
+def is_file_pattern(input_str: str) -> bool:
+    """Check if the input string contains a hyphen, indicating it's a filename pattern."""
+    return '-' in input_str
+
+def split_input_list(input_list: List[str]) -> Tuple[List[str], List[str]]:
+    """Split input list into ISO codes and file patterns."""
+    iso_codes = []
+    files = []
+    for item in input_list:
+        if is_file_pattern(item):
+            files.append(item)
+        else:
+            iso_codes.append(item)
+    return iso_codes, files
+
+def get_stem_name(file_path: Path) -> str:
+    """Get the stem name without path or extension."""
+    return file_path.stem
+
+
+def load_language_data(file_path: Path) -> Tuple[Dict, Dict, Dict]:
+    try:
+        with open(file_path, "r", encoding="utf-8") as file:
+            raw_data = json.load(file)
+    except FileNotFoundError:
+        logging.error(f"File not found: {file_path}")
+        return {}, {}, {}
+    except json.JSONDecodeError:
+        logging.error(f"Error decoding JSON from file: {file_path}")
+        return {}, {}, {}
+
+    language_data = {}
+    country_data = {}
+    family_data = {}
+
+    for lang in raw_data:
+        iso = lang["isoCode"]
+        country = lang["langCountry"]
+        family = lang["languageFamily"]
+
+        language_data[iso] = {
+            "Name": lang["language"],
+            "Country": country,
+            "Family": family,
+        }
+
+        country_data.setdefault(country, []).append(iso)
+        family_data.setdefault(family, []).append(iso)
+
+    return language_data, country_data, family_data
+
+
+def find_related_isocodes(
+    iso_codes: IsoCodeList, language_data: Dict, country_data: Dict, family_data: Dict
+) -> IsoCodeList:
+    iso_set = set(iso_codes)
+
+    for iso_code in iso_codes:
+        if iso_code in language_data:
+            lang_info = language_data[iso_code]
+# logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}")
+
+            iso_set.update(country_data.get(lang_info["Country"], []))
+            iso_set.update(family_data.get(lang_info["Family"], []))
+
+    return sorted(iso_set)
+
+
+def get_files_by_iso(isocodes: IsoCodeList, scripture_dir: Path) -> List[Path]:
+    return [
+        file for file in scripture_dir.glob('*.txt')
+        if any(file.stem.startswith(isocode + '-') for isocode in isocodes)
+    ]
+
+def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict[Path, Path], Dict[Path, Path]]:
+    existing_projects = {}
+    missing_projects = {}
+
+    for file in files:
+        project = projects_dir / file.stem.split("-")[1]
+        if project.is_dir():
+            existing_projects[file] = project
+        else:
+            missing_projects[file] = project
+
+    return existing_projects, missing_projects
+
+
+def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
+    return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code}
+
+def resolve_config_path(config_folder: Path) -> Path:
+    """Resolve config folder path relative to experiments directory if not absolute."""
+    if not config_folder.is_absolute():
+        return SIL_NLP_ENV.mt_experiments_dir / config_folder
+    return config_folder
+
+def create_alignment_config(source_files: List[Path], target_files: List[str]) -> dict:
+    """Create the alignment configuration dictionary."""
+    config = {
+        'data': {
+            'aligner': 'fast_align',
+            'corpus_pairs': [{
+                'type': 'train',
+                'src': [get_stem_name(f) for f in source_files],
+                'trg': target_files,
+                'mapping': 'many_to_many',
+                'test_size': 0,
+                'val_size': 0
+            }],
+            'tokenize': False
+        }
+    }
+    return config
+
+def write_or_print_config(config: dict, config_folder: Path = None):
+    """Write config to file or print to terminal."""
+    if config_folder:
+        config_folder = Path(config_folder)
+        if not config_folder.is_absolute():
+            config_folder = SIL_NLP_ENV.mt_experiments_dir / config_folder
+        config_folder.mkdir(parents=True, exist_ok=True)
+        config_path = config_folder / 'config.yml'
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+        return str(config_path)
+    else:
+        return yaml.dump(config, default_flow_style=False, sort_keys=False)
+
+def main():
+    parser = argparse.ArgumentParser(description="Find related ISO language codes and create alignment config.")
+    parser.add_argument("inputs", nargs="+",
+                        help="ISO codes or file patterns (e.g., 'fra' or 'en-NIV')")
+    parser.add_argument("--scripture-dir", type=Path,
+                        default=Path(SIL_NLP_ENV.mt_scripture_dir),
+                        help="Directory containing scripture files")
+    parser.add_argument("--all-related", action='store_true',
+                        help="List all related scriptures without filtering to those that are part of NLLB")
+    parser.add_argument("--no-related", action='store_true',
+                        help="Only list scriptures in the specified languages and not in related languages")
+    parser.add_argument("--output", type=Path, help="Output to the specified file.")
+    parser.add_argument("--target-files", nargs="+",
+                        help="List of target files in format <iso>-<project>")
+    parser.add_argument("--config-folder", type=Path,
+                        help="Folder to write the config.yml file (absolute or relative to mt_experiments_dir)")
+
+    args = parser.parse_args()
+
+    # Setup logging
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.INFO)
+    formatter = logging.Formatter('%(message)s')
+
+    if args.output:
+        file_handler = logging.FileHandler(args.output)
+        file_handler.setFormatter(formatter)
+        logger.addHandler(file_handler)
+    else:
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
+
+    # Split inputs into ISO codes and file patterns
+    iso_codes, file_patterns = split_input_list(args.inputs)
+
+    source_files = []
+    if iso_codes:
+        # Load language data and process ISO codes
+        language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE)
+        if not language_data:
+            logging.error("Failed to load language data.")
+            return
+
+        iso_codes = get_equivalent_isocodes(iso_codes)
+
+        if args.no_related:
+            codes_to_find = list(iso_codes)
+            logger.info(f"\nConsidering only the specified iso codes and their equivalents: {codes_to_find}")
+        else:
+            codes_to_find = find_related_isocodes(list(iso_codes), language_data, country_data, family_data)
+            logger.info(f"\nFound {len(codes_to_find)} related languages:\n{codes_to_find}.")
+
+        if not args.all_related:
+            codes_to_find = [iso for iso in codes_to_find if iso in NLLB_ISO_SET]
+            logger.info(f"\nFound {len(codes_to_find)} specified or related languages in NLLB:\n{codes_to_find}")
+        else:
+            logger.info(f"\nFound {len(codes_to_find)} specified or related languages:\n{codes_to_find}")
+
+        # Get all possible codes and find matching files
+        all_possible_codes = get_equivalent_isocodes(codes_to_find)
+        source_files.extend(get_files_by_iso(all_possible_codes, args.scripture_dir))
+
+    # Add files from file patterns
+    if file_patterns:
+        pattern_files = [args.scripture_dir / f"{pattern}.txt" for pattern in file_patterns]
+        existing_files = [f for f in pattern_files if f.exists()]
+        source_files.extend(existing_files)
+        if len(existing_files) < len(pattern_files):
+            missing = set(file_patterns) - set(get_stem_name(f) for f in existing_files)
+            logger.warning(f"Could not find these files: {missing}")
+
+    if not source_files:
+        logger.error("\nCouldn't find any Scripture files.")
+        return
+
+    # Use target files from command line or file patterns from inputs
+    target_files = args.target_files if args.target_files else file_patterns
+
+    # Create and output configuration
+    config = create_alignment_config(source_files, target_files)
+    result = write_or_print_config(config, args.config_folder)
+
+    if args.config_folder:
+        logger.info(f"\nCreated alignment configuration in: {result}")
+    else:
+        logger.info("\nAlignment configuration:")
+        logger.info(result)
+
+    logger.info(f"\nSource files found: {len(source_files)}")
+    for file in source_files:
+        logger.info(f"  - {get_stem_name(file)}")
+    logger.info(f"\nTarget files: {len(target_files)}")
+    for file in target_files:
+        logger.info(f"  - {file}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/silnlp/common/usfm_utils.py b/silnlp/common/usfm_utils.py
index b715b6ec..849547da 100644
--- a/silnlp/common/usfm_utils.py
+++ b/silnlp/common/usfm_utils.py
@@ -6,13 +6,13 @@ def main() -> None:
     """
     Print out all paragraph and character markers for a book
-    To use set book, fpath, and out_path. fpath should be a path to a book in a Paratext project
+    To use set book, fpath, and marker_file. fpath should be a path to a book in a Paratext project
     """
 
-    book = "MAT"
-    fpath = Path("")
-    out_path = Path("")
-    sentences_file = Path("")
+    book = "PRO"
+    fpath = Path(r"M:/Paratext/projects/NIV11/20PRONIV11.SFM")
+    marker_file = Path(r"E:/Work/Draft Quality Improvements/NIV_PRO_paragraphs.txt")
+    sentences_file = Path(r"E:/Work/Draft Quality Improvements/NIV_PRO_sentences.txt")
 
     settings = FileParatextProjectSettingsParser(fpath.parent).parse()
     file_text = UsfmFileText(
@@ -52,7 +52,7 @@ def main() -> None:
         elif tok.type in [UsfmTokenType.PARAGRAPH, UsfmTokenType.CHARACTER, UsfmTokenType.END]:
             usfm_markers[-1].append(tok.marker)
 
-    with out_path.open("w", encoding=settings.encoding) as f:
+    with marker_file.open("w", encoding=settings.encoding) as f:
         for ref, markers in zip(vrefs, usfm_markers):
             f.write(f"{ref} {markers}\n")

From f9886f6feced7d70bfe988247c231fbb7c3cbdec Mon Sep 17 00:00:00 2001
From: David Baines
Date: Thu, 13 Nov 2025 14:12:29 +0000
Subject: [PATCH 2/4] Add filters to find_by_iso

---
 silnlp/common/find_by_iso.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py
index a47effa8..7287f120 100644
--- a/silnlp/common/find_by_iso.py
+++ b/silnlp/common/find_by_iso.py
@@ -3,8 +3,10 @@
 import logging
 from pathlib import Path
 from typing import Dict, List, Set, Tuple, Union
+import regex as re
 import sys
 
+
 from .environment import SIL_NLP_ENV
 from .iso_info import NLLB_ISO_SET, ALT_ISO
 
@@ -85,6 +87,21 @@ def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict
 def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
     return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code}
 
+def filter_files(files: List[Path], excluded_patterns:List[str]) -> List[Path]:
+    filtered = []
+    date_pattern = re.compile(r'_\d{4}_\d{1,2}_\d{1,2}|_\d{1,2}_\d{1,2}_\d{4}')
+
+    for file in files:
+        parts = file.stem.split('-', 1)
+        if len(parts) != 2: continue
+        iso, name = parts
+        if date_pattern.search(name): continue
+        if len(iso) not in (2, 3): continue
+        if any(pattern.lower() in name.lower() for pattern in excluded_patterns): continue
+        if file.is_file() and file.stat().st_size < 100_000: continue
+        filtered.append(file)
+    return filtered
+
 def main():
     parser = argparse.ArgumentParser(description="Find related ISO language codes.")
     parser.add_argument("iso_codes", nargs="+", help="ISO codes to find related languages for")
@@ -151,7 +168,13 @@ def main():
 
     # Find files matching the codes
     files = get_files_by_iso(all_possible_codes, scripture_dir)
-    existing_projects, missing_projects = split_files_by_projects(files, projects_dir)
+
+    # Filter out AI and XRI files, and others.
+    excluded_patterns = ['XRI', '600M', '3.3B', '1.3B', 'words', 'name', 'clean', 'transcription','matthew', 'mark', 'mrk','luk']
+    filtered_files = filter_files(files, excluded_patterns)
+    print(f"There are {len(files)} files and {len(files)-len(filtered_files)} were filtered out.")
+
+    existing_projects, missing_projects = split_files_by_projects(filtered_files, projects_dir)
 
     # Display results
     if existing_projects:
@@ -163,8 +186,8 @@ def main():
         logger.info(f"\nThese {len(missing_projects)} files don't have a corresponding project folder:")
         for file, _ in missing_projects.items():
             logger.info(f"{file.stem}")
-    logger.info(f"\nAll the files:")
-    for file in files:
+    logger.info(f"\nFiltered files:")
+    for file in filtered_files:
         logger.info(f"  - {file.stem}")
 
     if not files:

From 8c6c07ec34ef050d8d2e1aa4351148b59afa46a7 Mon Sep 17 00:00:00 2001
From: David Baines
Date: Tue, 18 Nov 2025 14:17:51 +0000
Subject: [PATCH 3/4] Remove spurious files from commit

---
 debug.log                           |   2 -
 silnlp/common/bulk_extract_local.py | 143 ----------------
 silnlp/common/find_by_iso.py        |   5 +-
 silnlp/common/find_by_iso2.py       | 244 ----------------------------
 silnlp/common/usfm_utils.py         |  66 --------
 5 files changed, 1 insertion(+), 459 deletions(-)
 delete mode 100644 debug.log
 delete mode 100644 silnlp/common/bulk_extract_local.py
 delete mode 100644 silnlp/common/find_by_iso2.py
 delete mode 100644 silnlp/common/usfm_utils.py

diff --git a/debug.log b/debug.log
deleted file mode 100644
index 93e4b7ba..00000000
--- a/debug.log
+++ /dev/null
@@ -1,2 +0,0 @@
-[0529/225757.623:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
-[0529/225806.839:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
diff --git a/silnlp/common/bulk_extract_local.py b/silnlp/common/bulk_extract_local.py
deleted file mode 100644
index 34270a28..00000000
--- a/silnlp/common/bulk_extract_local.py
+++ /dev/null
@@ -1,143 +0,0 @@
-import argparse
-import logging
-from pathlib import Path
-from typing import List
-import sys
-
-from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef, get_books
-
-from .corpus import count_lines
-from .paratext import check_versification, extract_project, extract_term_renderings
-from machine.corpora import FileParatextProjectSettingsParser
-from ..common.environment import SIL_NLP_ENV
-
-LOGGER = logging.getLogger(__package__ + ".bulk_extract_local")
-SETTINGS_FILENAME = "Settings.xml"
-
-
-def parse_settings(project):
-    settings_file_path = project / SETTINGS_FILENAME
-    if not settings_file_path.is_file():
-        LOGGER.warning(f"Warning: {SETTINGS_FILENAME} not found.")
-        return
-
-    try:
-        parser = FileParatextProjectSettingsParser(str(project))
-        project_settings = parser.parse()
-
-        # project_settings.name
-        # project_settings.full_name
-        # if project_settings.encoding:
-        #     self.setting_encoding = getattr(project_settings.encoding, 'name', str(project_settings.encoding))
-
-        # if project_settings.versification:
-        #     setting_versification = getattr(project_settings.versification, 'name', str(project_settings.versification))
-
-        # project_settings.file_name_prefix
-        # project_settings.file_name_form
-        # project_settings.file_name_suffix
-        # project_settings.biblical_terms_list_type
-        # project_settings.biblical_terms_project_name
-        # project_settings.biblical_terms_file_name
-        # project_settings.language_code
-
-    except Exception as e:
-        print(f"Error parsing {SETTINGS_FILENAME}: {e}")
-        return None
-
-    return project_settings
-
-def get_expected_verse_count(project: Path, include: List[str], exclude: List[str]) -> int:
-    include_books_set = get_books(include) if len(include) > 0 else None
-    exclude_books_set = get_books(exclude) if len(exclude) > 0 else None
-    project_settings = parse_settings(project)
-
-    if project_settings.versification:
-        setting_versification = getattr(project_settings.versification, 'name', str(project_settings.versification))
-        print(f"Found versification {setting_versification} in {SETTINGS_FILENAME} for {project}")
-
-    def filter_lines(verse_ref_str: str) -> bool:
-        if include_books_set is None and exclude_books_set is None:
-            return True
-
-        vref = VerseRef.from_string(verse_ref_str.strip(), setting_versification)
-        if exclude_books_set is not None and vref.book_num in exclude_books_set:
-            return False
-
-        if include_books_set is not None and vref.book_num in include_books_set:
-            return True
-
-        return include_books_set is None
-
-    return count_lines(SIL_NLP_ENV.assets_dir / "vref.txt", filter_lines)
-
-
-def has_settings_file(project_folder: Path) -> bool:
-    return (project_folder / SETTINGS_FILENAME).is_file() or (project_folder / SETTINGS_FILENAME.lower()).is_file()
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(description="Extracts text corpora from Paratext projects")
-    parser.add_argument("input", type=str, help="The input folder.")
-    parser.add_argument("output", type=str, help="The output corpus folder.")
-    parser.add_argument("--terms", type=str, required=True, help="The output terms folder.")
-    parser.add_argument(
-        "--include", metavar="books", nargs="+", default=[], help="The books to include; e.g., 'NT', 'OT', 'GEN'"
-    )
-    parser.add_argument(
-        "--exclude", metavar="books", nargs="+", default=[], help="The books to exclude; e.g., 'NT', 'OT', 'GEN'"
-    )
-    parser.add_argument("--markers", default=False, action="store_true", help="Include USFM markers")
-    parser.add_argument("--lemmas", default=False, action="store_true", help="Extract lemmas if available")
-    parser.add_argument("--project-vrefs", default=False, action="store_true", help="Extract project verse refs")
-
-    args = parser.parse_args()
-
-    input_path = Path(args.input)
-    output_path = Path(args.output)
-    terms_path = Path(args.terms)
-
-    if not input_path.is_dir():
-        print(f"Error: Projects folder not found: {args.input}")
-        sys.exit(1)
-
-    if not output_path.is_dir():
-        print(f"Error: Output folder not found: {args.output}")
-        sys.exit(1)
-
-    if not terms_path.is_dir():
-        print(f"Error: Output terms folder not found: {args.terms}")
-        sys.exit(1)
-
-    # Which folders have a Settings.xml file we can find?
-    projects = [folder for folder in input_path.glob("*") if folder.is_dir() and has_settings_file(folder)]
-
-    # Process the projects that have data and tell the user.
-    if len(projects) > 0:
-        for project in projects:
-            LOGGER.info(f"Extracting {project} to {output_path}")
-            expected_verse_count = get_expected_verse_count(project, args.include, args.exclude)
-
-            check_versification(project)
-            corpus_filename, verse_count = extract_project(
-                project,
-                output_path,
-                args.include,
-                args.exclude,
-                args.markers,
-                args.lemmas,
-                args.project_vrefs,
-            )
-
-            # check if the number of lines in the file is correct (the same as vref.txt)
-            LOGGER.info(f"# of Verses: {verse_count}")
-            if verse_count != expected_verse_count:
-                LOGGER.error(f"The number of verses is {verse_count}, but should be {expected_verse_count}.")
-            terms_count = extract_term_renderings(project, corpus_filename, terms_path)
-            LOGGER.info(f"# of Terms: {terms_count}")
-        LOGGER.info("Done.")
-    else:
-        LOGGER.warning(f"Couldn't find any data to process for any project in {input_path}.")
-
-if __name__ == "__main__":
-    main()
diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py
index 7287f120..f2363755 100644
--- a/silnlp/common/find_by_iso.py
+++ b/silnlp/common/find_by_iso.py
@@ -2,7 +2,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Dict, List, Set, Tuple, Union
+from typing import Dict, List, Set, Tuple
 import regex as re
 import sys
 
@@ -56,8 +56,6 @@ def find_related_isocodes(
     for iso_code in iso_codes:
         if iso_code in language_data:
             lang_info = language_data[iso_code]
-# logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}")
-
             iso_set.update(country_data.get(lang_info["Country"], []))
             iso_set.update(family_data.get(lang_info["Family"], []))
 
@@ -114,7 +112,6 @@ def main():
 
     # Create a custom logger
     logger = logging.getLogger(__name__)
-    #logger.basicConfig()
 
     # Set the global logging level
     logger.setLevel(logging.INFO)
diff --git a/silnlp/common/find_by_iso2.py b/silnlp/common/find_by_iso2.py
deleted file mode 100644
index 7e4c3c2a..00000000
--- a/silnlp/common/find_by_iso2.py
+++ /dev/null
@@ -1,244 +0,0 @@
-import argparse
-import json
-import logging
-from pathlib import Path
-from typing import Dict, List, Set, Tuple, Union
-import sys
-import yaml
-
-from .environment import SIL_NLP_ENV
-from .iso_info import NLLB_ISO_SET, ALT_ISO
-
-IsoCode = str
-IsoCodeList = List[IsoCode]
-IsoCodeSet = Set[IsoCode]
-
-LANGUAGE_FAMILY_FILE = SIL_NLP_ENV.assets_dir / "languageFamilies.json"
-
-def is_file_pattern(input_str: str) -> bool:
-    """Check if the input string contains a hyphen, indicating it's a filename pattern."""
-    return '-' in input_str
-
-def split_input_list(input_list: List[str]) -> Tuple[List[str], List[str]]:
-    """Split input list into ISO codes and file patterns."""
-    iso_codes = []
-    files = []
-    for item in input_list:
-        if is_file_pattern(item):
-            files.append(item)
-        else:
-            iso_codes.append(item)
-    return iso_codes, files
-
-def get_stem_name(file_path: Path) -> str:
-    """Get the stem name without path or extension."""
-    return file_path.stem
-
-
-def load_language_data(file_path: Path) -> Tuple[Dict, Dict, Dict]:
-    try:
-        with open(file_path, "r", encoding="utf-8") as file:
-            raw_data = json.load(file)
-    except FileNotFoundError:
-        logging.error(f"File not found: {file_path}")
-        return {}, {}, {}
-    except json.JSONDecodeError:
-        logging.error(f"Error decoding JSON from file: {file_path}")
-        return {}, {}, {}
-
-    language_data = {}
-    country_data = {}
-    family_data = {}
-
-    for lang in raw_data:
-        iso = lang["isoCode"]
-        country = lang["langCountry"]
-        family = lang["languageFamily"]
-
-        language_data[iso] = {
-            "Name": lang["language"],
-            "Country": country,
-            "Family": family,
-        }
-
-        country_data.setdefault(country, []).append(iso)
-        family_data.setdefault(family, []).append(iso)
-
-    return language_data, country_data, family_data
-
-
-def find_related_isocodes(
-    iso_codes: IsoCodeList, language_data: Dict, country_data: Dict, family_data: Dict
-) -> IsoCodeList:
-    iso_set = set(iso_codes)
-
-    for iso_code in iso_codes:
-        if iso_code in language_data:
-            lang_info = language_data[iso_code]
-# logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}")
-
-            iso_set.update(country_data.get(lang_info["Country"], []))
-            iso_set.update(family_data.get(lang_info["Family"], []))
-
-    return sorted(iso_set)
-
-
-def get_files_by_iso(isocodes: IsoCodeList, scripture_dir: Path) -> List[Path]:
-    return [
-        file for file in scripture_dir.glob('*.txt')
-        if any(file.stem.startswith(isocode + '-') for isocode in isocodes)
-    ]
-
-def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict[Path, Path], Dict[Path, Path]]:
-    existing_projects = {}
-    missing_projects = {}
-
-    for file in files:
-        project = projects_dir / file.stem.split("-")[1]
-        if project.is_dir():
-            existing_projects[file] = project
-        else:
-            missing_projects[file] = project
-
-    return existing_projects, missing_projects
-
-
-def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
-    return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code}
-
-def resolve_config_path(config_folder: Path) -> Path:
-    """Resolve config folder path relative to experiments directory if not absolute."""
-    if not config_folder.is_absolute():
-        return SIL_NLP_ENV.mt_experiments_dir / config_folder
-    return config_folder
-
-def create_alignment_config(source_files: List[Path], target_files: List[str]) -> dict:
-    """Create the alignment configuration dictionary."""
-    config = {
-        'data': {
-            'aligner': 'fast_align',
-            'corpus_pairs': [{
-                'type': 'train',
-                'src': [get_stem_name(f) for f in source_files],
-                'trg': target_files,
-                'mapping': 'many_to_many',
-                'test_size': 0,
-                'val_size': 0
-            }],
-            'tokenize': False
-        }
-    }
-    return config
-
-def write_or_print_config(config: dict, config_folder: Path = None):
-    """Write config to file or print to terminal."""
-    if config_folder:
-        config_folder = Path(config_folder)
-        if not config_folder.is_absolute():
-            config_folder = SIL_NLP_ENV.mt_experiments_dir / config_folder
-        config_folder.mkdir(parents=True, exist_ok=True)
-        config_path = config_folder / 'config.yml'
-        with open(config_path, 'w') as f:
-            yaml.dump(config, f, default_flow_style=False, sort_keys=False)
-        return str(config_path)
-    else:
-        return yaml.dump(config, default_flow_style=False, sort_keys=False)
-
-def main():
-    parser = argparse.ArgumentParser(description="Find related ISO language codes and create alignment config.")
-    parser.add_argument("inputs", nargs="+",
-                        help="ISO codes or file patterns (e.g., 'fra' or 'en-NIV')")
-    parser.add_argument("--scripture-dir", type=Path,
-                        default=Path(SIL_NLP_ENV.mt_scripture_dir),
-                        help="Directory containing scripture files")
-    parser.add_argument("--all-related", action='store_true',
-                        help="List all related scriptures without filtering to those that are part of NLLB")
-    parser.add_argument("--no-related", action='store_true',
-                        help="Only list scriptures in the specified languages and not in related languages")
-    parser.add_argument("--output", type=Path, help="Output to the specified file.")
-    parser.add_argument("--target-files", nargs="+",
-                        help="List of target files in format <iso>-<project>")
-    parser.add_argument("--config-folder", type=Path,
-                        help="Folder to write the config.yml file (absolute or relative to mt_experiments_dir)")
-
-    args = parser.parse_args()
-
-    # Setup logging
-    logger = logging.getLogger(__name__)
-    logger.setLevel(logging.INFO)
-    formatter = logging.Formatter('%(message)s')
-
-    if args.output:
-        file_handler = logging.FileHandler(args.output)
-        file_handler.setFormatter(formatter)
-        logger.addHandler(file_handler)
-    else:
-        console_handler = logging.StreamHandler(sys.stdout)
-        console_handler.setFormatter(formatter)
-        logger.addHandler(console_handler)
-
-    # Split inputs into ISO codes and file patterns
-    iso_codes, file_patterns = split_input_list(args.inputs)
-
-    source_files = []
-    if iso_codes:
-        # Load language data and process ISO codes
-        language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE)
-        if not language_data:
-            logging.error("Failed to load language data.")
-            return
-
-        iso_codes = get_equivalent_isocodes(iso_codes)
-
-        if args.no_related:
-            codes_to_find = list(iso_codes)
-            logger.info(f"\nConsidering only the specified iso codes and their equivalents: {codes_to_find}")
-        else:
-            codes_to_find = find_related_isocodes(list(iso_codes), language_data, country_data, family_data)
-            logger.info(f"\nFound {len(codes_to_find)} related languages:\n{codes_to_find}.")
-
-        if not args.all_related:
-            codes_to_find = [iso for iso in codes_to_find if iso in NLLB_ISO_SET]
-            logger.info(f"\nFound {len(codes_to_find)} specified or related languages in NLLB:\n{codes_to_find}")
-        else:
-            logger.info(f"\nFound {len(codes_to_find)} specified or related languages:\n{codes_to_find}")
-
-        # Get all possible codes and find matching files
-        all_possible_codes = get_equivalent_isocodes(codes_to_find)
-        source_files.extend(get_files_by_iso(all_possible_codes, args.scripture_dir))
-
-    # Add files from file patterns
-    if file_patterns:
-        pattern_files = [args.scripture_dir / f"{pattern}.txt" for pattern in file_patterns]
-        existing_files = [f for f in pattern_files if f.exists()]
-        source_files.extend(existing_files)
-        if len(existing_files) < len(pattern_files):
-            missing = set(file_patterns) - set(get_stem_name(f) for f in existing_files)
-            logger.warning(f"Could not find these files: {missing}")
-
-    if not source_files:
-        logger.error("\nCouldn't find any Scripture files.")
-        return
-
-    # Use target files from command line or file patterns from inputs
-    target_files = args.target_files if args.target_files else file_patterns
-
-    # Create and output configuration
-    config = create_alignment_config(source_files, target_files)
-    result = write_or_print_config(config, args.config_folder)
-
-    if args.config_folder:
-        logger.info(f"\nCreated alignment configuration in: {result}")
-    else:
-        logger.info("\nAlignment configuration:")
-        logger.info(result)
-
-    logger.info(f"\nSource files found: {len(source_files)}")
-    for file in source_files:
-        logger.info(f"  - {get_stem_name(file)}")
-    logger.info(f"\nTarget files: {len(target_files)}")
-    for file in target_files:
-        logger.info(f"  - {file}")
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/silnlp/common/usfm_utils.py b/silnlp/common/usfm_utils.py
deleted file mode 100644
index a29d58aa..00000000
--- a/silnlp/common/usfm_utils.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from pathlib import Path
-
-from machine.corpora import FileParatextProjectSettingsParser, UsfmFileText, UsfmTokenizer, UsfmTokenType
-
-# Marker "type" is as defined by the UsfmTokenType given to tokens by the UsfmTokenizer,
-# which mostly aligns with a marker's StyleType in the USFM stylesheet
-CHARACTER_TYPE_EMBEDS = ["fig", "fm", "jmp", "rq", "va", "vp", "xt", "xtSee", "xtSeeAlso"]
-PARAGRAPH_TYPE_EMBEDS = ["lit", "r", "rem"]
-NON_NOTE_TYPE_EMBEDS = CHARACTER_TYPE_EMBEDS + PARAGRAPH_TYPE_EMBEDS
-
-
-def main() -> None:
-    """
-    Print out all paragraph and character markers for a book
-    To use set book, fpath, and marker_file. fpath should be a path to a book in a Paratext project
-    """
-
-    book = "PRO"
-    fpath = Path(r"M:/Paratext/projects/NIV11/20PRONIV11.SFM")
-    marker_file = Path(r"E:/Work/Draft Quality Improvements/NIV_PRO_paragraphs.txt")
-    sentences_file = Path(r"E:/Work/Draft Quality Improvements/NIV_PRO_sentences.txt")
-
-    settings = FileParatextProjectSettingsParser(fpath.parent).parse()
-    file_text = UsfmFileText(
-        settings.stylesheet,
-        settings.encoding,
-        book,
-        fpath,
-        settings.versification,
-        include_markers=True,
-        include_all_text=True,
-        project=settings.name,
-    )
-
-    vrefs = []
-    usfm_markers = []
-    usfm_tokenizer = UsfmTokenizer(settings.stylesheet)
-    with sentences_file.open("w", encoding=settings.encoding) as f:
-        for sent in file_text:
-            f.write(f"{sent}\n")
-            if len(sent.ref.path) > 0 and sent.ref.path[-1].name in PARAGRAPH_TYPE_EMBEDS:
-                continue
-
-            vrefs.append(sent.ref)
-            usfm_markers.append([])
-            usfm_toks = usfm_tokenizer.tokenize(sent.text.strip())
-
-            ignore_scope = None
-            for tok in usfm_toks:
-                if ignore_scope is not None:
-                    if tok.type == UsfmTokenType.END and tok.marker[:-1] == ignore_scope.marker:
-                        ignore_scope = None
-                elif tok.type == UsfmTokenType.NOTE or (
-                    tok.type == UsfmTokenType.CHARACTER and tok.marker in CHARACTER_TYPE_EMBEDS
-                ):
-                    ignore_scope = tok
-                elif tok.type in [UsfmTokenType.PARAGRAPH, UsfmTokenType.CHARACTER, UsfmTokenType.END]:
-                    usfm_markers[-1].append(tok.marker)
-
-    with marker_file.open("w", encoding=settings.encoding) as f:
-        for ref, markers in zip(vrefs, usfm_markers):
-            f.write(f"{ref} {markers}\n")
-
-
-if __name__ == "__main__":
-    main()

From d32e059ca83f647780e9cb0170f0adfea7d3fd8f Mon Sep 17 00:00:00 2001
From: David Baines
Date: Wed, 19 Nov 2025 12:06:41 +0000
Subject: [PATCH 4/4] Keep files created today

---
 silnlp/common/find_by_iso.py | 103 ++++++++++++++++++++++++-----------
 1 file changed, 72 insertions(+), 31 deletions(-)

diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py
index f2363755..a91177ef 100644
--- a/silnlp/common/find_by_iso.py
+++ b/silnlp/common/find_by_iso.py
@@ -1,14 +1,15 @@
 import argparse
 import json
 import logging
+import sys
+from datetime import datetime
 from pathlib import Path
 from typing import Dict, List, Set, Tuple
-import regex as re
-import sys
+
+import regex as re
 
 from .environment import SIL_NLP_ENV
-from .iso_info import NLLB_ISO_SET, ALT_ISO
+from .iso_info import ALT_ISO, NLLB_ISO_SET
 
 IsoCode = str
 IsoCodeList = List[IsoCode]
@@ -16,6 +17,7 @@
 
 LANGUAGE_FAMILY_FILE = SIL_NLP_ENV.assets_dir / "languageFamilies.json"
 
+
 def load_language_data(file_path: Path) -> Tuple[Dict, Dict, Dict]:
     try:
         with open(file_path, "r", encoding="utf-8") as file:
@@ -64,10 +66,10 @@ def find_related_isocodes(
 
 def get_files_by_iso(isocodes: IsoCodeList, scripture_dir: Path) -> List[Path]:
     return [
-        file for file in scripture_dir.glob('*.txt')
-        if any(file.stem.startswith(isocode + '-') for isocode in isocodes)
+        file for file in scripture_dir.glob("*.txt") if any(file.stem.startswith(isocode + "-") for isocode in isocodes)
     ]
 
+
 def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict[Path, Path], Dict[Path, Path]]:
     existing_projects = {}
     missing_projects = {}
@@ -85,27 +87,53 @@ def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict
 def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
     return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code}
 
-def filter_files(files: List[Path], excluded_patterns:List[str]) -> List[Path]:
+
+def filter_files(files: List[Path], excluded_patterns: List[str]) -> List[Path]:
     filtered = []
-    date_pattern = re.compile(r'_\d{4}_\d{1,2}_\d{1,2}|_\d{1,2}_\d{1,2}_\d{4}')
+
+    today = datetime.now()
+    today_pattern = re.compile(f"{today.strftime('_%Y_%m_%d')}|{today.strftime('_%d_%m_%Y')}")
+    date_pattern = re.compile(r"_\d{4}_\d{1,2}_\d{1,2}|_\d{1,2}_\d{1,2}_\d{4}")
 
     for file in files:
-        parts = file.stem.split('-', 1)
-        if len(parts) != 2: continue
+        parts = file.stem.split("-", 1)
+        if len(parts) != 2:
+            continue
         iso, name = parts
-        if date_pattern.search(name): continue
-        if len(iso) not in (2, 3): continue
-        if any(pattern.lower() in name.lower() for pattern in excluded_patterns): continue
-        if file.is_file() and file.stat().st_size < 100_000: continue
+        if today_pattern.search(name):
+            filtered.append(file)
+            continue
+        if date_pattern.search(name):
+            continue
+        if len(iso) not in (2, 3):
+            continue
+        if any(pattern.lower() in name.lower() for pattern in excluded_patterns):
+            continue
+        if file.is_file() and file.stat().st_size < 100_000:
+            continue
         filtered.append(file)
     return filtered
 
+
 def main():
     parser = argparse.ArgumentParser(description="Find related ISO language codes.")
     parser.add_argument("iso_codes", nargs="+", help="ISO codes to find related languages for")
-    parser.add_argument("--scripture-dir", type=Path, default=Path(SIL_NLP_ENV.mt_scripture_dir), help="Directory containing scripture files")
-    parser.add_argument("--all-related", action='store_true', help="List all related scriptures without filtering to those that are part of NLLB")
-    parser.add_argument("--no-related", action='store_true', help="Only list scriptures in the specified languages and not in related languages")
+    parser.add_argument(
+        "--scripture-dir",
+        type=Path,
+        default=Path(SIL_NLP_ENV.mt_scripture_dir),
+        help="Directory containing scripture files",
+    )
+    parser.add_argument(
+        "--all-related",
+        action="store_true",
+        help="List all related scriptures without filtering to those that are part of NLLB",
+    )
+    parser.add_argument(
+        "--no-related",
+        action="store_true",
+        help="Only list scriptures in the specified languages and not in related languages",
+    )
     parser.add_argument("--output", type=Path, help="Output to the specified file.")
 
     args = parser.parse_args()
@@ -114,9 +142,9 @@ def main():
     logger = logging.getLogger(__name__)
 
     # Set the global logging level
-    logger.setLevel(logging.INFO) 
-    
-    formatter = logging.Formatter('%(message)s')
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter("%(message)s")
 
     if args.output:
         # Create handler for the file output.
@@ -129,7 +157,6 @@ def main():
         console_handler.setFormatter(formatter)
         logger.addHandler(console_handler)
 
-
     language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE)
     projects_dir = SIL_NLP_ENV.pt_projects_dir
     scripture_dir = Path(args.scripture_dir)
@@ -137,22 +164,22 @@ def main():
     if not language_data:
         logging.error("Failed to load language data.")
        return
-    
+
     # Get equivalent ISO codes for input
     iso_codes = get_equivalent_isocodes(args.iso_codes)
-    
+
     if args.no_related:
-        
+
         # Option 2: No files in related languages, only equivalent ISO codes
         codes_to_find = list(iso_codes)
         logger.info(f"\nConsidering only the specified iso codes and their equivalents. {codes_to_find}")
-    
+
     else:
         # Find related ISO codes
         codes_to_find = find_related_isocodes(list(iso_codes), language_data, country_data, family_data)
         logger.info(f"\nFound {len(codes_to_find)} related languages:\n{codes_to_find}.")
 
-    if not args.all_related:    
+    if not args.all_related:
         # Option 3 (default): Filter to NLLB languages
         codes_to_find = [iso for iso in codes_to_find if iso in NLLB_ISO_SET]
         logger.info(f"\nFound {len(codes_to_find)} specified or related languages in NLLB:\n{codes_to_find}")
@@ -162,15 +189,28 @@ def main():
 
     # Get all possible 2 and 3 letter codes for the related languages
     all_possible_codes = get_equivalent_isocodes(codes_to_find)
-    
+
     # Find files matching the codes
     files = get_files_by_iso(all_possible_codes, scripture_dir)
-    
+
     # Filter out AI and XRI files, and others.
-    excluded_patterns = ['XRI', '600M', '3.3B', '1.3B', 'words', 'name', 'clean', 'transcription','matthew', 'mark', 'mrk','luk']
+    excluded_patterns = [
+        "XRI",
+        "600M",
+        "3.3B",
+        "1.3B",
+        "words",
+        "name",
+        "clean",
+        "transcription",
+        "matthew",
+        "mark",
+        "mrk",
+        "luk",
+    ]
     filtered_files = filter_files(files, excluded_patterns)
     print(f"There are {len(files)} files and {len(files)-len(filtered_files)} were filtered out.")
-    
+
     existing_projects, missing_projects = split_files_by_projects(filtered_files, projects_dir)
 
     # Display results
@@ -183,12 +223,13 @@ def main():
         logger.info(f"\nThese {len(missing_projects)} files don't have a corresponding project folder:")
         for file, _ in missing_projects.items():
             logger.info(f"{file.stem}")
-    logger.info(f"\nFiltered files:")
+    logger.info("\nFiltered files:")
     for file in filtered_files:
         logger.info(f"  - {file.stem}")
 
     if not files:
         logger.info("\nCouldn't find any Scripture files in these languages.")
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()