From be5f4f396f09d7fcddc9eac7c92d79e839a23d06 Mon Sep 17 00:00:00 2001 From: Laura Burdick Date: Tue, 2 Dec 2025 20:16:44 +0000 Subject: [PATCH 1/4] Fixing the case where there's a passage that's a single verse (previously, it crashed in this case) --- silnlp/alignment/segment_verses.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/silnlp/alignment/segment_verses.py b/silnlp/alignment/segment_verses.py index 53271ae9..b8f6f4bd 100644 --- a/silnlp/alignment/segment_verses.py +++ b/silnlp/alignment/segment_verses.py @@ -181,6 +181,12 @@ def _create_target_verses_from_offsets( ) -> SegmentedPassage: target_verses = [] + # Special case where passage is a single verse + if len(target_verse_offsets) == 0: + verse_ref = references[0] + target_verses.append(Verse(verse_ref, target_text)) + return self._adjust_verse_boundaries(target_verses) + current_verse_starting_char_index = 0 current_verse_ending_char_index = 0 current_verse_offset_index = 0 From e95592bf0f9c4f8d84a8251def519656ef65d26a Mon Sep 17 00:00:00 2001 From: Laura Burdick Date: Sat, 6 Dec 2025 17:41:32 +0000 Subject: [PATCH 2/4] Includes an option to output the verses in vref format (so that they can be fed directly into NLLB) --- silnlp/alignment/segment_verses.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/silnlp/alignment/segment_verses.py b/silnlp/alignment/segment_verses.py index b8f6f4bd..33eeb472 100644 --- a/silnlp/alignment/segment_verses.py +++ b/silnlp/alignment/segment_verses.py @@ -290,7 +290,7 @@ def predict_target_verse_token_offsets( if len(best_split_indices) > 1: best_split_index = best_split_indices[0] for split_index in best_split_indices: - if not contains_letter(target_tokens[split_index-1]): + if not contains_letter(target_tokens[split_index - 1]): best_split_index = split_index break target_verse_offsets.append(best_split_index) @@ -702,6 +702,7 @@ def main() -> None: parser.add_argument( "--use-saved-alignments", help="Use pre-computed alignments from a previous run", default=None, action="store_true" ) + parser.add_argument("--vref", help="Output vref file for target verses", default=None, action="store_true") args = parser.parse_args() parallel_passages = ParallelPassageCollectionFactory(args.save_alignments, args.use_saved_alignments).create( @@ -719,6 +720,22 @@ def main() -> None: src_passage.write_to_file(src_output) trg_passage.write_to_file(trg_output) + if args.vref is not None: + vref_path = Path(args.target_passages).with_suffix(".vref.txt") + template_vref_path = SIL_NLP_ENV.assets_dir / "vref.txt" + + verse_map: Dict[str, str] = { + str(verse.reference): verse.text for trg_passage in trg_segmented_passages for verse in trg_passage.verses + } + + with ( + open(template_vref_path, "r", encoding="utf-8") as template_file, + open(vref_path, "w", encoding="utf-8") as vref_output, + ): + for line in template_file: + template_ref = line.rstrip("\n") + vref_output.write(f"{verse_map.get(template_ref, '')}\n") + if args.compare_against is not None: reference_segmentations = ReferenceVerseSegmentationReader().read_passages( args.compare_against, Path(args.target_passages) From 43af2c7bf1c0cbf805496239ced5b04d8cfb2117 Mon Sep 17 00:00:00 2001 From: Laura Burdick Date: Mon, 8 Dec 2025 19:11:39 +0000 Subject: [PATCH 3/4] Changing to original versification before mapping verses to vref format --- silnlp/alignment/segment_verses.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/silnlp/alignment/segment_verses.py b/silnlp/alignment/segment_verses.py index 33eeb472..4155134d 100644 --- a/silnlp/alignment/segment_verses.py +++ b/silnlp/alignment/segment_verses.py @@ -10,7 +10,7 @@ import regex from machine.corpora import AlignedWordPair, FileParatextProjectSettingsParser, ScriptureRef, UsfmFileText -from machine.scripture import VerseRef +from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef from machine.tokenization import LatinWordTokenizer from pyparsing import Iterable @@ -724,8 +724,14 @@ def main() -> None: vref_path = Path(args.target_passages).with_suffix(".vref.txt") template_vref_path = SIL_NLP_ENV.assets_dir / "vref.txt" + for trg_passage in trg_segmented_passages: + for verse in trg_passage.verses: + verse.reference.change_versification(ORIGINAL_VERSIFICATION) + verse_map: Dict[str, str] = { - str(verse.reference): verse.text for trg_passage in trg_segmented_passages for verse in trg_passage.verses + str(verse.reference): verse.text + for trg_passage in trg_segmented_passages + for verse in trg_passage.verses } with ( From 7e64b096dacf6c3b02bfce52fe19335bb3b5c073 Mon Sep 17 00:00:00 2001 From: Laura Burdick Date: Tue, 9 Dec 2025 00:44:48 +0000 Subject: [PATCH 4/4] Using to_versification rather than change_versification --- silnlp/alignment/segment_verses.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/silnlp/alignment/segment_verses.py b/silnlp/alignment/segment_verses.py index 4155134d..291875c6 100644 --- a/silnlp/alignment/segment_verses.py +++ b/silnlp/alignment/segment_verses.py @@ -724,12 +724,8 @@ def main() -> None: vref_path = Path(args.target_passages).with_suffix(".vref.txt") template_vref_path = SIL_NLP_ENV.assets_dir / "vref.txt" - for trg_passage in trg_segmented_passages: - for verse in trg_passage.verses: - verse.reference.change_versification(ORIGINAL_VERSIFICATION) - verse_map: Dict[str, str] = { - str(verse.reference): verse.text + str(verse.reference.to_versification(ORIGINAL_VERSIFICATION)): verse.text for trg_passage in trg_segmented_passages for verse in trg_passage.verses }