diff --git a/silnlp/alignment/segment_verses.py b/silnlp/alignment/segment_verses.py index 53271ae9..291875c6 100644 --- a/silnlp/alignment/segment_verses.py +++ b/silnlp/alignment/segment_verses.py @@ -10,7 +10,7 @@ import regex from machine.corpora import AlignedWordPair, FileParatextProjectSettingsParser, ScriptureRef, UsfmFileText -from machine.scripture import VerseRef +from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef from machine.tokenization import LatinWordTokenizer from pyparsing import Iterable @@ -181,6 +181,12 @@ def _create_target_verses_from_offsets( ) -> SegmentedPassage: target_verses = [] + # Special case where passage is a single verse + if len(target_verse_offsets) == 0: + verse_ref = references[0] + target_verses.append(Verse(verse_ref, target_text)) + return self._adjust_verse_boundaries(target_verses) + current_verse_starting_char_index = 0 current_verse_ending_char_index = 0 current_verse_offset_index = 0 @@ -284,7 +290,7 @@ def predict_target_verse_token_offsets( if len(best_split_indices) > 1: best_split_index = best_split_indices[0] for split_index in best_split_indices: - if not contains_letter(target_tokens[split_index-1]): + if not contains_letter(target_tokens[split_index - 1]): best_split_index = split_index break target_verse_offsets.append(best_split_index) @@ -696,6 +702,7 @@ def main() -> None: parser.add_argument( "--use-saved-alignments", help="Use pre-computed alignments from a previous run", default=None, action="store_true" ) + parser.add_argument("--vref", help="Output vref file for target verses", default=None, action="store_true") args = parser.parse_args() parallel_passages = ParallelPassageCollectionFactory(args.save_alignments, args.use_saved_alignments).create( @@ -713,6 +720,24 @@ def main() -> None: src_passage.write_to_file(src_output) trg_passage.write_to_file(trg_output) + if args.vref is not None: + vref_path = Path(args.target_passages).with_suffix(".vref.txt") + template_vref_path = SIL_NLP_ENV.assets_dir / "vref.txt" + + verse_map: Dict[str, str] = { + str(verse.reference.to_versification(ORIGINAL_VERSIFICATION)): verse.text + for trg_passage in trg_segmented_passages + for verse in trg_passage.verses + } + + with ( + open(template_vref_path, "r", encoding="utf-8") as template_file, + open(vref_path, "w", encoding="utf-8") as vref_output, + ): + for line in template_file: + template_ref = line.rstrip("\n") + vref_output.write(f"{verse_map.get(template_ref, '')}\n") + if args.compare_against is not None: reference_segmentations = ReferenceVerseSegmentationReader().read_passages( args.compare_against, Path(args.target_passages)