Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions silnlp/alignment/segment_verses.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import regex
from machine.corpora import AlignedWordPair, FileParatextProjectSettingsParser, ScriptureRef, UsfmFileText
from machine.scripture import VerseRef
from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef
from machine.tokenization import LatinWordTokenizer
from pyparsing import Iterable

Expand Down Expand Up @@ -181,6 +181,12 @@ def _create_target_verses_from_offsets(
) -> SegmentedPassage:
target_verses = []

# Special case: no split offsets means the passage is a single verse, so the whole target text goes to the first reference
if len(target_verse_offsets) == 0:
verse_ref = references[0]
target_verses.append(Verse(verse_ref, target_text))
return self._adjust_verse_boundaries(target_verses)

current_verse_starting_char_index = 0
current_verse_ending_char_index = 0
current_verse_offset_index = 0
Expand Down Expand Up @@ -284,7 +290,7 @@ def predict_target_verse_token_offsets(
if len(best_split_indices) > 1:
best_split_index = best_split_indices[0]
for split_index in best_split_indices:
if not contains_letter(target_tokens[split_index-1]):
if not contains_letter(target_tokens[split_index - 1]):
best_split_index = split_index
break
target_verse_offsets.append(best_split_index)
Expand Down Expand Up @@ -696,6 +702,7 @@ def main() -> None:
parser.add_argument(
"--use-saved-alignments", help="Use pre-computed alignments from a previous run", default=None, action="store_true"
)
parser.add_argument("--vref", help="Output vref file for target verses", default=None, action="store_true")
args = parser.parse_args()

parallel_passages = ParallelPassageCollectionFactory(args.save_alignments, args.use_saved_alignments).create(
Expand All @@ -713,6 +720,24 @@ def main() -> None:
src_passage.write_to_file(src_output)
trg_passage.write_to_file(trg_output)

if args.vref is not None:
vref_path = Path(args.target_passages).with_suffix(".vref.txt")
template_vref_path = SIL_NLP_ENV.assets_dir / "vref.txt"

verse_map: Dict[str, str] = {
str(verse.reference.to_versification(ORIGINAL_VERSIFICATION)): verse.text
for trg_passage in trg_segmented_passages
for verse in trg_passage.verses
}

with (
open(template_vref_path, "r", encoding="utf-8") as template_file,
open(vref_path, "w", encoding="utf-8") as vref_output,
):
for line in template_file:
template_ref = line.rstrip("\n")
vref_output.write(f"{verse_map.get(template_ref, '')}\n")

if args.compare_against is not None:
reference_segmentations = ReferenceVerseSegmentationReader().read_passages(
args.compare_against, Path(args.target_passages)
Expand Down