From 2659e19dce24554c142336752f60a87520b42748 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Fri, 12 Jul 2024 14:57:04 -0400
Subject: [PATCH] Usfm nonverse multichapter (#224)

* Added test case to fail

* Don't preserve _curTextType across verses.

* Properly close non-verse segment at the beginning of a verse para
- Add extra info when exception occurs while tokenizing in UsfmTextBase

---------

Co-authored-by: Damien Daspit
---
 .../ScriptureRefUsfmParserHandlerBase.cs      |  5 ++++-
 src/SIL.Machine/Corpora/UsfmTextBase.cs       | 19 ++++++++++++++++-
 .../Corpora/UsfmMemoryTextTests.cs            | 21 +++++++++++++++++++
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
index da66b34f..7d9e3391 100644
--- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
+++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
@@ -68,7 +68,10 @@ string pubNumber
             }
             else
             {
-                EndVerseText(state);
+                if (CurrentTextType == ScriptureTextType.NonVerse)
+                    EndNonVerseText(state);
+                else
+                    EndVerseText(state);
                 UpdateVerseRef(state.VerseRef, marker);
                 StartVerseText(state);
             }
diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs
index 030580b7..b148f391 100644
--- a/src/SIL.Machine/Corpora/UsfmTextBase.cs
+++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs
@@ -36,8 +36,25 @@ protected override IEnumerable GetVersesInDocOrder()
         {
             string usfm = ReadUsfm();
             var rowCollector = new TextRowCollector(this);
+
+            var tokenizer = new UsfmTokenizer(_stylesheet);
+            IReadOnlyList tokens;
+            try
+            {
+                tokens = tokenizer.Tokenize(usfm, _includeMarkers);
+            }
+            catch (Exception ex)
+            {
+                var sb = new StringBuilder();
+                sb.Append($"An error occurred while tokenizing the text '{Id}'");
+                if (!string.IsNullOrEmpty(Project))
+                    sb.Append($" in project '{Project}'");
+                sb.Append($". Error: '{ex.Message}'");
+                throw new InvalidOperationException(sb.ToString(), ex);
+            }
+
             var parser = new UsfmParser(
-                usfm,
+                tokens,
                 rowCollector,
                 _stylesheet,
                 Versification,
diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs
index e1b25c76..b046be22 100644
--- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs
+++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs
@@ -70,6 +70,27 @@ public void GetRows_DuplicateVerseWithTable()
         Assert.That(rows, Has.Length.EqualTo(5));
     }
 
+    [Test]
+    public void GetRows_VersePara_BeginningNonVerseSegment()
+    {
+        // a verse paragraph that begins with a non-verse segment followed by a verse segment
+        TextRow[] rows = GetRows(
+            @"\id MAT - Test
+\c 1
+\q1
+\f \fr 119 \ft World \f*
+\v 1 First verse in line!?!
+\c 2
+\d
+description
+\b
+",
+            includeAllText: true
+        );
+
+        Assert.That(rows, Has.Length.EqualTo(4));
+    }
+
     private static TextRow[] GetRows(string usfm, bool includeMarkers = false, bool includeAllText = false)
     {
         UsfmMemoryText text =