From a598575bbe9e438593384ae02040e63beb4a6b84 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 6 Nov 2024 14:34:00 +0900 Subject: [PATCH] calculate offset based on each morpheme's one --- .../lucene/sudachi/ja/SudachiSplitFilter.java | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java index d871cdb..8f58004 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java @@ -130,13 +130,11 @@ private void setAUnitAttribute() { posIncAtt.setPositionIncrement(1); } - int startOffset = subunits.offset(); - Morpheme morpheme = subunits.next(); - int endOffset = subunits.offset(); - termAtt.setEmpty().append(morpheme.surface()); - morphemeAtt.setMorpheme(morpheme); - morphemeAtt.setOffsets(offsetMap.subList(startOffset, endOffset + 1)); - offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset)); + MorphemeSubunits.Subunit su = subunits.next(); + termAtt.setEmpty().append(su.morpheme.surface()); + morphemeAtt.setMorpheme(su.morpheme); + morphemeAtt.setOffsets(offsetMap.subList(su.begin, su.end + 1)); + offsetAtt.setOffset(correctOffset(su.begin), correctOffset(su.end)); } private void setOOVAttribute() { @@ -194,29 +192,40 @@ public int offset() { } static class MorphemeSubunits { + static class Subunit { + final Morpheme morpheme; + final int begin; + final int end; + + public Subunit(Morpheme morpheme, int begin, int end) { + this.morpheme = morpheme; + this.begin = begin; + this.end = end; + } + } + private List morphemes; private int size; private int index; - private int offset; + private int baseOffset; public void setUnits(List morphemes) { this.morphemes = morphemes; size = morphemes.size(); index = 0; - offset = 0; + baseOffset = morphemes.get(0).begin(); } public boolean hasNext() { return index < size; } - public Morpheme next() { + public Subunit next() { if (!hasNext()) { throw new IllegalStateException(); } Morpheme m = morphemes.get(index++); - offset += m.end() - m.begin(); - return m; + return new Subunit(m, m.begin() - baseOffset, m.end() - baseOffset); } public int size() { @@ -226,9 +235,5 @@ public int size() { public int index() { return index; } - - public int offset() { - return offset; - } } } \ No newline at end of file