From d71aacd512c66e80a1067af50fbc43df378df470 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Wed, 30 Oct 2024 17:31:35 +0900
Subject: [PATCH 01/13] flatten if-block

---
 .../lucene/sudachi/ja/SudachiSplitFilter.java | 53 +++++++++++--------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
index 361fd66..6e52ebc 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
@@ -105,6 +105,7 @@ public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode spli
 
     @Override
     public final boolean incrementToken() throws IOException {
+        // continue to write current split
         if (oovChars.hasNext()) {
             clearAttributes();
             setOOVAttribute();
@@ -116,30 +117,40 @@ public final boolean incrementToken() throws IOException {
             return true;
         }
 
-        if (input.incrementToken()) {
-            int length = 0;
-            Morpheme m = morphemeAtt.getMorpheme();
-            if (m == null) {
-                return true;
-            }
-            termAtt.setEmpty().append(m.surface());
-            if (mode == Mode.EXTENDED && m.isOOV() && (length = Strings.codepointCount(termAtt)) > 1) {
-                oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length());
-                posLengthAtt.setPositionLength(length);
-            } else if (splitMode != Tokenizer.SplitMode.C) {
-                List<Morpheme> subUnits = m.split(splitMode);
-                if (subUnits.size() > 1) {
-                    aUnitIterator = subUnits.listIterator();
-                    aUnitOffset = offsetAtt.startOffset();
-                    posLengthAtt.setPositionLength(subUnits.size());
-                } else {
-                    posLengthAtt.setPositionLength(1);
-                }
-            }
+        // move to next morpheme
+        if (!input.incrementToken()) {
+            return false;
+        }
+
+        Morpheme m = morphemeAtt.getMorpheme();
+        if (m == null) {
+            return true;
+        }
+
+        // split oov into characters
+        int length = 0;
+        termAtt.setEmpty().append(m.surface());
+        if (mode == Mode.EXTENDED && m.isOOV() && (length = Strings.codepointCount(termAtt)) > 1) {
+            oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length());
+            posLengthAtt.setPositionLength(length);
             return true;
+        }
+
+        if (splitMode == Tokenizer.SplitMode.C) {
+            return true;
+        }
+
+        // split into A/B unit
+        List<Morpheme> subUnits = m.split(splitMode);
+        if (subUnits.size() > 1) {
+            aUnitIterator = subUnits.listIterator();
+            aUnitOffset = offsetAtt.startOffset();
+            posLengthAtt.setPositionLength(subUnits.size());
         } else {
-            return false;
+            posLengthAtt.setPositionLength(1);
         }
+
+        return true;
     }
 
     private void setAUnitAttribute(Morpheme morpheme) {

From eafe3f25b8dca7a6509989ba4209a8bf9c97b06e Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Wed, 30 Oct 2024 18:06:25 +0900
Subject: [PATCH 02/13] refactor oov branch and rm redundant att setting

---
 .../lucene/sudachi/ja/SudachiSplitFilter.java | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
index 6e52ebc..d093113 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
@@ -127,12 +127,14 @@ public final boolean incrementToken() throws IOException {
             return true;
         }
 
-        // split oov into characters
-        int length = 0;
-        termAtt.setEmpty().append(m.surface());
-        if (mode == Mode.EXTENDED && m.isOOV() && (length = 
Strings.codepointCount(termAtt)) > 1) { - oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length()); - posLengthAtt.setPositionLength(length); + // oov does not have splits + // split into characters in extended mode + if (m.isOOV()) { + int length = 0; + if (mode == Mode.EXTENDED && (length = Strings.codepointCount(termAtt)) > 1) { + oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length()); + posLengthAtt.setPositionLength(length); + } return true; } @@ -140,14 +142,12 @@ public final boolean incrementToken() throws IOException { return true; } - // split into A/B unit + // split into A/B units List subUnits = m.split(splitMode); if (subUnits.size() > 1) { aUnitIterator = subUnits.listIterator(); aUnitOffset = offsetAtt.startOffset(); posLengthAtt.setPositionLength(subUnits.size()); - } else { - posLengthAtt.setPositionLength(1); } return true; From 5b20471169c296a15e763f6c1c00226321191244 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 31 Oct 2024 11:34:05 +0900 Subject: [PATCH 03/13] add MorphemeSubunits class to handle a/b splits --- .../lucene/sudachi/ja/SudachiSplitFilter.java | 163 +++++++++++------- 1 file changed, 103 insertions(+), 60 deletions(-) diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java index d093113..e6fe25d 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java @@ -23,7 +23,6 @@ import com.worksap.nlp.lucene.sudachi.ja.attributes.*; import com.worksap.nlp.lucene.sudachi.ja.util.Strings; import com.worksap.nlp.sudachi.Morpheme; - import com.worksap.nlp.sudachi.Tokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -40,56 +39,17 @@ public enum Mode { public static final Mode DEFAULT_MODE = Mode.SEARCH; - static class OovChars { - private int length; - private char[] buffer = new char[0]; - private int reserved; - private int index; - private int baseOffset; - - public void setOov(int offset, char[] src, int length) { - baseOffset = offset; - this.length = length; - if (reserved < length) { - buffer = new char[length]; - reserved = length; - } - System.arraycopy(src, 0, buffer, 0, length); - index = 0; - } - - public boolean hasNext() { - return index < length; - } - - public char next() { - if (index < length) { - return buffer[index++]; - } else { - throw new IllegalStateException(); - } - } - - public int index() { - return index; - } - - public int offset() { - return baseOffset + index; - } - } - private final Mode mode; private final Tokenizer.SplitMode splitMode; + private final CharTermAttribute termAtt; private final OffsetAttribute offsetAtt; private final PositionIncrementAttribute posIncAtt; private final PositionLengthAttribute posLengthAtt; private final MorphemeAttribute morphemeAtt; - private ListIterator aUnitIterator; - private final OovChars oovChars = new OovChars(); - private int aUnitOffset = 0; + private final MorphemeSubunits subunits = new MorphemeSubunits(); + private final OovChars oovChars = new OovChars(); public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode splitMode) { super(input); @@ -111,9 +71,9 @@ public final boolean incrementToken() throws IOException { setOOVAttribute(); return true; } - if (aUnitIterator != null && aUnitIterator.hasNext()) { + if (subunits.hasNext()) { clearAttributes(); - 
setAUnitAttribute(aUnitIterator.next()); + setAUnitAttribute(); return true; } @@ -132,56 +92,139 @@ public final boolean incrementToken() throws IOException { if (m.isOOV()) { int length = 0; if (mode == Mode.EXTENDED && (length = Strings.codepointCount(termAtt)) > 1) { + // OovChars requires character length oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length()); + // Position length should be codepoint length posLengthAtt.setPositionLength(length); } return true; } + // C split is the longest split if (splitMode == Tokenizer.SplitMode.C) { return true; } // split into A/B units - List subUnits = m.split(splitMode); - if (subUnits.size() > 1) { - aUnitIterator = subUnits.listIterator(); - aUnitOffset = offsetAtt.startOffset(); - posLengthAtt.setPositionLength(subUnits.size()); + List subsplits = m.split(splitMode); + if (subsplits.size() > 1) { + subunits.setUnits(offsetAtt.startOffset(), subsplits); + posLengthAtt.setPositionLength(subunits.size()); } return true; } - private void setAUnitAttribute(Morpheme morpheme) { + private void setAUnitAttribute() { posLengthAtt.setPositionLength(1); - if (aUnitIterator.previousIndex() == 0) { + if (subunits.index() == 0) { posIncAtt.setPositionIncrement(0); } else { posIncAtt.setPositionIncrement(1); } - int length = morpheme.end() - morpheme.begin(); - offsetAtt.setOffset(aUnitOffset, aUnitOffset + length); - aUnitOffset += length; - morphemeAtt.setMorpheme(morpheme); + + int startOffset = subunits.offset(); + Morpheme morpheme = subunits.next(); + int endOffset = subunits.offset(); termAtt.setEmpty().append(morpheme.surface()); + morphemeAtt.setMorpheme(morpheme); + offsetAtt.setOffset(startOffset, endOffset); } private void setOOVAttribute() { - int offset = oovChars.offset(); posLengthAtt.setPositionLength(1); if (oovChars.index() == 0) { posIncAtt.setPositionIncrement(0); } else { posIncAtt.setPositionIncrement(1); } + + int startOffset = oovChars.offset(); char c = oovChars.next(); termAtt.setEmpty().append(c); if (Character.isSurrogate(c) && oovChars.hasNext()) { termAtt.append(oovChars.next()); - offsetAtt.setOffset(offset, offset + 2); - } else { - offsetAtt.setOffset(offset, offset + 1); + } + int endOffset = oovChars.offset(); + offsetAtt.setOffset(startOffset, endOffset); + } + + static class OovChars { + private int baseOffset; + private int reserved; + private char[] buffer = new char[0]; + private int length; + private int index; + + public void setOov(int offset, char[] src, int length) { + baseOffset = offset; + this.length = length; + if (reserved < length) { + buffer = new char[length]; + reserved = length; + } + System.arraycopy(src, 0, buffer, 0, length); + index = 0; + } + + public boolean hasNext() { + return index < length; + } + + public char next() { + if (index < length) { + return buffer[index++]; + } + throw new IllegalStateException(); + } + + public int index() { + return index; + } + + public int offset() { + return baseOffset + index; + } + } + + static class MorphemeSubunits { + private int baseOffset; + private List morphemes; + private int size; + private int index; + private int offset; + + public void setUnits(int baseOffset, List morphemes) { + this.baseOffset = baseOffset; + this.morphemes = morphemes; + size = morphemes.size(); + index = 0; + offset = 0; + } + + public boolean hasNext() { + return index < size; + } + + public Morpheme next() { + if (!hasNext()) { + throw new IllegalStateException(); + } + Morpheme m = morphemes.get(index++); + offset += m.end() - m.begin(); + return 
m; + } + + public int size() { + return size; + } + + public int index() { + return index; + } + + public int offset() { + return baseOffset + offset; } } } \ No newline at end of file From b3c8c6472f114b86aa7e1362c80ce254aab53651 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 31 Oct 2024 17:37:57 +0900 Subject: [PATCH 04/13] add offset mapping to the morpheme attribute --- .../ja/attributes/MorphemeAttribute.java | 17 ++++++++++- .../lucene/sudachi/ja/SudachiSplitFilter.java | 28 +++++++++++-------- .../nlp/lucene/sudachi/ja/SudachiTokenizer.kt | 7 +++-- .../ja/attributes/MorphemeAttributeImpl.kt | 16 ++++++++++- 4 files changed, 51 insertions(+), 17 deletions(-) diff --git a/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttribute.java b/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttribute.java index a008130..7389b1d 100644 --- a/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttribute.java +++ b/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttribute.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Works Applications Co., Ltd. + * Copyright (c) 2023-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ package com.worksap.nlp.lucene.sudachi.ja.attributes; +import java.util.List; + import com.worksap.nlp.sudachi.Morpheme; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.Attribute; @@ -36,4 +38,17 @@ public interface MorphemeAttribute extends Attribute { * new object */ void setMorpheme(Morpheme morpheme); + + /** + * @return The offset mapping for the current morpheme + */ + List getOffsets(); + + /** + * Set the offset mapping for the morpheme + * + * @param offsets + * actual offset for each offset in the morpheme + */ + void setOffsets(List offsets); } diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java index e6fe25d..dad24ed 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java @@ -50,6 +50,7 @@ public enum Mode { private final MorphemeSubunits subunits = new MorphemeSubunits(); private final OovChars oovChars = new OovChars(); + private List offsetMap; public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode splitMode) { super(input); @@ -83,6 +84,7 @@ public final boolean incrementToken() throws IOException { } Morpheme m = morphemeAtt.getMorpheme(); + this.offsetMap = morphemeAtt.getOffsets(); if (m == null) { return true; } @@ -93,7 +95,7 @@ public final boolean incrementToken() throws IOException { int length = 0; if (mode == Mode.EXTENDED && (length = Strings.codepointCount(termAtt)) > 1) { // OovChars requires character length - oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length()); + oovChars.setOov(termAtt.buffer(), termAtt.length()); // Position length should be codepoint length posLengthAtt.setPositionLength(length); } @@ -108,13 +110,18 @@ public final boolean incrementToken() throws IOException { // split into A/B units List subsplits = m.split(splitMode); if (subsplits.size() > 1) { - subunits.setUnits(offsetAtt.startOffset(), subsplits); + subunits.setUnits(subsplits); posLengthAtt.setPositionLength(subunits.size()); } return true; } + private int 
correctOffset(int currentOff) {
+        assert (0 <= currentOff && currentOff <= this.offsetMap.size());
+        return this.offsetMap.get(currentOff);
+    }
+
     private void setAUnitAttribute() {
         posLengthAtt.setPositionLength(1);
         if (subunits.index() == 0) {
@@ -128,7 +135,8 @@ private void setAUnitAttribute() {
         int endOffset = subunits.offset();
         termAtt.setEmpty().append(morpheme.surface());
         morphemeAtt.setMorpheme(morpheme);
-        offsetAtt.setOffset(startOffset, endOffset);
+        morphemeAtt.setOffsets(offsetMap.subList(startOffset, endOffset + 1));
+        offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
     }
 
     private void setOOVAttribute() {
@@ -146,18 +154,16 @@ private void setOOVAttribute() {
             termAtt.append(oovChars.next());
         }
         int endOffset = oovChars.offset();
-        offsetAtt.setOffset(startOffset, endOffset);
+        offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
     }
 
     static class OovChars {
-        private int baseOffset;
         private int reserved;
         private char[] buffer = new char[0];
         private int length;
         private int index;
 
-        public void setOov(int offset, char[] src, int length) {
-            baseOffset = offset;
+        public void setOov(char[] src, int length) {
             this.length = length;
             if (reserved < length) {
                 buffer = new char[length];
@@ -183,19 +189,17 @@ public int index() {
         }
 
         public int offset() {
-            return baseOffset + index;
+            return index;
         }
     }
 
     static class MorphemeSubunits {
-        private int baseOffset;
         private List<Morpheme> morphemes;
         private int size;
         private int index;
         private int offset;
 
-        public void setUnits(int baseOffset, List<Morpheme> morphemes) {
-            this.baseOffset = baseOffset;
+        public void setUnits(List<Morpheme> morphemes) {
             this.morphemes = morphemes;
             size = morphemes.size();
             index = 0;
@@ -224,7 +228,7 @@ public int index() {
         }
 
         public int offset() {
-            return baseOffset + offset;
+            return offset;
         }
     }
 }
\ No newline at end of file
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt
index 6bfbb0e..ac1b374 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt
@@ -55,11 +55,12 @@ class SudachiTokenizer(
   override fun incrementToken(): Boolean {
     clearAttributes()
     var m = iterator.next() ?: return false
+    val baseOffset = iterator.baseOffset
 
     morphemeAtt.setMorpheme(m)
-    posLenAtt.positionLength = 1
-    posIncAtt.positionIncrement = 1
-    val baseOffset = iterator.baseOffset
+    morphemeAtt.setOffsets((m.begin()..m.end()).map { i -> correctOffset(baseOffset + i) })
+    posLenAtt.setPositionLength(1)
+    posIncAtt.setPositionIncrement(1)
     offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end()))
     termAtt.setEmpty().append(m.surface())
     return true
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt
index 9b494c1..01d0e99 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt
@@ -26,6 +26,8 @@ import org.apache.lucene.util.AttributeReflector
 
 class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute {
   private var morpheme: MorphemeWrapper? 
= null + // mapping from the character to the original reader + private var offsetMap: List = listOf() private class MorphemeWrapper(morpheme: Morpheme) : ToXContent { private val morpheme = morpheme @@ -53,10 +55,14 @@ class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute { override fun reflectWith(reflector: AttributeReflector) { reflector.reflect("morpheme", morpheme) + reflector.reflect("offsetMap", offsetMap) } override fun copyTo(target: AttributeImpl?) { - (target as? MorphemeAttributeImpl)?.let { it.setMorpheme(getMorpheme()) } + (target as? MorphemeAttributeImpl)?.let { + it.setMorpheme(getMorpheme()) + it.setOffsets(getOffsets()) + } } override fun getMorpheme(): Morpheme? { @@ -66,4 +72,12 @@ class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute { override fun setMorpheme(morpheme: Morpheme?) { this.morpheme = morpheme?.let { m -> MorphemeWrapper(m) } } + + override fun getOffsets(): List { + return offsetMap + } + + override fun setOffsets(offsets: List) { + this.offsetMap = offsets + } } From 4dae125da91394716f1e35b64a74bf149a55c986 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 1 Nov 2024 12:57:15 +0900 Subject: [PATCH 05/13] add split filter test with input normalization --- .../sudachi/ja/TestSudachiSplitFilter.kt | 65 ++++++++++++++++++- .../attributes/MorphemeAttributeImplTest.kt | 27 ++++++-- .../nlp/lucene/sudachi/ja/sudachi.json | 3 + src/test/resources/dict/lex.csv | 8 ++- 4 files changed, 96 insertions(+), 7 deletions(-) diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt index eafa8a3..493ca44 100644 --- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Works Applications Co., Ltd. + * Copyright (c) 2022-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -196,6 +196,69 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() { ) } + @Test + fun testWithCharNormalizationBySearchMode() { + val tokenStream = setUpTokenStream("search", "六三四㍿に行くカ゛カ゛カ゛") + assertTokenStreamContents( + tokenStream, + arrayOf("六三四", "㍿", "㍿", "", "に", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"), + intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11), + intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13), + intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1), + intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1), + 13, + ) + } + + @Test + fun testWithCharNormalizationInNormalizedFormBySearchMode() { + var tokenStream = setUpTokenStream("search", "六三四㍿に行くカ゛カ゛カ゛") + val normFactory = SudachiNormalizedFormFilterFactory(mutableMapOf()) + tokenStream = normFactory.create(tokenStream) + + assertTokenStreamContents( + tokenStream, + arrayOf("六三四", "株式会社", "株式", "会社", "に", "行く", "ガガガ", "ガガ", "ガ"), + intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11), + intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13), + intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1), + intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1), + 13, + ) + } + + @Test + fun testWithCharNormalizationByExtendedMode() { + // extending normalized form seems more natural, but we cannot calculate their offsets. 
+ val tokenStream = setUpTokenStream("extended", "10㌢㍍いったソ゛") + assertTokenStreamContents( + tokenStream, + arrayOf("1", "0", "㌢㍍", "㌢", "㍍", "いっ", "た", "ソ゛", "ソ", "゛"), + intArrayOf(0, 1, 2, 2, 3, 4, 6, 7, 7, 8), + intArrayOf(1, 2, 4, 3, 4, 6, 7, 9, 8, 9), + intArrayOf(1, 1, 1, 0, 1, 1, 1, 1, 0, 1), + intArrayOf(1, 1, 2, 1, 1, 1, 1, 2, 1, 1), + 9, + ) + } + + @Test + fun testWithCharNormalizationInNormalizedFormByExtendedMode() { + // extending normalized form seems more natural, but we cannot calculate their offsets. + var tokenStream = setUpTokenStream("extended", "10㌢㍍いったソ゛") + val normFactory = SudachiNormalizedFormFilterFactory(mutableMapOf()) + tokenStream = normFactory.create(tokenStream) + + assertTokenStreamContents( + tokenStream, + arrayOf("1", "0", "センチメートル", "㌢", "㍍", "行く", "た", "ゾ", "ソ", "゛"), + intArrayOf(0, 1, 2, 2, 3, 4, 6, 7, 7, 8), + intArrayOf(1, 2, 4, 3, 4, 6, 7, 9, 8, 9), + intArrayOf(1, 1, 1, 0, 1, 1, 1, 1, 0, 1), + intArrayOf(1, 1, 2, 1, 1, 1, 1, 2, 1, 1), + 9) + } + fun setUpTokenStream(mode: String, input: String): TokenStream { val factory = SudachiSplitFilterFactory( diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt index a630daf..18a0513 100644 --- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt @@ -16,7 +16,6 @@ package com.worksap.nlp.lucene.sudachi.ja.attributes -import com.worksap.nlp.lucene.aliases.ToXContent import com.worksap.nlp.lucene.aliases.XContentBuilder import com.worksap.nlp.search.aliases.XContentType import com.worksap.nlp.sudachi.Config @@ -65,6 +64,19 @@ class MorphemeAttributeImplTest { assertNull(morphemeAtt.getMorpheme()) } + @Test + fun setOffsets() { + var morphemeAtt = MorphemeAttributeImpl() + assertTrue(morphemeAtt.getOffsets().isEmpty()) + + val intlist = listOf(1, 2, 3) + morphemeAtt.setOffsets(intlist) + assertEquals(intlist, morphemeAtt.getOffsets()) + + morphemeAtt.setOffsets(listOf()) + assertTrue(morphemeAtt.getOffsets().isEmpty()) + } + @Test fun copyTo() { var morphemeAtt1 = MorphemeAttributeImpl() @@ -85,15 +97,14 @@ class MorphemeAttributeImplTest { var morphemeAtt = MorphemeAttributeImpl() val morpheme = getFirstMorpheme("東京都")!! 
morphemeAtt.setMorpheme(morpheme) + val offsets = listOf(0, 3) + morphemeAtt.setOffsets(offsets) val builder = XContentBuilder.builder(XContentType.JSON.xContent()) builder.startObject() morphemeAtt.reflectWith( fun(attClass, key, value) { assertEquals(MorphemeAttribute::class.java, attClass) - assertEquals("morpheme", key) - assertTrue(value is ToXContent) - builder.field(key, value) }) builder.endObject() @@ -103,15 +114,21 @@ class MorphemeAttributeImplTest { val deserialized = Json.decodeFromString(serialized) assertNotNull(deserialized.morpheme) + assertNotNull(deserialized.offsetMap) assertEquals(morpheme.surface(), deserialized.morpheme.surface) assertEquals(morpheme.dictionaryForm(), deserialized.morpheme.dictionaryForm) assertEquals(morpheme.normalizedForm(), deserialized.morpheme.normalizedForm) assertEquals(morpheme.readingForm(), deserialized.morpheme.readingForm) assertEquals(morpheme.partOfSpeech(), deserialized.morpheme.partOfSpeech) + assertEquals(offsets, deserialized.offsetMap) } } -@Serializable data class MorphemeHolder(val morpheme: MorphemeAttributeHolder) +@Serializable +data class MorphemeHolder( + val morpheme: MorphemeAttributeHolder, + val offsetMap: List, +) @Serializable data class MorphemeAttributeHolder( diff --git a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json index 4f76af7..3042b87 100644 --- a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json +++ b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json @@ -1,5 +1,8 @@ { "systemDict" : "system_core.dic", + "inputTextPlugin" : [ + { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" } + ], "oovProviderPlugin" : [ { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv index a814dc3..1898e45 100644 --- a/src/test/resources/dict/lex.csv +++ b/src/test/resources/dict/lex.csv @@ -37,4 +37,10 @@ 
012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-30000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,* 特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,* な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,A,*,*,*,* -ふく,4,4,5105,ふく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,フク,吹く,*,A,*,*,*,* \ No newline at end of file +ふく,4,4,5105,ふく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,フク,吹く,*,A,*,*,*,* +株式,8,8,5611,株式,名詞,普通名詞,一般,*,*,*,カブシキ,株式,*,A,*,*,*,* +会社,8,8,2914,会社,名詞,普通名詞,一般,*,*,*,カイシャ,会社,*,A,*,*,*,* +株式会社,8,8,6000,株式会社,名詞,普通名詞,一般,*,*,*,カブシキガイシャ,株式会社,*,C,40/41,40/41,40/41,* +ガ,5,5,3500,ガ,副詞,*,*,*,*,*,ガ,ガ,*,A,*,*,*,* +ガガ,5,5,5500,ガガ,副詞,*,*,*,*,*,ガガ,ガガ,*,A,*,*,*,* +ガガガ,5,5,8494,ガガガ,副詞,*,*,*,*,*,ガガガ,ガガガ,*,B,44/43,*,44/43,* \ No newline at end of file From c5483c3cbf3975b7d86291c76868d4d06552c675 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 1 Nov 2024 13:17:07 +0900 Subject: [PATCH 06/13] fix toxcontent of morpheme attribute --- .../ja/attributes/MorphemeAttributeImpl.kt | 50 ++++++++++++------- .../attributes/MorphemeAttributeImplTest.kt | 26 ++++++++-- 2 files changed, 54 insertions(+), 22 deletions(-) diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt index 01d0e99..ef712bb 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt @@ -25,37 +25,51 @@ import org.apache.lucene.util.AttributeImpl import org.apache.lucene.util.AttributeReflector class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute { - private var morpheme: MorphemeWrapper? 
= null - // mapping from the character to the original reader - private var offsetMap: List = listOf() + private val inner: ToXContentWrapper = ToXContentWrapper(null, listOf()) - private class MorphemeWrapper(morpheme: Morpheme) : ToXContent { - private val morpheme = morpheme + private class ToXContentWrapper(morpheme: Morpheme?, offsetMap: List) : ToXContent { + private var morpheme = morpheme + // mapping from the character to the original reader + private var offsetMap = offsetMap override fun toXContent(builder: XContentBuilder, params: ToXContentParams): XContentBuilder { builder.value( mapOf( - "surface" to morpheme.surface(), - "dictionaryForm" to morpheme.dictionaryForm(), - "normalizedForm" to morpheme.normalizedForm(), - "readingForm" to morpheme.readingForm(), - "partOfSpeech" to morpheme.partOfSpeech(), + "surface" to morpheme?.surface(), + "dictionaryForm" to morpheme?.dictionaryForm(), + "normalizedForm" to morpheme?.normalizedForm(), + "readingForm" to morpheme?.readingForm(), + "partOfSpeech" to morpheme?.partOfSpeech(), + "offsetMap" to offsetMap, )) return builder } - fun unwrap(): Morpheme { + fun getMorpheme(): Morpheme? { return morpheme } + + fun setMorpheme(morpheme: Morpheme?) { + this.morpheme = morpheme + } + + fun getOffsets(): List { + return offsetMap + } + + fun setOffsets(offsets: List) { + this.offsetMap = offsets + } } override fun clear() { - morpheme = null + inner.setMorpheme(null) + inner.setOffsets(listOf()) } override fun reflectWith(reflector: AttributeReflector) { - reflector.reflect("morpheme", morpheme) - reflector.reflect("offsetMap", offsetMap) + reflector.reflect( + "morpheme", if (inner.getMorpheme() != null) inner else null) } override fun copyTo(target: AttributeImpl?) { @@ -66,18 +80,18 @@ class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute { } override fun getMorpheme(): Morpheme? { - return morpheme?.let { m -> m.unwrap() } + return inner.getMorpheme() } override fun setMorpheme(morpheme: Morpheme?) 
{ - this.morpheme = morpheme?.let { m -> MorphemeWrapper(m) } + inner.setMorpheme(morpheme) } override fun getOffsets(): List { - return offsetMap + return inner.getOffsets() } override fun setOffsets(offsets: List) { - this.offsetMap = offsets + inner.setOffsets(offsets) } } diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt index 18a0513..5d29e96 100644 --- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt @@ -114,20 +114,37 @@ class MorphemeAttributeImplTest { val deserialized = Json.decodeFromString(serialized) assertNotNull(deserialized.morpheme) - assertNotNull(deserialized.offsetMap) assertEquals(morpheme.surface(), deserialized.morpheme.surface) assertEquals(morpheme.dictionaryForm(), deserialized.morpheme.dictionaryForm) assertEquals(morpheme.normalizedForm(), deserialized.morpheme.normalizedForm) assertEquals(morpheme.readingForm(), deserialized.morpheme.readingForm) assertEquals(morpheme.partOfSpeech(), deserialized.morpheme.partOfSpeech) - assertEquals(offsets, deserialized.offsetMap) + assertEquals(offsets, deserialized.morpheme.offsetMap) + } + + @Test + fun toXContentNullMorpheme() { + var morphemeAtt = MorphemeAttributeImpl() + + val builder = XContentBuilder.builder(XContentType.JSON.xContent()) + builder.startObject() + morphemeAtt.reflectWith( + fun(attClass, key, value) { + assertEquals(MorphemeAttribute::class.java, attClass) + builder.field(key, value) + }) + builder.endObject() + builder.flush() + + val serialized = builder.getOutputStream().toString() + val deserialized = Json.decodeFromString(serialized) + assertNull(deserialized.morpheme) } } @Serializable data class MorphemeHolder( - val morpheme: MorphemeAttributeHolder, - val offsetMap: List, + val morpheme: MorphemeAttributeHolder?, ) @Serializable @@ -137,4 +154,5 @@ data class MorphemeAttributeHolder( val normalizedForm: String, val readingForm: String, val partOfSpeech: List, + val offsetMap: List, ) From 0a0950a9787f178991b78079c411d88fa9d132af Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 1 Nov 2024 14:42:07 +0900 Subject: [PATCH 07/13] add integration test with icu normalizer --- test-scripts/01-integration-test.py | 116 ++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/test-scripts/01-integration-test.py b/test-scripts/01-integration-test.py index ef51df1..3e2468f 100644 --- a/test-scripts/01-integration-test.py +++ b/test-scripts/01-integration-test.py @@ -142,6 +142,122 @@ def test_icu_filtered_stuff_is_not_trimmed(self): self.assertEqual(5, tokens[0]["end_offset"]) return + def test_correct_split_offset_with_icu_filter(self): + body = { + "tokenizer": "sudachi_tokenizer", + "char_filter": { + "type": "icu_normalizer", + "name": "nfkc_cf", + "mode": "compose" + }, + "filter": { + "type": "sudachi_split", + "mode": "search" + }, + "text": "六三四㍿のアッフ\u309Aルハ\u309Aイ", + } + resp = es_instance.analyze(body) + self.assertEqual(200, resp.status, f"data: {resp.data}") + + tokens = json.loads(resp.data.decode())["tokens"] + self.assertEqual(8, len(tokens)) + self.assertEqual("株式会社", tokens[1]["token"]) + self.assertEqual(1, tokens[1]["position"]) + self.assertEqual(2, tokens[1]["positionLength"]) + self.assertEqual(3, tokens[1]["start_offset"]) + self.assertEqual(4, tokens[1]["end_offset"]) + + 
self.assertEqual("株式", tokens[2]["token"]) + self.assertEqual(1, tokens[2]["position"]) + self.assertEqual(3, tokens[2]["start_offset"]) + self.assertEqual(3, tokens[2]["end_offset"]) + self.assertEqual("会社", tokens[3]["token"]) + self.assertEqual(2, tokens[3]["position"]) + self.assertEqual(3, tokens[3]["start_offset"]) + self.assertEqual(4, tokens[3]["end_offset"]) + + self.assertEqual("アップルパイ", tokens[5]["token"]) + self.assertEqual(4, tokens[5]["position"]) + self.assertEqual(2, tokens[1]["positionLength"]) + self.assertEqual(5, tokens[5]["start_offset"]) + self.assertEqual(13, tokens[5]["end_offset"]) + + self.assertEqual("アップル", tokens[6]["token"]) + self.assertEqual(4, tokens[6]["position"]) + self.assertEqual(5, tokens[6]["start_offset"]) + self.assertEqual(10, tokens[6]["end_offset"]) + self.assertEqual("パイ", tokens[7]["token"]) + self.assertEqual(5, tokens[7]["position"]) + self.assertEqual(10, tokens[7]["start_offset"]) + self.assertEqual(13, tokens[7]["end_offset"]) + return + + def test_correct_OOV_offset_with_icu_filter(self): + body = { + "tokenizer": "sudachi_tokenizer", + "char_filter": { + "type": "icu_normalizer", + "name": "nfkc_cf", + "mode": "compose" + }, + "filter": { + "type": "sudachi_split", + "mode": "extended" + }, + "text": "10㍉㌢進んでホ\u3099ホ\u3099ホ\u3099", + } + resp = es_instance.analyze(body) + self.assertEqual(200, resp.status, f"data: {resp.data}") + + tokens = json.loads(resp.data.decode())["tokens"] + self.assertEqual(13, len(tokens)) + self.assertEqual("ミリセンチ", tokens[1]["token"]) + self.assertEqual(1, tokens[1]["position"]) + self.assertEqual(5, tokens[1]["positionLength"]) + self.assertEqual(2, tokens[1]["start_offset"]) + self.assertEqual(4, tokens[1]["end_offset"]) + + self.assertEqual("ミ", tokens[2]["token"]) + self.assertEqual(1, tokens[2]["position"]) + self.assertEqual(2, tokens[2]["start_offset"]) + self.assertEqual(2, tokens[2]["end_offset"]) + self.assertEqual("リ", tokens[3]["token"]) + self.assertEqual(2, tokens[3]["position"]) + self.assertEqual(2, tokens[3]["start_offset"]) + self.assertEqual(3, tokens[3]["end_offset"]) + self.assertEqual("セ", tokens[4]["token"]) + self.assertEqual(3, tokens[4]["position"]) + self.assertEqual(3, tokens[4]["start_offset"]) + self.assertEqual(3, tokens[4]["end_offset"]) + self.assertEqual("ン", tokens[5]["token"]) + self.assertEqual(4, tokens[5]["position"]) + self.assertEqual(3, tokens[5]["start_offset"]) + self.assertEqual(3, tokens[5]["end_offset"]) + self.assertEqual("チ", tokens[6]["token"]) + self.assertEqual(5, tokens[6]["position"]) + self.assertEqual(3, tokens[6]["start_offset"]) + self.assertEqual(4, tokens[6]["end_offset"]) + + self.assertEqual("ボボボ", tokens[9]["token"]) + self.assertEqual(8, tokens[9]["position"]) + self.assertEqual(3, tokens[9]["positionLength"]) + self.assertEqual(7, tokens[9]["start_offset"]) + self.assertEqual(13, tokens[9]["end_offset"]) + + self.assertEqual("ボ", tokens[10]["token"]) + self.assertEqual(8, tokens[10]["position"]) + self.assertEqual(7, tokens[10]["start_offset"]) + self.assertEqual(9, tokens[10]["end_offset"]) + self.assertEqual("ボ", tokens[11]["token"]) + self.assertEqual(9, tokens[11]["position"]) + self.assertEqual(9, tokens[11]["start_offset"]) + self.assertEqual(11, tokens[11]["end_offset"]) + self.assertEqual("ボ", tokens[12]["token"]) + self.assertEqual(10, tokens[12]["position"]) + self.assertEqual(11, tokens[12]["start_offset"]) + self.assertEqual(13, tokens[12]["end_offset"]) + return + class TestSubplugin(unittest.TestCase): # requires :subplugin is 
installed with :testlib

From 5aeb607943155349a9a69d84f15f58336c59a394 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Fri, 1 Nov 2024 14:42:18 +0900
Subject: [PATCH 08/13] comment out assertion

---
 .../com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
index dad24ed..d871cdb 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
@@ -118,7 +118,7 @@ public final boolean incrementToken() throws IOException {
     }
 
     private int correctOffset(int currentOff) {
-        assert (0 <= currentOff && currentOff <= this.offsetMap.size());
+        // assert (0 <= currentOff && currentOff <= this.offsetMap.size());
         return this.offsetMap.get(currentOff);
     }
 

From a54675ec541c11be7170c78b35d7fd7c360e609e Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Fri, 1 Nov 2024 18:11:54 +0900
Subject: [PATCH 09/13] use wrapper only for reflectWith

---
 .../ja/attributes/MorphemeAttributeImpl.kt    | 53 +++++++------------
 1 file changed, 20 insertions(+), 33 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt
index ef712bb..472a771 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt
@@ -25,51 +25,38 @@ import org.apache.lucene.util.AttributeImpl
 import org.apache.lucene.util.AttributeReflector
 
 class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute {
-  private val inner: ToXContentWrapper = ToXContentWrapper(null, listOf())
+  private var morpheme: Morpheme? = null
+  // mapping from the character offset to the original reader offset
+  private var offsetMap: List<Int> = listOf()
 
-  private class ToXContentWrapper(morpheme: Morpheme?, offsetMap: List<Int>) : ToXContent {
-    private var morpheme = morpheme
-    // mapping from the character to the original reader
-    private var offsetMap = offsetMap
+  // wrapper class to convert data ToXContent-able
+  private class ToXContentWrapper(morpheme: Morpheme, offsetMap: List<Int>) : ToXContent {
+    private val morpheme = morpheme
+    private val offsetMap = offsetMap
 
     override fun toXContent(builder: XContentBuilder, params: ToXContentParams): XContentBuilder {
       builder.value(
           mapOf(
-              "surface" to morpheme?.surface(),
-              "dictionaryForm" to morpheme?.dictionaryForm(),
-              "normalizedForm" to morpheme?.normalizedForm(),
-              "readingForm" to morpheme?.readingForm(),
-              "partOfSpeech" to morpheme?.partOfSpeech(),
+              "surface" to morpheme.surface(),
+              "dictionaryForm" to morpheme.dictionaryForm(),
+              "normalizedForm" to morpheme.normalizedForm(),
+              "readingForm" to morpheme.readingForm(),
+              "partOfSpeech" to morpheme.partOfSpeech(),
               "offsetMap" to offsetMap,
           ))
       return builder
    }
-
-    fun getMorpheme(): Morpheme? {
-      return morpheme
-    }
-
-    fun setMorpheme(morpheme: Morpheme?) 
{ - this.morpheme = morpheme - } - - fun getOffsets(): List { - return offsetMap - } - - fun setOffsets(offsets: List) { - this.offsetMap = offsets - } } override fun clear() { - inner.setMorpheme(null) - inner.setOffsets(listOf()) + morpheme = null + offsetMap = listOf() } override fun reflectWith(reflector: AttributeReflector) { + // show only when a morpheme is set reflector.reflect( - "morpheme", if (inner.getMorpheme() != null) inner else null) + "morpheme", morpheme?.let { m -> ToXContentWrapper(m, offsetMap) }) } override fun copyTo(target: AttributeImpl?) { @@ -80,18 +67,18 @@ class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute { } override fun getMorpheme(): Morpheme? { - return inner.getMorpheme() + return morpheme } override fun setMorpheme(morpheme: Morpheme?) { - inner.setMorpheme(morpheme) + this.morpheme = morpheme } override fun getOffsets(): List { - return inner.getOffsets() + return offsetMap } override fun setOffsets(offsets: List) { - inner.setOffsets(offsets) + this.offsetMap = offsets } } From 38bad32b4da3d3da55106075b8fb479f93ecddc3 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 5 Nov 2024 11:14:53 +0900 Subject: [PATCH 10/13] update dict version for workflow --- .github/workflows/build.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 17c6598..d66a4bb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -46,6 +46,8 @@ jobs: - 'os:2.6.0' env: mainJob: ${{ matrix.es-version == 'es:8.15.2' }} + sudachiVersion: 20241021 + sudachiKind: core continue-on-error: true steps: @@ -93,15 +95,16 @@ jobs: - name: Cache dictionary download uses: actions/cache@v4 with: - path: build/integration/sudachi-dictionary-20230110-small.zip - key: sudachi-dictionary-20230110 + path: build/integration/sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}.zip + key: sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }} - name: Integration test env: ES_KIND: ${{ env.ENGINE_KIND }} ES_VERSION: ${{ env.ENGINE_VERSION }} PLUGIN_VERSION: ${{ env.PROJ_VERSION }} RUN_ES_DAEMON: 1 - DIC_VERSION: 20230110 + DIC_VERSION: ${{ env.sudachiVersion }} + DIC_KIND: ${{ env.sudachiKind }} run: | bash test-scripts/00-install-elasticsearch.sh sleep 30 From 24b6b88cc021179c758dcc6098e76a0180217b21 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 5 Nov 2024 11:53:29 +0900 Subject: [PATCH 11/13] adjust test script with updated dict --- test-scripts/20-put-docs.py | 2 +- test-scripts/30-test-docs.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test-scripts/20-put-docs.py b/test-scripts/20-put-docs.py index 725ba56..4c121f4 100644 --- a/test-scripts/20-put-docs.py +++ b/test-scripts/20-put-docs.py @@ -1,6 +1,6 @@ import argparse from multiprocessing import Pool -import urllib3.request +import urllib3 import json from pathlib import Path diff --git a/test-scripts/30-test-docs.py b/test-scripts/30-test-docs.py index 9dd538b..e10d02b 100644 --- a/test-scripts/30-test-docs.py +++ b/test-scripts/30-test-docs.py @@ -1,6 +1,6 @@ import argparse from multiprocessing import Pool -import urllib3.request +import urllib3 import json from pathlib import Path @@ -31,7 +31,7 @@ def run(self): def test57Games(self): games = self.es.find("ゲーム") - self.assertEq(57, games["hits"]["total"]["value"]) + self.assertEq(59, games["hits"]["total"]["value"]) def test107Daigaku(self): docs = self.es.find("大学") From 
85a66af6afeb1aa9bea57655831244e003601e5f Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Tue, 5 Nov 2024 13:20:57 +0900
Subject: [PATCH 12/13] fix test

---
 .../lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt
index 5d29e96..373302d 100644
--- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt
+++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt
@@ -97,7 +97,7 @@ class MorphemeAttributeImplTest {
     var morphemeAtt = MorphemeAttributeImpl()
     val morpheme = getFirstMorpheme("東京都")!!
     morphemeAtt.setMorpheme(morpheme)
-    val offsets = listOf(0, 3)
+    val offsets = listOf(0, 1, 2, 3)
     morphemeAtt.setOffsets(offsets)
 
     val builder = XContentBuilder.builder(XContentType.JSON.xContent())

From a598575bbe9e438593384ae02040e63beb4a6b84 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Wed, 6 Nov 2024 14:34:00 +0900
Subject: [PATCH 13/13] calculate offsets based on each morpheme's own
 boundaries

---
 .../lucene/sudachi/ja/SudachiSplitFilter.java | 37 +++++++++++--------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
index d871cdb..8f58004 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
@@ -130,13 +130,11 @@ private void setAUnitAttribute() {
             posIncAtt.setPositionIncrement(1);
         }
 
-        int startOffset = subunits.offset();
-        Morpheme morpheme = subunits.next();
-        int endOffset = subunits.offset();
-        termAtt.setEmpty().append(morpheme.surface());
-        morphemeAtt.setMorpheme(morpheme);
-        morphemeAtt.setOffsets(offsetMap.subList(startOffset, endOffset + 1));
-        offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
+        MorphemeSubunits.Subunit su = subunits.next();
+        termAtt.setEmpty().append(su.morpheme.surface());
+        morphemeAtt.setMorpheme(su.morpheme);
+        morphemeAtt.setOffsets(offsetMap.subList(su.begin, su.end + 1));
+        offsetAtt.setOffset(correctOffset(su.begin), correctOffset(su.end));
     }
 
     private void setOOVAttribute() {
@@ -194,29 +192,40 @@ public int offset() {
     }
 
     static class MorphemeSubunits {
+        static class Subunit {
+            final Morpheme morpheme;
+            final int begin;
+            final int end;
+
+            public Subunit(Morpheme morpheme, int begin, int end) {
+                this.morpheme = morpheme;
+                this.begin = begin;
+                this.end = end;
+            }
+        }
+
         private List<Morpheme> morphemes;
         private int size;
         private int index;
-        private int offset;
+        private int baseOffset;
 
         public void setUnits(List<Morpheme> morphemes) {
             this.morphemes = morphemes;
             size = morphemes.size();
             index = 0;
-            offset = 0;
+            baseOffset = morphemes.get(0).begin();
         }
 
         public boolean hasNext() {
             return index < size;
         }
 
-        public Morpheme next() {
+        public Subunit next() {
             if (!hasNext()) {
                 throw new IllegalStateException();
            }
             Morpheme m = morphemes.get(index++);
-            offset += m.end() - m.begin();
-            return m;
+            return new Subunit(m, m.begin() - baseOffset, m.end() - baseOffset);
         }
 
         public int size() {
@@ -226,9 +235,5 @@ public int size() {
         public int index() {
             return index;
         }
-
-        public int offset() {
-            return offset;
-        }
     }
 }
\ No newline at end of file
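
To illustrate the mechanism this series threads through `MorphemeAttribute`: each morpheme carries a map from its character boundaries in the normalized text back to offsets in the original reader, and split tokens look their corrected offsets up in that map (as `correctOffset` does above). The following is a minimal standalone sketch, not the plugin's API; the class name, the sample strings, and the mapping values are assumptions chosen to mirror the "㍿" → "株式会社" case from the tests.

```java
import java.util.List;

// Hypothetical demo: in the original input "六三四㍿に行く", the single
// character "㍿" (offsets 3..4) is ICU-normalized to "株式会社" (4 chars).
// A morpheme over the normalized "株式会社" carries a map from each of its
// character boundaries 0..4 back to offsets in the original input.
public class OffsetMapDemo {
    public static void main(String[] args) {
        // Assumed mapping: normalized boundary 0 -> original offset 3,
        // boundaries 1..4 all collapse onto original offset 4.
        List<Integer> offsetMap = List.of(3, 4, 4, 4, 4);

        // An A-unit subsplit "株式" covers normalized boundaries 0 and 2;
        // looking them up yields its offsets in the original input.
        int start = offsetMap.get(0); // 3
        int end = offsetMap.get(2);   // 4
        System.out.println("株式 -> original offsets [" + start + ", " + end + ")");
    }
}
```

Both boundaries of the subsplit land inside the original "㍿", so highlighting stays on the pre-normalization text even though the token text comes from the normalized form.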