diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 17c65981..d66a4bb6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -46,6 +46,8 @@ jobs:
           - 'os:2.6.0'
     env:
       mainJob: ${{ matrix.es-version == 'es:8.15.2' }}
+      sudachiVersion: 20241021
+      sudachiKind: core
     continue-on-error: true
 
     steps:
@@ -93,15 +95,16 @@ jobs:
       - name: Cache dictionary download
        uses: actions/cache@v4
        with:
-          path: build/integration/sudachi-dictionary-20230110-small.zip
-          key: sudachi-dictionary-20230110
+          path: build/integration/sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}.zip
+          key: sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}
       - name: Integration test
        env:
          ES_KIND: ${{ env.ENGINE_KIND }}
          ES_VERSION: ${{ env.ENGINE_VERSION }}
          PLUGIN_VERSION: ${{ env.PROJ_VERSION }}
          RUN_ES_DAEMON: 1
-          DIC_VERSION: 20230110
+          DIC_VERSION: ${{ env.sudachiVersion }}
+          DIC_KIND: ${{ env.sudachiKind }}
        run: |
          bash test-scripts/00-install-elasticsearch.sh
          sleep 30
diff --git a/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttribute.java b/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttribute.java
index a008130e..7389b1d0 100644
--- a/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttribute.java
+++ b/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttribute.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Works Applications Co., Ltd.
+ * Copyright (c) 2023-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
 
 package com.worksap.nlp.lucene.sudachi.ja.attributes;
 
+import java.util.List;
+
 import com.worksap.nlp.sudachi.Morpheme;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Attribute;
@@ -36,4 +38,17 @@ public interface MorphemeAttribute extends Attribute {
      *            new object
      */
     void setMorpheme(Morpheme morpheme);
+
+    /**
+     * @return the offset mapping for the current morpheme
+     */
+    List<Integer> getOffsets();
+
+    /**
+     * Set the offset mapping for the morpheme.
+     *
+     * @param offsets
+     *            actual offsets for each character position in the morpheme
+     */
+    void setOffsets(List<Integer> offsets);
 }
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
index 361fd66e..8f580043 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
@@ -23,7 +23,6 @@
 import com.worksap.nlp.lucene.sudachi.ja.attributes.*;
 import com.worksap.nlp.lucene.sudachi.ja.util.Strings;
 import com.worksap.nlp.sudachi.Morpheme;
-
 import com.worksap.nlp.sudachi.Tokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -40,56 +39,18 @@ public enum Mode {
 
     public static final Mode DEFAULT_MODE = Mode.SEARCH;
 
-    static class OovChars {
-        private int length;
-        private char[] buffer = new char[0];
-        private int reserved;
-        private int index;
-        private int baseOffset;
-
-        public void setOov(int offset, char[] src, int length) {
-            baseOffset = offset;
-            this.length = length;
-            if (reserved < length) {
-                buffer = new char[length];
-                reserved = length;
-            }
-            System.arraycopy(src, 0, buffer, 0, length);
-            index = 0;
-        }
-
-        public boolean hasNext() {
-            return index < length;
-        }
-
-        public char next() {
-            if (index < length) {
-                return buffer[index++];
-            } else {
-                throw new IllegalStateException();
-            }
-        }
-
-        public int index() {
-            return index;
-        }
-
-        public int offset() {
-            return baseOffset + index;
-        }
-    }
-
     private final Mode mode;
     private final Tokenizer.SplitMode splitMode;
+
     private final CharTermAttribute termAtt;
     private final OffsetAttribute offsetAtt;
     private final PositionIncrementAttribute posIncAtt;
     private final PositionLengthAttribute posLengthAtt;
     private final MorphemeAttribute morphemeAtt;
 
-    private ListIterator<Morpheme> aUnitIterator;
-    private final OovChars oovChars = new OovChars();
-    private int aUnitOffset = 0;
+    private final MorphemeSubunits subunits = new MorphemeSubunits();
+    private final OovChars oovChars = new OovChars();
+    private List<Integer> offsetMap;
 
     public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode splitMode) {
         super(input);
@@ -105,72 +66,174 @@ public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode spli
 
     @Override
     public final boolean incrementToken() throws IOException {
+        // continue to write the current split
         if (oovChars.hasNext()) {
             clearAttributes();
             setOOVAttribute();
             return true;
         }
-        if (aUnitIterator != null && aUnitIterator.hasNext()) {
+        if (subunits.hasNext()) {
             clearAttributes();
-            setAUnitAttribute(aUnitIterator.next());
+            setAUnitAttribute();
+            return true;
+        }
+
+        // move to the next morpheme
+        if (!input.incrementToken()) {
+            return false;
+        }
+
+        Morpheme m = morphemeAtt.getMorpheme();
+        this.offsetMap = morphemeAtt.getOffsets();
+        if (m == null) {
             return true;
         }
 
-        if (input.incrementToken()) {
+        // an OOV morpheme has no splits;
+        // split it into characters in extended mode
+        if (m.isOOV()) {
             int length = 0;
-            Morpheme m = morphemeAtt.getMorpheme();
-            if (m == null) {
-                return true;
-            }
-            termAtt.setEmpty().append(m.surface());
-            if (mode == Mode.EXTENDED && m.isOOV() && (length = Strings.codepointCount(termAtt)) > 1) {
-                oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length());
+            if (mode == Mode.EXTENDED && (length = Strings.codepointCount(termAtt)) > 1) {
+                // OovChars requires the character length
+                oovChars.setOov(termAtt.buffer(), termAtt.length());
+                // the position length should be the codepoint length
                 posLengthAtt.setPositionLength(length);
-            } else if (splitMode != Tokenizer.SplitMode.C) {
-                List<Morpheme> subUnits = m.split(splitMode);
-                if (subUnits.size() > 1) {
-                    aUnitIterator = subUnits.listIterator();
-                    aUnitOffset = offsetAtt.startOffset();
-                    posLengthAtt.setPositionLength(subUnits.size());
-                } else {
-                    posLengthAtt.setPositionLength(1);
-                }
             }
             return true;
-        } else {
-            return false;
         }
+
+        // the C split is the longest split
+        if (splitMode == Tokenizer.SplitMode.C) {
+            return true;
+        }
+
+        // split into A/B units
+        List<Morpheme> subsplits = m.split(splitMode);
+        if (subsplits.size() > 1) {
+            subunits.setUnits(subsplits);
+            posLengthAtt.setPositionLength(subunits.size());
+        }
+
+        return true;
+    }
+
+    private int correctOffset(int currentOff) {
+        // assert (0 <= currentOff && currentOff < this.offsetMap.size());
+        return this.offsetMap.get(currentOff);
     }
 
-    private void setAUnitAttribute(Morpheme morpheme) {
+    private void setAUnitAttribute() {
         posLengthAtt.setPositionLength(1);
-        if (aUnitIterator.previousIndex() == 0) {
+        if (subunits.index() == 0) {
             posIncAtt.setPositionIncrement(0);
         } else {
             posIncAtt.setPositionIncrement(1);
         }
-        int length = morpheme.end() - morpheme.begin();
-        offsetAtt.setOffset(aUnitOffset, aUnitOffset + length);
-        aUnitOffset += length;
-        morphemeAtt.setMorpheme(morpheme);
-        termAtt.setEmpty().append(morpheme.surface());
+
+        MorphemeSubunits.Subunit su = subunits.next();
+        termAtt.setEmpty().append(su.morpheme.surface());
+        morphemeAtt.setMorpheme(su.morpheme);
+        morphemeAtt.setOffsets(offsetMap.subList(su.begin, su.end + 1));
+        offsetAtt.setOffset(correctOffset(su.begin), correctOffset(su.end));
     }
 
     private void setOOVAttribute() {
-        int offset = oovChars.offset();
         posLengthAtt.setPositionLength(1);
         if (oovChars.index() == 0) {
             posIncAtt.setPositionIncrement(0);
         } else {
             posIncAtt.setPositionIncrement(1);
         }
+
+        int startOffset = oovChars.offset();
         char c = oovChars.next();
         termAtt.setEmpty().append(c);
         if (Character.isSurrogate(c) && oovChars.hasNext()) {
             termAtt.append(oovChars.next());
-            offsetAtt.setOffset(offset, offset + 2);
-        } else {
-            offsetAtt.setOffset(offset, offset + 1);
         }
+        int endOffset = oovChars.offset();
+        offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
+    }
+
+    static class OovChars {
+        private int reserved;
+        private char[] buffer = new char[0];
+        private int length;
+        private int index;
+
+        public void setOov(char[] src, int length) {
+            this.length = length;
+            if (reserved < length) {
+                buffer = new char[length];
+                reserved = length;
+            }
+            System.arraycopy(src, 0, buffer, 0, length);
+            index = 0;
+        }
+
+        public boolean hasNext() {
+            return index < length;
+        }
+
+        public char next() {
+            if (index < length) {
+                return buffer[index++];
+            }
+            throw new IllegalStateException();
+        }
+
+        public int index() {
+            return index;
+        }
+
+        public int offset() {
+            return index;
+        }
+    }
+
+    static class MorphemeSubunits {
+        static class Subunit {
+            final Morpheme morpheme;
+            final int begin;
+            final int end;
+
+            public Subunit(Morpheme morpheme, int begin, int end) {
+                this.morpheme = morpheme;
+                this.begin = begin;
+                this.end = end;
+            }
+        }
+
+        private List<Morpheme> morphemes;
+        private int size;
+        private int index;
+        private int baseOffset;
+
+        public void setUnits(List<Morpheme> morphemes) {
+            this.morphemes = morphemes;
+            size = morphemes.size();
+            index = 0;
+            baseOffset = morphemes.get(0).begin();
+        }
+
+        public boolean hasNext() {
+            return index < size;
+        }
+
+        public Subunit next() {
+            if (!hasNext()) {
+                throw new IllegalStateException();
+            }
+            Morpheme m = morphemes.get(index++);
+            return new Subunit(m, m.begin() - baseOffset, m.end() - baseOffset);
+        }
+
+        public int size() {
+            return size;
+        }
+
+        public int index() {
+            return index;
+        }
     }
 }
\ No newline at end of file
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt
index 6bfbb0eb..ac1b3746 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt
@@ -55,11 +55,12 @@ class SudachiTokenizer(
   override fun incrementToken(): Boolean {
     clearAttributes()
     var m = iterator.next() ?: return false
+    val baseOffset = iterator.baseOffset
 
     morphemeAtt.setMorpheme(m)
-    posLenAtt.positionLength = 1
-    posIncAtt.positionIncrement = 1
-    val baseOffset = iterator.baseOffset
+    morphemeAtt.setOffsets((m.begin()..m.end()).map { i -> correctOffset(baseOffset + i) })
+    posLenAtt.setPositionLength(1)
+    posIncAtt.setPositionIncrement(1)
     offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end()))
     termAtt.setEmpty().append(m.surface())
     return true
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt
index 9b494c19..472a7710 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt
@@ -25,10 +25,14 @@ import org.apache.lucene.util.AttributeImpl
 import org.apache.lucene.util.AttributeReflector
 
 class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute {
-  private var morpheme: MorphemeWrapper? = null
+  private var morpheme: Morpheme? = null
+  // mapping from the character offset to the original reader offset
+  private var offsetMap: List<Int> = listOf()
 
-  private class MorphemeWrapper(morpheme: Morpheme) : ToXContent {
+  // wrapper class to make the data ToXContent-able
+  private class ToXContentWrapper(morpheme: Morpheme, offsetMap: List<Int>) : ToXContent {
     private val morpheme = morpheme
+    private val offsetMap = offsetMap
 
     override fun toXContent(builder: XContentBuilder, params: ToXContentParams): XContentBuilder {
       builder.value(
@@ -38,32 +42,43 @@ class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute {
           "normalizedForm" to morpheme.normalizedForm(),
           "readingForm" to morpheme.readingForm(),
           "partOfSpeech" to morpheme.partOfSpeech(),
+          "offsetMap" to offsetMap,
         ))
       return builder
     }
-
-    fun unwrap(): Morpheme {
-      return morpheme
-    }
   }
 
   override fun clear() {
     morpheme = null
+    offsetMap = listOf()
   }
 
   override fun reflectWith(reflector: AttributeReflector) {
-    reflector.reflect<MorphemeAttribute>("morpheme", morpheme)
+    // show only when a morpheme is set
+    reflector.reflect<MorphemeAttribute>(
+        "morpheme", morpheme?.let { m -> ToXContentWrapper(m, offsetMap) })
   }
 
   override fun copyTo(target: AttributeImpl?) {
-    (target as? MorphemeAttributeImpl)?.let { it.setMorpheme(getMorpheme()) }
+    (target as? MorphemeAttributeImpl)?.let {
+      it.setMorpheme(getMorpheme())
+      it.setOffsets(getOffsets())
+    }
   }
 
   override fun getMorpheme(): Morpheme? {
-    return morpheme?.let { m -> m.unwrap() }
+    return morpheme
   }
 
   override fun setMorpheme(morpheme: Morpheme?) {
-    this.morpheme = morpheme?.let { m -> MorphemeWrapper(m) }
+    this.morpheme = morpheme
+  }
+
+  override fun getOffsets(): List<Int> {
+    return offsetMap
+  }
+
+  override fun setOffsets(offsets: List<Int>) {
+    this.offsetMap = offsets
   }
 }
diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt
index eafa8a3c..493ca443 100644
--- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt
+++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Works Applications Co., Ltd.
+ * Copyright (c) 2022-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -196,6 +196,69 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() {
     )
   }
 
+  @Test
+  fun testWithCharNormalizationBySearchMode() {
+    val tokenStream = setUpTokenStream("search", "六三四㍿に行くカ゛カ゛カ゛")
+    assertTokenStreamContents(
+        tokenStream,
+        arrayOf("六三四", "㍿", "㍿", "", "に", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"),
+        intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
+        intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
+        intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
+        intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),
+        13,
+    )
+  }
+
+  @Test
+  fun testWithCharNormalizationInNormalizedFormBySearchMode() {
+    var tokenStream = setUpTokenStream("search", "六三四㍿に行くカ゛カ゛カ゛")
+    val normFactory = SudachiNormalizedFormFilterFactory(mutableMapOf())
+    tokenStream = normFactory.create(tokenStream)
+
+    assertTokenStreamContents(
+        tokenStream,
+        arrayOf("六三四", "株式会社", "株式", "会社", "に", "行く", "ガガガ", "ガガ", "ガ"),
+        intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
+        intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
+        intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
+        intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),
+        13,
+    )
+  }
+
+  @Test
+  fun testWithCharNormalizationByExtendedMode() {
+    // extending the normalized form seems more natural, but we cannot calculate offsets for it.
+    val tokenStream = setUpTokenStream("extended", "10㌢㍍いったソ゛")
+    assertTokenStreamContents(
+        tokenStream,
+        arrayOf("1", "0", "㌢㍍", "㌢", "㍍", "いっ", "た", "ソ゛", "ソ", "゛"),
+        intArrayOf(0, 1, 2, 2, 3, 4, 6, 7, 7, 8),
+        intArrayOf(1, 2, 4, 3, 4, 6, 7, 9, 8, 9),
+        intArrayOf(1, 1, 1, 0, 1, 1, 1, 1, 0, 1),
+        intArrayOf(1, 1, 2, 1, 1, 1, 1, 2, 1, 1),
+        9,
+    )
+  }
+
+  @Test
+  fun testWithCharNormalizationInNormalizedFormByExtendedMode() {
+    // extending the normalized form seems more natural, but we cannot calculate offsets for it.
+    var tokenStream = setUpTokenStream("extended", "10㌢㍍いったソ゛")
+    val normFactory = SudachiNormalizedFormFilterFactory(mutableMapOf())
+    tokenStream = normFactory.create(tokenStream)
+
+    assertTokenStreamContents(
+        tokenStream,
+        arrayOf("1", "0", "センチメートル", "㌢", "㍍", "行く", "た", "ゾ", "ソ", "゛"),
+        intArrayOf(0, 1, 2, 2, 3, 4, 6, 7, 7, 8),
+        intArrayOf(1, 2, 4, 3, 4, 6, 7, 9, 8, 9),
+        intArrayOf(1, 1, 1, 0, 1, 1, 1, 1, 0, 1),
+        intArrayOf(1, 1, 2, 1, 1, 1, 1, 2, 1, 1),
+        9)
+  }
+
   fun setUpTokenStream(mode: String, input: String): TokenStream {
     val factory =
         SudachiSplitFilterFactory(
diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt
index a630dafa..373302dd 100644
--- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt
+++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt
@@ -16,7 +16,6 @@
 
 package com.worksap.nlp.lucene.sudachi.ja.attributes
 
-import com.worksap.nlp.lucene.aliases.ToXContent
 import com.worksap.nlp.lucene.aliases.XContentBuilder
 import com.worksap.nlp.search.aliases.XContentType
 import com.worksap.nlp.sudachi.Config
@@ -65,6 +64,19 @@ class MorphemeAttributeImplTest {
     assertNull(morphemeAtt.getMorpheme())
   }
 
+  @Test
+  fun setOffsets() {
+    var morphemeAtt = MorphemeAttributeImpl()
+    assertTrue(morphemeAtt.getOffsets().isEmpty())
+
+    val intlist = listOf(1, 2, 3)
+    morphemeAtt.setOffsets(intlist)
+    assertEquals(intlist, morphemeAtt.getOffsets())
+
+    morphemeAtt.setOffsets(listOf())
+    assertTrue(morphemeAtt.getOffsets().isEmpty())
+  }
+
   @Test
   fun copyTo() {
     var morphemeAtt1 = MorphemeAttributeImpl()
@@ -85,15 +97,14 @@ class MorphemeAttributeImplTest {
     var morphemeAtt = MorphemeAttributeImpl()
     val morpheme = getFirstMorpheme("東京都")!!
     morphemeAtt.setMorpheme(morpheme)
+    val offsets = listOf(0, 1, 2, 3)
+    morphemeAtt.setOffsets(offsets)
 
     val builder = XContentBuilder.builder(XContentType.JSON.xContent())
     builder.startObject()
     morphemeAtt.reflectWith(
         fun(attClass, key, value) {
           assertEquals(MorphemeAttribute::class.java, attClass)
-          assertEquals("morpheme", key)
-          assertTrue(value is ToXContent)
-
           builder.field(key, value)
         })
     builder.endObject()
@@ -108,10 +119,33 @@ class MorphemeAttributeImplTest {
     assertEquals(morpheme.normalizedForm(), deserialized.morpheme.normalizedForm)
     assertEquals(morpheme.readingForm(), deserialized.morpheme.readingForm)
     assertEquals(morpheme.partOfSpeech(), deserialized.morpheme.partOfSpeech)
+    assertEquals(offsets, deserialized.morpheme.offsetMap)
+  }
+
+  @Test
+  fun toXContentNullMorpheme() {
+    var morphemeAtt = MorphemeAttributeImpl()
+
+    val builder = XContentBuilder.builder(XContentType.JSON.xContent())
+    builder.startObject()
+    morphemeAtt.reflectWith(
+        fun(attClass, key, value) {
+          assertEquals(MorphemeAttribute::class.java, attClass)
+          builder.field(key, value)
+        })
+    builder.endObject()
+    builder.flush()
+
+    val serialized = builder.getOutputStream().toString()
+    val deserialized = Json.decodeFromString<MorphemeHolder>(serialized)
+    assertNull(deserialized.morpheme)
   }
 }
 
-@Serializable data class MorphemeHolder(val morpheme: MorphemeAttributeHolder)
+@Serializable
+data class MorphemeHolder(
+    val morpheme: MorphemeAttributeHolder?,
+)
 
 @Serializable
 data class MorphemeAttributeHolder(
@@ -120,4 +154,5 @@ data class MorphemeAttributeHolder(
     val normalizedForm: String,
     val readingForm: String,
     val partOfSpeech: List<String>,
+    val offsetMap: List<Int>,
 )
diff --git a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json
index 4f76af79..3042b87a 100644
--- a/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json
+++ b/src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json
@@ -1,5 +1,8 @@
 {
     "systemDict" : "system_core.dic",
+    "inputTextPlugin" : [
+        { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" }
+    ],
     "oovProviderPlugin" : [
         { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
           "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv
index a814dc30..1898e45c 100644
--- a/src/test/resources/dict/lex.csv
+++ b/src/test/resources/dict/lex.csv
@@ -37,4 +37,10 @@
 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-30000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,*
 特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,*
 な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,A,*,*,*,*
-ふく,4,4,5105,ふく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,フク,吹く,*,A,*,*,*,*
\ No newline at end of file
+ふく,4,4,5105,ふく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,フク,吹く,*,A,*,*,*,*
+株式,8,8,5611,株式,名詞,普通名詞,一般,*,*,*,カブシキ,株式,*,A,*,*,*,*
+会社,8,8,2914,会社,名詞,普通名詞,一般,*,*,*,カイシャ,会社,*,A,*,*,*,*
+株式会社,8,8,6000,株式会社,名詞,普通名詞,一般,*,*,*,カブシキガイシャ,株式会社,*,C,40/41,40/41,40/41,*
+ガ,5,5,3500,ガ,副詞,*,*,*,*,*,ガ,ガ,*,A,*,*,*,*
+ガガ,5,5,5500,ガガ,副詞,*,*,*,*,*,ガガ,ガガ,*,A,*,*,*,*
+ガガガ,5,5,8494,ガガガ,副詞,*,*,*,*,*,ガガガ,ガガガ,*,B,44/43,*,44/43,*
\ No newline at end of file
diff --git a/test-scripts/01-integration-test.py b/test-scripts/01-integration-test.py
index ef51df14..3e2468f9 100644
--- a/test-scripts/01-integration-test.py
+++ b/test-scripts/01-integration-test.py
@@ -142,6 +142,122 @@ def test_icu_filtered_stuff_is_not_trimmed(self):
         self.assertEqual(5, tokens[0]["end_offset"])
         return
 
+    def test_correct_split_offset_with_icu_filter(self):
+        body = {
+            "tokenizer": "sudachi_tokenizer",
+            "char_filter": {
+                "type": "icu_normalizer",
+                "name": "nfkc_cf",
+                "mode": "compose"
+            },
+            "filter": {
+                "type": "sudachi_split",
+                "mode": "search"
+            },
+            "text": "六三四㍿のアッフ\u309Aルハ\u309Aイ",
+        }
+        resp = es_instance.analyze(body)
+        self.assertEqual(200, resp.status, f"data: {resp.data}")
+
+        tokens = json.loads(resp.data.decode())["tokens"]
+        self.assertEqual(8, len(tokens))
+        self.assertEqual("株式会社", tokens[1]["token"])
+        self.assertEqual(1, tokens[1]["position"])
+        self.assertEqual(2, tokens[1]["positionLength"])
+        self.assertEqual(3, tokens[1]["start_offset"])
+        self.assertEqual(4, tokens[1]["end_offset"])
+
+        self.assertEqual("株式", tokens[2]["token"])
+        self.assertEqual(1, tokens[2]["position"])
+        self.assertEqual(3, tokens[2]["start_offset"])
+        self.assertEqual(3, tokens[2]["end_offset"])
+        self.assertEqual("会社", tokens[3]["token"])
+        self.assertEqual(2, tokens[3]["position"])
+        self.assertEqual(3, tokens[3]["start_offset"])
+        self.assertEqual(4, tokens[3]["end_offset"])
+
+        self.assertEqual("アップルパイ", tokens[5]["token"])
+        self.assertEqual(4, tokens[5]["position"])
+        self.assertEqual(2, tokens[5]["positionLength"])
+        self.assertEqual(5, tokens[5]["start_offset"])
+        self.assertEqual(13, tokens[5]["end_offset"])
+
+        self.assertEqual("アップル", tokens[6]["token"])
+        self.assertEqual(4, tokens[6]["position"])
+        self.assertEqual(5, tokens[6]["start_offset"])
+        self.assertEqual(10, tokens[6]["end_offset"])
+        self.assertEqual("パイ", tokens[7]["token"])
+        self.assertEqual(5, tokens[7]["position"])
+        self.assertEqual(10, tokens[7]["start_offset"])
+        self.assertEqual(13, tokens[7]["end_offset"])
+        return
+
+    def test_correct_OOV_offset_with_icu_filter(self):
+        body = {
+            "tokenizer": "sudachi_tokenizer",
+            "char_filter": {
+                "type": "icu_normalizer",
+                "name": "nfkc_cf",
+                "mode": "compose"
+            },
+            "filter": {
+                "type": "sudachi_split",
+                "mode": "extended"
+            },
+            "text": "10㍉㌢進んでホ\u3099ホ\u3099ホ\u3099",
+        }
+        resp = es_instance.analyze(body)
+        self.assertEqual(200, resp.status, f"data: {resp.data}")
+
+        tokens = json.loads(resp.data.decode())["tokens"]
+        self.assertEqual(13, len(tokens))
+        self.assertEqual("ミリセンチ", tokens[1]["token"])
+        self.assertEqual(1, tokens[1]["position"])
+        self.assertEqual(5, tokens[1]["positionLength"])
+        self.assertEqual(2, tokens[1]["start_offset"])
+        self.assertEqual(4, tokens[1]["end_offset"])
+
+        self.assertEqual("ミ", tokens[2]["token"])
+        self.assertEqual(1, tokens[2]["position"])
+        self.assertEqual(2, tokens[2]["start_offset"])
+        self.assertEqual(2, tokens[2]["end_offset"])
+        self.assertEqual("リ", tokens[3]["token"])
+        self.assertEqual(2, tokens[3]["position"])
+        self.assertEqual(2, tokens[3]["start_offset"])
+        self.assertEqual(3, tokens[3]["end_offset"])
+        self.assertEqual("セ", tokens[4]["token"])
+        self.assertEqual(3, tokens[4]["position"])
+        self.assertEqual(3, tokens[4]["start_offset"])
+        self.assertEqual(3, tokens[4]["end_offset"])
+        self.assertEqual("ン", tokens[5]["token"])
+        self.assertEqual(4, tokens[5]["position"])
+        self.assertEqual(3, tokens[5]["start_offset"])
+        self.assertEqual(3, tokens[5]["end_offset"])
+        self.assertEqual("チ", tokens[6]["token"])
+        self.assertEqual(5, tokens[6]["position"])
+        self.assertEqual(3, tokens[6]["start_offset"])
+        self.assertEqual(4, tokens[6]["end_offset"])
+
+        self.assertEqual("ボボボ", tokens[9]["token"])
+        self.assertEqual(8, tokens[9]["position"])
+        self.assertEqual(3, tokens[9]["positionLength"])
+        self.assertEqual(7, tokens[9]["start_offset"])
+        self.assertEqual(13, tokens[9]["end_offset"])
+
+        self.assertEqual("ボ", tokens[10]["token"])
+        self.assertEqual(8, tokens[10]["position"])
+        self.assertEqual(7, tokens[10]["start_offset"])
+        self.assertEqual(9, tokens[10]["end_offset"])
+        self.assertEqual("ボ", tokens[11]["token"])
+        self.assertEqual(9, tokens[11]["position"])
+        self.assertEqual(9, tokens[11]["start_offset"])
+        self.assertEqual(11, tokens[11]["end_offset"])
+        self.assertEqual("ボ", tokens[12]["token"])
+        self.assertEqual(10, tokens[12]["position"])
+        self.assertEqual(11, tokens[12]["start_offset"])
+        self.assertEqual(13, tokens[12]["end_offset"])
+        return
+
 
 class TestSubplugin(unittest.TestCase):
     # requires :subplugin is installed with :testlib
diff --git a/test-scripts/20-put-docs.py b/test-scripts/20-put-docs.py
index 725ba562..4c121f4a 100644
--- a/test-scripts/20-put-docs.py
+++ b/test-scripts/20-put-docs.py
@@ -1,6 +1,6 @@
 import argparse
 from multiprocessing import Pool
-import urllib3.request
+import urllib3
 import json
 from pathlib import Path
 
diff --git a/test-scripts/30-test-docs.py b/test-scripts/30-test-docs.py
index 9dd538b1..e10d02b2 100644
--- a/test-scripts/30-test-docs.py
+++ b/test-scripts/30-test-docs.py
@@ -1,6 +1,6 @@
 import argparse
 from multiprocessing import Pool
-import urllib3.request
+import urllib3
 import json
 from pathlib import Path
 
@@ -31,7 +31,7 @@ def run(self):
 
     def test57Games(self):
         games = self.es.find("ゲーム")
-        self.assertEq(57, games["hits"]["total"]["value"])
+        self.assertEq(59, games["hits"]["total"]["value"])
 
     def test107Daigaku(self):
         docs = self.es.find("大学")
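
Reviewer note (not part of the patch): the offset bookkeeping above is easiest to see with concrete numbers. The sketch below is a minimal, standalone illustration — `correctOffset` mirrors `SudachiSplitFilter#correctOffset`, and the `offsetMap` values are invented to match the `六三四㍿...` case from `test_correct_split_offset_with_icu_filter`, where the ICU char filter expands the single character `㍿` (original offsets [3, 4)) into `株式会社` (four normalized characters). This is not the plugin's API, just the arithmetic it performs.

```java
import java.util.List;

// Standalone sketch of the per-morpheme offset map added in this patch.
// The tokenizer stores, for each token-local character offset 0..length,
// the corrected offset in the original (pre-char-filter) reader input.
public class OffsetMapSketch {

    // Mirrors SudachiSplitFilter#correctOffset: translate a token-local
    // offset into an original-input offset via the stored map.
    static int correctOffset(List<Integer> offsetMap, int localOff) {
        return offsetMap.get(localOff);
    }

    public static void main(String[] args) {
        // Hypothetical map for the token "株式会社" that replaced "㍿" at
        // original offsets [3, 4): all interior positions collapse to 3.
        List<Integer> offsetMap = List.of(3, 3, 3, 3, 4);

        // Whole token: start=3, end=4 (matches the tokens[1] assertions).
        System.out.println(correctOffset(offsetMap, 0) + ".." + correctOffset(offsetMap, 4));

        // A-unit "株式" covers local range [0, 2] -> start=3, end=3; its
        // sub-map is offsetMap.subList(begin, end + 1), as in setAUnitAttribute.
        List<Integer> sub = offsetMap.subList(0, 3);
        System.out.println(sub.get(0) + ".." + sub.get(sub.size() - 1));

        // A-unit "会社" covers local range [2, 4] -> start=3, end=4.
        System.out.println(correctOffset(offsetMap, 2) + ".." + correctOffset(offsetMap, 4));
    }
}
```

This also shows why zero-width subunits can appear (the `""` token in `testWithCharNormalizationBySearchMode`): when a normalization expansion collapses back, a subunit's start and end may correct to the same original offset.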