Skip to content

Commit

Permalink
Merge pull request #149 from WorksApplications/fix/148-correct-offset
Browse files Browse the repository at this point in the history
Add offset correction for split filter
  • Loading branch information
mh-northlander authored Nov 11, 2024
2 parents 59f9b99 + a598575 commit ee664ba
Show file tree
Hide file tree
Showing 12 changed files with 422 additions and 102 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ jobs:
- 'os:2.6.0'
env:
mainJob: ${{ matrix.es-version == 'es:8.15.2' }}
sudachiVersion: 20241021
sudachiKind: core
continue-on-error: true

steps:
Expand Down Expand Up @@ -93,15 +95,16 @@ jobs:
- name: Cache dictionary download
uses: actions/cache@v4
with:
path: build/integration/sudachi-dictionary-20230110-small.zip
key: sudachi-dictionary-20230110
path: build/integration/sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}.zip
key: sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}
- name: Integration test
env:
ES_KIND: ${{ env.ENGINE_KIND }}
ES_VERSION: ${{ env.ENGINE_VERSION }}
PLUGIN_VERSION: ${{ env.PROJ_VERSION }}
RUN_ES_DAEMON: 1
DIC_VERSION: 20230110
DIC_VERSION: ${{ env.sudachiVersion }}
DIC_KIND: ${{ env.sudachiKind }}
run: |
bash test-scripts/00-install-elasticsearch.sh
sleep 30
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023 Works Applications Co., Ltd.
* Copyright (c) 2023-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,6 +16,8 @@

package com.worksap.nlp.lucene.sudachi.ja.attributes;

import java.util.List;

import com.worksap.nlp.sudachi.Morpheme;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Attribute;
Expand All @@ -36,4 +38,17 @@ public interface MorphemeAttribute extends Attribute {
* new object
*/
void setMorpheme(Morpheme morpheme);

/**
* @return The offset mapping for the current morpheme
*/
List<Integer> getOffsets();

/**
* Set the offset mapping for the morpheme
*
* @param offsets
*            the actual offset in the original text for each character position within the morpheme
*/
void setOffsets(List<Integer> offsets);
}
213 changes: 138 additions & 75 deletions src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import com.worksap.nlp.lucene.sudachi.ja.attributes.*;
import com.worksap.nlp.lucene.sudachi.ja.util.Strings;
import com.worksap.nlp.sudachi.Morpheme;

import com.worksap.nlp.sudachi.Tokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
Expand All @@ -40,56 +39,18 @@ public enum Mode {

public static final Mode DEFAULT_MODE = Mode.SEARCH;

static class OovChars {
private int length;
private char[] buffer = new char[0];
private int reserved;
private int index;
private int baseOffset;

public void setOov(int offset, char[] src, int length) {
baseOffset = offset;
this.length = length;
if (reserved < length) {
buffer = new char[length];
reserved = length;
}
System.arraycopy(src, 0, buffer, 0, length);
index = 0;
}

public boolean hasNext() {
return index < length;
}

public char next() {
if (index < length) {
return buffer[index++];
} else {
throw new IllegalStateException();
}
}

public int index() {
return index;
}

public int offset() {
return baseOffset + index;
}
}

private final Mode mode;
private final Tokenizer.SplitMode splitMode;

private final CharTermAttribute termAtt;
private final OffsetAttribute offsetAtt;
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLengthAtt;
private final MorphemeAttribute morphemeAtt;
private ListIterator<Morpheme> aUnitIterator;
private final OovChars oovChars = new OovChars();

private int aUnitOffset = 0;
private final MorphemeSubunits subunits = new MorphemeSubunits();
private final OovChars oovChars = new OovChars();
private List<Integer> offsetMap;

public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode splitMode) {
super(input);
Expand All @@ -105,72 +66,174 @@ public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode spli

@Override
public final boolean incrementToken() throws IOException {
// continue to write current split
if (oovChars.hasNext()) {
clearAttributes();
setOOVAttribute();
return true;
}
if (aUnitIterator != null && aUnitIterator.hasNext()) {
if (subunits.hasNext()) {
clearAttributes();
setAUnitAttribute(aUnitIterator.next());
setAUnitAttribute();
return true;
}

// move to next morpheme
if (!input.incrementToken()) {
return false;
}

Morpheme m = morphemeAtt.getMorpheme();
this.offsetMap = morphemeAtt.getOffsets();
if (m == null) {
return true;
}

if (input.incrementToken()) {
// oov does not have splits
// split into characters in extended mode
if (m.isOOV()) {
int length = 0;
Morpheme m = morphemeAtt.getMorpheme();
if (m == null) {
return true;
}
termAtt.setEmpty().append(m.surface());
if (mode == Mode.EXTENDED && m.isOOV() && (length = Strings.codepointCount(termAtt)) > 1) {
oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length());
if (mode == Mode.EXTENDED && (length = Strings.codepointCount(termAtt)) > 1) {
// OovChars requires character length
oovChars.setOov(termAtt.buffer(), termAtt.length());
// Position length should be codepoint length
posLengthAtt.setPositionLength(length);
} else if (splitMode != Tokenizer.SplitMode.C) {
List<Morpheme> subUnits = m.split(splitMode);
if (subUnits.size() > 1) {
aUnitIterator = subUnits.listIterator();
aUnitOffset = offsetAtt.startOffset();
posLengthAtt.setPositionLength(subUnits.size());
} else {
posLengthAtt.setPositionLength(1);
}
}
return true;
} else {
return false;
}

// C split is the longest split
if (splitMode == Tokenizer.SplitMode.C) {
return true;
}

// split into A/B units
List<Morpheme> subsplits = m.split(splitMode);
if (subsplits.size() > 1) {
subunits.setUnits(subsplits);
posLengthAtt.setPositionLength(subunits.size());
}

return true;
}

/**
 * Maps a character offset relative to the current morpheme's surface to the
 * corresponding offset in the original input text, using the offset mapping
 * captured from the upstream tokenizer.
 *
 * @param charOffset
 *            offset within the current morpheme; valid values are
 *            {@code 0 <= charOffset < offsetMap.size()}
 * @return the corrected offset in the original text
 * @throws IndexOutOfBoundsException
 *             if {@code charOffset} is outside the mapping (indicates a bug
 *             in the split logic upstream)
 */
private int correctOffset(int charOffset) {
    // NOTE(review): offsetMap is set from MorphemeAttribute#getOffsets() in
    // incrementToken(); List.get already range-checks, so no extra guard here.
    return this.offsetMap.get(charOffset);
}

private void setAUnitAttribute(Morpheme morpheme) {
private void setAUnitAttribute() {
posLengthAtt.setPositionLength(1);
if (aUnitIterator.previousIndex() == 0) {
if (subunits.index() == 0) {
posIncAtt.setPositionIncrement(0);
} else {
posIncAtt.setPositionIncrement(1);
}
int length = morpheme.end() - morpheme.begin();
offsetAtt.setOffset(aUnitOffset, aUnitOffset + length);
aUnitOffset += length;
morphemeAtt.setMorpheme(morpheme);
termAtt.setEmpty().append(morpheme.surface());

MorphemeSubunits.Subunit su = subunits.next();
termAtt.setEmpty().append(su.morpheme.surface());
morphemeAtt.setMorpheme(su.morpheme);
morphemeAtt.setOffsets(offsetMap.subList(su.begin, su.end + 1));
offsetAtt.setOffset(correctOffset(su.begin), correctOffset(su.end));
}

private void setOOVAttribute() {
int offset = oovChars.offset();
posLengthAtt.setPositionLength(1);
if (oovChars.index() == 0) {
posIncAtt.setPositionIncrement(0);
} else {
posIncAtt.setPositionIncrement(1);
}

int startOffset = oovChars.offset();
char c = oovChars.next();
termAtt.setEmpty().append(c);
if (Character.isSurrogate(c) && oovChars.hasNext()) {
termAtt.append(oovChars.next());
offsetAtt.setOffset(offset, offset + 2);
} else {
offsetAtt.setOffset(offset, offset + 1);
}
int endOffset = oovChars.offset();
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
}

/**
 * Iterator-like holder for the surface characters of an out-of-vocabulary
 * token, consumed one {@code char} at a time in EXTENDED mode. The backing
 * array is reused across tokens and only grows (never shrinks).
 */
static class OovChars {
    private char[] chars = new char[0];
    private int capacity;
    private int len;
    private int pos;

    /** Loads the characters of the next OOV token and rewinds the cursor. */
    public void setOov(char[] src, int length) {
        len = length;
        if (capacity < length) {
            // Grow-only reallocation keeps per-token garbage low.
            chars = new char[length];
            capacity = length;
        }
        System.arraycopy(src, 0, chars, 0, length);
        pos = 0;
    }

    /** @return {@code true} while unread characters remain */
    public boolean hasNext() {
        return pos < len;
    }

    /** @return the next character, advancing the cursor */
    public char next() {
        if (pos >= len) {
            throw new IllegalStateException();
        }
        return chars[pos++];
    }

    /** @return number of characters consumed so far */
    public int index() {
        return pos;
    }

    /** @return current character offset relative to the token start */
    public int offset() {
        return pos;
    }
}

/**
 * Iterator-like holder for the A/B-unit sub-morphemes of a split morpheme.
 * Each yielded {@link Subunit} carries character offsets relative to the
 * beginning of the parent morpheme, so callers can index into the parent's
 * offset mapping.
 */
static class MorphemeSubunits {
    /** A sub-morpheme plus its [begin, end) range relative to the parent. */
    static class Subunit {
        final Morpheme morpheme;
        final int begin;
        final int end;

        public Subunit(Morpheme morpheme, int begin, int end) {
            this.morpheme = morpheme;
            this.begin = begin;
            this.end = end;
        }
    }

    private List<Morpheme> units;
    private int count;
    private int cursor;
    // Character offset of the parent morpheme's start in the full text;
    // subtracted so Subunit offsets are parent-relative.
    private int origin;

    /** Installs a new list of sub-morphemes and rewinds the cursor. */
    public void setUnits(List<Morpheme> morphemes) {
        units = morphemes;
        count = morphemes.size();
        cursor = 0;
        origin = morphemes.get(0).begin();
    }

    /** @return {@code true} while unread subunits remain */
    public boolean hasNext() {
        return cursor < count;
    }

    /** @return the next subunit with parent-relative offsets */
    public Subunit next() {
        if (!hasNext()) {
            throw new IllegalStateException();
        }
        Morpheme m = units.get(cursor++);
        return new Subunit(m, m.begin() - origin, m.end() - origin);
    }

    /** @return total number of subunits */
    public int size() {
        return count;
    }

    /** @return number of subunits consumed so far */
    public int index() {
        return cursor;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,12 @@ class SudachiTokenizer(
override fun incrementToken(): Boolean {
clearAttributes()
var m = iterator.next() ?: return false
val baseOffset = iterator.baseOffset

morphemeAtt.setMorpheme(m)
posLenAtt.positionLength = 1
posIncAtt.positionIncrement = 1
val baseOffset = iterator.baseOffset
morphemeAtt.setOffsets((m.begin()..m.end()).map { i -> correctOffset(baseOffset + i) })
posLenAtt.setPositionLength(1)
posIncAtt.setPositionIncrement(1)
offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end()))
termAtt.setEmpty().append(m.surface())
return true
Expand Down
Loading

0 comments on commit ee664ba

Please sign in to comment.