Skip to content

Commit

Permalink
Merge pull request #149 from WorksApplications/fix/148-correct-offset
Browse files Browse the repository at this point in the history
Add offset correction for split filter
  • Loading branch information
mh-northlander authored Nov 11, 2024
2 parents 59f9b99 + a598575 commit ee664ba
Show file tree
Hide file tree
Showing 12 changed files with 422 additions and 102 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ jobs:
- 'os:2.6.0'
env:
mainJob: ${{ matrix.es-version == 'es:8.15.2' }}
sudachiVersion: 20241021
sudachiKind: core
continue-on-error: true

steps:
Expand Down Expand Up @@ -93,15 +95,16 @@ jobs:
- name: Cache dictionary download
uses: actions/cache@v4
with:
path: build/integration/sudachi-dictionary-20230110-small.zip
key: sudachi-dictionary-20230110
path: build/integration/sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}.zip
key: sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}
- name: Integration test
env:
ES_KIND: ${{ env.ENGINE_KIND }}
ES_VERSION: ${{ env.ENGINE_VERSION }}
PLUGIN_VERSION: ${{ env.PROJ_VERSION }}
RUN_ES_DAEMON: 1
DIC_VERSION: 20230110
DIC_VERSION: ${{ env.sudachiVersion }}
DIC_KIND: ${{ env.sudachiKind }}
run: |
bash test-scripts/00-install-elasticsearch.sh
sleep 30
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023 Works Applications Co., Ltd.
* Copyright (c) 2023-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,6 +16,8 @@

package com.worksap.nlp.lucene.sudachi.ja.attributes;

import java.util.List;

import com.worksap.nlp.sudachi.Morpheme;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Attribute;
Expand All @@ -36,4 +38,17 @@ public interface MorphemeAttribute extends Attribute {
* new object
*/
void setMorpheme(Morpheme morpheme);

/**
* @return The offset mapping for the current morpheme
*/
List<Integer> getOffsets();

/**
* Set the offset mapping for the morpheme
*
* @param offsets
*            the actual offset in the original text for each character position within the morpheme
*/
void setOffsets(List<Integer> offsets);
}
213 changes: 138 additions & 75 deletions src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import com.worksap.nlp.lucene.sudachi.ja.attributes.*;
import com.worksap.nlp.lucene.sudachi.ja.util.Strings;
import com.worksap.nlp.sudachi.Morpheme;

import com.worksap.nlp.sudachi.Tokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
Expand All @@ -40,56 +39,18 @@ public enum Mode {

public static final Mode DEFAULT_MODE = Mode.SEARCH;

static class OovChars {
private int length;
private char[] buffer = new char[0];
private int reserved;
private int index;
private int baseOffset;

public void setOov(int offset, char[] src, int length) {
baseOffset = offset;
this.length = length;
if (reserved < length) {
buffer = new char[length];
reserved = length;
}
System.arraycopy(src, 0, buffer, 0, length);
index = 0;
}

public boolean hasNext() {
return index < length;
}

public char next() {
if (index < length) {
return buffer[index++];
} else {
throw new IllegalStateException();
}
}

public int index() {
return index;
}

public int offset() {
return baseOffset + index;
}
}

private final Mode mode;
private final Tokenizer.SplitMode splitMode;

private final CharTermAttribute termAtt;
private final OffsetAttribute offsetAtt;
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLengthAtt;
private final MorphemeAttribute morphemeAtt;
private ListIterator<Morpheme> aUnitIterator;
private final OovChars oovChars = new OovChars();

private int aUnitOffset = 0;
private final MorphemeSubunits subunits = new MorphemeSubunits();
private final OovChars oovChars = new OovChars();
private List<Integer> offsetMap;

public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode splitMode) {
super(input);
Expand All @@ -105,72 +66,174 @@ public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode spli

@Override
public final boolean incrementToken() throws IOException {
// continue to write current split
if (oovChars.hasNext()) {
clearAttributes();
setOOVAttribute();
return true;
}
if (aUnitIterator != null && aUnitIterator.hasNext()) {
if (subunits.hasNext()) {
clearAttributes();
setAUnitAttribute(aUnitIterator.next());
setAUnitAttribute();
return true;
}

// move to next morpheme
if (!input.incrementToken()) {
return false;
}

Morpheme m = morphemeAtt.getMorpheme();
this.offsetMap = morphemeAtt.getOffsets();
if (m == null) {
return true;
}

if (input.incrementToken()) {
// oov does not have splits
// split into characters in extended mode
if (m.isOOV()) {
int length = 0;
Morpheme m = morphemeAtt.getMorpheme();
if (m == null) {
return true;
}
termAtt.setEmpty().append(m.surface());
if (mode == Mode.EXTENDED && m.isOOV() && (length = Strings.codepointCount(termAtt)) > 1) {
oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length());
if (mode == Mode.EXTENDED && (length = Strings.codepointCount(termAtt)) > 1) {
// OovChars requires character length
oovChars.setOov(termAtt.buffer(), termAtt.length());
// Position length should be codepoint length
posLengthAtt.setPositionLength(length);
} else if (splitMode != Tokenizer.SplitMode.C) {
List<Morpheme> subUnits = m.split(splitMode);
if (subUnits.size() > 1) {
aUnitIterator = subUnits.listIterator();
aUnitOffset = offsetAtt.startOffset();
posLengthAtt.setPositionLength(subUnits.size());
} else {
posLengthAtt.setPositionLength(1);
}
}
return true;
} else {
return false;
}

// C split is the longest split
if (splitMode == Tokenizer.SplitMode.C) {
return true;
}

// split into A/B units
List<Morpheme> subsplits = m.split(splitMode);
if (subsplits.size() > 1) {
subunits.setUnits(subsplits);
posLengthAtt.setPositionLength(subunits.size());
}

return true;
}

/**
 * Maps a character offset relative to the current morpheme's surface to the
 * corresponding offset in the original input text, using the offset mapping
 * captured from the upstream tokenizer.
 *
 * @param charOffset
 *            offset within the current morpheme; valid values are
 *            {@code 0 <= charOffset < offsetMap.size()}
 * @return the corrected offset in the original text
 * @throws IndexOutOfBoundsException
 *             if {@code charOffset} is outside the mapping (indicates a bug
 *             in the split logic upstream)
 */
private int correctOffset(int charOffset) {
    // NOTE(review): offsetMap is set from MorphemeAttribute#getOffsets() in
    // incrementToken(); List.get already range-checks, so no extra guard here.
    return this.offsetMap.get(charOffset);
}

private void setAUnitAttribute(Morpheme morpheme) {
private void setAUnitAttribute() {
posLengthAtt.setPositionLength(1);
if (aUnitIterator.previousIndex() == 0) {
if (subunits.index() == 0) {
posIncAtt.setPositionIncrement(0);
} else {
posIncAtt.setPositionIncrement(1);
}
int length = morpheme.end() - morpheme.begin();
offsetAtt.setOffset(aUnitOffset, aUnitOffset + length);
aUnitOffset += length;
morphemeAtt.setMorpheme(morpheme);
termAtt.setEmpty().append(morpheme.surface());

MorphemeSubunits.Subunit su = subunits.next();
termAtt.setEmpty().append(su.morpheme.surface());
morphemeAtt.setMorpheme(su.morpheme);
morphemeAtt.setOffsets(offsetMap.subList(su.begin, su.end + 1));
offsetAtt.setOffset(correctOffset(su.begin), correctOffset(su.end));
}

private void setOOVAttribute() {
int offset = oovChars.offset();
posLengthAtt.setPositionLength(1);
if (oovChars.index() == 0) {
posIncAtt.setPositionIncrement(0);
} else {
posIncAtt.setPositionIncrement(1);
}

int startOffset = oovChars.offset();
char c = oovChars.next();
termAtt.setEmpty().append(c);
if (Character.isSurrogate(c) && oovChars.hasNext()) {
termAtt.append(oovChars.next());
offsetAtt.setOffset(offset, offset + 2);
} else {
offsetAtt.setOffset(offset, offset + 1);
}
int endOffset = oovChars.offset();
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
}

/**
 * Iterator-like holder for the surface characters of an out-of-vocabulary
 * token, consumed one {@code char} at a time in EXTENDED mode. The backing
 * array is reused across tokens and only grows (never shrinks).
 */
static class OovChars {
    private char[] chars = new char[0];
    private int capacity;
    private int len;
    private int pos;

    /** Loads the characters of the next OOV token and rewinds the cursor. */
    public void setOov(char[] src, int length) {
        len = length;
        if (capacity < length) {
            // Grow-only reallocation keeps per-token garbage low.
            chars = new char[length];
            capacity = length;
        }
        System.arraycopy(src, 0, chars, 0, length);
        pos = 0;
    }

    /** @return {@code true} while unread characters remain */
    public boolean hasNext() {
        return pos < len;
    }

    /** @return the next character, advancing the cursor */
    public char next() {
        if (pos >= len) {
            throw new IllegalStateException();
        }
        return chars[pos++];
    }

    /** @return number of characters consumed so far */
    public int index() {
        return pos;
    }

    /** @return current character offset relative to the token start */
    public int offset() {
        return pos;
    }
}

/**
 * Iterator-like holder for the A/B-unit sub-morphemes of a split morpheme.
 * Each yielded {@link Subunit} carries character offsets relative to the
 * beginning of the parent morpheme, so callers can index into the parent's
 * offset mapping.
 */
static class MorphemeSubunits {
    /** A sub-morpheme plus its [begin, end) range relative to the parent. */
    static class Subunit {
        final Morpheme morpheme;
        final int begin;
        final int end;

        public Subunit(Morpheme morpheme, int begin, int end) {
            this.morpheme = morpheme;
            this.begin = begin;
            this.end = end;
        }
    }

    private List<Morpheme> units;
    private int count;
    private int cursor;
    // Character offset of the parent morpheme's start in the full text;
    // subtracted so Subunit offsets are parent-relative.
    private int origin;

    /** Installs a new list of sub-morphemes and rewinds the cursor. */
    public void setUnits(List<Morpheme> morphemes) {
        units = morphemes;
        count = morphemes.size();
        cursor = 0;
        origin = morphemes.get(0).begin();
    }

    /** @return {@code true} while unread subunits remain */
    public boolean hasNext() {
        return cursor < count;
    }

    /** @return the next subunit with parent-relative offsets */
    public Subunit next() {
        if (!hasNext()) {
            throw new IllegalStateException();
        }
        Morpheme m = units.get(cursor++);
        return new Subunit(m, m.begin() - origin, m.end() - origin);
    }

    /** @return total number of subunits */
    public int size() {
        return count;
    }

    /** @return number of subunits consumed so far */
    public int index() {
        return cursor;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,12 @@ class SudachiTokenizer(
override fun incrementToken(): Boolean {
clearAttributes()
var m = iterator.next() ?: return false
val baseOffset = iterator.baseOffset

morphemeAtt.setMorpheme(m)
posLenAtt.positionLength = 1
posIncAtt.positionIncrement = 1
val baseOffset = iterator.baseOffset
morphemeAtt.setOffsets((m.begin()..m.end()).map { i -> correctOffset(baseOffset + i) })
posLenAtt.setPositionLength(1)
posIncAtt.setPositionIncrement(1)
offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end()))
termAtt.setEmpty().append(m.surface())
return true
Expand Down
Loading

0 comments on commit ee664ba

Please sign in to comment.