From 2c29440ee06da9505bcf6983ec06a5954d5d5dca Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Wed, 5 Feb 2025 22:38:17 -0500 Subject: [PATCH 1/3] Upgrade org.apache.lucene to 9.12.1 Upgrade the org.apache.lucene dependencies from 7.5.0 to 9.12.1, and make the requisite compatibility changes. Closes #2737 --- pom.xml | 4 +-- warehouse/ingest-core/pom.xml | 2 +- .../data/tokenize/StandardAnalyzer.java | 5 +--- .../data/tokenize/TokenizationHelper.java | 3 +- .../util/NGramTokenizationStrategy.java | 2 +- warehouse/ingest-csv/pom.xml | 2 +- warehouse/ingest-json/pom.xml | 2 +- .../src/main/resources/bin/ingest/findJars.sh | 2 +- warehouse/ingest-wikipedia/pom.xml | 2 +- warehouse/query-core/pom.xml | 2 +- .../DatawaveFieldIndexListIteratorJexl.java | 29 ++++++++----------- .../PushdownLargeFieldedListsVisitor.java | 4 +-- .../parser/lucene/AccumuloSyntaxParser.java | 7 +++-- 13 files changed, 30 insertions(+), 36 deletions(-) diff --git a/pom.xml b/pom.xml index 354a15ae0c4..ac2a0e09df2 100644 --- a/pom.xml +++ b/pom.xml @@ -109,7 +109,7 @@ 2.20 2.20 2.17.2 - 7.5.0 + 9.12.1 2.5.2 1.6.0 1.2 @@ -828,7 +828,7 @@ org.apache.lucene - lucene-analyzers-common + lucene-analysis-common ${version.lucene} diff --git a/warehouse/ingest-core/pom.xml b/warehouse/ingest-core/pom.xml index 714cfa960c8..90a3069dc28 100644 --- a/warehouse/ingest-core/pom.xml +++ b/warehouse/ingest-core/pom.xml @@ -116,7 +116,7 @@ org.apache.lucene - lucene-analyzers-common + lucene-analysis-common org.apache.lucene diff --git a/warehouse/ingest-core/src/main/java/datawave/ingest/data/tokenize/StandardAnalyzer.java b/warehouse/ingest-core/src/main/java/datawave/ingest/data/tokenize/StandardAnalyzer.java index ab847888533..7835eef651e 100644 --- a/warehouse/ingest-core/src/main/java/datawave/ingest/data/tokenize/StandardAnalyzer.java +++ b/warehouse/ingest-core/src/main/java/datawave/ingest/data/tokenize/StandardAnalyzer.java @@ -1,15 +1,12 @@ package datawave.ingest.data.tokenize; -import java.io.Reader; - import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.StopwordAnalyzerBase; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.classic.ClassicFilter; import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.en.EnglishAnalyzer; -import org.apache.lucene.analysis.standard.ClassicFilter; /** * Filters {@link StandardTokenizer} with {@link LowerCaseFilter} {@link ClassicFilter} and {@link StopFilter}, using a list of English stop words (unless diff --git a/warehouse/ingest-core/src/main/java/datawave/ingest/data/tokenize/TokenizationHelper.java b/warehouse/ingest-core/src/main/java/datawave/ingest/data/tokenize/TokenizationHelper.java index 931b2efa40e..dac97f1c4a6 100644 --- a/warehouse/ingest-core/src/main/java/datawave/ingest/data/tokenize/TokenizationHelper.java +++ b/warehouse/ingest-core/src/main/java/datawave/ingest/data/tokenize/TokenizationHelper.java @@ -6,6 +6,7 @@ import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.en.EnglishAnalyzer; import datawave.ingest.data.config.DataTypeHelper; import datawave.util.ObjectFactory; @@ -373,7 +374,7 @@ public static final CharArraySet getStopWords(DataTypeHelper helper, Configurati } } else { log.warn("Utilizing default stopword set. Tokenization and indexing may generate unwanted data"); - stopWords = org.apache.lucene.analysis.core.StopAnalyzer.ENGLISH_STOP_WORDS_SET; + stopWords = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET; } return stopWords; } diff --git a/warehouse/ingest-core/src/main/java/datawave/ingest/util/NGramTokenizationStrategy.java b/warehouse/ingest-core/src/main/java/datawave/ingest/util/NGramTokenizationStrategy.java index ef5c17d01d6..1bdc0daa7f9 100644 --- a/warehouse/ingest-core/src/main/java/datawave/ingest/util/NGramTokenizationStrategy.java +++ b/warehouse/ingest-core/src/main/java/datawave/ingest/util/NGramTokenizationStrategy.java @@ -6,9 +6,9 @@ import org.apache.log4j.Logger; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.classic.ClassicFilter; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.ngram.NGramTokenizer; -import org.apache.lucene.analysis.standard.ClassicFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import com.google.common.hash.BloomFilter; diff --git a/warehouse/ingest-csv/pom.xml b/warehouse/ingest-csv/pom.xml index 845304382c9..91cce5b821c 100644 --- a/warehouse/ingest-csv/pom.xml +++ b/warehouse/ingest-csv/pom.xml @@ -59,7 +59,7 @@ org.apache.lucene - lucene-analyzers-common + lucene-analysis-common org.apache.lucene diff --git a/warehouse/ingest-json/pom.xml b/warehouse/ingest-json/pom.xml index 8b3e5c02ab3..87cda7e2f10 100644 --- a/warehouse/ingest-json/pom.xml +++ b/warehouse/ingest-json/pom.xml @@ -49,7 +49,7 @@ org.apache.lucene - lucene-analyzers-common + lucene-analysis-common org.apache.lucene diff --git a/warehouse/ingest-scripts/src/main/resources/bin/ingest/findJars.sh b/warehouse/ingest-scripts/src/main/resources/bin/ingest/findJars.sh index 4b20424ee05..d071f9c18a1 100644 --- a/warehouse/ingest-scripts/src/main/resources/bin/ingest/findJars.sh +++ b/warehouse/ingest-scripts/src/main/resources/bin/ingest/findJars.sh @@ -79,7 +79,7 @@ LOG4J2_SLF4J_JAR=$(findJar log4j-slf4j-impl) JSON_SIMPLE=$(findJar json-simple) LUCENE_JAR=$(findJar lucene-core) LUCENE_JAR=$LUCENE_JAR:$(findJar lucene-queryparser) -LUCENE_JAR=$LUCENE_JAR:$(findJar lucene-analyzers-common) +LUCENE_JAR=$LUCENE_JAR:$(findJar lucene-analysis-common) THRIFT_JAR=$(findJar libthrift) AC_CORE_JAR=$(findAccumuloJar accumulo-core) AC_SERVER_JAR=$(findAccumuloJar accumulo-server-base) diff --git a/warehouse/ingest-wikipedia/pom.xml b/warehouse/ingest-wikipedia/pom.xml index e72e629c924..da28eea77eb 100644 --- a/warehouse/ingest-wikipedia/pom.xml +++ b/warehouse/ingest-wikipedia/pom.xml @@ -40,7 +40,7 @@ org.apache.lucene - lucene-analyzers-common + lucene-analysis-common org.apache.lucene diff --git a/warehouse/query-core/pom.xml b/warehouse/query-core/pom.xml index 51e0f2d3085..87b42f41e76 100644 --- a/warehouse/query-core/pom.xml +++ b/warehouse/query-core/pom.xml @@ -161,7 +161,7 @@ org.apache.lucene - lucene-analyzers-common + lucene-analysis-common org.apache.lucene diff --git a/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexListIteratorJexl.java b/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexListIteratorJexl.java index fa232bb83b4..fac28fe5f6e 100644 --- a/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexListIteratorJexl.java +++ b/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexListIteratorJexl.java @@ -10,6 +10,8 @@ import java.util.Map; import java.util.SortedSet; +import javax.management.ObjectName; + import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; @@ -21,11 +23,14 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.NoOutputs; import org.apache.lucene.util.fst.Outputs; +import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; import datawave.core.iterators.filesystem.FileSystemCache; @@ -185,28 +190,16 @@ protected boolean matches(Key k) throws IOException { public static FST getFST(SortedSet values) throws IOException { final IntsRefBuilder irBuilder = new IntsRefBuilder(); - // The builder options with defaults - FST.INPUT_TYPE inputType = FST.INPUT_TYPE.BYTE1; - int minSuffixCount1 = 0; - int minSuffixCount2 = 0; - boolean doShareSuffix = true; - boolean doShareNonSingletonNodes = true; - int shareMaxTailLength = Integer.MAX_VALUE; - - boolean allowArrayArcs = true; - int bytesPageBits = 15; final Outputs outputs = NoOutputs.getSingleton(); - // create the FST from the values - org.apache.lucene.util.fst.Builder fstBuilder = new org.apache.lucene.util.fst.Builder<>(inputType, minSuffixCount1, minSuffixCount2, - doShareSuffix, doShareNonSingletonNodes, shareMaxTailLength, outputs, allowArrayArcs, bytesPageBits); - + // Add the values to the compiler and create the FST. + FSTCompiler fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build(); for (String value : values) { Util.toUTF16(value, irBuilder); final IntsRef scratchInt = irBuilder.get(); - fstBuilder.add(scratchInt, outputs.getNoOutput()); + fstCompiler.add(scratchInt, outputs.getNoOutput()); } - return fstBuilder.finish(); + return FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()); } /** Utility class to load one instance of any FST per classloader */ @@ -260,8 +253,10 @@ public static FST loadFSTFromFile(Path filename, String compressionCodec fis = codec.createInputStream(fis); } NoOutputs outputs = NoOutputs.getSingleton(); + DataInput di = new InputStreamDataInput(fis); - return new FST<>(di, outputs); + FST.FSTMetadata metadata = FST.readMetadata(di, outputs); + return new FST<>(metadata, di); } public static synchronized void clear(String file) { diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/PushdownLargeFieldedListsVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/PushdownLargeFieldedListsVisitor.java index 3e113e0958d..cd2fc837af0 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/PushdownLargeFieldedListsVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/PushdownLargeFieldedListsVisitor.java @@ -334,7 +334,7 @@ protected void assignNodeByField(JexlNode origNode, JexlNode subNode, Multimap values) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException { - FST fst = DatawaveFieldIndexListIteratorJexl.getFST(values); + FST fst = DatawaveFieldIndexListIteratorJexl.getFST(values); // now serialize to our file system CompressionCodec codec = null; @@ -357,7 +357,7 @@ protected URI createFst(SortedSet values) throws IOException, ClassNotFo } OutputStreamDataOutput outStream = new OutputStreamDataOutput(fstFileOut); - fst.save(outStream); + fst.save(outStream, outStream); outStream.close(); return fstFile.toUri(); diff --git a/warehouse/query-core/src/main/java/datawave/query/language/parser/lucene/AccumuloSyntaxParser.java b/warehouse/query-core/src/main/java/datawave/query/language/parser/lucene/AccumuloSyntaxParser.java index b9dd081637c..85b2e5a063c 100644 --- a/warehouse/query-core/src/main/java/datawave/query/language/parser/lucene/AccumuloSyntaxParser.java +++ b/warehouse/query-core/src/main/java/datawave/query/language/parser/lucene/AccumuloSyntaxParser.java @@ -40,6 +40,7 @@ import org.apache.lucene.queryparser.flexible.messages.MessageImpl; import org.apache.lucene.queryparser.flexible.standard.nodes.RegexpQueryNode; import org.apache.lucene.queryparser.flexible.standard.nodes.TermRangeQueryNode; +import org.apache.lucene.search.FuzzyQuery; @SuppressWarnings("all") public class AccumuloSyntaxParser implements SyntaxParser, AcumuloSyntaxParserConstants { @@ -387,7 +388,7 @@ final public QueryNode Term(CharSequence field) throws ParseException { boolean endInc = false; QueryNode q = null; FieldQueryNode qLower, qUpper; - float defaultMinSimilarity = org.apache.lucene.search.FuzzyQuery.defaultMinSimilarity; + int defaultMaxEdits = FuzzyQuery.defaultMaxEdits; switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { case FUNCTION: case TERM: @@ -443,9 +444,9 @@ final public QueryNode Term(CharSequence field) throws ParseException { ; } if (fuzzy) { - float fms = defaultMinSimilarity; + int fms = defaultMaxEdits; try { - fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); + fms = Integer.valueOf(fuzzySlop.image.substring(1)).intValue(); } catch (Exception ignored) {} if (fms < 0.0f) { { From 1d22802f97c6018ce6a5d86df2f155333188f021 Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Thu, 6 Feb 2025 04:13:23 -0500 Subject: [PATCH 2/3] Fix AccumuloSyntaxParser --- .../query/language/parser/lucene/AccumuloSyntaxParser.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/warehouse/query-core/src/main/java/datawave/query/language/parser/lucene/AccumuloSyntaxParser.java b/warehouse/query-core/src/main/java/datawave/query/language/parser/lucene/AccumuloSyntaxParser.java index 85b2e5a063c..67a073b2a2e 100644 --- a/warehouse/query-core/src/main/java/datawave/query/language/parser/lucene/AccumuloSyntaxParser.java +++ b/warehouse/query-core/src/main/java/datawave/query/language/parser/lucene/AccumuloSyntaxParser.java @@ -388,7 +388,7 @@ final public QueryNode Term(CharSequence field) throws ParseException { boolean endInc = false; QueryNode q = null; FieldQueryNode qLower, qUpper; - int defaultMaxEdits = FuzzyQuery.defaultMaxEdits; + float defaultMinSimilarity = FuzzyQuery.defaultMaxEdits; switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) { case FUNCTION: case TERM: @@ -444,9 +444,9 @@ final public QueryNode Term(CharSequence field) throws ParseException { ; } if (fuzzy) { - int fms = defaultMaxEdits; + float fms = defaultMinSimilarity; try { - fms = Integer.valueOf(fuzzySlop.image.substring(1)).intValue(); + fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) {} if (fms < 0.0f) { { From e09f67d4f882edf79100616cb8057a6b44b9895e Mon Sep 17 00:00:00 2001 From: Laura Schanno Date: Mon, 3 Mar 2025 12:45:49 -0500 Subject: [PATCH 3/3] Remove unused imports --- .../core/iterators/DatawaveFieldIndexListIteratorJexl.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexListIteratorJexl.java b/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexListIteratorJexl.java index fac28fe5f6e..8358a1e3265 100644 --- a/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexListIteratorJexl.java +++ b/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexListIteratorJexl.java @@ -10,8 +10,6 @@ import java.util.Map; import java.util.SortedSet; -import javax.management.ObjectName; - import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; @@ -23,14 +21,12 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.InputStreamDataInput; -import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.NoOutputs; import org.apache.lucene.util.fst.Outputs; -import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; import datawave.core.iterators.filesystem.FileSystemCache;