Skip to content

Commit

Permalink
Remove MinimizationOperations
Browse files Browse the repository at this point in the history
This class was removed from Lucene because it is no longer needed. It
was copied into the OpenSearch code base because we were using it and
we didn't want our code to break. In fact, the correct choice would
have been to remove the references to
`MinimizationOperations.minimize()` and either replace them with
calls to `Operations.determinize()` or simply drop them altogether
because the automaton is deterministic.

Signed-off-by: Michael Froh <froh@amazon.com>
  • Loading branch information
msfroh committed Feb 6, 2025
1 parent b823d1f commit f011da3
Show file tree
Hide file tree
Showing 9 changed files with 21 additions and 411 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
import org.opensearch.common.settings.Settings;
import org.opensearch.core.common.Strings;
import org.opensearch.index.IndexNotFoundException;
import org.opensearch.lucene.util.automaton.MinimizationOperations;
import org.opensearch.search.builder.SearchSourceBuilder;

import java.util.List;
Expand Down Expand Up @@ -114,7 +113,7 @@ static CharacterRunAutomaton buildRemoteAllowlist(List<String> allowlist) {
return new CharacterRunAutomaton(Automata.makeEmpty());
}
Automaton automaton = Regex.simpleMatchToAutomaton(allowlist.toArray(Strings.EMPTY_ARRAY));
automaton = MinimizationOperations.minimize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
automaton = Operations.determinize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
if (Operations.isTotal(automaton)) {
throw new IllegalArgumentException(
"Refusing to start because allowlist "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.opensearch.lucene.util.automaton.MinimizationOperations;

import java.util.ArrayList;
import java.util.Iterator;
Expand All @@ -57,12 +56,12 @@ public static Automaton caseInsensitivePrefix(String s) {
List<Automaton> list = new ArrayList<>();
Iterator<Integer> iter = s.codePoints().iterator();
while (iter.hasNext()) {
list.add(toCaseInsensitiveChar(iter.next(), Integer.MAX_VALUE));
list.add(toCaseInsensitiveChar(iter.next()));
}
list.add(Automata.makeAnyString());

Automaton a = Operations.concatenate(list);
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
assert a.isDeterministic();
return a;
}

Expand All @@ -85,14 +84,14 @@ public static AutomatonQuery caseInsensitivePrefixQuery(Term prefix, MultiTermQu
*/
public static AutomatonQuery caseInsensitiveTermQuery(Term term) {
BytesRef prefix = term.bytes();
return new AutomatonQuery(term, toCaseInsensitiveString(prefix, Integer.MAX_VALUE));
return new AutomatonQuery(term, toCaseInsensitiveString(prefix));
}

/**
* Build an automaton matching a wildcard pattern, ASCII case insensitive, if the method is null, then will use {@link MultiTermQuery#CONSTANT_SCORE_BLENDED_REWRITE}.
*/
public static AutomatonQuery caseInsensitiveWildcardQuery(Term wildcardquery, MultiTermQuery.RewriteMethod method) {
return createAutomatonQuery(wildcardquery, toCaseInsensitiveWildcardAutomaton(wildcardquery, Integer.MAX_VALUE), method);
return createAutomatonQuery(wildcardquery, toCaseInsensitiveWildcardAutomaton(wildcardquery), method);

Check warning on line 94 in server/src/main/java/org/opensearch/common/lucene/search/AutomatonQueries.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/common/lucene/search/AutomatonQueries.java#L94

Added line #L94 was not covered by tests
}

/**
Expand Down Expand Up @@ -124,7 +123,7 @@ public static AutomatonQuery createAutomatonQuery(Term term, Automaton automaton
* Convert Lucene wildcard syntax into an automaton.
*/
@SuppressWarnings("fallthrough")
public static Automaton toCaseInsensitiveWildcardAutomaton(Term wildcardquery, int maxDeterminizedStates) {
public static Automaton toCaseInsensitiveWildcardAutomaton(Term wildcardquery) {
List<Automaton> automata = new ArrayList<>();

String wildcardText = wildcardquery.text();
Expand All @@ -148,32 +147,28 @@ public static Automaton toCaseInsensitiveWildcardAutomaton(Term wildcardquery, i
break;
} // else fallthru, lenient parsing with a trailing \
default:
automata.add(toCaseInsensitiveChar(c, maxDeterminizedStates));
automata.add(toCaseInsensitiveChar(c));

Check warning on line 150 in server/src/main/java/org/opensearch/common/lucene/search/AutomatonQueries.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/common/lucene/search/AutomatonQueries.java#L150

Added line #L150 was not covered by tests
}
i += length;
}

return Operations.concatenate(automata);
}

protected static Automaton toCaseInsensitiveString(BytesRef br, int maxDeterminizedStates) {
return toCaseInsensitiveString(br.utf8ToString(), maxDeterminizedStates);
protected static Automaton toCaseInsensitiveString(BytesRef br) {
return toCaseInsensitiveString(br.utf8ToString());
}

public static Automaton toCaseInsensitiveString(String s, int maxDeterminizedStates) {
public static Automaton toCaseInsensitiveString(String s) {
List<Automaton> list = new ArrayList<>();
Iterator<Integer> iter = s.codePoints().iterator();
while (iter.hasNext()) {
list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates));
list.add(toCaseInsensitiveChar(iter.next()));
}

Automaton a = Operations.concatenate(list);
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
return a;

return Operations.concatenate(list);
}

public static Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
public static Automaton toCaseInsensitiveChar(int codepoint) {
Automaton case1 = Automata.makeChar(codepoint);
// For now we only work with ASCII characters
if (codepoint > 128) {
Expand All @@ -183,7 +178,6 @@ public static Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminized
Automaton result;
if (altCase != codepoint) {
result = Operations.union(case1, Automata.makeChar(altCase));
result = MinimizationOperations.minimize(result, maxDeterminizedStates);
} else {
result = case1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ public Query termQueryCaseInsensitive(Object value, QueryShardContext context) {
Term term = new Term(name(), bytesRef);
Query query = AutomatonQueries.createAutomatonQuery(
term,
AutomatonQueries.toCaseInsensitiveString(bytesRef.utf8ToString(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT),
AutomatonQueries.toCaseInsensitiveString(bytesRef.utf8ToString()),
MultiTermQuery.DOC_VALUES_REWRITE
);
if (boost() != 1f) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,7 @@ public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, bool
}
List<Automaton> automata = new ArrayList<>();
if (caseInsensitive) {
automata.add(AutomatonQueries.toCaseInsensitiveString(value, Integer.MAX_VALUE));
automata.add(AutomatonQueries.toCaseInsensitiveString(value));

Check warning on line 643 in server/src/main/java/org/opensearch/index/mapper/TextFieldMapper.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/index/mapper/TextFieldMapper.java#L643

Added line #L643 was not covered by tests
} else {
automata.add(Automata.makeString(value));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
import org.opensearch.common.Nullable;
import org.opensearch.common.regex.Regex;
import org.opensearch.core.index.Index;
import org.opensearch.lucene.util.automaton.MinimizationOperations;

import java.util.Collection;
import java.util.List;
Expand Down Expand Up @@ -147,6 +146,8 @@ private static CharacterRunAutomaton buildCharacterRunAutomaton(Collection<Syste
Optional<Automaton> automaton = descriptors.stream()
.map(descriptor -> Regex.simpleMatchToAutomaton(descriptor.getIndexPattern()))
.reduce(Operations::union);
return new CharacterRunAutomaton(MinimizationOperations.minimize(automaton.orElse(Automata.makeEmpty()), Integer.MAX_VALUE));
return new CharacterRunAutomaton(
Operations.determinize(automaton.orElse(Automata.makeEmpty()), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT)
);
}
}
Loading

0 comments on commit f011da3

Please sign in to comment.