Skip to content

Commit

Permalink
Add HashSet based filtering optimization to XContentMapValues (#17160)
Browse files Browse the repository at this point in the history
This optimization enhances document filtering when field names are simple (no dots or wildcards in field names). In such cases, it uses a HashSet-based implementation instead of automaton matching to prevent TooComplexToDeterminizeException when processing documents with numerous long field names.

Changes:
- Add HashSet optimization for simple field names
- Split filter implementation into set-based and automaton-based
- Add helper methods to check field name patterns

---------

Signed-off-by: hye-on <ain0103@naver.com>
Signed-off-by: Michael Froh <froh@amazon.com>
Co-authored-by: Michael Froh <froh@amazon.com>
  • Loading branch information
hye-on and msfroh authored Feb 7, 2025
1 parent 3f793b6 commit c06f53e
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG-3.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add task completion count in search backpressure stats API ([#10028](https://github.com/opensearch-project/OpenSearch/pull/10028/))
- Deprecate CamelCase `PathHierarchy` tokenizer name in favor of lowercase `path_hierarchy` ([#10894](https://github.com/opensearch-project/OpenSearch/pull/10894))
- Breaking change: Do not request "search_pipelines" metrics by default in NodesInfoRequest ([#12497](https://github.com/opensearch-project/OpenSearch/pull/12497))
- Use simpler matching logic for source fields when explicit field names (no wildcards or dot-paths) are specified ([#17160](https://github.com/opensearch-project/OpenSearch/pull/17160))
- Refactor `:libs` module `bootstrap` package to eliminate top level split packages for JPMS support ([#17117](https://github.com/opensearch-project/OpenSearch/pull/17117))
- Refactor the codebase to eliminate top level split packages for JPMS support ([#17153](https://github.com/opensearch-project/OpenSearch/pull/17153))
- Refactor `:server` module `org.apache.lucene` package to eliminate top level split packages for JPMS support ([#17241](https://github.com/opensearch-project/OpenSearch/pull/17241))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,12 @@

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;

/**
Expand Down Expand Up @@ -216,6 +219,54 @@ public static Map<String, Object> filter(Map<String, ?> map, String[] includes,
* @see #filter(Map, String[], String[]) for details
*/
public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) {
    // When neither array contains a '*' or '.' the names can be matched by exact set
    // lookup; otherwise the automaton-based filter is used.
    final boolean plainNamesOnly = hasNoWildcardsOrDots(includes) && hasNoWildcardsOrDots(excludes);
    return plainNamesOnly ? createSetBasedFilter(includes, excludes) : createAutomatonFilter(includes, excludes);
}

/**
 * Reports whether every entry of {@code fields} is a plain name, i.e. contains neither a
 * {@code '*'} nor a {@code '.'} character.
 *
 * @param fields field names to inspect; may be {@code null} or empty
 * @return {@code true} when {@code fields} is {@code null}, empty, or free of
 *         {@code '*'} and {@code '.'} in every entry; {@code false} otherwise
 */
private static boolean hasNoWildcardsOrDots(String[] fields) {
    if (fields != null) {
        for (int i = 0; i < fields.length; i++) {
            final String name = fields[i];
            if (name.indexOf('*') >= 0 || name.indexOf('.') >= 0) {
                return false;
            }
        }
    }
    // Nothing to inspect, or no entry contained '*' or '.'.
    return true;
}

/**
 * Builds a filter that selects top-level map entries by exact name lookup in hash sets.
 * <p>
 * For each entry, the lookup name is the key truncated at its first {@code '.'} when that
 * dot is not the first character; otherwise the whole key is used. An entry is copied to
 * the result (under its original key, with its original value) when the lookup name is in
 * the include set, or no includes were given, and it is not in the exclude set.
 *
 * @param includes names to keep; {@code null} or empty keeps every name
 * @param excludes names to drop; {@code null} or empty drops nothing
 * @return a function that returns a new {@link HashMap} holding the selected entries
 */
private static Function<Map<String, ?>, Map<String, Object>> createSetBasedFilter(String[] includes, String[] excludes) {
    final boolean includeEverything = includes == null || includes.length == 0;
    // A null include set means "no include restriction".
    final Set<String> allowed = includeEverything ? null : new HashSet<>(Arrays.asList(includes));

    final Set<String> denied;
    if (excludes == null || excludes.length == 0) {
        denied = Collections.emptySet();
    } else {
        denied = new HashSet<>(Arrays.asList(excludes));
    }

    return source -> {
        final Map<String, Object> result = new HashMap<>();
        for (Map.Entry<String, ?> entry : source.entrySet()) {
            final String fullKey = entry.getKey();
            final int firstDot = fullKey.indexOf('.');
            // Match on the part before the first dot; a leading dot leaves the key untouched.
            final String lookupKey = firstDot > 0 ? fullKey.substring(0, firstDot) : fullKey;

            if (denied.contains(lookupKey)) {
                continue;
            }
            if (allowed != null && !allowed.contains(lookupKey)) {
                continue;
            }
            result.put(fullKey, entry.getValue());
        }
        return result;
    };
}

/**
* Creates an automaton-based filter for complex pattern matching
*/
public static Function<Map<String, ?>, Map<String, Object>> createAutomatonFilter(String[] includes, String[] excludes) {
CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());

CharacterRunAutomaton include;
Expand Down

0 comments on commit c06f53e

Please sign in to comment.