From 9c60c7efb72577556bb08522af23324b06874c96 Mon Sep 17 00:00:00 2001 From: hye-on Date: Tue, 28 Jan 2025 22:16:02 +0900 Subject: [PATCH 1/3] Add HashSet based filtering optimization to XContentMapValues This optimization enhances document filtering when field names are simple (no dots or wildcards in field names and no dots in document keys). In such cases, it uses a HashSet-based implementation instead of automaton matching to prevent TooComplexToDeterminizeException when processing documents with numerous long field names. Changes: - Add HashSet optimization for simple field names - Split filter implementation into set-based and automaton-based - Add helper methods to check field name patterns Signed-off-by: hye-on --- .../xcontent/support/XContentMapValues.java | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java b/server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java index d3fa44c5afb66..82909843391e6 100644 --- a/server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java +++ b/server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java @@ -45,9 +45,12 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.function.Function; /** @@ -216,6 +219,63 @@ public static Map filter(Map map, String[] includes, * @see #filter(Map, String[], String[]) for details */ public static Function, Map> filter(String[] includes, String[] excludes) { + + return (map) -> { + if (hasNoDottedKeys(map) && hasNoWildcardsOrDots(includes) && hasNoWildcardsOrDots(excludes)) { + return createSetBasedFilter(includes, excludes).apply(map); + } + return createAutomatonFilter(includes, excludes).apply(map); + }; + } + + private static boolean hasNoDottedKeys(Map map) { + for (String key : map.keySet()) { + if (key.indexOf('.') != -1) { + return false; + } + } + return true; + } + + private static boolean hasNoWildcardsOrDots(String[] fields) { + if (fields == null || fields.length == 0) { + return true; + } + + for (String field : fields) { + if (field.indexOf('*') != -1 || field.indexOf('.') != -1) { + return false; + } + } + return true; + } + + /** + * Creates a simple HashSet-based filter for exact field name matching + */ + private static Function, Map> createSetBasedFilter(String[] includes, String[] excludes) { + + Set includeSet = (includes == null || includes.length == 0) ? null : new HashSet<>(Arrays.asList(includes)); + Set excludeSet = (excludes == null || excludes.length == 0) + ? Collections.emptySet() + : new HashSet<>(Arrays.asList(excludes)); + + return (map) -> { + Map filtered = new HashMap<>(); + for (Map.Entry entry : map.entrySet()) { + String key = entry.getKey(); + if ((includeSet == null || includeSet.contains(key)) && !excludeSet.contains(key)) { + filtered.put(key, entry.getValue()); + } + } + return filtered; + }; + } + + /** + * Creates an automaton-based filter for complex pattern matching + */ + public static Function, Map> createAutomatonFilter(String[] includes, String[] excludes) { CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString()); CharacterRunAutomaton include; From 97145a0b5ab0ad3e240b6d35bfd7dd915f527df5 Mon Sep 17 00:00:00 2001 From: hye-on Date: Wed, 29 Jan 2025 22:44:44 +0900 Subject: [PATCH 2/3] Update filtering to support HashSet-based approach for map keys with dots Signed-off-by: hye-on --- .../xcontent/support/XContentMapValues.java | 25 ++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java b/server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java index 82909843391e6..7240252b51d83 100644 --- a/server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java +++ b/server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java @@ -219,22 +219,10 @@ public static Map filter(Map map, String[] includes, * @see #filter(Map, String[], String[]) for details */ public static Function, Map> filter(String[] includes, String[] excludes) { - - return (map) -> { - if (hasNoDottedKeys(map) && hasNoWildcardsOrDots(includes) && hasNoWildcardsOrDots(excludes)) { - return createSetBasedFilter(includes, excludes).apply(map); - } - return createAutomatonFilter(includes, excludes).apply(map); - }; - } - - private static boolean hasNoDottedKeys(Map map) { - for (String key : map.keySet()) { - if (key.indexOf('.') != -1) { - return false; - } + if (hasNoWildcardsOrDots(includes) && hasNoWildcardsOrDots(excludes)) { + return createSetBasedFilter(includes, excludes); } - return true; + return createAutomatonFilter(includes, excludes); } private static boolean hasNoWildcardsOrDots(String[] fields) { @@ -254,7 +242,6 @@ private static boolean hasNoWildcardsOrDots(String[] fields) { * Creates a simple HashSet-based filter for exact field name matching */ private static Function, Map> createSetBasedFilter(String[] includes, String[] excludes) { - Set includeSet = (includes == null || includes.length == 0) ? null : new HashSet<>(Arrays.asList(includes)); Set excludeSet = (excludes == null || excludes.length == 0) ? Collections.emptySet() @@ -264,8 +251,12 @@ private static boolean hasNoWildcardsOrDots(String[] fields) { Map filtered = new HashMap<>(); for (Map.Entry entry : map.entrySet()) { String key = entry.getKey(); + int dotPos = key.indexOf('.'); + if (dotPos > 0) { + key = key.substring(0, dotPos); + } if ((includeSet == null || includeSet.contains(key)) && !excludeSet.contains(key)) { - filtered.put(key, entry.getValue()); + filtered.put(entry.getKey(), entry.getValue()); } } return filtered; From dd9749ef52241225bbfe6c546c270f54e4c20c42 Mon Sep 17 00:00:00 2001 From: hye-on Date: Sat, 1 Feb 2025 00:23:23 +0900 Subject: [PATCH 3/3] Add changelog entry for improved source field matching logic Signed-off-by: hye-on --- CHANGELOG-3.0.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG-3.0.md b/CHANGELOG-3.0.md index 8d8adfd1e3566..004197863c240 100644 --- a/CHANGELOG-3.0.md +++ b/CHANGELOG-3.0.md @@ -28,7 +28,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Deprecate CamelCase `PathHierarchy` tokenizer name in favor to lowercase `path_hierarchy` ([#10894](https://github.com/opensearch-project/OpenSearch/pull/10894)) - Breaking change: Do not request "search_pipelines" metrics by default in NodesInfoRequest ([#12497](https://github.com/opensearch-project/OpenSearch/pull/12497)) - Refactor `:libs` module `bootstrap` package to eliminate top level split packages [#17117](https://github.com/opensearch-project/OpenSearch/pull/17117)) - +- Use simpler matching logic for source fields when explicit field names (no wildcards or dot-paths) are specified ([#17160](https://github.com/opensearch-project/OpenSearch/pull/17160)) ### Deprecated ### Removed