[ML] Performance improvements for categorization jobs (#89824)
Categorization of strings which break down into a huge number of tokens can cause the C++ backend process to choke - see elastic/ml-cpp#2403.

This PR adds a limit filter to the default categorization analyzer, capping the number of tokens passed to the backend at 100.

Unfortunately this isn't a panacea for all the issues surrounding categorization of large, many-token messages, as verification checks on the frontend can also fail when calls to the datafeed _preview API return an excessive amount of data.
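
For reference, the limit token filter used here is a standard Elasticsearch filter that truncates the token stream once max_token_count tokens have been emitted. A minimal sketch of its behaviour via the _analyze API (a cap of 3 is used for brevity; the categorization analyzer uses 100):

POST _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    { "type": "limit", "max_token_count": 3 }
  ],
  "text": "one two three four five"
}

Only "one", "two" and "three" are returned; tokens past the cap are silently discarded, which is what now happens before tokens are handed to the backend.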
edsavage authored Sep 8, 2022
1 parent f16110d commit fd20027
Showing 6 changed files with 165 additions and 2 deletions.
4 changes: 4 additions & 0 deletions docs/reference/ml/common/apis/get-ml-info.asciidoc
@@ -97,6 +97,10 @@ This is a possible response:
"GMT",
"UTC"
]
},
{
"type": "limit",
"max_token_count": "100"
}
]
},
1 change: 1 addition & 0 deletions x-pack/plugin/build.gradle
@@ -105,6 +105,7 @@ tasks.named("yamlRestTestV7CompatTransform").configure { task ->
"roles/30_prohibited_role_query/Test use prohibited query inside role query",
"put role request with a term lookup (deprecated) and type. Requires validation in REST layer"
)
task.skipTest("ml/jobs_crud/Test update job", "Behaviour change #89824 - added limit filter to categorization analyzer")
task.skipTest("ml/jobs_crud/Test create job with delimited format", "removing undocumented functionality")
task.skipTest("ml/datafeeds_crud/Test update datafeed to point to missing job", "behaviour change #44752 - not allowing to update datafeed job_id")
task.skipTest(
@@ -61,6 +61,8 @@ public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeab
public static final ParseField TOKEN_FILTERS = AnalyzeAction.Fields.TOKEN_FILTERS;
public static final ParseField CHAR_FILTERS = AnalyzeAction.Fields.CHAR_FILTERS;

public static final int MAX_TOKEN_COUNT = 100;

/**
* This method is only used in the unit tests - in production code this config is always parsed as a fragment.
*/
@@ -204,6 +206,7 @@ public static CategorizationAnalyzerConfig buildStandardCategorizationAnalyzer(L
.addCategorizationFilters(categorizationFilters)
.setTokenizer("ml_standard")
.addDateWordsTokenFilter()
.addLimitFilter()
.build();
}

@@ -429,6 +432,14 @@ Builder addDateWordsTokenFilter() {
return this;
}

Builder addLimitFilter() {
Map<String, Object> limitFilter = new HashMap<>();
limitFilter.put("type", "limit");
limitFilter.put("max_token_count", MAX_TOKEN_COUNT);
addTokenFilter(limitFilter);
return this;
}

/**
* Create a config validating only structure, not exact analyzer/tokenizer/filter names
*/
@@ -12,6 +12,7 @@
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.analysis.common.CommonAnalysisPlugin;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.TestEnvironment;
@@ -45,7 +46,7 @@ protected AnalysisModule createAnalysisModule() throws Exception {
TestEnvironment.newEnvironment(
Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build()
),
List.of(new MachineLearning(Settings.EMPTY))
List.of(new MachineLearning(Settings.EMPTY), new CommonAnalysisPlugin())
);
}

@@ -509,6 +509,152 @@ public void testMlStandardCategorizationAnalyzer() throws IOException {
),
categorizationAnalyzer.tokenizeField("nginx_error", NGINX_ERROR_EXAMPLE)
);

// Limited to CategorizationAnalyzerConfig.MAX_TOKEN_COUNT (100) tokens
assertEquals(
Arrays.asList(
"took",
"errors",
"true",
"items",
"create",
"index",
"internal.alerts-security.alerts-default-000003",
"id",
"status",
"error",
"type",
"mapper_parsing_exception",
"reason",
"failed",
"to",
"parse",
"field",
"process.parent.command_line",
"of",
"type",
"wildcard",
"in",
"document",
"with",
"id",
"Preview",
"of",
"field",
"s",
"value",
"R",
"Q",
"M",
"t",
"k",
"P",
"r",
"o",
"T",
"g",
"I",
"r",
"R",
"N",
"P",
"R",
"I",
"N",
"t",
"n",
"i",
"l",
"n",
"r",
"T",
"R",
"i",
"t",
"w",
"n",
"t",
"s",
"i",
"i",
"l",
"r",
"t",
"s",
"M",
"i",
"Q",
"N",
"L",
"t",
"v",
"r",
"s",
"s",
"w",
"o",
"n",
"i",
"n",
"m",
"i",
"r",
"g",
"n",
"i",
"i",
"l",
"n",
"o",
"n",
"i",
"r",
"t",
"u",
"k",
"l"
),
categorizationAnalyzer.tokenizeField(
"multiline",
"{\"took\":529,\"errors\":true,\"items\":[{\"create\":{\"_index\":\".internal.alerts-security.alerts-default-000003\","
+ "\"_id\":\"5b87a604eb4b7bf22a74d3bdda7855f64080a3c620d13563ed000f64670964a8\",\"status\":400,\"error\":{\"type\":"
+ "\"mapper_parsing_exception\",\"reason\":\"failed to parse field [process.parent.command_line] of type [wildcard]"
+ " in document with id '5b87a604eb4b7bf22a74d3bdda7855f64080a3c620d13563ed000f64670964a8'. Preview of field's"
+ " value: '{1220=B, 1217=R, 1216=/, 1215=b, 1214=d, 1213=/, 1212=Q, 1211=M, 1210=t, 1219=B, 1218=A, 1231=k,"
+ " 1230=-, 0=\\\", 1228= , 1=C, 1227=\\\", 2=:, 1226=\\\\, 3=\\\\, 1225=\\\", 4=P, 1224=1, 5=r, 1223=~, 6=o,"
+ " 1222=T, 7=g, 1221=I, 8=r, 800=R, 9=a, 801=A, 802=~, 803=3, 804=/, 805=N, 806=P, 807=R, 1229= , 808=I, 809=N,"
+ " 1000=\\\\, 1242=_, 1241=t, 1240=e, 1239=n, 1238=i, 1237= , 1236=l, 1235=e, 1234=n, 1233=r, 810=T, 1232=e,"
+ " 811=~, 812=1, 813=/, 814=R, 815=a, 816=b, 817=b, 818=i, 819=t, 1011=w, 1253=n, 1010= , 1252=e, 1251=t, 1250=s,"
+ " 1008=i, 1007=b, 1249=i, 1006=b, 1248=l, 1005=a, 1247=_, 1004=r, 1246=t, 1003=-, 1245=s, 820=M, 1002= ,"
+ " 1244=i, 821=Q, 1001=\\\", 1243=d, 822=/, 823=E, 824=N, 825=A, 826=B, 827=L, 828=E, 829=~, 1009=t, 1022=v,"
+ " 1264= , 1021=r, 1263=2, 1020=e, 1262=7, 1261=6, 1260=5, 1019=s, 1018=_, 1017=s, 1259=2, 1016=w, 1258= ,"
+ " 1015=o, 1257=n, 830=1, 1014=d, 1256=i, 831=\\\", 1013=n, 1255=m, 832=\\\\, 1012=i, 1254=_, 833=\\\","
+ " 834= , 835=-, 836=r, 837=a, 838=b, 839=b, 1033= , 1275=e, 1032=g, 1274=n, 1031=i, 1273=i, 1030=f,"
+ " 1272= , 1271=l, 1270=e, 1029=n, 1028=o, 1027=c, 1269=n, 840=i, 1026=_, 1268=r, 841=t, 1025=e, 1267=e,"
+ " 600=u, 842= , 1024=c, 1266=k, 601=l, 843=p, 1023=i, 1265=-, 602=t, 844=l, 603=_, 845=u, 604=f, 846=g, 605=i,"
+ " 847=i, 606=l, 848=n, 607=e, 849=s, 608= , 609=\\\\, 1044=R, 1286=t, 1043=G, 1285=s, 1042=O, 1284=i, 1041=R,"
+ " 1283=l, 1040=P, 1282=_, 1281=t, 1280=s, 1039=/, 850=_, 1038=:, 851=d, 1037=C, 1279=i, 610=\\\", 852=i,"
+ " 1036=\\\", 1278=d, 611=\\\", 853=r, 1035=\\\", 1277=_, 612=C, 854= , 1034=\\\\, 1276=t, 613=:,"
+ " 855=\\\\, 614=/, 856=\\\", 615=P, 857=\\\", 616=R, 858=C, 617=O, 859=:, 618=G, 619=R, 1055=~, 1297=7, 1054=T,"
+ " 1296=6, 1053=N, 1295=5, 1052=I, 1294=2, 1051=R, 1293= , 1050=P, 1292=x, 1291=a, 1290=m, 860=/, 861=P, 1049=N,"
+ " 620=A, 862=R, 1048=/, 621=~, 863=O, 1047=3, 1289=_, 622=3, 864=G, 1046=~, 1288=n, 623=/, 865=R, 1045=A, 1287=e,"
+ " 624=N, 866=A, 625=P, 867=~, 626=R, 868=1, 627=I, 869=/, 628=N, 629=T, 1066=/, 1065=Q, 1064=M, 1063=t, 1062=i,"
+ " 1061=b, 1060=b, 870=N, 871=P, 630=~, 872=R, 631=1, 873=I, 1059=a, 632=/, 874=N, 1058=R, 633=R, 875=T, 1057=/,"
+ " 1299= , 634=a, 876=~, 1056=1, 1298=2, 635=b, 877=1, 636=b, 878=/, 637=i, 879=R, 638=t, 639=M, 1077=o, 1076=c,"
+ " 1075=., 1074=q, 1073=m, 1072=t, 1071=i, 1070=b, 880=A, 881=B, 640=Q, 882=B, 641=/, 883=I, 400=1, 642=l, 884=T,"
+ " 401=2, 643=o, 885=~, 1069=b, 402=8, 644=g, 886=1, 1068=a, 403=0, 645=/, 887=., 1067=r, 404=0, 646=R, 888=1,"
+ " 405=0, 647=A, 889=0, 406= , 648=B, 407= , 649=B, 408=-, 409=k, 1080=i, 1088=s, 1087=o, 1086=-, 1085= ,"
+ " 1084=\\\", 1083=\\\\, 1082=\\\", 890=/, 1081=g, 891=p, 650=I, 892=l, 651=T, 893=u, 410=e, 652=~, 894=g,"
+ " 411=r, 653=1, 895=i, 412=n, 654=., 896=n, 413=e, 655=L, 897=s, 1079=f, 414=l, 656=O, 898=\\\", 1078=n,"
+ " 415= , 657=G, 899=\\\\, 416=i, 658=\\\", 417=n, 659=\\\\, 418=e, 419=t, 1091=o, 1090=m, 1099=_, 1098=t,"
+ " 1097=r, 1096=a, 1095=t, 1094=s, 1093= , 1092=n, 660=\\\", 661= , 420=_, 662=-, 421=d, 663=r, 422=e, 664=a,"
+ " 423=f, 665=b, 424=a, 666=b, 425=u, 667=i, 1089=_, 426=l, 668=t, 427=t, 669= , 428=_, 429=c, 670=l, 671=a,"
+ " 430=o, 672=g, 431=n, 673=e, 432=n, 674=r, 433=e, 675=_, 434=c, 676=u, 435=t, 677=p, 436=_, 678=g, 437=o,"
+ " 679=r, 438=p, 439=t, 680=a, 681=d, 440=i, 682=e, 441=o, 683=_, 200= , 442=n, 684=f, 201=s, 443=s, 685=i,"
+ " 202=t, 444= , 686=l, 203=a, 445=\\\", 687=e, 204=r, 446=[, 688= , 205=t, 447={, 689=\\\\, 206=_, 448=n,"
+ " 207=s, 449=o, 208=a, 209=s, 690=\\\", 691=\\\", 450=d, 692=C, 451=e, 693=:, 210=l,"
)
);
}
}

@@ -520,7 +520,7 @@
}
}
- match: { job_id: "jobs-crud-update-job" }
- length: { analysis_config.categorization_analyzer.filter: 1 }
- length: { analysis_config.categorization_analyzer.filter: 2 }
- match: { analysis_config.categorization_analyzer.tokenizer: "ml_standard" }
- length: { analysis_config.categorization_analyzer.char_filter: 3 }
- match: { analysis_config.categorization_analyzer.char_filter.0: "first_line_with_letters" }
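The expected filter count rises from 1 to 2 because the default categorization analyzer's token filter chain now ends with the limit filter, after the existing date-words stop filter. Roughly, the job's analysis_config now reports a shape like this (abridged sketch; char_filter entries and most stop words are elided):

"categorization_analyzer": {
  "tokenizer": "ml_standard",
  "filter": [
    { "type": "stop", "stopwords": [ "Monday", "Tuesday", ..., "GMT", "UTC" ] },
    { "type": "limit", "max_token_count": "100" }
  ]
}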
