diff --git a/docs/reference/ml/common/apis/get-ml-info.asciidoc b/docs/reference/ml/common/apis/get-ml-info.asciidoc index 269ad8f96f069..48f74314560e6 100644 --- a/docs/reference/ml/common/apis/get-ml-info.asciidoc +++ b/docs/reference/ml/common/apis/get-ml-info.asciidoc @@ -97,6 +97,10 @@ This is a possible response: "GMT", "UTC" ] + }, + { + "type": "limit", + "max_token_count": "100" } ] }, diff --git a/x-pack/plugin/build.gradle b/x-pack/plugin/build.gradle index c43c1410688d3..56521189fcfdb 100644 --- a/x-pack/plugin/build.gradle +++ b/x-pack/plugin/build.gradle @@ -105,6 +105,7 @@ tasks.named("yamlRestTestV7CompatTransform").configure { task -> "roles/30_prohibited_role_query/Test use prohibited query inside role query", "put role request with a term lookup (deprecated) and type. Requires validation in REST layer" ) + task.skipTest("ml/jobs_crud/Test update job", "Behaviour change #89824 - added limit filter to categorization analyzer") task.skipTest("ml/jobs_crud/Test create job with delimited format", "removing undocumented functionality") task.skipTest("ml/datafeeds_crud/Test update datafeed to point to missing job", "behaviour change #44752 - not allowing to update datafeed job_id") task.skipTest( diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java index cf2af9ba2775e..fca4979261dd0 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java @@ -61,6 +61,8 @@ public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeab public static final ParseField TOKEN_FILTERS = AnalyzeAction.Fields.TOKEN_FILTERS; public static final ParseField CHAR_FILTERS = AnalyzeAction.Fields.CHAR_FILTERS; + public static final int MAX_TOKEN_COUNT = 100; + /** * This method is only used in the unit tests - in production code this config is always parsed as a fragment. */ @@ -204,6 +206,7 @@ public static CategorizationAnalyzerConfig buildStandardCategorizationAnalyzer(L .addCategorizationFilters(categorizationFilters) .setTokenizer("ml_standard") .addDateWordsTokenFilter() + .addLimitFilter() .build(); } @@ -429,6 +432,14 @@ Builder addDateWordsTokenFilter() { return this; } + Builder addLimitFilter() { + Map limitFilter = new HashMap<>(); + limitFilter.put("type", "limit"); + limitFilter.put("max_token_count", MAX_TOKEN_COUNT); + addTokenFilter(limitFilter); + return this; + } + /** * Create a config validating only structure, not exact analyzer/tokenizer/filter names */ diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizeTextAggregatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizeTextAggregatorTests.java index ebd7b4ce4da61..e01a90d536f11 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizeTextAggregatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizeTextAggregatorTests.java @@ -12,6 +12,7 @@ import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.analysis.common.CommonAnalysisPlugin; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.env.TestEnvironment; @@ -45,7 +46,7 @@ protected AnalysisModule createAnalysisModule() throws Exception { TestEnvironment.newEnvironment( Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build() ), - List.of(new MachineLearning(Settings.EMPTY)) + List.of(new MachineLearning(Settings.EMPTY), new CommonAnalysisPlugin()) ); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java index 4b016e433e39f..f4d583e1dfa11 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java @@ -509,6 +509,152 @@ public void testMlStandardCategorizationAnalyzer() throws IOException { ), categorizationAnalyzer.tokenizeField("nginx_error", NGINX_ERROR_EXAMPLE) ); + + // Limited to CategorizationAnalyzerConfig.MAX_TOKEN_COUNT (100) tokens + assertEquals( + Arrays.asList( + "took", + "errors", + "true", + "items", + "create", + "index", + "internal.alerts-security.alerts-default-000003", + "id", + "status", + "error", + "type", + "mapper_parsing_exception", + "reason", + "failed", + "to", + "parse", + "field", + "process.parent.command_line", + "of", + "type", + "wildcard", + "in", + "document", + "with", + "id", + "Preview", + "of", + "field", + "s", + "value", + "R", + "Q", + "M", + "t", + "k", + "P", + "r", + "o", + "T", + "g", + "I", + "r", + "R", + "N", + "P", + "R", + "I", + "N", + "t", + "n", + "i", + "l", + "n", + "r", + "T", + "R", + "i", + "t", + "w", + "n", + "t", + "s", + "i", + "i", + "l", + "r", + "t", + "s", + "M", + "i", + "Q", + "N", + "L", + "t", + "v", + "r", + "s", + "s", + "w", + "o", + "n", + "i", + "n", + "m", + "i", + "r", + "g", + "n", + "i", + "i", + "l", + "n", + "o", + "n", + "i", + "r", + "t", + "u", + "k", + "l" + ), + categorizationAnalyzer.tokenizeField( + "multiline", + "{\"took\":529,\"errors\":true,\"items\":[{\"create\":{\"_index\":\".internal.alerts-security.alerts-default-000003\"," + + "\"_id\":\"5b87a604eb4b7bf22a74d3bdda7855f64080a3c620d13563ed000f64670964a8\",\"status\":400,\"error\":{\"type\":" + + "\"mapper_parsing_exception\",\"reason\":\"failed to parse field [process.parent.command_line] of type [wildcard]" + + " in document with id '5b87a604eb4b7bf22a74d3bdda7855f64080a3c620d13563ed000f64670964a8'. Preview of field's" + + " value: '{1220=B, 1217=R, 1216=/, 1215=b, 1214=d, 1213=/, 1212=Q, 1211=M, 1210=t, 1219=B, 1218=A, 1231=k," + + " 1230=-, 0=\\\", 1228= , 1=C, 1227=\\\", 2=:, 1226=\\\\, 3=\\\\, 1225=\\\", 4=P, 1224=1, 5=r, 1223=~, 6=o," + + " 1222=T, 7=g, 1221=I, 8=r, 800=R, 9=a, 801=A, 802=~, 803=3, 804=/, 805=N, 806=P, 807=R, 1229= , 808=I, 809=N," + + " 1000=\\\\, 1242=_, 1241=t, 1240=e, 1239=n, 1238=i, 1237= , 1236=l, 1235=e, 1234=n, 1233=r, 810=T, 1232=e," + + " 811=~, 812=1, 813=/, 814=R, 815=a, 816=b, 817=b, 818=i, 819=t, 1011=w, 1253=n, 1010= , 1252=e, 1251=t, 1250=s," + + " 1008=i, 1007=b, 1249=i, 1006=b, 1248=l, 1005=a, 1247=_, 1004=r, 1246=t, 1003=-, 1245=s, 820=M, 1002= ," + + " 1244=i, 821=Q, 1001=\\\", 1243=d, 822=/, 823=E, 824=N, 825=A, 826=B, 827=L, 828=E, 829=~, 1009=t, 1022=v," + + " 1264= , 1021=r, 1263=2, 1020=e, 1262=7, 1261=6, 1260=5, 1019=s, 1018=_, 1017=s, 1259=2, 1016=w, 1258= ," + + " 1015=o, 1257=n, 830=1, 1014=d, 1256=i, 831=\\\", 1013=n, 1255=m, 832=\\\\, 1012=i, 1254=_, 833=\\\"," + + " 834= , 835=-, 836=r, 837=a, 838=b, 839=b, 1033= , 1275=e, 1032=g, 1274=n, 1031=i, 1273=i, 1030=f," + + " 1272= , 1271=l, 1270=e, 1029=n, 1028=o, 1027=c, 1269=n, 840=i, 1026=_, 1268=r, 841=t, 1025=e, 1267=e," + + " 600=u, 842= , 1024=c, 1266=k, 601=l, 843=p, 1023=i, 1265=-, 602=t, 844=l, 603=_, 845=u, 604=f, 846=g, 605=i," + + " 847=i, 606=l, 848=n, 607=e, 849=s, 608= , 609=\\\\, 1044=R, 1286=t, 1043=G, 1285=s, 1042=O, 1284=i, 1041=R," + + " 1283=l, 1040=P, 1282=_, 1281=t, 1280=s, 1039=/, 850=_, 1038=:, 851=d, 1037=C, 1279=i, 610=\\\", 852=i," + + " 1036=\\\", 1278=d, 611=\\\", 853=r, 1035=\\\", 1277=_, 612=C, 854= , 1034=\\\\, 1276=t, 613=:," + + " 855=\\\\, 614=/, 856=\\\", 615=P, 857=\\\", 616=R, 858=C, 617=O, 859=:, 618=G, 619=R, 1055=~, 1297=7, 1054=T," + + " 1296=6, 1053=N, 1295=5, 1052=I, 1294=2, 1051=R, 1293= , 1050=P, 1292=x, 1291=a, 1290=m, 860=/, 861=P, 1049=N," + + " 620=A, 862=R, 1048=/, 621=~, 863=O, 1047=3, 1289=_, 622=3, 864=G, 1046=~, 1288=n, 623=/, 865=R, 1045=A, 1287=e," + + " 624=N, 866=A, 625=P, 867=~, 626=R, 868=1, 627=I, 869=/, 628=N, 629=T, 1066=/, 1065=Q, 1064=M, 1063=t, 1062=i," + + " 1061=b, 1060=b, 870=N, 871=P, 630=~, 872=R, 631=1, 873=I, 1059=a, 632=/, 874=N, 1058=R, 633=R, 875=T, 1057=/," + + " 1299= , 634=a, 876=~, 1056=1, 1298=2, 635=b, 877=1, 636=b, 878=/, 637=i, 879=R, 638=t, 639=M, 1077=o, 1076=c," + + " 1075=., 1074=q, 1073=m, 1072=t, 1071=i, 1070=b, 880=A, 881=B, 640=Q, 882=B, 641=/, 883=I, 400=1, 642=l, 884=T," + + " 401=2, 643=o, 885=~, 1069=b, 402=8, 644=g, 886=1, 1068=a, 403=0, 645=/, 887=., 1067=r, 404=0, 646=R, 888=1," + + " 405=0, 647=A, 889=0, 406= , 648=B, 407= , 649=B, 408=-, 409=k, 1080=i, 1088=s, 1087=o, 1086=-, 1085= ," + + " 1084=\\\", 1083=\\\\, 1082=\\\", 890=/, 1081=g, 891=p, 650=I, 892=l, 651=T, 893=u, 410=e, 652=~, 894=g," + + " 411=r, 653=1, 895=i, 412=n, 654=., 896=n, 413=e, 655=L, 897=s, 1079=f, 414=l, 656=O, 898=\\\", 1078=n," + + " 415= , 657=G, 899=\\\\, 416=i, 658=\\\", 417=n, 659=\\\\, 418=e, 419=t, 1091=o, 1090=m, 1099=_, 1098=t," + + " 1097=r, 1096=a, 1095=t, 1094=s, 1093= , 1092=n, 660=\\\", 661= , 420=_, 662=-, 421=d, 663=r, 422=e, 664=a," + + " 423=f, 665=b, 424=a, 666=b, 425=u, 667=i, 1089=_, 426=l, 668=t, 427=t, 669= , 428=_, 429=c, 670=l, 671=a," + + " 430=o, 672=g, 431=n, 673=e, 432=n, 674=r, 433=e, 675=_, 434=c, 676=u, 435=t, 677=p, 436=_, 678=g, 437=o," + + " 679=r, 438=p, 439=t, 680=a, 681=d, 440=i, 682=e, 441=o, 683=_, 200= , 442=n, 684=f, 201=s, 443=s, 685=i," + + " 202=t, 444= , 686=l, 203=a, 445=\\\", 687=e, 204=r, 446=[, 688= , 205=t, 447={, 689=\\\\, 206=_, 448=n," + + " 207=s, 449=o, 208=a, 209=s, 690=\\\", 691=\\\", 450=d, 692=C, 451=e, 693=:, 210=l," + ) + ); } } diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_crud.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_crud.yml index 88682c18b3ad1..6113569f63035 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_crud.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_crud.yml @@ -520,7 +520,7 @@ } } - match: { job_id: "jobs-crud-update-job" } - - length: { analysis_config.categorization_analyzer.filter: 1 } + - length: { analysis_config.categorization_analyzer.filter: 2 } - match: { analysis_config.categorization_analyzer.tokenizer: "ml_standard" } - length: { analysis_config.categorization_analyzer.char_filter: 3 } - match: { analysis_config.categorization_analyzer.char_filter.0: "first_line_with_letters" }