[ML] Performance improvements for categorization jobs (#89824)
Categorization of strings which break down into a huge number of tokens can cause the C++ backend process to choke - see elastic/ml-cpp#2403.

This PR adds a limit filter to the default categorization analyzer, capping the number of tokens passed to the backend at 100.

Unfortunately this isn't a panacea for all the issues surrounding categorization of large, many-token messages, as verification checks on the frontend can also fail when calls to the datafeed _preview API return an excessive amount of data.
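
For reference, the limit token filter used here is a standard Elasticsearch filter that truncates the token stream once max_token_count tokens have been emitted. A minimal sketch of its behaviour via the _analyze API (a cap of 3 is used for brevity; the categorization analyzer uses 100):

POST _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    { "type": "limit", "max_token_count": 3 }
  ],
  "text": "one two three four five"
}

Only "one", "two" and "three" are returned; tokens past the cap are silently discarded, which is what now happens before tokens are handed to the backend.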
edsavage authored Sep 8, 2022
1 parent f16110d commit fd20027
Showing 6 changed files with 165 additions and 2 deletions.
4 changes: 4 additions & 0 deletions docs/reference/ml/common/apis/get-ml-info.asciidoc
@@ -97,6 +97,10 @@ This is a possible response:
"GMT",
"UTC"
]
},
{
"type": "limit",
"max_token_count": "100"
}
]
},
1 change: 1 addition & 0 deletions x-pack/plugin/build.gradle
@@ -105,6 +105,7 @@ tasks.named("yamlRestTestV7CompatTransform").configure { task ->
"roles/30_prohibited_role_query/Test use prohibited query inside role query",
"put role request with a term lookup (deprecated) and type. Requires validation in REST layer"
)
task.skipTest("ml/jobs_crud/Test update job", "Behaviour change #89824 - added limit filter to categorization analyzer")
task.skipTest("ml/jobs_crud/Test create job with delimited format", "removing undocumented functionality")
task.skipTest("ml/datafeeds_crud/Test update datafeed to point to missing job", "behaviour change #44752 - not allowing to update datafeed job_id")
task.skipTest(
@@ -61,6 +61,8 @@ public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeab
public static final ParseField TOKEN_FILTERS = AnalyzeAction.Fields.TOKEN_FILTERS;
public static final ParseField CHAR_FILTERS = AnalyzeAction.Fields.CHAR_FILTERS;

public static final int MAX_TOKEN_COUNT = 100;

/**
* This method is only used in the unit tests - in production code this config is always parsed as a fragment.
*/
@@ -204,6 +206,7 @@ public static CategorizationAnalyzerConfig buildStandardCategorizationAnalyzer(L
.addCategorizationFilters(categorizationFilters)
.setTokenizer("ml_standard")
.addDateWordsTokenFilter()
.addLimitFilter()
.build();
}

@@ -429,6 +432,14 @@ Builder addDateWordsTokenFilter() {
return this;
}

Builder addLimitFilter() {
Map<String, Object> limitFilter = new HashMap<>();
limitFilter.put("type", "limit");
limitFilter.put("max_token_count", MAX_TOKEN_COUNT);
addTokenFilter(limitFilter);
return this;
}

/**
* Create a config validating only structure, not exact analyzer/tokenizer/filter names
*/
@@ -12,6 +12,7 @@
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.analysis.common.CommonAnalysisPlugin;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.TestEnvironment;
@@ -45,7 +46,7 @@ protected AnalysisModule createAnalysisModule() throws Exception {
TestEnvironment.newEnvironment(
Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build()
),
List.of(new MachineLearning(Settings.EMPTY))
List.of(new MachineLearning(Settings.EMPTY), new CommonAnalysisPlugin())
);
}

@@ -509,6 +509,152 @@ public void testMlStandardCategorizationAnalyzer() throws IOException {
),
categorizationAnalyzer.tokenizeField("nginx_error", NGINX_ERROR_EXAMPLE)
);

// Limited to CategorizationAnalyzerConfig.MAX_TOKEN_COUNT (100) tokens
assertEquals(
Arrays.asList(
"took",
"errors",
"true",
"items",
"create",
"index",
"internal.alerts-security.alerts-default-000003",
"id",
"status",
"error",
"type",
"mapper_parsing_exception",
"reason",
"failed",
"to",
"parse",
"field",
"process.parent.command_line",
"of",
"type",
"wildcard",
"in",
"document",
"with",
"id",
"Preview",
"of",
"field",
"s",
"value",
"R",
"Q",
"M",
"t",
"k",
"P",
"r",
"o",
"T",
"g",
"I",
"r",
"R",
"N",
"P",
"R",
"I",
"N",
"t",
"n",
"i",
"l",
"n",
"r",
"T",
"R",
"i",
"t",
"w",
"n",
"t",
"s",
"i",
"i",
"l",
"r",
"t",
"s",
"M",
"i",
"Q",
"N",
"L",
"t",
"v",
"r",
"s",
"s",
"w",
"o",
"n",
"i",
"n",
"m",
"i",
"r",
"g",
"n",
"i",
"i",
"l",
"n",
"o",
"n",
"i",
"r",
"t",
"u",
"k",
"l"
),
categorizationAnalyzer.tokenizeField(
"multiline",
"{\"took\":529,\"errors\":true,\"items\":[{\"create\":{\"_index\":\".internal.alerts-security.alerts-default-000003\","
+ "\"_id\":\"5b87a604eb4b7bf22a74d3bdda7855f64080a3c620d13563ed000f64670964a8\",\"status\":400,\"error\":{\"type\":"
+ "\"mapper_parsing_exception\",\"reason\":\"failed to parse field [process.parent.command_line] of type [wildcard]"
+ " in document with id '5b87a604eb4b7bf22a74d3bdda7855f64080a3c620d13563ed000f64670964a8'. Preview of field's"
+ " value: '{1220=B, 1217=R, 1216=/, 1215=b, 1214=d, 1213=/, 1212=Q, 1211=M, 1210=t, 1219=B, 1218=A, 1231=k,"
+ " 1230=-, 0=\\\", 1228= , 1=C, 1227=\\\", 2=:, 1226=\\\\, 3=\\\\, 1225=\\\", 4=P, 1224=1, 5=r, 1223=~, 6=o,"
+ " 1222=T, 7=g, 1221=I, 8=r, 800=R, 9=a, 801=A, 802=~, 803=3, 804=/, 805=N, 806=P, 807=R, 1229= , 808=I, 809=N,"
+ " 1000=\\\\, 1242=_, 1241=t, 1240=e, 1239=n, 1238=i, 1237= , 1236=l, 1235=e, 1234=n, 1233=r, 810=T, 1232=e,"
+ " 811=~, 812=1, 813=/, 814=R, 815=a, 816=b, 817=b, 818=i, 819=t, 1011=w, 1253=n, 1010= , 1252=e, 1251=t, 1250=s,"
+ " 1008=i, 1007=b, 1249=i, 1006=b, 1248=l, 1005=a, 1247=_, 1004=r, 1246=t, 1003=-, 1245=s, 820=M, 1002= ,"
+ " 1244=i, 821=Q, 1001=\\\", 1243=d, 822=/, 823=E, 824=N, 825=A, 826=B, 827=L, 828=E, 829=~, 1009=t, 1022=v,"
+ " 1264= , 1021=r, 1263=2, 1020=e, 1262=7, 1261=6, 1260=5, 1019=s, 1018=_, 1017=s, 1259=2, 1016=w, 1258= ,"
+ " 1015=o, 1257=n, 830=1, 1014=d, 1256=i, 831=\\\", 1013=n, 1255=m, 832=\\\\, 1012=i, 1254=_, 833=\\\","
+ " 834= , 835=-, 836=r, 837=a, 838=b, 839=b, 1033= , 1275=e, 1032=g, 1274=n, 1031=i, 1273=i, 1030=f,"
+ " 1272= , 1271=l, 1270=e, 1029=n, 1028=o, 1027=c, 1269=n, 840=i, 1026=_, 1268=r, 841=t, 1025=e, 1267=e,"
+ " 600=u, 842= , 1024=c, 1266=k, 601=l, 843=p, 1023=i, 1265=-, 602=t, 844=l, 603=_, 845=u, 604=f, 846=g, 605=i,"
+ " 847=i, 606=l, 848=n, 607=e, 849=s, 608= , 609=\\\\, 1044=R, 1286=t, 1043=G, 1285=s, 1042=O, 1284=i, 1041=R,"
+ " 1283=l, 1040=P, 1282=_, 1281=t, 1280=s, 1039=/, 850=_, 1038=:, 851=d, 1037=C, 1279=i, 610=\\\", 852=i,"
+ " 1036=\\\", 1278=d, 611=\\\", 853=r, 1035=\\\", 1277=_, 612=C, 854= , 1034=\\\\, 1276=t, 613=:,"
+ " 855=\\\\, 614=/, 856=\\\", 615=P, 857=\\\", 616=R, 858=C, 617=O, 859=:, 618=G, 619=R, 1055=~, 1297=7, 1054=T,"
+ " 1296=6, 1053=N, 1295=5, 1052=I, 1294=2, 1051=R, 1293= , 1050=P, 1292=x, 1291=a, 1290=m, 860=/, 861=P, 1049=N,"
+ " 620=A, 862=R, 1048=/, 621=~, 863=O, 1047=3, 1289=_, 622=3, 864=G, 1046=~, 1288=n, 623=/, 865=R, 1045=A, 1287=e,"
+ " 624=N, 866=A, 625=P, 867=~, 626=R, 868=1, 627=I, 869=/, 628=N, 629=T, 1066=/, 1065=Q, 1064=M, 1063=t, 1062=i,"
+ " 1061=b, 1060=b, 870=N, 871=P, 630=~, 872=R, 631=1, 873=I, 1059=a, 632=/, 874=N, 1058=R, 633=R, 875=T, 1057=/,"
+ " 1299= , 634=a, 876=~, 1056=1, 1298=2, 635=b, 877=1, 636=b, 878=/, 637=i, 879=R, 638=t, 639=M, 1077=o, 1076=c,"
+ " 1075=., 1074=q, 1073=m, 1072=t, 1071=i, 1070=b, 880=A, 881=B, 640=Q, 882=B, 641=/, 883=I, 400=1, 642=l, 884=T,"
+ " 401=2, 643=o, 885=~, 1069=b, 402=8, 644=g, 886=1, 1068=a, 403=0, 645=/, 887=., 1067=r, 404=0, 646=R, 888=1,"
+ " 405=0, 647=A, 889=0, 406= , 648=B, 407= , 649=B, 408=-, 409=k, 1080=i, 1088=s, 1087=o, 1086=-, 1085= ,"
+ " 1084=\\\", 1083=\\\\, 1082=\\\", 890=/, 1081=g, 891=p, 650=I, 892=l, 651=T, 893=u, 410=e, 652=~, 894=g,"
+ " 411=r, 653=1, 895=i, 412=n, 654=., 896=n, 413=e, 655=L, 897=s, 1079=f, 414=l, 656=O, 898=\\\", 1078=n,"
+ " 415= , 657=G, 899=\\\\, 416=i, 658=\\\", 417=n, 659=\\\\, 418=e, 419=t, 1091=o, 1090=m, 1099=_, 1098=t,"
+ " 1097=r, 1096=a, 1095=t, 1094=s, 1093= , 1092=n, 660=\\\", 661= , 420=_, 662=-, 421=d, 663=r, 422=e, 664=a,"
+ " 423=f, 665=b, 424=a, 666=b, 425=u, 667=i, 1089=_, 426=l, 668=t, 427=t, 669= , 428=_, 429=c, 670=l, 671=a,"
+ " 430=o, 672=g, 431=n, 673=e, 432=n, 674=r, 433=e, 675=_, 434=c, 676=u, 435=t, 677=p, 436=_, 678=g, 437=o,"
+ " 679=r, 438=p, 439=t, 680=a, 681=d, 440=i, 682=e, 441=o, 683=_, 200= , 442=n, 684=f, 201=s, 443=s, 685=i,"
+ " 202=t, 444= , 686=l, 203=a, 445=\\\", 687=e, 204=r, 446=[, 688= , 205=t, 447={, 689=\\\\, 206=_, 448=n,"
+ " 207=s, 449=o, 208=a, 209=s, 690=\\\", 691=\\\", 450=d, 692=C, 451=e, 693=:, 210=l,"
)
);
}
}

@@ -520,7 +520,7 @@
}
}
- match: { job_id: "jobs-crud-update-job" }
- length: { analysis_config.categorization_analyzer.filter: 1 }
- length: { analysis_config.categorization_analyzer.filter: 2 }
- match: { analysis_config.categorization_analyzer.tokenizer: "ml_standard" }
- length: { analysis_config.categorization_analyzer.char_filter: 3 }
- match: { analysis_config.categorization_analyzer.char_filter.0: "first_line_with_letters" }
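The expected filter count rises from 1 to 2 because the default categorization analyzer's token filter chain now ends with the limit filter, after the existing date-words stop filter. Roughly, the job's analysis_config now reports a shape like this (abridged sketch; char_filter entries and most stop words are elided):

"categorization_analyzer": {
  "tokenizer": "ml_standard",
  "filter": [
    { "type": "stop", "stopwords": [ "Monday", "Tuesday", ..., "GMT", "UTC" ] },
    { "type": "limit", "max_token_count": "100" }
  ]
}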
