-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Couch stats resource tracker v2 #5213
base: main
Are you sure you want to change the base?
Changes from 15 commits
1fa358d
e67690a
e10d82e
b3a38c2
a4fc931
d2c7eba
7ad90ad
03fa0cb
bb00b81
edd5d0e
f06d4a4
c02d57b
5864ba3
e1fead4
4535f1a
48db790
4db08c9
1d34432
46fba8e
5ac8a91
ce91a40
de9a430
1a9e4d1
ba1baaa
e213714
8d0e9fd
1702115
4c091bc
ad84bf8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
handle_replicate_req/1, | ||
handle_reload_query_servers_req/1, | ||
handle_task_status_req/1, | ||
handle_resource_status_req/1, | ||
handle_up_req/1, | ||
handle_utils_dir_req/1, | ||
handle_utils_dir_req/2, | ||
|
@@ -224,6 +225,110 @@ handle_task_status_req(#httpd{method = 'GET'} = Req) -> | |
handle_task_status_req(Req) -> | ||
send_method_not_allowed(Req, "GET,HEAD"). | ||
|
||
handle_resource_status_req(#httpd{method = 'POST'} = Req) -> | ||
ok = chttpd:verify_is_server_admin(Req), | ||
chttpd:validate_ctype(Req, "application/json"), | ||
{Props} = chttpd:json_body_obj(Req), | ||
Action = proplists:get_value(<<"action">>, Props), | ||
Key = proplists:get_value(<<"key">>, Props), | ||
Val = proplists:get_value(<<"val">>, Props), | ||
|
||
CountBy = fun couch_stats_resource_tracker:count_by/1, | ||
GroupBy = fun couch_stats_resource_tracker:group_by/2, | ||
SortedBy1 = fun couch_stats_resource_tracker:sorted_by/1, | ||
SortedBy2 = fun couch_stats_resource_tracker:sorted_by/2, | ||
ConvertEle = fun(K) -> list_to_existing_atom(binary_to_list(K)) end, | ||
nickva marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ConvertList = fun(L) -> [ConvertEle(E) || E <- L] end, | ||
ToJson = fun couch_stats_resource_tracker:term_to_flat_json/1, | ||
JsonKeys = fun(PL) -> [[ToJson(K), V] || {K, V} <- PL] end, | ||
|
||
Fun = case {Action, Key, Val} of | ||
{<<"count_by">>, Keys, undefined} when is_list(Keys) -> | ||
Keys1 = [ConvertEle(K) || K <- Keys], | ||
fun() -> CountBy(Keys1) end; | ||
{<<"count_by">>, Key, undefined} -> | ||
Key1 = ConvertEle(Key), | ||
fun() -> CountBy(Key1) end; | ||
{<<"group_by">>, Keys, Vals} when is_list(Keys) andalso is_list(Vals) -> | ||
Keys1 = ConvertList(Keys), | ||
Vals1 = ConvertList(Vals), | ||
fun() -> GroupBy(Keys1, Vals1) end; | ||
{<<"group_by">>, Key, Vals} when is_list(Vals) -> | ||
Key1 = ConvertEle(Key), | ||
Vals1 = ConvertList(Vals), | ||
fun() -> GroupBy(Key1, Vals1) end; | ||
{<<"group_by">>, Keys, Val} when is_list(Keys) -> | ||
Keys1 = ConvertList(Keys), | ||
Val1 = ConvertEle(Val), | ||
fun() -> GroupBy(Keys1, Val1) end; | ||
{<<"group_by">>, Key, Val} -> | ||
Key1 = ConvertEle(Key), | ||
Val1 = ConvertList(Val), | ||
fun() -> GroupBy(Key1, Val1) end; | ||
|
||
{<<"sorted_by">>, Key, undefined} -> | ||
Key1 = ConvertEle(Key), | ||
fun() -> JsonKeys(SortedBy1(Key1)) end; | ||
{<<"sorted_by">>, Keys, undefined} when is_list(Keys) -> | ||
Keys1 = [ConvertEle(K) || K <- Keys], | ||
fun() -> JsonKeys(SortedBy1(Keys1)) end; | ||
{<<"sorted_by">>, Keys, Vals} when is_list(Keys) andalso is_list(Vals) -> | ||
Keys1 = ConvertList(Keys), | ||
Vals1 = ConvertList(Vals), | ||
fun() -> JsonKeys(SortedBy2(Keys1, Vals1)) end; | ||
{<<"sorted_by">>, Key, Vals} when is_list(Vals) -> | ||
Key1 = ConvertEle(Key), | ||
Vals1 = ConvertList(Vals), | ||
fun() -> JsonKeys(SortedBy2(Key1, Vals1)) end; | ||
{<<"sorted_by">>, Keys, Val} when is_list(Keys) -> | ||
Keys1 = ConvertList(Keys), | ||
Val1 = ConvertEle(Val), | ||
fun() -> JsonKeys(SortedBy2(Keys1, Val1)) end; | ||
{<<"sorted_by">>, Key, Val} -> | ||
Key1 = ConvertEle(Key), | ||
Val1 = ConvertList(Val), | ||
fun() -> JsonKeys(SortedBy2(Key1, Val1)) end; | ||
_ -> | ||
throw({badrequest, invalid_resource_request}) | ||
end, | ||
|
||
Fun1 = fun() -> | ||
case Fun() of | ||
Map when is_map(Map) -> | ||
{maps:fold( | ||
fun | ||
(_K,0,A) -> A; %% TODO: Skip 0 value entries? | ||
(K,V,A) -> [{ToJson(K), V} | A] | ||
end, | ||
[], Map)}; | ||
List when is_list(List) -> | ||
List | ||
end | ||
end, | ||
|
||
{Resp, _Bad} = rpc:multicall(erlang, apply, [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. erpc might be a better option https://www.erlang.org/doc/apps/kernel/erpc.html#multicall/3 |
||
fun() -> | ||
{node(), Fun1()} | ||
end, | ||
[] | ||
]), | ||
%%io:format("{CSRT}***** GOT RESP: ~p~n", [Resp]), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Left-over debug comment |
||
send_json(Req, {Resp}); | ||
handle_resource_status_req(#httpd{method = 'GET'} = Req) -> | ||
ok = chttpd:verify_is_server_admin(Req), | ||
{Resp, Bad} = rpc:multicall(erlang, apply, [ | ||
fun() -> | ||
{node(), couch_stats_resource_tracker:active()} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of having to run in a closure just to get node() in the response, consider using an explicit list of nodes |
||
end, | ||
[] | ||
]), | ||
%% TODO: incorporate Bad responses | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. consider using erpc, the handling of bad responses is a bit cleaner there, I think |
||
send_json(Req, {Resp}); | ||
handle_resource_status_req(Req) -> | ||
ok = chttpd:verify_is_server_admin(Req), | ||
send_method_not_allowed(Req, "GET,HEAD,POST"). | ||
|
||
|
||
handle_replicate_req(#httpd{method = 'POST', user_ctx = Ctx, req_body = PostBody} = Req) -> | ||
chttpd:validate_ctype(Req, "application/json"), | ||
%% see HACK in chttpd.erl about replication | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,6 +53,8 @@ | |
-define(INTERACTIVE_EDIT, interactive_edit). | ||
-define(REPLICATED_CHANGES, replicated_changes). | ||
|
||
-define(LOG_UNEXPECTED_MSG(Msg), couch_log:warning("[~p:~p:~p/~p]{~p[~p]} Unexpected message: ~w", [?MODULE, ?LINE, ?FUNCTION_NAME, ?FUNCTION_ARITY, self(), element(2, process_info(self(), message_queue_len)), Msg])). | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am surprised linter doesn't complain about such long line. |
||
|
||
-type branch() :: {Key::term(), Value::term(), Tree::term()}. | ||
-type path() :: {Start::pos_integer(), branch()}. | ||
-type update_type() :: replicated_changes | interactive_edit. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -306,6 +306,10 @@ | |
{type, counter}, | ||
{desc, <<"number of couch_server LRU operations skipped">>} | ||
]}. | ||
{[couchdb, couch_server, open], [ | ||
{type, counter}, | ||
{desc, <<"number of couch_server open operations invoked">>} | ||
]}. | ||
{[couchdb, query_server, vdu_rejects], [ | ||
{type, counter}, | ||
{desc, <<"number of rejections by validate_doc_update function">>} | ||
|
@@ -418,10 +422,47 @@ | |
{type, counter}, | ||
{desc, <<"number of other requests">>} | ||
]}. | ||
{[couchdb, query_server, js_filter], [ | ||
{type, counter}, | ||
{desc, <<"number of JS filter invocations">>} | ||
]}. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is already a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ahhh good catch, I didn't see those were hidden in https://github.com/apache/couchdb/blob/main/src/couch/src/couch_os_process.erl#L257-L259 I've reworked this in |
||
{[couchdb, query_server, js_filtered_docs], [ | ||
{type, counter}, | ||
{desc, <<"number of docs filtered through JS invocations">>} | ||
]}. | ||
{[couchdb, query_server, js_filter_error], [ | ||
{type, counter}, | ||
{desc, <<"number of JS filter invocation errors">>} | ||
]}. | ||
{[couchdb, legacy_checksums], [ | ||
{type, counter}, | ||
{desc, <<"number of legacy checksums found in couch_file instances">>} | ||
]}. | ||
{[couchdb, btree, folds], [ | ||
{type, counter}, | ||
{desc, <<"number of couch btree kv fold callback invocations">>} | ||
]}. | ||
{[couchdb, btree, get_node, kp_node], [ | ||
{type, counter}, | ||
{desc, <<"number of couch btree kp_nodes read">>} | ||
]}. | ||
{[couchdb, btree, get_node, kv_node], [ | ||
{type, counter}, | ||
{desc, <<"number of couch btree kv_nodes read">>} | ||
]}. | ||
Comment on lines
+437
to
+444
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wonder if this is a bit too low level? Or rather, btrees are used for view and db files. It's not clear which btrees the metrics refer to. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
So that's the thing, we don't have current visibility into the volume of The idea here is to start tracking the amount of core btree operations by the workers, allowing us to track it at a process level like we're doing with the rest of the stats in this PR, and then we'll at least be able to see the total volume of couch_btree operations induced, and then use the reports to quantify the requests/dbs/etc that performed those operations. A key aspect of the tracking here is that As for tracking of I thought about trying to funnel down into |
||
{[couchdb, btree, write_node, kp_node], [ | ||
{type, counter}, | ||
{desc, <<"number of couch btree kp_nodes written">>} | ||
]}. | ||
{[couchdb, btree, write_node, kv_node], [ | ||
{type, counter}, | ||
{desc, <<"number of couch btree kv_nodes written">>} | ||
]}. | ||
%% CSRT (couch_stats_resource_tracker) stats | ||
{[couchdb, csrt, delta_missing_t0], [ | ||
{type, counter}, | ||
{desc, <<"number of csrt contexts without a proper startime">>} | ||
]}. | ||
{[pread, exceed_eof], [ | ||
{type, counter}, | ||
{desc, <<"number of the attempts to read beyond end of db file">>} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -542,6 +542,8 @@ filter_docs(Req, Db, DDoc, FName, Docs) -> | |
{ok, filter_docs_int(Db, DDoc, FName, JsonReq, JsonDocs)} | ||
catch | ||
throw:{os_process_error, {exit_status, 1}} -> | ||
%% TODO: wire in csrt tracking | ||
couch_stats:increment_counter([couchdb, query_server, js_filter_error]), | ||
Comment on lines
+545
to
+546
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We already have a general counter for os process errors, normal exits and error exits. Probably don't need an explicit one just for filters? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 4db08c9. |
||
%% batch used too much memory, retry sequentially. | ||
Fun = fun(JsonDoc) -> | ||
filter_docs_int(Db, DDoc, FName, JsonReq, [JsonDoc]) | ||
|
@@ -550,6 +552,12 @@ filter_docs(Req, Db, DDoc, FName, Docs) -> | |
end. | ||
|
||
filter_docs_int(Db, DDoc, FName, JsonReq, JsonDocs) -> | ||
%% Count usage in _int version as this can be repeated for OS error | ||
%% Pros & cons... might not have actually processed `length(JsonDocs)` docs | ||
%% but it certainly undercounts if we count in `filter_docs/5` above | ||
%% TODO: wire in csrt tracking | ||
couch_stats:increment_counter([couchdb, query_server, js_filter]), | ||
couch_stats:increment_counter([couchdb, query_server, js_filtered_docs], length(JsonDocs)), | ||
[true, Passes] = ddoc_prompt( | ||
Db, | ||
DDoc, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,12 @@ | |
update_gauge/2 | ||
]). | ||
|
||
%% couch_stats_resource_tracker API | ||
-export([ | ||
create_context/3, | ||
maybe_track_rexi_init_p/1 | ||
]). | ||
|
||
-type response() :: ok | {error, unknown_metric} | {error, invalid_metric}. | ||
-type stat() :: {any(), [{atom(), any()}]}. | ||
|
||
|
@@ -49,6 +55,11 @@ increment_counter(Name) -> | |
|
||
-spec increment_counter(any(), pos_integer()) -> response(). | ||
increment_counter(Name, Value) -> | ||
%% Should maybe_track_local happen before or after notify? | ||
%% If after, only currently tracked metrics declared in the app's | ||
%% stats_description.cfg will be trackable locally. Pros/cons. | ||
%io:format("NOTIFY_EXISTING_METRIC: ~p || ~p || ~p~n", [Name, Op, Type]), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Left-over debug statement |
||
ok = maybe_track_local_counter(Name, Value), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this is enabled for all requests, a bit worried about the performance here as we add an ets operation over a simple integer counter bump. Was there any noticeable performance impact from it, during perf runs? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. An essential part of the work done in this PR is to ensure that this is as minimally impactful as possible. We add ets config lookup to check if it's enabled, and then we perform the same type of simple integer counter bump utilizing @nickva you did have me change my hard coded atom function to one utilizing maps (although I did mange to use static maps), so I'll make sure to do some more testing with the final version to compare. I plan on doing some comparisons between the start of this PR and the final version, to make sure there weren't any regressions induced by the changes. |
||
case couch_stats_util:get_counter(Name, stats()) of | ||
{ok, Ctx} -> couch_stats_counter:increment(Ctx, Value); | ||
{error, Error} -> {error, Error} | ||
|
@@ -100,6 +111,25 @@ stats() -> | |
now_sec() -> | ||
erlang:monotonic_time(second). | ||
|
||
%% Only potentially track positive increments to counters | ||
-spec maybe_track_local_counter(any(), any()) -> ok. | ||
maybe_track_local_counter(Name, Val) when is_integer(Val) andalso Val > 0 -> | ||
%%io:format("maybe_track_local[~p]: ~p~n", [Val, Name]), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. debug left-over? |
||
couch_stats_resource_tracker:maybe_inc(Name, Val), | ||
ok; | ||
maybe_track_local_counter(_, _) -> | ||
ok. | ||
|
||
create_context(From, MFA, Nonce) -> | ||
couch_stats_resource_tracker:create_context(From, MFA, Nonce). | ||
|
||
maybe_track_rexi_init_p({M, F, _A}) -> | ||
Metric = [M, F, spawned], | ||
case couch_stats_resource_tracker:should_track(Metric) of | ||
true -> increment_counter(Metric); | ||
false -> ok | ||
end. | ||
|
||
-ifdef(TEST). | ||
|
||
-include_lib("couch/include/couch_eunit.hrl"). | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we're introducing a new API add some docs or some description (examples) how it works, and the intent behind it. What would CouchDB users use it for, how it is different than metrics, and active tasks, etc...