Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PoC Couch Stats Resource Tracker for tracking process local resource usage #4812

Draft
wants to merge 29 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
be84279
PoC couch_stats local resource usage tracking
chewbranca Jul 18, 2022
e5e4820
Instrument more stats and rudimentary delta accumulation
chewbranca Oct 23, 2023
ecda8f3
Add more usage tracking and core functionality
chewbranca Oct 30, 2023
73863a7
Embed worker usage deltas in rexi:ping
chewbranca Oct 31, 2023
12e0267
WIP: core delta aggregation working
chewbranca Nov 9, 2023
84be4af
TMP: rewire rexi to fail on unexpected messages
chewbranca Nov 10, 2023
19224af
Add baseline group by and sort by aggregations
chewbranca Nov 14, 2023
cafa512
Add watchdog scanning for unmonitored processes
chewbranca Nov 14, 2023
996c385
Set dbname context properly
chewbranca Nov 15, 2023
1bbf7a9
Add unsafe_foldl for perf testing
chewbranca Nov 16, 2023
e70ba4d
Rework csrt context setting
chewbranca Nov 18, 2023
715b524
WIP: ugly but working HTTP API around groupings
chewbranca Nov 18, 2023
be8ab04
Cleanup debug info
chewbranca Nov 20, 2023
4c05a0f
Make CSRT toggle-able and rework delta sending
chewbranca Nov 20, 2023
eb20b5d
Remove debug exit clause
chewbranca Nov 20, 2023
f4a712c
Declare missing metrics
chewbranca Nov 21, 2023
c758cb7
WIP: relax record constraints...
chewbranca Nov 21, 2023
4f00910
Use Default in config:get/3 on ets table error
chewbranca Nov 30, 2023
887a7f2
Switch to delta extraction from messages
chewbranca Nov 30, 2023
1a1a584
Revert "Use Default in config:get/3 on ets table error"
chewbranca Nov 30, 2023
db1f874
Flush chttpd_db monitor refs on demonitor
chewbranca Dec 9, 2023
50495bf
TMP Hack around issue #4909
chewbranca Dec 18, 2023
9578773
Avoid mem3_rpc:rexi_call selective receive
chewbranca Dec 18, 2023
f77583a
Pass tests and toggle csrt
chewbranca Dec 19, 2023
ac17510
Handle delta in fabric rpc tests
chewbranca Dec 19, 2023
c500f0f
Conditionally log reports
chewbranca Dec 19, 2023
d7e9bd9
Do not persist doc size test config settings
chewbranca Jan 5, 2024
f68005a
Configurable logging and no more io_lib:format
chewbranca Jan 8, 2024
0371589
Limit resource usage output and fix keys
chewbranca Feb 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/chttpd/src/chttpd.erl
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ stop() ->
mochiweb_http:stop(?MODULE).

handle_request(MochiReq0) ->
couch_util:clear_pdict(), %% Make sure we start clean, everytime
erlang:put(?REWRITE_COUNT, 0),
MochiReq = couch_httpd_vhost:dispatch_host(MochiReq0),
handle_request_int(MochiReq).
Expand Down Expand Up @@ -323,6 +324,9 @@ handle_request_int(MochiReq) ->
% Save client socket so that it can be monitored for disconnects
chttpd_util:mochiweb_client_req_set(MochiReq),

%% This is probably better in before_request, but having Path is nice
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is fine IMO

couch_stats_resource_tracker:create_coordinator_context(HttpReq0, Path),

{HttpReq2, Response} =
case before_request(HttpReq0) of
{ok, HttpReq1} ->
Expand All @@ -347,12 +351,14 @@ handle_request_int(MochiReq) ->
#httpd_resp{status = ok, response = Resp} ->
{ok, Resp};
#httpd_resp{status = aborted, reason = Reason} ->
couch_log:error("Response abnormally terminated: ~p", [Reason]),
couch_log:error("Response abnormally terminated: ~w", [Reason]),
exit({shutdown, Reason})
end.

before_request(HttpReq) ->
try
%% TODO: re-enable this here once we have Path
%% couch_stats_resource_tracker:create_coordinator_context(HttpReq),
chttpd_stats:init(),
chttpd_plugin:before_request(HttpReq)
catch
Expand All @@ -372,6 +378,7 @@ after_request(HttpReq, HttpResp0) ->
HttpResp2 = update_stats(HttpReq, HttpResp1),
chttpd_stats:report(HttpReq, HttpResp2),
maybe_log(HttpReq, HttpResp2),
%%couch_stats_resource_tracker:close_context(),
HttpResp2.

process_request(#httpd{mochi_req = MochiReq} = HttpReq) ->
Expand Down Expand Up @@ -409,6 +416,7 @@ handle_req_after_auth(HandlerKey, HttpReq) ->
HandlerKey,
fun chttpd_db:handle_request/1
),
couch_stats_resource_tracker:set_context_handler_fun(HandlerFun),
AuthorizedReq = chttpd_auth:authorize(
possibly_hack(HttpReq),
fun chttpd_auth_request:authorize_request/1
Expand Down
3 changes: 2 additions & 1 deletion src/chttpd/src/chttpd_db.erl
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@

% Database request handlers
handle_request(#httpd{path_parts = [DbName | RestParts], method = Method} = Req) ->
couch_stats_resource_tracker:set_context_dbname(DbName),
case {Method, RestParts} of
{'PUT', []} ->
create_db_req(Req, DbName);
Expand Down Expand Up @@ -2248,7 +2249,7 @@ monitor_attachments(Att) ->
monitor_attachments([Att]).

demonitor_refs(Refs) when is_list(Refs) ->
[demonitor(Ref) || Ref <- Refs].
[demonitor(Ref, [flush]) || Ref <- Refs].

% Return attachments which are not stubs
non_stubbed_attachments(Atts) when is_list(Atts) ->
Expand Down
1 change: 1 addition & 0 deletions src/chttpd/src/chttpd_httpd_handlers.erl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ url_handler(<<"_utils">>) -> fun chttpd_misc:handle_utils_dir_req/1;
url_handler(<<"_all_dbs">>) -> fun chttpd_misc:handle_all_dbs_req/1;
url_handler(<<"_dbs_info">>) -> fun chttpd_misc:handle_dbs_info_req/1;
url_handler(<<"_active_tasks">>) -> fun chttpd_misc:handle_task_status_req/1;
url_handler(<<"_active_resources">>) -> fun chttpd_misc:handle_resource_status_req/1;
url_handler(<<"_scheduler">>) -> fun couch_replicator_httpd:handle_scheduler_req/1;
url_handler(<<"_node">>) -> fun chttpd_node:handle_node_req/1;
url_handler(<<"_reload_query_servers">>) -> fun chttpd_misc:handle_reload_query_servers_req/1;
Expand Down
108 changes: 106 additions & 2 deletions src/chttpd/src/chttpd_misc.erl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
handle_replicate_req/1,
handle_reload_query_servers_req/1,
handle_task_status_req/1,
handle_resource_status_req/1,
handle_up_req/1,
handle_utils_dir_req/1,
handle_utils_dir_req/2,
Expand All @@ -36,8 +37,7 @@
[
send_json/2, send_json/3,
send_method_not_allowed/2,
send_chunk/2,
start_chunked_response/3
send_chunk/2
]
).

Expand Down Expand Up @@ -230,6 +230,110 @@ handle_task_status_req(#httpd{method = 'GET'} = Req) ->
handle_task_status_req(Req) ->
send_method_not_allowed(Req, "GET,HEAD").

handle_resource_status_req(#httpd{method = 'POST'} = Req) ->
ok = chttpd:verify_is_server_admin(Req),
chttpd:validate_ctype(Req, "application/json"),
{Props} = chttpd:json_body_obj(Req),
Action = proplists:get_value(<<"action">>, Props),
Key = proplists:get_value(<<"key">>, Props),
Val = proplists:get_value(<<"val">>, Props),

CountBy = fun couch_stats_resource_tracker:count_by/1,
GroupBy = fun couch_stats_resource_tracker:group_by/2,
SortedBy1 = fun couch_stats_resource_tracker:sorted_by/1,
SortedBy2 = fun couch_stats_resource_tracker:sorted_by/2,
ConvertEle = fun(K) -> list_to_existing_atom(binary_to_list(K)) end,
ConvertList = fun(L) -> [ConvertEle(E) || E <- L] end,
ToJson = fun couch_stats_resource_tracker:term_to_flat_json/1,
JsonKeys = fun(PL) -> [[ToJson(K), V] || {K, V} <- PL] end,

Fun = case {Action, Key, Val} of
{<<"count_by">>, Keys, undefined} when is_list(Keys) ->
Keys1 = [ConvertEle(K) || K <- Keys],
fun() -> CountBy(Keys1) end;
{<<"count_by">>, Key, undefined} ->
Key1 = ConvertEle(Key),
fun() -> CountBy(Key1) end;
{<<"group_by">>, Keys, Vals} when is_list(Keys) andalso is_list(Vals) ->
Keys1 = ConvertList(Keys),
Vals1 = ConvertList(Vals),
fun() -> GroupBy(Keys1, Vals1) end;
{<<"group_by">>, Key, Vals} when is_list(Vals) ->
Key1 = ConvertEle(Key),
Vals1 = ConvertList(Vals),
fun() -> GroupBy(Key1, Vals1) end;
{<<"group_by">>, Keys, Val} when is_list(Keys) ->
Keys1 = ConvertList(Keys),
Val1 = ConvertEle(Val),
fun() -> GroupBy(Keys1, Val1) end;
{<<"group_by">>, Key, Val} ->
Key1 = ConvertEle(Key),
Val1 = ConvertList(Val),
fun() -> GroupBy(Key1, Val1) end;

{<<"sorted_by">>, Key, undefined} ->
Key1 = ConvertEle(Key),
fun() -> JsonKeys(SortedBy1(Key1)) end;
{<<"sorted_by">>, Keys, undefined} when is_list(Keys) ->
Keys1 = [ConvertEle(K) || K <- Keys],
fun() -> JsonKeys(SortedBy1(Keys1)) end;
{<<"sorted_by">>, Keys, Vals} when is_list(Keys) andalso is_list(Vals) ->
Keys1 = ConvertList(Keys),
Vals1 = ConvertList(Vals),
fun() -> JsonKeys(SortedBy2(Keys1, Vals1)) end;
{<<"sorted_by">>, Key, Vals} when is_list(Vals) ->
Key1 = ConvertEle(Key),
Vals1 = ConvertList(Vals),
fun() -> JsonKeys(SortedBy2(Key1, Vals1)) end;
{<<"sorted_by">>, Keys, Val} when is_list(Keys) ->
Keys1 = ConvertList(Keys),
Val1 = ConvertEle(Val),
fun() -> JsonKeys(SortedBy2(Keys1, Val1)) end;
{<<"sorted_by">>, Key, Val} ->
Key1 = ConvertEle(Key),
Val1 = ConvertList(Val),
fun() -> JsonKeys(SortedBy2(Key1, Val1)) end;
_ ->
throw({badrequest, invalid_resource_request})
end,

Fun1 = fun() ->
case Fun() of
Map when is_map(Map) ->
{maps:fold(
fun
(_K,0,A) -> A; %% TODO: Skip 0 value entries?
(K,V,A) -> [{ToJson(K), V} | A]
end,
[], Map)};
List when is_list(List) ->
List
end
end,

{Resp, _Bad} = rpc:multicall(erlang, apply, [
fun() ->
{node(), Fun1()}
end,
[]
]),
%%io:format("{CSRT}***** GOT RESP: ~p~n", [Resp]),
send_json(Req, {Resp});
handle_resource_status_req(#httpd{method = 'GET'} = Req) ->
ok = chttpd:verify_is_server_admin(Req),
{Resp, Bad} = rpc:multicall(erlang, apply, [
fun() ->
{node(), couch_stats_resource_tracker:active()}
end,
[]
]),
%% TODO: incorporate Bad responses
io:format("ACTIVE RESP: ~p~nBAD RESP: ~p~n", [Resp, Bad]),
send_json(Req, {Resp});
handle_resource_status_req(Req) ->
ok = chttpd:verify_is_server_admin(Req),
send_method_not_allowed(Req, "GET,HEAD,POST").

handle_replicate_req(#httpd{method = 'POST', user_ctx = Ctx, req_body = PostBody} = Req) ->
chttpd:validate_ctype(Req, "application/json"),
%% see HACK in chttpd.erl about replication
Expand Down
2 changes: 1 addition & 1 deletion src/chttpd/test/eunit/chttpd_db_doc_size_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
setup() ->
Hashed = couch_passwords:hash_admin_password(?PASS),
ok = config:set("admins", ?USER, ?b2l(Hashed), _Persist = false),
ok = config:set("couchdb", "max_document_size", "50"),
ok = config:set("couchdb", "max_document_size", "50", false),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_Persist = false for consistency.

TmpDb = ?tempdb(),
Addr = config:get("chttpd", "bind_address", "127.0.0.1"),
Port = mochiweb_socket_server:get(chttpd, port),
Expand Down
1 change: 1 addition & 0 deletions src/couch/include/couch_db.hrl
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@
meta = []
}).

-define(LOG_UNEXPECTED_MSG(Msg), couch_log:warning("[~p:~p:~p/~p]{~p[~p]} Unexpected message: ~w", [?MODULE, ?LINE, ?FUNCTION_NAME, ?FUNCTION_ARITY, self(), element(2, process_info(self(), message_queue_len)), Msg])).

-record(user_ctx, {
name=null,
Expand Down
33 changes: 33 additions & 0 deletions src/couch/priv/stats_descriptions.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,10 @@
{type, counter},
{desc, <<"number of couch_server LRU operations skipped">>}
]}.
{[couchdb, couch_server, open], [
{type, counter},
{desc, <<"number of couch_server open operations invoked">>}
]}.
{[couchdb, query_server, vdu_rejects], [
{type, counter},
{desc, <<"number of rejections by validate_doc_update function">>}
Expand Down Expand Up @@ -410,10 +414,39 @@
{type, counter},
{desc, <<"number of other requests">>}
]}.
{[couchdb, query_server, js_filter], [
{type, counter},
{desc, <<"number of JS filter invocations">>}
]}.
{[couchdb, query_server, js_filtered_docs], [
{type, counter},
{desc, <<"number of docs filtered through JS invocations">>}
]}.
{[couchdb, query_server, js_filter_error], [
{type, counter},
{desc, <<"number of JS filter invocation errors">>}
]}.
{[couchdb, legacy_checksums], [
{type, counter},
{desc, <<"number of legacy checksums found in couch_file instances">>}
]}.
{[couchdb, btree, folds], [
{type, counter},
{desc, <<"number of couch btree kv fold callback invocations">>}
]}.
{[couchdb, btree, kp_node], [
{type, counter},
{desc, <<"number of couch btree kp_nodes read">>}
]}.
{[couchdb, btree, kv_node], [
{type, counter},
{desc, <<"number of couch btree kv_nodes read">>}
]}.
%% CSRT (couch_stats_resource_tracker) stats
{[couchdb, csrt, delta_missing_t0], [
{type, counter},
{desc, <<"number of csrt contexts without a proper startime">>}
]}.
{[pread, exceed_eof], [
{type, counter},
{desc, <<"number of the attempts to read beyond end of db file">>}
Expand Down
3 changes: 3 additions & 0 deletions src/couch/src/couch_btree.erl
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,8 @@ reduce_tree_size(kp_node, NodeSize, [{_K, {_P, _Red, Sz}} | NodeList]) ->

get_node(#btree{fd = Fd}, NodePos) ->
{ok, {NodeType, NodeList}} = couch_file:pread_term(Fd, NodePos),
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, btree, NodeType]),
{NodeType, NodeList}.

write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList) ->
Expand Down Expand Up @@ -1163,6 +1165,7 @@ stream_kv_node2(Bt, Reds, PrevKVs, [{K, V} | RestKVs], InRange, Dir, Fun, Acc) -
false ->
{stop, {PrevKVs, Reds}, Acc};
true ->
couch_stats:increment_counter([couchdb, btree, folds]),
AssembledKV = assemble(Bt, K, V),
case Fun(visit, AssembledKV, {PrevKVs, Reds}, Acc) of
{ok, Acc2} ->
Expand Down
2 changes: 2 additions & 0 deletions src/couch/src/couch_db.erl
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ open_doc(Db, IdOrDocInfo) ->
open_doc(Db, IdOrDocInfo, []).

open_doc(Db, Id, Options) ->
%% TODO: wire in csrt tracking
increment_stat(Db, [couchdb, database_reads]),
case open_doc_int(Db, Id, Options) of
{ok, #doc{deleted = true} = Doc} ->
Expand Down Expand Up @@ -1982,6 +1983,7 @@ increment_stat(#db{options = Options}, Stat, Count) when
->
case lists:member(sys_db, Options) of
true ->
%% TODO: we shouldn't leak resource usage just because it's a sys_db
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hrmm. How we can do it? Are you planing to store IS_SYSTEM_DB key on process dictionary or something?

Final solution to this question is out of scope for the first PR. TODO is good enough for first version of the feature.

ok;
false ->
couch_stats:increment_counter(Stat, Count)
Expand Down
8 changes: 8 additions & 0 deletions src/couch/src/couch_query_servers.erl
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,8 @@ filter_docs(Req, Db, DDoc, FName, Docs) ->
{ok, filter_docs_int(Db, DDoc, FName, JsonReq, JsonDocs)}
catch
throw:{os_process_error, {exit_status, 1}} ->
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, query_server, js_filter_error]),
%% batch used too much memory, retry sequentially.
Fun = fun(JsonDoc) ->
filter_docs_int(Db, DDoc, FName, JsonReq, [JsonDoc])
Expand All @@ -550,6 +552,12 @@ filter_docs(Req, Db, DDoc, FName, Docs) ->
end.

filter_docs_int(Db, DDoc, FName, JsonReq, JsonDocs) ->
%% Count usage in _int version as this can be repeated for OS error
%% Pros & cons... might not have actually processed `length(JsonDocs)` docs
%% but it certainly undercounts if we count in `filter_docs/5` above
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, query_server, js_filter]),
couch_stats:increment_counter([couchdb, query_server, js_filtered_docs], length(JsonDocs)),
[true, Passes] = ddoc_prompt(
Db,
DDoc,
Expand Down
2 changes: 2 additions & 0 deletions src/couch/src/couch_server.erl
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ sup_start_link(N) ->
gen_server:start_link({local, couch_server(N)}, couch_server, [N], []).

open(DbName, Options) ->
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, couch_server, open]),
try
validate_open_or_create(DbName, Options),
open_int(DbName, Options)
Expand Down
29 changes: 29 additions & 0 deletions src/couch/src/couch_util.erl
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
-export([verify_hash_names/2]).
-export([get_config_hash_algorithms/0]).
-export([remove_sensitive_data/1]).
-export([clear_pdict/0, clear_pdict/1]).

-include_lib("couch/include/couch_db.hrl").

Expand Down Expand Up @@ -870,3 +871,31 @@ remove_sensitive_data(KVList) ->
KVList1 = lists:keyreplace(<<"password">>, 1, KVList, {<<"password">>, <<"****">>}),
% some KVList entries are atoms, so test fo this too
lists:keyreplace(password, 1, KVList1, {password, <<"****">>}).

-spec clear_pdict() -> ok.
clear_pdict() ->
clear_pdict(erlang:get()).

%% Exclude mochiweb markers, otherwise just use erlang:erase/0
-spec clear_pdict(list()) -> ok.
clear_pdict([]) ->
ok;
clear_pdict([{mochiweb_request_body, _V} | Rest]) ->
clear_pdict(Rest);
clear_pdict([{mochiweb_request_body_length, _V} | Rest]) ->
clear_pdict(Rest);
clear_pdict([{mochiweb_request_cookie, _V} | Rest]) ->
clear_pdict(Rest);
clear_pdict([{mochiweb_request_force_close, _V} | Rest]) ->
clear_pdict(Rest);
clear_pdict([{mochiweb_request_path, _V} | Rest]) ->
clear_pdict(Rest);
clear_pdict([{mochiweb_request_post, _V} | Rest]) ->
clear_pdict(Rest);
clear_pdict([{mochiweb_request_qs, _V} | Rest]) ->
clear_pdict(Rest);
clear_pdict([{mochiweb_request_recv, _V} | Rest]) ->
clear_pdict(Rest);
clear_pdict([{Key, _V} | Rest]) ->
erlang:erase(Key),
clear_pdict(Rest).
8 changes: 6 additions & 2 deletions src/couch_stats/src/couch_stats.app.src
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@
{application, couch_stats, [
{description, "Simple statistics collection"},
{vsn, git},
{registered, [couch_stats_aggregator, couch_stats_process_tracker]},
{applications, [kernel, stdlib]},
{registered, [
couch_stats_aggregator,
couch_stats_process_tracker,
couch_stats_resource_tracker
]},
{applications, [kernel, stdlib, couch_log]},
{mod, {couch_stats_app, []}},
{env, []}
]}.
Loading