From be8427986dc67ec70b5916db186f2fb0dcdcacbf Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Mon, 18 Jul 2022 16:26:00 -0700
Subject: [PATCH 01/29] PoC couch_stats local resource usage tracking

---
 src/chttpd/src/chttpd_httpd_handlers.erl      |   1 +
 src/chttpd/src/chttpd_misc.erl                |  15 +
 src/couch/priv/stats_descriptions.cfg         |   5 +
 src/couch/src/couch_btree.erl                 |   3 +
 src/couch/src/couch_db.erl                    |   2 +
 src/couch/src/couch_query_servers.erl         |   8 +
 src/couch/src/couch_server.erl                |   2 +
 src/couch_stats/src/couch_stats.erl           |  30 +
 .../src/couch_stats_resource_tracker.erl      | 607 ++++++++++++++++++
 src/couch_stats/src/couch_stats_sup.erl       |   1 +
 src/fabric/priv/stats_descriptions.cfg        |  26 +
 src/fabric/src/fabric_rpc.erl                 |   6 +
 src/fabric/src/fabric_util.erl                |   9 +-
 src/mango/src/mango_cursor_view.erl           |   2 +
 src/mango/src/mango_selector.erl              |   1 +
 src/rexi/src/rexi.erl                         |   6 +-
 src/rexi/src/rexi_server.erl                  |   5 +-
 src/rexi/src/rexi_utils.erl                   |  21 +
 18 files changed, 745 insertions(+), 5 deletions(-)
 create mode 100644 src/couch_stats/src/couch_stats_resource_tracker.erl

diff --git a/src/chttpd/src/chttpd_httpd_handlers.erl b/src/chttpd/src/chttpd_httpd_handlers.erl
index 932b52e5f6e..e1b26022204 100644
--- a/src/chttpd/src/chttpd_httpd_handlers.erl
+++ b/src/chttpd/src/chttpd_httpd_handlers.erl
@@ -20,6 +20,7 @@ url_handler(<<"_utils">>) -> fun chttpd_misc:handle_utils_dir_req/1;
 url_handler(<<"_all_dbs">>) -> fun chttpd_misc:handle_all_dbs_req/1;
 url_handler(<<"_dbs_info">>) -> fun chttpd_misc:handle_dbs_info_req/1;
 url_handler(<<"_active_tasks">>) -> fun chttpd_misc:handle_task_status_req/1;
+url_handler(<<"_active_resources">>) -> fun chttpd_misc:handle_resource_status_req/1;
 url_handler(<<"_scheduler">>) -> fun couch_replicator_httpd:handle_scheduler_req/1;
 url_handler(<<"_node">>) -> fun chttpd_node:handle_node_req/1;
 url_handler(<<"_reload_query_servers">>) -> fun chttpd_misc:handle_reload_query_servers_req/1;
diff --git a/src/chttpd/src/chttpd_misc.erl b/src/chttpd/src/chttpd_misc.erl
index 9df3a881fae..195c8d8284d 100644
--- a/src/chttpd/src/chttpd_misc.erl
+++ b/src/chttpd/src/chttpd_misc.erl
@@ -20,6 +20,7 @@
     handle_replicate_req/1,
     handle_reload_query_servers_req/1,
     handle_task_status_req/1,
+    handle_resource_status_req/1,
     handle_up_req/1,
     handle_utils_dir_req/1,
     handle_utils_dir_req/2,
@@ -230,6 +231,20 @@ handle_task_status_req(#httpd{method = 'GET'} = Req) ->
 handle_task_status_req(Req) ->
     send_method_not_allowed(Req, "GET,HEAD").
 
+handle_resource_status_req(#httpd{method = 'GET'} = Req) ->
+    ok = chttpd:verify_is_server_admin(Req),
+    {Resp, Bad} = rpc:multicall(erlang, apply, [
+        fun() ->
+            {node(), couch_stats_resource_tracker:active()}
+        end,
+        []
+    ]),
+    %% TODO: incorporate Bad responses
+    io:format("ACTIVE RESP: ~p~nBAD RESP: ~p~n", [Resp, Bad]),
+    send_json(Req, {Resp});
+handle_resource_status_req(Req) ->
+    send_method_not_allowed(Req, "GET,HEAD").
+
 handle_replicate_req(#httpd{method = 'POST', user_ctx = Ctx, req_body = PostBody} = Req) ->
     chttpd:validate_ctype(Req, "application/json"),
     %% see HACK in chttpd.erl about replication
diff --git a/src/couch/priv/stats_descriptions.cfg b/src/couch/priv/stats_descriptions.cfg
index 2d40518e278..e3f6a585e5c 100644
--- a/src/couch/priv/stats_descriptions.cfg
+++ b/src/couch/priv/stats_descriptions.cfg
@@ -414,6 +414,11 @@
     {type, counter},
     {desc, <<"number of legacy checksums found in couch_file instances">>}
 ]}.
+%% CSRT (couch_stats_resource_tracker) stats
+{[couchdb, csrt, delta_missing_t0], [
+    {type, counter},
+    {desc, <<"number of csrt contexts without a proper start time">>}
+]}.
 {[pread, exceed_eof], [
     {type, counter},
     {desc, <<"number of the attempts to read beyond end of db file">>}
diff --git a/src/couch/src/couch_btree.erl b/src/couch/src/couch_btree.erl
index b974a22eeca..16817132017 100644
--- a/src/couch/src/couch_btree.erl
+++ b/src/couch/src/couch_btree.erl
@@ -472,6 +472,8 @@ reduce_tree_size(kp_node, NodeSize, [{_K, {_P, _Red, Sz}} | NodeList]) ->
 
 get_node(#btree{fd = Fd}, NodePos) ->
     {ok, {NodeType, NodeList}} = couch_file:pread_term(Fd, NodePos),
+    %% TODO: wire in csrt tracking
+    couch_stats:increment_counter([couchdb, btree, get_node]),
     {NodeType, NodeList}.
 
 write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList) ->
@@ -1163,6 +1165,7 @@ stream_kv_node2(Bt, Reds, PrevKVs, [{K, V} | RestKVs], InRange, Dir, Fun, Acc) ->
         false ->
             {stop, {PrevKVs, Reds}, Acc};
         true ->
+            couch_stats:increment_counter([couchdb, btree, changes_processed]),
             AssembledKV = assemble(Bt, K, V),
             case Fun(visit, AssembledKV, {PrevKVs, Reds}, Acc) of
                 {ok, Acc2} ->
diff --git a/src/couch/src/couch_db.erl b/src/couch/src/couch_db.erl
index 2ef89ced3a6..c7afaa4b39f 100644
--- a/src/couch/src/couch_db.erl
+++ b/src/couch/src/couch_db.erl
@@ -297,6 +297,7 @@ open_doc(Db, IdOrDocInfo) ->
     open_doc(Db, IdOrDocInfo, []).
 
 open_doc(Db, Id, Options) ->
+    %% TODO: wire in csrt tracking
     increment_stat(Db, [couchdb, database_reads]),
     case open_doc_int(Db, Id, Options) of
         {ok, #doc{deleted = true} = Doc} ->
@@ -1982,6 +1983,7 @@ increment_stat(#db{options = Options}, Stat, Count) when
 ->
     case lists:member(sys_db, Options) of
         true ->
+            %% TODO: we shouldn't leak resource usage just because it's a sys_db
             ok;
         false ->
             couch_stats:increment_counter(Stat, Count)
diff --git a/src/couch/src/couch_query_servers.erl b/src/couch/src/couch_query_servers.erl
index 151bdc805ee..3d759594f79 100644
--- a/src/couch/src/couch_query_servers.erl
+++ b/src/couch/src/couch_query_servers.erl
@@ -542,6 +542,8 @@ filter_docs(Req, Db, DDoc, FName, Docs) ->
         {ok, filter_docs_int(Db, DDoc, FName, JsonReq, JsonDocs)}
     catch
         throw:{os_process_error, {exit_status, 1}} ->
+            %% TODO: wire in csrt tracking
+            couch_stats:increment_counter([couchdb, query_server, js_filter_error]),
            %% batch used too much memory, retry sequentially.
            Fun = fun(JsonDoc) ->
                filter_docs_int(Db, DDoc, FName, JsonReq, [JsonDoc])
@@ -550,6 +552,12 @@ filter_docs(Req, Db, DDoc, FName, Docs) ->
     end.
 
 filter_docs_int(Db, DDoc, FName, JsonReq, JsonDocs) ->
+    %% Count usage in the _int version as this can be retried on OS process errors.
+    %% Pros & cons... we might not have actually processed `length(JsonDocs)` docs,
+    %% but it certainly undercounts if we count in `filter_docs/5` above.
+    %% TODO: wire in csrt tracking
+    couch_stats:increment_counter([couchdb, query_server, js_filter]),
+    couch_stats:increment_counter([couchdb, query_server, js_filtered_docs], length(JsonDocs)),
     [true, Passes] = ddoc_prompt(
         Db,
         DDoc,
diff --git a/src/couch/src/couch_server.erl b/src/couch/src/couch_server.erl
index 7dbbe4af11c..c6c244ad411 100644
--- a/src/couch/src/couch_server.erl
+++ b/src/couch/src/couch_server.erl
@@ -97,6 +97,8 @@ sup_start_link(N) ->
     gen_server:start_link({local, couch_server(N)}, couch_server, [N], []).
 open(DbName, Options) ->
+    %% TODO: wire in csrt tracking
+    couch_stats:increment_counter([couchdb, couch_server, open]),
     try
         validate_open_or_create(DbName, Options),
         open_int(DbName, Options)
diff --git a/src/couch_stats/src/couch_stats.erl b/src/couch_stats/src/couch_stats.erl
index 29a4024491f..29190e6b003 100644
--- a/src/couch_stats/src/couch_stats.erl
+++ b/src/couch_stats/src/couch_stats.erl
@@ -24,6 +24,12 @@
     update_gauge/2
 ]).
 
+%% couch_stats_resource_tracker API
+-export([
+    create_context/3,
+    maybe_track_rexi_init_p/1
+]).
+
 -type response() :: ok | {error, unknown_metric} | {error, invalid_metric}.
 -type stat() :: {any(), [{atom(), any()}]}.
 
@@ -49,6 +55,11 @@ increment_counter(Name) ->
 
 -spec increment_counter(any(), pos_integer()) -> response().
 increment_counter(Name, Value) ->
+    %% Should maybe_track_local happen before or after notify?
+    %% If after, only currently tracked metrics declared in the app's
+    %% stats_descriptions.cfg will be trackable locally. Pros/cons.
+    %io:format("NOTIFY_EXISTING_METRIC: ~p || ~p || ~p~n", [Name, Op, Type]),
+    ok = maybe_track_local_counter(Name, Value),
     case couch_stats_util:get_counter(Name, stats()) of
         {ok, Ctx} -> couch_stats_counter:increment(Ctx, Value);
         {error, Error} -> {error, Error}
     end.
@@ -100,6 +111,25 @@ stats() ->
 now_sec() ->
     erlang:monotonic_time(second).
 
+%% Only potentially track positive increments to counters
+-spec maybe_track_local_counter(any(), any()) -> ok.
+maybe_track_local_counter(Name, Val) when is_integer(Val) andalso Val > 0 ->
+    %%io:format("maybe_track_local[~p]: ~p~n", [Val, Name]),
+    couch_stats_resource_tracker:maybe_inc(Name, Val),
+    ok;
+maybe_track_local_counter(_, _) ->
+    ok.
+
+create_context(From, MFA, Nonce) ->
+    couch_stats_resource_tracker:create_context(From, MFA, Nonce).
+
+maybe_track_rexi_init_p({M, F, _A}) ->
+    Metric = [M, F, spawned],
+    case couch_stats_resource_tracker:should_track(Metric) of
+        true -> increment_counter(Metric);
+        false -> ok
+    end.
+
 -ifdef(TEST).
 -include_lib("couch/include/couch_eunit.hrl").
diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl
new file mode 100644
index 00000000000..00e375e8ce9
--- /dev/null
+++ b/src/couch_stats/src/couch_stats_resource_tracker.erl
@@ -0,0 +1,607 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(couch_stats_resource_tracker).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3,
+    terminate/2
+]).
+
+-export([
+    inc/1, inc/2,
+    maybe_inc/2,
+    get_pid_ref/0,
+    accumulate_delta/1
+]).
+
+-export([
+    create_context/0, create_context/1, create_context/3,
+    track/1,
+    should_track/1
+]).
+
+-export([
+    active/0
+]).
+
+-export([
+    make_delta/0
+]).
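+
+%% Illustrative lifecycle sketch (comment only, not part of the API): a
+%% rexi worker creates a context, bumps counters as it does work, then
+%% ships a usage delta back to its coordinator alongside the reply, e.g.:
+%%
+%%   couch_stats_resource_tracker:create_context(From, {M, F, length(A)}, Nonce),
+%%   couch_stats_resource_tracker:doc_read(),
+%%   Delta = couch_stats_resource_tracker:make_delta(),
+%%   erlang:send(Caller, {Ref, Reply, {delta, Delta}}).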
+
+%% Singular increment operations
+-export([
+    db_opened/0,
+    doc_read/0,
+    row_read/0,
+    change_processed/0,
+    ioq_called/0,
+    js_evaled/0,
+    js_filtered/0,
+    js_filtered_error/0,
+    js_filtered_doc/0,
+    mango_match_evaled/0,
+    get_kv_node/0,
+    get_kp_node/0
+]).
+
+%% Plural increment operations
+-export([
+    js_filtered_docs/1,
+    io_bytes_read/1,
+    io_bytes_written/1
+]).
+
+-include_lib("couch/include/couch_db.hrl").
+
+%% Use these for record upgrades over the wire and in ETS tables
+%% TODO: alternatively, just delete these. Currently using a map
+%% for shipping deltas over the wire, avoiding much of the
+%% problem here. We'll likely still need to handle upgrades to
+%% map format over time, so let's decide a course of action here.
+-define(RCTX_V1, rctx_v1).
+-define(RCTX, ?RCTX_V1).
+
+-define(MANGO_EVAL_MATCH, mango_eval_match).
+-define(DB_OPEN_DOC, docs_read).
+
+-define(FRPC_CHANGES_ROW, changes_processed).
+
+%% Module pdict markers
+-define(DELTA_TA, csrt_delta_ta).
+-define(DELTA_TZ, csrt_delta_tz). %% T Zed instead of T0
+-define(PID_REF, csrt_pid_ref). %% track local ID
+
+
+-record(st, {
+    eviction_delay = 10000, %% How many ms dead processes are visible
+    tracking = #{} %% track active processes for eventual eviction
+}).
+
+
+%% TODO: switch to:
+%% -record(?RCTX, {
+-record(rctx, {
+    updated_at = os:timestamp(),
+    exited_at,
+    pid_ref,
+    mfa,
+    nonce,
+    from,
+    type = unknown, %% unknown/background/system/rpc/coordinator/fabric_rpc/etc_rpc/etc
+    db_open = 0,
+    docs_read = 0,
+    rows_read = 0,
+    changes_processed = 0,
+    ioq_calls = 0,
+    io_bytes_read = 0,
+    io_bytes_written = 0,
+    js_evals = 0,
+    js_filter = 0,
+    js_filter_error = 0,
+    js_filtered_docs = 0,
+    mango_eval_match = 0,
+    get_kv_node = 0,
+    get_kp_node = 0,
+    state = alive
+}).
+
+db_opened() -> inc(db_open).
+doc_read() -> inc(docs_read).
+row_read() -> inc(rows_read).
+change_processed() -> inc(changes_processed).
+ioq_called() -> inc(ioq_calls).
+js_evaled() -> inc(js_evals).
+js_filtered() -> inc(js_filter).
+js_filtered_error() -> inc(js_filter_error).
+js_filtered_doc() -> inc(js_filtered_docs).
+mango_match_evaled() -> inc(mango_eval_match).
+get_kv_node() -> inc(get_kv_node).
+get_kp_node() -> inc(get_kp_node).
+
+js_filtered_docs(N) -> inc(js_filtered_docs, N).
+io_bytes_read(N) -> inc(io_bytes_read, N).
+io_bytes_written(N) -> inc(io_bytes_written, N).
+
+inc(db_open) ->
+    inc(db_open, 1);
+inc(docs_read) ->
+    inc(docs_read, 1);
+inc(rows_read) ->
+    inc(rows_read, 1);
+inc(changes_processed) ->
+    inc(changes_processed, 1);
+inc(ioq_calls) ->
+    inc(ioq_calls, 1);
+inc(io_bytes_read) ->
+    inc(io_bytes_read, 1);
+inc(io_bytes_written) ->
+    inc(io_bytes_written, 1);
+inc(js_evals) ->
+    inc(js_evals, 1);
+inc(js_filter) ->
+    inc(js_filter, 1);
+inc(js_filter_error) ->
+    inc(js_filter_error, 1);
+inc(js_filtered_docs) ->
+    inc(js_filtered_docs, 1);
+inc(?MANGO_EVAL_MATCH) ->
+    inc(?MANGO_EVAL_MATCH, 1);
+inc(get_kv_node) ->
+    inc(get_kv_node, 1);
+inc(get_kp_node) ->
+    inc(get_kp_node, 1);
+inc(_) ->
+    0.
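+
+%% Each unary inc/1 clause above delegates to the matching inc/2 clause,
+%% which maps the metric onto its #rctx field and bumps it atomically in
+%% ETS via update_counter/3 below; roughly equivalent to this sketch:
+%%
+%%   ets:update_counter(?MODULE, get_pid_ref(), {#rctx.docs_read, 1},
+%%       #rctx{pid_ref = get_pid_ref()})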
+
+
+inc(db_open, N) ->
+    update_counter(#rctx.db_open, N);
+inc(rows_read, N) ->
+    update_counter(#rctx.rows_read, N);
+inc(ioq_calls, N) ->
+    update_counter(#rctx.ioq_calls, N);
+inc(io_bytes_read, N) ->
+    update_counter(#rctx.io_bytes_read, N);
+inc(io_bytes_written, N) ->
+    update_counter(#rctx.io_bytes_written, N);
+inc(js_evals, N) ->
+    update_counter(#rctx.js_evals, N);
+inc(js_filter, N) ->
+    update_counter(#rctx.js_filter, N);
+inc(js_filter_error, N) ->
+    update_counter(#rctx.js_filter_error, N);
+inc(js_filtered_docs, N) ->
+    update_counter(#rctx.js_filtered_docs, N);
+inc(?MANGO_EVAL_MATCH, N) ->
+    update_counter(#rctx.?MANGO_EVAL_MATCH, N);
+inc(?DB_OPEN_DOC, N) ->
+    update_counter(#rctx.?DB_OPEN_DOC, N);
+inc(?FRPC_CHANGES_ROW, N) ->
+    update_counter(#rctx.?FRPC_CHANGES_ROW, N);
+inc(get_kv_node, N) ->
+    update_counter(#rctx.get_kv_node, N);
+inc(get_kp_node, N) ->
+    update_counter(#rctx.get_kp_node, N);
+inc(_, _) ->
+    0.
+
+maybe_inc([mango, evaluate_selector], Val) ->
+    inc(?MANGO_EVAL_MATCH, Val);
+maybe_inc([couchdb, database_reads], Val) ->
+    inc(?DB_OPEN_DOC, Val);
+maybe_inc([fabric_rpc, changes, rows_read], Val) ->
+    inc(?FRPC_CHANGES_ROW, Val);
+maybe_inc(_, _) ->
+    0.
+
+
+%% TODO: update stats_descriptions.cfg for relevant apps
+should_track([fabric_rpc, all_docs, spawned]) ->
+    true;
+should_track([fabric_rpc, changes, spawned]) ->
+    true;
+should_track([fabric_rpc, map_view, spawned]) ->
+    true;
+should_track([fabric_rpc, reduce_view, spawned]) ->
+    true;
+should_track([fabric_rpc, get_all_security, spawned]) ->
+    true;
+should_track([fabric_rpc, open_doc, spawned]) ->
+    true;
+should_track([fabric_rpc, update_docs, spawned]) ->
+    true;
+should_track([fabric_rpc, open_shard, spawned]) ->
+    true;
+should_track([mango_cursor, view, all_docs]) ->
+    true;
+should_track([mango_cursor, view, idx]) ->
+    true;
+should_track(_) ->
+    false.
+
+%% TODO: update coordinator stats from worker deltas
+accumulate_delta(_Delta) ->
+    ok.
+
+update_counter(Field, Count) ->
+    update_counter(get_pid_ref(), Field, Count).
+
+
+update_counter({_Pid,_Ref}=Key, Field, Count) ->
+    ets:update_counter(?MODULE, Key, {Field, Count}, #rctx{pid_ref=Key}).
+
+
+active() ->
+    lists:map(fun to_json/1, ets:tab2list(?MODULE)).
+
+
+to_json(#rctx{}=Rctx) ->
+    #rctx{
+        updated_at = TP,
+        pid_ref = {_Pid, _Ref} = PidRef,
+        mfa = MFA0,
+        nonce = Nonce0,
+        from = From0,
+        docs_read = DocsRead,
+        rows_read = RowsRead,
+        state = State0,
+        type = Type,
+        changes_processed = ChangesProcessed
+    } = Rctx,
+    %%io:format("TO_JSON_MFA: ~p~n", [MFA0]),
+    MFA = case MFA0 of
+        {M, F, A} ->
+            [M, F, A];
+        undefined ->
+            null;
+        Other ->
+            throw({error, {unexpected, Other}})
+    end,
+    From = case From0 of
+        {Parent, ParentRef} ->
+            [pid_to_list(Parent), ref_to_list(ParentRef)];
+        undefined ->
+            null
+    end,
+    State = case State0 of
+        alive ->
+            alive;
+        {down, Reason} when is_atom(Reason) ->
+            [down, Reason];
+        Unknown ->
+            [unknown, io_lib:format("~w", [Unknown])]
+    end,
+    Nonce = case Nonce0 of
+        undefined ->
+            null;
+        Nonce0 ->
+            list_to_binary(Nonce0)
+    end,
+    #{
+        updated_at => term_to_json(TP),
+        %%pid_ref => [pid_to_list(Pid), ref_to_list(Ref)],
+        pid_ref => term_to_json(PidRef),
+        mfa => term_to_json(MFA),
+        nonce => term_to_json(Nonce),
+        %%from => From,
+        from => term_to_json(From),
+        docs_read => DocsRead,
+        rows_read => RowsRead,
+        state => State,
+        type => term_to_json(Type),
+        changes_processed => ChangesProcessed
+    }.
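+
+%% For illustration only (values are made up), a single active() entry
+%% might render along these lines:
+%%   #{pid_ref => [<<"<0.312.0>">>, <<"#Ref<0.0.0.1>">>],
+%%     mfa => [fabric_rpc, open_doc, 3], docs_read => 1, rows_read => 0,
+%%     state => alive, changes_processed => 0, ...}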
+
+term_to_json({Pid, Ref}) when is_pid(Pid), is_reference(Ref) ->
+    [?l2b(pid_to_list(Pid)), ?l2b(ref_to_list(Ref))];
+term_to_json({A, B, C}) ->
+    [A, B, C];
+term_to_json(undefined) ->
+    null;
+term_to_json(T) ->
+    T.
+
+term_to_flat_json(Tuple) when is_tuple(Tuple) ->
+    ?l2b(io_lib:format("~w", [Tuple]));
+term_to_flat_json(undefined) ->
+    null;
+term_to_flat_json(T) ->
+    T.
+
+to_flat_json(#rctx{}=Rctx) ->
+    #rctx{
+        updated_at = TP,
+        pid_ref = {_Pid, _Ref} = PidRef,
+        mfa = MFA0,
+        nonce = Nonce0,
+        from = From0,
+        docs_read = DocsRead,
+        rows_read = RowsRead,
+        state = State0,
+        type = Type,
+        changes_processed = ChangesProcessed
+    } = Rctx,
+    io:format("TO_JSON_MFA: ~p~n", [MFA0]),
+    MFA = case MFA0 of
+        {_M, _F, _A} ->
+            ?l2b(io_lib:format("~w", [MFA0]));
+        undefined ->
+            null;
+        Other ->
+            throw({error, {unexpected, Other}})
+    end,
+    From = case From0 of
+        {_Parent, _ParentRef} ->
+            ?l2b(io_lib:format("~w", [From0]));
+        undefined ->
+            null
+    end,
+    State = case State0 of
+        alive ->
+            alive;
+        State0 ->
+            ?l2b(io_lib:format("~w", [State0]))
+    end,
+    Nonce = case Nonce0 of
+        undefined ->
+            null;
+        Nonce0 ->
+            list_to_binary(Nonce0)
+    end,
+    #{
+        %%updated_at => ?l2b(io_lib:format("~w", [TP])),
+        updated_at => term_to_flat_json(TP),
+        %%pid_ref => [pid_to_list(Pid), ref_to_list(Ref)],
+        pid_ref => ?l2b(io_lib:format("~w", [PidRef])),
+        mfa => MFA,
+        nonce => Nonce,
+        from => From,
+        docs_read => DocsRead,
+        rows_read => RowsRead,
+        state => State,
+        type => term_to_flat_json(Type),
+        changes_processed => ChangesProcessed
+    }.
+
+get_pid_ref() ->
+    case get(?PID_REF) of
+        undefined ->
+            Ref = make_ref(),
+            set_pid_ref({self(), Ref});
+        PidRef ->
+            PidRef
+    end.
+
+
+create_context() ->
+    create_context(self()).
+
+
+create_context(Pid) ->
+    Ref = make_ref(),
+    Rctx = make_record(Pid, Ref),
+    track(Rctx),
+    ets:insert(?MODULE, Rctx),
+    Rctx.
+
+%% add type to distinguish coordinator vs rpc_worker
+create_context(From, {M,F,_A} = MFA, Nonce) ->
+    io:format("CREAT_CONTEXT MFA[~p]: {~p}: ~p~n", [From, MFA, Nonce]),
+    Ref = make_ref(),
+    %%Rctx = make_record(self(), Ref),
+    %% TODO: extract user_ctx and db/shard from
+    Rctx = #rctx{
+        pid_ref = {self(), Ref},
+        from = From,
+        mfa = MFA,
+        type = {worker, M, F},
+        nonce = Nonce
+    },
+    track(Rctx),
+    erlang:put(?DELTA_TZ, Rctx),
+    ets:insert(?MODULE, Rctx),
+    Rctx.
+
+track(#rctx{}=Rctx) ->
+    %% TODO: should this block or not? If no, what cleans up zombies?
+    %% gen_server:call(?MODULE, {track, PR}).
+    gen_server:cast(?MODULE, {track, Rctx}).
+
+
+make_delta() ->
+    TA = case get(?DELTA_TA) of
+        undefined ->
+            %% Need to handle this better, can't just make a new T0 at T' as
+            %% the timestamps will be identical causing a divide by zero error.
+            %%
+            %% Realistically need to ensure that all invocations of database
+            %% operations set T0 appropriately. Perhaps it's possible to do
+            %% this in the couch_db:open chain, and then similarly, in
+            %% couch_server, and uhhhh... couch_file, and...
+            %%
+            %% I think we need some type of approach for establishing a T0 that
+            %% doesn't result in outrageous deltas. For now zero out the
+            %% microseconds field, or subtract a second on the off chance that
+            %% microseconds is zero. I'm not up to date on the latest Erlang
+            %% time libraries and don't remember how to easily get an
+            %% `os:timestamp()` out of now() - 100ms or some such.
+            %%
+            %% I think it's unavoidable that we'll have some codepaths that do
+            %% not properly instantiate the T0 at spawn resulting in needing to
+            %% do some type of "time warp" or ignoring the timing collection
+            %% entirely. Perhaps if we hoisted out the stats collection into
+            %% the primary flow of the database and funnel that through all the
+            %% function clauses we could then utilize Dialyzer to statically
+            %% analyze and assert all code paths that invoke database
+            %% operations have properly instantiated a T0 at the appropriate
+            %% start time such that we don't have to "fudge" deltas with a
+            %% missing start point, but we're a long ways from that happening
+            %% so I feel it necessary to address the NULL start time.
+
+            %% Track how often we fail to initiate T0 correctly
+            %% Perhaps somewhat naughty we're incrementing stats from within
+            %% couch_stats itself? Might need to handle this differently
+            %% TODO: determine appropriate course of action here
+            couch_stats:increment_counter(
+                [couchdb, csrt, delta_missing_t0]),
+                %%[couch_stats_resource_tracker, delta_missing_t0]),
+
+            case erlang:get(?DELTA_TZ) of
+                undefined ->
+                    TA0 = make_delta_base(),
+                    %% TODO: handling missing deltas, otherwise divide by zero
+                    set_delta_a(TA0),
+                    TA0;
+                TA0 ->
+                    TA0
+            end;
+        %%?RCTX{} = TA0 ->
+        #rctx{} = TA0 ->
+            TA0
+    end,
+    TB = get_resource(),
+    make_delta(TA, TB).
+
+
+make_delta(#rctx{}=TA, #rctx{}=TB) ->
+    Delta = #{
+        docs_read => TB#rctx.docs_read - TA#rctx.docs_read,
+        rows_read => TB#rctx.rows_read - TA#rctx.rows_read,
+        changes_processed => TB#rctx.changes_processed - TA#rctx.changes_processed,
+        dt => timer:now_diff(TB#rctx.updated_at, TA#rctx.updated_at)
+    },
+    %% TODO: reevaluate this decision
+    %% Only return non zero (and also positive) delta fields
+    maps:filter(fun(_K,V) -> V > 0 end, Delta);
+make_delta(_, #rctx{}) ->
+    #{error => missing_beg_rctx};
+make_delta(#rctx{}, _) ->
+    #{error => missing_fin_rctx}.
+
+make_delta_base() ->
+    Ref = make_ref(),
+    %%Rctx = make_record(self(), Ref),
+    %% TODO: extract user_ctx and db/shard from request
+    TA0 = #rctx{
+        pid_ref = {self(), Ref}
+    },
+    case TA0#rctx.updated_at of
+        {Me, S, Mi} when Mi > 0 ->
+            TA0#rctx{updated_at = {Me, S, 0}};
+        {Me, S, Mi} when S > 0 ->
+            TA0#rctx{updated_at = {Me, S - 1, Mi}}
+    end.
+
+set_delta_a(TA) ->
+    erlang:put(?DELTA_TA, TA).
+
+set_pid_ref(PidRef) ->
+    erlang:put(?PID_REF, PidRef),
+    PidRef.
+
+get_resource() ->
+    get_resource(get_pid_ref()).
+
+get_resource(PidRef) ->
+    case ets:lookup(?MODULE, PidRef) of
+        [#rctx{}=TP] ->
+            TP;
+        [] ->
+            undefined
+    end.
+
+make_record(Pid, Ref) ->
+    #rctx{pid_ref = {Pid, Ref}}.
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+init([]) ->
+    ets:new(?MODULE, [
+        named_table,
+        public,
+        {decentralized_counters, true}, %% TODO: test impact of this
+        {write_concurrency, true},
+        {read_concurrency, true},
+        {keypos, #rctx.pid_ref}
+    ]),
+    {ok, #st{}}.
+
+handle_call(fetch, _from, #st{} = St) ->
+    {reply, {ok, St}, St};
+handle_call({track, _}, _From, St) ->
+    {reply, ok, St};
+handle_call(Msg, _From, St) ->
+    {stop, {unknown_call, Msg}, error, St}.
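+
+%% Note on the bookkeeping below: #st.tracking is deliberately double
+%% keyed, holding both PidRef => MonitorRef and MonitorRef => PidRef, so
+%% that a duplicate {track, Rctx} cast and a 'DOWN' message each resolve
+%% with a single maps lookup.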
+
+handle_cast({track, #rctx{pid_ref={Pid,_}=PidRef}}, #st{tracking=AT0} = St0) ->
+    St = case maps:is_key(PidRef, AT0) of
+        true -> %% noop, we're already tracking this PidRef
+            St0;
+        false -> %% setup new monitor and double bookkeep refs
+            Mon = erlang:monitor(process, Pid),
+            AT = maps:put(Mon, PidRef, maps:put(PidRef, Mon, AT0)),
+            St0#st{tracking=AT}
+    end,
+    {noreply, St};
+handle_cast(Msg, St) ->
+    {stop, {unknown_cast, Msg}, St}.
+
+handle_info({'DOWN', MonRef, Type, DPid, Reason}, #st{tracking=AT0} = St0) ->
+    io:format("CSRT:HI(~p)~n", [{'DOWN', MonRef, Type, DPid, Reason}]),
+    St = case maps:get(MonRef, AT0, undefined) of
+        undefined ->
+            io:format("ERROR: UNEXPECTED MISSING MONITOR IN TRACKING TABLE: {~p, ~p}~n", [MonRef, DPid]),
+            St0;
+        {RPid, _Ref} = PidRef ->
+            if
+                RPid =:= DPid -> ok;
+                true -> erlang:halt(io_lib:format("CSRT:HI PID MISMATCH ABORT: ~p =/= ~p~n", [DPid, RPid]))
+            end,
+            %% remove double bookkeeping
+            AT = maps:remove(MonRef, maps:remove(PidRef, AT0)),
+            %% TODO: Assert Pid matches Object
+            %% update process state in live table
+            %% TODO: decide whether we want the true match to crash this process on failure
+            true = ets:update_element(?MODULE, PidRef,
+                [{#rctx.state, {down, Reason}}, {#rctx.updated_at, os:timestamp()}]),
+            log_process_lifetime_report(PidRef),
+            %% Delay eviction to allow human visibility on short lived pids
+            erlang:send_after(St0#st.eviction_delay, self(), {evict, PidRef}),
+            St0#st{tracking=AT}
+    end,
+    {noreply, St};
+handle_info({evict, {_Pid, _Ref}=PidRef}, #st{}=St) ->
+    ets:delete(?MODULE, PidRef),
+    {noreply, St};
+handle_info(Msg, St) ->
+    {stop, {unknown_info, Msg}, St}.
+
+terminate(_Reason, _St) ->
+    ok.
+
+code_change(_OldVsn, St, _Extra) ->
+    {ok, St}.
+
+log_process_lifetime_report(PidRef) ->
+    %% More safely assert this can't ever be undefined
+    #rctx{} = Rctx = get_resource(PidRef),
+    %% TODO: catch error out of here, report crashes on depth>1 json
+    couch_log:report("csrt-pid-usage-lifetime", to_flat_json(Rctx)).
diff --git a/src/couch_stats/src/couch_stats_sup.erl b/src/couch_stats/src/couch_stats_sup.erl
index 325372c3e4b..4b4df17e26a 100644
--- a/src/couch_stats/src/couch_stats_sup.erl
+++ b/src/couch_stats/src/couch_stats_sup.erl
@@ -29,6 +29,7 @@ init([]) ->
         {
             {one_for_one, 5, 10}, [
                 ?CHILD(couch_stats_server, worker),
+                ?CHILD(couch_stats_resource_tracker, worker),
                 ?CHILD(couch_stats_process_tracker, worker)
             ]
         }}.
diff --git a/src/fabric/priv/stats_descriptions.cfg b/src/fabric/priv/stats_descriptions.cfg
index d12aa0c8480..57ec0b8d204 100644
--- a/src/fabric/priv/stats_descriptions.cfg
+++ b/src/fabric/priv/stats_descriptions.cfg
@@ -26,3 +26,29 @@
     {type, counter},
     {desc, <<"number of write quorum errors">>}
 ]}.
+
+
+%% fabric_rpc worker stats
+%% TODO: decide on which naming scheme:
+%% {[fabric_rpc, get_all_security, spawned], [
+%% {[fabric_rpc, spawned, get_all_security], [
+{[fabric_rpc, get_all_security, spawned], [
+    {type, counter},
+    {desc, <<"number of fabric_rpc worker get_all_security spawns">>}
+]}.
+{[fabric_rpc, open_doc, spawned], [
+    {type, counter},
+    {desc, <<"number of fabric_rpc worker open_doc spawns">>}
+]}.
+{[fabric_rpc, all_docs, spawned], [
+    {type, counter},
+    {desc, <<"number of fabric_rpc worker all_docs spawns">>}
+]}.
+{[fabric_rpc, open_shard, spawned], [
+    {type, counter},
+    {desc, <<"number of fabric_rpc worker open_shard spawns">>}
+]}.
+{[fabric_rpc, changes, spawned], [
+    {type, counter},
+    {desc, <<"number of fabric_rpc worker changes spawns">>}
+]}.
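+%% These spawned counters pair with couch_stats_resource_tracker:should_track/1;
+%% rexi_server:init_p/3 calls couch_stats:maybe_track_rexi_init_p/1, which only
+%% bumps [M, F, spawned] when should_track/1 opts that metric in, e.g.
+%% {fabric_rpc, open_doc, 3} increments [fabric_rpc, open_doc, spawned].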
diff --git a/src/fabric/src/fabric_rpc.erl b/src/fabric/src/fabric_rpc.erl
index d01f1f5a749..b5064b5af2c 100644
--- a/src/fabric/src/fabric_rpc.erl
+++ b/src/fabric/src/fabric_rpc.erl
@@ -492,6 +492,11 @@ view_cb({meta, Meta}, Acc) ->
     ok = rexi:stream2({meta, Meta}),
     {ok, Acc};
 view_cb({row, Row}, Acc) ->
+    %% TODO: distinguish between rows and docs
+    %% TODO: wire in csrt tracking
+    %% TODO: distinguish between all_docs vs view call
+    couch_stats:increment_counter([fabric_rpc, view, row_processed]),
+    %%couch_stats_resource_tracker:inc(rows_read),
     % Adding another row
     ViewRow = #view_row{
         id = couch_util:get_value(id, Row),
@@ -535,6 +540,7 @@ changes_enumerator(#full_doc_info{} = FDI, Acc) ->
 changes_enumerator(#doc_info{id = <<"_local/", _/binary>>, high_seq = Seq}, Acc) ->
     {ok, Acc#fabric_changes_acc{seq = Seq, pending = Acc#fabric_changes_acc.pending - 1}};
 changes_enumerator(DocInfo, Acc) ->
+    couch_stats:increment_counter([fabric_rpc, changes, rows_read]),
     #fabric_changes_acc{
         db = Db,
         args = #changes_args{
diff --git a/src/fabric/src/fabric_util.erl b/src/fabric/src/fabric_util.erl
index 4acb65c739a..00d6b5e0e1c 100644
--- a/src/fabric/src/fabric_util.erl
+++ b/src/fabric/src/fabric_util.erl
@@ -137,7 +137,9 @@ get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor) ->
     Ref = rexi:cast(Node, self(), MFA, [sync]),
     try
         receive
-            {Ref, {ok, Db}} ->
+            {Ref, {ok, Db}, {delta, Delta}} ->
+                io:format("[~p]GET SHARD GOT DELTA: ~p~n", [self(), Delta]),
+                couch_stats_resource_tracker:accumulate_delta(Delta),
                 {ok, Db};
             {Ref, {'rexi_EXIT', {{unauthorized, _} = Error, _}}} ->
                 throw(Error);
@@ -145,7 +147,10 @@ get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor) ->
                 throw(Error);
             {Ref, Reason} ->
                 couch_log:debug("Failed to open shard ~p because: ~p", [Name, Reason]),
-                get_shard(Rest, Opts, Timeout, Factor)
+                get_shard(Rest, Opts, Timeout, Factor);
+            Other ->
+                io:format("GOT UNEXPECTED MESSAGE FORMAT: ~p~n", [Other]),
+                erlang:error(Other)
         after Timeout ->
             couch_log:debug("Failed to open shard ~p after: ~p", [Name, Timeout]),
             get_shard(Rest, Opts, Factor * Timeout, Factor)
diff --git a/src/mango/src/mango_cursor_view.erl b/src/mango/src/mango_cursor_view.erl
index 41006ce7794..a19d650b401 100644
--- a/src/mango/src/mango_cursor_view.erl
+++ b/src/mango/src/mango_cursor_view.erl
@@ -229,9 +229,11 @@ execute(#cursor{db = Db, index = Idx, execution_stats = Stats} = Cursor0, UserFu
     Result =
         case mango_idx:def(Idx) of
             all_docs ->
+                couch_stats:increment_counter([mango_cursor, view, all_docs]),
                 CB = fun ?MODULE:handle_all_docs_message/2,
                 fabric:all_docs(Db, DbOpts, CB, Cursor, Args);
             _ ->
+                couch_stats:increment_counter([mango_cursor, view, idx]),
                 CB = fun ?MODULE:handle_message/2,
                 % Normal view
                 DDoc = ddocid(Idx),
diff --git a/src/mango/src/mango_selector.erl b/src/mango/src/mango_selector.erl
index 59be7a6ebaf..bde297a15ab 100644
--- a/src/mango/src/mango_selector.erl
+++ b/src/mango/src/mango_selector.erl
@@ -50,6 +50,7 @@ normalize(Selector) ->
 % This assumes that the Selector has been normalized.
 % Returns true or false.
 match(Selector, D) ->
+    %% TODO: wire in csrt tracking
     couch_stats:increment_counter([mango, evaluate_selector]),
     match_int(Selector, D).
 
diff --git a/src/rexi/src/rexi.erl b/src/rexi/src/rexi.erl
index 77830996e4b..67287108975 100644
--- a/src/rexi/src/rexi.erl
+++ b/src/rexi/src/rexi.erl
@@ -129,7 +129,8 @@ async_server_call(Server, Caller, Request) ->
 -spec reply(any()) -> any().
 reply(Reply) ->
     {Caller, Ref} = get(rexi_from),
-    erlang:send(Caller, {Ref, Reply}).
+    Delta = couch_stats_resource_tracker:make_delta(),
+    erlang:send(Caller, {Ref, Reply, {delta, Delta}}).
 
 %% @equiv sync_reply(Reply, 300000)
 sync_reply(Reply) ->
@@ -214,7 +215,8 @@ stream(Msg, Limit, Timeout) ->
         {ok, Count} ->
             put(rexi_unacked, Count + 1),
             {Caller, Ref} = get(rexi_from),
-            erlang:send(Caller, {Ref, self(), Msg}),
+            Delta = couch_stats_resource_tracker:make_delta(),
+            erlang:send(Caller, {Ref, self(), Msg, {delta, Delta}}),
             ok
     catch
         throw:timeout ->
diff --git a/src/rexi/src/rexi_server.erl b/src/rexi/src/rexi_server.erl
index 52489a9c5ef..08a9139bd79 100644
--- a/src/rexi/src/rexi_server.erl
+++ b/src/rexi/src/rexi_server.erl
@@ -136,9 +136,12 @@ init_p(From, MFA) ->
     string() | undefined
 ) -> any().
 init_p(From, {M, F, A}, Nonce) ->
+    MFA = {M, F, length(A)},
     put(rexi_from, From),
-    put('$initial_call', {M, F, length(A)}),
+    put('$initial_call', MFA),
     put(nonce, Nonce),
+    couch_stats:create_context(From, MFA, Nonce),
+    couch_stats:maybe_track_rexi_init_p(MFA),
     try
         apply(M, F, A)
     catch
diff --git a/src/rexi/src/rexi_utils.erl b/src/rexi/src/rexi_utils.erl
index d59c5ea0f1d..4f4ca5576c3 100644
--- a/src/rexi/src/rexi_utils.erl
+++ b/src/rexi/src/rexi_utils.erl
@@ -83,7 +83,27 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) ->
             end;
         {rexi, '$rexi_ping'} ->
             {ok, Acc0};
+        {Ref, Msg, {delta, Delta}} ->
+            io:format("[~p]GOT DELTA: ~p -- ~p~n", [self(), Delta, Msg]),
+            couch_stats_resource_tracker:accumulate_delta(Delta),
+            case lists:keyfind(Ref, Keypos, RefList) of
+                false ->
+                    % this was some non-matching message which we will ignore
+                    {ok, Acc0};
+                Worker ->
+                    Fun(Msg, Worker, Acc0)
+            end;
+        {Ref, From, Msg, {delta, Delta}} ->
+            io:format("[~p]GOT DELTA: ~p -- ~p~n", [self(), Delta, Msg]),
+            couch_stats_resource_tracker:accumulate_delta(Delta),
+            case lists:keyfind(Ref, Keypos, RefList) of
+                false ->
+                    {ok, Acc0};
+                Worker ->
+                    Fun(Msg, {Worker, From}, Acc0)
+            end;
         {Ref, Msg} ->
+            %%io:format("GOT NON DELTA MSG: ~p~n", [Msg]),
             case lists:keyfind(Ref, Keypos, RefList) of
                 false ->
                     % this was some non-matching message which we will ignore
@@ -92,6 +112,7 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) ->
                     Fun(Msg, Worker, Acc0)
             end;
         {Ref, From, Msg} ->
+            %%io:format("GOT NON DELTA MSG: ~p~n", [Msg]),
             case lists:keyfind(Ref, Keypos, RefList) of
                 false ->
                     {ok, Acc0};

From e5e482088b65c82e4f0b898dd7a23d6db18284a8 Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Mon, 23 Oct 2023 15:32:03 -0700
Subject: [PATCH 02/29] Instrument more stats and rudimentary delta
 accumulation

---
 src/couch/priv/stats_descriptions.cfg         |  28 ++++
 src/couch/src/couch_btree.erl                 |   4 +-
 .../src/couch_stats_resource_tracker.erl      | 121 ++++++++++++------
 src/fabric/priv/stats_descriptions.cfg        |   4 +
 src/fabric/src/fabric_rpc.erl                 |   4 +-
 5 files changed, 115 insertions(+), 46 deletions(-)

diff --git a/src/couch/priv/stats_descriptions.cfg b/src/couch/priv/stats_descriptions.cfg
index e3f6a585e5c..02740e4d8f8 100644
--- a/src/couch/priv/stats_descriptions.cfg
+++ b/src/couch/priv/stats_descriptions.cfg
@@ -298,6 +298,10 @@
     {type, counter},
     {desc, <<"number of couch_server LRU operations skipped">>}
 ]}.
+{[couchdb, couch_server, open], [
+    {type, counter},
+    {desc, <<"number of couch_server open operations invoked">>}
+]}.
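+%% (Bumped from couch_server:open/2 and mirrored into the per-process
+%% tracker as its db_open counter via couch_stats_resource_tracker:maybe_inc/2.)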
 {[couchdb, query_server, vdu_rejects], [
     {type, counter},
     {desc, <<"number of rejections by validate_doc_update function">>}
@@ -410,10 +414,34 @@
     {type, counter},
     {desc, <<"number of other requests">>}
 ]}.
+{[couchdb, query_server, js_filter], [
+    {type, counter},
+    {desc, <<"number of JS filter invocations">>}
+]}.
+{[couchdb, query_server, js_filtered_docs], [
+    {type, counter},
+    {desc, <<"number of docs filtered through JS invocations">>}
+]}.
+{[couchdb, query_server, js_filter_error], [
+    {type, counter},
+    {desc, <<"number of JS filter invocation errors">>}
+]}.
 {[couchdb, legacy_checksums], [
     {type, counter},
     {desc, <<"number of legacy checksums found in couch_file instances">>}
 ]}.
+{[couchdb, btree, folds], [
+    {type, counter},
+    {desc, <<"number of couch btree kv fold callback invocations">>}
+]}.
+{[couchdb, btree, kp_node], [
+    {type, counter},
+    {desc, <<"number of couch btree kp_nodes read">>}
+]}.
+{[couchdb, btree, kv_node], [
+    {type, counter},
+    {desc, <<"number of couch btree kv_nodes read">>}
+]}.
 %% CSRT (couch_stats_resource_tracker) stats
 {[couchdb, csrt, delta_missing_t0], [
     {type, counter},
diff --git a/src/couch/src/couch_btree.erl b/src/couch/src/couch_btree.erl
index 16817132017..27b5bc18b2e 100644
--- a/src/couch/src/couch_btree.erl
+++ b/src/couch/src/couch_btree.erl
@@ -473,7 +473,7 @@ reduce_tree_size(kp_node, NodeSize, [{_K, {_P, _Red, Sz}} | NodeList]) ->
 get_node(#btree{fd = Fd}, NodePos) ->
     {ok, {NodeType, NodeList}} = couch_file:pread_term(Fd, NodePos),
     %% TODO: wire in csrt tracking
-    couch_stats:increment_counter([couchdb, btree, get_node]),
+    couch_stats:increment_counter([couchdb, btree, NodeType]),
     {NodeType, NodeList}.
 
 write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList) ->
@@ -1165,7 +1165,7 @@ stream_kv_node2(Bt, Reds, PrevKVs, [{K, V} | RestKVs], InRange, Dir, Fun, Acc) ->
         false ->
             {stop, {PrevKVs, Reds}, Acc};
         true ->
-            couch_stats:increment_counter([couchdb, btree, changes_processed]),
+            couch_stats:increment_counter([couchdb, btree, folds]),
             AssembledKV = assemble(Bt, K, V),
             case Fun(visit, AssembledKV, {PrevKVs, Reds}, Acc) of
                 {ok, Acc2} ->
diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl
index 00e375e8ce9..8e9feeca1b0 100644
--- a/src/couch_stats/src/couch_stats_resource_tracker.erl
+++ b/src/couch_stats/src/couch_stats_resource_tracker.erl
@@ -50,7 +50,7 @@
     db_opened/0,
     doc_read/0,
     row_read/0,
-    change_processed/0,
+    btree_fold/0,
     ioq_called/0,
     js_evaled/0,
     js_filtered/0,
@@ -80,7 +80,18 @@
 
 -define(MANGO_EVAL_MATCH, mango_eval_match).
 -define(DB_OPEN_DOC, docs_read).
-
+-define(DB_OPEN, db_open).
+-define(COUCH_SERVER_OPEN, db_open).
+-define(COUCH_BT_FOLDS, btree_folds).
+-define(COUCH_BT_GET_KP_NODE, get_kp_node).
+-define(COUCH_BT_GET_KV_NODE, get_kv_node).
+-define(COUCH_JS_FILTER, js_filter).
+-define(COUCH_JS_FILTER_ERROR, js_filter_error).
+-define(COUCH_JS_FILTERED_DOCS, js_filtered_docs).
+-define(ROWS_READ, rows_read).
+
+%% TODO: overlap between this and couch btree fold invocations
+%% TODO: need some way to distinguish folds on views vs find vs all_docs
 -define(FRPC_CHANGES_ROW, changes_processed).
 
 %% Module pdict markers
@@ -108,6 +119,7 @@
     db_open = 0,
     docs_read = 0,
     rows_read = 0,
+    btree_folds = 0,
     changes_processed = 0,
     ioq_calls = 0,
     io_bytes_read = 0,
@@ -117,15 +129,17 @@
     js_filter_error = 0,
     js_filtered_docs = 0,
     mango_eval_match = 0,
+    %% TODO: switch record definitions to be macro based, eg:
+    %% ?COUCH_BT_GET_KP_NODE = 0,
     get_kv_node = 0,
-    get_kp_node = 0,
-    state = alive
+    get_kp_node = 0,
+    state = alive
 }).
 
-db_opened() -> inc(db_open).
+db_opened() -> inc(?DB_OPEN).
 doc_read() -> inc(docs_read).
 row_read() -> inc(rows_read).
-change_processed() -> inc(changes_processed).
+btree_fold() -> inc(?COUCH_BT_FOLDS).
 ioq_called() -> inc(ioq_calls).
 js_evaled() -> inc(js_evals).
@@ -139,14 +153,14 @@
 js_filtered_docs(N) -> inc(js_filtered_docs, N).
 io_bytes_read(N) -> inc(io_bytes_read, N).
 io_bytes_written(N) -> inc(io_bytes_written, N).
 
-inc(db_open) ->
-    inc(db_open, 1);
+inc(?DB_OPEN) ->
+    inc(?DB_OPEN, 1);
 inc(docs_read) ->
     inc(docs_read, 1);
-inc(rows_read) ->
-    inc(rows_read, 1);
-inc(changes_processed) ->
-    inc(changes_processed, 1);
+inc(?ROWS_READ) ->
+    inc(?ROWS_READ, 1);
+inc(?COUCH_BT_FOLDS) ->
+    inc(?COUCH_BT_FOLDS, 1);
 inc(ioq_calls) ->
     inc(ioq_calls, 1);
 inc(io_bytes_read) ->
@@ -155,26 +169,26 @@
 inc(io_bytes_written) ->
     inc(io_bytes_written, 1);
 inc(js_evals) ->
     inc(js_evals, 1);
-inc(js_filter) ->
-    inc(js_filter, 1);
-inc(js_filter_error) ->
-    inc(js_filter_error, 1);
-inc(js_filtered_docs) ->
-    inc(js_filtered_docs, 1);
+inc(?COUCH_JS_FILTER) ->
+    inc(?COUCH_JS_FILTER, 1);
+inc(?COUCH_JS_FILTER_ERROR) ->
+    inc(?COUCH_JS_FILTER_ERROR, 1);
+inc(?COUCH_JS_FILTERED_DOCS) ->
+    inc(?COUCH_JS_FILTERED_DOCS, 1);
 inc(?MANGO_EVAL_MATCH) ->
     inc(?MANGO_EVAL_MATCH, 1);
-inc(get_kv_node) ->
-    inc(get_kv_node, 1);
-inc(get_kp_node) ->
-    inc(get_kp_node, 1);
+inc(?COUCH_BT_GET_KP_NODE) ->
+    inc(?COUCH_BT_GET_KP_NODE, 1);
+inc(?COUCH_BT_GET_KV_NODE) ->
+    inc(?COUCH_BT_GET_KV_NODE, 1);
 inc(_) ->
     0.
 
 
-inc(db_open, N) ->
-    update_counter(#rctx.db_open, N);
-inc(rows_read, N) ->
-    update_counter(#rctx.rows_read, N);
+inc(?DB_OPEN, N) ->
+    update_counter(#rctx.?DB_OPEN, N);
+inc(?ROWS_READ, N) ->
+    update_counter(#rctx.?ROWS_READ, N);
 inc(ioq_calls, N) ->
     update_counter(#rctx.ioq_calls, N);
 inc(io_bytes_read, N) ->
@@ -183,22 +197,22 @@
 inc(io_bytes_written, N) ->
     update_counter(#rctx.io_bytes_written, N);
 inc(js_evals, N) ->
     update_counter(#rctx.js_evals, N);
-inc(js_filter, N) ->
-    update_counter(#rctx.js_filter, N);
-inc(js_filter_error, N) ->
-    update_counter(#rctx.js_filter_error, N);
-inc(js_filtered_docs, N) ->
-    update_counter(#rctx.js_filtered_docs, N);
+inc(?COUCH_JS_FILTER, N) ->
+    update_counter(#rctx.?COUCH_JS_FILTER, N);
+inc(?COUCH_JS_FILTER_ERROR, N) ->
+    update_counter(#rctx.?COUCH_JS_FILTER_ERROR, N);
+inc(?COUCH_JS_FILTERED_DOCS, N) ->
+    update_counter(#rctx.?COUCH_JS_FILTERED_DOCS, N);
 inc(?MANGO_EVAL_MATCH, N) ->
     update_counter(#rctx.?MANGO_EVAL_MATCH, N);
 inc(?DB_OPEN_DOC, N) ->
     update_counter(#rctx.?DB_OPEN_DOC, N);
 inc(?FRPC_CHANGES_ROW, N) ->
     update_counter(#rctx.?FRPC_CHANGES_ROW, N);
-inc(get_kv_node, N) ->
-    update_counter(#rctx.get_kv_node, N);
-inc(get_kp_node, N) ->
-    update_counter(#rctx.get_kp_node, N);
+inc(?COUCH_BT_GET_KP_NODE, N) ->
+    update_counter(#rctx.?COUCH_BT_GET_KP_NODE, N);
+inc(?COUCH_BT_GET_KV_NODE, N) ->
+    update_counter(#rctx.?COUCH_BT_GET_KV_NODE, N);
 inc(_, _) ->
     0.
@@ -206,9 +220,26 @@
 maybe_inc([mango, evaluate_selector], Val) ->
     inc(?MANGO_EVAL_MATCH, Val);
 maybe_inc([couchdb, database_reads], Val) ->
     inc(?DB_OPEN_DOC, Val);
-maybe_inc([fabric_rpc, changes, rows_read], Val) ->
+maybe_inc([fabric_rpc, changes, processed], Val) ->
     inc(?FRPC_CHANGES_ROW, Val);
-maybe_inc(_, _) ->
+maybe_inc([fabric_rpc, view, rows_read], Val) ->
+    inc(?ROWS_READ, Val);
+maybe_inc([couchdb, couch_server, open], Val) ->
+    inc(?DB_OPEN, Val);
+maybe_inc([couchdb, btree, folds], Val) ->
+    inc(?COUCH_BT_FOLDS, Val);
+maybe_inc([couchdb, btree, kp_node], Val) ->
+    inc(?COUCH_BT_GET_KP_NODE, Val);
+maybe_inc([couchdb, btree, kv_node], Val) ->
+    inc(?COUCH_BT_GET_KV_NODE, Val);
+maybe_inc([couchdb, query_server, js_filter_error], Val) ->
+    inc(?COUCH_JS_FILTER_ERROR, Val);
+maybe_inc([couchdb, query_server, js_filter], Val) ->
+    inc(?COUCH_JS_FILTER, Val);
+maybe_inc([couchdb, query_server, js_filtered_docs], Val) ->
+    inc(?COUCH_JS_FILTERED_DOCS, Val);
+maybe_inc(Metric, Val) ->
+    io:format("SKIPPING MAYBE_INC METRIC[~p]: ~p~n", [Val, Metric]),
     0.
 
 
@@ -233,12 +264,16 @@
 should_track([mango_cursor, view, all_docs]) ->
     true;
 should_track([mango_cursor, view, idx]) ->
     true;
-should_track(_) ->
+should_track(_Metric) ->
+    %%io:format("SKIPPING METRIC: ~p~n", [Metric]),
     false.
 
 %% TODO: update coordinator stats from worker deltas
-accumulate_delta(_Delta) ->
-    ok.
+accumulate_delta(Delta) ->
+    io:format("Accumulating delta: ~p~n", [Delta]),
+    %% TODO: switch to creating a batch of updates to invoke a single
+    %% update_counter rather than sequentially invoking it for each field
+    maps:foreach(fun inc/2, Delta).
 
 update_counter(Field, Count) ->
     update_counter(get_pid_ref(), Field, Count).
@@ -263,6 +298,7 @@
         rows_read = RowsRead,
         state = State0,
         type = Type,
+        btree_folds = BtFolds,
         changes_processed = ChangesProcessed
     } = Rctx,
     %%io:format("TO_JSON_MFA: ~p~n", [MFA0]),
@@ -306,6 +342,7 @@
         rows_read => RowsRead,
         state => State,
         type => term_to_json(Type),
+        btree_folds => BtFolds,
         changes_processed => ChangesProcessed
     }.
 
@@ -336,7 +373,7 @@
         rows_read = RowsRead,
         state = State0,
         type = Type,
-        changes_processed = ChangesProcessed
+        btree_folds = ChangesProcessed
     } = Rctx,
     io:format("TO_JSON_MFA: ~p~n", [MFA0]),
     MFA = case MFA0 of
@@ -377,7 +414,7 @@
         rows_read => RowsRead,
         state => State,
         type => term_to_flat_json(Type),
-        changes_processed => ChangesProcessed
+        btree_folds => ChangesProcessed
     }.
 
 get_pid_ref() ->
@@ -484,7 +521,7 @@
 make_delta(#rctx{}=TA, #rctx{}=TB) ->
     Delta = #{
         docs_read => TB#rctx.docs_read - TA#rctx.docs_read,
         rows_read => TB#rctx.rows_read - TA#rctx.rows_read,
-        changes_processed => TB#rctx.changes_processed - TA#rctx.changes_processed,
+        btree_folds => TB#rctx.btree_folds - TA#rctx.btree_folds,
         dt => timer:now_diff(TB#rctx.updated_at, TA#rctx.updated_at)
     },
     %% TODO: reevaluate this decision
diff --git a/src/fabric/priv/stats_descriptions.cfg b/src/fabric/priv/stats_descriptions.cfg
index 57ec0b8d204..7a062cca0ec 100644
--- a/src/fabric/priv/stats_descriptions.cfg
+++ b/src/fabric/priv/stats_descriptions.cfg
@@ -52,3 +52,7 @@
     {type, counter},
     {desc, <<"number of fabric_rpc worker changes spawns">>}
 ]}.
+{[fabric_rpc, view, rows_read], [
+    {type, counter},
+    {desc, <<"number of fabric_rpc view_cb row invocations">>}
+]}.
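+%% (Counted per row emitted by fabric_rpc:view_cb/2 below and mapped onto
+%% the tracker's rows_read field by couch_stats_resource_tracker:maybe_inc/2.)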
diff --git a/src/fabric/src/fabric_rpc.erl b/src/fabric/src/fabric_rpc.erl
index b5064b5af2c..49f7d2c4fe5 100644
--- a/src/fabric/src/fabric_rpc.erl
+++ b/src/fabric/src/fabric_rpc.erl
@@ -495,7 +495,7 @@ view_cb({row, Row}, Acc) ->
     %% TODO: distinguish between rows and docs
     %% TODO: wire in csrt tracking
     %% TODO: distinguish between all_docs vs view call
-    couch_stats:increment_counter([fabric_rpc, view, row_processed]),
+    couch_stats:increment_counter([fabric_rpc, view, rows_read]),
     %%couch_stats_resource_tracker:inc(rows_read),
     % Adding another row
     ViewRow = #view_row{
@@ -540,7 +540,7 @@ changes_enumerator(#full_doc_info{} = FDI, Acc) ->
 changes_enumerator(#doc_info{id = <<"_local/", _/binary>>, high_seq = Seq}, Acc) ->
     {ok, Acc#fabric_changes_acc{seq = Seq, pending = Acc#fabric_changes_acc.pending - 1}};
 changes_enumerator(DocInfo, Acc) ->
-    couch_stats:increment_counter([fabric_rpc, changes, rows_read]),
+    couch_stats:increment_counter([fabric_rpc, changes, processed]),
     #fabric_changes_acc{
         db = Db,
         args = #changes_args{

From ecda8f3270223223fe9ad8d6e80c1e95bad08657 Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Mon, 30 Oct 2023 15:35:45 -0700
Subject: [PATCH 03/29] Add more usage tracking and core functionality

---
 src/chttpd/src/chttpd.erl                     |   4 +
 src/chttpd/src/chttpd_db.erl                  |   1 +
 .../src/couch_stats_resource_tracker.erl      | 150 ++++++++++++------
 src/fabric/src/fabric_rpc.erl                 |   9 ++
 src/rexi/src/rexi.erl                         |   9 +-
 src/rexi/src/rexi_utils.erl                   |   4 +
 6 files changed, 158 insertions(+), 19 deletions(-)

diff --git a/src/chttpd/src/chttpd.erl b/src/chttpd/src/chttpd.erl
index ab8e1e9a379..fdd7855f0e9 100644
--- a/src/chttpd/src/chttpd.erl
+++ b/src/chttpd/src/chttpd.erl
@@ -323,6 +323,8 @@ handle_request_int(MochiReq) ->
     % Save client socket so that it can be monitored for disconnects
     chttpd_util:mochiweb_client_req_set(MochiReq),
 
+    %% This is probably better in before_request, but having Path is nice
+    couch_stats_resource_tracker:create_coordinator_context(HttpReq0, Path),
     {HttpReq2, Response} =
         case before_request(HttpReq0) of
             {ok, HttpReq1} ->
@@ -353,6 +355,8 @@ handle_request_int(MochiReq) ->
 
 before_request(HttpReq) ->
     try
+        %% TODO: re-enable this here once we have Path
+        %% couch_stats_resource_tracker:create_coordinator_context(HttpReq),
         chttpd_stats:init(),
         chttpd_plugin:before_request(HttpReq)
    catch
diff --git a/src/chttpd/src/chttpd_db.erl b/src/chttpd/src/chttpd_db.erl
index 9b1aff54f24..7538c452196 100644
--- a/src/chttpd/src/chttpd_db.erl
+++ b/src/chttpd/src/chttpd_db.erl
@@ -83,6 +83,7 @@
 % Database request handlers
 handle_request(#httpd{path_parts = [DbName | RestParts], method = Method} = Req) ->
+    couch_stats_resource_tracker:set_context_dbname(DbName),
     case {Method, RestParts} of
         {'PUT', []} ->
             create_db_req(Req, DbName);
diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl
index 8e9feeca1b0..b74eb5a10d2 100644
--- a/src/couch_stats/src/couch_stats_resource_tracker.erl
+++ b/src/couch_stats/src/couch_stats_resource_tracker.erl
@@ -33,12 +33,17 @@
 
 -export([
     create_context/0, create_context/1, create_context/3,
+    create_coordinator_context/1, create_coordinator_context/2,
+    set_context_dbname/1,
+    set_context_username/1,
     track/1,
     should_track/1
 ]).
 
 -export([
-    active/0
+    active/0,
+    active_coordinators/0,
+    active_workers/0
 ]).
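+%% active/0 keeps returning every tracked context; active_coordinators/0 and
+%% active_workers/0 narrow the listing by the #rctx.type tuple tag via the
+%% select_by_type/1 match specs added below.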
 -export([
     make_delta/0
 ]).
@@ -73,6 +75,10 @@
     io_bytes_written/1
 ]).
 
+-export([
+    field/2
+]).
+
 -include_lib("couch/include/couch_db.hrl").
 
@@ -93,6 +104,7 @@
 %% TODO: overlap between this and couch btree fold invocations
 %% TODO: need some way to distinguish folds on views vs find vs all_docs
 -define(FRPC_CHANGES_ROW, changes_processed).
+-define(FRPC_CHANGES_RETURNED, changes_returned).
+%%-define(FRPC_CHANGES_ROW, ?ROWS_READ).
 
 %% Module pdict markers
@@ -109,6 +121,7 @@
 %% TODO: switch to:
 %% -record(?RCTX, {
 -record(rctx, {
+    %% Metadata
     updated_at = os:timestamp(),
     exited_at,
     pid_ref,
     mfa,
     nonce,
     from,
     type = unknown, %% unknown/background/system/rpc/coordinator/fabric_rpc/etc_rpc/etc
+    state = alive,
+    dbname,
+    username,
+
+    %% Stats counters
     db_open = 0,
     docs_read = 0,
     rows_read = 0,
     btree_folds = 0,
     changes_processed = 0,
+    changes_returned = 0,
     ioq_calls = 0,
     io_bytes_read = 0,
     io_bytes_written = 0,
     js_evals = 0,
     js_filter = 0,
     js_filter_error = 0,
     js_filtered_docs = 0,
     mango_eval_match = 0,
     %% TODO: switch record definitions to be macro based, eg:
     %% ?COUCH_BT_GET_KP_NODE = 0,
     get_kv_node = 0,
-    get_kp_node = 0,
-    state = alive
+    get_kp_node = 0
 }).
 
 db_opened() -> inc(?DB_OPEN).
 doc_read() -> inc(docs_read).
 row_read() -> inc(rows_read).
 btree_fold() -> inc(?COUCH_BT_FOLDS).
@@ -170,6 +178,8 @@
 inc(docs_read) ->
     inc(docs_read, 1);
 inc(?ROWS_READ) ->
     inc(?ROWS_READ, 1);
+inc(?FRPC_CHANGES_RETURNED) ->
+    inc(?FRPC_CHANGES_RETURNED, 1);
 inc(?COUCH_BT_FOLDS) ->
     inc(?COUCH_BT_FOLDS, 1);
 inc(ioq_calls) ->
     inc(ioq_calls, 1);
@@ -200,6 +210,8 @@
 inc(?DB_OPEN, N) ->
     update_counter(#rctx.?DB_OPEN, N);
 inc(?ROWS_READ, N) ->
     update_counter(#rctx.?ROWS_READ, N);
+inc(?FRPC_CHANGES_RETURNED, N) ->
+    update_counter(#rctx.?FRPC_CHANGES_RETURNED, N);
 inc(ioq_calls, N) ->
     update_counter(#rctx.ioq_calls, N);
 inc(io_bytes_read, N) ->
@@ -219,7 +233,7 @@
 inc(?DB_OPEN_DOC, N) ->
     update_counter(#rctx.?DB_OPEN_DOC, N);
 inc(?FRPC_CHANGES_ROW, N) ->
-    update_counter(#rctx.?FRPC_CHANGES_ROW, N);
+    update_counter(#rctx.?ROWS_READ, N); %% TODO: rework double use of rows_read
 inc(?COUCH_BT_GET_KP_NODE, N) ->
     update_counter(#rctx.?COUCH_BT_GET_KP_NODE, N);
 inc(?COUCH_BT_GET_KV_NODE, N) ->
@@ -238,6 +245,8 @@
 maybe_inc([couchdb, database_reads], Val) ->
     inc(?DB_OPEN_DOC, Val);
 maybe_inc([fabric_rpc, changes, processed], Val) ->
     inc(?FRPC_CHANGES_ROW, Val);
+maybe_inc([fabric_rpc, changes, returned], Val) ->
+    inc(?FRPC_CHANGES_RETURNED, Val);
 maybe_inc([fabric_rpc, view, rows_read], Val) ->
     inc(?ROWS_READ, Val);
 maybe_inc([couchdb, couch_server, open], Val) ->
@@ -249,8 +258,8 @@
 maybe_inc([couchdb, query_server, js_filter], Val) ->
     inc(?COUCH_JS_FILTER, Val);
 maybe_inc([couchdb, query_server, js_filtered_docs], Val) ->
     inc(?COUCH_JS_FILTERED_DOCS, Val);
-maybe_inc(Metric, Val) ->
-    io:format("SKIPPING MAYBE_INC METRIC[~p]: ~p~n", [Val, Metric]),
+maybe_inc(_Metric, _Val) ->
+    %%io:format("SKIPPING MAYBE_INC METRIC[~p]: ~p~n", [Val, Metric]),
     0.
 
 
@@ -261,6 +270,8 @@
 should_track([fabric_rpc, changes, spawned]) ->
     true;
+should_track([fabric_rpc, changes, processed]) ->
+    true;
+should_track([fabric_rpc, changes, returned]) ->
+    true;
 should_track([fabric_rpc, map_view, spawned]) ->
     true;
 should_track([fabric_rpc, reduce_view, spawned]) ->
     true;
@@ -283,7 +296,12 @@
 
 %% TODO: update coordinator stats from worker deltas
 accumulate_delta(Delta) ->
-    io:format("Accumulating delta: ~p~n", [Delta]),
+    case get_resource() of
+        #rctx{type={coordinator,_,_}} = Rctx ->
+            io:format("Accumulating delta: ~w || ~w~n", [Delta, Rctx]);
+        _ ->
+            ok
+    end,
     %% TODO: switch to creating a batch of updates to invoke a single
     %% update_counter rather than sequentially invoking it for each field
     maps:foreach(fun inc/2, Delta).
 
 update_counter(Field, Count) ->
     update_counter(get_pid_ref(), Field, Count).
 
 
-update_counter({_Pid,_Ref}=Key, Field, Count) ->
-    ets:update_counter(?MODULE, Key, {Field, Count}, #rctx{pid_ref=Key}).
+update_counter({_Pid,_Ref}=PidRef, Field, Count) ->
+    ets:update_counter(?MODULE, PidRef, {Field, Count}, #rctx{pid_ref=PidRef}).
 
 
-active() ->
+active() -> active_int(all).
+active_coordinators() -> active_int(coordinators).
+active_workers() -> active_int(workers).
+
+
+active_int(coordinators) ->
+    select_by_type(coordinators);
+active_int(workers) ->
+    select_by_type(workers);
+active_int(all) ->
     lists:map(fun to_json/1, ets:tab2list(?MODULE)).
+
+
+select_by_type(coordinators) ->
+    ets:select(couch_stats_resource_tracker,
+        [{#rctx{type = {coordinator,'_','_'}, _ = '_'}, [], ['$_']}]);
+select_by_type(workers) ->
+    ets:select(couch_stats_resource_tracker,
+        [{#rctx{type = {worker,'_','_'}, _ = '_'}, [], ['$_']}]);
+select_by_type(all) ->
     lists:map(fun to_json/1, ets:tab2list(?MODULE)).
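+
+%% The match specs above are what ets:fun2ms/1 would produce; an equivalent
+%% formulation (sketch, requires -include_lib("stdlib/include/ms_transform.hrl")):
+%%
+%%   ets:select(?MODULE, ets:fun2ms(
+%%       fun(#rctx{type = {coordinator, _, _}} = R) -> R end))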
@@ -294,11 +326,17 @@ to_json(#rctx{}=Rctx) ->
     #rctx{
         updated_at = TP,
         pid_ref = {_Pid, _Ref} = PidRef,
         mfa = MFA0,
         nonce = Nonce0,
         from = From0,
+        dbname = DbName,
+        username = UserName,
+        db_open = DbOpens,
         docs_read = DocsRead,
         rows_read = RowsRead,
         state = State0,
         type = Type,
         btree_folds = BtFolds,
+        get_kp_node = KpNodes,
+        get_kv_node = KvNodes,
+        ioq_calls = IoqCalls,
         changes_processed = ChangesProcessed
     } = Rctx,
     %%io:format("TO_JSON_MFA: ~p~n", [MFA0]),
@@ -338,27 +376,43 @@
         nonce => term_to_json(Nonce),
         %%from => From,
         from => term_to_json(From),
+        dbname => DbName,
+        username => UserName,
+        db_open => DbOpens,
         docs_read => DocsRead,
         rows_read => RowsRead,
         state => State,
-        type => term_to_json(Type),
+        type => term_to_json({type, Type}),
         btree_folds => BtFolds,
+        kp_nodes => KpNodes,
+        kv_nodes => KvNodes,
+        ioq_calls => IoqCalls,
         changes_processed => ChangesProcessed
     }.
 
 term_to_json({Pid, Ref}) when is_pid(Pid), is_reference(Ref) ->
     [?l2b(pid_to_list(Pid)), ?l2b(ref_to_list(Ref))];
+term_to_json({type, {coordinator, _, _} = Type}) ->
+    %%io:format("SETTING JSON TYPE: ~p~n", [Type]),
+    ?l2b(io_lib:format("~p", [Type]));
 term_to_json({A, B, C}) ->
     [A, B, C];
 term_to_json(undefined) ->
     null;
+term_to_json(null) ->
+    null;
 term_to_json(T) ->
     T.
 
+term_to_flat_json({type, {coordinator, _, _} = Type}) ->
+    %%io:format("SETTING FLAT JSON TYPE: ~p~n", [Type]),
+    ?l2b(io_lib:format("~p", [Type]));
 term_to_flat_json(Tuple) when is_tuple(Tuple) ->
     ?l2b(io_lib:format("~w", [Tuple]));
 term_to_flat_json(undefined) ->
     null;
+term_to_flat_json(null) ->
+    null;
 term_to_flat_json(T) ->
     T.
 
@@ -369,11 +423,17 @@ to_flat_json(#rctx{}=Rctx) ->
         mfa = MFA0,
         nonce = Nonce0,
         from = From0,
+        dbname = DbName,
+        username = UserName,
+        db_open = DbOpens,
         docs_read = DocsRead,
         rows_read = RowsRead,
         state = State0,
         type = Type,
+        get_kp_node = KpNodes,
+        get_kv_node = KvNodes,
         btree_folds = ChangesProcessed,
+        ioq_calls = IoqCalls
     } = Rctx,
     io:format("TO_JSON_MFA: ~p~n", [MFA0]),
     MFA = case MFA0 of
@@ -402,6 +462,7 @@
         Nonce0 ->
             list_to_binary(Nonce0)
     end,
+    io:format("NONCE IS: ~p||~p~n", [Nonce0, Nonce]),
     #{
         %%updated_at => ?l2b(io_lib:format("~w", [TP])),
         updated_at => term_to_flat_json(TP),
         %%pid_ref => [pid_to_list(Pid), ref_to_list(Ref)],
         pid_ref => ?l2b(io_lib:format("~w", [PidRef])),
         mfa => MFA,
         nonce => Nonce,
         from => From,
+        dbname => DbName,
+        username => UserName,
+        db_open => DbOpens,
         docs_read => DocsRead,
         rows_read => RowsRead,
         state => State,
-        type => term_to_flat_json(Type),
-        btree_folds => ChangesProcessed
+        type => term_to_flat_json({type, Type}),
+        kp_nodes => KpNodes,
+        kv_nodes => KvNodes,
+        btree_folds => ChangesProcessed,
+        ioq_calls => IoqCalls
     }.
 
 get_pid_ref() ->
@@ -440,12 +507,12 @@ create_context(Pid) ->
 
 %% add type to distinguish coordinator vs rpc_worker
 create_context(From, {M,F,_A} = MFA, Nonce) ->
-    io:format("CREAT_CONTEXT MFA[~p]: {~p}: ~p~n", [From, MFA, Nonce]),
-    Ref = make_ref(),
+    io:format("[~p] CREAT_CONTEXT MFA[~p]: {~p}: ~p~n", [self(), From, MFA, Nonce]),
+    PidRef = get_pid_ref(), %% this will instantiate a new PidRef
     %%Rctx = make_record(self(), Ref),
     %% TODO: extract user_ctx and db/shard from
     Rctx = #rctx{
-        pid_ref = {self(), Ref},
+        pid_ref = PidRef,
         from = From,
         mfa = MFA,
         type = {worker, M, F},
         nonce = Nonce
     },
     track(Rctx),
     erlang:put(?DELTA_TZ, Rctx),
-    ets:insert(?MODULE, Rctx),
+    true = ets:insert(?MODULE, Rctx),
     Rctx.
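+
+%% Worker contexts originate in rexi_server:init_p/3, which calls
+%% couch_stats:create_context(From, MFA, Nonce) before apply/3; coordinator
+%% contexts are created below on the HTTP path via chttpd.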
+create_coordinator_context(#httpd{path_parts=Parts} = Req) ->
+    create_coordinator_context(Req, io_lib:format("~p", [Parts])).
+
+create_coordinator_context(#httpd{} = Req, Path) ->
+    io:format("CREATING COORDINATOR CONTEXT ON {~p}~n", [Path]),
+    #httpd{
+        method = Verb,
+        %%path_parts = Parts,
+        nonce = Nonce
+    } = Req,
+    PidRef = get_pid_ref(), %% this will instantiate a new PidRef
+    %%Rctx = make_record(self(), Ref),
+    %% TODO: extract user_ctx and db/shard from Req
+    Rctx = #rctx{
+        pid_ref = PidRef,
+        %%type = {coordinator, Verb, Parts},
+        type = {coordinator, Verb, [$/ | Path]},
+        nonce = Nonce
+    },
+    track(Rctx),
+    erlang:put(?DELTA_TZ, Rctx),
+    true = ets:insert(?MODULE, Rctx),
+    Rctx.
+
+set_context_dbname(DbName) ->
+    case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.dbname, DbName}]) of
+        false ->
+            Stk = try throw(42) catch _:_:Stk0 -> Stk0 end,
+            io:format("UPDATING DBNAME[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [DbName, get_resource(), Stk, process_info(self(), current_stacktrace)]),
+            timer:sleep(1000),
+            erlang:halt(kaboomz);
+        true ->
+            true
+    end.
+
+set_context_username(null) ->
+    ok;
+set_context_username(UserName) ->
+    io:format("CSRT SETTING USERNAME CONTEXT: ~p~n", [UserName]),
+    case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.username, UserName}]) of
+        false ->
+            Stk = try throw(42) catch _:_:Stk0 -> Stk0 end,
+            io:format("UPDATING USERNAME[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [UserName, get_resource(), Stk, process_info(self(), current_stacktrace)]),
+            timer:sleep(1000),
+            erlang:halt(kaboomz);
+        true ->
+            true
+    end.
+
 track(#rctx{}=Rctx) ->
     %% TODO: should this block or not? If no, what cleans up zombies?
     %% gen_server:call(?MODULE, {track, PR}).
     gen_server:cast(?MODULE, {track, Rctx}).
diff --git a/src/fabric/src/fabric_rpc.erl b/src/fabric/src/fabric_rpc.erl
index 49f7d2c4fe5..da0710c9630 100644
--- a/src/fabric/src/fabric_rpc.erl
+++ b/src/fabric/src/fabric_rpc.erl
@@ -262,6 +262,7 @@ set_purge_infos_limit(DbName, Limit, Options) ->
     with_db(DbName, Options, {couch_db, set_purge_infos_limit, [Limit]}).
 
 open_doc(DbName, DocId, Options) ->
+    io:format("frpc:open_doc(~p, ~p, ~p)~n", [DbName, DocId, Options]),
     with_db(DbName, Options, {couch_db, open_doc, [DocId, Options]}).
 
 open_revs(DbName, IdRevsOpts, Options) ->
@@ -352,6 +353,14 @@ get_uuid(DbName) ->
 
 with_db(DbName, Options, {M, F, A}) ->
     set_io_priority(DbName, Options),
+    couch_stats_resource_tracker:set_context_dbname(DbName),
+    %% TODO: better approach here than using proplists?
+    case proplists:get_value(user_ctx, Options) of
+        undefined ->
+            ok;
+        #user_ctx{name = UserName} ->
+            couch_stats_resource_tracker:set_context_username(UserName)
+    end,
     case get_or_create_db(DbName, Options) of
         {ok, Db} ->
             rexi:reply(
diff --git a/src/rexi/src/rexi.erl b/src/rexi/src/rexi.erl
index 67287108975..754dee44411 100644
--- a/src/rexi/src/rexi.erl
+++ b/src/rexi/src/rexi.erl
@@ -129,7 +129,8 @@ async_server_call(Server, Caller, Request) ->
 -spec reply(any()) -> any().
 reply(Reply) ->
     {Caller, Ref} = get(rexi_from),
-    Delta = couch_stats_resource_tracker:make_delta(),
-    erlang:send(Caller, {Ref, Reply, {delta, Delta}}).
+    erlang:send(Caller, {Ref, Reply, get_delta()}).
 
 %% @equiv sync_reply(Reply, 300000)
 sync_reply(Reply) ->
@@ -214,7 +215,8 @@ stream(Msg, Limit, Timeout) ->
         {ok, Count} ->
             put(rexi_unacked, Count + 1),
             {Caller, Ref} = get(rexi_from),
-            Delta = couch_stats_resource_tracker:make_delta(),
-            erlang:send(Caller, {Ref, self(), Msg, {delta, Delta}}),
+            erlang:send(Caller, {Ref, self(), Msg, get_delta()}),
             ok
     catch
         throw:timeout ->
@@ -330,3 +328,6 @@ drain_acks(Count) ->
     after 0 -> {ok, Count}
     end.
+
+get_delta() ->
+    {delta, couch_stats_resource_tracker:make_delta()}.
diff --git a/src/rexi/src/rexi_utils.erl b/src/rexi/src/rexi_utils.erl
index 4f4ca5576c3..f7127ecd1f9 100644
--- a/src/rexi/src/rexi_utils.erl
+++ b/src/rexi/src/rexi_utils.erl
@@ -103,6 +103,8 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) ->
                     Fun(Msg, {Worker, From}, Acc0)
             end;
         {Ref, Msg} ->
+            %% TODO: add stack trace to log entry
+            couch_log:debug("rexi_utils:process_message no delta: {Ref, Msg} => {~p, ~p}~n", [Ref, Msg]),
             %%io:format("GOT NON DELTA MSG: ~p~n", [Msg]),
             case lists:keyfind(Ref, Keypos, RefList) of
                 false ->
@@ -113,6 +115,8 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) ->
             end;
         {Ref, From, Msg} ->
             %%io:format("GOT NON DELTA MSG: ~p~n", [Msg]),
+            %% TODO: add stack trace to log entry
+            couch_log:debug("rexi_utils:process_message no delta: {Ref, From, Msg} => {~p, ~p, ~p}~n", [Ref, From, Msg]),
             case lists:keyfind(Ref, Keypos, RefList) of
                 false ->
                     {ok, Acc0};

From 73863a736d621f84e9b9bb8518f7aca7aedbb1aa Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Tue, 31 Oct 2023 15:13:45 -0700
Subject: [PATCH 04/29] Embed worker usage deltas in rexi:ping

---
 src/rexi/src/rexi.erl       | 6 ++++++
 src/rexi/src/rexi_utils.erl | 6 ++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/rexi/src/rexi.erl b/src/rexi/src/rexi.erl
index 754dee44411..0db6ac5de79 100644
--- a/src/rexi/src/rexi.erl
+++ b/src/rexi/src/rexi.erl
@@ -265,16 +265,22 @@ stream_last(Msg, Timeout) ->
 
 %% @equiv stream_ack(Client, 1)
 stream_ack(Client) ->
+    %% stream_ack is coordinator side only, no need to send worker deltas
     erlang:send(Client, {rexi_ack, 1}).
 
 %% @doc Ack streamed messages
 stream_ack(Client, N) ->
+    %% stream_ack is coordinator side only, no need to send worker deltas
     erlang:send(Client, {rexi_ack, N}).
 
 %% Sends a ping message to the coordinator. This is for long running
 %% operations on a node that could exceed the rexi timeout
 ping() ->
     {Caller, _} = get(rexi_from),
-    erlang:send(Caller, {rexi, '$rexi_ping'}).
+    %% It is essential ping/0 includes deltas as otherwise long running
+    %% filtered queries will be silent on usage until they finally return
+    %% a row or no results. This delay is proportional to the database size,
+    %% so instead we make sure ping/0 keeps live stats flowing.
+    erlang:send(Caller, {rexi, '$rexi_ping', get_delta()}).
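+
+%% The coordinator side of this ping is handled in rexi_utils:process_message/6,
+%% which folds {rexi, '$rexi_ping', {delta, Delta}} into
+%% couch_stats_resource_tracker:accumulate_delta/1 and keeps waiting.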
%% internal functions %% diff --git a/src/rexi/src/rexi_utils.erl b/src/rexi/src/rexi_utils.erl index f7127ecd1f9..3125f75db92 100644 --- a/src/rexi/src/rexi_utils.erl +++ b/src/rexi/src/rexi_utils.erl @@ -67,6 +67,10 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> receive {timeout, TimeoutRef} -> {timeout, Acc0}; + {rexi, '$rexi_ping', {delta, Delta}} -> + io:format("[~p]GOT PING DELTA: ~p~n", [self(), Delta]), + couch_stats_resource_tracker:accumulate_delta(Delta), + {ok, Acc0}; {rexi, Ref, Msg} -> case lists:keyfind(Ref, Keypos, RefList) of false -> @@ -81,8 +85,6 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> Worker -> Fun(Msg, {Worker, From}, Acc0) end; - {rexi, '$rexi_ping'} -> - {ok, Acc0}; {Ref, Msg, {delta, Delta}} -> io:format("[~p]GOT DELTA: ~p -- ~p~n", [self(), Delta, Msg]), couch_stats_resource_tracker:accumulate_delta(Delta), From 12e0267f2c6551fedfbf09a97fce31a6495dcfde Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Thu, 9 Nov 2023 11:22:13 -0800 Subject: [PATCH 05/29] WIP: core delta aggregation working --- src/chttpd/src/chttpd.erl | 14 ++- .../src/couch_stats_resource_tracker.erl | 100 ++++++++++++++++-- src/fabric/priv/stats_descriptions.cfg | 8 ++ src/fabric/src/fabric_rpc.erl | 1 + src/rexi/src/rexi.erl | 6 +- src/rexi/src/rexi_utils.erl | 6 ++ 6 files changed, 121 insertions(+), 14 deletions(-) diff --git a/src/chttpd/src/chttpd.erl b/src/chttpd/src/chttpd.erl index fdd7855f0e9..688541ae8bb 100644 --- a/src/chttpd/src/chttpd.erl +++ b/src/chttpd/src/chttpd.erl @@ -313,6 +313,8 @@ handle_request_int(MochiReq) -> ] }, + io:format("CSRTZ PROCESSING: ~w~n", [RequestedPath]), + % put small token on heap to keep requests synced to backend calls erlang:put(nonce, Nonce), @@ -324,14 +326,22 @@ handle_request_int(MochiReq) -> chttpd_util:mochiweb_client_req_set(MochiReq), %% This is probably better in before_request, but having Path is nice + io:format("CSRTZ INITIATING CONTEXT: ~w~n", [Nonce]), couch_stats_resource_tracker:create_coordinator_context(HttpReq0, Path), + io:format("CSRTZ INITIAL CONTEXT: ~w~n", [couch_stats_resource_tracker:get_resource()]), + Coord = self(), + PidRef = couch_stats_resource_tracker:get_pid_ref(), + spawn(fun() -> monitor(process, Coord), receive M -> io:format("CSRTZ PROCESS DOWN[~w]{~w}: ~w~n", [ Coord, M, couch_stats_resource_tracker:get_resource(PidRef)]) end end), {HttpReq2, Response} = case before_request(HttpReq0) of {ok, HttpReq1} -> + io:format("CSRTZ BEFORE_REQUEST OK: ~w~n", [couch_stats_resource_tracker:get_resource()]), process_request(HttpReq1); {error, Response0} -> + io:format("CSRTZ BEFORE_REQUEST ERROR: ~w~n", [couch_stats_resource_tracker:get_resource()]), {HttpReq0, Response0} end, + io:format("CSRTZ AFTER PROCESS_REQUEST: ~w~n", [couch_stats_resource_tracker:get_resource()]), chttpd_util:mochiweb_client_req_clean(), @@ -347,9 +357,11 @@ handle_request_int(MochiReq) -> case after_request(HttpReq2, HttpResp) of #httpd_resp{status = ok, response = Resp} -> + io:format("CSRTZ AFTER REQUEST OK: ~w~n", [couch_stats_resource_tracker:get_resource()]), {ok, Resp}; #httpd_resp{status = aborted, reason = Reason} -> - couch_log:error("Response abnormally terminated: ~p", [Reason]), + io:format("CSRTZ AFTER REQUEST ERROR: ~w~n", [couch_stats_resource_tracker:get_resource()]), + couch_log:error("Response abnormally terminated: ~w", [Reason]), exit({shutdown, Reason}) end. 
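Stepping back, the rexi changes so far define a small wire protocol: a worker-to-coordinator message may carry a trailing {delta, Map} element describing resource usage since the worker's previous send, and the coordinator folds each such map into its own context row. A minimal sketch of the two halves, using the couch_stats_resource_tracker API (the names reply_with_delta/3 and on_message/1 are illustrative, not functions in the patch):

    %% Worker side: compute usage since the last checkpoint and attach it.
    reply_with_delta(Caller, Ref, Reply) ->
        Delta = couch_stats_resource_tracker:make_delta(),
        erlang:send(Caller, {Ref, Reply, {delta, Delta}}).

    %% Coordinator side: fold a worker delta into this process's #rctx{} row.
    on_message({Ref, Reply, {delta, Delta}}) ->
        couch_stats_resource_tracker:accumulate_delta(Delta),
        {Ref, Reply}.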
diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl index b74eb5a10d2..9f4f9254e19 100644 --- a/src/couch_stats/src/couch_stats_resource_tracker.erl +++ b/src/couch_stats/src/couch_stats_resource_tracker.erl @@ -34,6 +34,8 @@ -export([ create_context/0, create_context/1, create_context/3, create_coordinator_context/1, create_coordinator_context/2, + get_resource/0, + get_resource/1, set_context_dbname/1, set_context_username/1, track/1, @@ -73,6 +75,10 @@ io_bytes_written/1 ]). +-export([ + field/2 +]). + -include_lib("couch/include/couch_db.hrl"). %% Use these for record upgrades over the wire and in ETS tables @@ -98,6 +104,7 @@ %% TODO: overlap between this and couch btree fold invocations %% TODO: need some way to distinguish fols on views vs find vs all_docs -define(FRPC_CHANGES_ROW, changes_processed). +-define(FRPC_CHANGES_RETURNED, changes_returned). %%-define(FRPC_CHANGES_ROW, ?ROWS_READ). %% Module pdict markers @@ -133,6 +140,7 @@ rows_read = 0, btree_folds = 0, changes_processed = 0, + changes_returned = 0, ioq_calls = 0, io_bytes_read = 0, io_bytes_written = 0, @@ -170,6 +178,8 @@ inc(docs_read) -> inc(docs_read, 1); inc(?ROWS_READ) -> inc(?ROWS_READ, 1); +inc(?FRPC_CHANGES_RETURNED) -> + inc(?FRPC_CHANGES_RETURNED, 1); inc(?COUCH_BT_FOLDS) -> inc(?COUCH_BT_FOLDS, 1); inc(ioq_calls) -> @@ -200,6 +210,8 @@ inc(?DB_OPEN, N) -> update_counter(#rctx.?DB_OPEN, N); inc(?ROWS_READ, N) -> update_counter(#rctx.?ROWS_READ, N); +inc(?FRPC_CHANGES_RETURNED, N) -> + update_counter(#rctx.?FRPC_CHANGES_RETURNED, N); inc(ioq_calls, N) -> update_counter(#rctx.ioq_calls, N); inc(io_bytes_read, N) -> @@ -233,6 +245,8 @@ maybe_inc([couchdb, database_reads], Val) -> inc(?DB_OPEN_DOC, Val); maybe_inc([fabric_rpc, changes, processed], Val) -> inc(?FRPC_CHANGES_ROW, Val); +maybe_inc([fabric_rpc, changes, returned], Val) -> + inc(?FRPC_CHANGES_RETURNED, Val); maybe_inc([fabric_rpc, view, rows_read], Val) -> inc(?ROWS_READ, Val); maybe_inc([couchdb, couch_server, open], Val) -> @@ -261,6 +275,8 @@ should_track([fabric_rpc, changes, spawned]) -> true; should_track([fabric_rpc, changes, processed]) -> true; +should_track([fabric_rpc, changes, returned]) -> + true; should_track([fabric_rpc, map_view, spawned]) -> true; should_track([fabric_rpc, reduce_view, spawned]) -> @@ -283,7 +299,12 @@ should_track(_Metric) -> %% TODO: update coordinator stats from worker deltas accumulate_delta(Delta) -> - io:format("Accumulating delta: ~p~n", [Delta]), + case get_resource() of + #rctx{type={coordinator,_,_}} = Rctx -> + io:format("Accumulating delta: ~w || ~w~n", [Delta, Rctx]); + _ -> + ok + end, %% TODO: switch to creating a batch of updates to invoke a single %% update_counter rather than sequentially invoking it for each field maps:foreach(fun inc/2, Delta). @@ -292,8 +313,8 @@ update_counter(Field, Count) -> update_counter(get_pid_ref(), Field, Count). -update_counter({_Pid,_Ref}=Key, Field, Count) -> - ets:update_counter(?MODULE, Key, {Field, Count}, #rctx{pid_ref=Key}). +update_counter({_Pid,_Ref}=PidRef, Field, Count) -> + ets:update_counter(?MODULE, PidRef, {Field, Count}, #rctx{pid_ref=PidRef}). active() -> active_int(all). @@ -319,6 +340,39 @@ select_by_type(all) -> lists:map(fun to_json/1, ets:tab2list(?MODULE)). 
+field(#rctx{pid_ref=Val}, pid_ref) -> Val; +field(#rctx{mfa=Val}, mfa) -> Val; +field(#rctx{nonce=Val}, nonce) -> Val; +field(#rctx{from=Val}, from) -> Val; +field(#rctx{type=Val}, type) -> Val; +field(#rctx{state=Val}, state) -> Val; +field(#rctx{dbname=Val}, dbname) -> Val; +field(#rctx{username=Val}, username) -> Val; +field(#rctx{db_open=Val}, db_open) -> Val; +field(#rctx{docs_read=Val}, docs_read) -> Val; +field(#rctx{rows_read=Val}, rows_read) -> Val; +field(#rctx{btree_folds=Val}, btree_folds) -> Val; +field(#rctx{changes_processed=Val}, changes_processed) -> Val; +field(#rctx{changes_returned=Val}, changes_returned) -> Val; +field(#rctx{ioq_calls=Val}, ioq_calls) -> Val; +field(#rctx{io_bytes_read=Val}, io_bytes_read) -> Val; +field(#rctx{io_bytes_written=Val}, io_bytes_written) -> Val; +field(#rctx{js_evals=Val}, js_evals) -> Val; +field(#rctx{js_filter=Val}, js_filter) -> Val; +field(#rctx{js_filter_error=Val}, js_filter_error) -> Val; +field(#rctx{js_filtered_docs=Val}, js_filtered_docs) -> Val; +field(#rctx{mango_eval_match=Val}, mango_eval_match) -> Val; +field(#rctx{get_kv_node=Val}, get_kv_node) -> Val; +field(#rctx{get_kp_node=Val}, get_kp_node) -> Val. + + +%%group_by(GBFun) when is_fun(GBFun) -> +%% group_by(GBFun, sum). + + +%%group_by(GbFun, Agg) when is_function(GBFun) -> + + to_json(#rctx{}=Rctx) -> #rctx{ updated_at = TP, @@ -331,13 +385,17 @@ to_json(#rctx{}=Rctx) -> db_open = DbOpens, docs_read = DocsRead, rows_read = RowsRead, + js_filter = JSFilters, + js_filter_error = JSFilterErrors, + js_filtered_docs = JSFilteredDocss, state = State0, type = Type, btree_folds = BtFolds, get_kp_node = KpNodes, get_kv_node = KvNodes, ioq_calls = IoqCalls, - changes_processed = ChangesProcessed + changes_processed = ChangesProcessed, + changes_returned = ChangesReturned } = Rctx, %%io:format("TO_JSON_MFA: ~p~n", [MFA0]), MFA = case MFA0 of @@ -380,6 +438,9 @@ to_json(#rctx{}=Rctx) -> username => UserName, db_open => DbOpens, docs_read => DocsRead, + js_filter => JSFilters, + js_filter_error => JSFilterErrors, + js_filtered_docs => JSFilteredDocss, rows_read => RowsRead, state => State, type => term_to_json({type, Type}), @@ -387,7 +448,8 @@ to_json(#rctx{}=Rctx) -> kp_nodes => KpNodes, kv_nodes => KvNodes, ioq_calls => IoqCalls, - changes_processed => ChangesProcessed + changes_processed => ChangesProcessed, + changes_returned => ChangesReturned }. term_to_json({Pid, Ref}) when is_pid(Pid), is_reference(Ref) -> @@ -428,14 +490,18 @@ to_flat_json(#rctx{}=Rctx) -> db_open = DbOpens, docs_read = DocsRead, rows_read = RowsRead, + js_filter = JSFilters, + js_filter_error = JSFilterErrors, + js_filtered_docs = JSFilteredDocss, state = State0, type = Type, get_kp_node = KpNodes, get_kv_node = KvNodes, btree_folds = ChangesProcessed, + changes_returned = ChangesReturned, ioq_calls = IoqCalls } = Rctx, - io:format("TO_JSON_MFA: ~p~n", [MFA0]), + %%io:format("TO_JSON_MFA: ~p~n", [MFA0]), MFA = case MFA0 of {_M, _F, _A} -> ?l2b(io_lib:format("~w", [MFA0])); @@ -475,12 +541,16 @@ to_flat_json(#rctx{}=Rctx) -> username => UserName, db_open => DbOpens, docs_read => DocsRead, + js_filter => JSFilters, + js_filter_error => JSFilterErrors, + js_filtered_docs => JSFilteredDocss, rows_read => RowsRead, state => State, type => term_to_flat_json({type, Type}), kp_nodes => KpNodes, kv_nodes => KvNodes, btree_folds => ChangesProcessed, + changes_returned => ChangesReturned, ioq_calls => IoqCalls }. 
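The field/2 clauses above make every #rctx{} column addressable by name at runtime, and the commented-out group_by stubs sketch where this is headed (patch 07 below builds the real aggregations on top of a curried accessor). The currying idea in isolation, with a deliberately tiny record (a standalone demo module, not code from the patch):

    -module(rctx_field_demo).
    -export([run/0]).

    -record(r, {a = 0, b = 0}).

    field(#r{a = V}, a) -> V;
    field(#r{b = V}, b) -> V.

    %% Build a fun that extracts one named field from any #r{} record.
    curry_field(F) -> fun(Rec) -> field(Rec, F) end.

    run() ->
        GetB = curry_field(b),
        lists:sum([GetB(R) || R <- [#r{b = 1}, #r{b = 2}, #r{b = 3}]]).  %% => 6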
@@ -507,7 +577,7 @@ create_context(Pid) -> %% add type to disnguish coordinator vs rpc_worker create_context(From, {M,F,_A} = MFA, Nonce) -> - io:format("[~p] CREAT_CONTEXT MFA[~p]: {~p}: ~p~n", [self(), From, MFA, Nonce]), + %%io:format("[~p] CREAT_CONTEXT MFA[~p]: {~p}: ~p~n", [self(), From, MFA, Nonce]), PidRef = get_pid_ref(), %% this will instantiate a new PidRef %%Rctx = make_record(self(), Ref), %% TODO: extract user_ctx and db/shard from @@ -527,7 +597,7 @@ create_coordinator_context(#httpd{path_parts=Parts} = Req) -> create_coordinator_context(Req, io_lib:format("~p", [Parts])). create_coordinator_context(#httpd{} = Req, Path) -> - io:format("CREATING COORDINATOR CONTEXT ON {~p}~n", [Path]), + %%io:format("CREATING COORDINATOR CONTEXT ON {~p}~n", [Path]), #httpd{ method = Verb, %%path_parts = Parts, @@ -561,11 +631,11 @@ set_context_dbname(DbName) -> set_context_username(null) -> ok; set_context_username(UserName) -> - io:format("CSRT SETTING USERNAME CONTEXT: ~p~n", [UserName]), + %%io:format("CSRT SETTING USERNAME CONTEXT: ~p~n", [UserName]), case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.username, UserName}]) of false -> Stk = try throw(42) catch _:_:Stk0 -> Stk0 end, - io:format("UPDATING DBNAME[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [UserName, get_resource(), Stk, process_info(self(), current_stacktrace)]), + io:format("UPDATING USERNAME[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [UserName, get_resource(), Stk, process_info(self(), current_stacktrace)]), timer:sleep(1000), erlang:halt(kaboomz); true -> @@ -612,6 +682,7 @@ make_delta() -> %% Perhaps somewhat naughty we're incrementing stats from within %% couch_stats itself? Might need to handle this differently %% TODO: determine appropriate course of action here + io:format("~n**********MISSING STARTING DELTA************~n~n", []), couch_stats:increment_counter( [couchdb, csrt, delta_missing_t0]), %%[couch_stats_resource_tracker, delta_missing_t0]), @@ -630,13 +701,19 @@ make_delta() -> TA0 end, TB = get_resource(), - make_delta(TA, TB). + Delta = make_delta(TA, TB), + set_delta_a(TB), + Delta. make_delta(#rctx{}=TA, #rctx{}=TB) -> Delta = #{ docs_read => TB#rctx.docs_read - TA#rctx.docs_read, + js_filter => TB#rctx.js_filter - TA#rctx.js_filter, + js_filter_error => TB#rctx.js_filter_error - TA#rctx.js_filter_error, + js_filtered_docs => TB#rctx.js_filtered_docs - TA#rctx.js_filtered_docs, rows_read => TB#rctx.rows_read - TA#rctx.rows_read, + changes_returned => TB#rctx.changes_returned - TA#rctx.changes_returned, btree_folds => TB#rctx.btree_folds - TA#rctx.btree_folds, get_kp_node => TB#rctx.get_kp_node - TA#rctx.get_kp_node, get_kv_node => TB#rctx.get_kv_node - TA#rctx.get_kv_node, @@ -761,4 +838,5 @@ log_process_lifetime_report(PidRef) -> %% More safely assert this can't ever be undefined #rctx{} = Rctx = get_resource(PidRef), %% TODO: catch error out of here, report crashes on depth>1 json + io:format("CSRT RCTX: ~p~n", [to_flat_json(Rctx)]), couch_log:report("csrt-pid-usage-lifetime", to_flat_json(Rctx)). diff --git a/src/fabric/priv/stats_descriptions.cfg b/src/fabric/priv/stats_descriptions.cfg index 7a062cca0ec..528942be86e 100644 --- a/src/fabric/priv/stats_descriptions.cfg +++ b/src/fabric/priv/stats_descriptions.cfg @@ -52,6 +52,14 @@ {type, counter}, {desc, <<"number of fabric_rpc worker changes spawns">>} ]}. +{[fabric_rpc, changes, processed], [ + {type, counter}, + {desc, <<"number of fabric_rpc worker changes row invocations">>} +]}. 
+{[fabric_rpc, changes, returned], [
+    {type, counter},
+    {desc, <<"number of fabric_rpc worker changes rows returned">>}
+]}.
 {[fabric_rpc, view, rows_read], [
     {type, counter},
     {desc, <<"number of fabric_rpc view_cb row invocations">>}
diff --git a/src/fabric/src/fabric_rpc.erl b/src/fabric/src/fabric_rpc.erl
index da0710c9630..b5f41782a12 100644
--- a/src/fabric/src/fabric_rpc.erl
+++ b/src/fabric/src/fabric_rpc.erl
@@ -570,6 +570,7 @@ changes_enumerator(DocInfo, Acc) ->
                 {seq, {Seq, uuid(Db), couch_db:owner_of(Epochs, Seq)}}
             ]};
         Results ->
+            couch_stats:increment_counter([fabric_rpc, changes, returned]),
             Opts =
                 if
                     Conflicts -> [conflicts | DocOptions];
diff --git a/src/rexi/src/rexi.erl b/src/rexi/src/rexi.erl
index 0db6ac5de79..35e884a9f15 100644
--- a/src/rexi/src/rexi.erl
+++ b/src/rexi/src/rexi.erl
@@ -243,7 +243,9 @@ stream2(Msg, Limit, Timeout) ->
         {ok, Count} ->
             put(rexi_unacked, Count + 1),
             {Caller, Ref} = get(rexi_from),
-            erlang:send(Caller, {Ref, self(), Msg}),
+            %% TODO: why do the numbers go whacky when we add the delta here
+            erlang:send(Caller, {Ref, self(), Msg, get_delta()}),
+            %%erlang:send(Caller, {Ref, self(), Msg}),
             ok
     catch
         throw:timeout ->
@@ -281,7 +283,7 @@ ping() ->
     %% filtered queries will be silent on usage until they finally return
     %% a row or no results. This delay is proportional to the database size,
     %% so instead we make sure ping/0 keeps live stats flowing.
-    erlang:send(Caller, {rexi, '$rexi_ping'}).
+    erlang:send(Caller, {rexi, '$rexi_ping', get_delta()}).
diff --git a/src/rexi/src/rexi_utils.erl b/src/rexi/src/rexi_utils.erl
index 3125f75db92..642087fab6d 100644
--- a/src/rexi/src/rexi_utils.erl
+++ b/src/rexi/src/rexi_utils.erl
@@ -107,6 +107,9 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) ->
         {Ref, Msg} ->
             %% TODO: add stack trace to log entry
             couch_log:debug("rexi_utils:process_message no delta: {Ref, Msg} => {~p, ~p}~n", [Ref, Msg]),
+            timer:sleep(100),
+            %%erlang:halt(enodelta),
+            erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, Msg} => {~w, ~w}~n", [Ref, Msg])))),
             %%io:format("GOT NON DELTA MSG: ~p~n", [Msg]),
             case lists:keyfind(Ref, Keypos, RefList) of
                 false ->
@@ -119,6 +122,9 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) ->
             %%io:format("GOT NON DELTA MSG: ~p~n", [Msg]),
             %% TODO: add stack trace to log entry
             couch_log:debug("rexi_utils:process_message no delta: {Ref, From, Msg} => {~p, ~p, ~p}~n", [Ref, From, Msg]),
+            timer:sleep(100),
+            %%erlang:halt(enodelta),
+            erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, From, Msg} => {~w, ~w, ~w}~n", [Ref, From, Msg])))),
             case lists:keyfind(Ref, Keypos, RefList) of
                 false ->
                     {ok, Acc0};

From 84be4af1ba67087062bbf0f2909480867cd02bf6 Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Fri, 10 Nov 2023 13:22:11 -0800
Subject: [PATCH 06/29] TMP: rewire rexi to fail on unexpected messages

---
 src/rexi/include/rexi.hrl    |  1 +
 src/rexi/src/rexi_server.erl | 10 +++++-----
 src/rexi/src/rexi_utils.erl  | 32 +++++++++++++++++++-----------
 3 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/src/rexi/include/rexi.hrl b/src/rexi/include/rexi.hrl
index a2d86b2ab54..a962f306917 100644
--- a/src/rexi/include/rexi.hrl
+++ b/src/rexi/include/rexi.hrl
@@ -11,6 +11,7 @@
 % the License.
-record(error, { + delta, timestamp, reason, mfa, diff --git a/src/rexi/src/rexi_server.erl b/src/rexi/src/rexi_server.erl index 08a9139bd79..7bfe264844e 100644 --- a/src/rexi/src/rexi_server.erl +++ b/src/rexi/src/rexi_server.erl @@ -101,12 +101,12 @@ handle_info({'DOWN', Ref, process, Pid, Error}, #st{workers = Workers} = St) -> case find_worker(Ref, Workers) of #job{worker_pid = Pid, worker = Ref, client_pid = CPid, client = CRef} = Job -> case Error of - #error{reason = {_Class, Reason}, stack = Stack} -> - notify_caller({CPid, CRef}, {Reason, Stack}), + #error{reason = {_Class, Reason}, stack = Stack, delta = Delta} -> + notify_caller({CPid, CRef}, {Reason, Stack}, Delta), St1 = save_error(Error, St), {noreply, remove_job(Job, St1)}; _ -> - notify_caller({CPid, CRef}, Error), + notify_caller({CPid, CRef}, Error, edelta), {noreply, remove_job(Job, St)} end; false -> @@ -205,8 +205,8 @@ find_worker(Ref, Tab) -> [Worker] -> Worker end. -notify_caller({Caller, Ref}, Reason) -> - rexi_utils:send(Caller, {Ref, {rexi_EXIT, Reason}}). +notify_caller({Caller, Ref}, Reason, Delta) -> + rexi_utils:send(Caller, {Ref, {rexi_EXIT, Reason}, {delta, Delta}}). kill_worker(FromRef, #st{clients = Clients} = St) -> case find_worker(FromRef, Clients) of diff --git a/src/rexi/src/rexi_utils.erl b/src/rexi/src/rexi_utils.erl index 642087fab6d..43f3346d945 100644 --- a/src/rexi/src/rexi_utils.erl +++ b/src/rexi/src/rexi_utils.erl @@ -104,13 +104,7 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> Worker -> Fun(Msg, {Worker, From}, Acc0) end; - {Ref, Msg} -> - %% TODO: add stack trace to log entry - couch_log:debug("rexi_utils:process_message no delta: {Ref, Msg} => {~p, ~p}~n", [Ref, Msg]), - timer:sleep(100), - %%erlang:halt(enodelta), - erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, Msg} => {~w, ~w}~n", [Ref, Msg])))), - %%io:format("GOT NON DELTA MSG: ~p~n", [Msg]), + {Ref, get_state = Msg} -> case lists:keyfind(Ref, Keypos, RefList) of false -> % this was some non-matching message which we will ignore @@ -118,19 +112,35 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> Worker -> Fun(Msg, Worker, Acc0) end; - {Ref, From, Msg} -> - %%io:format("GOT NON DELTA MSG: ~p~n", [Msg]), + {Ref, done = Msg} -> + case lists:keyfind(Ref, Keypos, RefList) of + false -> + % this was some non-matching message which we will ignore + {ok, Acc0}; + Worker -> + Fun(Msg, Worker, Acc0) + end; + %% {#Ref<0.1961702128.3491758082.26965>, {rexi_EXIT,{{query_parse_error,<<78,111,32,114,111,119,115,32,99,97,110,32,109,97 + {Ref, Msg} -> %% TODO: add stack trace to log entry - couch_log:debug("rexi_utils:process_message no delta: {Ref, From, Msg} => {~p, ~p, ~p}~n", [Ref, From, Msg]), + couch_log:debug("rexi_utils:process_message no delta: {Ref, Msg} => {~p, ~p}~n", [Ref, Msg]), timer:sleep(100), %%erlang:halt(enodelta), - erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, From, Msg} => {~w, ~w, ~w}~n", [Ref, From, Msg])))), + erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, Msg} => {~w, ~w}~n", [Ref, Msg])))); + {Ref, From, rexi_STREAM_INIT = Msg} -> case lists:keyfind(Ref, Keypos, RefList) of false -> {ok, Acc0}; Worker -> Fun(Msg, {Worker, From}, Acc0) end; + {Ref, From, Msg} -> + %%io:format("GOT NON DELTA MSG: ~p~n", [Msg]), + %% TODO: add stack trace to log entry + 
couch_log:debug("rexi_utils:process_message no delta: {Ref, From, Msg} => {~p, ~p, ~p}~n", [Ref, From, Msg]), + timer:sleep(100), + %%erlang:halt(enodelta), + erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, From, Msg} => {~w, ~w, ~w}~n", [Ref, From, Msg])))); {rexi_DOWN, _, _, _} = Msg -> Fun(Msg, nil, Acc0) after PerMsgTO -> From 19224af0ad0441f530b074153a93eb07cd939129 Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Mon, 13 Nov 2023 16:11:52 -0800 Subject: [PATCH 07/29] Add baseline group by and sort by aggregations --- .../src/couch_stats_resource_tracker.erl | 58 +++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl index 9f4f9254e19..da8cd30413f 100644 --- a/src/couch_stats/src/couch_stats_resource_tracker.erl +++ b/src/couch_stats/src/couch_stats_resource_tracker.erl @@ -48,6 +48,16 @@ active_workers/0 ]). +-export([ + count_by/1, + group_by/2, + group_by/3, + sorted/1, + sorted_by/1, + sorted_by/2, + sorted_by/3 +]). + -export([ make_delta/0 ]). @@ -76,7 +86,8 @@ ]). -export([ - field/2 + field/2, + curry_field/1 ]). -include_lib("couch/include/couch_db.hrl"). @@ -366,11 +377,50 @@ field(#rctx{get_kv_node=Val}, get_kv_node) -> Val; field(#rctx{get_kp_node=Val}, get_kp_node) -> Val. -%%group_by(GBFun) when is_fun(GBFun) -> -%% group_by(GBFun, sum). +curry_field(Field) -> + fun(Ele) -> field(Ele, Field) end. + + +count_by(KeyFun) -> + group_by(KeyFun, fun(_) -> 1 end). + + +group_by(KeyFun, ValFun) -> + group_by(KeyFun, ValFun, fun erlang:'+'/2). + + +%% eg: group_by(mfa, docs_read). +%% eg: group_by(fun(#rctx{mfa=MFA,docs_read=DR}) -> {MFA, DR} end, ioq_calls). +%% eg: ^^ or: group_by([mfa, docs_read], ioq_calls). +%% eg: group_by([username, dbname, mfa], docs_read). +%% eg: group_by([username, dbname, mfa], ioq_calls). +%% eg: group_by([username, dbname, mfa], js_filters). +group_by(KeyL, ValFun, AggFun) when is_list(KeyL) -> + KeyFun = fun(Ele) -> list_to_tuple([field(Ele, Key) || Key <- KeyL]) end, + group_by(KeyFun, ValFun, AggFun); +group_by(Key, ValFun, AggFun) when is_atom(Key) -> + group_by(curry_field(Key), ValFun, AggFun); +group_by(KeyFun, Val, AggFun) when is_atom(Val) -> + group_by(KeyFun, curry_field(Val), AggFun); +group_by(KeyFun, ValFun, AggFun) -> + FoldFun = fun(Ele, Acc) -> + Key = KeyFun(Ele), + Val = ValFun(Ele), + CurrVal = maps:get(Key, Acc, 0), + NewVal = AggFun(CurrVal, Val), + maps:put(Key, NewVal, Acc) + end, + ets:foldl(FoldFun, #{}, ?MODULE). + +sorted(Map) when is_map(Map) -> + lists:sort(fun({_K1, A}, {_K2, B}) -> B < A end, maps:to_list(Map)). -%%group_by(GbFun, Agg) when is_function(GBFun) -> +%% eg: sorted_by([username, dbname, mfa], ioq_calls) +%% eg: sorted_by([dbname, mfa], doc_reads) +sorted_by(KeyFun) -> sorted(count_by(KeyFun)). +sorted_by(KeyFun, ValFun) -> sorted(group_by(KeyFun, ValFun)). +sorted_by(KeyFun, ValFun, AggFun) -> sorted(group_by(KeyFun, ValFun, AggFun)). 
to_json(#rctx{}=Rctx) -> From cafa5123b97b45a4e45d7e6def352de7f9c92ce5 Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Mon, 13 Nov 2023 16:25:23 -0800 Subject: [PATCH 08/29] Add watchdog scanning for unmonitored processes --- .../src/couch_stats_resource_tracker.erl | 64 ++++++++++++++----- src/rexi/src/rexi.erl | 2 - 2 files changed, 49 insertions(+), 17 deletions(-) diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl index da8cd30413f..0a5b2e33558 100644 --- a/src/couch_stats/src/couch_stats_resource_tracker.erl +++ b/src/couch_stats/src/couch_stats_resource_tracker.erl @@ -45,7 +45,8 @@ -export([ active/0, active_coordinators/0, - active_workers/0 + active_workers/0, + find_unmonitored/0 ]). -export([ @@ -125,7 +126,8 @@ -record(st, { - eviction_delay = 10000, %% How many ms dead processes are visible + eviction_delay = 10 * 1000, %% How many ms dead processes are visible + scan_interval = 2048, %% How regularly to perfom scans tracking = #{} %% track active processes for eventual eviction }). @@ -134,9 +136,11 @@ %% -record(?RCTX, { -record(rctx, { %% Metadata + started_at = os:timestamp(), updated_at = os:timestamp(), exited_at, pid_ref, + mon_ref, mfa, nonce, from, @@ -426,6 +430,7 @@ sorted_by(KeyFun, ValFun, AggFun) -> sorted(group_by(KeyFun, ValFun, AggFun)). to_json(#rctx{}=Rctx) -> #rctx{ updated_at = TP, + started_at = TInit, pid_ref = {_Pid, _Ref} = PidRef, mfa = MFA0, nonce = Nonce0, @@ -478,6 +483,7 @@ to_json(#rctx{}=Rctx) -> end, #{ updated_at => term_to_json(TP), + started_at => term_to_json(TInit), %%pid_ref => [pid_to_list(Pid), ref_to_list(Ref)], pid_ref => term_to_json(PidRef), mfa => term_to_json(MFA), @@ -531,6 +537,7 @@ term_to_flat_json(T) -> to_flat_json(#rctx{}=Rctx) -> #rctx{ updated_at = TP, + started_at = TInit, pid_ref = {_Pid, _Ref} = PidRef, mfa = MFA0, nonce = Nonce0, @@ -582,6 +589,7 @@ to_flat_json(#rctx{}=Rctx) -> #{ %%updated_at => ?l2b(io_lib:format("~w", [TP])), updated_at => term_to_flat_json(TP), + started_at => term_to_flat_json(TInit), %%pid_ref => [pid_to_list(Pid), ref_to_list(Ref)], pid_ref => ?l2b(io_lib:format("~w", [PidRef])), mfa => MFA, @@ -814,6 +822,12 @@ get_resource(PidRef) -> make_record(Pid, Ref) -> #rctx{pid_ref = {Pid, Ref}}. + +find_unmonitored() -> + %% TODO: only need PidRef here, replace with a select that does that... + [PR || #rctx{pid_ref=PR} <- ets:match_object(?MODULE, #rctx{mon_ref=undefined, _ = '_'})]. + + start_link() -> gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). @@ -826,28 +840,31 @@ init([]) -> {read_concurrency, true}, {keypos, #rctx.pid_ref} ]), - {ok, #st{}}. + St = #st{}, + _TimerRef = erlang:send_after(St#st.scan_interval, self(), scan), + {ok, St}. handle_call(fetch, _from, #st{} = St) -> {reply, {ok, St}, St}; -handle_call({track, _}, _From, St) -> - {reply, ok, St}; +handle_call({track, _}=Msg, _From, St) -> + {noreply, St1} = handle_cast(Msg, St), + {reply, ok, St1}; handle_call(Msg, _From, St) -> {stop, {unknown_call, Msg}, error, St}. 
-handle_cast({track, #rctx{pid_ref={Pid,_}=PidRef}}, #st{tracking=AT0} = St0) -> - St = case maps:is_key(PidRef, AT0) of - true -> %% noop, we're already tracking this PidRef - St0; - false -> %% setup new monitor and double bookkeep refs - Mon = erlang:monitor(process, Pid), - AT = maps:put(Mon, PidRef, maps:put(PidRef, Mon, AT0)), - St0#st{tracking=AT} - end, - {noreply, St}; +handle_cast({track, #rctx{pid_ref=PidRef}}, #st{tracking=AT0} = St0) -> + AT = maybe_track(PidRef, AT0), + {noreply, St0#st{tracking=AT}}; handle_cast(Msg, St) -> {stop, {unknown_cast, Msg}, St}. +handle_info(scan, #st{tracking=AT0} = St0) -> + io:format("{CSRT} TRIGGERING SCAN: ~n", []), + Unmonitored = find_unmonitored(), + io:format("{CSRT} SCAN FOUND ~p UNMONITORED PIDS: ~n", [length(Unmonitored)]), + AT = maybe_track(Unmonitored, AT0), + _TimerRef = erlang:send_after(St0#st.scan_interval, self(), scan), + {noreply, St0#st{tracking=AT}}; handle_info({'DOWN', MonRef, Type, DPid, Reason}, #st{tracking=AT0} = St0) -> io:format("CSRT:HI(~p)~n", [{'DOWN', MonRef, Type, DPid, Reason}]), St = case maps:get(MonRef, AT0, undefined) of @@ -884,6 +901,23 @@ terminate(_Reason, _St) -> code_change(_OldVsn, St, _Extra) -> {ok, St}. + +maybe_track([], AT) -> + AT; +maybe_track(PidRef, AT) when is_tuple(PidRef) -> + maybe_track([PidRef], AT); +maybe_track([{Pid,_Ref} = PidRef | PidRefs], AT) -> + AT1 = case maps:is_key(PidRef, AT) of + true -> %% noop, we're already tracking this PidRef + AT; + false -> %% setup new monitor and double bookkeep refs + Mon = erlang:monitor(process, Pid), + %% TODO: decide whether we want the true match to crash this process on failure + true = ets:update_element(?MODULE, PidRef, [{#rctx.mon_ref, Mon}]), + maps:put(Mon, PidRef, maps:put(PidRef, Mon, AT)) + end, + maybe_track(PidRefs, AT1). 
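The tracking map is double-keyed on purpose: each monitored context is stored both as PidRef => MonRef and as MonRef => PidRef, so the 'DOWN' handler (which only has a monitor ref) and the tracking path (which works from a pid ref) each resolve in a single lookup. Reduced to its essence, with the ets bookkeeping elided (a sketch, not code from the patch):

    %% Record both directions when a monitor is established.
    track_pair({Pid, _Ref} = PidRef, Map) ->
        Mon = erlang:monitor(process, Pid),
        maps:put(Mon, PidRef, maps:put(PidRef, Mon, Map)).

    %% A 'DOWN' message only carries the monitor ref; remove both entries.
    untrack_pair(MonRef, Map) when is_reference(MonRef) ->
        case maps:take(MonRef, Map) of
            {PidRef, Map1} -> maps:remove(PidRef, Map1);
            error -> Map
        end.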
+ log_process_lifetime_report(PidRef) -> %% More safely assert this can't ever be undefined #rctx{} = Rctx = get_resource(PidRef), diff --git a/src/rexi/src/rexi.erl b/src/rexi/src/rexi.erl index 35e884a9f15..0b005dace8d 100644 --- a/src/rexi/src/rexi.erl +++ b/src/rexi/src/rexi.erl @@ -243,9 +243,7 @@ stream2(Msg, Limit, Timeout) -> {ok, Count} -> put(rexi_unacked, Count + 1), {Caller, Ref} = get(rexi_from), - %% TODO: why do the numbers go whacky when we add the delta here erlang:send(Caller, {Ref, self(), Msg, get_delta()}), - %%erlang:send(Caller, {Ref, self(), Msg}), ok catch throw:timeout -> From 996c38551ae2a10ee16371bab30f5a06128aa102 Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Wed, 15 Nov 2023 15:08:35 -0800 Subject: [PATCH 09/29] Set dbname context properly --- src/fabric/src/fabric_rpc.erl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/fabric/src/fabric_rpc.erl b/src/fabric/src/fabric_rpc.erl index b5f41782a12..bfd039aad95 100644 --- a/src/fabric/src/fabric_rpc.erl +++ b/src/fabric/src/fabric_rpc.erl @@ -70,6 +70,7 @@ changes(DbName, #changes_args{} = Args, StartSeq, DbOptions) -> changes(DbName, [Args], StartSeq, DbOptions); changes(DbName, Options, StartVector, DbOptions) -> set_io_priority(DbName, DbOptions), + couch_stats_resource_tracker:set_context_dbname(DbName), Args0 = lists:keyfind(changes_args, 1, Options), #changes_args{dir = Dir, filter_fun = Filter} = Args0, Args = @@ -146,6 +147,7 @@ all_docs(DbName, Options, Args0) -> case fabric_util:upgrade_mrargs(Args0) of #mrargs{keys = undefined} = Args -> set_io_priority(DbName, Options), + couch_stats_resource_tracker:set_context_dbname(DbName), {ok, Db} = get_or_create_db(DbName, Options), CB = get_view_cb(Args), couch_mrview:query_all_docs(Db, Args, CB, Args) @@ -170,6 +172,7 @@ map_view(DbName, {DDocId, Rev}, ViewName, Args0, DbOptions) -> map_view(DbName, DDoc, ViewName, Args0, DbOptions); map_view(DbName, DDoc, ViewName, Args0, DbOptions) -> set_io_priority(DbName, DbOptions), + couch_stats_resource_tracker:set_context_dbname(DbName), Args = fabric_util:upgrade_mrargs(Args0), {ok, Db} = get_or_create_db(DbName, DbOptions), CB = get_view_cb(Args), @@ -184,6 +187,7 @@ reduce_view(DbName, {DDocId, Rev}, ViewName, Args0, DbOptions) -> reduce_view(DbName, DDoc, ViewName, Args0, DbOptions); reduce_view(DbName, DDoc, ViewName, Args0, DbOptions) -> set_io_priority(DbName, DbOptions), + couch_stats_resource_tracker:set_context_dbname(DbName), Args = fabric_util:upgrade_mrargs(Args0), {ok, Db} = get_or_create_db(DbName, DbOptions), VAcc0 = #vacc{db = Db}, @@ -327,6 +331,7 @@ reset_validation_funs(DbName) -> open_shard(Name, Opts) -> set_io_priority(Name, Opts), + couch_stats_resource_tracker:set_context_dbname(DbName), try rexi:reply(mem3_util:get_or_create_db(Name, Opts)) catch @@ -386,6 +391,7 @@ with_db(DbName, Options, {M, F, A}) -> read_repair_filter(DbName, Docs, NodeRevs, Options) -> set_io_priority(DbName, Options), + couch_stats_resource_tracker:set_context_dbname(DbName), case get_or_create_db(DbName, Options) of {ok, Db} -> try From 1bbf7a90e82ccb224c0ebf1558fabbf2ee4bd795 Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Wed, 15 Nov 2023 16:03:20 -0800 Subject: [PATCH 10/29] Add unsafe_foldl for perf testing --- .../src/couch_stats_resource_tracker.erl | 43 +++++++++++++++---- src/fabric/src/fabric_rpc.erl | 2 +- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl index 
0a5b2e33558..d2dd4406ce2 100644 --- a/src/couch_stats/src/couch_stats_resource_tracker.erl +++ b/src/couch_stats/src/couch_stats_resource_tracker.erl @@ -53,10 +53,13 @@ count_by/1, group_by/2, group_by/3, + group_by/4, sorted/1, sorted_by/1, sorted_by/2, - sorted_by/3 + sorted_by/3, + + unsafe_foldl/3 ]). -export([ @@ -393,20 +396,24 @@ group_by(KeyFun, ValFun) -> group_by(KeyFun, ValFun, fun erlang:'+'/2). +group_by(KeyFun, ValFun, AggFun) -> + group_by(KeyFun, ValFun, AggFun, fun ets:foldl/3). + + %% eg: group_by(mfa, docs_read). %% eg: group_by(fun(#rctx{mfa=MFA,docs_read=DR}) -> {MFA, DR} end, ioq_calls). %% eg: ^^ or: group_by([mfa, docs_read], ioq_calls). %% eg: group_by([username, dbname, mfa], docs_read). %% eg: group_by([username, dbname, mfa], ioq_calls). %% eg: group_by([username, dbname, mfa], js_filters). -group_by(KeyL, ValFun, AggFun) when is_list(KeyL) -> +group_by(KeyL, ValFun, AggFun, Fold) when is_list(KeyL) -> KeyFun = fun(Ele) -> list_to_tuple([field(Ele, Key) || Key <- KeyL]) end, - group_by(KeyFun, ValFun, AggFun); -group_by(Key, ValFun, AggFun) when is_atom(Key) -> - group_by(curry_field(Key), ValFun, AggFun); -group_by(KeyFun, Val, AggFun) when is_atom(Val) -> - group_by(KeyFun, curry_field(Val), AggFun); -group_by(KeyFun, ValFun, AggFun) -> + group_by(KeyFun, ValFun, AggFun, Fold); +group_by(Key, ValFun, AggFun, Fold) when is_atom(Key) -> + group_by(curry_field(Key), ValFun, AggFun, Fold); +group_by(KeyFun, Val, AggFun, Fold) when is_atom(Val) -> + group_by(KeyFun, curry_field(Val), AggFun, Fold); +group_by(KeyFun, ValFun, AggFun, Fold) -> FoldFun = fun(Ele, Acc) -> Key = KeyFun(Ele), Val = ValFun(Ele), @@ -414,7 +421,8 @@ group_by(KeyFun, ValFun, AggFun) -> NewVal = AggFun(CurrVal, Val), maps:put(Key, NewVal, Acc) end, - ets:foldl(FoldFun, #{}, ?MODULE). + Fold(FoldFun, #{}, ?MODULE). + sorted(Map) when is_map(Map) -> lists:sort(fun({_K1, A}, {_K2, B}) -> B < A end, maps:to_list(Map)). @@ -924,3 +932,20 @@ log_process_lifetime_report(PidRef) -> %% TODO: catch error out of here, report crashes on depth>1 json io:format("CSRT RCTX: ~p~n", [to_flat_json(Rctx)]), couch_log:report("csrt-pid-usage-lifetime", to_flat_json(Rctx)). + + +%% Reimplementation of: https://github.com/erlang/otp/blob/b2ee4fc9a0b81a139dad2033e9b2bfc178146886/lib/stdlib/src/ets.erl#L633-L658 +%% with wrapping of ets:safe_fixtable/2 removed +unsafe_foldl(F, Accu, T) -> + First = ets:first(T), + do_foldl(F, Accu, First, T). + +do_foldl(F, Accu0, Key, T) -> + case Key of + '$end_of_table' -> + Accu0; + _ -> + do_foldl(F, + lists:foldl(F, Accu0, ets:lookup(T, Key)), + ets:next(T, Key), T) + end. 
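The motivation for unsafe_foldl/3: ets:foldl/3 brackets the fold with ets:safe_fixtable/2, which guarantees each row is visited exactly once but pins the table while a potentially slow aggregation runs. Dropping the fixtable trades correctness at the margins (a row inserted or deleted mid-fold may be seen twice or skipped) for less interference with the hot write path. A rough way to compare the two on a live node (timings indicative only; the named table is the module's own ets table):

    Count = fun(Fold) ->
        Fold(fun(_Rctx, N) -> N + 1 end, 0, couch_stats_resource_tracker)
    end,
    {SafeUsec, _} = timer:tc(fun() -> Count(fun ets:foldl/3) end),
    {UnsafeUsec, _} = timer:tc(fun() ->
        Count(fun couch_stats_resource_tracker:unsafe_foldl/3)
    end),
    {SafeUsec, UnsafeUsec}.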
diff --git a/src/fabric/src/fabric_rpc.erl b/src/fabric/src/fabric_rpc.erl index bfd039aad95..2c8ba16547f 100644 --- a/src/fabric/src/fabric_rpc.erl +++ b/src/fabric/src/fabric_rpc.erl @@ -331,7 +331,7 @@ reset_validation_funs(DbName) -> open_shard(Name, Opts) -> set_io_priority(Name, Opts), - couch_stats_resource_tracker:set_context_dbname(DbName), + couch_stats_resource_tracker:set_context_dbname(Name), try rexi:reply(mem3_util:get_or_create_db(Name, Opts)) catch From e70ba4d5f261b2ec9855aa430810ee25ee83e03d Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Fri, 17 Nov 2023 16:01:21 -0800 Subject: [PATCH 11/29] Rework csrt context setting --- src/fabric/src/fabric_rpc.erl | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/src/fabric/src/fabric_rpc.erl b/src/fabric/src/fabric_rpc.erl index 2c8ba16547f..6a8b069ea8b 100644 --- a/src/fabric/src/fabric_rpc.erl +++ b/src/fabric/src/fabric_rpc.erl @@ -70,7 +70,6 @@ changes(DbName, #changes_args{} = Args, StartSeq, DbOptions) -> changes(DbName, [Args], StartSeq, DbOptions); changes(DbName, Options, StartVector, DbOptions) -> set_io_priority(DbName, DbOptions), - couch_stats_resource_tracker:set_context_dbname(DbName), Args0 = lists:keyfind(changes_args, 1, Options), #changes_args{dir = Dir, filter_fun = Filter} = Args0, Args = @@ -147,7 +146,6 @@ all_docs(DbName, Options, Args0) -> case fabric_util:upgrade_mrargs(Args0) of #mrargs{keys = undefined} = Args -> set_io_priority(DbName, Options), - couch_stats_resource_tracker:set_context_dbname(DbName), {ok, Db} = get_or_create_db(DbName, Options), CB = get_view_cb(Args), couch_mrview:query_all_docs(Db, Args, CB, Args) @@ -172,7 +170,6 @@ map_view(DbName, {DDocId, Rev}, ViewName, Args0, DbOptions) -> map_view(DbName, DDoc, ViewName, Args0, DbOptions); map_view(DbName, DDoc, ViewName, Args0, DbOptions) -> set_io_priority(DbName, DbOptions), - couch_stats_resource_tracker:set_context_dbname(DbName), Args = fabric_util:upgrade_mrargs(Args0), {ok, Db} = get_or_create_db(DbName, DbOptions), CB = get_view_cb(Args), @@ -187,7 +184,6 @@ reduce_view(DbName, {DDocId, Rev}, ViewName, Args0, DbOptions) -> reduce_view(DbName, DDoc, ViewName, Args0, DbOptions); reduce_view(DbName, DDoc, ViewName, Args0, DbOptions) -> set_io_priority(DbName, DbOptions), - couch_stats_resource_tracker:set_context_dbname(DbName), Args = fabric_util:upgrade_mrargs(Args0), {ok, Db} = get_or_create_db(DbName, DbOptions), VAcc0 = #vacc{db = Db}, @@ -331,7 +327,6 @@ reset_validation_funs(DbName) -> open_shard(Name, Opts) -> set_io_priority(Name, Opts), - couch_stats_resource_tracker:set_context_dbname(Name), try rexi:reply(mem3_util:get_or_create_db(Name, Opts)) catch @@ -358,14 +353,6 @@ get_uuid(DbName) -> with_db(DbName, Options, {M, F, A}) -> set_io_priority(DbName, Options), - couch_stats_resource_tracker:set_context_dbname(DbName), - %% TODO: better approach here than using proplists? - case proplists:get_value(user_ctx, Options) of - undefined -> - ok; - #user_ctx{name = UserName} -> - couch_stats_resource_tracker:set_context_username(UserName) - end, case get_or_create_db(DbName, Options) of {ok, Db} -> rexi:reply( @@ -391,7 +378,6 @@ with_db(DbName, Options, {M, F, A}) -> read_repair_filter(DbName, Docs, NodeRevs, Options) -> set_io_priority(DbName, Options), - couch_stats_resource_tracker:set_context_dbname(DbName), case get_or_create_db(DbName, Options) of {ok, Db} -> try @@ -682,6 +668,14 @@ clean_stack(S) -> ). 
set_io_priority(DbName, Options) ->
+    couch_stats_resource_tracker:set_context_dbname(DbName),
+    %% TODO: better approach here than using proplists?
+    case proplists:get_value(user_ctx, Options) of
+        undefined ->
+            ok;
+        #user_ctx{name = UserName} ->
+            couch_stats_resource_tracker:set_context_username(UserName)
+    end,
     case lists:keyfind(io_priority, 1, Options) of
         {io_priority, Pri} ->
             erlang:put(io_priority, Pri);

From 715b524199cd2b1a4c35e26152c8d44d857db793 Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Fri, 17 Nov 2023 16:03:12 -0800
Subject: [PATCH 12/29] WIP: ugly but working HTTP API around groupings

---
 src/chttpd/src/chttpd_misc.erl                  | 92 ++++++++++++++++++-
 .../src/couch_stats_resource_tracker.erl        | 13 ++-
 2 files changed, 99 insertions(+), 6 deletions(-)

diff --git a/src/chttpd/src/chttpd_misc.erl b/src/chttpd/src/chttpd_misc.erl
index 195c8d8284d..28bbcbee01d 100644
--- a/src/chttpd/src/chttpd_misc.erl
+++ b/src/chttpd/src/chttpd_misc.erl
@@ -37,8 +37,7 @@
     [
         send_json/2, send_json/3,
         send_method_not_allowed/2,
-        send_chunk/2,
-        start_chunked_response/3
+        send_chunk/2
     ]
 ).

@@ -231,6 +230,92 @@ handle_task_status_req(#httpd{method = 'GET'} = Req) ->
 handle_task_status_req(Req) ->
     send_method_not_allowed(Req, "GET,HEAD").

+handle_resource_status_req(#httpd{method = 'POST'} = Req) ->
+    ok = chttpd:verify_is_server_admin(Req),
+    chttpd:validate_ctype(Req, "application/json"),
+    {Props} = chttpd:json_body_obj(Req),
+    Action = proplists:get_value(<<"action">>, Props),
+    Key = proplists:get_value(<<"key">>, Props),
+    Val = proplists:get_value(<<"val">>, Props),
+
+    CountBy = fun couch_stats_resource_tracker:count_by/1,
+    GroupBy = fun couch_stats_resource_tracker:group_by/2,
+    SortedBy1 = fun couch_stats_resource_tracker:sorted_by/1,
+    SortedBy2 = fun couch_stats_resource_tracker:sorted_by/2,
+    ConvertEle = fun(K) -> list_to_existing_atom(binary_to_list(K)) end,
+    ConvertList = fun(L) -> [ConvertEle(E) || E <- L] end,
+    ToJson = fun couch_stats_resource_tracker:term_to_flat_json/1,
+    JsonKeys = fun(PL) -> [[ToJson(K), V] || {K, V} <- PL] end,
+
+    Fun = case {Action, Key, Val} of
+        {<<"count_by">>, Keys, undefined} when is_list(Keys) ->
+            Keys1 = [ConvertEle(K) || K <- Keys],
+            fun() -> CountBy(Keys1) end;
+        {<<"count_by">>, Key, undefined} ->
+            Key1 = ConvertEle(Key),
+            fun() -> CountBy(Key1) end;
+        {<<"group_by">>, Keys, Vals} when is_list(Keys) andalso is_list(Vals) ->
+            Keys1 = ConvertList(Keys),
+            Vals1 = ConvertList(Vals),
+            fun() -> GroupBy(Keys1, Vals1) end;
+        {<<"group_by">>, Key, Vals} when is_list(Vals) ->
+            Key1 = ConvertEle(Key),
+            Vals1 = ConvertList(Vals),
+            fun() -> GroupBy(Key1, Vals1) end;
+        {<<"group_by">>, Keys, Val} when is_list(Keys) ->
+            Keys1 = ConvertList(Keys),
+            Val1 = ConvertEle(Val),
+            fun() -> GroupBy(Keys1, Val1) end;
+        {<<"group_by">>, Key, Val} ->
+            Key1 = ConvertEle(Key),
+            Val1 = ConvertEle(Val),
+            fun() -> GroupBy(Key1, Val1) end;
+
+        {<<"sorted_by">>, Keys, undefined} when is_list(Keys) ->
+            Keys1 = [ConvertEle(K) || K <- Keys],
+            fun() -> JsonKeys(SortedBy1(Keys1)) end;
+        {<<"sorted_by">>, Key, undefined} ->
+            Key1 = ConvertEle(Key),
+            fun() -> JsonKeys(SortedBy1(Key1)) end;
+        {<<"sorted_by">>, Keys, Vals} when is_list(Keys) andalso is_list(Vals) ->
+            Keys1 = ConvertList(Keys),
+            Vals1 = ConvertList(Vals),
+            fun() -> JsonKeys(SortedBy2(Keys1, Vals1)) end;
+        {<<"sorted_by">>, Key, Vals} when is_list(Vals) ->
+            Key1 = ConvertEle(Key),
+            Vals1 = ConvertList(Vals),
+            fun() -> JsonKeys(SortedBy2(Key1, Vals1)) end;
+        {<<"sorted_by">>, Keys, Val} when is_list(Keys) ->
+            Keys1 = ConvertList(Keys),
+            Val1 = ConvertEle(Val),
+            fun() -> JsonKeys(SortedBy2(Keys1, Val1)) end;
+        {<<"sorted_by">>, Key, Val} ->
+            Key1 = ConvertEle(Key),
+            Val1 = ConvertEle(Val),
+            fun() -> JsonKeys(SortedBy2(Key1, Val1)) end;
+        _ ->
+            throw({badrequest, invalid_resource_request})
+    end,
+
+    Fun1 = fun() ->
+        case Fun() of
+            Map when is_map(Map) ->
+                {maps:fold(
+                    fun(K, V, A) -> [{ToJson(K), V} | A] end,
+                    [], Map)};
+            List when is_list(List) ->
+                List
+        end
+    end,
+
+    {Resp, _Bad} = rpc:multicall(erlang, apply, [
+        fun() ->
+            {node(), Fun1()}
+        end,
+        []
+    ]),
+    io:format("{CSRT}***** GOT RESP: ~p~n", [Resp]),
+    send_json(Req, {Resp});
 handle_resource_status_req(#httpd{method = 'GET'} = Req) ->
     ok = chttpd:verify_is_server_admin(Req),
     {Resp, Bad} = rpc:multicall(erlang, apply, [
@@ -243,7 +328,8 @@ handle_resource_status_req(#httpd{method = 'GET'} = Req) ->
     io:format("ACTIVE RESP: ~p~nBAD RESP: ~p~n", [Resp, Bad]),
     send_json(Req, {Resp});
 handle_resource_status_req(Req) ->
-    send_method_not_allowed(Req, "GET,HEAD").
+    ok = chttpd:verify_is_server_admin(Req),
+    send_method_not_allowed(Req, "GET,HEAD,POST").

 handle_replicate_req(#httpd{method = 'POST', user_ctx = Ctx, req_body = PostBody} = Req) ->
     chttpd:validate_ctype(Req, "application/json"),
diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl
index d2dd4406ce2..d5fcd92ae25 100644
--- a/src/couch_stats/src/couch_stats_resource_tracker.erl
+++ b/src/couch_stats/src/couch_stats_resource_tracker.erl
@@ -59,7 +59,9 @@
     sorted_by/2,
     sorted_by/3,

-    unsafe_foldl/3
+    unsafe_foldl/3,
+
+    term_to_flat_json/1
 ]).

 -export([
@@ -424,6 +426,7 @@ group_by(KeyFun, ValFun, AggFun, Fold) ->
     Fold(FoldFun, #{}, ?MODULE).


+%% Sorts largest first
 sorted(Map) when is_map(Map) ->
     lists:sort(fun({_K1, A}, {_K2, B}) -> B < A end, maps:to_list(Map)).

@@ -523,6 +526,8 @@ term_to_json({type, {coordinator, _, _} = Type}) ->
     ?l2b(io_lib:format("~p", [Type]));
 term_to_json({A, B, C}) ->
     [A, B, C];
+term_to_json(Tuple) when is_tuple(Tuple) ->
+    ?l2b(io_lib:format("~w", [Tuple]));
 term_to_json(undefined) ->
     null;
 term_to_json(null) ->
@@ -530,11 +535,13 @@ term_to_json(null) ->
 term_to_json(T) ->
     T.
-term_to_flat_json({type, {coordinator, _, _} = Type}) -> +term_to_flat_json({type, {coordinator, _, _}}=Type) -> %%io:format("SETTING FLAT JSON TYPE: ~p~n", [Type]), ?l2b(io_lib:format("~p", [Type])); +term_to_flat_json({coordinator, _, _}=Type) -> + ?l2b(io_lib:format("~p", [Type])); term_to_flat_json(Tuple) when is_tuple(Tuple) -> - ?l2b(io_lib:format("~w", [Tuple])); + ?l2b(io_lib:format("~p", [Tuple])); term_to_flat_json(undefined) -> null; term_to_flat_json(null) -> From be8ab045d93b72aa1da4cbcb84d71054cf5d03ee Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Mon, 20 Nov 2023 13:45:29 -0800 Subject: [PATCH 13/29] Cleanup debug info --- src/chttpd/src/chttpd.erl | 13 +------ src/chttpd/src/chttpd_misc.erl | 2 +- src/couch/priv/stats_descriptions.cfg | 2 +- .../src/couch_stats_resource_tracker.erl | 36 +++---------------- src/fabric/src/fabric_rpc.erl | 1 - src/fabric/src/fabric_util.erl | 2 -- src/rexi/src/rexi_utils.erl | 6 ---- 7 files changed, 7 insertions(+), 55 deletions(-) diff --git a/src/chttpd/src/chttpd.erl b/src/chttpd/src/chttpd.erl index 688541ae8bb..78b41a7106a 100644 --- a/src/chttpd/src/chttpd.erl +++ b/src/chttpd/src/chttpd.erl @@ -313,8 +313,6 @@ handle_request_int(MochiReq) -> ] }, - io:format("CSRTZ PROCESSING: ~w~n", [RequestedPath]), - % put small token on heap to keep requests synced to backend calls erlang:put(nonce, Nonce), @@ -326,22 +324,15 @@ handle_request_int(MochiReq) -> chttpd_util:mochiweb_client_req_set(MochiReq), %% This is probably better in before_request, but having Path is nice - io:format("CSRTZ INITIATING CONTEXT: ~w~n", [Nonce]), couch_stats_resource_tracker:create_coordinator_context(HttpReq0, Path), - io:format("CSRTZ INITIAL CONTEXT: ~w~n", [couch_stats_resource_tracker:get_resource()]), - Coord = self(), - PidRef = couch_stats_resource_tracker:get_pid_ref(), - spawn(fun() -> monitor(process, Coord), receive M -> io:format("CSRTZ PROCESS DOWN[~w]{~w}: ~w~n", [ Coord, M, couch_stats_resource_tracker:get_resource(PidRef)]) end end), + {HttpReq2, Response} = case before_request(HttpReq0) of {ok, HttpReq1} -> - io:format("CSRTZ BEFORE_REQUEST OK: ~w~n", [couch_stats_resource_tracker:get_resource()]), process_request(HttpReq1); {error, Response0} -> - io:format("CSRTZ BEFORE_REQUEST ERROR: ~w~n", [couch_stats_resource_tracker:get_resource()]), {HttpReq0, Response0} end, - io:format("CSRTZ AFTER PROCESS_REQUEST: ~w~n", [couch_stats_resource_tracker:get_resource()]), chttpd_util:mochiweb_client_req_clean(), @@ -357,10 +348,8 @@ handle_request_int(MochiReq) -> case after_request(HttpReq2, HttpResp) of #httpd_resp{status = ok, response = Resp} -> - io:format("CSRTZ AFTER REQUEST OK: ~w~n", [couch_stats_resource_tracker:get_resource()]), {ok, Resp}; #httpd_resp{status = aborted, reason = Reason} -> - io:format("CSRTZ AFTER REQUEST ERROR: ~w~n", [couch_stats_resource_tracker:get_resource()]), couch_log:error("Response abnormally terminated: ~w", [Reason]), exit({shutdown, Reason}) end. 
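As an aside, while its debug output is being trimmed: the POST API added in the previous commit is driven by a JSON body naming an action plus key and value fields. Assuming the _active_resources route registered earlier in the series and a local dev node with admin credentials adm:pass (both assumptions, adjust to taste), a per-user, per-shard IOQ report reads:

    curl -s -X POST 'http://adm:pass@127.0.0.1:15984/_active_resources' \
        -H 'Content-Type: application/json' \
        -d '{"action": "sorted_by", "key": ["username", "dbname", "mfa"], "val": "ioq_calls"}'

The response is one JSON object keyed by node name, each value being that node's aggregation, as assembled by the rpc:multicall/4 wrapper in the handler.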
diff --git a/src/chttpd/src/chttpd_misc.erl b/src/chttpd/src/chttpd_misc.erl index 28bbcbee01d..eff4c93c11d 100644 --- a/src/chttpd/src/chttpd_misc.erl +++ b/src/chttpd/src/chttpd_misc.erl @@ -314,7 +314,7 @@ handle_resource_status_req(#httpd{method = 'POST'} = Req) -> end, [] ]), - io:format("{CSRT}***** GOT RESP: ~p~n", [Resp]), + %%io:format("{CSRT}***** GOT RESP: ~p~n", [Resp]), send_json(Req, {Resp}); handle_resource_status_req(#httpd{method = 'GET'} = Req) -> ok = chttpd:verify_is_server_admin(Req), diff --git a/src/couch/priv/stats_descriptions.cfg b/src/couch/priv/stats_descriptions.cfg index 02740e4d8f8..43794bff62b 100644 --- a/src/couch/priv/stats_descriptions.cfg +++ b/src/couch/priv/stats_descriptions.cfg @@ -418,7 +418,7 @@ {type, counter}, {desc, <<"number of JS filter invocations">>} ]}. -{[couchdb, query_server, js_filterered], [ +{[couchdb, query_server, js_filtered_docs], [ {type, counter}, {desc, <<"number of docs filtered through JS invocations">>} ]}. diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl index d5fcd92ae25..5097f286617 100644 --- a/src/couch_stats/src/couch_stats_resource_tracker.erl +++ b/src/couch_stats/src/couch_stats_resource_tracker.erl @@ -317,14 +317,7 @@ should_track(_Metric) -> %%io:format("SKIPPING METRIC: ~p~n", [Metric]), false. -%% TODO: update coordinator stats from worker deltas accumulate_delta(Delta) -> - case get_resource() of - #rctx{type={coordinator,_,_}} = Rctx -> - io:format("Accumulating delta: ~w || ~w~n", [Delta, Rctx]); - _ -> - ok - end, %% TODO: switch to creating a batch of updates to invoke a single %% update_counter rather than sequentially invoking it for each field maps:foreach(fun inc/2, Delta). @@ -463,7 +456,6 @@ to_json(#rctx{}=Rctx) -> changes_processed = ChangesProcessed, changes_returned = ChangesReturned } = Rctx, - %%io:format("TO_JSON_MFA: ~p~n", [MFA0]), MFA = case MFA0 of {M, F, A} -> [M, F, A]; @@ -495,11 +487,9 @@ to_json(#rctx{}=Rctx) -> #{ updated_at => term_to_json(TP), started_at => term_to_json(TInit), - %%pid_ref => [pid_to_list(Pid), ref_to_list(Ref)], pid_ref => term_to_json(PidRef), mfa => term_to_json(MFA), nonce => term_to_json(Nonce), - %%from => From, from => term_to_json(From), dbname => DbName, username => UserName, @@ -522,7 +512,6 @@ to_json(#rctx{}=Rctx) -> term_to_json({Pid, Ref}) when is_pid(Pid), is_reference(Ref) -> [?l2b(pid_to_list(Pid)), ?l2b(ref_to_list(Ref))]; term_to_json({type, {coordinator, _, _} = Type}) -> - %%io:format("SETTING JSON TYPE: ~p~n", [Type]), ?l2b(io_lib:format("~p", [Type])); term_to_json({A, B, C}) -> [A, B, C]; @@ -536,7 +525,6 @@ term_to_json(T) -> T. 
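The TODO retained in accumulate_delta/1 above maps directly onto an existing ets capability: ets:update_counter/4 accepts a list of {Position, Increment} operations applied in one isolated call, and the #rctx{pid_ref = Key} default argument turns the call into an upsert when the row does not exist yet. A sketch of the batched form; rctx_index/1 is a hypothetical helper translating delta keys to record positions (e.g. docs_read to #rctx.docs_read), and non-counter keys such as dt are filtered out first:

    accumulate_delta_batch(Delta) when is_map(Delta) ->
        PidRef = get_pid_ref(),
        Ops = maps:fold(
            fun(Field, Incr, Acc) -> [{rctx_index(Field), Incr} | Acc] end,
            [],
            maps:remove(dt, Delta)
        ),
        %% One atomic ets call instead of one per field.
        ets:update_counter(?MODULE, PidRef, Ops, #rctx{pid_ref = PidRef}).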
term_to_flat_json({type, {coordinator, _, _}}=Type) -> - %%io:format("SETTING FLAT JSON TYPE: ~p~n", [Type]), ?l2b(io_lib:format("~p", [Type])); term_to_flat_json({coordinator, _, _}=Type) -> ?l2b(io_lib:format("~p", [Type])); @@ -573,7 +561,6 @@ to_flat_json(#rctx{}=Rctx) -> changes_returned = ChangesReturned, ioq_calls = IoqCalls } = Rctx, - %%io:format("TO_JSON_MFA: ~p~n", [MFA0]), MFA = case MFA0 of {_M, _F, _A} -> ?l2b(io_lib:format("~w", [MFA0])); @@ -600,12 +587,9 @@ to_flat_json(#rctx{}=Rctx) -> Nonce0 -> list_to_binary(Nonce0) end, - io:format("NONCE IS: ~p||~p~n", [Nonce0, Nonce]), #{ - %%updated_at => ?l2b(io_lib:format("~w", [TP])), updated_at => term_to_flat_json(TP), started_at => term_to_flat_json(TInit), - %%pid_ref => [pid_to_list(Pid), ref_to_list(Ref)], pid_ref => ?l2b(io_lib:format("~w", [PidRef])), mfa => MFA, nonce => Nonce, @@ -650,10 +634,8 @@ create_context(Pid) -> %% add type to disnguish coordinator vs rpc_worker create_context(From, {M,F,_A} = MFA, Nonce) -> - %%io:format("[~p] CREAT_CONTEXT MFA[~p]: {~p}: ~p~n", [self(), From, MFA, Nonce]), PidRef = get_pid_ref(), %% this will instantiate a new PidRef - %%Rctx = make_record(self(), Ref), - %% TODO: extract user_ctx and db/shard from + %% TODO: extract user_ctx and db/shard from Rctx = #rctx{ pid_ref = PidRef, from = From, @@ -670,18 +652,13 @@ create_coordinator_context(#httpd{path_parts=Parts} = Req) -> create_coordinator_context(Req, io_lib:format("~p", [Parts])). create_coordinator_context(#httpd{} = Req, Path) -> - %%io:format("CREATING COORDINATOR CONTEXT ON {~p}~n", [Path]), #httpd{ method = Verb, - %%path_parts = Parts, nonce = Nonce } = Req, PidRef = get_pid_ref(), %% this will instantiate a new PidRef - %%Rctx = make_record(self(), Ref), - %% TODO: extract user_ctx and db/shard from Req Rctx = #rctx{ pid_ref = PidRef, - %%type = {cooridantor, Verb, Parts}, type = {coordinator, Verb, [$/ | Path]}, nonce = Nonce }, @@ -704,7 +681,6 @@ set_context_dbname(DbName) -> set_context_username(null) -> ok; set_context_username(UserName) -> - %%io:format("CSRT SETTING USERNAME CONTEXT: ~p~n", [UserName]), case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.username, UserName}]) of false -> Stk = try throw(42) catch _:_:Stk0 -> Stk0 end, @@ -755,7 +731,7 @@ make_delta() -> %% Perhaps somewhat naughty we're incrementing stats from within %% couch_stats itself? Might need to handle this differently %% TODO: determine appropriate course of action here - io:format("~n**********MISSING STARTING DELTA************~n~n", []), + %% io:format("~n**********MISSING STARTING DELTA************~n~n", []), couch_stats:increment_counter( [couchdb, csrt, delta_missing_t0]), %%[couch_stats_resource_tracker, delta_missing_t0]), @@ -769,7 +745,6 @@ make_delta() -> TA0 -> TA0 end; - %%?RCTX{} = TA0 -> #rctx{} = TA0 -> TA0 end, @@ -804,7 +779,6 @@ make_delta(#rctx{}, _) -> make_delta_base() -> Ref = make_ref(), - %%Rctx = make_record(self(), Ref), %% TODO: extract user_ctx and db/shard from request TA0 = #rctx{ pid_ref = {self(), Ref} @@ -874,14 +848,12 @@ handle_cast(Msg, St) -> {stop, {unknown_cast, Msg}, St}. 
handle_info(scan, #st{tracking=AT0} = St0) -> - io:format("{CSRT} TRIGGERING SCAN: ~n", []), Unmonitored = find_unmonitored(), - io:format("{CSRT} SCAN FOUND ~p UNMONITORED PIDS: ~n", [length(Unmonitored)]), AT = maybe_track(Unmonitored, AT0), _TimerRef = erlang:send_after(St0#st.scan_interval, self(), scan), {noreply, St0#st{tracking=AT}}; handle_info({'DOWN', MonRef, Type, DPid, Reason}, #st{tracking=AT0} = St0) -> - io:format("CSRT:HI(~p)~n", [{'DOWN', MonRef, Type, DPid, Reason}]), + %% io:format("CSRT:HI(~p)~n", [{'DOWN', MonRef, Type, DPid, Reason}]), St = case maps:get(MonRef, AT0, undefined) of undefined -> io:format("ERROR: UNEXPECTED MISSING MONITOR IN TRACKING TABLE: {~p, ~p}~n", [MonRef, DPid]), @@ -937,7 +909,7 @@ log_process_lifetime_report(PidRef) -> %% More safely assert this can't ever be undefined #rctx{} = Rctx = get_resource(PidRef), %% TODO: catch error out of here, report crashes on depth>1 json - io:format("CSRT RCTX: ~p~n", [to_flat_json(Rctx)]), + %%io:format("CSRT RCTX: ~p~n", [to_flat_json(Rctx)]), couch_log:report("csrt-pid-usage-lifetime", to_flat_json(Rctx)). diff --git a/src/fabric/src/fabric_rpc.erl b/src/fabric/src/fabric_rpc.erl index 6a8b069ea8b..6fa990e711b 100644 --- a/src/fabric/src/fabric_rpc.erl +++ b/src/fabric/src/fabric_rpc.erl @@ -262,7 +262,6 @@ set_purge_infos_limit(DbName, Limit, Options) -> with_db(DbName, Options, {couch_db, set_purge_infos_limit, [Limit]}). open_doc(DbName, DocId, Options) -> - io:format("frpc:open_doc(~p, ~p, ~p)~n", [DbName, DocId, Options]), with_db(DbName, Options, {couch_db, open_doc, [DocId, Options]}). open_revs(DbName, IdRevsOpts, Options) -> diff --git a/src/fabric/src/fabric_util.erl b/src/fabric/src/fabric_util.erl index 00d6b5e0e1c..2bde50a29d2 100644 --- a/src/fabric/src/fabric_util.erl +++ b/src/fabric/src/fabric_util.erl @@ -138,7 +138,6 @@ get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor) -> try receive {Ref, {ok, Db}, {delta, Delta}} -> - io:format("[~p]GET SHARD GOT DELTA: ~p~n", [self(), Delta]), couch_stats_resource_tracker:accumulate_delta(Delta), {ok, Db}; {Ref, {'rexi_EXIT', {{unauthorized, _} = Error, _}}} -> @@ -149,7 +148,6 @@ get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor) -> couch_log:debug("Failed to open shard ~p because: ~p", [Name, Reason]), get_shard(Rest, Opts, Timeout, Factor); Other -> - io:format("GOT UNEXPECTED MESSAGE FORMAT: ~p~n", [Other]), erlang:error(Other) after Timeout -> couch_log:debug("Failed to open shard ~p after: ~p", [Name, Timeout]), diff --git a/src/rexi/src/rexi_utils.erl b/src/rexi/src/rexi_utils.erl index 43f3346d945..f18ca179fdf 100644 --- a/src/rexi/src/rexi_utils.erl +++ b/src/rexi/src/rexi_utils.erl @@ -68,7 +68,6 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> {timeout, TimeoutRef} -> {timeout, Acc0}; {rexi, '$rexi_ping', {delta, Delta}} -> - io:format("[~p]GOT PING DELTA: ~p~n", [self(), Delta]), couch_stats_resource_tracker:accumulate_delta(Delta), {ok, Acc0}; {rexi, Ref, Msg} -> @@ -86,7 +85,6 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> Fun(Msg, {Worker, From}, Acc0) end; {Ref, Msg, {delta, Delta}} -> - io:format("[~p]GOT DELTA: ~p -- ~p~n", [self(), Delta, Msg]), couch_stats_resource_tracker:accumulate_delta(Delta), case lists:keyfind(Ref, Keypos, RefList) of false -> @@ -96,7 +94,6 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> Fun(Msg, Worker, Acc0) end; {Ref, From, Msg, {delta, Delta}} -> - io:format("[~p]GOT DELTA: ~p -- 
~p~n", [self(), Delta, Msg]), couch_stats_resource_tracker:accumulate_delta(Delta), case lists:keyfind(Ref, Keypos, RefList) of false -> @@ -125,7 +122,6 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> %% TODO: add stack trace to log entry couch_log:debug("rexi_utils:process_message no delta: {Ref, Msg} => {~p, ~p}~n", [Ref, Msg]), timer:sleep(100), - %%erlang:halt(enodelta), erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, Msg} => {~w, ~w}~n", [Ref, Msg])))); {Ref, From, rexi_STREAM_INIT = Msg} -> case lists:keyfind(Ref, Keypos, RefList) of @@ -135,11 +131,9 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> Fun(Msg, {Worker, From}, Acc0) end; {Ref, From, Msg} -> - %%io:format("GOT NON DELTA MSG: ~p~n", [Msg]), %% TODO: add stack trace to log entry couch_log:debug("rexi_utils:process_message no delta: {Ref, From, Msg} => {~p, ~p, ~p}~n", [Ref, From, Msg]), timer:sleep(100), - %%erlang:halt(enodelta), erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, From, Msg} => {~w, ~w, ~w}~n", [Ref, From, Msg])))); {rexi_DOWN, _, _, _} = Msg -> Fun(Msg, nil, Acc0) From 4c05a0f39a2f4d7e372c80f8701678c006b0df3e Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Mon, 20 Nov 2023 14:25:32 -0800 Subject: [PATCH 14/29] Make CSRT toggle-able and rework delta sending --- .../src/couch_stats_resource_tracker.erl | 163 +++++++++++------- src/rexi/src/rexi.erl | 27 ++- src/rexi/src/rexi_utils.erl | 27 ++- 3 files changed, 143 insertions(+), 74 deletions(-) diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl index 5097f286617..fb0d39cd3ed 100644 --- a/src/couch_stats/src/couch_stats_resource_tracker.erl +++ b/src/couch_stats/src/couch_stats_resource_tracker.erl @@ -34,6 +34,7 @@ -export([ create_context/0, create_context/1, create_context/3, create_coordinator_context/1, create_coordinator_context/2, + is_enabled/0, get_resource/0, get_resource/1, set_context_dbname/1, @@ -175,11 +176,15 @@ get_kp_node = 0 }). +is_enabled() -> + config:get_boolean("couch_stats_resource_tracker", "enabled", true). + db_opened() -> inc(db_opened). doc_read() -> inc(docs_read). row_read() -> inc(rows_read). btree_fold() -> inc(?COUCH_BT_FOLDS). -ioq_called() -> inc(ioq_calls). +%% TODO: do we need ioq_called and this access pattern? +ioq_called() -> is_enabled() andalso inc(ioq_calls). js_evaled() -> inc(js_evals). js_filtered() -> inc(js_filter). js_filtered_error() -> inc(js_filter_error). 
@@ -290,29 +295,29 @@ maybe_inc(_Metric, _Val) -> %% TODO: update stats_descriptions.cfg for relevant apps should_track([fabric_rpc, all_docs, spawned]) -> - true; + is_enabled(); should_track([fabric_rpc, changes, spawned]) -> - true; + is_enabled(); should_track([fabric_rpc, changes, processed]) -> - true; + is_enabled(); should_track([fabric_rpc, changes, returned]) -> - true; + is_enabled(); should_track([fabric_rpc, map_view, spawned]) -> - true; + is_enabled(); should_track([fabric_rpc, reduce_view, spawned]) -> - true; + is_enabled(); should_track([fabric_rpc, get_all_security, spawned]) -> - true; + is_enabled(); should_track([fabric_rpc, open_doc, spawned]) -> - true; + is_enabled(); should_track([fabric_rpc, update_docs, spawned]) -> - true; + is_enabled(); should_track([fabric_rpc, open_shard, spawned]) -> - true; + is_enabled(); should_track([mango_cursor, view, all_docs]) -> - true; + is_enabled(); should_track([mango_cursor, view, idx]) -> - true; + is_enabled(); should_track(_Metric) -> %%io:format("SKIPPING METRIC: ~p~n", [Metric]), false. @@ -320,14 +325,14 @@ should_track(_Metric) -> accumulate_delta(Delta) -> %% TODO: switch to creating a batch of updates to invoke a single %% update_counter rather than sequentially invoking it for each field - maps:foreach(fun inc/2, Delta). + is_enabled() andalso maps:foreach(fun inc/2, Delta). update_counter(Field, Count) -> - update_counter(get_pid_ref(), Field, Count). + is_enabled() andalso update_counter(get_pid_ref(), Field, Count). update_counter({_Pid,_Ref}=PidRef, Field, Count) -> - ets:update_counter(?MODULE, PidRef, {Field, Count}, #rctx{pid_ref=PidRef}). + is_enabled() andalso ets:update_counter(?MODULE, PidRef, {Field, Count}, #rctx{pid_ref=PidRef}). active() -> active_int(all). @@ -622,73 +627,98 @@ get_pid_ref() -> create_context() -> - create_context(self()). + is_enabled() andalso create_context(self()). create_context(Pid) -> - Ref = make_ref(), - Rctx = make_record(Pid, Ref), - track(Rctx), - ets:insert(?MODULE, Rctx), - Rctx. + case is_enabled() of + false -> + ok; + true -> + Ref = make_ref(), + Rctx = make_record(Pid, Ref), + track(Rctx), + ets:insert(?MODULE, Rctx), + Rctx + end. %% add type to disnguish coordinator vs rpc_worker create_context(From, {M,F,_A} = MFA, Nonce) -> - PidRef = get_pid_ref(), %% this will instantiate a new PidRef - %% TODO: extract user_ctx and db/shard from - Rctx = #rctx{ - pid_ref = PidRef, - from = From, - mfa = MFA, - type = {worker, M, F}, - nonce = Nonce - }, - track(Rctx), - erlang:put(?DELTA_TZ, Rctx), - true = ets:insert(?MODULE, Rctx), - Rctx. + case is_enabled() of + false -> + ok; + true -> + PidRef = get_pid_ref(), %% this will instantiate a new PidRef + %% TODO: extract user_ctx and db/shard from + Rctx = #rctx{ + pid_ref = PidRef, + from = From, + mfa = MFA, + type = {worker, M, F}, + nonce = Nonce + }, + track(Rctx), + erlang:put(?DELTA_TZ, Rctx), + true = ets:insert(?MODULE, Rctx), + Rctx + end. create_coordinator_context(#httpd{path_parts=Parts} = Req) -> - create_coordinator_context(Req, io_lib:format("~p", [Parts])). + is_enabled() andalso create_coordinator_context(Req, io_lib:format("~p", [Parts])). create_coordinator_context(#httpd{} = Req, Path) -> - #httpd{ - method = Verb, - nonce = Nonce - } = Req, - PidRef = get_pid_ref(), %% this will instantiate a new PidRef - Rctx = #rctx{ - pid_ref = PidRef, - type = {coordinator, Verb, [$/ | Path]}, - nonce = Nonce - }, - track(Rctx), - erlang:put(?DELTA_TZ, Rctx), - true = ets:insert(?MODULE, Rctx), - Rctx. 
+ case is_enabled() of + false -> + ok; + true -> + #httpd{ + method = Verb, + nonce = Nonce + } = Req, + PidRef = get_pid_ref(), %% this will instantiate a new PidRef + Rctx = #rctx{ + pid_ref = PidRef, + type = {coordinator, Verb, [$/ | Path]}, + nonce = Nonce + }, + track(Rctx), + erlang:put(?DELTA_TZ, Rctx), + true = ets:insert(?MODULE, Rctx), + Rctx + end. set_context_dbname(DbName) -> - case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.dbname, DbName}]) of + case is_enabled() of false -> - Stk = try throw(42) catch _:_:Stk0 -> Stk0 end, - io:format("UPDATING DBNAME[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [DbName, get_resource(), Stk, process_info(self(), current_stacktrace)]), - timer:sleep(1000), - erlang:halt(kaboomz); + ok; true -> - true + case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.dbname, DbName}]) of + false -> + Stk = try throw(42) catch _:_:Stk0 -> Stk0 end, + io:format("UPDATING DBNAME[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [DbName, get_resource(), Stk, process_info(self(), current_stacktrace)]), + timer:sleep(1000), + erlang:halt(kaboomz); + true -> + true + end end. set_context_username(null) -> ok; set_context_username(UserName) -> - case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.username, UserName}]) of + case is_enabled() of false -> - Stk = try throw(42) catch _:_:Stk0 -> Stk0 end, - io:format("UPDATING USERNAME[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [UserName, get_resource(), Stk, process_info(self(), current_stacktrace)]), - timer:sleep(1000), - erlang:halt(kaboomz); + ok; true -> - true + case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.username, UserName}]) of + false -> + Stk = try throw(42) catch _:_:Stk0 -> Stk0 end, + io:format("UPDATING USERNAME[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [UserName, get_resource(), Stk, process_info(self(), current_stacktrace)]), + timer:sleep(1000), + erlang:halt(kaboomz); + true -> + true + end end. track(#rctx{}=Rctx) -> @@ -830,7 +860,12 @@ init([]) -> {keypos, #rctx.pid_ref} ]), St = #st{}, - _TimerRef = erlang:send_after(St#st.scan_interval, self(), scan), + case is_enabled() of + false -> + ok; + true -> + _TimerRef = erlang:send_after(St#st.scan_interval, self(), scan) + end, {ok, St}. handle_call(fetch, _from, #st{} = St) -> @@ -852,7 +887,7 @@ handle_info(scan, #st{tracking=AT0} = St0) -> AT = maybe_track(Unmonitored, AT0), _TimerRef = erlang:send_after(St0#st.scan_interval, self(), scan), {noreply, St0#st{tracking=AT}}; -handle_info({'DOWN', MonRef, Type, DPid, Reason}, #st{tracking=AT0} = St0) -> +handle_info({'DOWN', MonRef, _Type, DPid, Reason}, #st{tracking=AT0} = St0) -> %% io:format("CSRT:HI(~p)~n", [{'DOWN', MonRef, Type, DPid, Reason}]), St = case maps:get(MonRef, AT0, undefined) of undefined -> diff --git a/src/rexi/src/rexi.erl b/src/rexi/src/rexi.erl index 0b005dace8d..4bb5e9f6913 100644 --- a/src/rexi/src/rexi.erl +++ b/src/rexi/src/rexi.erl @@ -129,7 +129,8 @@ async_server_call(Server, Caller, Request) -> -spec reply(any()) -> any(). reply(Reply) -> {Caller, Ref} = get(rexi_from), - erlang:send(Caller, {Ref, Reply, get_delta()}). + %%erlang:send(Caller, {Ref, Reply, get_delta()}). + erlang:send(Caller, maybe_add_delta({Ref, Reply})). 
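%% For context, a sketch of the resulting message shapes (assumed from
%% maybe_add_delta/1 below; the delta payload here is illustrative):
%%
%%   CSRT enabled:  {Ref, Reply, {delta, #{ioq_calls => 2, docs_read => 1}}}
%%   CSRT disabled: {Ref, Reply}
%%
%% add_delta/2 only widens tuple shapes it recognizes, so receivers can
%% always match on a trailing {delta, _} element when one is present.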
%% @equiv sync_reply(Reply, 300000) sync_reply(Reply) -> @@ -214,7 +215,8 @@ stream(Msg, Limit, Timeout) -> {ok, Count} -> put(rexi_unacked, Count + 1), {Caller, Ref} = get(rexi_from), - erlang:send(Caller, {Ref, self(), Msg, get_delta()}), + %%erlang:send(Caller, {Ref, self(), Msg, get_delta()}), + erlang:send(Caller, maybe_add_delta({Ref, self(), Msg})), ok catch throw:timeout -> @@ -243,7 +245,7 @@ stream2(Msg, Limit, Timeout) -> {ok, Count} -> put(rexi_unacked, Count + 1), {Caller, Ref} = get(rexi_from), - erlang:send(Caller, {Ref, self(), Msg, get_delta()}), + erlang:send(Caller, maybe_add_delta({Ref, self(), Msg})), ok catch throw:timeout -> @@ -281,7 +283,7 @@ ping() -> %% filtered queries will be silent on usage until they finally return %% a row or no results. This delay is proportional to the database size, %% so instead we make sure ping/0 keeps live stats flowing. - erlang:send(Caller, {rexi, '$rexi_ping'}, get_delta()). + erlang:send(Caller, maybe_add_delta({rexi, '$rexi_ping'})). %% internal functions %% @@ -337,3 +339,20 @@ drain_acks(Count) -> get_delta() -> {delta, couch_stats_resource_tracker:make_delta()}. + +maybe_add_delta(T) -> + case couch_stats_resource_tracker:is_enabled() of + false -> + T; + true -> + add_delta(T, get_delta()) + end. + +add_delta({A}, Delta) -> {A, Delta}; +add_delta({A, B}, Delta) -> {A, B, Delta}; +add_delta({A, B, C}, Delta) -> {A, B, C, Delta}; +add_delta({A, B, C, D}, Delta) -> {A, B, C, D, Delta}; +add_delta({A, B, C, D, E}, Delta) -> {A, B, C, D, E, Delta}; +add_delta({A, B, C, D, E, F}, Delta) -> {A, B, C, D, E, F, Delta}; +add_delta({A, B, C, D, E, F, G}, Delta) -> {A, B, C, D, E, F, G, Delta}; +add_delta(T, _Delta) -> T. diff --git a/src/rexi/src/rexi_utils.erl b/src/rexi/src/rexi_utils.erl index f18ca179fdf..4924bd11e87 100644 --- a/src/rexi/src/rexi_utils.erl +++ b/src/rexi/src/rexi_utils.erl @@ -67,6 +67,8 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> receive {timeout, TimeoutRef} -> {timeout, Acc0}; + {rexi, '$rexi_ping'} -> + {ok, Acc0}; {rexi, '$rexi_ping', {delta, Delta}} -> couch_stats_resource_tracker:accumulate_delta(Delta), {ok, Acc0}; @@ -120,9 +122,16 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> %% {#Ref<0.1961702128.3491758082.26965>, {rexi_EXIT,{{query_parse_error,<<78,111,32,114,111,119,115,32,99,97,110,32,109,97 {Ref, Msg} -> %% TODO: add stack trace to log entry - couch_log:debug("rexi_utils:process_message no delta: {Ref, Msg} => {~p, ~p}~n", [Ref, Msg]), - timer:sleep(100), - erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, Msg} => {~w, ~w}~n", [Ref, Msg])))); + %% couch_log:debug("rexi_utils:process_message no delta: {Ref, Msg} => {~p, ~p}~n", [Ref, Msg]), + %% timer:sleep(100), + %% erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, Msg} => {~w, ~w}~n", [Ref, Msg])))); + case lists:keyfind(Ref, Keypos, RefList) of + false -> + % this was some non-matching message which we will ignore + {ok, Acc0}; + Worker -> + Fun(Msg, Worker, Acc0) + end; {Ref, From, rexi_STREAM_INIT = Msg} -> case lists:keyfind(Ref, Keypos, RefList) of false -> @@ -132,9 +141,15 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> end; {Ref, From, Msg} -> %% TODO: add stack trace to log entry - couch_log:debug("rexi_utils:process_message no delta: {Ref, From, Msg} => {~p, ~p, ~p}~n", [Ref, From, Msg]), - timer:sleep(100), - 
erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, From, Msg} => {~w, ~w, ~w}~n", [Ref, From, Msg])))); + %% couch_log:debug("rexi_utils:process_message no delta: {Ref, From, Msg} => {~p, ~p, ~p}~n", [Ref, From, Msg]), + %% timer:sleep(100), + %% erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, From, Msg} => {~w, ~w, ~w}~n", [Ref, From, Msg])))); + case lists:keyfind(Ref, Keypos, RefList) of + false -> + {ok, Acc0}; + Worker -> + Fun(Msg, {Worker, From}, Acc0) + end; {rexi_DOWN, _, _, _} = Msg -> Fun(Msg, nil, Acc0) after PerMsgTO -> From eb20b5d83b2c9232add42d60b0368b8556af2d30 Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Mon, 20 Nov 2023 14:49:48 -0800 Subject: [PATCH 15/29] Remove debug exit clause --- src/couch_stats/src/couch_stats_resource_tracker.erl | 2 +- src/fabric/src/fabric_util.erl | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl index fb0d39cd3ed..22610c72254 100644 --- a/src/couch_stats/src/couch_stats_resource_tracker.erl +++ b/src/couch_stats/src/couch_stats_resource_tracker.erl @@ -144,7 +144,7 @@ %% Metadata started_at = os:timestamp(), updated_at = os:timestamp(), - exited_at, + exited_at, %% TODO: do we need a final exit time and additional update times afterwards? pid_ref, mon_ref, mfa, diff --git a/src/fabric/src/fabric_util.erl b/src/fabric/src/fabric_util.erl index 2bde50a29d2..e4f5d793f4b 100644 --- a/src/fabric/src/fabric_util.erl +++ b/src/fabric/src/fabric_util.erl @@ -137,6 +137,8 @@ get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor) -> Ref = rexi:cast(Node, self(), MFA, [sync]), try receive + {Ref, {ok, Db}} -> + {ok, Db}; {Ref, {ok, Db}, {delta, Delta}} -> couch_stats_resource_tracker:accumulate_delta(Delta), {ok, Db}; @@ -146,9 +148,7 @@ get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor) -> throw(Error); {Ref, Reason} -> couch_log:debug("Failed to open shard ~p because: ~p", [Name, Reason]), - get_shard(Rest, Opts, Timeout, Factor); - Other -> - erlang:error(Other) + get_shard(Rest, Opts, Timeout, Factor) after Timeout -> couch_log:debug("Failed to open shard ~p after: ~p", [Name, Timeout]), get_shard(Rest, Opts, Factor * Timeout, Factor) From f4a712c1b60608f8621593fd36a746ceba833b08 Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Mon, 20 Nov 2023 16:33:37 -0800 Subject: [PATCH 16/29] Declare missing metrics --- src/fabric/priv/stats_descriptions.cfg | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/fabric/priv/stats_descriptions.cfg b/src/fabric/priv/stats_descriptions.cfg index 528942be86e..9ab054bf038 100644 --- a/src/fabric/priv/stats_descriptions.cfg +++ b/src/fabric/priv/stats_descriptions.cfg @@ -44,6 +44,18 @@ {type, counter}, {desc, <<"number of fabric_rpc worker all_docs spawns">>} ]}. +{[fabric_rpc, update_docs, spawned], [ + {type, counter}, + {desc, <<"number of fabric_rpc worker update_docs spawns">>} +]}. +{[fabric_rpc, map_view, spawned], [ + {type, counter}, + {desc, <<"number of fabric_rpc worker map_view spawns">>} +]}. +{[fabric_rpc, reduce_view, spawned], [ + {type, counter}, + {desc, <<"number of fabric_rpc worker reduce_view spawns">>} +]}. 
{[fabric_rpc, open_shard, spawned], [ {type, counter}, {desc, <<"number of fabric_rpc worker open_shard spawns">>} From c758cb777a6b36c8b71f7633f019e85e6577790c Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Mon, 20 Nov 2023 16:34:11 -0800 Subject: [PATCH 17/29] WIP: relax record constraints... --- src/couch_stats/src/couch_stats_resource_tracker.erl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl index 22610c72254..59d2a9de301 100644 --- a/src/couch_stats/src/couch_stats_resource_tracker.erl +++ b/src/couch_stats/src/couch_stats_resource_tracker.erl @@ -903,7 +903,8 @@ handle_info({'DOWN', MonRef, _Type, DPid, Reason}, #st{tracking=AT0} = St0) -> %% TODO: Assert Pid matches Object %% update process state in live table %% TODO: decide whether we want the true match to crash this process on failure - true = ets:update_element(?MODULE, PidRef, + %% true = ets:update_element(?MODULE, PidRef, + ets:update_element(?MODULE, PidRef, [{#rctx.state, {down, Reason}}, {#rctx.updated_at, os:timestamp()}]), log_process_lifetime_report(PidRef), %% Delay eviction to allow human visibility on short lived pids @@ -935,7 +936,8 @@ maybe_track([{Pid,_Ref} = PidRef | PidRefs], AT) -> false -> %% setup new monitor and double bookkeep refs Mon = erlang:monitor(process, Pid), %% TODO: decide whether we want the true match to crash this process on failure - true = ets:update_element(?MODULE, PidRef, [{#rctx.mon_ref, Mon}]), + %%true = ets:update_element(?MODULE, PidRef, [{#rctx.mon_ref, Mon}]), + ets:update_element(?MODULE, PidRef, [{#rctx.mon_ref, Mon}]), maps:put(Mon, PidRef, maps:put(PidRef, Mon, AT)) end, maybe_track(PidRefs, AT1). From 4f00910a88e37e8a2b977814c0ecfe19f7f788af Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Thu, 30 Nov 2023 12:48:08 -0800 Subject: [PATCH 18/29] Use Default in config:get/3 on ets table error --- src/config/src/config.erl | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/config/src/config.erl b/src/config/src/config.erl index cf8c448355f..b8dabb7e264 100644 --- a/src/config/src/config.erl +++ b/src/config/src/config.erl @@ -158,15 +158,20 @@ get(Section, Key) -> get(Section, Key, Default) when is_binary(Section) and is_binary(Key) -> ?MODULE:get(binary_to_list(Section), binary_to_list(Key), Default); get(Section, Key, Default) when is_list(Section), is_list(Key) -> - case ets:lookup(?MODULE, {Section, Key}) of - [] when Default == undefined -> Default; - [] when is_boolean(Default) -> Default; - [] when is_float(Default) -> Default; - [] when is_integer(Default) -> Default; - [] when is_list(Default) -> Default; - [] when is_atom(Default) -> Default; - [] -> error(badarg); - [{_, Match}] -> Match + try + case ets:lookup(?MODULE, {Section, Key}) of + [] when Default == undefined -> Default; + [] when is_boolean(Default) -> Default; + [] when is_float(Default) -> Default; + [] when is_integer(Default) -> Default; + [] when is_list(Default) -> Default; + [] when is_atom(Default) -> Default; + [] -> error(badarg); + [{_, Match}] -> Match + end + catch + error:badarg -> + Default end. 
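%% A small illustration, assuming the try/catch above (hypothetical call
%% site): once the config ets table is gone, e.g. during shutdown or in a
%% test run without the config app, the lookup raises badarg and get/3
%% now hands back the supplied Default instead of crashing the caller:
%%
%%   undefined = ets:info(config),
%%   "50" = config:get("couchdb", "max_document_size", "50").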
set(Section, Key, Value) -> From 887a7f2cea1ecab780895f4420deda08a3b3cf3c Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Thu, 30 Nov 2023 15:03:24 -0800 Subject: [PATCH 19/29] Switch to delta extraction from messages --- src/rexi/src/rexi_utils.erl | 77 ++++++++++--------------------------- 1 file changed, 21 insertions(+), 56 deletions(-) diff --git a/src/rexi/src/rexi_utils.erl b/src/rexi/src/rexi_utils.erl index 4924bd11e87..4d81475dfeb 100644 --- a/src/rexi/src/rexi_utils.erl +++ b/src/rexi/src/rexi_utils.erl @@ -65,13 +65,18 @@ process_mailbox(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> receive + Msg -> + process_raw_message(Msg, RefList, Keypos, Fun, Acc0, TimeoutRef) + after PerMsgTO -> + {timeout, Acc0} + end. + +process_raw_message(Payload0, RefList, Keypos, Fun, Acc0, TimeoutRef) -> + {Payload, Delta} = extract_delta(Payload0), + couch_stats_resource_tracker:accumulate_delta(Delta), + case Payload of {timeout, TimeoutRef} -> {timeout, Acc0}; - {rexi, '$rexi_ping'} -> - {ok, Acc0}; - {rexi, '$rexi_ping', {delta, Delta}} -> - couch_stats_resource_tracker:accumulate_delta(Delta), - {ok, Acc0}; {rexi, Ref, Msg} -> case lists:keyfind(Ref, Keypos, RefList) of false -> @@ -86,45 +91,9 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> Worker -> Fun(Msg, {Worker, From}, Acc0) end; - {Ref, Msg, {delta, Delta}} -> - couch_stats_resource_tracker:accumulate_delta(Delta), - case lists:keyfind(Ref, Keypos, RefList) of - false -> - % this was some non-matching message which we will ignore - {ok, Acc0}; - Worker -> - Fun(Msg, Worker, Acc0) - end; - {Ref, From, Msg, {delta, Delta}} -> - couch_stats_resource_tracker:accumulate_delta(Delta), - case lists:keyfind(Ref, Keypos, RefList) of - false -> - {ok, Acc0}; - Worker -> - Fun(Msg, {Worker, From}, Acc0) - end; - {Ref, get_state = Msg} -> - case lists:keyfind(Ref, Keypos, RefList) of - false -> - % this was some non-matching message which we will ignore - {ok, Acc0}; - Worker -> - Fun(Msg, Worker, Acc0) - end; - {Ref, done = Msg} -> - case lists:keyfind(Ref, Keypos, RefList) of - false -> - % this was some non-matching message which we will ignore - {ok, Acc0}; - Worker -> - Fun(Msg, Worker, Acc0) - end; - %% {#Ref<0.1961702128.3491758082.26965>, {rexi_EXIT,{{query_parse_error,<<78,111,32,114,111,119,115,32,99,97,110,32,109,97 + {rexi, '$rexi_ping'} -> + {ok, Acc0}; {Ref, Msg} -> - %% TODO: add stack trace to log entry - %% couch_log:debug("rexi_utils:process_message no delta: {Ref, Msg} => {~p, ~p}~n", [Ref, Msg]), - %% timer:sleep(100), - %% erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, Msg} => {~w, ~w}~n", [Ref, Msg])))); case lists:keyfind(Ref, Keypos, RefList) of false -> % this was some non-matching message which we will ignore @@ -132,18 +101,7 @@ process_message(RefList, Keypos, Fun, Acc0, TimeoutRef, PerMsgTO) -> Worker -> Fun(Msg, Worker, Acc0) end; - {Ref, From, rexi_STREAM_INIT = Msg} -> - case lists:keyfind(Ref, Keypos, RefList) of - false -> - {ok, Acc0}; - Worker -> - Fun(Msg, {Worker, From}, Acc0) - end; {Ref, From, Msg} -> - %% TODO: add stack trace to log entry - %% couch_log:debug("rexi_utils:process_message no delta: {Ref, From, Msg} => {~p, ~p, ~p}~n", [Ref, From, Msg]), - %% timer:sleep(100), - %% erlang:halt(binary_to_list(iolist_to_binary(io_lib:format("{enodelta} rexi_utils:process_message no delta: {Ref, From, Msg} => {~w, ~w, ~w}~n", [Ref, From, 
Msg]))));
+            case lists:keyfind(Ref, Keypos, RefList) of
+                false ->
+                    {ok, Acc0};
+                Worker ->
+                    Fun(Msg, {Worker, From}, Acc0)
+            end;
        {rexi_DOWN, _, _, _} = Msg ->
            Fun(Msg, nil, Acc0)
-    after PerMsgTO ->
-        {timeout, Acc0}
    end.
+
+extract_delta({A, {delta, Delta}}) -> {{A}, Delta};
+extract_delta({A, B, {delta, Delta}}) -> {{A, B}, Delta};
+extract_delta({A, B, C, {delta, Delta}}) -> {{A, B, C}, Delta};
+extract_delta({A, B, C, D, {delta, Delta}}) -> {{A, B, C, D}, Delta};
+extract_delta({A, B, C, D, E, {delta, Delta}}) -> {{A, B, C, D, E}, Delta};
+extract_delta({A, B, C, D, E, F, {delta, Delta}}) -> {{A, B, C, D, E, F}, Delta};
+extract_delta({A, B, C, D, E, F, G, {delta, Delta}}) -> {{A, B, C, D, E, F, G}, Delta};
+extract_delta(T) -> {T, undefined}.

From 1a1a5848c2bf566f54316ebe478901af54d427e2 Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Thu, 30 Nov 2023 15:08:58 -0800
Subject: [PATCH 20/29] Revert "Use Default in config:get/3 on ets table error"

This reverts commit 4f00910a88e37e8a2b977814c0ecfe19f7f788af.
---
 src/config/src/config.erl | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/src/config/src/config.erl b/src/config/src/config.erl
index b8dabb7e264..cf8c448355f 100644
--- a/src/config/src/config.erl
+++ b/src/config/src/config.erl
@@ -158,20 +158,15 @@ get(Section, Key) ->
 get(Section, Key, Default) when is_binary(Section) and is_binary(Key) ->
     ?MODULE:get(binary_to_list(Section), binary_to_list(Key), Default);
 get(Section, Key, Default) when is_list(Section), is_list(Key) ->
-    try
-        case ets:lookup(?MODULE, {Section, Key}) of
-            [] when Default == undefined -> Default;
-            [] when is_boolean(Default) -> Default;
-            [] when is_float(Default) -> Default;
-            [] when is_integer(Default) -> Default;
-            [] when is_list(Default) -> Default;
-            [] when is_atom(Default) -> Default;
-            [] -> error(badarg);
-            [{_, Match}] -> Match
-        end
-    catch
-        error:badarg ->
-            Default
+    case ets:lookup(?MODULE, {Section, Key}) of
+        [] when Default == undefined -> Default;
+        [] when is_boolean(Default) -> Default;
+        [] when is_float(Default) -> Default;
+        [] when is_integer(Default) -> Default;
+        [] when is_list(Default) -> Default;
+        [] when is_atom(Default) -> Default;
+        [] -> error(badarg);
+        [{_, Match}] -> Match
     end.
 
 set(Section, Key, Value) ->

From db1f8748918df5367b96d3ba85cd2ccfc51b592f Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Fri, 8 Dec 2023 16:19:57 -0800
Subject: [PATCH 21/29] Flush chttpd_db monitor refs on demonitor

This ensures that the coordinator process flushes 'DOWN' messages when
demonitoring the attachment refs in chttpd_db. The problem here is that
it's possible for a 'DOWN' message for the monitor ref to be delivered
but never consumed by a receive, causing it to stick around in the
coordinator message queue while the next http request is handled.

The pending message will not become apparent until the next fabric call
is invoked, as fabric expects to have full access to all messages in the
calling process; that expectation is violated by the pending message and
causes a case clause crash in the fabric receive message callbacks.

I noticed this during eunit runs with stubbed attachment handles that
generate an immediate noproc message on the monitor call.
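As a standalone sketch of the race (a hypothetical demo function, not
part of this change): monitoring a process that is already dead queues a
'DOWN' immediately, and only a flushing demonitor cleans it back out of
the mailbox:

    demo() ->
        Pid = spawn(fun() -> ok end),           %% may exit before we monitor
        Ref = erlang:monitor(process, Pid),     %% a 'DOWN' can already be queued
        true = erlang:demonitor(Ref, [flush]),  %% [flush] drops any queued 'DOWN';
                                                %% none can arrive after this returns
        receive
            {'DOWN', Ref, process, _, _} -> error(stale_down)
        after 0 ->
            ok                                  %% mailbox is clean for the next request
        end.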
Normal operations should not result in an immediate noproc result on
monitoring the attachment process; however, any failure that causes the
attachment process to fail between acquisition of the refs and the
demonitor calls will induce this bug, causing the next http request
handled by the particular chttpd coordinator pool process to fail on
whatever fabric call it makes next.
---
 src/chttpd/src/chttpd_db.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chttpd/src/chttpd_db.erl b/src/chttpd/src/chttpd_db.erl
index 7538c452196..8c35910a686 100644
--- a/src/chttpd/src/chttpd_db.erl
+++ b/src/chttpd/src/chttpd_db.erl
@@ -2249,7 +2249,7 @@ monitor_attachments(Att) ->
     monitor_attachments([Att]).
 
 demonitor_refs(Refs) when is_list(Refs) ->
-    [demonitor(Ref) || Ref <- Refs].
+    [demonitor(Ref, [flush]) || Ref <- Refs].
 
 % Return attachments which are not stubs
 non_stubbed_attachments(Atts) when is_list(Atts) ->

From 50495bf37551189f43c77e388ad5d3ea5a0d5377 Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Mon, 18 Dec 2023 14:06:51 -0800
Subject: [PATCH 22/29] TMP Hack around issue #4909

---
 src/couch/src/couch_util.erl | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/src/couch/src/couch_util.erl b/src/couch/src/couch_util.erl
index 739df28e59d..0bc9a94bae7 100644
--- a/src/couch/src/couch_util.erl
+++ b/src/couch/src/couch_util.erl
@@ -46,6 +46,7 @@
 -export([verify_hash_names/2]).
 -export([get_config_hash_algorithms/0]).
 -export([remove_sensitive_data/1]).
+-export([clear_pdict/0, clear_pdict/1]).
 
 -include_lib("couch/include/couch_db.hrl").
 
@@ -870,3 +871,31 @@ remove_sensitive_data(KVList) ->
     KVList1 = lists:keyreplace(<<"password">>, 1, KVList, {<<"password">>, <<"****">>}),
     % some KVList entries are atoms, so test fo this too
     lists:keyreplace(password, 1, KVList1, {password, <<"****">>}).
+
+-spec clear_pdict() -> ok.
+clear_pdict() ->
+    clear_pdict(erlang:get()).
+
+%% Exclude mochiweb markers, otherwise just use erlang:erase/0
+-spec clear_pdict(list()) -> ok.
+clear_pdict([]) ->
+    ok;
+clear_pdict([{mochiweb_request_body, _V} | Rest]) ->
+    clear_pdict(Rest);
+clear_pdict([{mochiweb_request_body_length, _V} | Rest]) ->
+    clear_pdict(Rest);
+clear_pdict([{mochiweb_request_cookie, _V} | Rest]) ->
+    clear_pdict(Rest);
+clear_pdict([{mochiweb_request_force_close, _V} | Rest]) ->
+    clear_pdict(Rest);
+clear_pdict([{mochiweb_request_path, _V} | Rest]) ->
+    clear_pdict(Rest);
+clear_pdict([{mochiweb_request_post, _V} | Rest]) ->
+    clear_pdict(Rest);
+clear_pdict([{mochiweb_request_qs, _V} | Rest]) ->
+    clear_pdict(Rest);
+clear_pdict([{mochiweb_request_recv, _V} | Rest]) ->
+    clear_pdict(Rest);
+clear_pdict([{Key, _V} | Rest]) ->
+    erlang:erase(Key),
+    clear_pdict(Rest).

From 957877347995e18175a18b3ed4ca15e193602430 Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Mon, 18 Dec 2023 15:10:45 -0800
Subject: [PATCH 23/29] Avoid mem3_rpc:rexi_call selective receive

We need to potentially extract the usage delta from the incoming RPC
message. Rather than pattern match on all possible message formats that
could potentially include usage deltas, we instead utilize
rexi_utils:extract_delta, which matches against tuples ending in
`{delta, Delta}` and separates that out from the underlying message.

The subtlety here is that receiving the message to extract the delta
changes the behavior: this was previously doing a selective receive
keyed off of the Ref, and then ignoring any other messages that arrived.
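To make that concrete, a sketch of the two receive disciplines
(hypothetical helpers; extract_delta/1 is the rexi_utils function this
commit leans on): the selective form leaves any non-matching message
queued for the next unsuspecting caller, while the flushing form consumes
whatever arrives, peels off a delta when present, and skips past strays:

    %% selective: unmatched messages stay in the mailbox
    selective_wait(Ref, Timeout) ->
        receive
            {Ref, Reply} -> Reply
        after Timeout -> erlang:error(timeout)
        end.

    %% flushing: receive anything, extract the delta, loop past strays
    flushing_wait(Ref, Timeout) ->
        receive
            Msg0 ->
                {Msg, Delta} = rexi_utils:extract_delta(Msg0),
                Delta =:= undefined orelse
                    couch_stats_resource_tracker:accumulate_delta(Delta),
                case Msg of
                    {Ref, Reply} -> Reply;
                    _Unexpected -> flushing_wait(Ref, Timeout)
                end
        after Timeout -> erlang:error(timeout)
        end.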
I don't know if the selective receive was intended, but I don't think
it's appropriate to leave unexpected messages floating around,
especially given things like issue #4909. Instead of utilizing a
selective receive, this switches to extracting the message and delta as
we need to do anyway; in the event unexpected messages are found, they
are logged and skipped.

The selective receive was also masking the lack of an unlink on the
linked rexi_mon pid, addressed in fix #4906. I've also noticed some rpc
responses arriving late, but I haven't tracked that down yet, so let's
log when it does happen.
---
 src/couch/include/couch_db.hrl |  1 +
 src/mem3/src/mem3_rpc.erl      | 34 ++++++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/src/couch/include/couch_db.hrl b/src/couch/include/couch_db.hrl
index 9c1df21b690..d8dbf38ba8d 100644
--- a/src/couch/include/couch_db.hrl
+++ b/src/couch/include/couch_db.hrl
@@ -127,6 +127,7 @@
     meta = []
 }).
 
+-define(LOG_UNEXPECTED_MSG(Msg), couch_log:warning("[~p:~p:~p/~p]{~p[~p]} Unexpected message: ~w", [?MODULE, ?LINE, ?FUNCTION_NAME, ?FUNCTION_ARITY, self(), element(2, process_info(self(), message_queue_len)), Msg])).
 
 -record(user_ctx, {
     name=null,
diff --git a/src/mem3/src/mem3_rpc.erl b/src/mem3/src/mem3_rpc.erl
index 60c24e1d40e..e4006c62cbe 100644
--- a/src/mem3/src/mem3_rpc.erl
+++ b/src/mem3/src/mem3_rpc.erl
@@ -378,20 +378,34 @@ rexi_call(Node, MFA, Timeout) ->
     Mon = rexi_monitor:start([rexi_utils:server_pid(Node)]),
     Ref = rexi:cast(Node, self(), MFA, [sync]),
     try
-        receive
-            {Ref, {ok, Reply}} ->
-                Reply;
-            {Ref, Error} ->
-                erlang:error(Error);
-            {rexi_DOWN, Mon, _, Reason} ->
-                erlang:error({rexi_DOWN, {Node, Reason}})
-        after Timeout ->
-            erlang:error(timeout)
-        end
+        wait_message(Node, Ref, Mon, Timeout)
     after
         rexi_monitor:stop(Mon)
     end.
 
+wait_message(Node, Ref, Mon, Timeout) ->
+    receive
+        Msg ->
+            process_raw_message(Msg, Node, Ref, Mon, Timeout)
+    after Timeout ->
+        erlang:error(timeout)
+    end.
+
+process_raw_message(Msg0, Node, Ref, Mon, Timeout) ->
+    {Msg, Delta} = rexi_utils:extract_delta(Msg0),
+    couch_stats_resource_tracker:accumulate_delta(Delta),
+    case Msg of
+        {Ref, {ok, Reply}} ->
+            Reply;
+        {Ref, Error} ->
+            erlang:error(Error);
+        {rexi_DOWN, Mon, _, Reason} ->
+            erlang:error({rexi_DOWN, {Node, Reason}});
+        Other ->
+            ?LOG_UNEXPECTED_MSG(Other),
+            wait_message(Node, Ref, Mon, Timeout)
+    end.
+
 get_or_create_db(DbName, Options) ->
     mem3_util:get_or_create_db_int(DbName, Options).

From f77583aafd611cabe7eae0c7370140f877fa8c7d Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Mon, 18 Dec 2023 16:13:21 -0800
Subject: [PATCH 24/29] Pass tests and toggle csrt

---
 src/chttpd/src/chttpd.erl                     |  2 ++
 .../src/couch_stats_resource_tracker.erl      | 33 ++++++++++++++-----
 src/rexi/src/rexi_monitor.erl                 |  1 +
 src/rexi/src/rexi_server.erl                  |  8 ++++-
 src/rexi/src/rexi_utils.erl                   |  2 ++
 5 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/src/chttpd/src/chttpd.erl b/src/chttpd/src/chttpd.erl
index 78b41a7106a..16cda1a5827 100644
--- a/src/chttpd/src/chttpd.erl
+++ b/src/chttpd/src/chttpd.erl
@@ -221,6 +221,7 @@ stop() ->
     mochiweb_http:stop(?MODULE).
 
 handle_request(MochiReq0) ->
+    couch_util:clear_pdict(), %% Make sure we start clean, every time
     erlang:put(?REWRITE_COUNT, 0),
     MochiReq = couch_httpd_vhost:dispatch_host(MochiReq0),
     handle_request_int(MochiReq).
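%% A quick sketch of the intended effect, assuming clear_pdict/0 from the
%% earlier commit (the keys here are illustrative): stale per-request
%% state is erased while the mochiweb_* markers that clear_pdict/1
%% deliberately skips are preserved:
%%
%%   erlang:put(some_stale_key, 1),
%%   erlang:put(mochiweb_request_qs, []),
%%   ok = couch_util:clear_pdict(),
%%   undefined = erlang:get(some_stale_key),
%%   [] = erlang:get(mochiweb_request_qs).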
@@ -377,6 +378,7 @@ after_request(HttpReq, HttpResp0) -> HttpResp2 = update_stats(HttpReq, HttpResp1), chttpd_stats:report(HttpReq, HttpResp2), maybe_log(HttpReq, HttpResp2), + %%couch_stats_resource_tracker:close_context(), HttpResp2. process_request(#httpd{mochi_req = MochiReq} = HttpReq) -> diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl index 59d2a9de301..2f3a34f2f33 100644 --- a/src/couch_stats/src/couch_stats_resource_tracker.erl +++ b/src/couch_stats/src/couch_stats_resource_tracker.erl @@ -60,6 +60,8 @@ sorted_by/2, sorted_by/3, + find_by_pid/1, + unsafe_foldl/3, term_to_flat_json/1 @@ -322,17 +324,23 @@ should_track(_Metric) -> %%io:format("SKIPPING METRIC: ~p~n", [Metric]), false. -accumulate_delta(Delta) -> +accumulate_delta(Delta) when is_map(Delta) -> %% TODO: switch to creating a batch of updates to invoke a single %% update_counter rather than sequentially invoking it for each field - is_enabled() andalso maps:foreach(fun inc/2, Delta). + is_enabled() andalso maps:foreach(fun inc/2, Delta); +accumulate_delta(undefined) -> + ok; +accumulate_delta(Other) -> + io:format("CSRT:ACC_DELTA UNKNOWN DELTA: ~p~n", [Other]). + update_counter(Field, Count) -> is_enabled() andalso update_counter(get_pid_ref(), Field, Count). update_counter({_Pid,_Ref}=PidRef, Field, Count) -> - is_enabled() andalso ets:update_counter(?MODULE, PidRef, {Field, Count}, #rctx{pid_ref=PidRef}). + %% TODO: mem3 crashes without catch, why do we lose the stats table? + is_enabled() andalso catch ets:update_counter(?MODULE, PidRef, {Field, Count}, #rctx{pid_ref=PidRef}). active() -> active_int(all). @@ -638,10 +646,15 @@ create_context(Pid) -> Ref = make_ref(), Rctx = make_record(Pid, Ref), track(Rctx), - ets:insert(?MODULE, Rctx), + create_resource(Rctx), Rctx end. + +create_resource(#rctx{} = Rctx) -> + %% true = ets:insert(?MODULE, Rctx). + catch ets:insert(?MODULE, Rctx). + %% add type to disnguish coordinator vs rpc_worker create_context(From, {M,F,_A} = MFA, Nonce) -> case is_enabled() of @@ -659,7 +672,7 @@ create_context(From, {M,F,_A} = MFA, Nonce) -> }, track(Rctx), erlang:put(?DELTA_TZ, Rctx), - true = ets:insert(?MODULE, Rctx), + create_resource(Rctx), Rctx end. @@ -683,7 +696,7 @@ create_coordinator_context(#httpd{} = Req, Path) -> }, track(Rctx), erlang:put(?DELTA_TZ, Rctx), - true = ets:insert(?MODULE, Rctx), + create_resource(Rctx), Rctx end. @@ -831,7 +844,7 @@ get_resource() -> get_resource(get_pid_ref()). get_resource(PidRef) -> - case ets:lookup(?MODULE, PidRef) of + catch case ets:lookup(?MODULE, PidRef) of [#rctx{}=TP] -> TP; [] -> @@ -847,6 +860,10 @@ find_unmonitored() -> [PR || #rctx{pid_ref=PR} <- ets:match_object(?MODULE, #rctx{mon_ref=undefined, _ = '_'})]. +find_by_pid(Pid) -> + [R || #rctx{} = R <- ets:match_object(?MODULE, #rctx{pid_ref={Pid, '_'}, _ = '_'})]. + + start_link() -> gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). @@ -947,7 +964,7 @@ log_process_lifetime_report(PidRef) -> #rctx{} = Rctx = get_resource(PidRef), %% TODO: catch error out of here, report crashes on depth>1 json %%io:format("CSRT RCTX: ~p~n", [to_flat_json(Rctx)]), - couch_log:report("csrt-pid-usage-lifetime", to_flat_json(Rctx)). + is_enabled() andalso couch_log:report("csrt-pid-usage-lifetime", to_flat_json(Rctx)). 
%% Reimplementation of: https://github.com/erlang/otp/blob/b2ee4fc9a0b81a139dad2033e9b2bfc178146886/lib/stdlib/src/ets.erl#L633-L658 diff --git a/src/rexi/src/rexi_monitor.erl b/src/rexi/src/rexi_monitor.erl index 7fe66db71d4..72f0985df80 100644 --- a/src/rexi/src/rexi_monitor.erl +++ b/src/rexi/src/rexi_monitor.erl @@ -35,6 +35,7 @@ start(Procs) -> %% messages from our mailbox. -spec stop(pid()) -> ok. stop(MonitoringPid) -> + unlink(MonitoringPid), MonitoringPid ! {self(), shutdown}, flush_down_messages(). diff --git a/src/rexi/src/rexi_server.erl b/src/rexi/src/rexi_server.erl index 7bfe264844e..b8aa92b5d82 100644 --- a/src/rexi/src/rexi_server.erl +++ b/src/rexi/src/rexi_server.erl @@ -206,7 +206,13 @@ find_worker(Ref, Tab) -> end. notify_caller({Caller, Ref}, Reason, Delta) -> - rexi_utils:send(Caller, {Ref, {rexi_EXIT, Reason}, {delta, Delta}}). + Msg = case couch_stats_resource_tracker:is_enabled() of + true -> + {Ref, {rexi_EXIT, Reason}, {delta, Delta}}; + false -> + {Ref, {rexi_EXIT, Reason}} + end, + rexi_utils:send(Caller, Msg). kill_worker(FromRef, #st{clients = Clients} = St) -> case find_worker(FromRef, Clients) of diff --git a/src/rexi/src/rexi_utils.erl b/src/rexi/src/rexi_utils.erl index 4d81475dfeb..56c5a69b87a 100644 --- a/src/rexi/src/rexi_utils.erl +++ b/src/rexi/src/rexi_utils.erl @@ -14,6 +14,8 @@ -export([server_id/1, server_pid/1, send/2, recv/6]). +-export([extract_delta/1]). + %% @doc Return a rexi_server id for the given node. server_id(Node) -> case config:get_boolean("rexi", "server_per_node", true) of From ac1751026f2837b555a64da52c645965f2ab00d3 Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Mon, 18 Dec 2023 16:13:44 -0800 Subject: [PATCH 25/29] Handle delta in fabric rpc tests --- src/fabric/test/eunit/fabric_rpc_purge_tests.erl | 2 ++ src/fabric/test/eunit/fabric_rpc_tests.erl | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/fabric/test/eunit/fabric_rpc_purge_tests.erl b/src/fabric/test/eunit/fabric_rpc_purge_tests.erl index 07e6b1d4220..c7a36fbe342 100644 --- a/src/fabric/test/eunit/fabric_rpc_purge_tests.erl +++ b/src/fabric/test/eunit/fabric_rpc_purge_tests.erl @@ -263,6 +263,8 @@ rpc_update_doc(DbName, Doc, Opts) -> Reply = test_util:wait(fun() -> receive {Ref, Reply} -> + Reply; + {Ref, Reply, {delta, _}} -> Reply after 0 -> wait diff --git a/src/fabric/test/eunit/fabric_rpc_tests.erl b/src/fabric/test/eunit/fabric_rpc_tests.erl index 16bb66badac..c402affbab0 100644 --- a/src/fabric/test/eunit/fabric_rpc_tests.erl +++ b/src/fabric/test/eunit/fabric_rpc_tests.erl @@ -101,7 +101,16 @@ t_no_config_db_create_fails_for_shard_rpc(DbName) -> receive Resp0 -> Resp0 end, - ?assertMatch({Ref, {'rexi_EXIT', {{error, missing_target}, _}}}, Resp). + case couch_stats_resource_tracker:is_enabled() of + true -> + ?assertMatch( %% allow for {Ref, {rexi_EXIT, error}, {delta, D}} + {Ref, {'rexi_EXIT', {{error, missing_target}, _}}, _}, + Resp); + false -> + ?assertMatch( + {Ref, {'rexi_EXIT', {{error, missing_target}, _}}}, + Resp) + end. 
t_db_create_with_config(DbName) -> MDbName = mem3:dbname(DbName), From c500f0f1f5af7b6da598ca186e4bc1f084929e3e Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Tue, 19 Dec 2023 14:54:10 -0800 Subject: [PATCH 26/29] Conditionally log reports --- .../src/couch_stats_resource_tracker.erl | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl index 2f3a34f2f33..1cf7b16685b 100644 --- a/src/couch_stats/src/couch_stats_resource_tracker.erl +++ b/src/couch_stats/src/couch_stats_resource_tracker.erl @@ -964,7 +964,34 @@ log_process_lifetime_report(PidRef) -> #rctx{} = Rctx = get_resource(PidRef), %% TODO: catch error out of here, report crashes on depth>1 json %%io:format("CSRT RCTX: ~p~n", [to_flat_json(Rctx)]), - is_enabled() andalso couch_log:report("csrt-pid-usage-lifetime", to_flat_json(Rctx)). + case is_enabled() andalso should_log(Rctx) of + true -> + couch_log:report("csrt-pid-usage-lifetime", to_flat_json(Rctx)); + false -> + ok + end. + +should_log(#rctx{type = {coordinator, _, _}}) -> + true; +should_log(#rctx{type = {worker, fabric_rpc, FName}}) -> + case conf_get("log_fabric_rpc") of + "true" -> + true; + undefined -> + false; + Name -> + Name =:= atom_to_list(FName) + end; +should_log(_) -> + false. + + +conf_get(Key) -> + conf_get(Key, undefined). + + +conf_get(Key, Default) -> + config:get("couch_stats_resource_tracker", Key, Default). %% Reimplementation of: https://github.com/erlang/otp/blob/b2ee4fc9a0b81a139dad2033e9b2bfc178146886/lib/stdlib/src/ets.erl#L633-L658 From d7e9bd95710a2eb7828eae3dee6d42f555d4de14 Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Fri, 5 Jan 2024 15:17:34 -0800 Subject: [PATCH 27/29] Do not persist doc size test config settings Most tests should not persist config changes. This is no exception. While the test deletes the change afterwards, system halts between setup and teardown result in a persisted max doc setting that causes unexpected failures on the next test suite that writes a document larger than 50 bytes. 
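For illustration, a fixture shaped like this (the teardown is a
hypothetical sketch mirroring the setup change below) keeps every tweak
in memory only, so even a hard halt mid-run cannot leave the 50-byte
limit behind in the ini files:

    setup() ->
        ok = config:set("couchdb", "max_document_size", "50", _Persist = false).

    teardown(_) ->
        ok = config:delete("couchdb", "max_document_size", _Persist = false).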
---
 src/chttpd/test/eunit/chttpd_db_doc_size_tests.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chttpd/test/eunit/chttpd_db_doc_size_tests.erl b/src/chttpd/test/eunit/chttpd_db_doc_size_tests.erl
index 01ef16f23e8..91817473d1c 100644
--- a/src/chttpd/test/eunit/chttpd_db_doc_size_tests.erl
+++ b/src/chttpd/test/eunit/chttpd_db_doc_size_tests.erl
@@ -25,7 +25,7 @@
 setup() ->
     Hashed = couch_passwords:hash_admin_password(?PASS),
     ok = config:set("admins", ?USER, ?b2l(Hashed), _Persist = false),
-    ok = config:set("couchdb", "max_document_size", "50"),
+    ok = config:set("couchdb", "max_document_size", "50", false),
     TmpDb = ?tempdb(),
     Addr = config:get("chttpd", "bind_address", "127.0.0.1"),
     Port = mochiweb_socket_server:get(chttpd, port),

From f68005aac6302eefcc90ee90a3fd550b4a148c01 Mon Sep 17 00:00:00 2001
From: Russell Branca
Date: Mon, 8 Jan 2024 13:06:17 -0800
Subject: [PATCH 28/29] Configurable logging and no more io_lib:format

---
 src/chttpd/src/chttpd.erl                     |   1 +
 src/couch_stats/src/couch_stats.app.src       |   8 +-
 .../src/couch_stats_resource_tracker.erl      | 273 +++++++++---------
 3 files changed, 145 insertions(+), 137 deletions(-)

diff --git a/src/chttpd/src/chttpd.erl b/src/chttpd/src/chttpd.erl
index 16cda1a5827..e2bfd5bef54 100644
--- a/src/chttpd/src/chttpd.erl
+++ b/src/chttpd/src/chttpd.erl
@@ -416,6 +416,7 @@ handle_req_after_auth(HandlerKey, HttpReq) ->
         HandlerKey,
         fun chttpd_db:handle_request/1
     ),
+    couch_stats_resource_tracker:set_context_handler_fun(HandlerFun),
     AuthorizedReq = chttpd_auth:authorize(
         possibly_hack(HttpReq),
         fun chttpd_auth_request:authorize_request/1
diff --git a/src/couch_stats/src/couch_stats.app.src b/src/couch_stats/src/couch_stats.app.src
index a54fac7349f..de9f00e4e70 100644
--- a/src/couch_stats/src/couch_stats.app.src
+++ b/src/couch_stats/src/couch_stats.app.src
@@ -13,8 +13,12 @@
 {application, couch_stats, [
     {description, "Simple statistics collection"},
     {vsn, git},
-    {registered, [couch_stats_aggregator, couch_stats_process_tracker]},
-    {applications, [kernel, stdlib]},
+    {registered, [
+        couch_stats_aggregator,
+        couch_stats_process_tracker,
+        couch_stats_resource_tracker
+    ]},
+    {applications, [kernel, stdlib, couch_log]},
     {mod, {couch_stats_app, []}},
     {env, []}
 ]}.
diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl
index 1cf7b16685b..363a663ba83 100644
--- a/src/couch_stats/src/couch_stats_resource_tracker.erl
+++ b/src/couch_stats/src/couch_stats_resource_tracker.erl
@@ -38,6 +38,7 @@
     get_resource/0,
     get_resource/1,
     set_context_dbname/1,
+    set_context_handler_fun/1,
     set_context_username/1,
     track/1,
     should_track/1
@@ -144,8 +145,8 @@ %% -record(?RCTX, {
 -record(rctx, {
     %% Metadata
-    started_at = os:timestamp(),
-    updated_at = os:timestamp(),
+    started_at = tnow(),
+    updated_at = tnow(),
     exited_at, %% TODO: do we need a final exit time and additional update times afterwards?
     pid_ref,
     mon_ref,
     mfa,
@@ -178,8 +179,12 @@
     get_kp_node = 0
 }).
 
+%% monotonic time now in milliseconds
+tnow() ->
+    erlang:monotonic_time(millisecond).
+
 is_enabled() ->
-    config:get_boolean("couch_stats_resource_tracker", "enabled", true).
+    config:get_boolean(?MODULE_STRING, "enabled", true).
 
 db_opened() -> inc(db_opened).
 doc_read() -> inc(docs_read).
@@ -264,6 +269,8 @@ inc(?COUCH_BT_GET_KP_NODE, N) ->
     update_counter(#rctx.?COUCH_BT_GET_KP_NODE, N);
 inc(?COUCH_BT_GET_KV_NODE, N) ->
     update_counter(#rctx.?COUCH_BT_GET_KV_NODE, N);
 inc(_, _) ->
+    %% inc needs to allow unknown types to pass for accumulate_delta to handle
+    %% updates from nodes with newer data formats
     0.
 
 maybe_inc([mango, evaluate_selector], Val) ->
@@ -353,7 +360,7 @@ active_int(coordinators) ->
 active_int(workers) ->
     select_by_type(workers);
 active_int(all) ->
-    lists:map(fun to_json/1, ets:tab2list(?MODULE)).
+    lists:map(fun to_flat_json/1, ets:tab2list(?MODULE)).
 
 
 select_by_type(coordinators) ->
@@ -363,14 +370,20 @@ select_by_type(workers) ->
     ets:select(couch_stats_resource_tracker,
         [{#rctx{type = {worker,'_','_'}, _ = '_'}, [], ['$_']}]);
 select_by_type(all) ->
-    lists:map(fun to_json/1, ets:tab2list(?MODULE)).
+    lists:map(fun to_flat_json/1, ets:tab2list(?MODULE)).
 
 
 field(#rctx{pid_ref=Val}, pid_ref) -> Val;
+%% NOTE: Pros and cons to doing these convert functions here
+%% Ideally, this would be done later so as to prefer the core data structures
+%% as long as possible, but we currently need the output of this function to
+%% be jiffy:encode'able. The tricky bit is dynamically encoding the group_by
+%% structure provided by the caller of *_by aggregator functions below.
+%% For now, we just always return jiffy:encode'able data types.
-field(#rctx{mfa=Val}, mfa) -> Val;
+field(#rctx{mfa=Val}, mfa) -> convert_mfa(Val);
 field(#rctx{nonce=Val}, nonce) -> Val;
 field(#rctx{from=Val}, from) -> Val;
-field(#rctx{type=Val}, type) -> Val;
+field(#rctx{type=Val}, type) -> convert_type(Val);
 field(#rctx{state=Val}, state) -> Val;
 field(#rctx{dbname=Val}, dbname) -> Val;
 field(#rctx{username=Val}, username) -> Val;
@@ -444,105 +457,27 @@ sorted_by(KeyFun, ValFun) -> sorted(group_by(KeyFun, ValFun)).
 sorted_by(KeyFun, ValFun, AggFun) -> sorted(group_by(KeyFun, ValFun, AggFun)).
-to_json(#rctx{}=Rctx) -> - #rctx{ - updated_at = TP, - started_at = TInit, - pid_ref = {_Pid, _Ref} = PidRef, - mfa = MFA0, - nonce = Nonce0, - from = From0, - dbname = DbName, - username = UserName, - db_open = DbOpens, - docs_read = DocsRead, - rows_read = RowsRead, - js_filter = JSFilters, - js_filter_error = JSFilterErrors, - js_filtered_docs = JSFilteredDocss, - state = State0, - type = Type, - btree_folds = BtFolds, - get_kp_node = KpNodes, - get_kv_node = KvNodes, - ioq_calls = IoqCalls, - changes_processed = ChangesProcessed, - changes_returned = ChangesReturned - } = Rctx, - MFA = case MFA0 of - {M, F, A} -> - [M, F, A]; - undefined -> - null; - Other -> - throw({error, {unexpected, Other}}) - end, - From = case From0 of - {Parent, ParentRef} -> - [pid_to_list(Parent), ref_to_list(ParentRef)]; - undefined -> - null - end, - State = case State0 of - alive -> - alive; - {down, Reason} when is_atom(Reason) -> - [down, Reason]; - Unknown -> - [unknown, io_lib:format("~w", [Unknown])] - end, - Nonce = case Nonce0 of - undefined -> - null; - Nonce0 -> - list_to_binary(Nonce0) - end, - #{ - updated_at => term_to_json(TP), - started_at => term_to_json(TInit), - pid_ref => term_to_json(PidRef), - mfa => term_to_json(MFA), - nonce => term_to_json(Nonce), - from => term_to_json(From), - dbname => DbName, - username => UserName, - db_open => DbOpens, - docs_read => DocsRead, - js_filter => JSFilters, - js_filter_error => JSFilterErrors, - js_filtered_docs => JSFilteredDocss, - rows_read => RowsRead, - state => State, - type => term_to_json({type, Type}), - btree_folds => BtFolds, - kp_nodes => KpNodes, - kv_nodes => KvNodes, - ioq_calls => IoqCalls, - changes_processed => ChangesProcessed, - changes_returned => ChangesReturned - }. - -term_to_json({Pid, Ref}) when is_pid(Pid), is_reference(Ref) -> - [?l2b(pid_to_list(Pid)), ?l2b(ref_to_list(Ref))]; -term_to_json({type, {coordinator, _, _} = Type}) -> - ?l2b(io_lib:format("~p", [Type])); -term_to_json({A, B, C}) -> - [A, B, C]; -term_to_json(Tuple) when is_tuple(Tuple) -> - ?l2b(io_lib:format("~w", [Tuple])); -term_to_json(undefined) -> - null; -term_to_json(null) -> - null; -term_to_json(T) -> - T. 
-
-term_to_flat_json({type, {coordinator, _, _}}=Type) ->
-    ?l2b(io_lib:format("~p", [Type]));
-term_to_flat_json({coordinator, _, _}=Type) ->
-    ?l2b(io_lib:format("~p", [Type]));
+term_to_flat_json({shutdown, Reason0}) when is_atom(Reason0) ->
+    Reason = atom_to_binary(Reason0),
+    <<"shutdown: ", Reason/binary>>;
+term_to_flat_json({type, Atom}) when is_atom(Atom) ->
+    atom_to_binary(Atom);
+term_to_flat_json({type, {coordinator, Verb0, Path0}}=_Type) ->
+    Verb = atom_to_binary(Verb0),
+    Path = list_to_binary(Path0),
+    <<"coordinator:", Verb/binary, ":", Path/binary>>;
+term_to_flat_json({type, {worker, M0, F0}}=_Type) ->
+    M = atom_to_binary(M0),
+    F = atom_to_binary(F0),
+    <<"worker:", M/binary, ":", F/binary>>;
 term_to_flat_json(Tuple) when is_tuple(Tuple) ->
-    ?l2b(io_lib:format("~p", [Tuple]));
+    erlang:tuple_to_list(Tuple);
+term_to_flat_json(Pid) when is_pid(Pid) ->
+    ?l2b(pid_to_list(Pid));
+term_to_flat_json(Ref) when is_reference(Ref) ->
+    ?l2b(ref_to_list(Ref));
+term_to_flat_json(Atom) when is_atom(Atom) ->
+    atom_to_binary(Atom);
 term_to_flat_json(undefined) ->
     null;
 term_to_flat_json(null) ->
@@ -554,7 +489,7 @@ to_flat_json(#rctx{}=Rctx) ->
     #rctx{
         updated_at = TP,
         started_at = TInit,
-        pid_ref = {_Pid, _Ref} = PidRef,
+        pid_ref = {Pid, Ref},
         mfa = MFA0,
         nonce = Nonce0,
         from = From0,
@@ -574,25 +509,32 @@ to_flat_json(#rctx{}=Rctx) ->
         changes_returned = ChangesReturned,
         ioq_calls = IoqCalls
     } = Rctx,
+    PidRef = {term_to_flat_json(Pid), term_to_flat_json(Ref)},
     MFA = case MFA0 of
-        {_M, _F, _A} ->
-            ?l2b(io_lib:format("~w", [MFA0]));
+        {M0, F0, A0} ->
+            M = atom_to_binary(M0),
+            F = atom_to_binary(F0),
+            A = integer_to_binary(A0),
+            <<M/binary, ":", F/binary, "/", A/binary>>;
+        MFA0 when is_list(MFA0) ->
+            MFA0;
         undefined ->
            null;
-        Other ->
-            throw({error, {unexpected, Other}})
+        OtherMFA ->
+            throw({error, {unexpected, OtherMFA}})
    end,
    From = case From0 of
-        {_Parent, _ParentRef} ->
-            ?l2b(io_lib:format("~w", [From0]));
+        {Parent, ParentRef} ->
+            {term_to_flat_json(Parent), term_to_flat_json(ParentRef)};
        undefined ->
            null
    end,
    State = case State0 of
        alive ->
            alive;
-        State0 ->
-            ?l2b(io_lib:format("~w", [State0]))
+        {down, Reason0} ->
+            Reason = term_to_flat_json(Reason0),
+            <<"down: ", Reason/binary>>
    end,
    Nonce = case Nonce0 of
        undefined ->
@@ -601,9 +543,9 @@ to_flat_json(#rctx{}=Rctx) ->
            list_to_binary(Nonce0)
    end,
    #{
-        updated_at => term_to_flat_json(TP),
-        started_at => term_to_flat_json(TInit),
-        pid_ref => ?l2b(io_lib:format("~w", [PidRef])),
+        updated_at => TP,
+        started_at => TInit,
+        pid_ref => PidRef,
        mfa => MFA,
        nonce => Nonce,
        from => From,
@@ -624,6 +566,26 @@ to_flat_json(#rctx{}=Rctx) ->
        ioq_calls => IoqCalls
    }.
 
+
+convert_mfa({M0, F0, A0}) ->
+    M = atom_to_binary(M0),
+    F = atom_to_binary(F0),
+    A = integer_to_binary(A0),
+    <<M/binary, ":", F/binary, "/", A/binary>>;
+convert_mfa(undefined) ->
+    null.
+
+convert_type(Atom) when is_atom(Atom) ->
+    atom_to_binary(Atom);
+convert_type({coordinator, Verb0, Path0}) ->
+    Verb = atom_to_binary(Verb0),
+    Path = list_to_binary(Path0),
+    <<"coordinator:", Verb/binary, ":", Path/binary>>;
+convert_type({worker, M0, F0}) ->
+    M = atom_to_binary(M0),
+    F = atom_to_binary(F0),
+    <<"worker:", M/binary, ":", F/binary>>.
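%% For example, assuming the binary construction above (the exact
%% separator characters are a reconstruction):
%%
%%   convert_mfa({fabric_rpc, open_doc, 3})
%%       =:= <<"fabric_rpc:open_doc/3">>,
%%   convert_type({worker, fabric_rpc, open_doc})
%%       =:= <<"worker:fabric_rpc:open_doc">>,
%%   convert_type({coordinator, 'GET', "/db/_all_docs"})
%%       =:= <<"coordinator:GET:/db/_all_docs">>.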
+ get_pid_ref() -> case get(?PID_REF) of undefined -> @@ -705,7 +667,7 @@ set_context_dbname(DbName) -> false -> ok; true -> - case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.dbname, DbName}]) of + catch case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.dbname, DbName}]) of false -> Stk = try throw(42) catch _:_:Stk0 -> Stk0 end, io:format("UPDATING DBNAME[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [DbName, get_resource(), Stk, process_info(self(), current_stacktrace)]), @@ -716,6 +678,23 @@ set_context_dbname(DbName) -> end end. +set_context_handler_fun(Fun) when is_function(Fun) -> + case is_enabled() of + false -> + ok; + true -> + FunName = erlang:fun_to_list(Fun), + catch case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.mfa, FunName}]) of + false -> + Stk = try throw(42) catch _:_:Stk0 -> Stk0 end, + io:format("UPDATING HANDLER FUN[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [FunName, get_resource(), Stk, process_info(self(), current_stacktrace)]), + timer:sleep(1000), + erlang:halt(kaboomz); + true -> + true + end + end. + set_context_username(null) -> ok; set_context_username(UserName) -> @@ -723,7 +702,7 @@ set_context_username(UserName) -> false -> ok; true -> - case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.username, UserName}]) of + catch case ets:update_element(?MODULE, get_pid_ref(), [{#rctx.username, UserName}]) of false -> Stk = try throw(42) catch _:_:Stk0 -> Stk0 end, io:format("UPDATING USERNAME[~p] FAILURE WITH CONTEXT: ~p AND STACK:~n~pFOO:: ~p~n~n", [UserName, get_resource(), Stk, process_info(self(), current_stacktrace)]), @@ -810,7 +789,7 @@ make_delta(#rctx{}=TA, #rctx{}=TB) -> get_kv_node => TB#rctx.get_kv_node - TA#rctx.get_kv_node, db_open => TB#rctx.db_open - TA#rctx.db_open, ioq_calls => TB#rctx.ioq_calls - TA#rctx.ioq_calls, - dt => timer:now_diff(TB#rctx.updated_at, TA#rctx.updated_at) + dt => TB#rctx.updated_at - TA#rctx.updated_at }, %% TODO: reevaluate this decision %% Only return non zero (and also positive) delta fields @@ -823,15 +802,12 @@ make_delta(#rctx{}, _) -> make_delta_base() -> Ref = make_ref(), %% TODO: extract user_ctx and db/shard from request - TA0 = #rctx{ - pid_ref = {self(), Ref} - }, - case TA0#rctx.updated_at of - {Me, S, Mi} when Mi > 0 -> - TA0#rctx{updated_at = {Me, S, 0}}; - {Me, S, Mi} when S > 0 -> - TA0#rctx{updated_at = {Me, S - 1, Mi}} - end. + Now = tnow(), + #rctx{ + pid_ref = {self(), Ref}, + started_at = Now - 100, %% give us 100ms rewind time for missing T0 + updated_at = Now + }. set_delta_a(TA) -> erlang:put(?DELTA_TA, TA). 
@@ -904,7 +880,7 @@ handle_info(scan, #st{tracking=AT0} = St0) -> AT = maybe_track(Unmonitored, AT0), _TimerRef = erlang:send_after(St0#st.scan_interval, self(), scan), {noreply, St0#st{tracking=AT}}; -handle_info({'DOWN', MonRef, _Type, DPid, Reason}, #st{tracking=AT0} = St0) -> +handle_info({'DOWN', MonRef, _Type, DPid, Reason0}, #st{tracking=AT0} = St0) -> %% io:format("CSRT:HI(~p)~n", [{'DOWN', MonRef, Type, DPid, Reason}]), St = case maps:get(MonRef, AT0, undefined) of undefined -> @@ -921,8 +897,15 @@ handle_info({'DOWN', MonRef, _Type, DPid, Reason}, #st{tracking=AT0} = St0) -> %% update process state in live table %% TODO: decide whether we want the true match to crash this process on failure %% true = ets:update_element(?MODULE, PidRef, + Reason = case Reason0 of + {shutdown, Shutdown0} -> + Shutdown = atom_to_binary(Shutdown0), + <<"shutdown: ", Shutdown/binary>>; + Reason0 -> + Reason0 + end, ets:update_element(?MODULE, PidRef, - [{#rctx.state, {down, Reason}}, {#rctx.updated_at, os:timestamp()}]), + [{#rctx.state, {down, Reason}}, {#rctx.updated_at, tnow()}]), log_process_lifetime_report(PidRef), %% Delay eviction to allow human visibility on short lived pids erlang:send_after(St0#st.eviction_delay, self(), {evict, PidRef}), @@ -971,9 +954,29 @@ log_process_lifetime_report(PidRef) -> ok end. -should_log(#rctx{type = {coordinator, _, _}}) -> + +logging_enabled() -> + case conf_get("log_pid_usage_report", "coordinator") of + "coordinator" -> + coordinator; + "true" -> + true; + _ -> + false + end. + + +should_log(#rctx{}=Rctx) -> + should_log(Rctx, logging_enabled()). + + +should_log(#rctx{}, true) -> + true; +should_log(#rctx{}, false) -> + false; +should_log(#rctx{type = {coordinator, _, _}}, coordinator) -> true; -should_log(#rctx{type = {worker, fabric_rpc, FName}}) -> +should_log(#rctx{type = {worker, fabric_rpc, FName}}, _) -> case conf_get("log_fabric_rpc") of "true" -> true; @@ -982,7 +985,7 @@ should_log(#rctx{type = {worker, fabric_rpc, FName}}) -> Name -> Name =:= atom_to_list(FName) end; -should_log(_) -> +should_log(#rctx{}, _) -> false. @@ -991,7 +994,7 @@ conf_get(Key) -> conf_get(Key, Default) -> - config:get("couch_stats_resource_tracker", Key, Default). + config:get(?MODULE_STRING, Key, Default). %% Reimplementation of: https://github.com/erlang/otp/blob/b2ee4fc9a0b81a139dad2033e9b2bfc178146886/lib/stdlib/src/ets.erl#L633-L658 From 03715890285dedbca8c8e4f104a78519af26e603 Mon Sep 17 00:00:00 2001 From: Russell Branca Date: Mon, 12 Feb 2024 14:34:04 -0800 Subject: [PATCH 29/29] Limit resource usage output and fix keys --- src/chttpd/src/chttpd_misc.erl | 5 +++- .../src/couch_stats_resource_tracker.erl | 29 ++++++++++++++----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/src/chttpd/src/chttpd_misc.erl b/src/chttpd/src/chttpd_misc.erl index eff4c93c11d..08ce9841840 100644 --- a/src/chttpd/src/chttpd_misc.erl +++ b/src/chttpd/src/chttpd_misc.erl @@ -301,7 +301,10 @@ handle_resource_status_req(#httpd{method = 'POST'} = Req) -> case Fun() of Map when is_map(Map) -> {maps:fold( - fun(K,V,A) -> [{ToJson(K), V} | A] end, + fun + (_K,0,A) -> A; %% TODO: Skip 0 value entries? 
+            (K,V,A) -> [{ToJson(K), V} | A]
+        end,
         [], Map)};
     List when is_list(List) ->
         List
diff --git a/src/couch_stats/src/couch_stats_resource_tracker.erl b/src/couch_stats/src/couch_stats_resource_tracker.erl
index 363a663ba83..0d8e4164edb 100644
--- a/src/couch_stats/src/couch_stats_resource_tracker.erl
+++ b/src/couch_stats/src/couch_stats_resource_tracker.erl
@@ -440,7 +440,13 @@ group_by(KeyFun, ValFun, AggFun, Fold) ->
         Val = ValFun(Ele),
         CurrVal = maps:get(Key, Acc, 0),
         NewVal = AggFun(CurrVal, Val),
-        maps:put(Key, NewVal, Acc)
+        %% TODO: should we skip here? how to make this optional?
+        case NewVal > 0 of
+            true ->
+                maps:put(Key, NewVal, Acc);
+            false ->
+                Acc
+        end
     end,
     Fold(FoldFun, #{}, ?MODULE).
 
@@ -449,12 +455,15 @@ group_by(KeyFun, ValFun, AggFun, Fold) ->
 sorted(Map) when is_map(Map) ->
     lists:sort(fun({_K1, A}, {_K2, B}) -> B < A end, maps:to_list(Map)).
 
+shortened(L) ->
+    lists:sublist(L, 10).
+
 %% eg: sorted_by([username, dbname, mfa], ioq_calls)
 %% eg: sorted_by([dbname, mfa], doc_reads)
-sorted_by(KeyFun) -> sorted(count_by(KeyFun)).
-sorted_by(KeyFun, ValFun) -> sorted(group_by(KeyFun, ValFun)).
-sorted_by(KeyFun, ValFun, AggFun) -> sorted(group_by(KeyFun, ValFun, AggFun)).
+sorted_by(KeyFun) -> shortened(sorted(count_by(KeyFun))).
+sorted_by(KeyFun, ValFun) -> shortened(sorted(group_by(KeyFun, ValFun))).
+sorted_by(KeyFun, ValFun, AggFun) -> shortened(sorted(group_by(KeyFun, ValFun, AggFun))).
 
 
 term_to_flat_json({shutdown, Reason0}) when is_atom(Reason0) ->
@@ -489,7 +498,7 @@ to_flat_json(#rctx{}=Rctx) ->
     #rctx{
         updated_at = TP,
         started_at = TInit,
-        pid_ref = {Pid, Ref},
+        pid_ref = {Pid0, Ref0},
         mfa = MFA0,
         nonce = Nonce0,
         from = From0,
@@ -509,7 +518,9 @@ to_flat_json(#rctx{}=Rctx) ->
         changes_returned = ChangesReturned,
         ioq_calls = IoqCalls
     } = Rctx,
-    PidRef = {term_to_flat_json(Pid), term_to_flat_json(Ref)},
+    Pid = term_to_flat_json(Pid0),
+    Ref = term_to_flat_json(Ref0),
+    PidRef = <<Pid/binary, ":", Ref/binary>>,
     MFA = case MFA0 of
         {M0, F0, A0} ->
             M = atom_to_binary(M0),
             F = atom_to_binary(F0),
             A = integer_to_binary(A0),
             <<M/binary, ":", F/binary, "/", A/binary>>;
         MFA0 when is_list(MFA0) ->
@@ -524,8 +535,10 @@ to_flat_json(#rctx{}=Rctx) ->
             throw({error, {unexpected, OtherMFA}})
     end,
     From = case From0 of
-        {Parent, ParentRef} ->
-            {term_to_flat_json(Parent), term_to_flat_json(ParentRef)};
+        {Parent0, ParentRef0} ->
+            Parent = term_to_flat_json(Parent0),
+            ParentRef = term_to_flat_json(ParentRef0),
+            <<Parent/binary, ":", ParentRef/binary>>;
         undefined ->
             null
     end,