Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Couch stats resource tracker v2 #5213

Open
wants to merge 29 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1fa358d
Port in csrt changes
chewbranca Jun 7, 2024
e67690a
Rework CSRT post experimentation
chewbranca Jul 29, 2024
e10d82e
Remove debug statement
chewbranca Aug 5, 2024
b3a38c2
Inc IOQ calls
chewbranca Aug 5, 2024
a4fc931
Add csrt delta to rexi_tests
chewbranca Aug 5, 2024
d2c7eba
Use extract_delta for delta message handling
chewbranca Aug 20, 2024
7ad90ad
Cleanup maybe_add_delta logic
chewbranca Aug 20, 2024
03fa0cb
Rework btree stats allowing for write_node stats
chewbranca Aug 21, 2024
bb00b81
Handle unexpected get_shard messages
chewbranca Aug 21, 2024
edd5d0e
Restore coordinator logging only default
chewbranca Aug 23, 2024
f06d4a4
Handle old fabric_util:get_shard Ref responses
chewbranca Aug 23, 2024
c02d57b
Cleanup _Persist usage
chewbranca Aug 27, 2024
5864ba3
Rework CSRT to use ets:fun2ms for static queries
chewbranca Aug 27, 2024
e1fead4
Remove debug statements
chewbranca Aug 28, 2024
4535f1a
Add find by nonce and some cleanup
chewbranca Sep 2, 2024
48db790
Use fun binary_to_existing_atom/1
chewbranca Oct 10, 2024
4db08c9
Rework js filter tracking to reuse existing stats
chewbranca Oct 10, 2024
1d34432
Switch to using static maps for key/field/stat translations
chewbranca Oct 11, 2024
46fba8e
Cleanup and sort function exports
chewbranca Oct 11, 2024
5ac8a91
Cleanup oneline functions
chewbranca Oct 11, 2024
ce91a40
Simplify csrt:to_json
chewbranca Oct 11, 2024
de9a430
Drop stub gen_server callbacks
chewbranca Oct 11, 2024
1a9e4d1
Cleanup handle_{call,cast}
chewbranca Oct 11, 2024
ba1baaa
Remove extra csrt destroy_context
chewbranca Oct 14, 2024
e213714
Remove unnecessary demonitor
chewbranca Oct 21, 2024
8d0e9fd
Remove unnecessary destroy_context
chewbranca Oct 21, 2024
1702115
WIP: hack together filtered reporting logic
chewbranca Oct 24, 2024
4c091bc
Rework context types to be records
chewbranca Oct 25, 2024
ad84bf8
WIP: persistent terms matcher experiment
chewbranca Nov 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/chttpd/src/chttpd.erl
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,9 @@ handle_request_int(MochiReq) ->
% Save client socket so that it can be monitored for disconnects
chttpd_util:mochiweb_client_req_set(MochiReq),

%% This is probably better in before_request, but having Path is nice
couch_stats_resource_tracker:create_coordinator_context(HttpReq0, Path),

{HttpReq2, Response} =
case before_request(HttpReq0) of
{ok, HttpReq1} ->
Expand Down Expand Up @@ -372,6 +375,7 @@ after_request(HttpReq, HttpResp0) ->
HttpResp2 = update_stats(HttpReq, HttpResp1),
chttpd_stats:report(HttpReq, HttpResp2),
maybe_log(HttpReq, HttpResp2),
couch_stats_resource_tracker:destroy_context(),
HttpResp2.

process_request(#httpd{mochi_req = MochiReq} = HttpReq) ->
Expand Down Expand Up @@ -409,10 +413,12 @@ handle_req_after_auth(HandlerKey, HttpReq) ->
HandlerKey,
fun chttpd_db:handle_request/1
),
couch_stats_resource_tracker:set_context_handler_fun(HandlerFun),
AuthorizedReq = chttpd_auth:authorize(
possibly_hack(HttpReq),
fun chttpd_auth_request:authorize_request/1
),
couch_stats_resource_tracker:set_context_username(AuthorizedReq),
{AuthorizedReq, HandlerFun(AuthorizedReq)}
catch
ErrorType:Error:Stack ->
Expand Down
2 changes: 2 additions & 0 deletions src/chttpd/src/chttpd_db.erl
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@

% Database request handlers
handle_request(#httpd{path_parts = [DbName | RestParts], method = Method} = Req) ->
couch_stats_resource_tracker:set_context_dbname(DbName),
case {Method, RestParts} of
{'PUT', []} ->
create_db_req(Req, DbName);
Expand All @@ -103,6 +104,7 @@ handle_request(#httpd{path_parts = [DbName | RestParts], method = Method} = Req)
do_db_req(Req, fun db_req/2);
{_, [SecondPart | _]} ->
Handler = chttpd_handlers:db_handler(SecondPart, fun db_req/2),
couch_stats_resource_tracker:set_context_handler_fun(Handler),
do_db_req(Req, Handler)
end.

Expand Down
1 change: 1 addition & 0 deletions src/chttpd/src/chttpd_httpd_handlers.erl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ url_handler(<<"_utils">>) -> fun chttpd_misc:handle_utils_dir_req/1;
url_handler(<<"_all_dbs">>) -> fun chttpd_misc:handle_all_dbs_req/1;
url_handler(<<"_dbs_info">>) -> fun chttpd_misc:handle_dbs_info_req/1;
url_handler(<<"_active_tasks">>) -> fun chttpd_misc:handle_task_status_req/1;
url_handler(<<"_active_resources">>) -> fun chttpd_misc:handle_resource_status_req/1;
url_handler(<<"_scheduler">>) -> fun couch_replicator_httpd:handle_scheduler_req/1;
url_handler(<<"_node">>) -> fun chttpd_node:handle_node_req/1;
url_handler(<<"_reload_query_servers">>) -> fun chttpd_misc:handle_reload_query_servers_req/1;
Expand Down
105 changes: 105 additions & 0 deletions src/chttpd/src/chttpd_misc.erl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
handle_replicate_req/1,
handle_reload_query_servers_req/1,
handle_task_status_req/1,
handle_resource_status_req/1,
handle_up_req/1,
handle_utils_dir_req/1,
handle_utils_dir_req/2,
Expand Down Expand Up @@ -224,6 +225,110 @@ handle_task_status_req(#httpd{method = 'GET'} = Req) ->
handle_task_status_req(Req) ->
send_method_not_allowed(Req, "GET,HEAD").

handle_resource_status_req(#httpd{method = 'POST'} = Req) ->
ok = chttpd:verify_is_server_admin(Req),
chttpd:validate_ctype(Req, "application/json"),
{Props} = chttpd:json_body_obj(Req),
Action = proplists:get_value(<<"action">>, Props),
Key = proplists:get_value(<<"key">>, Props),
Val = proplists:get_value(<<"val">>, Props),
Comment on lines +228 to +234
Copy link
Contributor

@nickva nickva Sep 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're introducing a new API add some docs or some description (examples) how it works, and the intent behind it. What would CouchDB users use it for, how it is different than metrics, and active tasks, etc...


CountBy = fun couch_stats_resource_tracker:count_by/1,
GroupBy = fun couch_stats_resource_tracker:group_by/2,
SortedBy1 = fun couch_stats_resource_tracker:sorted_by/1,
SortedBy2 = fun couch_stats_resource_tracker:sorted_by/2,
ConvertEle = fun(K) -> list_to_existing_atom(binary_to_list(K)) end,
nickva marked this conversation as resolved.
Show resolved Hide resolved
ConvertList = fun(L) -> [ConvertEle(E) || E <- L] end,
ToJson = fun couch_stats_resource_tracker:term_to_flat_json/1,
JsonKeys = fun(PL) -> [[ToJson(K), V] || {K, V} <- PL] end,

Fun = case {Action, Key, Val} of
{<<"count_by">>, Keys, undefined} when is_list(Keys) ->
Keys1 = [ConvertEle(K) || K <- Keys],
fun() -> CountBy(Keys1) end;
{<<"count_by">>, Key, undefined} ->
Key1 = ConvertEle(Key),
fun() -> CountBy(Key1) end;
{<<"group_by">>, Keys, Vals} when is_list(Keys) andalso is_list(Vals) ->
Keys1 = ConvertList(Keys),
Vals1 = ConvertList(Vals),
fun() -> GroupBy(Keys1, Vals1) end;
{<<"group_by">>, Key, Vals} when is_list(Vals) ->
Key1 = ConvertEle(Key),
Vals1 = ConvertList(Vals),
fun() -> GroupBy(Key1, Vals1) end;
{<<"group_by">>, Keys, Val} when is_list(Keys) ->
Keys1 = ConvertList(Keys),
Val1 = ConvertEle(Val),
fun() -> GroupBy(Keys1, Val1) end;
{<<"group_by">>, Key, Val} ->
Key1 = ConvertEle(Key),
Val1 = ConvertList(Val),
fun() -> GroupBy(Key1, Val1) end;

{<<"sorted_by">>, Key, undefined} ->
Key1 = ConvertEle(Key),
fun() -> JsonKeys(SortedBy1(Key1)) end;
{<<"sorted_by">>, Keys, undefined} when is_list(Keys) ->
Keys1 = [ConvertEle(K) || K <- Keys],
fun() -> JsonKeys(SortedBy1(Keys1)) end;
{<<"sorted_by">>, Keys, Vals} when is_list(Keys) andalso is_list(Vals) ->
Keys1 = ConvertList(Keys),
Vals1 = ConvertList(Vals),
fun() -> JsonKeys(SortedBy2(Keys1, Vals1)) end;
{<<"sorted_by">>, Key, Vals} when is_list(Vals) ->
Key1 = ConvertEle(Key),
Vals1 = ConvertList(Vals),
fun() -> JsonKeys(SortedBy2(Key1, Vals1)) end;
{<<"sorted_by">>, Keys, Val} when is_list(Keys) ->
Keys1 = ConvertList(Keys),
Val1 = ConvertEle(Val),
fun() -> JsonKeys(SortedBy2(Keys1, Val1)) end;
{<<"sorted_by">>, Key, Val} ->
Key1 = ConvertEle(Key),
Val1 = ConvertList(Val),
fun() -> JsonKeys(SortedBy2(Key1, Val1)) end;
_ ->
throw({badrequest, invalid_resource_request})
end,

Fun1 = fun() ->
case Fun() of
Map when is_map(Map) ->
{maps:fold(
fun
(_K,0,A) -> A; %% TODO: Skip 0 value entries?
(K,V,A) -> [{ToJson(K), V} | A]
end,
[], Map)};
List when is_list(List) ->
List
end
end,

{Resp, _Bad} = rpc:multicall(erlang, apply, [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fun() ->
{node(), Fun1()}
end,
[]
]),
%%io:format("{CSRT}***** GOT RESP: ~p~n", [Resp]),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left-over debug comment

send_json(Req, {Resp});
handle_resource_status_req(#httpd{method = 'GET'} = Req) ->
ok = chttpd:verify_is_server_admin(Req),
{Resp, Bad} = rpc:multicall(erlang, apply, [
fun() ->
{node(), couch_stats_resource_tracker:active()}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of having to run in a closure just to get node() in the response, consider using an explicit list of nodes [node() | nodes()] with a plain [M, F, A] and then zipping over the nodes in the response. It's a bit longer but it's less fragile.

end,
[]
]),
%% TODO: incorporate Bad responses
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider using erpc, the handling of bad responses is a bit cleaner there, I think

send_json(Req, {Resp});
handle_resource_status_req(Req) ->
ok = chttpd:verify_is_server_admin(Req),
send_method_not_allowed(Req, "GET,HEAD,POST").


handle_replicate_req(#httpd{method = 'POST', user_ctx = Ctx, req_body = PostBody} = Req) ->
chttpd:validate_ctype(Req, "application/json"),
%% see HACK in chttpd.erl about replication
Expand Down
7 changes: 4 additions & 3 deletions src/chttpd/test/eunit/chttpd_db_doc_size_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@

setup() ->
Hashed = couch_passwords:hash_admin_password(?PASS),
ok = config:set("admins", ?USER, ?b2l(Hashed), _Persist = false),
ok = config:set("couchdb", "max_document_size", "50"),
ok = config:set("admins", ?USER, ?b2l(Hashed), false),
ok = config:set("couchdb", "max_document_size", "50", false),

TmpDb = ?tempdb(),
Addr = config:get("chttpd", "bind_address", "127.0.0.1"),
Port = mochiweb_socket_server:get(chttpd, port),
Expand All @@ -35,7 +36,7 @@ setup() ->

teardown(Url) ->
delete_db(Url),
ok = config:delete("admins", ?USER, _Persist = false),
ok = config:delete("admins", ?USER, false),
ok = config:delete("couchdb", "max_document_size").

create_db(Url) ->
Expand Down
2 changes: 2 additions & 0 deletions src/couch/include/couch_db.hrl
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
-define(INTERACTIVE_EDIT, interactive_edit).
-define(REPLICATED_CHANGES, replicated_changes).

-define(LOG_UNEXPECTED_MSG(Msg), couch_log:warning("[~p:~p:~p/~p]{~p[~p]} Unexpected message: ~w", [?MODULE, ?LINE, ?FUNCTION_NAME, ?FUNCTION_ARITY, self(), element(2, process_info(self(), message_queue_len)), Msg])).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am surprised the linter doesn't complain about such a long line.


-type branch() :: {Key::term(), Value::term(), Tree::term()}.
-type path() :: {Start::pos_integer(), branch()}.
-type update_type() :: replicated_changes | interactive_edit.
Expand Down
41 changes: 41 additions & 0 deletions src/couch/priv/stats_descriptions.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,10 @@
{type, counter},
{desc, <<"number of couch_server LRU operations skipped">>}
]}.
{[couchdb, couch_server, open], [
{type, counter},
{desc, <<"number of couch_server open operations invoked">>}
]}.
{[couchdb, query_server, vdu_rejects], [
{type, counter},
{desc, <<"number of rejections by validate_doc_update function">>}
Expand Down Expand Up @@ -418,10 +422,47 @@
{type, counter},
{desc, <<"number of other requests">>}
]}.
{[couchdb, query_server, js_filter], [
{type, counter},
{desc, <<"number of JS filter invocations">>}
]}.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is already a .query_server.calls.ddoc_filter counter

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahhh good catch, I didn't see those were hidden in https://github.com/apache/couchdb/blob/main/src/couch/src/couch_os_process.erl#L257-L259

I've reworked this in 4db08c9f2 to use the existing metrics and only add a new metric [couchdb, query_server, volume, ddoc_filter] for counting the number of docs funneled through the filter. I also reworked the logic so that volume metric is captured in the same module and style as the bump_time_stat function above.

{[couchdb, query_server, js_filtered_docs], [
{type, counter},
{desc, <<"number of docs filtered through JS invocations">>}
]}.
{[couchdb, query_server, js_filter_error], [
{type, counter},
{desc, <<"number of JS filter invocation errors">>}
]}.
{[couchdb, legacy_checksums], [
{type, counter},
{desc, <<"number of legacy checksums found in couch_file instances">>}
]}.
{[couchdb, btree, folds], [
{type, counter},
{desc, <<"number of couch btree kv fold callback invocations">>}
]}.
{[couchdb, btree, get_node, kp_node], [
{type, counter},
{desc, <<"number of couch btree kp_nodes read">>}
]}.
{[couchdb, btree, get_node, kv_node], [
{type, counter},
{desc, <<"number of couch btree kv_nodes read">>}
]}.
Comment on lines +437 to +444
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wonder if this is a bit too low level? Or rather, btrees are used for view and db files. It's not clear which btrees the metrics refer to.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not clear which btrees the metrics refer to.

So that's the thing, we don't have current visibility into the volume of couch_btree:get_node calls or what type of operations are inducing them. These could come from normal db doc reads, all_docs folds, compaction folds, index build folds, all these different systems induce heavy couch_btree operations and we don't know where/when/why.

The idea here is to start tracking the amount of core btree operations by the workers, allowing us to track it at a process level like we're doing with the rest of the stats in this PR, and then we'll at least be able to see the total volume of couch_btree operations induced, and then use the reports to quantify the requests/dbs/etc that performed those operations.

A key aspect of the tracking here is that couch_btree:* are still inside the caller processes, eg the workers themselves, so we can do the process local tracking without having to extend beyond process boundaries. This CSRT engine does not yet handle nested work tracking, so for instance we can't funnel back information from the couch_file pid back through IOQ. Tracking the data here is a solid boundary of per request work being induced that we can track, while also providing much needed insight into overall database operations.

As for tracking of kv_node separately from kp_node, I think this is important because we're in a btree and performance is proportional to the log of the size of the btree, and we currently don't have any metrics illuminating the impact of btree size on read throughput. In my experience it's a fairly common experience for users to forget that while logN in a B+tree is really good, it's not a constant value and once you get into databases with billions of documents it takes longer to get through the btree, on every request. My hope here is that by tracking kp_node reads separately from kv_nodes we can more easily understand the workloads induced by larger databases, and more easily quantify that, something that I personally find challenging to do today.

I thought about trying to funnel down into couch_btree an indicator variable of the datatype, so for instance we could track changes btree operations independently of compaction folds, but that'd either require passing through a more informative context into couch_btree:* or some ugly pdict hacks. I think this approach of getting as close to the caller boundary in as we can before we funnel off to another process gives us low level insight into couch_btree operations while also allowing us to track it at a request/worker level so that retroactively we'll be able to go look at the report logs and quantify what requests were responsible for the induced btree operations. It's important to note that this approach will also extend naturally to background worker CSRT tracking, eg we can easily extend compaction, replication, and indexing to create a CSRT context and then we'll be able to quantify the work induced by those background jobs too.

{[couchdb, btree, write_node, kp_node], [
{type, counter},
{desc, <<"number of couch btree kp_nodes written">>}
]}.
{[couchdb, btree, write_node, kv_node], [
{type, counter},
{desc, <<"number of couch btree kv_nodes written">>}
]}.
%% CSRT (couch_stats_resource_tracker) stats
{[couchdb, csrt, delta_missing_t0], [
{type, counter},
{desc, <<"number of csrt contexts without a proper startime">>}
]}.
{[pread, exceed_eof], [
{type, counter},
{desc, <<"number of the attempts to read beyond end of db file">>}
Expand Down
3 changes: 3 additions & 0 deletions src/couch/src/couch_btree.erl
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,8 @@ reduce_tree_size(kp_node, NodeSize, [{_K, {_P, _Red, Sz}} | NodeList]) ->

get_node(#btree{fd = Fd}, NodePos) ->
{ok, {NodeType, NodeList}} = couch_file:pread_term(Fd, NodePos),
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, btree, get_node, NodeType]),
{NodeType, NodeList}.

write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList) ->
Expand All @@ -480,6 +482,7 @@ write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList) ->
% now write out each chunk and return the KeyPointer pairs for those nodes
ToWrite = [{NodeType, Chunk} || Chunk <- Chunks],
WriteOpts = [{compression, Comp}],
couch_stats:increment_counter([couchdb, btree, write_node, NodeType]),
{ok, PtrSizes} = couch_file:append_terms(Fd, ToWrite, WriteOpts),
{ok, group_kps(Bt, NodeType, Chunks, PtrSizes)}.

Expand Down
2 changes: 2 additions & 0 deletions src/couch/src/couch_db.erl
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ open_doc(Db, IdOrDocInfo) ->
open_doc(Db, IdOrDocInfo, []).

open_doc(Db, Id, Options) ->
%% TODO: wire in csrt tracking
increment_stat(Db, [couchdb, database_reads]),
case open_doc_int(Db, Id, Options) of
{ok, #doc{deleted = true} = Doc} ->
Expand Down Expand Up @@ -1982,6 +1983,7 @@ increment_stat(#db{options = Options}, Stat, Count) when
->
case lists:member(sys_db, Options) of
true ->
%% TODO: we shouldn't leak resource usage just because it's a sys_db
ok;
false ->
couch_stats:increment_counter(Stat, Count)
Expand Down
8 changes: 8 additions & 0 deletions src/couch/src/couch_query_servers.erl
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,8 @@ filter_docs(Req, Db, DDoc, FName, Docs) ->
{ok, filter_docs_int(Db, DDoc, FName, JsonReq, JsonDocs)}
catch
throw:{os_process_error, {exit_status, 1}} ->
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, query_server, js_filter_error]),
Comment on lines +545 to +546
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have a general counter for os process errors, normal exits and error exits. Probably don't need an explicit one just for filters?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 4db08c9.

%% batch used too much memory, retry sequentially.
Fun = fun(JsonDoc) ->
filter_docs_int(Db, DDoc, FName, JsonReq, [JsonDoc])
Expand All @@ -550,6 +552,12 @@ filter_docs(Req, Db, DDoc, FName, Docs) ->
end.

filter_docs_int(Db, DDoc, FName, JsonReq, JsonDocs) ->
%% Count usage in _int version as this can be repeated for OS error
%% Pros & cons... might not have actually processed `length(JsonDocs)` docs
%% but it certainly undercounts if we count in `filter_docs/5` above
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, query_server, js_filter]),
couch_stats:increment_counter([couchdb, query_server, js_filtered_docs], length(JsonDocs)),
[true, Passes] = ddoc_prompt(
Db,
DDoc,
Expand Down
2 changes: 2 additions & 0 deletions src/couch/src/couch_server.erl
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ sup_start_link(N) ->
gen_server:start_link({local, couch_server(N)}, couch_server, [N], []).

open(DbName, Options) ->
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, couch_server, open]),
try
validate_open_or_create(DbName, Options),
open_int(DbName, Options)
Expand Down
8 changes: 6 additions & 2 deletions src/couch_stats/src/couch_stats.app.src
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@
{application, couch_stats, [
{description, "Simple statistics collection"},
{vsn, git},
{registered, [couch_stats_aggregator, couch_stats_process_tracker]},
{applications, [kernel, stdlib]},
{registered, [
couch_stats_aggregator,
couch_stats_process_tracker,
couch_stats_resource_tracker
]},
{applications, [kernel, stdlib, couch_log]},
{mod, {couch_stats_app, []}},
{env, []}
]}.
30 changes: 30 additions & 0 deletions src/couch_stats/src/couch_stats.erl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@
update_gauge/2
]).

%% couch_stats_resource_tracker API
-export([
create_context/3,
maybe_track_rexi_init_p/1
]).

-type response() :: ok | {error, unknown_metric} | {error, invalid_metric}.
-type stat() :: {any(), [{atom(), any()}]}.

Expand All @@ -49,6 +55,11 @@ increment_counter(Name) ->

-spec increment_counter(any(), pos_integer()) -> response().
increment_counter(Name, Value) ->
%% Should maybe_track_local happen before or after notify?
%% If after, only currently tracked metrics declared in the app's
%% stats_description.cfg will be trackable locally. Pros/cons.
%io:format("NOTIFY_EXISTING_METRIC: ~p || ~p || ~p~n", [Name, Op, Type]),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left-over debug statement

ok = maybe_track_local_counter(Name, Value),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is enabled for all requests, a bit worried about the performance here as we add an ets operation over a simple integer counter bump. Was there any noticeable performance impact from it, during perf runs?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An essential part of the work done in this PR is to ensure that this is as minimally impactful as possible. We add an ets config lookup to check if it's enabled, and then we perform the same type of simple integer counter bump utilizing ets:update_counter with decentralized_counters enabled and no concurrent writes to the same key: any given key is written only by the process tracking its own work, and finally the tracker process deletes the entry from ets when the process dies and closes the context.

@nickva you did have me change my hard coded atom function to one utilizing maps (although I did manage to use static maps), so I'll make sure to do some more testing with the final version to compare. I plan on doing some comparisons between the start of this PR and the final version, to make sure there weren't any regressions induced by the changes.

case couch_stats_util:get_counter(Name, stats()) of
{ok, Ctx} -> couch_stats_counter:increment(Ctx, Value);
{error, Error} -> {error, Error}
Expand Down Expand Up @@ -100,6 +111,25 @@ stats() ->
now_sec() ->
erlang:monotonic_time(second).

%% Only potentially track positive increments to counters
-spec maybe_track_local_counter(any(), any()) -> ok.
maybe_track_local_counter(Name, Val) when is_integer(Val) andalso Val > 0 ->
%%io:format("maybe_track_local[~p]: ~p~n", [Val, Name]),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

debug left-over?

couch_stats_resource_tracker:maybe_inc(Name, Val),
ok;
maybe_track_local_counter(_, _) ->
ok.

create_context(From, MFA, Nonce) ->
couch_stats_resource_tracker:create_context(From, MFA, Nonce).

maybe_track_rexi_init_p({M, F, _A}) ->
Metric = [M, F, spawned],
case couch_stats_resource_tracker:should_track(Metric) of
true -> increment_counter(Metric);
false -> ok
end.

-ifdef(TEST).

-include_lib("couch/include/couch_eunit.hrl").
Expand Down
Loading