From 618f695645636fb6d978afe4b4165bbe9a2d37de Mon Sep 17 00:00:00 2001
From: Michal Kuratczyk <mkuratczyk@vmware.com>
Date: Mon, 22 Jul 2024 18:17:32 +0200
Subject: [PATCH] Move memory breakdown metrics to new endpoint

Collecting them on a large system (tens of thousands of processes
or more) can be time-consuming, because we iterate over all processes.
By moving them to a separate endpoint (/metrics/memory-breakdown),
we make collecting them opt-in.
---
 ...etheus_rabbitmq_core_metrics_collector.erl | 63 ++++++++++---------
 .../src/rabbit_prometheus_dispatcher.erl      |  3 +
 .../src/rabbit_prometheus_handler.erl         |  1 +
 .../test/rabbit_prometheus_http_SUITE.erl     | 20 ++++--
 4 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_core_metrics_collector.erl b/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_core_metrics_collector.erl
index 3af1df4dfa1a..848e6c764fde 100644
--- a/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_core_metrics_collector.erl
+++ b/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_core_metrics_collector.erl
@@ -86,35 +86,6 @@
         {2, ?MILLISECOND, erlang_uptime_seconds, gauge, "Node uptime", uptime}
     ]},
 
-    {node_memory, [
-        {2, undefined, memory_code_module_bytes, gauge, "Code module memory footprint", code},
-        {2, undefined, memory_client_connection_reader_bytes, gauge, "Client connection reader processes footprint in bytes", connection_readers},
-        {2, undefined, memory_client_connection_writer_bytes, gauge, "Client connection writer processes footprint in bytes", connection_writers},
-        {2, undefined, memory_client_connection_channel_bytes, gauge, "Client connection channel processes footprint in bytes", connection_channels},
-        {2, undefined, memory_client_connection_other_bytes, gauge, "Client connection other processes footprint in bytes", connection_other},
-        {2, undefined, memory_classic_queue_erlang_process_bytes, gauge, "Classic queue processes footprint in bytes", queue_procs},
-        {2, undefined, memory_quorum_queue_erlang_process_bytes, gauge, "Quorum queue processes footprint in bytes", quorum_queue_procs},
-        {2, undefined, memory_quorum_queue_dlx_erlang_process_bytes, gauge, "Quorum queue DLX worker processes footprint in bytes", quorum_queue_dlx_procs},
-        {2, undefined, memory_stream_erlang_process_bytes, gauge, "Stream processes footprint in bytes", stream_queue_procs},
-        {2, undefined, memory_stream_replica_reader_erlang_process_bytes, gauge, "Stream replica reader processes footprint in bytes", stream_queue_replica_reader_procs},
-        {2, undefined, memory_stream_coordinator_erlang_process_bytes, gauge, "Stream coordinator processes footprint in bytes", stream_queue_coordinator_procs},
-        {2, undefined, memory_plugin_bytes, gauge, "Total plugin footprint in bytes", plugins},
-        {2, undefined, memory_modern_metadata_store_bytes, gauge, "Modern metadata store footprint in bytes", metadata_store},
-        {2, undefined, memory_other_erlang_process_bytes, gauge, "Other processes footprint in bytes", other_proc},
-        {2, undefined, memory_metrics_bytes, gauge, "Metric table footprint in bytes", metrics},
-        {2, undefined, memory_management_stats_db_bytes, gauge, "Management stats database footprint in bytes", mgmt_db},
-        {2, undefined, memory_classic_metadata_store_bytes, gauge, "Classic metadata store footprint in bytes", mnesia},
-        {2, undefined, memory_quorum_queue_ets_table_bytes, gauge, "Quorum queue ETS tables footprint in bytes", quorum_ets},
-        {2, undefined, memory_modern_metadata_store_ets_table_bytes, gauge, "Modern metadata store ETS tables footprint in bytes", metadata_store_ets},
-        {2, undefined, memory_other_ets_table_bytes, gauge, "Other ETS tables footprint in bytes", other_ets},
-        {2, undefined, memory_binary_heap_bytes, gauge, "Binary heap size in bytes", binary},
-        {2, undefined, memory_message_index_bytes, gauge, "Message index footprint in bytes", msg_index},
-        {2, undefined, memory_atom_table_bytes, gauge, "Atom table size in bytes", atom},
-        {2, undefined, memory_other_system_bytes, gauge, "Other runtime footprint in bytes", other_system},
-        {2, undefined, memory_runtime_allocated_unused_bytes, gauge, "Runtime allocated but unused blocks size in bytes", allocated_unused},
-        {2, undefined, memory_runtime_reserved_unallocated_bytes, gauge, "Runtime reserved but unallocated blocks size in bytes", reserved_unallocated}
-    ]},
-
     {node_persister_metrics, [
         {2, undefined, io_read_ops_total, counter, "Total number of I/O read operations", io_read_count},
         {2, undefined, io_read_bytes_total, counter, "Total number of I/O bytes read", io_read_bytes},
@@ -277,6 +248,36 @@
     ]}
 ]).
 
+-define(METRICS_MEMORY_BREAKDOWN, [ % node memory breakdown gauges, collected for the 'memory-breakdown' registry (see collect_mf/2); kept separate because gathering them iterates over all processes
+    {node_memory, [ % {Index, Conversion, Name, Type, Help, Key}; field meanings inferred from the sibling metric lists - TODO confirm against collect/5
+        {2, undefined, memory_code_module_bytes, gauge, "Code module memory footprint", code}, % NOTE(review): the only help text in this list without "in bytes"
+        {2, undefined, memory_client_connection_reader_bytes, gauge, "Client connection reader processes footprint in bytes", connection_readers},
+        {2, undefined, memory_client_connection_writer_bytes, gauge, "Client connection writer processes footprint in bytes", connection_writers},
+        {2, undefined, memory_client_connection_channel_bytes, gauge, "Client connection channel processes footprint in bytes", connection_channels},
+        {2, undefined, memory_client_connection_other_bytes, gauge, "Client connection other processes footprint in bytes", connection_other},
+        {2, undefined, memory_classic_queue_erlang_process_bytes, gauge, "Classic queue processes footprint in bytes", queue_procs},
+        {2, undefined, memory_quorum_queue_erlang_process_bytes, gauge, "Quorum queue processes footprint in bytes", quorum_queue_procs},
+        {2, undefined, memory_quorum_queue_dlx_erlang_process_bytes, gauge, "Quorum queue DLX worker processes footprint in bytes", quorum_queue_dlx_procs},
+        {2, undefined, memory_stream_erlang_process_bytes, gauge, "Stream processes footprint in bytes", stream_queue_procs},
+        {2, undefined, memory_stream_replica_reader_erlang_process_bytes, gauge, "Stream replica reader processes footprint in bytes", stream_queue_replica_reader_procs},
+        {2, undefined, memory_stream_coordinator_erlang_process_bytes, gauge, "Stream coordinator processes footprint in bytes", stream_queue_coordinator_procs},
+        {2, undefined, memory_plugin_bytes, gauge, "Total plugin footprint in bytes", plugins},
+        {2, undefined, memory_modern_metadata_store_bytes, gauge, "Modern metadata store footprint in bytes", metadata_store},
+        {2, undefined, memory_other_erlang_process_bytes, gauge, "Other processes footprint in bytes", other_proc},
+        {2, undefined, memory_metrics_bytes, gauge, "Metric table footprint in bytes", metrics},
+        {2, undefined, memory_management_stats_db_bytes, gauge, "Management stats database footprint in bytes", mgmt_db},
+        {2, undefined, memory_classic_metadata_store_bytes, gauge, "Classic metadata store footprint in bytes", mnesia},
+        {2, undefined, memory_quorum_queue_ets_table_bytes, gauge, "Quorum queue ETS tables footprint in bytes", quorum_ets},
+        {2, undefined, memory_modern_metadata_store_ets_table_bytes, gauge, "Modern metadata store ETS tables footprint in bytes", metadata_store_ets},
+        {2, undefined, memory_other_ets_table_bytes, gauge, "Other ETS tables footprint in bytes", other_ets},
+        {2, undefined, memory_binary_heap_bytes, gauge, "Binary heap size in bytes", binary},
+        {2, undefined, memory_message_index_bytes, gauge, "Message index footprint in bytes", msg_index},
+        {2, undefined, memory_atom_table_bytes, gauge, "Atom table size in bytes", atom},
+        {2, undefined, memory_other_system_bytes, gauge, "Other runtime footprint in bytes", other_system},
+        {2, undefined, memory_runtime_allocated_unused_bytes, gauge, "Runtime allocated but unused blocks size in bytes", allocated_unused},
+        {2, undefined, memory_runtime_reserved_unallocated_bytes, gauge, "Runtime reserved but unallocated blocks size in bytes", reserved_unallocated}
+    ]}]).
+
 -define(TOTALS, [
     %% ordering differs from metrics above, refer to list comprehension
     {connection_created, connections, gauge, "Connections currently open"},
@@ -305,6 +306,10 @@ collect_mf('per-object', Callback) ->
     totals(Callback),
     emit_identity_info(Callback),
     ok;
+collect_mf('memory-breakdown', Callback) ->
+    collect(false, ?METRIC_NAME_PREFIX, false, ?METRICS_MEMORY_BREAKDOWN, Callback),
+    emit_identity_info(Callback),
+    ok;
 collect_mf(_Registry, Callback) ->
     PerObjectMetrics = application:get_env(rabbitmq_prometheus, return_per_object_metrics, false),
     collect(PerObjectMetrics, ?METRIC_NAME_PREFIX, false, ?METRICS_RAW, Callback),
diff --git a/deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl b/deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl
index 5ede00f50f7a..e8b5a1d0de3f 100644
--- a/deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl
+++ b/deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl
@@ -33,6 +33,9 @@ build_dispatcher() ->
     prometheus_registry:register_collectors('detailed', [
         prometheus_rabbitmq_core_metrics_collector
         ]),
+    prometheus_registry:register_collectors('memory-breakdown', [
+        prometheus_rabbitmq_core_metrics_collector
+        ]),
     rabbit_prometheus_handler:setup(),
     cowboy_router:compile([{'_', dispatcher()}]).
 
diff --git a/deps/rabbitmq_prometheus/src/rabbit_prometheus_handler.erl b/deps/rabbitmq_prometheus/src/rabbit_prometheus_handler.erl
index b5f4076ccab7..ff780d273042 100644
--- a/deps/rabbitmq_prometheus/src/rabbit_prometheus_handler.erl
+++ b/deps/rabbitmq_prometheus/src/rabbit_prometheus_handler.erl
@@ -46,6 +46,7 @@ is_authorized(ReqData, Context) ->
 setup() ->
     setup_metrics(telemetry_registry()),
     setup_metrics('per-object'),
+    setup_metrics('memory-breakdown'), % same registry name as registered in rabbit_prometheus_dispatcher and matched by collect_mf/2 in the core metrics collector
     setup_metrics('detailed').
 
 setup_metrics(Registry) ->
diff --git a/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl b/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl
index 8b41466a04eb..1a9c514391be 100644
--- a/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl
+++ b/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl
@@ -25,7 +25,8 @@ all() ->
         {group, commercial},
         {group, detailed_metrics},
         {group, special_chars},
-        {group, authentication}
+        {group, authentication},
+        {group, memory_breakdown_endpoint_metrics}
     ].
 
 groups() ->
@@ -49,6 +50,9 @@ groups() ->
             endpoint_per_object_metrics,
             specific_erlang_metrics_present_test
         ]},
+        {memory_breakdown_endpoint_metrics, [], [
+            memory_breakdown_metrics_test
+        ]},
         {commercial, [], [
             build_info_product_test
         ]},
@@ -247,7 +251,9 @@ init_per_group(special_chars, Config0) ->
 init_per_group(authentication, Config) ->
     Config1 = rabbit_ct_helpers:merge_app_env(
                 Config, {rabbitmq_prometheus, [{authentication, [{enabled, true}]}]}),
-    init_per_group(authentication, Config1, []).
+    init_per_group(authentication, Config1, []);
+init_per_group(memory_breakdown_endpoint_metrics, Config) ->
+    init_per_group(memory_breakdown_endpoint_metrics, Config, []).
 
 
 
@@ -387,10 +393,6 @@ aggregated_metrics_test(Config) ->
     ?assertEqual(match, re:run(Body, "^rabbitmq_queue_consumers ", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "TYPE rabbitmq_auth_attempts_total", [{capture, none}, multiline])),
     ?assertEqual(nomatch, re:run(Body, "TYPE rabbitmq_auth_attempts_detailed_total", [{capture, none}, multiline])),
-    %% Memory breakdown
-    ?assertEqual(match, re:run(Body, "^rabbitmq_memory_quorum_queue_erlang_process_bytes ", [{capture, none}, multiline])),
-    ?assertEqual(match, re:run(Body, "^rabbitmq_memory_classic_queue_erlang_process_bytes ", [{capture, none}, multiline])),
-    ?assertEqual(match, re:run(Body, "^rabbitmq_memory_binary_heap_bytes ", [{capture, none}, multiline])),
     %% Check the first metric value in each ETS table that requires converting
     ?assertEqual(match, re:run(Body, "^rabbitmq_erlang_uptime_seconds ", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_io_read_time_seconds_total ", [{capture, none}, multiline])),
@@ -437,6 +439,12 @@ per_object_metrics_test(Config, Path) ->
     %% Check the first TOTALS metric value
     ?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])).
 
+memory_breakdown_metrics_test(Config) -> % spot-checks that the opt-in memory breakdown endpoint exposes node memory gauges
+    {_Headers, Body} = http_get_with_pal(Config, "/metrics/memory-breakdown", [], 200), % 200 is presumably the expected HTTP status - confirm against http_get_with_pal
+    ?assertEqual(match, re:run(Body, "^rabbitmq_memory_quorum_queue_erlang_process_bytes ", [{capture, none}, multiline])), % "^name " with multiline matches a line that begins with the metric name, i.e. a sample rather than a "# HELP"/"# TYPE" line
+    ?assertEqual(match, re:run(Body, "^rabbitmq_memory_classic_queue_erlang_process_bytes ", [{capture, none}, multiline])),
+    ?assertEqual(match, re:run(Body, "^rabbitmq_memory_binary_heap_bytes ", [{capture, none}, multiline])).
+
 build_info_test(Config) ->
     {_Headers, Body} = http_get_with_pal(Config, [], 200),
     ?assertEqual(match, re:run(Body, "^rabbitmq_build_info{", [{capture, none}, multiline])),