From 181919610cb94be27e20ccc764659234ec2afc26 Mon Sep 17 00:00:00 2001 From: Robert Johnson Date: Fri, 6 Feb 2026 07:11:10 -0800 Subject: [PATCH 1/3] feat: add Prometheus metrics infrastructure and build_info Add PrometheusMetrics module with WEBrick HTTP server, registry, helper methods (prom_inc/prom_set/prom_observe), and build_info gauge. Wire into Nerve, ServiceWatcher, and Reporter::Base. Configuration via "prometheus" block in nerve config (enabled, port, bind, histogram_buckets_zk, histogram_buckets_main_loop). Server startup deferred until after --check-config early exit. Co-Authored-By: Claude Opus 4.6 --- Gemfile.lock | 7 ++ example/nerve.conf.json | 7 ++ lib/nerve.rb | 13 +++ lib/nerve/prometheus_metrics.rb | 117 ++++++++++++++++++++++ lib/nerve/reporter.rb | 2 + lib/nerve/reporter/base.rb | 1 + lib/nerve/service_watcher.rb | 1 + nerve.gemspec | 2 + spec/lib/nerve/prometheus_metrics_spec.rb | 116 +++++++++++++++++++++ spec/lib/nerve_spec.rb | 102 +++++++++++++++++++ 10 files changed, 368 insertions(+) create mode 100644 lib/nerve/prometheus_metrics.rb create mode 100644 spec/lib/nerve/prometheus_metrics_spec.rb diff --git a/Gemfile.lock b/Gemfile.lock index ac6b0233..52c32ab1 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -6,7 +6,9 @@ PATH dogstatsd-ruby (~> 5.0) etcd (~> 0.3) json + prometheus-client (~> 4.0) redis (~> 5.0) + webrick zk (~> 1.10) GEM @@ -70,6 +72,8 @@ GEM ast (~> 2.4.1) racc prism (1.7.0) + prometheus-client (4.2.5) + base64 pry (0.16.0) coderay (~> 1.1) method_source (~> 1.0) @@ -140,6 +144,7 @@ GEM unicode-emoji (~> 4.1) unicode-emoji (4.2.0) uri (1.1.1) + webrick (1.9.2) zk (1.10.0) zookeeper (~> 1.5.0) zookeeper (1.5.5) @@ -204,6 +209,7 @@ CHECKSUMS parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130 parser (3.3.10.0) sha256=ce3587fa5cc55a88c4ba5b2b37621b3329aadf5728f9eafa36bbd121462aabd6 prism (1.7.0) sha256=10062f734bf7985c8424c44fac382ac04a58124ea3d220ec3ba9fe4f2da65103 + prometheus-client (4.2.5) sha256=807bebc3e92ccd9f4d814d90be15e85338f3708d227badf80924ef3f6e7d5225 pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a @@ -232,6 +238,7 @@ CHECKSUMS unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42 unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6 + webrick (1.9.2) sha256=beb4a15fc474defed24a3bda4ffd88a490d517c9e4e6118c3edce59e45864131 zk (1.10.0) sha256=e7151a665d6f5974a8d569d1b7a28c7c0110e4679b8f834fa54429dd4cbb569f zookeeper (1.5.5) sha256=6fcf3b2ac40158968bead2425abf9ca225db673014aba0f4a4a3b057d73d88c1 diff --git a/example/nerve.conf.json b/example/nerve.conf.json index b356d5e9..451fdedf 100644 --- a/example/nerve.conf.json +++ b/example/nerve.conf.json @@ -6,6 +6,13 @@ "host": "localhost", "port": 8125 }, + "prometheus": { + "enabled": true, + "port": 9292, + "bind": "0.0.0.0", + "histogram_buckets_zk": [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0], + "histogram_buckets_main_loop": [0.001, 0.01, 0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0] + }, "services": { "your_http_service": { "host": "1.2.3.4", diff --git a/lib/nerve.rb b/lib/nerve.rb index cb1e5f39..9a602ef8 100644 --- a/lib/nerve.rb +++ b/lib/nerve.rb @@ -7,6 +7,7 @@ require "nerve/utils" require "nerve/log" require "nerve/statsd" +require "nerve/prometheus_metrics" require "nerve/ring_buffer" require "nerve/reporter" require "nerve/service_watcher" @@ -16,6 +17,7 @@ class Nerve include Logging include Utils include StatsD + include PrometheusMetrics MAIN_LOOP_SLEEP_S = 10 LAUNCH_WAIT_FOR_REPORT_S = 30 @@ -161,6 +163,16 @@ def run break end + prometheus_config = @config_manager.config["prometheus"] || {} + if prometheus_config["enabled"] + unless @prometheus_started + PrometheusMetrics.configure(prometheus_config) + end + elsif PrometheusMetrics.enabled? + PrometheusMetrics.disable! + end + @prometheus_started = PrometheusMetrics.enabled? + # Check that watchers are still alive, auto-remediate if they # are not. Sometimes zookeeper flakes out or connections are lost to # remote datacenter zookeeper clusters, failing is not an option @@ -205,6 +217,7 @@ def run statsd.increment("nerve.stop", tags: ["stop_avenue:clean", "stop_location:main_loop"]) ensure $EXIT = true + PrometheusMetrics.stop_server end def heartbeat diff --git a/lib/nerve/prometheus_metrics.rb b/lib/nerve/prometheus_metrics.rb new file mode 100644 index 00000000..9e93ed41 --- /dev/null +++ b/lib/nerve/prometheus_metrics.rb @@ -0,0 +1,117 @@ +require "webrick" +require "prometheus/client" +require "prometheus/client/formats/text" +require "nerve/log" +require "nerve/version" + +module Nerve + module PrometheusMetrics + HISTOGRAM_BUCKETS_ZK = [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0].freeze + HISTOGRAM_BUCKETS_MAIN_LOOP = [0.001, 0.01, 0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0].freeze + + class << self + include Logging + + @@prom_enabled = false + @@prom_registry = nil + @@prom_metrics = {} + @@prom_server = nil + + def enabled? + @@prom_enabled + end + + def registry + @@prom_registry + end + + def metrics + @@prom_metrics + end + + def configure(opts) + return unless opts && opts["enabled"] + + @@prom_enabled = true + @@prom_registry = Prometheus::Client::Registry.new + + zk_buckets = opts["histogram_buckets_zk"] || HISTOGRAM_BUCKETS_ZK + main_loop_buckets = opts["histogram_buckets_main_loop"] || HISTOGRAM_BUCKETS_MAIN_LOOP + register_metrics(zk_buckets: zk_buckets, main_loop_buckets: main_loop_buckets) + + port = opts["port"] || 9292 + bind = opts["bind"] || "0.0.0.0" + start_server(bind, port) + + log.info "nerve: prometheus metrics enabled on #{bind}:#{port}/metrics" + end + + def stop_server + if @@prom_server + log.info "nerve: stopping prometheus metrics server" + @@prom_server.shutdown + @@prom_server = nil + end + end + + def disable! + return unless @@prom_enabled + stop_server + @@prom_enabled = false + @@prom_registry = nil + @@prom_metrics = {} + @@prom_server = nil + end + + private + + def register_metrics(zk_buckets: HISTOGRAM_BUCKETS_ZK, main_loop_buckets: HISTOGRAM_BUCKETS_MAIN_LOOP) + # Info + @@prom_metrics[:build_info] = @@prom_registry.gauge( + :nerve_build_info, + docstring: "Nerve build information", + labels: [:version] + ) + @@prom_metrics[:build_info].set(1, labels: {version: VERSION}) + end + + def start_server(bind, port) + registry = @@prom_registry + @@prom_server = WEBrick::HTTPServer.new( + Port: port, + BindAddress: bind, + Logger: WEBrick::Log.new(File::NULL), + AccessLog: [] + ) + + @@prom_server.mount_proc "/metrics" do |_req, res| + res["Content-Type"] = Prometheus::Client::Formats::Text::CONTENT_TYPE + res.body = Prometheus::Client::Formats::Text.marshal(registry) + end + + Thread.new { @@prom_server.start } + end + end + + def prom_inc(metric_name, labels: {}, by: 1) + return unless PrometheusMetrics.enabled? + metric = PrometheusMetrics.metrics[metric_name] + return unless metric + metric.increment(labels: labels, by: by) + end + + def prom_set(metric_name, value, labels: {}) + return unless PrometheusMetrics.enabled? + metric = PrometheusMetrics.metrics[metric_name] + return unless metric + metric.set(value, labels: labels) + end + + def prom_observe(metric_name, value, labels: {}) + return unless PrometheusMetrics.enabled? + metric = PrometheusMetrics.metrics[metric_name] + return unless metric + metric.observe(value, labels: labels) + end + end +end diff --git a/lib/nerve/reporter.rb b/lib/nerve/reporter.rb index ca3c47b2..beb471fc 100644 --- a/lib/nerve/reporter.rb +++ b/lib/nerve/reporter.rb @@ -1,5 +1,7 @@ require "nerve/utils" require "nerve/log" +require "nerve/statsd" +require "nerve/prometheus_metrics" require "nerve/reporter/base" module Nerve diff --git a/lib/nerve/reporter/base.rb b/lib/nerve/reporter/base.rb index 13591200..f007ef06 100644 --- a/lib/nerve/reporter/base.rb +++ b/lib/nerve/reporter/base.rb @@ -3,6 +3,7 @@ class Base include Nerve::Utils include Nerve::Logging include Nerve::StatsD + include Nerve::PrometheusMetrics def initialize(opts) end diff --git a/lib/nerve/service_watcher.rb b/lib/nerve/service_watcher.rb index 497a6ecd..10671107 100644 --- a/lib/nerve/service_watcher.rb +++ b/lib/nerve/service_watcher.rb @@ -10,6 +10,7 @@ class ServiceWatcher include Utils include Logging include StatsD + include PrometheusMetrics attr_reader :was_up diff --git a/nerve.gemspec b/nerve.gemspec index d319346c..1bb797db 100644 --- a/nerve.gemspec +++ b/nerve.gemspec @@ -25,6 +25,8 @@ Gem::Specification.new do |gem| gem.add_runtime_dependency "redis", "~> 5.0" gem.add_runtime_dependency "etcd", "~> 0.3" gem.add_runtime_dependency "dogstatsd-ruby", "~> 5.0" + gem.add_runtime_dependency "prometheus-client", "~> 4.0" + gem.add_runtime_dependency "webrick" gem.add_development_dependency "rake" gem.add_development_dependency "rspec", "~> 3.13" diff --git a/spec/lib/nerve/prometheus_metrics_spec.rb b/spec/lib/nerve/prometheus_metrics_spec.rb new file mode 100644 index 00000000..f195b0fe --- /dev/null +++ b/spec/lib/nerve/prometheus_metrics_spec.rb @@ -0,0 +1,116 @@ +require "spec_helper" +require "nerve/prometheus_metrics" + +describe Nerve::PrometheusMetrics do + let(:test_class) do + Class.new do + include Nerve::PrometheusMetrics + end + end + let(:instance) { test_class.new } + + after(:each) do + Nerve::PrometheusMetrics.stop_server + Nerve::PrometheusMetrics.class_variable_set(:@@prom_enabled, false) + Nerve::PrometheusMetrics.class_variable_set(:@@prom_registry, nil) + Nerve::PrometheusMetrics.class_variable_set(:@@prom_metrics, {}) + Nerve::PrometheusMetrics.class_variable_set(:@@prom_server, nil) + end + + def configure_without_server(opts = {}) + allow(Nerve::PrometheusMetrics).to receive(:start_server) + Nerve::PrometheusMetrics.configure({"enabled" => true}.merge(opts)) + end + + describe ".configure" do + it "does nothing when opts is nil" do + Nerve::PrometheusMetrics.configure(nil) + expect(Nerve::PrometheusMetrics.enabled?).to be false + end + + it "does nothing when enabled is false" do + Nerve::PrometheusMetrics.configure({"enabled" => false}) + expect(Nerve::PrometheusMetrics.enabled?).to be false + end + + it "does nothing when opts is empty hash" do + Nerve::PrometheusMetrics.configure({}) + expect(Nerve::PrometheusMetrics.enabled?).to be false + end + + it "enables metrics when enabled is true" do + configure_without_server + expect(Nerve::PrometheusMetrics.enabled?).to be true + end + + it "creates a registry" do + configure_without_server + expect(Nerve::PrometheusMetrics.registry).to be_a(Prometheus::Client::Registry) + end + + it "sets build_info with version" do + configure_without_server + metric = Nerve::PrometheusMetrics.metrics[:build_info] + expect(metric.get(labels: {version: Nerve::VERSION})).to eq(1) + end + end + + describe "instance helpers when disabled" do + it "prom_inc is a no-op" do + expect { instance.prom_inc(:config_reloads_total) }.not_to raise_error + end + + it "prom_set is a no-op" do + expect { instance.prom_set(:watchers_desired, 5) }.not_to raise_error + end + + it "prom_observe is a no-op" do + expect { instance.prom_observe(:main_loop_duration_seconds, 1.0) }.not_to raise_error + end + end + + describe "instance helpers when enabled" do + before(:each) do + configure_without_server + end + + it "prom_inc ignores unknown metrics" do + expect { instance.prom_inc(:nonexistent_metric) }.not_to raise_error + end + + it "prom_set ignores unknown metrics" do + expect { instance.prom_set(:nonexistent_metric, 1) }.not_to raise_error + end + + it "prom_observe ignores unknown metrics" do + expect { instance.prom_observe(:nonexistent_metric, 1.0) }.not_to raise_error + end + end + + describe "HTTP server" do + it "serves /metrics endpoint" do + Nerve::PrometheusMetrics.configure({"enabled" => true, "port" => 19297}) + sleep 0.2 + + require "net/http" + response = Net::HTTP.get_response("127.0.0.1", "/metrics", 19297) + expect(response.code).to eq("200") + expect(response["content-type"]).to include("text/plain") + expect(response.body).to include("nerve_build_info") + end + end + + describe ".stop_server" do + it "stops the server cleanly" do + mock_server = double("WEBrick::HTTPServer") + expect(mock_server).to receive(:shutdown) + Nerve::PrometheusMetrics.class_variable_set(:@@prom_server, mock_server) + expect { Nerve::PrometheusMetrics.stop_server }.not_to raise_error + expect(Nerve::PrometheusMetrics.class_variable_get(:@@prom_server)).to be_nil + end + + it "is safe to call when no server is running" do + expect { Nerve::PrometheusMetrics.stop_server }.not_to raise_error + end + end +end diff --git a/spec/lib/nerve_spec.rb b/spec/lib/nerve_spec.rb index 30c917af..9372fc5c 100644 --- a/spec/lib/nerve_spec.rb +++ b/spec/lib/nerve_spec.rb @@ -80,6 +80,14 @@ def make_mock_service_watcher } } + def reset_prometheus_state! + Nerve::PrometheusMetrics.stop_server + Nerve::PrometheusMetrics.class_variable_set(:@@prom_enabled, false) + Nerve::PrometheusMetrics.class_variable_set(:@@prom_registry, nil) + Nerve::PrometheusMetrics.class_variable_set(:@@prom_metrics, {}) + Nerve::PrometheusMetrics.class_variable_set(:@@prom_server, nil) + end + it "does a regular run and finishes" do nerve = Nerve::Nerve.new(mock_config_manager) @@ -90,6 +98,100 @@ def make_mock_service_watcher expect { nerve.run }.not_to raise_error end + it "enables prometheus after reload when initially disabled" do + allow(Nerve::PrometheusMetrics).to receive(:start_server) + reset_prometheus_state! + + services_config = { + "service1" => { + "host" => "localhost", + "port" => 1234 + }, + "service2" => { + "host" => "localhost", + "port" => 1235 + } + } + disabled_config = { + "instance_id" => nerve_instance_id, + "services" => services_config, + "prometheus" => {"enabled" => false} + } + enabled_config = { + "instance_id" => nerve_instance_id, + "services" => services_config, + "prometheus" => {"enabled" => true, "port" => 19297} + } + current_config = disabled_config + allow(mock_config_manager).to receive(:config) { current_config } + + nerve = Nerve::Nerve.new(mock_config_manager) + iterations = 1 + expect(nerve).to receive(:heartbeat).exactly(iterations + 1).times do + if iterations == 1 + expect(Nerve::PrometheusMetrics.enabled?).to be false + current_config = enabled_config + nerve.instance_variable_set(:@config_to_load, true) + else + expect(Nerve::PrometheusMetrics.enabled?).to be true + $EXIT = true + end + iterations -= 1 + end + + expect { nerve.run }.not_to raise_error + expect(Nerve::PrometheusMetrics.enabled?).to be true + ensure + reset_prometheus_state! + end + + it "disables prometheus after reload when config disables" do + allow(Nerve::PrometheusMetrics).to receive(:start_server) + reset_prometheus_state! + + services_config = { + "service1" => { + "host" => "localhost", + "port" => 1234 + }, + "service2" => { + "host" => "localhost", + "port" => 1235 + } + } + enabled_config = { + "instance_id" => nerve_instance_id, + "services" => services_config, + "prometheus" => {"enabled" => true, "port" => 19297} + } + disabled_config = { + "instance_id" => nerve_instance_id, + "services" => services_config, + "prometheus" => {"enabled" => false} + } + current_config = enabled_config + allow(mock_config_manager).to receive(:config) { current_config } + + nerve = Nerve::Nerve.new(mock_config_manager) + iterations = 1 + expect(nerve).to receive(:heartbeat).exactly(iterations + 1).times do + if iterations == 1 + expect(Nerve::PrometheusMetrics.enabled?).to be true + current_config = disabled_config + nerve.instance_variable_set(:@config_to_load, true) + else + expect(Nerve::PrometheusMetrics.enabled?).to be false + $EXIT = true + end + iterations -= 1 + end + + expect { nerve.run }.not_to raise_error + expect(Nerve::PrometheusMetrics.enabled?).to be false + ensure + reset_prometheus_state! + end + it "relaunches dead watchers" do nerve = Nerve::Nerve.new(mock_config_manager) From 44aa22e3004d583951bdf4eac804b638bbc76fcb Mon Sep 17 00:00:00 2001 From: Robert Johnson Date: Fri, 6 Feb 2026 07:15:03 -0800 Subject: [PATCH 2/3] feat: add Prometheus metrics for nerve core Add watcher gauges (desired/running/up/down, repeated_report_failures_max), counters (config_reloads, watcher_launches/stops/throttled, report_results, reporter_ping_results), and main_loop_duration histogram. Expose repeated_report_failures attr_reader on ServiceWatcher for main loop aggregation. update_prom_gauges recomputes aggregate state each iteration. Co-Authored-By: Claude Opus 4.6 --- lib/nerve.rb | 36 ++++++++++++++ lib/nerve/prometheus_metrics.rb | 59 +++++++++++++++++++++++ lib/nerve/service_watcher.rb | 28 +++++++---- spec/lib/nerve/prometheus_metrics_spec.rb | 59 +++++++++++++++++++++++ spec/lib/nerve_spec.rb | 1 + 5 files changed, 174 insertions(+), 9 deletions(-) diff --git a/lib/nerve.rb b/lib/nerve.rb index 9a602ef8..9c974d59 100644 --- a/lib/nerve.rb +++ b/lib/nerve.rb @@ -85,6 +85,7 @@ def load_config! @heartbeat_path = config["heartbeat_path"] StatsD.configure_statsd(config["statsd"] || {}) statsd.increment("nerve.config.update") + prom_inc(:config_reloads_total) end def run @@ -93,6 +94,8 @@ def run statsd.time("nerve.main_loop.elapsed_time") do until $EXIT + main_loop_start = Time.now + # Poll overlay file mtime to detect config changes without SIGHUP current_overlay_mtime = @config_manager.overlay_mtime if current_overlay_mtime != @last_overlay_mtime @@ -128,6 +131,7 @@ def run log.info "nerve: launching new watchers: #{services_to_launch}" services_to_launch.each do |name| statsd.increment("nerve.watcher.launch", tags: ["launch_reason:new", "watcher_name:#{name}"]) + prom_inc(:watcher_launches_total, labels: {reason: "new"}) launch_watcher(name, @watchers_desired[name]) end end @@ -150,6 +154,7 @@ def run @watcher_versions[temp_name] = @watcher_versions.delete(name) log.info "nerve: launching new watcher for #{name}" statsd.increment("nerve.watcher.launch", tags: ["launch_reason:update", "watcher_name:#{name}"]) + prom_inc(:watcher_launches_total, labels: {reason: "update"}) launch_watcher(name, @watchers_desired[name], wait: true) log.info "nerve: reaping old watcher #{temp_name}" statsd.increment("nerve.watcher.reap", tags: ["reap_reason:update", "watcher_name:#{temp_name}"]) @@ -193,12 +198,16 @@ def run statsd.increment("nerve.watcher.reap", tags: ["reap_reason:relaunch", "reap_result:fail", "watcher_name:#{name}", "exception_name:#{e.class.name}", "exception_message:#{e.message}"]) end statsd.increment("nerve.watcher.launch", tags: ["launch_reason:relaunch", "watcher_name:#{name}"]) + prom_inc(:watcher_launches_total, labels: {reason: "relaunch"}) launch_watcher(name, @watchers_desired[name]) end + update_prom_gauges + # Indicate we've made progress heartbeat + prom_observe(:main_loop_duration_seconds, Time.now - main_loop_start) responsive_sleep(MAIN_LOOP_SLEEP_S) { @config_to_load || $EXIT } end rescue => e @@ -269,5 +278,32 @@ def reap_watcher(name) log.info "nerve: stopped #{name}, clean shutdown? #{shutdown_status}" shutdown_status end + + def update_prom_gauges + return unless PrometheusMetrics.enabled? + + prom_set(:watchers_desired, @watchers_desired.size) + prom_set(:watchers_running, @watchers.size) + + up_count = 0 + down_count = 0 + max_failures = 0 + + @watchers.each do |_name, watcher| + case watcher.was_up + when true + up_count += 1 + when false + down_count += 1 + end + + failures = watcher.repeated_report_failures + max_failures = failures if failures > max_failures + end + + prom_set(:watchers_up, up_count) + prom_set(:watchers_down, down_count) + prom_set(:repeated_report_failures_max, max_failures) + end end end diff --git a/lib/nerve/prometheus_metrics.rb b/lib/nerve/prometheus_metrics.rb index 9e93ed41..fd36f9a3 100644 --- a/lib/nerve/prometheus_metrics.rb +++ b/lib/nerve/prometheus_metrics.rb @@ -66,6 +66,65 @@ def disable! private def register_metrics(zk_buckets: HISTOGRAM_BUCKETS_ZK, main_loop_buckets: HISTOGRAM_BUCKETS_MAIN_LOOP) + # Gauges + @@prom_metrics[:watchers_desired] = @@prom_registry.gauge( + :nerve_watchers_desired, + docstring: "Number of service watchers desired from config" + ) + @@prom_metrics[:watchers_running] = @@prom_registry.gauge( + :nerve_watchers_running, + docstring: "Number of service watchers currently running" + ) + @@prom_metrics[:watchers_up] = @@prom_registry.gauge( + :nerve_watchers_up, + docstring: "Number of service watchers currently reporting up" + ) + @@prom_metrics[:watchers_down] = @@prom_registry.gauge( + :nerve_watchers_down, + docstring: "Number of service watchers currently reporting down" + ) + @@prom_metrics[:repeated_report_failures_max] = @@prom_registry.gauge( + :nerve_repeated_report_failures_max, + docstring: "Worst-case repeated report failure count across all watchers" + ) + + # Counters + @@prom_metrics[:report_results_total] = @@prom_registry.counter( + :nerve_report_results_total, + docstring: "Total report up/down attempts and results", + labels: [:action, :result] + ) + @@prom_metrics[:reporter_ping_results_total] = @@prom_registry.counter( + :nerve_reporter_ping_results_total, + docstring: "Total reporter ping results", + labels: [:result] + ) + @@prom_metrics[:watcher_stops_total] = @@prom_registry.counter( + :nerve_watcher_stops_total, + docstring: "Total watcher stop events", + labels: [:reason] + ) + @@prom_metrics[:watcher_launches_total] = @@prom_registry.counter( + :nerve_watcher_launches_total, + docstring: "Total watcher launch events", + labels: [:reason] + ) + @@prom_metrics[:watcher_throttled_total] = @@prom_registry.counter( + :nerve_watcher_throttled_total, + docstring: "Total watcher throttle events" + ) + @@prom_metrics[:config_reloads_total] = @@prom_registry.counter( + :nerve_config_reloads_total, + docstring: "Total configuration reloads" + ) + + # Histograms + @@prom_metrics[:main_loop_duration_seconds] = @@prom_registry.histogram( + :nerve_main_loop_duration_seconds, + docstring: "Duration of main loop iterations in seconds", + buckets: main_loop_buckets + ) + # Info @@prom_metrics[:build_info] = @@prom_registry.gauge( :nerve_build_info, diff --git a/lib/nerve/service_watcher.rb b/lib/nerve/service_watcher.rb index 10671107..7b824348 100644 --- a/lib/nerve/service_watcher.rb +++ b/lib/nerve/service_watcher.rb @@ -12,7 +12,7 @@ class ServiceWatcher include StatsD include PrometheusMetrics - attr_reader :was_up + attr_reader :was_up, :repeated_report_failures def initialize(service = {}) log.debug "nerve: creating service watcher object" @@ -79,6 +79,7 @@ def initialize(service = {}) @should_finish = false @max_repeated_report_failures = service["max_repeated_report_failures"] || 10 + @repeated_report_failures = 0 log.debug "nerve: created service watcher for #{@name} with #{@service_checks.size} checks" end @@ -115,15 +116,15 @@ def run @reporter.start - repeated_report_failures = 0 - until watcher_should_exit? || repeated_report_failures >= @max_repeated_report_failures + @repeated_report_failures = 0 + until watcher_should_exit? || @repeated_report_failures >= @max_repeated_report_failures report_succeeded = check_and_report case report_succeeded when true - repeated_report_failures = 0 + @repeated_report_failures = 0 when false - repeated_report_failures += 1 + @repeated_report_failures += 1 when nil # this case exists for when the request is throttled # do nothing @@ -136,13 +137,16 @@ def run responsive_sleep(@check_interval) { watcher_should_exit? } end - if repeated_report_failures >= @max_repeated_report_failures + if @repeated_report_failures >= @max_repeated_report_failures statsd.increment("nerve.watcher.stop", tags: ["stop_avenue:failure", "stop_location:main_loop", "service_name:#{@name}"]) + prom_inc(:watcher_stops_total, labels: {reason: "failure"}) else statsd.increment("nerve.watcher.stop", tags: ["stop_avenue:clean", "stop_location:main_loop", "service_name:#{@name}"]) + prom_inc(:watcher_stops_total, labels: {reason: "clean"}) end rescue => e statsd.increment("nerve.watcher.stop", tags: ["stop_avenue:abort", "stop_location:main_loop", "service_name:#{@name}", "exception_name:#{e.class.name}", "exception_message:#{e.message}"]) + prom_inc(:watcher_stops_total, labels: {reason: "abort"}) log.error "nerve: error in service watcher #{@name}: #{e.inspect}" raise e ensure @@ -153,6 +157,7 @@ def run def check_and_report if !@reporter.ping? statsd.increment("nerve.watcher.status.ping.count", tags: ["ping_result:fail", "service_name:#{@name}"]) + prom_inc(:reporter_ping_results_total, labels: {result: "fail"}) # If the reporter can't ping, then we do not know the status and must force a new report. # We will also skip checking service status since it couldn't be reported @@ -160,6 +165,7 @@ def check_and_report return false end statsd.increment("nerve.watcher.status.ping.count", tags: ["ping_result:success", "service_name:#{@name}"]) + prom_inc(:reporter_ping_results_total, labels: {result: "success"}) # what is the status of the service? is_up = check? @@ -170,6 +176,7 @@ def check_and_report if !@rate_limiter.consume log.warn "nerve: service #{@name} throttled (shadow mode: #{@rate_limit_shadow_mode})" statsd.increment("nerve.watcher.throttled", tags: ["service_name:#{@name}", "shadow_mode:#{@rate_limit_shadow_mode}"]) + prom_inc(:watcher_throttled_total) unless @rate_limit_shadow_mode # When the request is throttled, ensure that the status is reported @@ -187,15 +194,19 @@ def check_and_report report_succeeded = @reporter.report_up if report_succeeded log.info "nerve: service #{@name} is now up" + prom_inc(:report_results_total, labels: {action: "up", result: "success"}) else log.warn "nerve: service #{@name} failed to report up" + prom_inc(:report_results_total, labels: {action: "up", result: "fail"}) end else report_succeeded = @reporter.report_down if report_succeeded log.warn "nerve: service #{@name} is now down" + prom_inc(:report_results_total, labels: {action: "down", result: "success"}) else log.warn "nerve: service #{@name} failed to report down" + prom_inc(:report_results_total, labels: {action: "down", result: "fail"}) end end @@ -213,9 +224,8 @@ def check_and_report end def check? - if @check_mocked - return true - end + return true if @check_mocked + @service_checks.each do |check| up = check.up? statsd.increment("nerve.watcher.status.service_check", tags: ["check_result:#{up ? "up" : "down"}", "service_name:#{@name}", "check_name:#{check.name}"]) diff --git a/spec/lib/nerve/prometheus_metrics_spec.rb b/spec/lib/nerve/prometheus_metrics_spec.rb index f195b0fe..94fd7b58 100644 --- a/spec/lib/nerve/prometheus_metrics_spec.rb +++ b/spec/lib/nerve/prometheus_metrics_spec.rb @@ -48,6 +48,41 @@ def configure_without_server(opts = {}) expect(Nerve::PrometheusMetrics.registry).to be_a(Prometheus::Client::Registry) end + it "registers all expected metrics" do + configure_without_server + metrics = Nerve::PrometheusMetrics.metrics + + # Gauges + expect(metrics[:watchers_desired]).to be_a(Prometheus::Client::Gauge) + expect(metrics[:watchers_running]).to be_a(Prometheus::Client::Gauge) + expect(metrics[:watchers_up]).to be_a(Prometheus::Client::Gauge) + expect(metrics[:watchers_down]).to be_a(Prometheus::Client::Gauge) + expect(metrics[:repeated_report_failures_max]).to be_a(Prometheus::Client::Gauge) + + # Counters + expect(metrics[:report_results_total]).to be_a(Prometheus::Client::Counter) + expect(metrics[:reporter_ping_results_total]).to be_a(Prometheus::Client::Counter) + expect(metrics[:watcher_stops_total]).to be_a(Prometheus::Client::Counter) + expect(metrics[:watcher_launches_total]).to be_a(Prometheus::Client::Counter) + expect(metrics[:watcher_throttled_total]).to be_a(Prometheus::Client::Counter) + expect(metrics[:config_reloads_total]).to be_a(Prometheus::Client::Counter) + + # Histograms + expect(metrics[:main_loop_duration_seconds]).to be_a(Prometheus::Client::Histogram) + + # Info + expect(metrics[:build_info]).to be_a(Prometheus::Client::Gauge) + end + + it "accepts custom histogram buckets" do + configure_without_server( + "histogram_buckets_zk" => [0.01, 0.1, 1.0], + "histogram_buckets_main_loop" => [0.1, 1.0, 10.0] + ) + metrics = Nerve::PrometheusMetrics.metrics + expect(metrics[:main_loop_duration_seconds]).to be_a(Prometheus::Client::Histogram) + end + it "sets build_info with version" do configure_without_server metric = Nerve::PrometheusMetrics.metrics[:build_info] @@ -74,6 +109,30 @@ def configure_without_server(opts = {}) configure_without_server end + it "prom_inc increments a counter" do + instance.prom_inc(:config_reloads_total) + metric = Nerve::PrometheusMetrics.metrics[:config_reloads_total] + expect(metric.get).to eq(1.0) + + instance.prom_inc(:config_reloads_total) + expect(metric.get).to eq(2.0) + end + + it "prom_set sets a gauge" do + instance.prom_set(:watchers_desired, 5) + metric = Nerve::PrometheusMetrics.metrics[:watchers_desired] + expect(metric.get).to eq(5) + + instance.prom_set(:watchers_desired, 3) + expect(metric.get).to eq(3) + end + + it "prom_observe records a histogram observation" do + instance.prom_observe(:main_loop_duration_seconds, 0.5) + metric = Nerve::PrometheusMetrics.metrics[:main_loop_duration_seconds] + expect(metric.get["sum"]).to eq(0.5) + end + it "prom_inc ignores unknown metrics" do expect { instance.prom_inc(:nonexistent_metric) }.not_to raise_error end diff --git a/spec/lib/nerve_spec.rb b/spec/lib/nerve_spec.rb index 9372fc5c..1e853c58 100644 --- a/spec/lib/nerve_spec.rb +++ b/spec/lib/nerve_spec.rb @@ -11,6 +11,7 @@ def make_mock_service_watcher allow(mock_service_watcher).to receive(:stop) allow(mock_service_watcher).to receive(:alive?).and_return(true) allow(mock_service_watcher).to receive(:was_up).and_return(true) + allow(mock_service_watcher).to receive(:repeated_report_failures).and_return(0) mock_service_watcher end From cbfb774dea23d898263146b92991db08e84cc6d2 Mon Sep 17 00:00:00 2001 From: Robert Johnson Date: Fri, 6 Feb 2026 07:17:59 -0800 Subject: [PATCH 3/3] feat: add Prometheus metrics for ZK reporter Add zk_connected gauge (1/0 per cluster), zk_pool_size gauge, zk_write_failures_total counter (primary alerting metric), and zk_operation_duration_seconds histogram for create/save/delete ops. Co-Authored-By: Claude Opus 4.6 --- lib/nerve/prometheus_metrics.rb | 28 ++++++++++++++++++++ lib/nerve/reporter/zookeeper.rb | 31 +++++++++++++++++------ spec/lib/nerve/prometheus_metrics_spec.rb | 21 +++++++++++++++ 3 files changed, 72 insertions(+), 8 deletions(-) diff --git a/lib/nerve/prometheus_metrics.rb b/lib/nerve/prometheus_metrics.rb index fd36f9a3..6bf2a8a7 100644 --- a/lib/nerve/prometheus_metrics.rb +++ b/lib/nerve/prometheus_metrics.rb @@ -87,6 +87,16 @@ def register_metrics(zk_buckets: HISTOGRAM_BUCKETS_ZK, main_loop_buckets: HISTOG :nerve_repeated_report_failures_max, docstring: "Worst-case repeated report failure count across all watchers" ) + @@prom_metrics[:zk_connected] = @@prom_registry.gauge( + :nerve_zk_connected, + docstring: "Whether ZK connection is alive (1=connected, 0=disconnected)", + labels: [:zk_cluster] + ) + @@prom_metrics[:zk_pool_size] = @@prom_registry.gauge( + :nerve_zk_pool_size, + docstring: "Number of watchers sharing each ZK connection pool", + labels: [:zk_cluster] + ) # Counters @@prom_metrics[:report_results_total] = @@prom_registry.counter( @@ -94,6 +104,11 @@ def register_metrics(zk_buckets: HISTOGRAM_BUCKETS_ZK, main_loop_buckets: HISTOG docstring: "Total report up/down attempts and results", labels: [:action, :result] ) + @@prom_metrics[:zk_write_failures_total] = @@prom_registry.counter( + :nerve_zk_write_failures_total, + docstring: "Total ZK write failures (primary alerting metric)", + labels: [:zk_cluster, :operation] + ) @@prom_metrics[:reporter_ping_results_total] = @@prom_registry.counter( :nerve_reporter_ping_results_total, docstring: "Total reporter ping results", @@ -119,6 +134,12 @@ def register_metrics(zk_buckets: HISTOGRAM_BUCKETS_ZK, main_loop_buckets: HISTOG ) # Histograms + @@prom_metrics[:zk_operation_duration_seconds] = @@prom_registry.histogram( + :nerve_zk_operation_duration_seconds, + docstring: "Duration of ZK operations in seconds", + labels: [:zk_cluster, :operation], + buckets: zk_buckets + ) @@prom_metrics[:main_loop_duration_seconds] = @@prom_registry.histogram( :nerve_main_loop_duration_seconds, docstring: "Duration of main loop iterations in seconds", @@ -172,5 +193,12 @@ def prom_observe(metric_name, value, labels: {}) return unless metric metric.observe(value, labels: labels) end + + def prom_time(metric_name, labels: {}) + start = Process.clock_gettime(Process::CLOCK_MONOTONIC) + result = yield + prom_observe(metric_name, Process.clock_gettime(Process::CLOCK_MONOTONIC) - start, labels: labels) + result + end end end diff --git a/lib/nerve/reporter/zookeeper.rb b/lib/nerve/reporter/zookeeper.rb index 3c8cc3f7..09df7c43 100644 --- a/lib/nerve/reporter/zookeeper.rb +++ b/lib/nerve/reporter/zookeeper.rb @@ -48,6 +48,7 @@ def start statsd.increment("nerve.reporter.zk.client.created", tags: ["zk_cluster:#{@zk_cluster}"]) end @zk = @@zk_pool[@zk_connection_string] + prom_set(:zk_pool_size, @@zk_pool_count[@zk_connection_string], labels: {zk_cluster: @zk_cluster}) log.info "nerve: retrieved zk connection to #{@zk_connection_string}" } end @@ -57,6 +58,7 @@ def stop ensure @@zk_pool_lock.synchronize { @@zk_pool_count[@zk_connection_string] -= 1 + prom_set(:zk_pool_size, @@zk_pool_count[@zk_connection_string], labels: {zk_cluster: @zk_cluster}) # Last thread to use the connection closes it if @@zk_pool_count[@zk_connection_string] == 0 log.info "nerve: closing zk connection to #{@zk_connection_string}" @@ -72,12 +74,14 @@ def stop def report_up if !@zk.connected? log.error "nerve: error in reporting up on zk node #{@full_key}: loss connection" + prom_inc(:zk_write_failures_total, labels: {zk_cluster: @zk_cluster, operation: "save"}) false else begin zk_save rescue *ZK_CONNECTION_ERRORS => e log.error "nerve: error in reporting up on zk node #{@full_key}: #{e.message}" + prom_inc(:zk_write_failures_total, labels: {zk_cluster: @zk_cluster, operation: "save"}) return false end @@ -88,12 +92,14 @@ def report_up def report_down if !@zk.connected? log.error "nerve: error in reporting down on zk node #{@full_key}: loss connection" + prom_inc(:zk_write_failures_total, labels: {zk_cluster: @zk_cluster, operation: "delete"}) false else begin zk_delete rescue *ZK_CONNECTION_ERRORS => e log.error "nerve: error in reporting down on zk node #{@full_key}: #{e.message}" + prom_inc(:zk_write_failures_total, labels: {zk_cluster: @zk_cluster, operation: "delete"}) return false end @@ -104,12 +110,15 @@ def report_down def ping? if !@zk.connected? log.error "nerve: error in ping reporter at zk node #{@full_key}: loss connection" + prom_set(:zk_connected, 0, labels: {zk_cluster: @zk_cluster}) false else + prom_set(:zk_connected, 1, labels: {zk_cluster: @zk_cluster}) begin @zk.exists?(@full_key || "/") rescue *ZK_CONNECTION_ERRORS => e log.error "nerve: error in ping reporter at zk node #{@full_key}: #{e.message}" + prom_set(:zk_connected, 0, labels: {zk_cluster: @zk_cluster}) false end end @@ -142,8 +151,10 @@ def zk_delete if @full_key log.info "nerve: deleting zk node at #{@full_key}" if @full_key - statsd.time("nerve.reporter.zk.delete.elapsed_time", tags: ["zk_cluster:#{@zk_cluster}"]) do - @zk.delete(@full_key, ignore: :no_node) + prom_time(:zk_operation_duration_seconds, labels: {zk_cluster: @zk_cluster, operation: "delete"}) do + statsd.time("nerve.reporter.zk.delete.elapsed_time", tags: ["zk_cluster:#{@zk_cluster}"]) do + @zk.delete(@full_key, ignore: :no_node) + end end @full_key = nil else @@ -155,9 +166,11 @@ def zk_create log.info "nerve: creating zk node at #{@key_prefix}" # only mkdir_p if the path does not exist - statsd.time("nerve.reporter.zk.create.elapsed_time", tags: ["zk_cluster:#{@zk_cluster}", "zk_path:#{@zk_path}"]) do - @zk.mkdir_p(@zk_path) unless @zk.exists?(@zk_path) - @full_key = @zk.create(@key_prefix, data: @data, mode: :ephemeral_sequential) + prom_time(:zk_operation_duration_seconds, labels: {zk_cluster: @zk_cluster, operation: "create"}) do + statsd.time("nerve.reporter.zk.create.elapsed_time", tags: ["zk_cluster:#{@zk_cluster}", "zk_path:#{@zk_path}"]) do + @zk.mkdir_p(@zk_path) unless @zk.exists?(@zk_path) + @full_key = @zk.create(@key_prefix, data: @data, mode: :ephemeral_sequential) + end end end @@ -165,9 +178,11 @@ def zk_save return zk_create unless @full_key begin - statsd.time("nerve.reporter.zk.save.elapsed_time", tags: ["zk_cluster:#{@zk_cluster}"]) do - log.info "nerve: updating zk node at #{@key_prefix}" - @zk.set(@full_key, @data) + prom_time(:zk_operation_duration_seconds, labels: {zk_cluster: @zk_cluster, operation: "save"}) do + statsd.time("nerve.reporter.zk.save.elapsed_time", tags: ["zk_cluster:#{@zk_cluster}"]) do + log.info "nerve: updating zk node at #{@key_prefix}" + @zk.set(@full_key, @data) + end end rescue ZK::Exceptions::NoNode zk_create diff --git a/spec/lib/nerve/prometheus_metrics_spec.rb b/spec/lib/nerve/prometheus_metrics_spec.rb index 94fd7b58..c5c561dd 100644 --- a/spec/lib/nerve/prometheus_metrics_spec.rb +++ b/spec/lib/nerve/prometheus_metrics_spec.rb @@ -58,16 +58,20 @@ def configure_without_server(opts = {}) expect(metrics[:watchers_up]).to be_a(Prometheus::Client::Gauge) expect(metrics[:watchers_down]).to be_a(Prometheus::Client::Gauge) expect(metrics[:repeated_report_failures_max]).to be_a(Prometheus::Client::Gauge) + expect(metrics[:zk_connected]).to be_a(Prometheus::Client::Gauge) + expect(metrics[:zk_pool_size]).to be_a(Prometheus::Client::Gauge) # Counters expect(metrics[:report_results_total]).to be_a(Prometheus::Client::Counter) expect(metrics[:reporter_ping_results_total]).to be_a(Prometheus::Client::Counter) + expect(metrics[:zk_write_failures_total]).to be_a(Prometheus::Client::Counter) expect(metrics[:watcher_stops_total]).to be_a(Prometheus::Client::Counter) expect(metrics[:watcher_launches_total]).to be_a(Prometheus::Client::Counter) expect(metrics[:watcher_throttled_total]).to be_a(Prometheus::Client::Counter) expect(metrics[:config_reloads_total]).to be_a(Prometheus::Client::Counter) # Histograms + expect(metrics[:zk_operation_duration_seconds]).to be_a(Prometheus::Client::Histogram) expect(metrics[:main_loop_duration_seconds]).to be_a(Prometheus::Client::Histogram) # Info @@ -80,6 +84,7 @@ def configure_without_server(opts = {}) "histogram_buckets_main_loop" => [0.1, 1.0, 10.0] ) metrics = Nerve::PrometheusMetrics.metrics + expect(metrics[:zk_operation_duration_seconds]).to be_a(Prometheus::Client::Histogram) expect(metrics[:main_loop_duration_seconds]).to be_a(Prometheus::Client::Histogram) end @@ -133,6 +138,22 @@ def configure_without_server(opts = {}) expect(metric.get["sum"]).to eq(0.5) end + it "prom_time records duration and returns the block result" do + allow(Process).to receive(:clock_gettime) + .with(Process::CLOCK_MONOTONIC) + .and_return(10.0, 12.5) + + result = instance.prom_time( + :zk_operation_duration_seconds, + labels: {zk_cluster: "zk", operation: "save"} + ) { :ok } + + metric = Nerve::PrometheusMetrics.metrics[:zk_operation_duration_seconds] + expect(metric.get(labels: {zk_cluster: "zk", operation: "save"})["sum"]) + .to be_within(0.0001).of(2.5) + expect(result).to eq(:ok) + end + it "prom_inc ignores unknown metrics" do expect { instance.prom_inc(:nonexistent_metric) }.not_to raise_error end