Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ PATH
dogstatsd-ruby (~> 5.0)
etcd (~> 0.3)
json
prometheus-client (~> 4.0)
redis (~> 5.0)
webrick
zk (~> 1.10)

GEM
Expand Down Expand Up @@ -70,6 +72,8 @@ GEM
ast (~> 2.4.1)
racc
prism (1.7.0)
prometheus-client (4.2.5)
base64
pry (0.16.0)
coderay (~> 1.1)
method_source (~> 1.0)
Expand Down Expand Up @@ -140,6 +144,7 @@ GEM
unicode-emoji (~> 4.1)
unicode-emoji (4.2.0)
uri (1.1.1)
webrick (1.9.2)
zk (1.10.0)
zookeeper (~> 1.5.0)
zookeeper (1.5.5)
Expand Down Expand Up @@ -204,6 +209,7 @@ CHECKSUMS
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
parser (3.3.10.0) sha256=ce3587fa5cc55a88c4ba5b2b37621b3329aadf5728f9eafa36bbd121462aabd6
prism (1.7.0) sha256=10062f734bf7985c8424c44fac382ac04a58124ea3d220ec3ba9fe4f2da65103
prometheus-client (4.2.5) sha256=807bebc3e92ccd9f4d814d90be15e85338f3708d227badf80924ef3f6e7d5225
pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
Expand Down Expand Up @@ -232,6 +238,7 @@ CHECKSUMS
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
webrick (1.9.2) sha256=beb4a15fc474defed24a3bda4ffd88a490d517c9e4e6118c3edce59e45864131
zk (1.10.0) sha256=e7151a665d6f5974a8d569d1b7a28c7c0110e4679b8f834fa54429dd4cbb569f
zookeeper (1.5.5) sha256=6fcf3b2ac40158968bead2425abf9ca225db673014aba0f4a4a3b057d73d88c1

Expand Down
7 changes: 7 additions & 0 deletions example/nerve.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@
"host": "localhost",
"port": 8125
},
"prometheus": {
"enabled": true,
"port": 9292,
"bind": "0.0.0.0",
"histogram_buckets_zk": [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0],
"histogram_buckets_main_loop": [0.001, 0.01, 0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0]
},
"services": {
"your_http_service": {
"host": "1.2.3.4",
Expand Down
49 changes: 49 additions & 0 deletions lib/nerve.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
require "nerve/utils"
require "nerve/log"
require "nerve/statsd"
require "nerve/prometheus_metrics"
require "nerve/ring_buffer"
require "nerve/reporter"
require "nerve/service_watcher"
Expand All @@ -16,6 +17,7 @@ class Nerve
include Logging
include Utils
include StatsD
include PrometheusMetrics

MAIN_LOOP_SLEEP_S = 10
LAUNCH_WAIT_FOR_REPORT_S = 30
Expand Down Expand Up @@ -83,6 +85,7 @@ def load_config!
@heartbeat_path = config["heartbeat_path"]
StatsD.configure_statsd(config["statsd"] || {})
statsd.increment("nerve.config.update")
prom_inc(:config_reloads_total)
end

def run
Expand All @@ -91,6 +94,8 @@ def run

statsd.time("nerve.main_loop.elapsed_time") do
until $EXIT
main_loop_start = Time.now

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

an llm pointed out to me

[Low] main_loop_duration_seconds uses Time.now for elapsed time, which is wall-clock and can jump backward or forward (NTP or clock changes), skewing histogram values. Use Process.clock_gettime(Process::CLOCK_MONOTONIC), as prom_time does. lib/nerve.rb:97,210, lib/nerve/prometheus_metrics.rb:197-201.

That sounds reasonable.
I remember always pointing this out when people were using time.now() in Python.
Maybe it's not the end of the world, as we might only see hiccups during daylight saving time changes, but if it's easy to change, maybe we should?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah good catch - I'll open a PR


# Poll overlay file mtime to detect config changes without SIGHUP
current_overlay_mtime = @config_manager.overlay_mtime
if current_overlay_mtime != @last_overlay_mtime
Expand Down Expand Up @@ -126,6 +131,7 @@ def run
log.info "nerve: launching new watchers: #{services_to_launch}"
services_to_launch.each do |name|
statsd.increment("nerve.watcher.launch", tags: ["launch_reason:new", "watcher_name:#{name}"])
prom_inc(:watcher_launches_total, labels: {reason: "new"})
launch_watcher(name, @watchers_desired[name])
end
end
Expand All @@ -148,6 +154,7 @@ def run
@watcher_versions[temp_name] = @watcher_versions.delete(name)
log.info "nerve: launching new watcher for #{name}"
statsd.increment("nerve.watcher.launch", tags: ["launch_reason:update", "watcher_name:#{name}"])
prom_inc(:watcher_launches_total, labels: {reason: "update"})
launch_watcher(name, @watchers_desired[name], wait: true)
log.info "nerve: reaping old watcher #{temp_name}"
statsd.increment("nerve.watcher.reap", tags: ["reap_reason:update", "watcher_name:#{temp_name}"])
Expand All @@ -161,6 +168,16 @@ def run
break
end

prometheus_config = @config_manager.config["prometheus"] || {}
if prometheus_config["enabled"]
unless @prometheus_started
PrometheusMetrics.configure(prometheus_config)
end
elsif PrometheusMetrics.enabled?
PrometheusMetrics.disable!
end
@prometheus_started = PrometheusMetrics.enabled?

# Check that watchers are still alive, auto-remediate if they
# are not. Sometimes zookeeper flakes out or connections are lost to
# remote datacenter zookeeper clusters, failing is not an option
Expand All @@ -181,12 +198,16 @@ def run
statsd.increment("nerve.watcher.reap", tags: ["reap_reason:relaunch", "reap_result:fail", "watcher_name:#{name}", "exception_name:#{e.class.name}", "exception_message:#{e.message}"])
end
statsd.increment("nerve.watcher.launch", tags: ["launch_reason:relaunch", "watcher_name:#{name}"])
prom_inc(:watcher_launches_total, labels: {reason: "relaunch"})
launch_watcher(name, @watchers_desired[name])
end

update_prom_gauges

# Indicate we've made progress
heartbeat

prom_observe(:main_loop_duration_seconds, Time.now - main_loop_start)
responsive_sleep(MAIN_LOOP_SLEEP_S) { @config_to_load || $EXIT }
end
rescue => e
Expand All @@ -205,6 +226,7 @@ def run
statsd.increment("nerve.stop", tags: ["stop_avenue:clean", "stop_location:main_loop"])
ensure
$EXIT = true
PrometheusMetrics.stop_server
end

def heartbeat
Expand Down Expand Up @@ -256,5 +278,32 @@ def reap_watcher(name)
log.info "nerve: stopped #{name}, clean shutdown? #{shutdown_status}"
shutdown_status
end

# Publishes the watcher-level gauges for the current main-loop iteration.
#
# Does nothing while Prometheus metrics are disabled. Otherwise sets, in order:
# watchers_desired, watchers_running, watchers_up, watchers_down and
# repeated_report_failures_max.
def update_prom_gauges
  return unless PrometheusMetrics.enabled?

  prom_set(:watchers_desired, @watchers_desired.size)
  prom_set(:watchers_running, @watchers.size)

  # Only watchers whose was_up is exactly true or false are counted; any
  # other value (e.g. nil) contributes to neither gauge.
  state_counts = {true => 0, false => 0}
  worst_failures = 0

  @watchers.each do |_name, watcher|
    state = watcher.was_up
    state_counts[state] += 1 if state_counts.key?(state)

    current = watcher.repeated_report_failures
    worst_failures = current if current > worst_failures
  end

  prom_set(:watchers_up, state_counts[true])
  prom_set(:watchers_down, state_counts[false])
  prom_set(:repeated_report_failures_max, worst_failures)
end
end
end
204 changes: 204 additions & 0 deletions lib/nerve/prometheus_metrics.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
require "webrick"
require "prometheus/client"
require "prometheus/client/formats/text"
require "nerve/log"
require "nerve/version"

module Nerve
module PrometheusMetrics
HISTOGRAM_BUCKETS_ZK = [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0].freeze
HISTOGRAM_BUCKETS_MAIN_LOOP = [0.001, 0.01, 0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0].freeze

class << self
include Logging

@@prom_enabled = false
@@prom_registry = nil
@@prom_metrics = {}
@@prom_server = nil

# Whether Prometheus metrics are currently switched on. The flag starts out
# false, is set by configure and cleared by disable!.
def enabled?
@@prom_enabled
end

# The Prometheus::Client::Registry created by configure, or nil when metrics
# have never been configured or have been disabled.
def registry
@@prom_registry
end

# Hash of metric key (Symbol) => registered metric object, filled in by
# register_metrics. Empty until configure has run and after disable!.
def metrics
@@prom_metrics
end

# Enables Prometheus metrics and starts the HTTP exporter.
#
# Does nothing unless +opts+ is a hash whose "enabled" entry is truthy.
# Optional keys read from +opts+:
#   "histogram_buckets_zk"        - buckets for the ZK operation histogram
#                                   (default HISTOGRAM_BUCKETS_ZK)
#   "histogram_buckets_main_loop" - buckets for the main-loop histogram
#                                   (default HISTOGRAM_BUCKETS_MAIN_LOOP)
#   "port"                        - listen port (default 9292)
#   "bind"                        - listen address (default "0.0.0.0")
#
# The module is only marked enabled after the registry has been populated
# and the server has been started. If any of that raises (for example the
# port is already in use), every piece of state is rolled back and the
# original exception is re-raised, so enabled? never reports true without a
# running exporter.
def configure(opts)
  return unless opts && opts["enabled"]

  zk_buckets = opts["histogram_buckets_zk"] || HISTOGRAM_BUCKETS_ZK
  main_loop_buckets = opts["histogram_buckets_main_loop"] || HISTOGRAM_BUCKETS_MAIN_LOOP
  port = opts["port"] || 9292
  bind = opts["bind"] || "0.0.0.0"

  begin
    @@prom_registry = Prometheus::Client::Registry.new
    register_metrics(zk_buckets: zk_buckets, main_loop_buckets: main_loop_buckets)
    start_server(bind, port)
  rescue
    # Undo the partial setup: release any listener that was bound and drop
    # the half-built registry so a later configure starts from scratch.
    stop_server
    @@prom_enabled = false
    @@prom_registry = nil
    @@prom_metrics = {}
    raise
  end

  @@prom_enabled = true
  log.info "nerve: prometheus metrics enabled on #{bind}:#{port}/metrics"
end

# Shuts down the metrics HTTP server, if one is running, and forgets it.
# Calling this with no server present is a no-op.
def stop_server
  return unless @@prom_server

  log.info "nerve: stopping prometheus metrics server"
  @@prom_server.shutdown
  @@prom_server = nil
end

# Turns Prometheus metrics off: stops the HTTP server and resets the
# enabled flag, registry, metric table and server reference to their
# initial values. Does nothing when metrics are not enabled.
def disable!
  if @@prom_enabled
    stop_server
    @@prom_enabled = false
    @@prom_metrics = {}
    @@prom_registry = @@prom_server = nil
  end
end

private

# Creates every nerve metric in the current registry and records it in the
# metric table under its short key.
#
# zk_buckets        - histogram buckets for :zk_operation_duration_seconds
# main_loop_buckets - histogram buckets for :main_loop_duration_seconds
#
# Each metric is exported as "nerve_<key>", e.g. :watchers_up becomes
# :nerve_watchers_up. Registration order matches the listing below.
def register_metrics(zk_buckets: HISTOGRAM_BUCKETS_ZK, main_loop_buckets: HISTOGRAM_BUCKETS_MAIN_LOOP)
  # type is the registry factory method (:gauge, :counter or :histogram);
  # extra carries optional labels:/buckets: keyword arguments.
  add = lambda do |type, key, docstring, **extra|
    @@prom_metrics[key] = @@prom_registry.public_send(type, :"nerve_#{key}", docstring: docstring, **extra)
  end

  # Gauges
  add.call(:gauge, :watchers_desired, "Number of service watchers desired from config")
  add.call(:gauge, :watchers_running, "Number of service watchers currently running")
  add.call(:gauge, :watchers_up, "Number of service watchers currently reporting up")
  add.call(:gauge, :watchers_down, "Number of service watchers currently reporting down")
  add.call(:gauge, :repeated_report_failures_max, "Worst-case repeated report failure count across all watchers")
  add.call(:gauge, :zk_connected, "Whether ZK connection is alive (1=connected, 0=disconnected)",
    labels: [:zk_cluster])
  add.call(:gauge, :zk_pool_size, "Number of watchers sharing each ZK connection pool",
    labels: [:zk_cluster])

  # Counters
  add.call(:counter, :report_results_total, "Total report up/down attempts and results",
    labels: [:action, :result])
  add.call(:counter, :zk_write_failures_total, "Total ZK write failures (primary alerting metric)",
    labels: [:zk_cluster, :operation])
  add.call(:counter, :reporter_ping_results_total, "Total reporter ping results", labels: [:result])
  add.call(:counter, :watcher_stops_total, "Total watcher stop events", labels: [:reason])
  add.call(:counter, :watcher_launches_total, "Total watcher launch events", labels: [:reason])
  add.call(:counter, :watcher_throttled_total, "Total watcher throttle events")
  add.call(:counter, :config_reloads_total, "Total configuration reloads")

  # Histograms
  add.call(:histogram, :zk_operation_duration_seconds, "Duration of ZK operations in seconds",
    labels: [:zk_cluster, :operation], buckets: zk_buckets)
  add.call(:histogram, :main_loop_duration_seconds, "Duration of main loop iterations in seconds",
    buckets: main_loop_buckets)

  # Info: constant-1 gauge whose only purpose is to carry the version label.
  add.call(:gauge, :build_info, "Nerve build information", labels: [:version])
  @@prom_metrics[:build_info].set(1, labels: {version: VERSION})
end

# Starts a WEBrick server exposing the current registry at /metrics.
#
# bind - address to listen on
# port - TCP port to listen on
#
# The server is stored so stop_server can shut it down, and is run on a
# background thread. Returns that Thread.
def start_server(bind, port)
  # Capture the registry so the handler keeps serving this registry even if
  # the class-level reference is replaced later.
  registry = @@prom_registry

  # WEBrick's own log goes to the null device and the access log is given
  # no targets.
  server = WEBrick::HTTPServer.new(
    Port: port,
    BindAddress: bind,
    Logger: WEBrick::Log.new(File::NULL),
    AccessLog: []
  )
  @@prom_server = server

  server.mount_proc "/metrics" do |_req, res|
    res["Content-Type"] = Prometheus::Client::Formats::Text::CONTENT_TYPE
    res.body = Prometheus::Client::Formats::Text.marshal(registry)
  end

  # Run the captured server rather than re-reading @@prom_server inside the
  # thread: stop_server may already have reset it to nil by the time the
  # thread is scheduled, which would raise NoMethodError in the thread.
  Thread.new { server.start }
end
end

# Increments the counter registered under +metric_name+.
#
# metric_name - Symbol key into PrometheusMetrics.metrics
# labels      - label values passed through to the counter (default {})
# by          - increment amount (default 1)
#
# Returns nil without touching anything when metrics are disabled or the
# key is not registered; otherwise returns whatever the counter returns.
def prom_inc(metric_name, labels: {}, by: 1)
  if PrometheusMetrics.enabled?
    counter = PrometheusMetrics.metrics[metric_name]
    counter.increment(labels: labels, by: by) if counter
  end
end

# Sets the gauge registered under +metric_name+ to +value+.
#
# metric_name - Symbol key into PrometheusMetrics.metrics
# value       - new gauge value
# labels      - label values passed through to the gauge (default {})
#
# Returns nil without touching anything when metrics are disabled or the
# key is not registered; otherwise returns whatever the gauge returns.
def prom_set(metric_name, value, labels: {})
  if PrometheusMetrics.enabled?
    gauge = PrometheusMetrics.metrics[metric_name]
    gauge.set(value, labels: labels) if gauge
  end
end

# Records +value+ in the histogram registered under +metric_name+.
#
# metric_name - Symbol key into PrometheusMetrics.metrics
# value       - observation to record
# labels      - label values passed through to the histogram (default {})
#
# Returns nil without touching anything when metrics are disabled or the
# key is not registered; otherwise returns whatever the histogram returns.
def prom_observe(metric_name, value, labels: {})
  if PrometheusMetrics.enabled?
    histogram = PrometheusMetrics.metrics[metric_name]
    histogram.observe(value, labels: labels) if histogram
  end
end

# Runs the given block and records how long it took, in seconds, in the
# histogram registered under +metric_name+.
#
# metric_name - Symbol key passed to prom_observe
# labels      - label values passed to prom_observe (default {})
#
# Elapsed time is measured with the monotonic clock so wall-clock
# adjustments cannot skew it. The observation is made in an ensure clause,
# so a block that raises (or exits early) is still timed; otherwise the
# slowest, failing operations would be missing from the histogram.
#
# Returns the block's value; any exception raised by the block propagates.
def prom_time(metric_name, labels: {})
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  yield
ensure
  prom_observe(metric_name, Process.clock_gettime(Process::CLOCK_MONOTONIC) - start, labels: labels)
end
end
end
2 changes: 2 additions & 0 deletions lib/nerve/reporter.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
require "nerve/utils"
require "nerve/log"
require "nerve/statsd"
require "nerve/prometheus_metrics"
require "nerve/reporter/base"

module Nerve
Expand Down
Loading