From 3e64f09fbd6408d247bd1106e6aba3bc70d091e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=BCller?= Date: Tue, 15 Oct 2024 13:09:22 +0200 Subject: [PATCH 1/5] chore: add omni healthcheck to cli template --- cli/cmd/compose.yml.tpl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cli/cmd/compose.yml.tpl b/cli/cmd/compose.yml.tpl index ea8accc3d..71dc6215d 100644 --- a/cli/cmd/compose.yml.tpl +++ b/cli/cmd/compose.yml.tpl @@ -27,6 +27,10 @@ services: #- --metrics # Enable prometheus metrics #- --pprof # Enable prometheus metrics #- --pprof.addr=0.0.0.0 # Enable prometheus metrics + healthcheck: + test: "nc -z localhost 8545" + interval: 1s + retries: 30 ports: - 8551 # Auth-RPC (used by halo) From 8f6c97980b40a84672a19b64ac281529748c55d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=BCller?= Date: Wed, 16 Oct 2024 18:29:32 +0200 Subject: [PATCH 2/5] add: ready command --- halo/cmd/cmd.go | 1 + halo/cmd/ready.go | 61 +++++++++++++++++++ .../cmd/testdata/TestCLIReference_halo.golden | 1 + scripts/halovisor/Dockerfile | 2 + 4 files changed, 65 insertions(+) create mode 100644 halo/cmd/ready.go diff --git a/halo/cmd/cmd.go b/halo/cmd/cmd.go index c6eecd463..f72cfcbc8 100644 --- a/halo/cmd/cmd.go +++ b/halo/cmd/cmd.go @@ -28,6 +28,7 @@ func New() *cobra.Command { buildinfo.NewVersionCmd(), newConsKeyCmd(), newStatusCmd(), + newReadyCmd(), ) } diff --git a/halo/cmd/ready.go b/halo/cmd/ready.go new file mode 100644 index 000000000..2c899491d --- /dev/null +++ b/halo/cmd/ready.go @@ -0,0 +1,61 @@ +package cmd + +import ( + "context" + "fmt" + "net/http" + "time" + + "github.com/omni-network/omni/lib/errors" + "github.com/omni-network/omni/lib/log" + + cmtcfg "github.com/cometbft/cometbft/config" + + "github.com/spf13/cobra" +) + +func newReadyCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "ready", + Short: "Assert the readiness of the halo node", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, _ []string) error { + err := assertReady(cmd.Context()) + if err != nil { + return errors.Wrap(err, "ready failed") + } + + return nil + }, + } + + return cmd +} + +// assertReady calls halo's /ready endpoint and returns nil if the status is ready +// or an error otherwise. +func assertReady(ctx context.Context) error { + cfg := cmtcfg.DefaultConfig() + url := fmt.Sprintf("http://0.0.0.0%v/ready", cfg.Instrumentation.PrometheusListenAddr) + + ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return errors.Wrap(err, "http request creation") + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return errors.Wrap(err, "http request") + } + defer resp.Body.Close() + + if resp.StatusCode < 400 { + log.Info(ctx, "The node is ready") + return nil + } + + return errors.New("the node is not ready yet") +} diff --git a/halo/cmd/testdata/TestCLIReference_halo.golden b/halo/cmd/testdata/TestCLIReference_halo.golden index fe6074db6..84b239fe8 100644 --- a/halo/cmd/testdata/TestCLIReference_halo.golden +++ b/halo/cmd/testdata/TestCLIReference_halo.golden @@ -9,6 +9,7 @@ Available Commands: consensus-pubkey Print the consensus public key help Help about any command init Initializes required halo files and directories + ready Assert the readiness of the halo node rollback Rollback Cosmos SDK and CometBFT state by one height run Runs the halo consensus client status Query remote node for status diff --git a/scripts/halovisor/Dockerfile b/scripts/halovisor/Dockerfile index a62dec39a..ff87ca1a5 100644 --- a/scripts/halovisor/Dockerfile +++ b/scripts/halovisor/Dockerfile @@ -35,6 +35,8 @@ COPY --from=build-cosmovisor /ko-app/cosmovisor /usr/local/bin/cosmovisor COPY --from=build-0-genesis /app /halovisor/genesis/bin/halo COPY --from=build-1-uluwatu /app /halovisor/upgrades/1_uluwatu/bin/halo +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s CMD ["/halovisor/upgrades/1_uluwatu/bin/halo", "ready"] + # Cosmovisor is the entrypoint ENTRYPOINT [ "cosmovisor" ] # First 'run' is cosmovisor command, second 'run' is halo command. From ef8aba0b4e1c30af17417813e287529cc27bacfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=BCller?= Date: Thu, 17 Oct 2024 12:13:34 +0200 Subject: [PATCH 3/5] fix: review comments --- halo/Dockerfile | 2 + halo/cmd/ready.go | 41 ++++++++++--------- .../cmd/testdata/TestCLIReference_halo.golden | 2 +- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/halo/Dockerfile b/halo/Dockerfile index 8bc57138d..545007bf1 100644 --- a/halo/Dockerfile +++ b/halo/Dockerfile @@ -8,6 +8,8 @@ COPY --from=alpine:latest /tmp /tmp COPY halo /app +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s CMD ["/app", "ready"] + # Mount home directory at /halo VOLUME ["/halo"] diff --git a/halo/cmd/ready.go b/halo/cmd/ready.go index 2c899491d..dcd18d6f5 100644 --- a/halo/cmd/ready.go +++ b/halo/cmd/ready.go @@ -2,25 +2,33 @@ package cmd import ( "context" - "fmt" "net/http" - "time" "github.com/omni-network/omni/lib/errors" "github.com/omni-network/omni/lib/log" - cmtcfg "github.com/cometbft/cometbft/config" - "github.com/spf13/cobra" ) +type readyConfig struct { + MonitoringAddr string +} + +func defaultReadyConfig() readyConfig { + return readyConfig{ + MonitoringAddr: "http://localhost:26660", + } +} + func newReadyCmd() *cobra.Command { + cfg := defaultReadyConfig() + cmd := &cobra.Command{ Use: "ready", - Short: "Assert the readiness of the halo node", + Short: "Query node for readiness", Args: cobra.NoArgs, RunE: func(cmd *cobra.Command, _ []string) error { - err := assertReady(cmd.Context()) + err := queryReady(cmd.Context(), cfg) if err != nil { return errors.Wrap(err, "ready failed") } @@ -32,16 +40,10 @@ func newReadyCmd() *cobra.Command { return cmd } -// assertReady calls halo's /ready endpoint and returns nil if the status is ready +// queryReady calls halo's /ready endpoint and returns nil if the status is ready // or an error otherwise. -func assertReady(ctx context.Context) error { - cfg := cmtcfg.DefaultConfig() - url := fmt.Sprintf("http://0.0.0.0%v/ready", cfg.Instrumentation.PrometheusListenAddr) - - ctx, cancel := context.WithTimeout(ctx, 5*time.Second) - defer cancel() - - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) +func queryReady(ctx context.Context, cfg readyConfig) error { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, cfg.MonitoringAddr, nil) if err != nil { return errors.Wrap(err, "http request creation") } @@ -52,10 +54,11 @@ func assertReady(ctx context.Context) error { } defer resp.Body.Close() - if resp.StatusCode < 400 { - log.Info(ctx, "The node is ready") - return nil + if resp.StatusCode/100 != 2 { + return errors.New("node not ready") } - return errors.New("the node is not ready yet") + log.Info(ctx, "Node ready") + + return nil } diff --git a/halo/cmd/testdata/TestCLIReference_halo.golden b/halo/cmd/testdata/TestCLIReference_halo.golden index 84b239fe8..064e00a63 100644 --- a/halo/cmd/testdata/TestCLIReference_halo.golden +++ b/halo/cmd/testdata/TestCLIReference_halo.golden @@ -9,7 +9,7 @@ Available Commands: consensus-pubkey Print the consensus public key help Help about any command init Initializes required halo files and directories - ready Assert the readiness of the halo node + ready Query node for readiness rollback Rollback Cosmos SDK and CometBFT state by one height run Runs the halo consensus client status Query remote node for status From 3bac68728b45bfec81e60461acd11e90b0063102 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=BCller?= Date: Thu, 17 Oct 2024 12:51:20 +0200 Subject: [PATCH 4/5] feat: ready prometheus metric --- halo/app/metrics.go | 7 +++++++ halo/app/monitor.go | 19 +++++++++++++++++++ halo/app/start.go | 1 + 3 files changed, 27 insertions(+) diff --git a/halo/app/metrics.go b/halo/app/metrics.go index 1c5afba86..50256c52b 100644 --- a/halo/app/metrics.go +++ b/halo/app/metrics.go @@ -47,6 +47,13 @@ var ( Name: "size_bytes", Help: "Current size of the database directory in bytes.", }) + + nodeReadiness = promauto.NewGauge(prometheus.GaugeOpts{ + Namespace: "halo", + Subsystem: "health", + Name: "ready", + Help: "Node readiness", + }) ) // setConstantGauge sets the value of a gauge to 1 if b is true, 0 otherwise. diff --git a/halo/app/monitor.go b/halo/app/monitor.go index b3dd6b544..1d6677f65 100644 --- a/halo/app/monitor.go +++ b/halo/app/monitor.go @@ -160,6 +160,25 @@ func monitorEVMOnce(ctx context.Context, ethCl ethclient.Client, status *readine return nil } +// exportReadiness exports the node readiness to prometheus. +func exportReadiness(ctx context.Context, status *readinessStatus) { + ticker := time.NewTicker(time.Second * 10) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + health := 0.0 + if status.ready() { + health = 1.0 + } + nodeReadiness.Set(health) + } + } +} + // dirSize returns the total size of the directory at path. func dirSize(path string) (int64, error) { var size int64 diff --git a/halo/app/start.go b/halo/app/start.go index 48eca8262..4d7f030e1 100644 --- a/halo/app/start.go +++ b/halo/app/start.go @@ -205,6 +205,7 @@ func Start(ctx context.Context, cfg Config) (<-chan error, func(context.Context) } status := new(readinessStatus) + go exportReadiness(ctx, status) stopMonitoringAPI := startMonitoringAPI(&cfg.Comet, asyncAbort, status) From fe28693f925ab27400f4cff6c011722ada3dd6f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20M=C3=BCller?= Date: Thu, 17 Oct 2024 14:07:29 +0200 Subject: [PATCH 5/5] fix: review comments --- halo/Dockerfile | 2 +- halo/app/metrics.go | 2 +- halo/app/monitor.go | 6 +++--- halo/app/start.go | 2 +- halo/cmd/flags.go | 6 ++++++ halo/cmd/ready.go | 12 +++++++----- halo/cmd/testdata/TestCLIReference_halo.golden | 2 +- scripts/halovisor/Dockerfile | 2 +- 8 files changed, 21 insertions(+), 13 deletions(-) diff --git a/halo/Dockerfile b/halo/Dockerfile index 545007bf1..7129903f0 100644 --- a/halo/Dockerfile +++ b/halo/Dockerfile @@ -8,7 +8,7 @@ COPY --from=alpine:latest /tmp /tmp COPY halo /app -HEALTHCHECK --interval=30s --timeout=5s --start-period=10s CMD ["/app", "ready"] +HEALTHCHECK CMD ["/app", "ready"] # Mount home directory at /halo VOLUME ["/halo"] diff --git a/halo/app/metrics.go b/halo/app/metrics.go index 50256c52b..2dc1c8329 100644 --- a/halo/app/metrics.go +++ b/halo/app/metrics.go @@ -52,7 +52,7 @@ var ( Namespace: "halo", Subsystem: "health", Name: "ready", - Help: "Node readiness", + Help: "Constant gauge of 1 if local halo node is ready, 0 if not.", }) ) diff --git a/halo/app/monitor.go b/halo/app/monitor.go index 1d6677f65..5e46e960e 100644 --- a/halo/app/monitor.go +++ b/halo/app/monitor.go @@ -160,8 +160,8 @@ func monitorEVMOnce(ctx context.Context, ethCl ethclient.Client, status *readine return nil } -// exportReadiness exports the node readiness to prometheus. -func exportReadiness(ctx context.Context, status *readinessStatus) { +// instrumentReadiness exports the node readiness to prometheus. +func instrumentReadiness(ctx context.Context, status *readinessStatus) { ticker := time.NewTicker(time.Second * 10) defer ticker.Stop() @@ -170,7 +170,7 @@ func exportReadiness(ctx context.Context, status *readinessStatus) { case <-ctx.Done(): return case <-ticker.C: - health := 0.0 + var health float64 if status.ready() { health = 1.0 } diff --git a/halo/app/start.go b/halo/app/start.go index 4d7f030e1..6e7d8a938 100644 --- a/halo/app/start.go +++ b/halo/app/start.go @@ -205,7 +205,7 @@ func Start(ctx context.Context, cfg Config) (<-chan error, func(context.Context) } status := new(readinessStatus) - go exportReadiness(ctx, status) + go instrumentReadiness(ctx, status) stopMonitoringAPI := startMonitoringAPI(&cfg.Comet, asyncAbort, status) diff --git a/halo/cmd/flags.go b/halo/cmd/flags.go index 5327e979f..eaf1583da 100644 --- a/halo/cmd/flags.go +++ b/halo/cmd/flags.go @@ -60,3 +60,9 @@ func bindStatusFlags(cmd *cobra.Command, cfg *statusConfig) { flags.StringVarP(&cfg.Node, "node", "n", cfg.Node, "Node to connect to") flags.StringVarP(&cfg.Output, "output", "o", cfg.Output, "Output format (text|json)") } + +func bindReadyFlags(cmd *cobra.Command, cfg *readyConfig) { + flags := cmd.Flags() + + flags.StringVarP(&cfg.MonitoringURL, "monitoring-url", "u", cfg.MonitoringURL, "Readiness monitoring url") +} diff --git a/halo/cmd/ready.go b/halo/cmd/ready.go index dcd18d6f5..2657b3bbb 100644 --- a/halo/cmd/ready.go +++ b/halo/cmd/ready.go @@ -11,12 +11,12 @@ import ( ) type readyConfig struct { - MonitoringAddr string + MonitoringURL string } func defaultReadyConfig() readyConfig { return readyConfig{ - MonitoringAddr: "http://localhost:26660", + MonitoringURL: "http://localhost:26660", } } @@ -25,25 +25,27 @@ func newReadyCmd() *cobra.Command { cmd := &cobra.Command{ Use: "ready", - Short: "Query node for readiness", + Short: "Query remote node for readiness", Args: cobra.NoArgs, RunE: func(cmd *cobra.Command, _ []string) error { err := queryReady(cmd.Context(), cfg) if err != nil { - return errors.Wrap(err, "ready failed") + return errors.Wrap(err, "ready") } return nil }, } + bindReadyFlags(cmd, &cfg) + return cmd } // queryReady calls halo's /ready endpoint and returns nil if the status is ready // or an error otherwise. func queryReady(ctx context.Context, cfg readyConfig) error { - req, err := http.NewRequestWithContext(ctx, http.MethodGet, cfg.MonitoringAddr, nil) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, cfg.MonitoringURL, nil) if err != nil { return errors.Wrap(err, "http request creation") } diff --git a/halo/cmd/testdata/TestCLIReference_halo.golden b/halo/cmd/testdata/TestCLIReference_halo.golden index 064e00a63..82ab9d0e3 100644 --- a/halo/cmd/testdata/TestCLIReference_halo.golden +++ b/halo/cmd/testdata/TestCLIReference_halo.golden @@ -9,7 +9,7 @@ Available Commands: consensus-pubkey Print the consensus public key help Help about any command init Initializes required halo files and directories - ready Query node for readiness + ready Query remote node for readiness rollback Rollback Cosmos SDK and CometBFT state by one height run Runs the halo consensus client status Query remote node for status diff --git a/scripts/halovisor/Dockerfile b/scripts/halovisor/Dockerfile index ff87ca1a5..dd56a1bde 100644 --- a/scripts/halovisor/Dockerfile +++ b/scripts/halovisor/Dockerfile @@ -35,7 +35,7 @@ COPY --from=build-cosmovisor /ko-app/cosmovisor /usr/local/bin/cosmovisor COPY --from=build-0-genesis /app /halovisor/genesis/bin/halo COPY --from=build-1-uluwatu /app /halovisor/upgrades/1_uluwatu/bin/halo -HEALTHCHECK --interval=30s --timeout=5s --start-period=10s CMD ["/halovisor/upgrades/1_uluwatu/bin/halo", "ready"] +HEALTHCHECK CMD ["/halovisor/upgrades/1_uluwatu/bin/halo", "ready"] # Cosmovisor is the entrypoint ENTRYPOINT [ "cosmovisor" ]