From 55383697dbc2d460b876ca7d6c0e202641371839 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gianguido=20Sor=C3=A0?= Date: Thu, 11 Jul 2024 17:11:48 +0200 Subject: [PATCH] app/stacksnipe: detect validator services Detect Ethereum validator stack components by reading `/proc` and filtering processes based on an allowlist. To support containerized deployments, add `--proc-directory` flag to `charon run`: if not configured, Charon will log an `INFO` message saying it won't do anything. Support detecting interpreted components like `lodestar`. Expose the resulting data through the `app_validator_stack_params` Prometheus metric. By default, poll the specified directory every 15 seconds. --- app/app.go | 5 + app/lifecycle/order.go | 1 + app/lifecycle/orderstart_string.go | 5 +- app/metrics.go | 7 + app/monitoringapi.go | 10 ++ app/stacksnipe/stacksnipe.go | 227 +++++++++++++++++++++++++++++ app/stacksnipe/stacksnipe_test.go | 118 +++++++++++++++ cmd/run.go | 1 + docs/configuration.md | 1 + docs/metrics.md | 1 + 10 files changed, 374 insertions(+), 2 deletions(-) create mode 100644 app/stacksnipe/stacksnipe.go create mode 100644 app/stacksnipe/stacksnipe_test.go diff --git a/app/app.go b/app/app.go index dfcdf4eac..80f0ca721 100644 --- a/app/app.go +++ b/app/app.go @@ -36,6 +36,7 @@ import ( "github.com/obolnetwork/charon/app/privkeylock" "github.com/obolnetwork/charon/app/promauto" "github.com/obolnetwork/charon/app/retry" + "github.com/obolnetwork/charon/app/stacksnipe" "github.com/obolnetwork/charon/app/tracer" "github.com/obolnetwork/charon/app/version" "github.com/obolnetwork/charon/app/z" @@ -90,6 +91,7 @@ type Config struct { BuilderAPI bool SimnetBMockFuzz bool TestnetConfig eth2util.Network + ProcDirectory string TestConfig TestConfig } @@ -148,6 +150,9 @@ func Run(ctx context.Context, conf Config) (err error) { life.RegisterStop(lifecycle.StopPrivkeyLock, lifecycle.HookFuncMin(lockSvc.Close)) } + stackSniper := stacksnipe.New(conf.ProcDirectory, stackComponents) + life.RegisterStart(lifecycle.AsyncAppCtx, lifecycle.StartStackSnipe, lifecycle.HookFuncCtx(stackSniper.Run)) + if conf.TestnetConfig.IsNonZero() { eth2util.AddTestNetwork(conf.TestnetConfig) } diff --git a/app/lifecycle/order.go b/app/lifecycle/order.go index cb467cda5..ba07474fa 100644 --- a/app/lifecycle/order.go +++ b/app/lifecycle/order.go @@ -29,6 +29,7 @@ const ( StartP2PEventCollector StartPeerInfo StartParSigDB + StartStackSnipe ) // Global ordering of stop hooks; follows dependency tree from root to leaves. diff --git a/app/lifecycle/orderstart_string.go b/app/lifecycle/orderstart_string.go index ba285ccfd..c8f1e0451 100644 --- a/app/lifecycle/orderstart_string.go +++ b/app/lifecycle/orderstart_string.go @@ -26,11 +26,12 @@ func _() { _ = x[StartP2PEventCollector-13] _ = x[StartPeerInfo-14] _ = x[StartParSigDB-15] + _ = x[StartStackSnipe-16] } -const _OrderStart_name = "TrackerPrivkeyLockAggSigDBRelayMonitoringAPIDebugAPIValidatorAPIP2PPingP2PRoutersForceDirectConnsP2PConsensusSimulatorSchedulerP2PEventCollectorPeerInfoParSigDB" +const _OrderStart_name = "TrackerPrivkeyLockAggSigDBRelayMonitoringAPIDebugAPIValidatorAPIP2PPingP2PRoutersForceDirectConnsP2PConsensusSimulatorSchedulerP2PEventCollectorPeerInfoParSigDBStackSnipe" -var _OrderStart_index = [...]uint8{0, 7, 18, 26, 31, 44, 52, 64, 71, 81, 97, 109, 118, 127, 144, 152, 160} +var _OrderStart_index = [...]uint8{0, 7, 18, 26, 31, 44, 52, 64, 71, 81, 97, 109, 118, 127, 144, 152, 160, 170} func (i OrderStart) String() string { if i < 0 || i >= OrderStart(len(_OrderStart_index)-1) { diff --git a/app/metrics.go b/app/metrics.go index 1245f4371..a08438714 100644 --- a/app/metrics.go +++ b/app/metrics.go @@ -105,6 +105,13 @@ var ( Name: "network", Help: "Constant gauge with label set to the current network (chain)", }, []string{"network"}) + + validatorStackParamsGauge = promauto.NewResetGaugeVec(prometheus.GaugeOpts{ + Namespace: "app", + Subsystem: "validator_stack", + Name: "params", + Help: "Parameters for each component of the validator stack in which this Charon instance is deployed into", + }, []string{"component", "cli_parameters"}) ) func initStartupMetrics(peerName string, threshold, numOperators, numValidators int, network string) { diff --git a/app/monitoringapi.go b/app/monitoringapi.go index 84671f5e4..61c0dd088 100644 --- a/app/monitoringapi.go +++ b/app/monitoringapi.go @@ -278,6 +278,16 @@ func quorumPeersConnected(peerIDs []peer.ID, tcpNode host.Host) bool { return count >= cluster.Threshold(len(peerIDs))-1 } +// stackComponents writes Ethereum validator stack components names and CLI params to the defined Prometheus metric. +// It assumes that names and cliParams have the same size. +func stackComponents(names []string, cliParams []string) { + validatorStackParamsGauge.Reset() + + for i := 0; i < len(names); i++ { + validatorStackParamsGauge.WithLabelValues(names[i], cliParams[i]).Set(1) + } +} + func writeResponse(w http.ResponseWriter, status int, msg string) { w.WriteHeader(status) _, _ = w.Write([]byte(msg)) diff --git a/app/stacksnipe/stacksnipe.go b/app/stacksnipe/stacksnipe.go new file mode 100644 index 000000000..8e2ca74a8 --- /dev/null +++ b/app/stacksnipe/stacksnipe.go @@ -0,0 +1,227 @@ +// Copyright © 2022-2024 Obol Labs Inc. Licensed under the terms of a Business Source License 1.1 + +package stacksnipe + +import ( + "bytes" + "context" + "io/fs" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/obolnetwork/charon/app/errors" + "github.com/obolnetwork/charon/app/log" + "github.com/obolnetwork/charon/app/z" +) + +const ( + pollInterval = 15 * time.Second +) + +// supportedVCs represents the process names to identify Ethereum validator stack processes. +var supportedVCs = map[string]struct{}{ + "lighthouse": {}, + "teku": {}, + "nimbus": {}, + "prysm": {}, + "lodestar": {}, +} + +// maybeVCs is the list of process names which might be running as interpreters for components of the +// Ethereum validator stack. +var maybeVCs = map[string]struct{}{ + // lodestar runs under node + "node": {}, +} + +// StackComponent is a named process of the Ethereum validator stack running on the machine, +// whose CLI parameters (also called cmdline) is read from a /proc-like filesystem. +type StackComponent struct { + Name string + CLIParams string +} + +// Instance returns an instance of stacksnipe. +type Instance struct { + procPath string + metricsFunc func([]string, []string) + interval time.Duration +} + +// New returns a new Instance configured with the given /proc path and metrics export function. +func New(procPath string, metricFunc func([]string, []string)) Instance { + return Instance{ + procPath: procPath, + metricsFunc: metricFunc, + interval: pollInterval, + } +} + +// NewWithInterval returns a new Instance configured with the given /proc path, metrics export function and the specified polling interval. +func NewWithInterval(procPath string, metricFunc func([]string, []string), interval time.Duration) Instance { + return Instance{ + procPath: procPath, + metricsFunc: metricFunc, + interval: interval, + } +} + +// Run polls procPath every 15 seconds and exposes the results through the stack Prometheus metric. +func (i *Instance) Run(ctx context.Context) { + ctx = log.WithTopic(ctx, "stacksnipe") + + if i.procPath == "" { + log.Info(ctx, "Stack component sniping disabled") + return + } + + ticker := time.NewTicker(i.interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + comp, err := snipe(ctx, i.procPath) + if err != nil { + log.Warn(ctx, "Failed to snipe stack components", err) + continue + } + + var ( + names []string + cliParams []string + ) + + for _, c := range comp { + names = append(names, c.Name) + cliParams = append(cliParams, c.CLIParams) + } + + i.metricsFunc(names, cliParams) + } + } +} + +// snipe reads /proc entries from procPath, looking for processes that look like Ethereum validator stack components. +func snipe(ctx context.Context, procPath string) ([]StackComponent, error) { + var ( + wb = make(chan StackComponent) + ret []StackComponent + walkErr error + ) + + go func() { + if err := filepath.WalkDir(procPath, walkFunc(ctx, wb)); err != nil { + walkErr = errors.Wrap(err, "cannot walk proc path", z.Str("proc_path", procPath)) + } + + close(wb) + }() + + for c := range wb { + ret = append(ret, c) + } + + if walkErr != nil { + return nil, walkErr + } + + return ret, nil +} + +// walkFunc walks a /proc-like filesystem as invoked by filepath.WalkDir, and sends entries to wb. +func walkFunc(ctx context.Context, wb chan<- StackComponent) fs.WalkDirFunc { + cmdlineDedup := make(map[string]struct{}) + + return func(path string, d fs.DirEntry, err error) error { + // ignore directory access error and don't walk the directory + if err != nil { + return nil //nolint:nilerr // best effort component + } + + // ignore files + if !d.IsDir() { + return nil + } + + // ignore directories which don't look like pids + hostPID, err := strconv.ParseUint(d.Name(), 10, 64) + if err != nil { + return nil //nolint:nilerr // best effort component + } + + // do initial filtering by process' comm + commBytes, err := os.ReadFile(filepath.Join(path, "comm")) + if err != nil { + // ignore error, best effort + return nil //nolint:nilerr // best effort component + } + + comm := strings.TrimSpace(string(commBytes)) + _, vcOk := supportedVCs[comm] + _, maybeVCOk := maybeVCs[comm] + + if !vcOk && !maybeVCOk { + return nil + } + + // grab vc's cmdline + cmdlineBytes, err := os.ReadFile(filepath.Join(path, "cmdline")) + if err != nil { + // ignore error, best effort + return nil //nolint:nilerr // best effort component + } + + cmdlineString := string(cmdlineBytes) + + cmdlineSplit := bytes.Split(cmdlineBytes, []byte{0}) + + var vcName string + for vc := range supportedVCs { + if strings.Contains(cmdlineString, vc) { + vcName = vc + } + } + + if vcName == "" { + return nil + } + + if _, ok := cmdlineDedup[cmdlineString]; ok { + // we already have seen this, probably a background thread + return nil + } + + cmdlineDedup[cmdlineString] = struct{}{} + + var cmdLine []string + for _, cl := range cmdlineSplit { + if len(cl) == 0 { + continue + } + + cmdLine = append(cmdLine, string(cl)) + } + + if len(cmdLine) == 0 { + // no cmdline, ignore + return nil + } + + cmdLineStr := strings.Join(cmdLine, " ") + + log.Debug(ctx, "Detected stack component", z.Str("name", vcName), z.U64("host_pid", hostPID), z.Str("cmdline", cmdLineStr)) + + wb <- StackComponent{ + Name: vcName, + CLIParams: cmdLineStr, + } + + return nil + } +} diff --git a/app/stacksnipe/stacksnipe_test.go b/app/stacksnipe/stacksnipe_test.go new file mode 100644 index 000000000..e45589271 --- /dev/null +++ b/app/stacksnipe/stacksnipe_test.go @@ -0,0 +1,118 @@ +// Copyright © 2022-2024 Obol Labs Inc. Licensed under the terms of a Business Source License 1.1 + +package stacksnipe_test + +import ( + "context" + "os" + "path/filepath" + "strconv" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/obolnetwork/charon/app/stacksnipe" +) + +type procEntry struct { + pid uint64 + procName string + cmdline string +} + +type snipeResult struct { + names []string + cmdlines []string +} + +func Test_StackSnipe(t *testing.T) { + t.Run("happy path", func(t *testing.T) { + baseDir := t.TempDir() + + names := []string{ + "lighthouse", + "nimbus", + "node", + } + + namesExpected := []string{ + "lighthouse", + "nimbus", + "lodestar", + } + + cmdlines := []string{ + "lighthouse_1", + "nimbus_1", + "lodestar vc 1", + } + + extraNames := []string{ + "systemd-resolved", + } + + extraCmdlines := []string{ + "run_1", + } + + for idx := 0; idx < len(names); idx++ { + populateProc(t, baseDir, procEntry{ + pid: uint64(42 + idx), + procName: names[idx], + cmdline: cmdlines[idx], + }) + } + + for idx := 0; idx < len(extraNames); idx++ { + populateProc(t, baseDir, procEntry{ + pid: uint64(52 + idx), + procName: extraNames[idx], + cmdline: extraCmdlines[idx], + }) + } + + var ( + ctx, cancel = context.WithCancel(context.Background()) + resultChan = make(chan snipeResult) + ) + + defer cancel() + + snipe := stacksnipe.NewWithInterval(baseDir, func(names []string, cmdlines []string) { + resultChan <- snipeResult{ + names: names, + cmdlines: cmdlines, + } + + cancel() + }, 50*time.Millisecond) + + go snipe.Run(ctx) + + result := <-resultChan + + require.Len(t, result.names, 3) + require.Len(t, result.cmdlines, 3) + + require.ElementsMatch(t, result.names, namesExpected) + require.ElementsMatch(t, result.cmdlines, cmdlines) + + for idx := 0; idx < len(extraNames); idx++ { + require.NotContains(t, result.names, extraNames[idx]) + require.NotContains(t, result.cmdlines, extraCmdlines[idx]) + } + }) +} + +func populateProc(t *testing.T, base string, entry procEntry) { + t.Helper() + + procDir := filepath.Join(base, strconv.FormatUint(entry.pid, 10)) + commFile := filepath.Join(procDir, "comm") + cmdlineFile := filepath.Join(procDir, "cmdline") + + require.NoError(t, os.Mkdir(procDir, 0o755)) + require.NoError(t, os.WriteFile(commFile, []byte(entry.procName), 0o755)) + require.NoError(t, os.WriteFile(cmdlineFile, []byte(entry.cmdline), 0o755)) +} diff --git a/cmd/run.go b/cmd/run.go index af1faa668..f6f95d5bd 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -86,6 +86,7 @@ func bindRunFlags(cmd *cobra.Command, config *app.Config) { cmd.Flags().Uint64Var(&config.TestnetConfig.ChainID, "testnet-chain-id", 0, "Chain ID of the custom test network.") cmd.Flags().Int64Var(&config.TestnetConfig.GenesisTimestamp, "testnet-genesis-timestamp", 0, "Genesis timestamp of the custom test network.") cmd.Flags().StringVar(&config.TestnetConfig.CapellaHardFork, "testnet-capella-hard-fork", "", "Capella hard fork version of the custom test network.") + cmd.Flags().StringVar(&config.ProcDirectory, "proc-directory", "", "Directory to look into in order to detect other stack components running on the host.") wrapPreRunE(cmd, func(cmd *cobra.Command, args []string) error { if len(config.BeaconNodeAddrs) == 0 && !config.SimnetBMock { diff --git a/docs/configuration.md b/docs/configuration.md index 4f2bc0b75..99b479ff1 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -175,6 +175,7 @@ Flags: --p2p-tcp-address strings Comma-separated list of listening TCP addresses (ip and port) for libP2P traffic. Empty default doesn't bind to local port therefore only supports outgoing connections. --private-key-file string The path to the charon enr private key file. (default ".charon/charon-enr-private-key") --private-key-file-lock Enables private key locking to prevent multiple instances using the same key. + --proc-directory string Directory to look into in order to detect other stack components running on the host. --simnet-beacon-mock Enables an internal mock beacon node for running a simnet. --simnet-beacon-mock-fuzz Configures simnet beaconmock to return fuzzed responses. --simnet-slot-duration duration Configures slot duration in simnet beacon mock. (default 1s) diff --git a/docs/metrics.md b/docs/metrics.md index ded84b7a0..4bcfcbe96 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -32,6 +32,7 @@ when storing metrics from multiple nodes or clusters in one Prometheus instance. | `app_peerinfo_version` | Gauge | Constant gauge with version label set to peer`s charon version. | `peer, version` | | `app_peerinfo_version_support` | Gauge | Set to 1 if the peer`s version is supported by (compatible with) the current version, else 0 if unsupported. | `peer` | | `app_start_time_secs` | Gauge | Gauge set to the app start time of the binary in unix seconds | | +| `app_validator_stack_params` | Gauge | Parameters for each component of the validator stack in which this Charon instance is deployed into | `component, cli_parameters` | | `app_version` | Gauge | Constant gauge with label set to current app version | `version` | | `cluster_network` | Gauge | Constant gauge with label set to the current network (chain) | `network` | | `cluster_operators` | Gauge | Number of operators in the cluster lock | |