Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

app/stacksnipe: detect validator services #3169

Merged
merged 2 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"github.com/obolnetwork/charon/app/privkeylock"
"github.com/obolnetwork/charon/app/promauto"
"github.com/obolnetwork/charon/app/retry"
"github.com/obolnetwork/charon/app/stacksnipe"
"github.com/obolnetwork/charon/app/tracer"
"github.com/obolnetwork/charon/app/version"
"github.com/obolnetwork/charon/app/z"
Expand Down Expand Up @@ -90,6 +91,7 @@
BuilderAPI bool
SimnetBMockFuzz bool
TestnetConfig eth2util.Network
ProcDirectory string

TestConfig TestConfig
}
Expand Down Expand Up @@ -148,6 +150,9 @@
life.RegisterStop(lifecycle.StopPrivkeyLock, lifecycle.HookFuncMin(lockSvc.Close))
}

stackSniper := stacksnipe.New(conf.ProcDirectory, stackComponents)
life.RegisterStart(lifecycle.AsyncAppCtx, lifecycle.StartStackSnipe, lifecycle.HookFuncCtx(stackSniper.Run))

Check warning on line 155 in app/app.go

View check run for this annotation

Codecov / codecov/patch

app/app.go#L153-L155

Added lines #L153 - L155 were not covered by tests
if conf.TestnetConfig.IsNonZero() {
eth2util.AddTestNetwork(conf.TestnetConfig)
}
Expand Down
1 change: 1 addition & 0 deletions app/lifecycle/order.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ const (
StartP2PEventCollector
StartPeerInfo
StartParSigDB
StartStackSnipe
)

// Global ordering of stop hooks; follows dependency tree from root to leaves.
Expand Down
5 changes: 3 additions & 2 deletions app/lifecycle/orderstart_string.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions app/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,13 @@ var (
Name: "network",
Help: "Constant gauge with label set to the current network (chain)",
}, []string{"network"})

validatorStackParamsGauge = promauto.NewResetGaugeVec(prometheus.GaugeOpts{
Namespace: "app",
Subsystem: "validator_stack",
Name: "params",
Help: "Parameters for each component of the validator stack in which this Charon instance is deployed into",
}, []string{"component", "cli_parameters"})
)

func initStartupMetrics(peerName string, threshold, numOperators, numValidators int, network string) {
Expand Down
10 changes: 10 additions & 0 deletions app/monitoringapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,16 @@
return count >= cluster.Threshold(len(peerIDs))-1
}

// stackComponents writes Ethereum validator stack components names and CLI params to the defined Prometheus metric.
// It assumes that names and cliParams have the same size.
func stackComponents(names []string, cliParams []string) {
validatorStackParamsGauge.Reset()

for i := 0; i < len(names); i++ {
validatorStackParamsGauge.WithLabelValues(names[i], cliParams[i]).Set(1)
}

Check warning on line 288 in app/monitoringapi.go

View check run for this annotation

Codecov / codecov/patch

app/monitoringapi.go#L283-L288

Added lines #L283 - L288 were not covered by tests
}

func writeResponse(w http.ResponseWriter, status int, msg string) {
w.WriteHeader(status)
_, _ = w.Write([]byte(msg))
Expand Down
227 changes: 227 additions & 0 deletions app/stacksnipe/stacksnipe.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
// Copyright © 2022-2024 Obol Labs Inc. Licensed under the terms of a Business Source License 1.1

package stacksnipe

import (
"bytes"
"context"
"io/fs"
"os"
"path/filepath"
"strconv"
"strings"
"time"

"github.com/obolnetwork/charon/app/errors"
"github.com/obolnetwork/charon/app/log"
"github.com/obolnetwork/charon/app/z"
)

const (
pollInterval = 15 * time.Second
pinebit marked this conversation as resolved.
Show resolved Hide resolved
)

// supportedVCs represents the process names to identify Ethereum validator stack processes.
var supportedVCs = map[string]struct{}{
"lighthouse": {},
"teku": {},
"nimbus": {},
"prysm": {},
"lodestar": {},
}

// maybeVCs is the list of process names which might be running as interpreters for components of the
// Ethereum validator stack.
var maybeVCs = map[string]struct{}{
// lodestar runs under node
"node": {},
}

// StackComponent is a named process of the Ethereum validator stack running on the machine,
// whose CLI parameters (also called cmdline) is read from a /proc-like filesystem.
type StackComponent struct {
Name string
CLIParams string
}

// Instance returns an instance of stacksnipe.
type Instance struct {
procPath string
metricsFunc func([]string, []string)
pinebit marked this conversation as resolved.
Show resolved Hide resolved
interval time.Duration
}

// New returns a new Instance configured with the given /proc path and metrics export function.
func New(procPath string, metricFunc func([]string, []string)) Instance {
return Instance{
procPath: procPath,
metricsFunc: metricFunc,
interval: pollInterval,
}

Check warning on line 60 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L55-L60

Added lines #L55 - L60 were not covered by tests
}

// NewWithInterval returns a new Instance configured with the given /proc path, metrics export function and the specified polling interval.
func NewWithInterval(procPath string, metricFunc func([]string, []string), interval time.Duration) Instance {
return Instance{
procPath: procPath,
metricsFunc: metricFunc,
interval: interval,
}
}

// Run polls procPath every 15 seconds and exposes the results through the stack Prometheus metric.
func (i *Instance) Run(ctx context.Context) {
ctx = log.WithTopic(ctx, "stacksnipe")

if i.procPath == "" {
log.Info(ctx, "Stack component sniping disabled")
return
}

Check warning on line 79 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L77-L79

Added lines #L77 - L79 were not covered by tests

ticker := time.NewTicker(i.interval)
defer ticker.Stop()

for {
select {
case <-ctx.Done():
return
case <-ticker.C:
comp, err := snipe(ctx, i.procPath)
if err != nil {
log.Warn(ctx, "Failed to snipe stack components", err)
continue

Check warning on line 92 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L91-L92

Added lines #L91 - L92 were not covered by tests
}

var (
names []string
cliParams []string
)

for _, c := range comp {
names = append(names, c.Name)
cliParams = append(cliParams, c.CLIParams)
}

i.metricsFunc(names, cliParams)
}
}
}

// snipe reads /proc entries from procPath, looking for processes that look like Ethereum validator stack components.
func snipe(ctx context.Context, procPath string) ([]StackComponent, error) {
var (
wb = make(chan StackComponent)
ret []StackComponent
walkErr error
)

go func() {
if err := filepath.WalkDir(procPath, walkFunc(ctx, wb)); err != nil {
walkErr = errors.Wrap(err, "cannot walk proc path", z.Str("proc_path", procPath))
}

Check warning on line 121 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L120-L121

Added lines #L120 - L121 were not covered by tests

close(wb)
}()

for c := range wb {
ret = append(ret, c)
}

if walkErr != nil {
return nil, walkErr
}

Check warning on line 132 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L131-L132

Added lines #L131 - L132 were not covered by tests

return ret, nil
}

// walkFunc walks a /proc-like filesystem as invoked by filepath.WalkDir, and sends entries to wb.
func walkFunc(ctx context.Context, wb chan<- StackComponent) fs.WalkDirFunc {
cmdlineDedup := make(map[string]struct{})

return func(path string, d fs.DirEntry, err error) error {
// ignore directory access error and don't walk the directory
if err != nil {
return nil //nolint:nilerr // best effort component
}

Check warning on line 145 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L144-L145

Added lines #L144 - L145 were not covered by tests

// ignore files
if !d.IsDir() {
return nil
}

// ignore directories which don't look like pids
hostPID, err := strconv.ParseUint(d.Name(), 10, 64)
if err != nil {
return nil //nolint:nilerr // best effort component
}

Check warning on line 156 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L155-L156

Added lines #L155 - L156 were not covered by tests

// do initial filtering by process' comm
commBytes, err := os.ReadFile(filepath.Join(path, "comm"))
if err != nil {
// ignore error, best effort
return nil //nolint:nilerr // best effort component
}

comm := strings.TrimSpace(string(commBytes))
_, vcOk := supportedVCs[comm]
_, maybeVCOk := maybeVCs[comm]

if !vcOk && !maybeVCOk {
return nil
}

// grab vc's cmdline
cmdlineBytes, err := os.ReadFile(filepath.Join(path, "cmdline"))
if err != nil {
// ignore error, best effort
return nil //nolint:nilerr // best effort component
}

Check warning on line 178 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L176-L178

Added lines #L176 - L178 were not covered by tests

cmdlineString := string(cmdlineBytes)

cmdlineSplit := bytes.Split(cmdlineBytes, []byte{0})

var vcName string
for vc := range supportedVCs {
if strings.Contains(cmdlineString, vc) {
vcName = vc
}
}

if vcName == "" {
return nil
}

Check warning on line 193 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L192-L193

Added lines #L192 - L193 were not covered by tests

if _, ok := cmdlineDedup[cmdlineString]; ok {
// we already have seen this, probably a background thread
return nil
}

Check warning on line 198 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L196-L198

Added lines #L196 - L198 were not covered by tests

cmdlineDedup[cmdlineString] = struct{}{}

var cmdLine []string
for _, cl := range cmdlineSplit {
if len(cl) == 0 {
continue

Check warning on line 205 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L205

Added line #L205 was not covered by tests
}

cmdLine = append(cmdLine, string(cl))
}

if len(cmdLine) == 0 {
// no cmdline, ignore
return nil
}

Check warning on line 214 in app/stacksnipe/stacksnipe.go

View check run for this annotation

Codecov / codecov/patch

app/stacksnipe/stacksnipe.go#L212-L214

Added lines #L212 - L214 were not covered by tests

cmdLineStr := strings.Join(cmdLine, " ")

log.Debug(ctx, "Detected stack component", z.Str("name", vcName), z.U64("host_pid", hostPID), z.Str("cmdline", cmdLineStr))

wb <- StackComponent{
Name: vcName,
CLIParams: cmdLineStr,
}

return nil
pinebit marked this conversation as resolved.
Show resolved Hide resolved
}
}
Loading
Loading