From d04e77827c206a7037bd7a4340d3e5094bf88685 Mon Sep 17 00:00:00 2001
From: Crystal Lemire
Date: Wed, 22 Nov 2023 16:15:22 -0800
Subject: [PATCH 01/15] Update liquidations daemon to report success after
 every paginated gRPC query.

Update error for a more specific test.
PR comments.
Revert liquidations health reporting to occur only at task loop boundary.
Checkpoint.
checkpoint - implementation, moved methods to app. need to migrate / add testing.
Update test app.
Checkpoint for tests.
Finish test cases.
App unit test + cleanup.
Cleanup, rename.
cleanup.
Remove todo for refactoring metrics daemon.
Move monitoring call to before client start.
Update placement of monitor code for pricefeed client.
Remove defunct constants.
Revert ServiceName changes.
---
 protocol/app/app.go                           |  52 ++-
 protocol/app/app_test.go                      |  13 +
 .../client/client_integration_test.go         |  12 +-
 protocol/daemons/server/bridge.go             |  11 -
 protocol/daemons/server/liquidation.go        |  17 -
 protocol/daemons/server/metrics.go            |  15 -
 protocol/daemons/server/pricefeed.go          |  15 -
 protocol/daemons/server/server.go             |  53 ---
 protocol/daemons/server/server_test.go        |  44 ---
 .../daemons/server/types/health_monitor.go    | 277 +++++++++++++++
 .../server/types/health_monitor_test.go       | 318 ++++++++++++++++++
 .../daemons/server/types/update_monitor.go    | 169 ----------
 .../server/types/update_monitor_test.go       | 189 -----------
 protocol/daemons/server/types/util.go         |   9 -
 protocol/daemons/server/types/util_test.go    |  14 -
 protocol/daemons/types/health_checkable.go    |  11 +
 protocol/mocks/HealthCheckable.go             |  63 ++++
 protocol/mocks/Makefile                       |   1 +
 protocol/testutil/app/app.go                  |   2 +-
 19 files changed, 736 insertions(+), 549 deletions(-)
 delete mode 100644 protocol/daemons/server/metrics.go
 create mode 100644 protocol/daemons/server/types/health_monitor.go
 create mode 100644 protocol/daemons/server/types/health_monitor_test.go
 delete mode 100644 protocol/daemons/server/types/update_monitor.go
 delete mode 100644 protocol/daemons/server/types/update_monitor_test.go
 delete mode 100644 protocol/daemons/server/types/util.go
 delete mode 100644 protocol/daemons/server/types/util_test.go
 create mode 100644 protocol/mocks/HealthCheckable.go

diff --git a/protocol/app/app.go b/protocol/app/app.go
index f01876607bd..f1e256ed87e 100644
--- a/protocol/app/app.go
+++ b/protocol/app/app.go
@@ -5,13 +5,13 @@ import (
 	"encoding/json"
 	"errors"
 	"io"
-	"math"
 	"math/big"
 	"net/http"
 	"os"
 	"path/filepath"
 	"runtime/debug"
 	"sync"
+	"time"
 
 	autocliv1 "cosmossdk.io/api/cosmos/autocli/v1"
 	reflectionv1 "cosmossdk.io/api/cosmos/reflection/v1"
@@ -183,6 +183,10 @@ import (
 var (
 	// DefaultNodeHome default home directories for the application daemon
 	DefaultNodeHome string
+
+	// MaximumDaemonUnhealthyDuration is the maximum amount of time that a daemon can be unhealthy before the
+	// application panics.
+	MaximumDaemonUnhealthyDuration = 5 * time.Minute
 )
 
 var (
@@ -290,6 +294,8 @@ type App struct {
 	PriceFeedClient    *pricefeedclient.Client
 	LiquidationsClient *liquidationclient.Client
 	BridgeClient       *bridgeclient.Client
+
+	HealthMonitor *daemonservertypes.HealthMonitor
 }
 
 // assertAppPreconditions assert invariants required for an application to start.
@@ -589,6 +595,11 @@ func New(
 	bridgeEventManager := bridgedaemontypes.NewBridgeEventManager(timeProvider)
 	app.Server.WithBridgeEventManager(bridgeEventManager)
 
+	app.HealthMonitor = daemonservertypes.NewHealthMonitor(
+		daemonservertypes.DaemonStartupGracePeriod,
+		daemonservertypes.HealthCheckPollFrequency,
+		app.Logger(),
+	)
 	// Create a closure for starting daemons and daemon server. Daemon services are delayed until after the gRPC
 	// service is started because daemons depend on the gRPC service being available. If a node is initialized
 	// with a genesis time in the future, then the gRPC service will not be available until the genesis time, the
@@ -600,11 +611,9 @@ func New(
 
 		// Start liquidations client for sending potentially liquidatable subaccounts to the application.
 		if daemonFlags.Liquidation.Enabled {
-			app.Server.ExpectLiquidationsDaemon(
-				daemonservertypes.MaximumAcceptableUpdateDelay(daemonFlags.Liquidation.LoopDelayMs),
-			)
 			app.LiquidationsClient = liquidationclient.NewClient(logger)
 			go func() {
+				app.MonitorDaemon(app.LiquidationsClient, MaximumDaemonUnhealthyDuration)
 				if err := app.LiquidationsClient.Start(
 					// The client will use `context.Background` so that it can have a different context from
 					// the main application.
@@ -621,7 +630,6 @@ func New(
 		// Non-validating full-nodes have no need to run the price daemon.
 		if !appFlags.NonValidatingFullNode && daemonFlags.Price.Enabled {
 			exchangeQueryConfig := constants.StaticExchangeQueryConfig
-			app.Server.ExpectPricefeedDaemon(daemonservertypes.MaximumAcceptableUpdateDelay(daemonFlags.Price.LoopDelayMs))
 			// Start pricefeed client for sending prices for the pricefeed server to consume. These prices
 			// are retrieved via third-party APIs like Binance and then are encoded in-memory and
 			// periodically sent via gRPC to a shared socket with the server.
@@ -637,6 +645,7 @@ func New(
 				constants.StaticExchangeDetails,
 				&pricefeedclient.SubTaskRunnerImpl{},
 			)
+			app.MonitorDaemon(app.PriceFeedClient, MaximumDaemonUnhealthyDuration)
 		}
 
 		// Start Bridge Daemon.
@@ -644,9 +653,9 @@ func New(
 		if !appFlags.NonValidatingFullNode && daemonFlags.Bridge.Enabled {
 			// TODO(CORE-582): Re-enable bridge daemon registration once the bridge daemon is fixed in local / CI
 			// environments.
-			// app.Server.ExpectBridgeDaemon(daemonservertypes.MaximumAcceptableUpdateDelay(daemonFlags.Bridge.LoopDelayMs))
+			app.BridgeClient = bridgeclient.NewClient(logger)
 			go func() {
-				app.BridgeClient = bridgeclient.NewClient(logger)
+				app.MonitorDaemon(app.BridgeClient, MaximumDaemonUnhealthyDuration)
 				if err := app.BridgeClient.Start(
 					// The client will use `context.Background` so that it can have a different context from
 					// the main application.
@@ -675,10 +684,6 @@ func New(
 				)
 			}
 		}()
-		// Don't panic if metrics daemon loops are delayed. Use maximum value.
-		app.Server.ExpectMetricsDaemon(
-			daemonservertypes.MaximumAcceptableUpdateDelay(math.MaxUint32),
-		)
 		metricsclient.Start(
 			// The client will use `context.Background` so that it can have a different context from
 			// the main application.
@@ -1222,6 +1227,31 @@ func New(
 	return app
 }
 
+// MonitorDaemon registers a daemon service with the update monitor. If the daemon does not register, the method will
+// panic.
+func (app *App) MonitorDaemon(
+	healthCheckableDaemon daemontypes.HealthCheckable,
+	maximumAcceptableUpdateDelay time.Duration,
+) {
+	if err := app.HealthMonitor.RegisterService(healthCheckableDaemon, maximumAcceptableUpdateDelay); err != nil {
+		app.Logger().Error(
+			"Failed to register daemon service with update monitor",
+			"error",
+			err,
+			"service",
+			healthCheckableDaemon.ServiceName(),
+			"maximumAcceptableUpdateDelay",
+			maximumAcceptableUpdateDelay,
+		)
+		panic(err)
+	}
+}
+
+// DisableHealthMonitorForTesting disables the health monitor for testing.
+func (app *App) DisableHealthMonitorForTesting() {
+	app.HealthMonitor.DisableForTesting()
+}
+
 // hydrateMemStores hydrates the memStores used for caching state.
 func (app *App) hydrateMemStores() {
 	// Create an `uncachedCtx` where the underlying MultiStore is the `rootMultiStore`.
diff --git a/protocol/app/app_test.go b/protocol/app/app_test.go
index 42fc06dd7c1..b845cc71b95 100644
--- a/protocol/app/app_test.go
+++ b/protocol/app/app_test.go
@@ -1,10 +1,12 @@
 package app_test
 
 import (
+	"github.com/dydxprotocol/v4-chain/protocol/mocks"
 	"gopkg.in/typ.v4/slices"
 	"reflect"
 	"strings"
 	"testing"
+	"time"
 
 	delaymsgmodule "github.com/dydxprotocol/v4-chain/protocol/x/delaymsg"
 
@@ -222,3 +224,14 @@ func TestModuleBasics(t *testing.T) {
 	actualFieldTypes := getMapFieldsAndTypes(reflect.ValueOf(basic_manager.ModuleBasics))
 	require.Equal(t, expectedFieldTypes, actualFieldTypes, "Module basics does not match expected")
 }
+
+func TestMonitorDaemon_Panics(t *testing.T) {
+	app := testapp.DefaultTestApp(nil)
+	hc := &mocks.HealthCheckable{}
+	hc.On("ServiceName").Return("test-service")
+	hc.On("HealthCheck").Return(nil)
+
+	app.MonitorDaemon(hc, 5*time.Minute)
+	// The second registration should fail, causing a panic.
+ require.Panics(t, func() { app.MonitorDaemon(hc, 5*time.Minute) }) +} diff --git a/protocol/daemons/pricefeed/client/client_integration_test.go b/protocol/daemons/pricefeed/client/client_integration_test.go index 99f67b37aa4..70e2dfd75ca 100644 --- a/protocol/daemons/pricefeed/client/client_integration_test.go +++ b/protocol/daemons/pricefeed/client/client_integration_test.go @@ -5,6 +5,7 @@ package client_test import ( "fmt" "github.com/cometbft/cometbft/libs/log" + "github.com/dydxprotocol/v4-chain/protocol/app" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" "github.com/dydxprotocol/v4-chain/protocol/daemons/flags" "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/client" @@ -219,6 +220,7 @@ type PriceDaemonIntegrationTestSuite struct { exchangeServer *pricefeed.ExchangeServer daemonServer *daemonserver.Server exchangePriceCache *pricefeedserver_types.MarketToExchangePrices + healthMonitor *servertypes.HealthMonitor pricesMockQueryServer *mocks.QueryServer pricesGrpcServer *grpc.Server @@ -278,7 +280,13 @@ func (s *PriceDaemonIntegrationTestSuite) SetupTest() { &daemontypes.FileHandlerImpl{}, s.daemonFlags.Shared.SocketAddress, ) - s.daemonServer.ExpectPricefeedDaemon(servertypes.MaximumAcceptableUpdateDelay(s.daemonFlags.Price.LoopDelayMs)) + + s.healthMonitor = servertypes.NewHealthMonitor( + servertypes.DaemonStartupGracePeriod, + servertypes.HealthCheckPollFrequency, + log.TestingLogger(), + ) + s.exchangePriceCache = pricefeedserver_types.NewMarketToExchangePrices(pricefeed_types.MaxPriceAge) s.daemonServer.WithPriceFeedMarketToExchangePrices(s.exchangePriceCache) @@ -329,6 +337,8 @@ func (s *PriceDaemonIntegrationTestSuite) startClient() { testExchangeToQueryDetails, &client.SubTaskRunnerImpl{}, ) + err := s.healthMonitor.RegisterService(s.pricefeedDaemon, app.MaximumDaemonUnhealthyDuration) + s.Require().NoError(err) } // expectPricesWithTimeout waits for the exchange price cache to contain the expected prices, with a timeout. diff --git a/protocol/daemons/server/bridge.go b/protocol/daemons/server/bridge.go index 20fc68cd1ac..632d72ddb21 100644 --- a/protocol/daemons/server/bridge.go +++ b/protocol/daemons/server/bridge.go @@ -2,9 +2,6 @@ package server import ( "context" - "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" - "time" - "github.com/dydxprotocol/v4-chain/protocol/daemons/bridge/api" bdtypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types/bridge" ) @@ -23,14 +20,6 @@ func (server *Server) WithBridgeEventManager( return server } -// ExpectBridgeDaemon registers the bridge daemon with the server. This is required -// in order to ensure that the daemon service is called at least once during every -// maximumAcceptableUpdateDelay duration. It will cause the protocol to panic if the daemon does not -// respond within maximumAcceptableUpdateDelay duration. -func (server *Server) ExpectBridgeDaemon(maximumAcceptableUpdateDelay time.Duration) { - server.registerDaemon(types.BridgeDaemonServiceName, maximumAcceptableUpdateDelay) -} - // AddBridgeEvents stores any bridge events recognized by the daemon // in a go-routine safe slice. 
func (s *Server) AddBridgeEvents( diff --git a/protocol/daemons/server/liquidation.go b/protocol/daemons/server/liquidation.go index 9eb10450515..63c8f6b1134 100644 --- a/protocol/daemons/server/liquidation.go +++ b/protocol/daemons/server/liquidation.go @@ -2,9 +2,6 @@ package server import ( "context" - "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" - "time" - "github.com/cosmos/cosmos-sdk/telemetry" "github.com/dydxprotocol/v4-chain/protocol/daemons/liquidation/api" liquidationtypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types/liquidations" @@ -26,14 +23,6 @@ func (server *Server) WithLiquidatableSubaccountIds( return server } -// ExpectLiquidationsDaemon registers the liquidations daemon with the server. This is required -// in order to ensure that the daemon service is called at least once during every -// maximumAcceptableUpdateDelay duration. It will cause the protocol to panic if the daemon does not -// respond within maximumAcceptableUpdateDelay duration. -func (server *Server) ExpectLiquidationsDaemon(maximumAcceptableUpdateDelay time.Duration) { - server.registerDaemon(types.LiquidationsDaemonServiceName, maximumAcceptableUpdateDelay) -} - // LiquidateSubaccounts stores the list of potentially liquidatable subaccount ids // in a go-routine safe slice. func (s *Server) LiquidateSubaccounts( @@ -47,12 +36,6 @@ func (s *Server) LiquidateSubaccounts( metrics.Received, metrics.Count, ) - // If the daemon is unable to report a response, there is either an error in the registration of - // this daemon, or another one. In either case, the protocol should panic. - if err := s.reportResponse(types.LiquidationsDaemonServiceName); err != nil { - s.logger.Error("Failed to report liquidations response to update monitor", "error", err) - } - s.liquidatableSubaccountIds.UpdateSubaccountIds(req.SubaccountIds) return &api.LiquidateSubaccountsResponse{}, nil } diff --git a/protocol/daemons/server/metrics.go b/protocol/daemons/server/metrics.go deleted file mode 100644 index 2ab38263829..00000000000 --- a/protocol/daemons/server/metrics.go +++ /dev/null @@ -1,15 +0,0 @@ -package server - -import ( - "time" - - "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" -) - -// ExpectMetricsDaemon registers the periodic metrics daemon with the server. This is required -// in order to ensure that the daemon service is called at least once during every -// maximumAcceptableUpdateDelay duration. It will cause the protocol to panic if the daemon does not -// respond within maximumAcceptableUpdateDelay duration. -func (server *Server) ExpectMetricsDaemon(maximumAcceptableUpdateDelay time.Duration) { - server.registerDaemon(types.MetricsDaemonServiceName, maximumAcceptableUpdateDelay) -} diff --git a/protocol/daemons/server/pricefeed.go b/protocol/daemons/server/pricefeed.go index 30f6a2e6146..0337483fc38 100644 --- a/protocol/daemons/server/pricefeed.go +++ b/protocol/daemons/server/pricefeed.go @@ -3,7 +3,6 @@ package server import ( "context" errorsmod "cosmossdk.io/errors" - servertypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" "time" gometrics "github.com/armon/go-metrics" @@ -31,14 +30,6 @@ func (server *Server) WithPriceFeedMarketToExchangePrices( return server } -// ExpectPricefeedDaemon registers the pricefeed daemon with the server. This is required -// in order to ensure that the daemon service is called at least once during every -// maximumAcceptableUpdateDelay duration. 
It will cause the protocol to panic if the daemon does not -// respond within maximumAcceptableUpdateDelay duration. -func (server *Server) ExpectPricefeedDaemon(maximumAcceptableUpdateDelay time.Duration) { - server.registerDaemon(servertypes.PricefeedDaemonServiceName, maximumAcceptableUpdateDelay) -} - // UpdateMarketPrices updates prices from exchanges for each market provided. func (s *Server) UpdateMarketPrices( ctx context.Context, @@ -52,12 +43,6 @@ func (s *Server) UpdateMarketPrices( metrics.Latency, ) - // If the daemon is unable to report a response, there is either an error in the registration of - // this daemon, or another one. In either case, the protocol should panic. - if err := s.reportResponse(servertypes.PricefeedDaemonServiceName); err != nil { - panic(err) - } - if s.marketToExchange == nil { panic( errorsmod.Wrapf( diff --git a/protocol/daemons/server/server.go b/protocol/daemons/server/server.go index d146587731c..8c3cb42618d 100644 --- a/protocol/daemons/server/server.go +++ b/protocol/daemons/server/server.go @@ -1,19 +1,14 @@ package server import ( - gometrics "github.com/armon/go-metrics" "github.com/cometbft/cometbft/libs/log" - "github.com/cosmos/cosmos-sdk/telemetry" bridgeapi "github.com/dydxprotocol/v4-chain/protocol/daemons/bridge/api" "github.com/dydxprotocol/v4-chain/protocol/daemons/constants" liquidationapi "github.com/dydxprotocol/v4-chain/protocol/daemons/liquidation/api" pricefeedapi "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/api" - "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" daemontypes "github.com/dydxprotocol/v4-chain/protocol/daemons/types" - "github.com/dydxprotocol/v4-chain/protocol/lib/metrics" "net" "syscall" - "time" ) // Server struct defines the shared gRPC server for all daemons. @@ -26,8 +21,6 @@ type Server struct { fileHandler daemontypes.FileHandler socketAddress string - updateMonitor *types.UpdateMonitor - BridgeServer PriceFeedServer LiquidationServer @@ -47,60 +40,14 @@ func NewServer( gsrv: grpcServer, fileHandler: fileHandler, socketAddress: socketAddress, - updateMonitor: types.NewUpdateFrequencyMonitor(types.DaemonStartupGracePeriod, logger), } } // Stop stops the daemon server's gRPC service. func (server *Server) Stop() { - server.updateMonitor.Stop() server.gsrv.Stop() } -// DisableUpdateMonitoringForTesting disables the update monitor for testing purposes. This is needed in integration -// tests that do not run the full protocol. -func (server *Server) DisableUpdateMonitoringForTesting() { - server.updateMonitor.DisableForTesting() -} - -// registerDaemon registers a daemon service with the update monitor. -func (server *Server) registerDaemon( - daemonKey string, - maximumAcceptableUpdateDelay time.Duration, -) { - if err := server.updateMonitor.RegisterDaemonService(daemonKey, maximumAcceptableUpdateDelay); err != nil { - server.logger.Error( - "Failed to register daemon service with update monitor", - "error", - err, - "service", - daemonKey, - "maximumAcceptableUpdateDelay", - maximumAcceptableUpdateDelay, - ) - panic(err) - } -} - -// reportResponse reports a response from a daemon service with the update monitor. This is used to -// ensure that the daemon continues to operate. If the update monitor does not see a response from a -// registered daemon within the maximumAcceptableUpdateDelay, it will cause the protocol to panic. 
-func (server *Server) reportResponse( - daemonKey string, -) error { - telemetry.IncrCounterWithLabels( - []string{ - metrics.DaemonServer, - metrics.ValidResponse, - }, - 1, - []gometrics.Label{ - metrics.GetLabelForStringValue(metrics.Daemon, daemonKey), - }, - ) - return server.updateMonitor.RegisterValidResponse(daemonKey) -} - // Start clears the current socket and establishes a new socket connection // on the local filesystem. // See URL for more information: https://eli.thegreenplace.net/2019/unix-domain-sockets-in-go/ diff --git a/protocol/daemons/server/server_test.go b/protocol/daemons/server/server_test.go index 5188409d103..5d09b53d470 100644 --- a/protocol/daemons/server/server_test.go +++ b/protocol/daemons/server/server_test.go @@ -13,7 +13,6 @@ import ( "net" "os" "testing" - "time" ) const ( @@ -159,48 +158,6 @@ func TestStart_MixedInvalid(t *testing.T) { } } -func TestRegisterDaemon_DoesNotPanic(t *testing.T) { - grpcServer := &mocks.GrpcServer{} - grpcServer.On("Stop").Return().Once() - server := server.NewServer( - log.NewNopLogger(), - grpcServer, - &mocks.FileHandler{}, - grpc.SocketPath, - ) - defer server.Stop() - - require.NotPanics(t, func() { - server.ExpectPricefeedDaemon(5 * time.Second) - }) -} - -func TestRegisterDaemon_DoubleRegistrationPanics(t *testing.T) { - grpcServer := &mocks.GrpcServer{} - grpcServer.On("Stop").Return().Once() - server := server.NewServer( - log.NewNopLogger(), - grpcServer, - &mocks.FileHandler{}, - grpc.SocketPath, - ) - defer server.Stop() - - // First registration should not panic. - require.NotPanics(t, func() { - server.ExpectPricefeedDaemon(5 * time.Second) - }) - - // Second registration should panic. - require.PanicsWithError( - t, - "service pricefeed-daemon already registered", - func() { - server.ExpectPricefeedDaemon(5 * time.Second) - }, - ) -} - func createServerWithMocks( t testing.TB, mockGrpcServer *mocks.GrpcServer, @@ -214,7 +171,6 @@ func createServerWithMocks( ) mockGrpcServer.On("Stop").Return().Once() t.Cleanup(server.Stop) - server.DisableUpdateMonitoringForTesting() return server } diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go new file mode 100644 index 00000000000..e3dd3b95e53 --- /dev/null +++ b/protocol/daemons/server/types/health_monitor.go @@ -0,0 +1,277 @@ +package types + +import ( + cosmoslog "cosmossdk.io/log" + "fmt" + "github.com/cometbft/cometbft/libs/log" + "github.com/dydxprotocol/v4-chain/protocol/daemons/types" + libtime "github.com/dydxprotocol/v4-chain/protocol/lib/time" + "sync" + "time" +) + +const ( + HealthCheckPollFrequency = 5 * time.Second +) + +// timestampWithError couples a timestamp and error to make it easier to update them in tandem. +type timestampWithError struct { + timestamp time.Time + err error +} + +func (u *timestampWithError) Update(timestamp time.Time, err error) { + u.timestamp = timestamp + u.err = err +} + +func (u *timestampWithError) Reset() { + u.Update(time.Time{}, nil) +} + +func (u *timestampWithError) IsZero() bool { + return u.timestamp.IsZero() +} + +func (u *timestampWithError) Timestamp() time.Time { + return u.timestamp +} + +func (u *timestampWithError) Error() error { + return u.err +} + +// healthChecker encapsulates the logic for monitoring the health of a health checkable service. +type healthChecker struct { + // healthCheckable is the health checkable service to be monitored. 
+ healthCheckable types.HealthCheckable + + // timer triggers a health check poll for a health checkable service. + timer *time.Timer + + // pollFrequency is the frequency at which the health checkable service is polled. + pollFrequency time.Duration + + // mostRecentSuccess is the timestamp of the most recent successful health check. + mostRecentSuccess time.Time + + // firstFailureInStreak is the timestamp of the first error in the most recent streak of errors. It is set + // whenever the service toggles from healthy to an unhealthy state, and used to determine how long the daemon has + // been unhealthy. If this timestamp is nil, then the error streak ended before it could trigger a callback. + firstFailureInStreak timestampWithError + + // unhealthyCallback is the callback function to be executed if the health checkable service remains + // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. + // This callback function is executed with the error that caused the service to become unhealthy. + unhealthyCallback func(error) + + // timeProvider is used to get the current time. It is added as a field so that it can be mocked in tests. + timeProvider libtime.TimeProvider + + // maximumAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to + // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the + // specified callback function. + maximumAcceptableUnhealthyDuration time.Duration +} + +// Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the +// maximum acceptable unhealthy duration, the callback function is executed. +// This method is publicly exposed for testing. +func (hc *healthChecker) Poll() { + // Don't return an error if the monitor has been disabled. + err := hc.healthCheckable.HealthCheck() + now := hc.timeProvider.Now() + if err == nil { + hc.mostRecentSuccess = now + // Whenever the service is healthy, reset the first failure in streak timestamp. + hc.firstFailureInStreak.Reset() + } else if hc.firstFailureInStreak.IsZero() { + // Capture the timestamp of the first failure in a new streak. + hc.firstFailureInStreak.Update(now, err) + } + + // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the + // callback function. + if !hc.firstFailureInStreak.IsZero() && + now.Sub(hc.firstFailureInStreak.Timestamp()) >= hc.maximumAcceptableUnhealthyDuration { + hc.unhealthyCallback(hc.firstFailureInStreak.Error()) + } else { + // If we do not execute the callback, schedule the next poll. + hc.timer.Reset(hc.pollFrequency) + } +} + +func (hc *healthChecker) Stop() { + hc.timer.Stop() +} + +// StartNewHealthChecker creates and starts a new health checker for a health checkable service. +func StartNewHealthChecker( + healthCheckable types.HealthCheckable, + pollFrequency time.Duration, + unhealthyCallback func(error), + timeProvider libtime.TimeProvider, + maximumAcceptableUnhealthyDuration time.Duration, + startupGracePeriod time.Duration, +) *healthChecker { + checker := &healthChecker{ + healthCheckable: healthCheckable, + pollFrequency: pollFrequency, + unhealthyCallback: unhealthyCallback, + timeProvider: timeProvider, + maximumAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, + } + // The first poll is scheduled after the startup grace period to allow the service to initialize. 
+ checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll) + + return checker +} + +// HealthMonitor monitors the health of daemon services, which implement the HealthCheckable interface. If a +// registered health-checkable service sustains an unhealthy state for the maximum acceptable unhealthy duration, +// the monitor will execute a callback function. +type HealthMonitor struct { + // serviceToHealthChecker maps daemon service names to their update metadata. + serviceToHealthChecker map[string]*healthChecker + // stopped indicates whether the monitor has been stopped. Additional daemon services cannot be registered + // after the monitor has been stopped. + stopped bool + // disabled indicates whether the monitor has been disabled. This is used to disable the monitor in testApp + // tests, where app.New is not executed. + disabled bool + // lock is used to synchronize access to the monitor. + lock sync.Mutex + + // These fields are initialized in NewHealthMonitor and are not modified after initialization. + logger log.Logger + startupGracePeriod time.Duration + pollingFrequency time.Duration +} + +// NewHealthMonitor creates a new health monitor. +func NewHealthMonitor( + startupGracePeriod time.Duration, + pollingFrequency time.Duration, + logger log.Logger, +) *HealthMonitor { + return &HealthMonitor{ + serviceToHealthChecker: make(map[string]*healthChecker), + logger: logger.With(cosmoslog.ModuleKey, "health-monitor"), + startupGracePeriod: startupGracePeriod, + pollingFrequency: pollingFrequency, + } +} + +func (hm *HealthMonitor) DisableForTesting() { + hm.lock.Lock() + defer hm.lock.Unlock() + + hm.disabled = true +} + +// RegisterServiceWithCallback registers a HealthCheckable with the health monitor. If the service +// stays unhealthy every time it is polled during the maximum acceptable unhealthy duration, the monitor will +// execute the callback function. +// This method is synchronized. The method returns an error if the service was already registered or the +// monitor has already been stopped. +func (hm *HealthMonitor) RegisterServiceWithCallback( + hc types.HealthCheckable, + maximumAcceptableUnhealthyDuration time.Duration, + callback func(error), +) error { + hm.lock.Lock() + defer hm.lock.Unlock() + + if maximumAcceptableUnhealthyDuration <= 0 { + return fmt.Errorf( + "health check registration failure for service %v: "+ + "maximum acceptable unhealthy duration %v must be positive", + hc.ServiceName(), + maximumAcceptableUnhealthyDuration, + ) + } + + // Don't register daemon services if the monitor has been disabled. + if hm.disabled { + return nil + } + + // Don't register additional daemon services if the monitor has already been stopped. + // This could be a concern for short-running integration test cases, where the network + // stops before all daemon services have been registered. + if hm.stopped { + return fmt.Errorf( + "health check registration failure for service %v: monitor has been stopped", + hc.ServiceName(), + ) + } + + if _, ok := hm.serviceToHealthChecker[hc.ServiceName()]; ok { + return fmt.Errorf("service %v already registered", hc.ServiceName()) + } + + hm.serviceToHealthChecker[hc.ServiceName()] = StartNewHealthChecker( + hc, + hm.pollingFrequency, + callback, + &libtime.TimeProviderImpl{}, + maximumAcceptableUnhealthyDuration, + hm.startupGracePeriod, + ) + return nil +} + +// PanicServiceNotResponding returns a function that panics with a message indicating that the specified daemon +// service is not responding. 
This is ideal for creating a callback function when registering a daemon service. +func PanicServiceNotResponding(hc types.HealthCheckable) func(error) { + return func(err error) { + panic(fmt.Sprintf("%v unhealthy: %v", hc.ServiceName(), err)) + } +} + +// LogErrorServiceNotResponding returns a function that logs an error indicating that the specified service +// is not responding. This is ideal for creating a callback function when registering a health checkable service. +func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) func(error) { + return func(err error) { + logger.Error( + "health-checked service is unhealthy", + "service", + hc.ServiceName(), + "error", + err, + ) + } +} + +// RegisterService registers a new health checkable service with the health check monitor. If the service +// is unhealthy every time it is polled for a duration greater than or equal to the maximum acceptable unhealthy +// duration, the monitor will panic. +// This method is synchronized. It returns an error if the service was already registered or the monitor has +// already been stopped. +func (hm *HealthMonitor) RegisterService( + hc types.HealthCheckable, + maximumAcceptableUnhealthyDuration time.Duration, +) error { + return hm.RegisterServiceWithCallback( + hc, + maximumAcceptableUnhealthyDuration, + PanicServiceNotResponding(hc), + ) +} + +// Stop stops the update frequency monitor. This method is synchronized. +func (hm *HealthMonitor) Stop() { + hm.lock.Lock() + defer hm.lock.Unlock() + + // Don't stop the monitor if it has already been stopped. + if hm.stopped { + return + } + + for _, checker := range hm.serviceToHealthChecker { + checker.Stop() + } + + hm.stopped = true +} diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go new file mode 100644 index 00000000000..8fba05764ae --- /dev/null +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -0,0 +1,318 @@ +package types_test + +import ( + "fmt" + "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" + "github.com/dydxprotocol/v4-chain/protocol/mocks" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + "testing" + "time" +) + +var ( + TestError1 = fmt.Errorf("test error 1") + TestError2 = fmt.Errorf("test error 2") + + Time0 = time.Unix(0, 0) + Time1 = Time0.Add(time.Second) + Time2 = Time0.Add(2 * time.Second) + Time3 = Time0.Add(3 * time.Second) + Time4 = Time0.Add(4 * time.Second) + Time5 = Time0.Add(5 * time.Second) + + // Use a maximum unhealthy duration of 5 seconds for testing, simulating a poll frequency of 1s with timestamps. + TestMaximumUnhealthyDuration = 5 * time.Second + + // TestLargeDuration is used to ensure that the health checker does not trigger a callback through the timer. + TestLargeDuration = 5 * time.Minute + + ZeroDuration = 0 * time.Second +) + +// createTestMonitor creates a health monitor with a poll frequency of 10ms and a zero duration grace period. +func createTestMonitor() (*types.HealthMonitor, *mocks.Logger) { + logger := &mocks.Logger{} + logger.On("With", "module", "health-monitor").Return(logger).Once() + return types.NewHealthMonitor( + ZeroDuration, + 10*time.Millisecond, + logger, + ), logger +} + +// mockFailingHealthCheckerWithError creates a mock health checkable service that returns the given error. 
+func mockFailingHealthCheckerWithError(name string, err error) *mocks.HealthCheckable { + hc := &mocks.HealthCheckable{} + hc.On("ServiceName").Return(name) + hc.On("HealthCheck").Return(err) + return hc +} + +// callbackWithErrorPointer returns a callback function and an error pointer that tracks the error passed to the +// callback. This can be used to validate that a service was considered unhealthy for the maximum allowable duration. +func callbackWithErrorPointer() (func(error), *error) { + var callbackError error + callback := func(err error) { + callbackError = err + } + return callback, &callbackError +} + +func TestHealthChecker(t *testing.T) { + tests := map[string]struct { + healthCheckResponses []error + expectedUnhealthy error + }{ + "initialized: no callback triggered": { + healthCheckResponses: []error{}, + expectedUnhealthy: nil, + }, + "healthy, then unhealthy for < maximum unhealthy duration: no callback triggered": { + healthCheckResponses: []error{ + nil, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy for < maximum unhealthy duration: no callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy, healthy, unhealthy: no callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + nil, + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy for maximum unhealthy duration: callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: TestError1, + }, + "unhealthy with multiple errors: first error returned": { + healthCheckResponses: []error{ + TestError1, + TestError2, + TestError2, + TestError2, + TestError2, + TestError2, + }, + expectedUnhealthy: TestError1, + }, + } + for name, test := range tests { + t.Run(name, func(t *testing.T) { + // Setup. + // Set up callback to track error passed to it. + callback, callbackError := callbackWithErrorPointer() + + // Set up health checkable service. + checkable := &mocks.HealthCheckable{} + for _, response := range test.healthCheckResponses { + checkable.On("HealthCheck").Return(response).Once() + } + + // Set up time provider to return a sequence of timestamps one second apart starting at Time0. + timeProvider := &mocks.TimeProvider{} + for i := range test.healthCheckResponses { + timeProvider.On("Now").Return(Time0.Add(time.Duration(i) * time.Second)).Once() + } + + healthChecker := types.StartNewHealthChecker( + checkable, + TestLargeDuration, // set to a >> value so that poll is never auto-triggered by the timer + callback, + timeProvider, + TestMaximumUnhealthyDuration, + types.DaemonStartupGracePeriod, + ) + + // Cleanup. + defer func() { + healthChecker.Stop() + }() + + // Act - simulate the health checker polling for updates. + for i := 0; i < len(test.healthCheckResponses); i++ { + healthChecker.Poll() + } + + // Assert. + // Validate the expected polls occurred according to the mocks. + checkable.AssertExpectations(t) + timeProvider.AssertExpectations(t) + + // Validate the callback was called with the expected error. 
+ if test.expectedUnhealthy == nil { + require.NoError(t, *callbackError) + } else { + require.ErrorContains(t, *callbackError, test.expectedUnhealthy.Error()) + } + }) + } +} + +// The following tests may still intermittently fail or report false positives on an overloaded system as they rely +// on callbacks to execute before the termination of the `time.Sleep` call, which is not guaranteed. +func TestRegisterService_Healthy(t *testing.T) { + // Setup. + ufm, logger := createTestMonitor() + hc := mockFailingHealthCheckerWithError("test-service", nil) + + // Act. + err := ufm.RegisterService(hc, 50*time.Millisecond) + require.NoError(t, err) + + // Cleanup. + defer func() { + ufm.Stop() + }() + + // Give the monitor time to poll the health checkable service. Polls occur once every 10ms. + time.Sleep(100 * time.Millisecond) + + // Assert: no calls to the logger were made. + mock.AssertExpectationsForObjects(t, hc, logger) +} + +func TestRegisterServiceWithCallback_Mixed(t *testing.T) { + tests := map[string]struct { + healthCheckResponse error + }{ + "healthy: no callback triggered": { + healthCheckResponse: nil, + }, + "unhealthy: callback triggered": { + healthCheckResponse: TestError1, + }, + } + for name, test := range tests { + t.Run(name, func(t *testing.T) { + // Setup. + ufm, logger := createTestMonitor() + hc := mockFailingHealthCheckerWithError("test-service", test.healthCheckResponse) + callback, callbackError := callbackWithErrorPointer() + + // Act. + err := ufm.RegisterServiceWithCallback( + hc, + 50*time.Millisecond, + callback, + ) + require.NoError(t, err) + + // Cleanup. + defer func() { + ufm.Stop() + }() + + // Give the monitor time to poll the health checkable service. Polls occur once every 10ms. + time.Sleep(100 * time.Millisecond) + + // Assert: no calls to the logger were made. + mock.AssertExpectationsForObjects(t, hc, logger) + + // Assert: the callback was called or not called as expected. + require.Equal(t, test.healthCheckResponse, *callbackError) + }) + } +} + +func TestRegisterServiceWithCallback_DoubleRegistrationFails(t *testing.T) { + // Setup. + ufm, logger := createTestMonitor() + + hc := mockFailingHealthCheckerWithError("test-service", TestError1) + hc2 := mockFailingHealthCheckerWithError("test-service", TestError2) + + callback, callbackError := callbackWithErrorPointer() + + err := ufm.RegisterServiceWithCallback(hc, 50*time.Millisecond, callback) + require.NoError(t, err) + + // Register a service with the same name. This registration should fail. + err = ufm.RegisterServiceWithCallback(hc2, 50*time.Millisecond, callback) + require.ErrorContains(t, err, "service already registered") + + // Expect that the first service is still operating and will produce a callback after a sustained unhealthy period. + time.Sleep(100 * time.Millisecond) + ufm.Stop() + + // Assert no calls to the logger were made. + mock.AssertExpectationsForObjects(t, logger, hc) + hc2.AssertNotCalled(t, "HealthCheck") + + // Assert the callback was called with the expected error. + require.Equal(t, TestError1, *callbackError) +} + +func TestRegisterService_RegistrationFailsAfterStop(t *testing.T) { + ufm, logger := createTestMonitor() + ufm.Stop() + + hc := mockFailingHealthCheckerWithError("test-service", TestError1) + err := ufm.RegisterService(hc, 50*time.Millisecond) + require.ErrorContains(t, err, "monitor has been stopped") + + // Any scheduled functions with error logs that were not cleaned up should trigger before this sleep finishes. 
+ time.Sleep(100 * time.Millisecond) + mock.AssertExpectationsForObjects(t, logger) +} + +func TestRegisterValidResponseWithCallback_NegativeUpdateDuration(t *testing.T) { + ufm, _ := createTestMonitor() + hc := mockFailingHealthCheckerWithError("test-service", TestError1) + err := ufm.RegisterServiceWithCallback(hc, -50*time.Millisecond, func(error) {}) + require.ErrorContains(t, err, "maximum acceptable unhealthy duration -50ms must be positive") +} + +func TestPanicServiceNotResponding(t *testing.T) { + panicFunc := types.PanicServiceNotResponding(&mocks.HealthCheckable{}) + require.Panics(t, func() { + panicFunc(TestError1) + }) +} + +func TestLogErrorServiceNotResponding(t *testing.T) { + logger := &mocks.Logger{} + hc := &mocks.HealthCheckable{} + + hc.On("ServiceName").Return("test-service") + logger.On( + "Error", + "health-checked service is unhealthy", + "service", + "test-service", + "error", + TestError1, + ).Return() + logFunc := types.LogErrorServiceNotResponding(hc, logger) + logFunc(TestError1) + + // Assert: the logger was called with the expected arguments. + mock.AssertExpectationsForObjects(t, logger) +} diff --git a/protocol/daemons/server/types/update_monitor.go b/protocol/daemons/server/types/update_monitor.go deleted file mode 100644 index 7c437ff0d7c..00000000000 --- a/protocol/daemons/server/types/update_monitor.go +++ /dev/null @@ -1,169 +0,0 @@ -package types - -import ( - "fmt" - "github.com/cometbft/cometbft/libs/log" - "sync" - "time" -) - -type updateMetadata struct { - timer *time.Timer - updateFrequency time.Duration -} - -// UpdateMonitor monitors the update frequency of daemon services. If a daemon service does not respond within -// the maximum acceptable update delay set when the daemon is registered, the monitor will log an error and halt the -// protocol. This was judged to be the best solution for network performance because it prevents any validator from -// participating in the network at all if a daemon service is not responding. -type UpdateMonitor struct { - // serviceToUpdateMetadata maps daemon service names to their update metadata. - serviceToUpdateMetadata map[string]updateMetadata - // stopped indicates whether the monitor has been stopped. Additional daemon services cannot be registered - // after the monitor has been stopped. - stopped bool - // disabled indicates whether the monitor has been disabled. This is used to disable the monitor in testApp - // tests, where app.New is not executed. - disabled bool - // lock is used to synchronize access to the monitor. - lock sync.Mutex - - // These fields are initialized in NewUpdateFrequencyMonitor and are not modified after initialization. - logger log.Logger - daemonStartupGracePeriod time.Duration -} - -// NewUpdateFrequencyMonitor creates a new update frequency monitor. -func NewUpdateFrequencyMonitor(daemonStartupGracePeriod time.Duration, logger log.Logger) *UpdateMonitor { - return &UpdateMonitor{ - serviceToUpdateMetadata: make(map[string]updateMetadata), - logger: logger, - daemonStartupGracePeriod: daemonStartupGracePeriod, - } -} - -func (ufm *UpdateMonitor) DisableForTesting() { - ufm.lock.Lock() - defer ufm.lock.Unlock() - - ufm.disabled = true -} - -// RegisterDaemonServiceWithCallback registers a new daemon service with the update frequency monitor. If the daemon -// service fails to respond within the maximum acceptable update delay, the monitor will execute the callback function. -// This method is synchronized. 
The method returns an error if the daemon service was already registered or the -// monitor has already been stopped. -func (ufm *UpdateMonitor) RegisterDaemonServiceWithCallback( - service string, - maximumAcceptableUpdateDelay time.Duration, - callback func(), -) error { - ufm.lock.Lock() - defer ufm.lock.Unlock() - - if maximumAcceptableUpdateDelay <= 0 { - return fmt.Errorf( - "registration failure for service %v: maximum acceptable update delay %v must be positive", - service, - maximumAcceptableUpdateDelay, - ) - } - - // Don't register daemon services if the monitor has been disabled. - if ufm.disabled { - return nil - } - - // Don't register additional daemon services if the monitor has already been stopped. - // This could be a concern for short-running integration test cases, where the network - // stops before all daemon services have been registered. - if ufm.stopped { - return fmt.Errorf("registration failure for service %v: monitor has been stopped", service) - } - - if _, ok := ufm.serviceToUpdateMetadata[service]; ok { - return fmt.Errorf("service %v already registered", service) - } - - ufm.serviceToUpdateMetadata[service] = updateMetadata{ - timer: time.AfterFunc(ufm.daemonStartupGracePeriod+maximumAcceptableUpdateDelay, callback), - updateFrequency: maximumAcceptableUpdateDelay, - } - return nil -} - -// PanicServiceNotResponding returns a function that panics with a message indicating that the specified daemon -// service is not responding. This is ideal for creating a callback function when registering a daemon service. -func PanicServiceNotResponding(service string) func() { - return func() { - panic(fmt.Sprintf("%v daemon not responding", service)) - } -} - -// LogErrorServiceNotResponding returns a function that logs an error indicating that the specified daemon service -// is not responding. This is ideal for creating a callback function when registering a daemon service. -func LogErrorServiceNotResponding(service string, logger log.Logger) func() { - return func() { - logger.Error( - "daemon not responding", - "service", - service, - ) - } -} - -// RegisterDaemonService registers a new daemon service with the update frequency monitor. If the daemon service -// fails to respond within the maximum acceptable update delay, the monitor will log an error. -// This method is synchronized. The method an error if the daemon service was already registered or the monitor has -// already been stopped. -func (ufm *UpdateMonitor) RegisterDaemonService( - service string, - maximumAcceptableUpdateDelay time.Duration, -) error { - return ufm.RegisterDaemonServiceWithCallback( - service, - maximumAcceptableUpdateDelay, - LogErrorServiceNotResponding(service, ufm.logger), - ) -} - -// Stop stops the update frequency monitor. This method is synchronized. -func (ufm *UpdateMonitor) Stop() { - ufm.lock.Lock() - defer ufm.lock.Unlock() - - // Don't stop the monitor if it has already been stopped. - if ufm.stopped { - return - } - - for _, metadata := range ufm.serviceToUpdateMetadata { - metadata.timer.Stop() - } - ufm.stopped = true -} - -// RegisterValidResponse registers a valid response from the daemon service. This will reset the timer for the -// daemon service. This method is synchronized. -func (ufm *UpdateMonitor) RegisterValidResponse(service string) error { - ufm.lock.Lock() - defer ufm.lock.Unlock() - - // Don't return an error if the monitor has been disabled. 
- if ufm.disabled { - return nil - } - - // Don't bother to reset the timer if the monitor has already been stopped. - if ufm.stopped { - return nil - } - - metadata, ok := ufm.serviceToUpdateMetadata[service] - if !ok { - return fmt.Errorf("service %v not registered", service) - } - - metadata.timer.Reset(metadata.updateFrequency) - return nil -} diff --git a/protocol/daemons/server/types/update_monitor_test.go b/protocol/daemons/server/types/update_monitor_test.go deleted file mode 100644 index c9abf1094f3..00000000000 --- a/protocol/daemons/server/types/update_monitor_test.go +++ /dev/null @@ -1,189 +0,0 @@ -package types_test - -import ( - "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" - "github.com/dydxprotocol/v4-chain/protocol/mocks" - "github.com/stretchr/testify/mock" - "github.com/stretchr/testify/require" - "sync/atomic" - "testing" - "time" -) - -var ( - zeroDuration = 0 * time.Second -) - -func createTestMonitor() (*types.UpdateMonitor, *mocks.Logger) { - logger := &mocks.Logger{} - return types.NewUpdateFrequencyMonitor(zeroDuration, logger), logger -} - -// The following tests may still intermittently fail on an overloaded system as they rely -// on `time.Sleep`, which is not guaranteed to wake up after the specified amount of time. -func TestRegisterDaemonService_Success(t *testing.T) { - ufm, logger := createTestMonitor() - err := ufm.RegisterDaemonService("test-service", 200*time.Millisecond) - require.NoError(t, err) - - // As long as responses come in before the 200ms deadline, no errors should be logged. - time.Sleep(80 * time.Millisecond) - require.NoError(t, ufm.RegisterValidResponse("test-service")) - time.Sleep(80 * time.Millisecond) - require.NoError(t, ufm.RegisterValidResponse("test-service")) - time.Sleep(80 * time.Millisecond) - - ufm.Stop() - // Assert: no calls to the logger were made. - mock.AssertExpectationsForObjects(t, logger) -} - -func TestRegisterDaemonService_SuccessfullyLogsError(t *testing.T) { - ufm, logger := createTestMonitor() - logger.On("Error", "daemon not responding", "service", "test-service").Once().Return() - err := ufm.RegisterDaemonService("test-service", 1*time.Millisecond) - require.NoError(t, err) - time.Sleep(2 * time.Millisecond) - ufm.Stop() - - // Assert: the logger was called with the expected arguments. - mock.AssertExpectationsForObjects(t, logger) -} - -func TestRegisterDaemonServiceWithCallback_Success(t *testing.T) { - callbackCalled := atomic.Bool{} - - ufm, _ := createTestMonitor() - err := ufm.RegisterDaemonServiceWithCallback("test-service", 200*time.Millisecond, func() { - callbackCalled.Store(true) - }) - require.NoError(t, err) - - // As long as responses come in before the 200ms deadline, no panic should occur. - time.Sleep(80 * time.Millisecond) - require.NoError(t, ufm.RegisterValidResponse("test-service")) - time.Sleep(80 * time.Millisecond) - require.NoError(t, ufm.RegisterValidResponse("test-service")) - time.Sleep(80 * time.Millisecond) - - require.False(t, callbackCalled.Load()) - - ufm.Stop() -} - -func TestRegisterDaemonService_DoubleRegistrationFails(t *testing.T) { - ufm, logger := createTestMonitor() - - err := ufm.RegisterDaemonService("test-service", 200*time.Millisecond) - require.NoError(t, err) - - // Register the same daemon service again. This should fail, and 50ms update frequency should be ignored. 
- err = ufm.RegisterDaemonService("test-service", 50*time.Millisecond) - require.ErrorContains(t, err, "service already registered") - - // Confirm that the original 200ms update frequency is still in effect. 50ms would have triggered an error log. - // Note there is a possibility that 200ms will still cause an error log due to the semantics of Sleep, which is - // not guaranteed to sleep for exactly the specified duration. - time.Sleep(80 * time.Millisecond) - require.NoError(t, ufm.RegisterValidResponse("test-service")) - time.Sleep(80 * time.Millisecond) - ufm.Stop() - - // Assert no calls to the logger were made. - mock.AssertExpectationsForObjects(t, logger) -} - -func TestRegisterDaemonServiceWithCallback_DoubleRegistrationFails(t *testing.T) { - // lock synchronizes callback flags and was added to avoid race test failures. - callback1Called := atomic.Bool{} - callback2Called := atomic.Bool{} - - ufm, _ := createTestMonitor() - // First registration should succeed. - err := ufm.RegisterDaemonServiceWithCallback("test-service", 200*time.Millisecond, func() { - callback1Called.Store(true) - }) - require.NoError(t, err) - - // Register the same daemon service again. This should fail, and 50ms update frequency should be ignored. - err = ufm.RegisterDaemonServiceWithCallback("test-service", 50*time.Millisecond, func() { - callback2Called.Store(true) - }) - require.ErrorContains(t, err, "service already registered") - - // Validate that the original callback is still in effect for the original 200ms update frequency. - // The 50ms update frequency should have invoked a callback if it were applied. - time.Sleep(80 * time.Millisecond) - require.False(t, callback1Called.Load()) - require.False(t, callback2Called.Load()) - - // Validate no issues with RegisterValidResponse after a double registration was attempted. - require.NoError(t, ufm.RegisterValidResponse("test-service")) - - // Sleep until the callback should be called. - time.Sleep(250 * time.Millisecond) - require.True(t, callback1Called.Load()) - require.False(t, callback2Called.Load()) - - ufm.Stop() -} - -func TestRegisterDaemonService_RegistrationFailsAfterStop(t *testing.T) { - ufm, logger := createTestMonitor() - ufm.Stop() - err := ufm.RegisterDaemonService("test-service", 50*time.Millisecond) - require.ErrorContains(t, err, "monitor has been stopped") - - // Any scheduled functions with error logs that were not cleaned up should trigger before this sleep finishes. - time.Sleep(100 * time.Millisecond) - mock.AssertExpectationsForObjects(t, logger) -} - -func TestRegisterDaemonServiceWithCallback_RegistrationFailsAfterStop(t *testing.T) { - ufm, _ := createTestMonitor() - ufm.Stop() - - callbackCalled := atomic.Bool{} - - // Registering a daemon service with a callback should fail after the monitor has been stopped. - err := ufm.RegisterDaemonServiceWithCallback("test-service", 50*time.Millisecond, func() { - callbackCalled.Store(true) - }) - require.ErrorContains(t, err, "monitor has been stopped") - - // Wait until after the callback duration has expired. The callback should not be called. - time.Sleep(75 * time.Millisecond) - - // Validate that the callback was not called. 
- require.False(t, callbackCalled.Load()) -} - -func TestRegisterValidResponse_NegativeUpdateDelay(t *testing.T) { - ufm, logger := createTestMonitor() - err := ufm.RegisterDaemonService("test-service", -50*time.Millisecond) - require.ErrorContains(t, err, "update delay -50ms must be positive") - - // Sanity check: no calls to the logger should have been made. - mock.AssertExpectationsForObjects(t, logger) -} - -func TestRegisterValidResponseWithCallback_NegativeUpdateDelay(t *testing.T) { - ufm, _ := createTestMonitor() - err := ufm.RegisterDaemonServiceWithCallback("test-service", -50*time.Millisecond, func() {}) - require.ErrorContains(t, err, "update delay -50ms must be positive") -} - -func TestPanicServiceNotResponding(t *testing.T) { - panicFunc := types.PanicServiceNotResponding("test-service") - require.Panics(t, panicFunc) -} - -func TestLogErrorServiceNotResponding(t *testing.T) { - logger := &mocks.Logger{} - logger.On("Error", "daemon not responding", "service", "test-service").Return() - logFunc := types.LogErrorServiceNotResponding("test-service", logger) - logFunc() - - // Assert: the logger was called with the expected arguments. - mock.AssertExpectationsForObjects(t, logger) -} diff --git a/protocol/daemons/server/types/util.go b/protocol/daemons/server/types/util.go deleted file mode 100644 index 59c07dcb6fe..00000000000 --- a/protocol/daemons/server/types/util.go +++ /dev/null @@ -1,9 +0,0 @@ -package types - -import "time" - -// MaximumAcceptableUpdateDelay computes the maximum acceptable update delay for a daemon service as a -// multiple of the loop delay. -func MaximumAcceptableUpdateDelay(loopDelayMs uint32) time.Duration { - return MaximumLoopDelayMultiple * time.Duration(loopDelayMs) * time.Millisecond -} diff --git a/protocol/daemons/server/types/util_test.go b/protocol/daemons/server/types/util_test.go deleted file mode 100644 index 497faf19dd9..00000000000 --- a/protocol/daemons/server/types/util_test.go +++ /dev/null @@ -1,14 +0,0 @@ -package types - -import ( - "github.com/stretchr/testify/require" - "testing" - "time" -) - -func TestMaximumAcceptableUpdateDelay(t *testing.T) { - loopDelayMs := uint32(1000) - expected := time.Duration(MaximumLoopDelayMultiple*loopDelayMs) * time.Millisecond - actual := MaximumAcceptableUpdateDelay(loopDelayMs) - require.Equal(t, expected, actual) -} diff --git a/protocol/daemons/types/health_checkable.go b/protocol/daemons/types/health_checkable.go index da79405685e..fd5c2246b08 100644 --- a/protocol/daemons/types/health_checkable.go +++ b/protocol/daemons/types/health_checkable.go @@ -22,6 +22,8 @@ type HealthCheckable interface { ReportFailure(err error) // ReportSuccess records a successful update. ReportSuccess() + // ServiceName returns the name of the service being monitored. This name is expected to be unique. + ServiceName() string } // timestampWithError couples a timestamp and error to make it easier to update them in tandem. @@ -65,6 +67,9 @@ type timeBoundedHealthCheckable struct { // logger is the logger used to log errors. logger log.Logger + + // serviceName is the name of the service being monitored. This field is read-only and not synchronized. + serviceName string } // NewTimeBoundedHealthCheckable creates a new HealthCheckable instance. @@ -76,12 +81,18 @@ func NewTimeBoundedHealthCheckable( hc := &timeBoundedHealthCheckable{ timeProvider: timeProvider, logger: logger, + serviceName: serviceName, } // Initialize the timeBoudnedHealthCheckable to an unhealthy state by reporting an error. 
hc.ReportFailure(fmt.Errorf("%v is initializing", serviceName)) return hc } +// ServiceName returns the name of the service being monitored. +func (hc *timeBoundedHealthCheckable) ServiceName() string { + return hc.serviceName +} + // ReportSuccess records a successful update. This method is thread-safe. func (h *timeBoundedHealthCheckable) ReportSuccess() { h.Lock() diff --git a/protocol/mocks/HealthCheckable.go b/protocol/mocks/HealthCheckable.go new file mode 100644 index 00000000000..4d69fd74b2b --- /dev/null +++ b/protocol/mocks/HealthCheckable.go @@ -0,0 +1,63 @@ +// Code generated by mockery v2.14.0. DO NOT EDIT. + +package mocks + +import mock "github.com/stretchr/testify/mock" + +// HealthCheckable is an autogenerated mock type for the HealthCheckable type +type HealthCheckable struct { + mock.Mock +} + +// HealthCheck provides a mock function with given fields: +func (_m *HealthCheckable) HealthCheck() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// ReportFailure provides a mock function with given fields: err +func (_m *HealthCheckable) ReportFailure(err error) { + _m.Called(err) +} + +// ReportSuccess provides a mock function with given fields: +func (_m *HealthCheckable) ReportSuccess() { + _m.Called() +} + +// ServiceName provides a mock function with given fields: +func (_m *HealthCheckable) ServiceName() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +type mockConstructorTestingTNewHealthCheckable interface { + mock.TestingT + Cleanup(func()) +} + +// NewHealthCheckable creates a new instance of HealthCheckable. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. 
+func NewHealthCheckable(t mockConstructorTestingTNewHealthCheckable) *HealthCheckable { + mock := &HealthCheckable{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/protocol/mocks/Makefile b/protocol/mocks/Makefile index f7c549814c2..b47321d1bf3 100644 --- a/protocol/mocks/Makefile +++ b/protocol/mocks/Makefile @@ -21,6 +21,7 @@ mock-gen: @go run github.com/vektra/mockery/v2 --name=AppOptions --dir=$(GOPATH)/pkg/mod/github.com/dydxprotocol/cosmos-sdk@$(COSMOS_VERSION)/server/types --recursive --output=./mocks @go run github.com/vektra/mockery/v2 --name=Logger --dir=$(GOPATH)/pkg/mod/github.com/dydxprotocol/cometbft@$(COMETBFT_VERSION)/libs/log --filename=logger.go --recursive --output=./mocks @go run github.com/vektra/mockery/v2 --name=MsgRouter --dir=./lib --recursive --output=./mocks + @go run github.com/vektra/mockery/v2 --name=HealthCheckable --dir=./daemons/types --recursive --output=./mocks @go run github.com/vektra/mockery/v2 --name=PrepareBridgeKeeper --dir=./app/prepare --recursive --output=./mocks @go run github.com/vektra/mockery/v2 --name=PrepareClobKeeper --dir=./app/prepare --recursive --output=./mocks @go run github.com/vektra/mockery/v2 --name=PreparePerpetualsKeeper --dir=./app/prepare --recursive --output=./mocks diff --git a/protocol/testutil/app/app.go b/protocol/testutil/app/app.go index c228da42d45..fa902d41bf7 100644 --- a/protocol/testutil/app/app.go +++ b/protocol/testutil/app/app.go @@ -490,7 +490,7 @@ func (tApp *TestApp) initChainIfNeeded() { } if tApp.builder.usesDefaultAppConfig { - tApp.App.Server.DisableUpdateMonitoringForTesting() + tApp.App.DisableHealthMonitorForTesting() } if tApp.builder.enableNonDeterminismChecks { From bfbad7dbf26ad238832bad7db8e9c2c90ea89e1f Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Mon, 27 Nov 2023 16:04:29 -0800 Subject: [PATCH 02/15] PR Comments. --- protocol/app/app.go | 8 ++++---- protocol/daemons/server/bridge.go | 8 ++------ protocol/daemons/server/liquidation.go | 2 ++ protocol/daemons/server/pricefeed.go | 12 +++++++----- protocol/daemons/server/server.go | 17 +++++++++++++++++ protocol/daemons/server/types/health_monitor.go | 5 ++--- .../daemons/server/types/health_monitor_test.go | 2 +- 7 files changed, 35 insertions(+), 19 deletions(-) diff --git a/protocol/app/app.go b/protocol/app/app.go index f1e256ed87e..6a5e53b418d 100644 --- a/protocol/app/app.go +++ b/protocol/app/app.go @@ -295,7 +295,7 @@ type App struct { LiquidationsClient *liquidationclient.Client BridgeClient *bridgeclient.Client - HealthMonitor *daemonservertypes.HealthMonitor + DaemonHealthMonitor *daemonservertypes.HealthMonitor } // assertAppPreconditions assert invariants required for an application to start. 
@@ -595,7 +595,7 @@ func New( bridgeEventManager := bridgedaemontypes.NewBridgeEventManager(timeProvider) app.Server.WithBridgeEventManager(bridgeEventManager) - app.HealthMonitor = daemonservertypes.NewHealthMonitor( + app.DaemonHealthMonitor = daemonservertypes.NewHealthMonitor( daemonservertypes.DaemonStartupGracePeriod, daemonservertypes.HealthCheckPollFrequency, app.Logger(), @@ -1233,7 +1233,7 @@ func (app *App) MonitorDaemon( healthCheckableDaemon daemontypes.HealthCheckable, maximumAcceptableUpdateDelay time.Duration, ) { - if err := app.HealthMonitor.RegisterService(healthCheckableDaemon, maximumAcceptableUpdateDelay); err != nil { + if err := app.DaemonHealthMonitor.RegisterService(healthCheckableDaemon, maximumAcceptableUpdateDelay); err != nil { app.Logger().Error( "Failed to register daemon service with update monitor", "error", @@ -1249,7 +1249,7 @@ func (app *App) MonitorDaemon( // DisableHealthMonitorForTesting disables the health monitor for testing. func (app *App) DisableHealthMonitorForTesting() { - app.HealthMonitor.DisableForTesting() + app.DaemonHealthMonitor.DisableForTesting() } // hydrateMemStores hydrates the memStores used for caching state. diff --git a/protocol/daemons/server/bridge.go b/protocol/daemons/server/bridge.go index 632d72ddb21..a4d0de5683c 100644 --- a/protocol/daemons/server/bridge.go +++ b/protocol/daemons/server/bridge.go @@ -3,6 +3,7 @@ package server import ( "context" "github.com/dydxprotocol/v4-chain/protocol/daemons/bridge/api" + "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" bdtypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types/bridge" ) @@ -29,12 +30,7 @@ func (s *Server) AddBridgeEvents( *api.AddBridgeEventsResponse, error, ) { - // If the daemon is unable to report a response, there is either an error in the registration of - // this daemon, or another one. In either case, the protocol should panic. - // TODO(CORE-582): Re-enable this check once the bridge daemon is fixed in local / CI environments. 
- //if err := s.reportResponse(types.BridgeDaemonServiceName); err != nil { - // panic(err) - //} + s.reportResponse(types.BridgeDaemonServiceName) if err := s.bridgeEventManager.AddBridgeEvents(req.BridgeEvents); err != nil { return nil, err } diff --git a/protocol/daemons/server/liquidation.go b/protocol/daemons/server/liquidation.go index 63c8f6b1134..7cf8d351392 100644 --- a/protocol/daemons/server/liquidation.go +++ b/protocol/daemons/server/liquidation.go @@ -4,6 +4,7 @@ import ( "context" "github.com/cosmos/cosmos-sdk/telemetry" "github.com/dydxprotocol/v4-chain/protocol/daemons/liquidation/api" + "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" liquidationtypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types/liquidations" "github.com/dydxprotocol/v4-chain/protocol/lib/metrics" ) @@ -36,6 +37,7 @@ func (s *Server) LiquidateSubaccounts( metrics.Received, metrics.Count, ) + s.reportResponse(types.LiquidationsDaemonServiceName) s.liquidatableSubaccountIds.UpdateSubaccountIds(req.SubaccountIds) return &api.LiquidateSubaccountsResponse{}, nil } diff --git a/protocol/daemons/server/pricefeed.go b/protocol/daemons/server/pricefeed.go index 0337483fc38..1d43f984168 100644 --- a/protocol/daemons/server/pricefeed.go +++ b/protocol/daemons/server/pricefeed.go @@ -3,6 +3,7 @@ package server import ( "context" errorsmod "cosmossdk.io/errors" + "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" "time" gometrics "github.com/armon/go-metrics" @@ -11,7 +12,7 @@ import ( "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/api" pricefeedmetrics "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/metrics" pricefeedtypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types/pricefeed" - "github.com/dydxprotocol/v4-chain/protocol/daemons/types" + daemontypes "github.com/dydxprotocol/v4-chain/protocol/daemons/types" "github.com/dydxprotocol/v4-chain/protocol/lib/metrics" ) @@ -35,6 +36,7 @@ func (s *Server) UpdateMarketPrices( ctx context.Context, req *api.UpdateMarketPricesRequest, ) (*api.UpdateMarketPricesResponse, error) { + s.reportResponse(types.PricefeedDaemonServiceName) // Measure latency in ingesting and handling gRPC price update. defer telemetry.ModuleMeasureSince( metrics.PricefeedServer, @@ -46,7 +48,7 @@ func (s *Server) UpdateMarketPrices( if s.marketToExchange == nil { panic( errorsmod.Wrapf( - types.ErrServerNotInitializedCorrectly, + daemontypes.ErrServerNotInitializedCorrectly, "MarketToExchange not initialized", ), ) @@ -65,7 +67,7 @@ func (s *Server) UpdateMarketPrices( // validateMarketPricesUpdatesMessage validates a `UpdateMarketPricesRequest`. 
func validateMarketPricesUpdatesMessage(req *api.UpdateMarketPricesRequest) error { if len(req.MarketPriceUpdates) == 0 { - return types.ErrPriceFeedMarketPriceUpdateEmpty + return daemontypes.ErrPriceFeedMarketPriceUpdateEmpty } for _, mpu := range req.MarketPriceUpdates { @@ -95,7 +97,7 @@ func validateMarketPriceUpdate(mpu *api.MarketPriceUpdate) error { for _, ep := range mpu.ExchangePrices { if ep.Price == constants.DefaultPrice { return generateSdkErrorForExchangePrice( - types.ErrPriceFeedInvalidPrice, + daemontypes.ErrPriceFeedInvalidPrice, ep, mpu.MarketId, ) @@ -103,7 +105,7 @@ func validateMarketPriceUpdate(mpu *api.MarketPriceUpdate) error { if ep.LastUpdateTime == nil { return generateSdkErrorForExchangePrice( - types.ErrPriceFeedLastUpdateTimeNotSet, + daemontypes.ErrPriceFeedLastUpdateTimeNotSet, ep, mpu.MarketId, ) diff --git a/protocol/daemons/server/server.go b/protocol/daemons/server/server.go index 8c3cb42618d..28ef9b8fa8b 100644 --- a/protocol/daemons/server/server.go +++ b/protocol/daemons/server/server.go @@ -1,12 +1,15 @@ package server import ( + gometrics "github.com/armon/go-metrics" "github.com/cometbft/cometbft/libs/log" + "github.com/cosmos/cosmos-sdk/telemetry" bridgeapi "github.com/dydxprotocol/v4-chain/protocol/daemons/bridge/api" "github.com/dydxprotocol/v4-chain/protocol/daemons/constants" liquidationapi "github.com/dydxprotocol/v4-chain/protocol/daemons/liquidation/api" pricefeedapi "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/api" daemontypes "github.com/dydxprotocol/v4-chain/protocol/daemons/types" + "github.com/dydxprotocol/v4-chain/protocol/lib/metrics" "net" "syscall" ) @@ -48,6 +51,20 @@ func (server *Server) Stop() { server.gsrv.Stop() } +// reportResponse reports a response from a daemon service for metrics collection purposes. +func (server *Server) reportResponse(daemonKey string) { + telemetry.IncrCounterWithLabels( + []string{ + metrics.DaemonServer, + metrics.ValidResponse, + }, + 1, + []gometrics.Label{ + metrics.GetLabelForStringValue(metrics.Daemon, daemonKey), + }, + ) +} + // Start clears the current socket and establishes a new socket connection // on the local filesystem. // See URL for more information: https://eli.thegreenplace.net/2019/unix-domain-sockets-in-go/ diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index e3dd3b95e53..771af6f833c 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -95,10 +95,9 @@ func (hc *healthChecker) Poll() { if !hc.firstFailureInStreak.IsZero() && now.Sub(hc.firstFailureInStreak.Timestamp()) >= hc.maximumAcceptableUnhealthyDuration { hc.unhealthyCallback(hc.firstFailureInStreak.Error()) - } else { - // If we do not execute the callback, schedule the next poll. - hc.timer.Reset(hc.pollFrequency) } + // Schedule the next poll. 
+ hc.timer.Reset(hc.pollFrequency) } func (hc *healthChecker) Stop() { diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index 8fba05764ae..487cdd3368f 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -283,7 +283,7 @@ func TestRegisterService_RegistrationFailsAfterStop(t *testing.T) { mock.AssertExpectationsForObjects(t, logger) } -func TestRegisterValidResponseWithCallback_NegativeUpdateDuration(t *testing.T) { +func TestRegisterValidResponseWithCallback_NegativeUnhealthyDuration(t *testing.T) { ufm, _ := createTestMonitor() hc := mockFailingHealthCheckerWithError("test-service", TestError1) err := ufm.RegisterServiceWithCallback(hc, -50*time.Millisecond, func(error) {}) From d39f6c56c1dacae001331de83d0ba2e6dc6c03ae Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Mon, 27 Nov 2023 16:38:26 -0800 Subject: [PATCH 03/15] Add comment. --- protocol/daemons/server/types/health_monitor.go | 1 + 1 file changed, 1 insertion(+) diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index 771af6f833c..f412411849f 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -11,6 +11,7 @@ import ( ) const ( + // HealthCheckPollFrequency is the frequency at which the health checkable service is polled. HealthCheckPollFrequency = 5 * time.Second ) From bc90892f2fdbf187f700b1161dff02dd8d9a7c81 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Mon, 27 Nov 2023 16:52:23 -0800 Subject: [PATCH 04/15] Add mutex to health checker. --- .../daemons/server/types/health_monitor.go | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index f412411849f..b8ab2bc8813 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -44,41 +44,53 @@ func (u *timestampWithError) Error() error { // healthChecker encapsulates the logic for monitoring the health of a health checkable service. type healthChecker struct { + // The following fields are initialized in StartNewHealthChecker and are not modified after initialization. // healthCheckable is the health checkable service to be monitored. healthCheckable types.HealthCheckable - // timer triggers a health check poll for a health checkable service. - timer *time.Timer - // pollFrequency is the frequency at which the health checkable service is polled. pollFrequency time.Duration + // maximumAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to + // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the + // specified callback function. + maximumAcceptableUnhealthyDuration time.Duration + + // unhealthyCallback is the callback function to be executed if the health checkable service remains + // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. + // This callback function is executed with the error that caused the service to become unhealthy. + unhealthyCallback func(error) + + // lock is used to synchronize access to the health checker's dynamically updated fields. 
+ lock sync.Mutex + + // The following fields are dynamically updated by the health checker: + // timer triggers a health check poll for a health checkable service. + // Access to the timer is synchronized. + timer *time.Timer + // mostRecentSuccess is the timestamp of the most recent successful health check. + // Access to mostRecentSuccess is synchronized. mostRecentSuccess time.Time // firstFailureInStreak is the timestamp of the first error in the most recent streak of errors. It is set // whenever the service toggles from healthy to an unhealthy state, and used to determine how long the daemon has // been unhealthy. If this timestamp is nil, then the error streak ended before it could trigger a callback. + // Access to firstFailureInStreak is synchronized. firstFailureInStreak timestampWithError - // unhealthyCallback is the callback function to be executed if the health checkable service remains - // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. - // This callback function is executed with the error that caused the service to become unhealthy. - unhealthyCallback func(error) - // timeProvider is used to get the current time. It is added as a field so that it can be mocked in tests. + // Access to timeProvider is synchronized. timeProvider libtime.TimeProvider - - // maximumAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to - // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the - // specified callback function. - maximumAcceptableUnhealthyDuration time.Duration } // Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the // maximum acceptable unhealthy duration, the callback function is executed. -// This method is publicly exposed for testing. +// This method is publicly exposed for testing. This method is synchronized. func (hc *healthChecker) Poll() { + hc.lock.Lock() + defer hc.lock.Unlock() + // Don't return an error if the monitor has been disabled. err := hc.healthCheckable.HealthCheck() now := hc.timeProvider.Now() @@ -101,7 +113,11 @@ func (hc *healthChecker) Poll() { hc.timer.Reset(hc.pollFrequency) } +// Stop stops the health checker. This method is synchronized. func (hc *healthChecker) Stop() { + hc.lock.Lock() + defer hc.lock.Unlock() + hc.timer.Stop() } From 857d7042de32e53d0cf38bd7325fe2ea8d9892d1 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Tue, 28 Nov 2023 15:59:53 -0800 Subject: [PATCH 05/15] PR comments. --- protocol/app/app.go | 3 + protocol/app/app_test.go | 6 +- protocol/daemons/metrics/client/client.go | 3 + .../daemons/server/types/health_checker.go | 152 ++++++++++++++++++ .../server/types/health_checker_test.go | 127 +++++++++++++++ .../daemons/server/types/health_monitor.go | 141 ++-------------- .../server/types/health_monitor_test.go | 152 ++++-------------- protocol/daemons/server/types/stoppable.go | 7 + 8 files changed, 341 insertions(+), 250 deletions(-) create mode 100644 protocol/daemons/server/types/health_checker.go create mode 100644 protocol/daemons/server/types/health_checker_test.go create mode 100644 protocol/daemons/server/types/stoppable.go diff --git a/protocol/app/app.go b/protocol/app/app.go index 6a5e53b418d..ee947db019b 100644 --- a/protocol/app/app.go +++ b/protocol/app/app.go @@ -672,6 +672,9 @@ func New( // Start the Metrics Daemon. // The metrics daemon is purely used for observability. 
It should never bring the app down. // TODO(CLOB-960) Don't start this goroutine if telemetry is disabled + // Note: the metrics daemon is such a simple go-routine that we don't bother implementing a health-check + // for this service. The task loop does not produce any errors because the telemetry calls themselves are + // not error-returning, so in effect this daemon would never become unhealthy. go func() { defer func() { if r := recover(); r != nil { diff --git a/protocol/app/app_test.go b/protocol/app/app_test.go index b845cc71b95..6503d9c4190 100644 --- a/protocol/app/app_test.go +++ b/protocol/app/app_test.go @@ -233,5 +233,9 @@ func TestMonitorDaemon_Panics(t *testing.T) { app.MonitorDaemon(hc, 5*time.Minute) // The second registration should fail, causing a panic. - require.Panics(t, func() { app.MonitorDaemon(hc, 5*time.Minute) }) + require.PanicsWithError( + t, + "service test-service already registered", + func() { app.MonitorDaemon(hc, 5*time.Minute) }, + ) } diff --git a/protocol/daemons/metrics/client/client.go b/protocol/daemons/metrics/client/client.go index a714a637132..5e86debedda 100644 --- a/protocol/daemons/metrics/client/client.go +++ b/protocol/daemons/metrics/client/client.go @@ -19,6 +19,9 @@ var ( // Start begins a job that periodically: // 1) Emits metrics about app version and git commit. +// Note: the metrics daemon is such a simple go-routine that we don't bother implementing a health-check +// for this service. The task loop does not produce any errors because the telemetry calls themselves are +// not error-returning, so in effect this daemon would never become unhealthy. func Start( ctx context.Context, logger log.Logger, diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go new file mode 100644 index 00000000000..ef465566ada --- /dev/null +++ b/protocol/daemons/server/types/health_checker.go @@ -0,0 +1,152 @@ +package types + +import ( + "github.com/cometbft/cometbft/libs/log" + "github.com/dydxprotocol/v4-chain/protocol/daemons/types" + libtime "github.com/dydxprotocol/v4-chain/protocol/lib/time" + "sync" + "time" +) + +// timestampWithError couples a timestamp and error to make it easier to update them in tandem. The +// timestampWithError will track the timestamp of the first error in a streak of errors, but keeps a record of the +// most recent error. This is useful for determining how long a service has been unhealthy, and the current state +// of the service. +type timestampWithError struct { + timestamp time.Time + err error +} + +func (u *timestampWithError) Update(timestamp time.Time, err error) { + // If the timestamp is zero, this is the first update, so set the timestamp. + if u.timestamp.IsZero() { + u.timestamp = timestamp + } + u.err = err +} + +func (u *timestampWithError) Reset() { + u.timestamp = time.Time{} + u.err = nil +} + +func (u *timestampWithError) IsZero() bool { + return u.timestamp.IsZero() && u.err == nil +} + +func (u *timestampWithError) Timestamp() time.Time { + return u.timestamp +} + +func (u *timestampWithError) Error() error { + return u.err +} + +// healthChecker encapsulates the logic for monitoring the health of a health checkable service. +type healthChecker struct { + // The following fields are initialized in StartNewHealthChecker and are not modified after initialization. + // healthCheckable is the health checkable service to be monitored. 
+ healthCheckable types.HealthCheckable + + // pollFrequency is the frequency at which the health checkable service is polled. + pollFrequency time.Duration + + // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to + // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the + // specified callback function. + maxAcceptableUnhealthyDuration time.Duration + + // unhealthyCallback is the callback function to be executed if the health checkable service remains + // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. + // This callback function is executed with the error that caused the service to become unhealthy. + unhealthyCallback func(error) + + logger log.Logger + + // lock is used to synchronize access to the health checker's dynamically updated fields. + lock sync.Mutex + + // The following fields are dynamically updated by the health checker: + // timer triggers a health check poll for a health checkable service. + // Access to the timer is synchronized. + timer *time.Timer + + // mostRecentSuccess is the timestamp of the most recent successful health check. + // Access to mostRecentSuccess is synchronized. + mostRecentSuccess time.Time + + // mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well + // as the most recent error. It is updated on every error and reset every time the service sees a healthy response. + // This field is used to determine how long the daemon has been unhealthy. If this timestamp is nil, then either + // the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback. + // Access to mostRecentFailureStreakError is synchronized. + mostRecentFailureStreakError timestampWithError + + // timeProvider is used to get the current time. It is added as a field so that it can be mocked in tests. + // Access to timeProvider is synchronized. + timeProvider libtime.TimeProvider +} + +// Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the +// maximum acceptable unhealthy duration, the callback function is executed. +// This method is publicly exposed for testing. This method is synchronized. +func (hc *healthChecker) Poll() { + hc.lock.Lock() + defer hc.lock.Unlock() + + err := hc.healthCheckable.HealthCheck() + now := hc.timeProvider.Now() + + // Schedule the next poll + defer hc.timer.Reset(hc.pollFrequency) + + // Capture healthy response. + if err == nil { + hc.mostRecentSuccess = now + // Whenever the service is healthy, reset the first failure in streak timestamp. + hc.mostRecentFailureStreakError.Reset() + return + } + + hc.mostRecentFailureStreakError.Update(now, err) + + // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the + // callback function. + streakDuration := now.Sub(hc.mostRecentFailureStreakError.Timestamp()) + if !hc.mostRecentFailureStreakError.IsZero() && + streakDuration >= hc.maxAcceptableUnhealthyDuration { + hc.unhealthyCallback(hc.mostRecentFailureStreakError.Error()) + } +} + +// Stop stops the health checker. This method is synchronized. +func (hc *healthChecker) Stop() { + hc.lock.Lock() + defer hc.lock.Unlock() + + hc.timer.Stop() +} + +// StartNewHealthChecker creates and starts a new health checker for a health checkable service. 
+func StartNewHealthChecker( + healthCheckable types.HealthCheckable, + pollFrequency time.Duration, + unhealthyCallback func(error), + timeProvider libtime.TimeProvider, + maximumAcceptableUnhealthyDuration time.Duration, + startupGracePeriod time.Duration, + logger log.Logger, +) *healthChecker { + checker := &healthChecker{ + healthCheckable: healthCheckable, + pollFrequency: pollFrequency, + unhealthyCallback: unhealthyCallback, + timeProvider: timeProvider, + maxAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, + logger: logger, + } + // The first poll is scheduled after the startup grace period to allow the service to initialize. + checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll) + + return checker +} diff --git a/protocol/daemons/server/types/health_checker_test.go b/protocol/daemons/server/types/health_checker_test.go new file mode 100644 index 00000000000..fc37b356bcf --- /dev/null +++ b/protocol/daemons/server/types/health_checker_test.go @@ -0,0 +1,127 @@ +package types_test + +import ( + "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" + "github.com/dydxprotocol/v4-chain/protocol/mocks" + "github.com/stretchr/testify/require" + "testing" + "time" +) + +func TestHealthChecker(t *testing.T) { + tests := map[string]struct { + healthCheckResponses []error + expectedUnhealthy error + }{ + "initialized: no callback triggered": { + healthCheckResponses: []error{}, + expectedUnhealthy: nil, + }, + "healthy, then unhealthy for < maximum unhealthy duration: no callback triggered": { + healthCheckResponses: []error{ + nil, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy for < maximum unhealthy duration: no callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy, healthy, unhealthy: no callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + nil, + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy for maximum unhealthy duration: callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: TestError1, + }, + "unhealthy with multiple errors: most recent error returned": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + TestError2, + }, + expectedUnhealthy: TestError2, + }, + } + for name, test := range tests { + t.Run(name, func(t *testing.T) { + // Setup. + // Set up callback to track error passed to it. + callback, callbackError := callbackWithErrorPointer() + + // Set up health checkable service. + checkable := &mocks.HealthCheckable{} + for _, response := range test.healthCheckResponses { + checkable.On("HealthCheck").Return(response).Once() + } + + // Set up time provider to return a sequence of timestamps one second apart starting at Time0. + timeProvider := &mocks.TimeProvider{} + for i := range test.healthCheckResponses { + timeProvider.On("Now").Return(Time0.Add(time.Duration(i) * time.Second)).Once() + } + + healthChecker := types.StartNewHealthChecker( + checkable, + TestLargeDuration, // set to a >> value so that poll is never auto-triggered by the timer + callback, + timeProvider, + TestMaximumUnhealthyDuration, + types.DaemonStartupGracePeriod, + &mocks.Logger{}, + ) + + // Cleanup. 
+ defer func() { + healthChecker.Stop() + }() + + // Act - simulate the health checker polling for updates. + for i := 0; i < len(test.healthCheckResponses); i++ { + healthChecker.Poll() + } + + // Assert. + // Validate the expected polls occurred according to the mocks. + checkable.AssertExpectations(t) + timeProvider.AssertExpectations(t) + + // Validate the callback was called with the expected error. + if test.expectedUnhealthy == nil { + require.NoError(t, *callbackError) + } else { + require.ErrorContains(t, *callbackError, test.expectedUnhealthy.Error()) + } + }) + } +} diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index b8ab2bc8813..28fcd34eccf 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -15,134 +15,6 @@ const ( HealthCheckPollFrequency = 5 * time.Second ) -// timestampWithError couples a timestamp and error to make it easier to update them in tandem. -type timestampWithError struct { - timestamp time.Time - err error -} - -func (u *timestampWithError) Update(timestamp time.Time, err error) { - u.timestamp = timestamp - u.err = err -} - -func (u *timestampWithError) Reset() { - u.Update(time.Time{}, nil) -} - -func (u *timestampWithError) IsZero() bool { - return u.timestamp.IsZero() -} - -func (u *timestampWithError) Timestamp() time.Time { - return u.timestamp -} - -func (u *timestampWithError) Error() error { - return u.err -} - -// healthChecker encapsulates the logic for monitoring the health of a health checkable service. -type healthChecker struct { - // The following fields are initialized in StartNewHealthChecker and are not modified after initialization. - // healthCheckable is the health checkable service to be monitored. - healthCheckable types.HealthCheckable - - // pollFrequency is the frequency at which the health checkable service is polled. - pollFrequency time.Duration - - // maximumAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to - // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the - // specified callback function. - maximumAcceptableUnhealthyDuration time.Duration - - // unhealthyCallback is the callback function to be executed if the health checkable service remains - // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. - // This callback function is executed with the error that caused the service to become unhealthy. - unhealthyCallback func(error) - - // lock is used to synchronize access to the health checker's dynamically updated fields. - lock sync.Mutex - - // The following fields are dynamically updated by the health checker: - // timer triggers a health check poll for a health checkable service. - // Access to the timer is synchronized. - timer *time.Timer - - // mostRecentSuccess is the timestamp of the most recent successful health check. - // Access to mostRecentSuccess is synchronized. - mostRecentSuccess time.Time - - // firstFailureInStreak is the timestamp of the first error in the most recent streak of errors. It is set - // whenever the service toggles from healthy to an unhealthy state, and used to determine how long the daemon has - // been unhealthy. If this timestamp is nil, then the error streak ended before it could trigger a callback. - // Access to firstFailureInStreak is synchronized. 
- firstFailureInStreak timestampWithError - - // timeProvider is used to get the current time. It is added as a field so that it can be mocked in tests. - // Access to timeProvider is synchronized. - timeProvider libtime.TimeProvider -} - -// Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the -// maximum acceptable unhealthy duration, the callback function is executed. -// This method is publicly exposed for testing. This method is synchronized. -func (hc *healthChecker) Poll() { - hc.lock.Lock() - defer hc.lock.Unlock() - - // Don't return an error if the monitor has been disabled. - err := hc.healthCheckable.HealthCheck() - now := hc.timeProvider.Now() - if err == nil { - hc.mostRecentSuccess = now - // Whenever the service is healthy, reset the first failure in streak timestamp. - hc.firstFailureInStreak.Reset() - } else if hc.firstFailureInStreak.IsZero() { - // Capture the timestamp of the first failure in a new streak. - hc.firstFailureInStreak.Update(now, err) - } - - // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the - // callback function. - if !hc.firstFailureInStreak.IsZero() && - now.Sub(hc.firstFailureInStreak.Timestamp()) >= hc.maximumAcceptableUnhealthyDuration { - hc.unhealthyCallback(hc.firstFailureInStreak.Error()) - } - // Schedule the next poll. - hc.timer.Reset(hc.pollFrequency) -} - -// Stop stops the health checker. This method is synchronized. -func (hc *healthChecker) Stop() { - hc.lock.Lock() - defer hc.lock.Unlock() - - hc.timer.Stop() -} - -// StartNewHealthChecker creates and starts a new health checker for a health checkable service. -func StartNewHealthChecker( - healthCheckable types.HealthCheckable, - pollFrequency time.Duration, - unhealthyCallback func(error), - timeProvider libtime.TimeProvider, - maximumAcceptableUnhealthyDuration time.Duration, - startupGracePeriod time.Duration, -) *healthChecker { - checker := &healthChecker{ - healthCheckable: healthCheckable, - pollFrequency: pollFrequency, - unhealthyCallback: unhealthyCallback, - timeProvider: timeProvider, - maximumAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, - } - // The first poll is scheduled after the startup grace period to allow the service to initialize. - checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll) - - return checker -} - // HealthMonitor monitors the health of daemon services, which implement the HealthCheckable interface. If a // registered health-checkable service sustains an unhealthy state for the maximum acceptable unhealthy duration, // the monitor will execute a callback function. @@ -189,7 +61,8 @@ func (hm *HealthMonitor) DisableForTesting() { // stays unhealthy every time it is polled during the maximum acceptable unhealthy duration, the monitor will // execute the callback function. // This method is synchronized. The method returns an error if the service was already registered or the -// monitor has already been stopped. +// monitor has already been stopped. If the monitor has been stopped, this method will proactively stop the +// health-checkable service before returning. 
func (hm *HealthMonitor) RegisterServiceWithCallback( hc types.HealthCheckable, maximumAcceptableUnhealthyDuration time.Duration, @@ -216,6 +89,12 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( // This could be a concern for short-running integration test cases, where the network // stops before all daemon services have been registered. if hm.stopped { + // If the service is stoppable, stop it. This helps us to clean up daemon services in test cases + // where the monitor is stopped before all daemon services have been registered. + if stoppable, ok := hc.(Stoppable); ok { + stoppable.Stop() + } + return fmt.Errorf( "health check registration failure for service %v: monitor has been stopped", hc.ServiceName(), @@ -233,6 +112,7 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( &libtime.TimeProviderImpl{}, maximumAcceptableUnhealthyDuration, hm.startupGracePeriod, + hm.logger, ) return nil } @@ -263,7 +143,8 @@ func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) f // is unhealthy every time it is polled for a duration greater than or equal to the maximum acceptable unhealthy // duration, the monitor will panic. // This method is synchronized. It returns an error if the service was already registered or the monitor has -// already been stopped. +// already been stopped. If the monitor has been stopped, this method will proactively stop the health-checkable +// service before returning. func (hm *HealthMonitor) RegisterService( hc types.HealthCheckable, maximumAcceptableUnhealthyDuration time.Duration, diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index 487cdd3368f..228988a201b 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -3,6 +3,7 @@ package types_test import ( "fmt" "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" + daemontypes "github.com/dydxprotocol/v4-chain/protocol/daemons/types" "github.com/dydxprotocol/v4-chain/protocol/mocks" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" @@ -59,123 +60,6 @@ func callbackWithErrorPointer() (func(error), *error) { return callback, &callbackError } -func TestHealthChecker(t *testing.T) { - tests := map[string]struct { - healthCheckResponses []error - expectedUnhealthy error - }{ - "initialized: no callback triggered": { - healthCheckResponses: []error{}, - expectedUnhealthy: nil, - }, - "healthy, then unhealthy for < maximum unhealthy duration: no callback triggered": { - healthCheckResponses: []error{ - nil, - TestError1, - }, - expectedUnhealthy: nil, - }, - "unhealthy for < maximum unhealthy duration: no callback triggered": { - healthCheckResponses: []error{ - TestError1, - TestError1, - TestError1, - TestError1, - TestError1, - }, - expectedUnhealthy: nil, - }, - "unhealthy, healthy, unhealthy: no callback triggered": { - healthCheckResponses: []error{ - TestError1, - TestError1, - TestError1, - TestError1, - TestError1, - nil, - TestError1, - TestError1, - TestError1, - TestError1, - TestError1, - }, - expectedUnhealthy: nil, - }, - "unhealthy for maximum unhealthy duration: callback triggered": { - healthCheckResponses: []error{ - TestError1, - TestError1, - TestError1, - TestError1, - TestError1, - TestError1, - }, - expectedUnhealthy: TestError1, - }, - "unhealthy with multiple errors: first error returned": { - healthCheckResponses: []error{ - TestError1, - TestError2, - TestError2, - TestError2, - 
TestError2, - TestError2, - }, - expectedUnhealthy: TestError1, - }, - } - for name, test := range tests { - t.Run(name, func(t *testing.T) { - // Setup. - // Set up callback to track error passed to it. - callback, callbackError := callbackWithErrorPointer() - - // Set up health checkable service. - checkable := &mocks.HealthCheckable{} - for _, response := range test.healthCheckResponses { - checkable.On("HealthCheck").Return(response).Once() - } - - // Set up time provider to return a sequence of timestamps one second apart starting at Time0. - timeProvider := &mocks.TimeProvider{} - for i := range test.healthCheckResponses { - timeProvider.On("Now").Return(Time0.Add(time.Duration(i) * time.Second)).Once() - } - - healthChecker := types.StartNewHealthChecker( - checkable, - TestLargeDuration, // set to a >> value so that poll is never auto-triggered by the timer - callback, - timeProvider, - TestMaximumUnhealthyDuration, - types.DaemonStartupGracePeriod, - ) - - // Cleanup. - defer func() { - healthChecker.Stop() - }() - - // Act - simulate the health checker polling for updates. - for i := 0; i < len(test.healthCheckResponses); i++ { - healthChecker.Poll() - } - - // Assert. - // Validate the expected polls occurred according to the mocks. - checkable.AssertExpectations(t) - timeProvider.AssertExpectations(t) - - // Validate the callback was called with the expected error. - if test.expectedUnhealthy == nil { - require.NoError(t, *callbackError) - } else { - require.ErrorContains(t, *callbackError, test.expectedUnhealthy.Error()) - } - }) - } -} - // The following tests may still intermittently fail or report false positives on an overloaded system as they rely // on callbacks to execute before the termination of the `time.Sleep` call, which is not guaranteed. func TestRegisterService_Healthy(t *testing.T) { @@ -270,17 +154,47 @@ func TestRegisterServiceWithCallback_DoubleRegistrationFails(t *testing.T) { require.Equal(t, TestError1, *callbackError) } +// Create a struct that implements HealthCheckable and Stoppable to check that the monitor stops the service. +type stoppableFakeHealthChecker struct { + stopped bool +} + +// Implement stub methods to conform to interfaces. +func (f *stoppableFakeHealthChecker) ServiceName() string { return "test-service" } +func (f *stoppableFakeHealthChecker) HealthCheck() error { return nil } +func (f *stoppableFakeHealthChecker) ReportSuccess() {} +func (f *stoppableFakeHealthChecker) ReportFailure(_ error) {} + +// Stop stub tracks whether the service was stopped. +func (f *stoppableFakeHealthChecker) Stop() { + f.stopped = true +} + +var _ types.Stoppable = (*stoppableFakeHealthChecker)(nil) +var _ daemontypes.HealthCheckable = (*stoppableFakeHealthChecker)(nil) + func TestRegisterService_RegistrationFailsAfterStop(t *testing.T) { ufm, logger := createTestMonitor() ufm.Stop() - hc := mockFailingHealthCheckerWithError("test-service", TestError1) - err := ufm.RegisterService(hc, 50*time.Millisecond) + stoppableHc := &stoppableFakeHealthChecker{} + hc2 := mockFailingHealthCheckerWithError("test-service-2", TestError1) + + // Register a stoppable service. + err := ufm.RegisterService(stoppableHc, 50*time.Millisecond) + require.ErrorContains(t, err, "monitor has been stopped") + + // Register a non-stoppable service. + err = ufm.RegisterService(hc2, 50*time.Millisecond) require.ErrorContains(t, err, "monitor has been stopped") // Any scheduled functions with error logs that were not cleaned up should trigger before this sleep finishes. 
time.Sleep(100 * time.Millisecond) mock.AssertExpectationsForObjects(t, logger) + + // Assert that the monitor proactively stops any stoppable service that was registered after the monitor was + // stopped. + require.True(t, stoppableHc.stopped) } func TestRegisterValidResponseWithCallback_NegativeUnhealthyDuration(t *testing.T) { diff --git a/protocol/daemons/server/types/stoppable.go b/protocol/daemons/server/types/stoppable.go new file mode 100644 index 00000000000..fda69d34a88 --- /dev/null +++ b/protocol/daemons/server/types/stoppable.go @@ -0,0 +1,7 @@ +package types + +// Stoppable is an interface for a service that can be stopped. +// This is used to stop services registered with the health monitor. +type Stoppable interface { + Stop() +} From 86fa362f68b2fc4eb0a412dea58ad4e12049b2c6 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Wed, 29 Nov 2023 12:31:58 -0800 Subject: [PATCH 06/15] PR comments. Remove superfluous methods. Comment nit. Method name fix. --- protocol/daemons/server/bridge.go | 15 +- protocol/daemons/server/liquidation.go | 14 +- protocol/daemons/server/pricefeed.go | 16 +- protocol/daemons/server/server.go | 4 +- .../daemons/server/types/health_checker.go | 111 +++++++----- .../daemons/server/types/health_monitor.go | 161 +++++++++++------- .../server/types/health_monitor_test.go | 22 +-- 7 files changed, 211 insertions(+), 132 deletions(-) diff --git a/protocol/daemons/server/bridge.go b/protocol/daemons/server/bridge.go index a4d0de5683c..ad76af6066e 100644 --- a/protocol/daemons/server/bridge.go +++ b/protocol/daemons/server/bridge.go @@ -27,11 +27,18 @@ func (s *Server) AddBridgeEvents( ctx context.Context, req *api.AddBridgeEventsRequest, ) ( - *api.AddBridgeEventsResponse, - error, + response *api.AddBridgeEventsResponse, + err error, ) { - s.reportResponse(types.BridgeDaemonServiceName) - if err := s.bridgeEventManager.AddBridgeEvents(req.BridgeEvents); err != nil { + // Capture valid responses in metrics. + defer func() { + if err == nil { + s.reportValidResponse(types.BridgeDaemonServiceName) + } + }() + + s.reportValidResponse(types.BridgeDaemonServiceName) + if err = s.bridgeEventManager.AddBridgeEvents(req.BridgeEvents); err != nil { return nil, err } return &api.AddBridgeEventsResponse{}, nil diff --git a/protocol/daemons/server/liquidation.go b/protocol/daemons/server/liquidation.go index 7cf8d351392..f418b6c251f 100644 --- a/protocol/daemons/server/liquidation.go +++ b/protocol/daemons/server/liquidation.go @@ -29,7 +29,17 @@ func (server *Server) WithLiquidatableSubaccountIds( func (s *Server) LiquidateSubaccounts( ctx context.Context, req *api.LiquidateSubaccountsRequest, -) (*api.LiquidateSubaccountsResponse, error) { +) ( + response *api.LiquidateSubaccountsResponse, + err error, +) { + // Capture valid responses in metrics.
+ defer func() { + if err == nil { + s.reportValidResponse(types.LiquidationsDaemonServiceName) + } + }() + telemetry.ModuleSetGauge( metrics.LiquidationDaemon, float32(len(req.SubaccountIds)), @@ -37,7 +47,7 @@ func (s *Server) LiquidateSubaccounts( metrics.Received, metrics.Count, ) - s.reportResponse(types.LiquidationsDaemonServiceName) + s.liquidatableSubaccountIds.UpdateSubaccountIds(req.SubaccountIds) return &api.LiquidateSubaccountsResponse{}, nil } diff --git a/protocol/daemons/server/pricefeed.go b/protocol/daemons/server/pricefeed.go index 1d43f984168..b4ef56df4cf 100644 --- a/protocol/daemons/server/pricefeed.go +++ b/protocol/daemons/server/pricefeed.go @@ -35,8 +35,17 @@ func (server *Server) WithPriceFeedMarketToExchangePrices( func (s *Server) UpdateMarketPrices( ctx context.Context, req *api.UpdateMarketPricesRequest, -) (*api.UpdateMarketPricesResponse, error) { - s.reportResponse(types.PricefeedDaemonServiceName) +) ( + response *api.UpdateMarketPricesResponse, + err error, +) { + // Capture valid responses in metrics. + defer func() { + if err == nil { + s.reportValidResponse(types.PricefeedDaemonServiceName) + } + }() + // Measure latency in ingesting and handling gRPC price update. defer telemetry.ModuleMeasureSince( metrics.PricefeedServer, @@ -54,13 +63,14 @@ func (s *Server) UpdateMarketPrices( ) } - if err := validateMarketPricesUpdatesMessage(req); err != nil { + if err = validateMarketPricesUpdatesMessage(req); err != nil { // Log if failure occurs during an update. s.logger.Error("Failed to validate price update message", "error", err) return nil, err } s.marketToExchange.UpdatePrices(req.MarketPriceUpdates) + return &api.UpdateMarketPricesResponse{}, nil } diff --git a/protocol/daemons/server/server.go b/protocol/daemons/server/server.go index 28ef9b8fa8b..60a34c1552c 100644 --- a/protocol/daemons/server/server.go +++ b/protocol/daemons/server/server.go @@ -51,8 +51,8 @@ func (server *Server) Stop() { server.gsrv.Stop() } -// reportResponse reports a response from a daemon service for metrics collection purposes. -func (server *Server) reportResponse(daemonKey string) { +// reportValidResponse reports a valid request/response from a daemon service for metrics collection purposes. +func (server *Server) reportValidResponse(daemonKey string) { telemetry.IncrCounterWithLabels( []string{ metrics.DaemonServer, diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index ef465566ada..352b0813e8c 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -10,8 +10,8 @@ import ( // timestampWithError couples a timestamp and error to make it easier to update them in tandem. The // timestampWithError will track the timestamp of the first error in a streak of errors, but keeps a record of the -// most recent error. This is useful for determining how long a service has been unhealthy, and the current state -// of the service. +// most recent error. This is useful for determining how long a service has been unhealthy, as well as the current +// state of the service. type timestampWithError struct { timestamp time.Time err error @@ -42,15 +42,61 @@ func (u *timestampWithError) Error() error { return u.err } +// healthCheckerMutableState tracks the current health state of the HealthCheckable, encapsulating all mutable state +// into a single struct for ease of synchronization.
+type healthCheckerMutableState struct { + // lock is used to synchronize access to mutable state fields. + lock sync.Mutex + + // mostRecentSuccess is the timestamp of the most recent successful health check. + // Access to mostRecentSuccess is synchronized. + mostRecentSuccess time.Time + + // mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well + // as the most recent error. It is updated on every error and reset every time the service sees a healthy response. + // This field is used to determine how long the daemon has been unhealthy. If this timestamp is nil, then either + // the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback. + // Access to mostRecentFailureStreakError is synchronized. + mostRecentFailureStreakError timestampWithError +} + +// ReportSuccess updates the health checker's mutable state to reflect a successful health check and schedules the next +// poll as an atomic operation. +func (u *healthCheckerMutableState) ReportSuccess(now time.Time) { + u.lock.Lock() + defer u.lock.Unlock() + + u.mostRecentSuccess = now + + // Whenever the service is healthy, reset the first failure in streak timestamp. + u.mostRecentFailureStreakError.Reset() +} + +// ReportFailure updates the health checker's mutable state to reflect a failed health check and schedules the next +// poll as an atomic operation. The method returns the duration of the current failure streak. +func (u *healthCheckerMutableState) ReportFailure(now time.Time, err error) time.Duration { + u.lock.Lock() + defer u.lock.Unlock() + + u.mostRecentFailureStreakError.Update(now, err) + + return now.Sub(u.mostRecentFailureStreakError.Timestamp()) +} + // healthChecker encapsulates the logic for monitoring the health of a health checkable service. type healthChecker struct { - // The following fields are initialized in StartNewHealthChecker and are not modified after initialization. + // mutableState is the mutable state of the health checker. Access to these fields is synchronized. + mutableState *healthCheckerMutableState + // healthCheckable is the health checkable service to be monitored. healthCheckable types.HealthCheckable // pollFrequency is the frequency at which the health checkable service is polled. pollFrequency time.Duration + // timer triggers a health check poll for a health checkable service. + timer *time.Timer + // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the // specified callback function. @@ -61,69 +107,38 @@ type healthChecker struct { // This callback function is executed with the error that caused the service to become unhealthy. unhealthyCallback func(error) - logger log.Logger - - // lock is used to synchronize access to the health checker's dynamically updated fields. - lock sync.Mutex - - // The following fields are dynamically updated by the health checker: - // timer triggers a health check poll for a health checkable service. - // Access to the timer is synchronized. - timer *time.Timer - - // mostRecentSuccess is the timestamp of the most recent successful health check. - // Access to mostRecentSuccess is synchronized. - mostRecentSuccess time.Time - - // mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well - // as the most recent error. 
It is updated on every error and reset every time the service sees a healthy response. - // This field is used to determine how long the daemon has been unhealthy. If this timestamp is nil, then either - // the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback. - // Access to mostRecentFailureStreakError is synchronized. - mostRecentFailureStreakError timestampWithError - // timeProvider is used to get the current time. It is added as a field so that it can be mocked in tests. - // Access to timeProvider is synchronized. timeProvider libtime.TimeProvider + + logger log.Logger } // Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the // maximum acceptable unhealthy duration, the callback function is executed. // This method is publicly exposed for testing. This method is synchronized. func (hc *healthChecker) Poll() { - hc.lock.Lock() - defer hc.lock.Unlock() - err := hc.healthCheckable.HealthCheck() now := hc.timeProvider.Now() - // Schedule the next poll - defer hc.timer.Reset(hc.pollFrequency) - // Capture healthy response. + if err == nil { - hc.mostRecentSuccess = now - // Whenever the service is healthy, reset the first failure in streak timestamp. - hc.mostRecentFailureStreakError.Reset() - return + hc.mutableState.ReportSuccess(now) + } else { + streakDuration := hc.mutableState.ReportFailure(now, err) + // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the + // callback function. + if streakDuration >= hc.maxAcceptableUnhealthyDuration { + hc.unhealthyCallback(err) + } } - hc.mostRecentFailureStreakError.Update(now, err) - - // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the - // callback function. - streakDuration := now.Sub(hc.mostRecentFailureStreakError.Timestamp()) - if !hc.mostRecentFailureStreakError.IsZero() && - streakDuration >= hc.maxAcceptableUnhealthyDuration { - hc.unhealthyCallback(hc.mostRecentFailureStreakError.Error()) - } + // Schedule next poll. + hc.timer.Reset(hc.pollFrequency) } // Stop stops the health checker. This method is synchronized. func (hc *healthChecker) Stop() { - hc.lock.Lock() - defer hc.lock.Unlock() - hc.timer.Stop() } @@ -144,7 +159,9 @@ func StartNewHealthChecker( timeProvider: timeProvider, maxAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, logger: logger, + mutableState: &healthCheckerMutableState{}, } + // The first poll is scheduled after the startup grace period to allow the service to initialize. checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll) diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index 28fcd34eccf..f248a306685 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -15,10 +15,11 @@ const ( HealthCheckPollFrequency = 5 * time.Second ) -// HealthMonitor monitors the health of daemon services, which implement the HealthCheckable interface. If a -// registered health-checkable service sustains an unhealthy state for the maximum acceptable unhealthy duration, -// the monitor will execute a callback function. -type HealthMonitor struct { +// healthMonitorMutableState tracks all mutable state associated with the health monitor. This state is gathered into +// a single struct for ease of synchronization. 
+type healthMonitorMutableState struct { + sync.Mutex + // serviceToHealthChecker maps daemon service names to their update metadata. serviceToHealthChecker map[string]*healthChecker // stopped indicates whether the monitor has been stopped. Additional daemon services cannot be registered @@ -27,8 +28,82 @@ type HealthMonitor struct { // disabled indicates whether the monitor has been disabled. This is used to disable the monitor in testApp // tests, where app.New is not executed. disabled bool - // lock is used to synchronize access to the monitor. - lock sync.Mutex +} + +// NewHealthMonitorMutableState creates a new health monitor mutable state. +func NewHealthMonitorMutableState() *healthMonitorMutableState { + return &healthMonitorMutableState{ + serviceToHealthChecker: make(map[string]*healthChecker), + } +} + +// DisableForTesting disables the health monitor mutable state from receiving updates. This prevents the monitor +// from registering services when called before app initialization and is used for testing. +func (ms *healthMonitorMutableState) DisableForTesting() { + ms.Lock() + defer ms.Unlock() + + ms.disabled = true +} + +// Stop stops the update frequency monitor. This method is synchronized. +func (ms *healthMonitorMutableState) Stop() { + ms.Lock() + defer ms.Unlock() + + // Don't stop the monitor if it has already been stopped. + if ms.stopped { + return + } + + // Stop all health checkers. + for _, checker := range ms.serviceToHealthChecker { + checker.Stop() + } + + ms.stopped = true +} + +// RegisterHealthChecker registers a new health checker for a health checkable with the health monitor. The health +// checker is lazily created using the provided function if needed. This method is synchronized. It returns an error if +// the service was already registered. +func (ms *healthMonitorMutableState) RegisterHealthChecker( + checkable types.HealthCheckable, + lazyHealthCheckerCreator func() *healthChecker, +) error { + ms.Lock() + defer ms.Unlock() + + // Don't register daemon services if the monitor has been disabled. + if ms.disabled { + return nil + } + + // Don't register additional daemon services if the monitor has already been stopped. + // This could be a concern for short-running integration test cases, where the network + // stops before all daemon services have been registered. + if ms.stopped { + // If the service is stoppable, stop it. This helps us to clean up daemon services in test cases + // where the monitor is stopped before all daemon services have been registered. + if stoppable, ok := checkable.(Stoppable); ok { + stoppable.Stop() + } + return nil + } + + if _, ok := ms.serviceToHealthChecker[checkable.ServiceName()]; ok { + return fmt.Errorf("service %v already registered", checkable.ServiceName()) + } + + ms.serviceToHealthChecker[checkable.ServiceName()] = lazyHealthCheckerCreator() + return nil +} + +// HealthMonitor monitors the health of daemon services, which implement the HealthCheckable interface. If a +// registered health-checkable service sustains an unhealthy state for the maximum acceptable unhealthy duration, +// the monitor will execute a callback function. +type HealthMonitor struct { + mutableState *healthMonitorMutableState // These fields are initialized in NewHealthMonitor and are not modified after initialization. 
logger log.Logger @@ -43,18 +118,15 @@ func NewHealthMonitor( logger log.Logger, ) *HealthMonitor { return &HealthMonitor{ - serviceToHealthChecker: make(map[string]*healthChecker), - logger: logger.With(cosmoslog.ModuleKey, "health-monitor"), - startupGracePeriod: startupGracePeriod, - pollingFrequency: pollingFrequency, + mutableState: NewHealthMonitorMutableState(), + logger: logger.With(cosmoslog.ModuleKey, "health-monitor"), + startupGracePeriod: startupGracePeriod, + pollingFrequency: pollingFrequency, } } func (hm *HealthMonitor) DisableForTesting() { - hm.lock.Lock() - defer hm.lock.Unlock() - - hm.disabled = true + hm.mutableState.DisableForTesting() } // RegisterServiceWithCallback registers a HealthCheckable with the health monitor. If the service @@ -68,9 +140,6 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( maximumAcceptableUnhealthyDuration time.Duration, callback func(error), ) error { - hm.lock.Lock() - defer hm.lock.Unlock() - if maximumAcceptableUnhealthyDuration <= 0 { return fmt.Errorf( "health check registration failure for service %v: "+ @@ -80,41 +149,17 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( ) } - // Don't register daemon services if the monitor has been disabled. - if hm.disabled { - return nil - } - - // Don't register additional daemon services if the monitor has already been stopped. - // This could be a concern for short-running integration test cases, where the network - // stops before all daemon services have been registered. - if hm.stopped { - // If the service is stoppable, stop it. This helps us to clean up daemon services in test cases - // where the monitor is stopped before all daemon services have been registered. - if stoppable, ok := hc.(Stoppable); ok { - stoppable.Stop() - } - - return fmt.Errorf( - "health check registration failure for service %v: monitor has been stopped", - hc.ServiceName(), + return hm.mutableState.RegisterHealthChecker(hc, func() *healthChecker { + return StartNewHealthChecker( + hc, + hm.pollingFrequency, + callback, + &libtime.TimeProviderImpl{}, + maximumAcceptableUnhealthyDuration, + hm.startupGracePeriod, + hm.logger, ) - } - - if _, ok := hm.serviceToHealthChecker[hc.ServiceName()]; ok { - return fmt.Errorf("service %v already registered", hc.ServiceName()) - } - - hm.serviceToHealthChecker[hc.ServiceName()] = StartNewHealthChecker( - hc, - hm.pollingFrequency, - callback, - &libtime.TimeProviderImpl{}, - maximumAcceptableUnhealthyDuration, - hm.startupGracePeriod, - hm.logger, - ) - return nil + }) } // PanicServiceNotResponding returns a function that panics with a message indicating that the specified daemon @@ -158,17 +203,5 @@ func (hm *HealthMonitor) RegisterService( // Stop stops the update frequency monitor. This method is synchronized. func (hm *HealthMonitor) Stop() { - hm.lock.Lock() - defer hm.lock.Unlock() - - // Don't stop the monitor if it has already been stopped. - if hm.stopped { - return - } - - for _, checker := range hm.serviceToHealthChecker { - checker.Stop() - } - - hm.stopped = true + hm.mutableState.Stop() } diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index 228988a201b..82a0893e6ef 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -161,7 +161,7 @@ type stoppableFakeHealthChecker struct { // Implement stub methods to conform to interfaces. 
func (f *stoppableFakeHealthChecker) ServiceName() string { return "test-service" } -func (f *stoppableFakeHealthChecker) HealthCheck() error { return nil } +func (f *stoppableFakeHealthChecker) HealthCheck() error { return fmt.Errorf("unhealthy") } func (f *stoppableFakeHealthChecker) ReportSuccess() {} func (f *stoppableFakeHealthChecker) ReportFailure(_ error) {} @@ -174,23 +174,25 @@ var _ types.Stoppable = (*stoppableFakeHealthChecker)(nil) var _ daemontypes.HealthCheckable = (*stoppableFakeHealthChecker)(nil) func TestRegisterService_RegistrationFailsAfterStop(t *testing.T) { - ufm, logger := createTestMonitor() + ufm, _ := createTestMonitor() ufm.Stop() stoppableHc := &stoppableFakeHealthChecker{} hc2 := mockFailingHealthCheckerWithError("test-service-2", TestError1) - // Register a stoppable service. - err := ufm.RegisterService(stoppableHc, 50*time.Millisecond) - require.ErrorContains(t, err, "monitor has been stopped") + // Register unhealthy services. These services are confirmed to trigger a panic if registered when the monitor is + // not stopped. + // Register a stoppable unhealthy service. + err := ufm.RegisterService(stoppableHc, 10*time.Millisecond) + require.Nil(t, err) - // Register a non-stoppable service. - err = ufm.RegisterService(hc2, 50*time.Millisecond) - require.ErrorContains(t, err, "monitor has been stopped") + // Register a non-stoppable unhealthy service. + err = ufm.RegisterService(hc2, 10*time.Millisecond) + require.Nil(t, err) - // Any scheduled functions with error logs that were not cleaned up should trigger before this sleep finishes. + // Since the max allowable unhealthy duration is 10ms, and the polling period is 10ms, 100ms is long enough to wait + // in order to trigger a panic if a service is polled. time.Sleep(100 * time.Millisecond) - mock.AssertExpectationsForObjects(t, logger) // Assert that the monitor proactively stops any stoppable service that was registered after the monitor was // stopped. From 7eb8519fcd9e7d255b28840e4d1399da18013f92 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Wed, 29 Nov 2023 15:48:03 -0800 Subject: [PATCH 07/15] Comment fixes, renames, reduce critical section for synchronized code. --- .../daemons/server/types/health_checker.go | 18 ++++----- .../daemons/server/types/health_monitor.go | 37 +++++++++++++------ 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 352b0813e8c..2f42c95cdda 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -83,26 +83,26 @@ func (u *healthCheckerMutableState) ReportFailure(now time.Time, err error) time return now.Sub(u.mostRecentFailureStreakError.Timestamp()) } -// healthChecker encapsulates the logic for monitoring the health of a health checkable service. +// healthChecker encapsulates the logic for monitoring the health of a health-checkable service. type healthChecker struct { // mutableState is the mutable state of the health checker. Access to these fields is synchronized. mutableState *healthCheckerMutableState - // healthCheckable is the health checkable service to be monitored. + // healthCheckable is the health-checkable service to be monitored. healthCheckable types.HealthCheckable - // pollFrequency is the frequency at which the health checkable service is polled. + // pollFrequency is the frequency at which the health-checkable service is polled. 
pollFrequency time.Duration - // timer triggers a health check poll for a health checkable service. + // timer triggers a health check poll for a health-checkable service. timer *time.Timer - // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to + // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health-checkable service to // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the // specified callback function. maxAcceptableUnhealthyDuration time.Duration - // unhealthyCallback is the callback function to be executed if the health checkable service remains + // unhealthyCallback is the callback function to be executed if the health-checkable service remains // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. // This callback function is executed with the error that caused the service to become unhealthy. unhealthyCallback func(error) @@ -113,7 +113,7 @@ type healthChecker struct { logger log.Logger } -// Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the +// Poll executes a health check for the health-checkable service. If the service has been unhealthy for longer than the // maximum acceptable unhealthy duration, the callback function is executed. // This method is publicly exposed for testing. This method is synchronized. func (hc *healthChecker) Poll() { @@ -137,12 +137,12 @@ func (hc *healthChecker) Poll() { hc.timer.Reset(hc.pollFrequency) } -// Stop stops the health checker. This method is synchronized. +// Stop stops the health checker. This method is not synchronized, as the timer does not need synchronization. func (hc *healthChecker) Stop() { hc.timer.Stop() } -// StartNewHealthChecker creates and starts a new health checker for a health checkable service. +// StartNewHealthChecker creates and starts a new health checker for a health-checkable service. func StartNewHealthChecker( healthCheckable types.HealthCheckable, pollFrequency time.Duration, diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index f248a306685..1cff5f5b0af 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -11,7 +11,7 @@ import ( ) const ( - // HealthCheckPollFrequency is the frequency at which the health checkable service is polled. + // HealthCheckPollFrequency is the frequency at which the health-checkable service is polled. HealthCheckPollFrequency = 5 * time.Second ) @@ -30,8 +30,8 @@ type healthMonitorMutableState struct { disabled bool } -// NewHealthMonitorMutableState creates a new health monitor mutable state. -func NewHealthMonitorMutableState() *healthMonitorMutableState { +// newHealthMonitorMutableState creates a new health monitor mutable state. +func newHealthMonitorMutableState() *healthMonitorMutableState { return &healthMonitorMutableState{ serviceToHealthChecker: make(map[string]*healthChecker), } @@ -64,13 +64,29 @@ func (ms *healthMonitorMutableState) Stop() { ms.stopped = true } -// RegisterHealthChecker registers a new health checker for a health checkable with the health monitor. The health +// RegisterHealthChecker registers a new health checker for a health-checkable with the health monitor. The health // checker is lazily created using the provided function if needed. This method is synchronized. 
It returns an error if // the service was already registered. func (ms *healthMonitorMutableState) RegisterHealthChecker( checkable types.HealthCheckable, lazyHealthCheckerCreator func() *healthChecker, ) error { + stopService := false + + // If the monitor has already been stopped, we want to stop the checkable service before returning. + // However, we'd prefer not to stop the service within the critical section in order to prevent deadlocks. + // This defer will be called last, after the lock is released. + defer func() { + if stopService { + // If the service is stoppable, stop it. This helps us to clean up daemon services in test cases + // where the monitor is stopped before all daemon services have been registered. + if stoppable, ok := checkable.(Stoppable); ok { + stoppable.Stop() + } + } + }() + + // Enter into the critical section. ms.Lock() defer ms.Unlock() @@ -83,11 +99,8 @@ func (ms *healthMonitorMutableState) RegisterHealthChecker( // This could be a concern for short-running integration test cases, where the network // stops before all daemon services have been registered. if ms.stopped { - // If the service is stoppable, stop it. This helps us to clean up daemon services in test cases - // where the monitor is stopped before all daemon services have been registered. - if stoppable, ok := checkable.(Stoppable); ok { - stoppable.Stop() - } + // Toggle the stopService flag to true so that the service is stopped after the lock is released. + stopService = true return nil } @@ -118,7 +131,7 @@ func NewHealthMonitor( logger log.Logger, ) *HealthMonitor { return &HealthMonitor{ - mutableState: NewHealthMonitorMutableState(), + mutableState: newHealthMonitorMutableState(), logger: logger.With(cosmoslog.ModuleKey, "health-monitor"), startupGracePeriod: startupGracePeriod, pollingFrequency: pollingFrequency, @@ -171,7 +184,7 @@ func PanicServiceNotResponding(hc types.HealthCheckable) func(error) { } // LogErrorServiceNotResponding returns a function that logs an error indicating that the specified service -// is not responding. This is ideal for creating a callback function when registering a health checkable service. +// is not responding. This is ideal for creating a callback function when registering a health-checkable service. func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) func(error) { return func(err error) { logger.Error( @@ -184,7 +197,7 @@ func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) f } } -// RegisterService registers a new health checkable service with the health check monitor. If the service +// RegisterService registers a new health-checkable service with the health check monitor. If the service // is unhealthy every time it is polled for a duration greater than or equal to the maximum acceptable unhealthy // duration, the monitor will panic. // This method is synchronized. It returns an error if the service was already registered or the monitor has From f99694783ce7169099075345b7fa35987c2bbd60 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Fri, 1 Dec 2023 17:51:26 -0800 Subject: [PATCH 08/15] PR comments. 
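The RegisterHealthChecker rewrite above shrinks the critical section by separating the decision to stop a service from the act of stopping it: the stopService flag is set while the lock is held, but the Stop call itself runs in a deferred function that executes only after the mutex has been released. Below is a minimal, self-contained sketch of that ordering; the names (registry, stoppable, noopService) are illustrative stand-ins, not the types used in this patch.

package main

import (
	"fmt"
	"sync"
)

type stoppable interface {
	Stop()
}

type registry struct {
	mu      sync.Mutex
	stopped bool
	entries map[string]struct{}
}

func (r *registry) register(name string, svc stoppable) error {
	stopService := false
	// Deferred first, so it runs last: after the deferred Unlock below, i.e.
	// outside the critical section.
	defer func() {
		if stopService {
			svc.Stop()
		}
	}()

	r.mu.Lock()
	defer r.mu.Unlock()

	if r.stopped {
		// Record the decision here; perform the cleanup after unlocking.
		stopService = true
		return nil
	}
	if _, ok := r.entries[name]; ok {
		return fmt.Errorf("service %v already registered", name)
	}
	r.entries[name] = struct{}{}
	return nil
}

type noopService struct{}

func (noopService) Stop() { fmt.Println("stopped outside the critical section") }

func main() {
	r := &registry{entries: map[string]struct{}{}, stopped: true}
	_ = r.register("example", noopService{})
}

Because deferred functions run in last-in, first-out order, the Unlock registered after the cleanup defer always runs first, so the service is never stopped while the lock is held.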
--- protocol/app/app.go | 12 ++-- protocol/app/app_test.go | 6 +- protocol/daemons/server/bridge.go | 12 ++-- protocol/daemons/server/liquidation.go | 11 ++- protocol/daemons/server/pricefeed.go | 12 ++-- .../daemons/server/types/health_checker.go | 71 ++++++++++++++++--- .../daemons/server/types/health_monitor.go | 5 +- 7 files changed, 86 insertions(+), 43 deletions(-) diff --git a/protocol/app/app.go b/protocol/app/app.go index ee947db019b..a13d66b9dc5 100644 --- a/protocol/app/app.go +++ b/protocol/app/app.go @@ -613,7 +613,7 @@ func New( if daemonFlags.Liquidation.Enabled { app.LiquidationsClient = liquidationclient.NewClient(logger) go func() { - app.MonitorDaemon(app.LiquidationsClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.LiquidationsClient, MaximumDaemonUnhealthyDuration) if err := app.LiquidationsClient.Start( // The client will use `context.Background` so that it can have a different context from // the main application. @@ -645,7 +645,7 @@ func New( constants.StaticExchangeDetails, &pricefeedclient.SubTaskRunnerImpl{}, ) - app.MonitorDaemon(app.PriceFeedClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.PriceFeedClient, MaximumDaemonUnhealthyDuration) } // Start Bridge Daemon. @@ -655,7 +655,7 @@ func New( // environments. app.BridgeClient = bridgeclient.NewClient(logger) go func() { - app.MonitorDaemon(app.BridgeClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.BridgeClient, MaximumDaemonUnhealthyDuration) if err := app.BridgeClient.Start( // The client will use `context.Background` so that it can have a different context from // the main application. @@ -1230,9 +1230,9 @@ func New( return app } -// MonitorDaemon registers a daemon service with the update monitor. If the daemon does not register, the method will -// panic. -func (app *App) MonitorDaemon( +// RegisterDaemonWithHealthMonitor registers a daemon service with the update monitor, which will commence monitoring +// the health of the daemon. If the daemon does not register, the method will panic. +func (app *App) RegisterDaemonWithHealthMonitor( healthCheckableDaemon daemontypes.HealthCheckable, maximumAcceptableUpdateDelay time.Duration, ) { diff --git a/protocol/app/app_test.go b/protocol/app/app_test.go index 6503d9c4190..ded9ee8b8d8 100644 --- a/protocol/app/app_test.go +++ b/protocol/app/app_test.go @@ -225,17 +225,17 @@ func TestModuleBasics(t *testing.T) { require.Equal(t, expectedFieldTypes, actualFieldTypes, "Module basics does not match expected") } -func TestMonitorDaemon_Panics(t *testing.T) { +func TestRegisterDaemonWithHealthMonitor_Panics(t *testing.T) { app := testapp.DefaultTestApp(nil) hc := &mocks.HealthCheckable{} hc.On("ServiceName").Return("test-service") hc.On("HealthCheck").Return(nil) - app.MonitorDaemon(hc, 5*time.Minute) + app.RegisterDaemonWithHealthMonitor(hc, 5*time.Minute) // The second registration should fail, causing a panic. require.PanicsWithError( t, "service test-service already registered", - func() { app.MonitorDaemon(hc, 5*time.Minute) }, + func() { app.RegisterDaemonWithHealthMonitor(hc, 5*time.Minute) }, ) } diff --git a/protocol/daemons/server/bridge.go b/protocol/daemons/server/bridge.go index ad76af6066e..f794e0bb86a 100644 --- a/protocol/daemons/server/bridge.go +++ b/protocol/daemons/server/bridge.go @@ -30,16 +30,12 @@ func (s *Server) AddBridgeEvents( response *api.AddBridgeEventsResponse, err error, ) { - // Capture valid responses in metrics. 
- defer func() { - if err == nil { - s.reportValidResponse(types.PricefeedDaemonServiceName) - } - }() - - s.reportValidResponse(types.BridgeDaemonServiceName) if err = s.bridgeEventManager.AddBridgeEvents(req.BridgeEvents); err != nil { return nil, err } + + // Capture valid responses in metrics. + s.reportValidResponse(types.BridgeDaemonServiceName) + return &api.AddBridgeEventsResponse{}, nil } diff --git a/protocol/daemons/server/liquidation.go b/protocol/daemons/server/liquidation.go index f418b6c251f..4cb44acfa1c 100644 --- a/protocol/daemons/server/liquidation.go +++ b/protocol/daemons/server/liquidation.go @@ -33,13 +33,6 @@ func (s *Server) LiquidateSubaccounts( response *api.LiquidateSubaccountsResponse, err error, ) { - // Capture valid responses in metrics. - defer func() { - if err == nil { - s.reportValidResponse(types.PricefeedDaemonServiceName) - } - }() - telemetry.ModuleSetGauge( metrics.LiquidationDaemon, float32(len(req.SubaccountIds)), @@ -49,5 +42,9 @@ func (s *Server) LiquidateSubaccounts( ) s.liquidatableSubaccountIds.UpdateSubaccountIds(req.SubaccountIds) + + // Capture valid responses in metrics. + s.reportValidResponse(types.LiquidationsDaemonServiceName) + return &api.LiquidateSubaccountsResponse{}, nil } diff --git a/protocol/daemons/server/pricefeed.go b/protocol/daemons/server/pricefeed.go index b4ef56df4cf..d95af3dfb1a 100644 --- a/protocol/daemons/server/pricefeed.go +++ b/protocol/daemons/server/pricefeed.go @@ -39,13 +39,6 @@ func (s *Server) UpdateMarketPrices( response *api.UpdateMarketPricesResponse, err error, ) { - // Capture valid responses in metrics. - defer func() { - if err == nil { - s.reportValidResponse(types.PricefeedDaemonServiceName) - } - }() - // Measure latency in ingesting and handling gRPC price update. defer telemetry.ModuleMeasureSince( metrics.PricefeedServer, @@ -54,6 +47,8 @@ func (s *Server) UpdateMarketPrices( metrics.Latency, ) + // This panic is an unexpected condition because we initialize the market price cache in app initialization before + // starting the server or daemons. if s.marketToExchange == nil { panic( errorsmod.Wrapf( @@ -71,6 +66,9 @@ func (s *Server) UpdateMarketPrices( s.marketToExchange.UpdatePrices(req.MarketPriceUpdates) + // Capture valid responses in metrics. + s.reportValidResponse(types.PricefeedDaemonServiceName) + return &api.UpdateMarketPricesResponse{}, nil } diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 2f42c95cdda..835b0a5ad05 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -17,27 +17,36 @@ type timestampWithError struct { err error } +// Update updates the timeStampWithError to reflect the current error. If the timestamp is zero, this is the first +// update, so set the timestamp. func (u *timestampWithError) Update(timestamp time.Time, err error) { // If the timestamp is zero, this is the first update, so set the timestamp. if u.timestamp.IsZero() { u.timestamp = timestamp } + u.err = err } +// Reset resets the timestampWithError to its zero value, indicating that the service is healthy. func (u *timestampWithError) Reset() { u.timestamp = time.Time{} u.err = nil } +// IsZero returns true if the timestampWithError is zero, indicating that the service is healthy. 
func (u *timestampWithError) IsZero() bool { return u.timestamp.IsZero() && u.err == nil } +// Timestamp returns the timestamp associated with the timestampWithError, which is the timestamp of the first error +// in the current error streak. func (u *timestampWithError) Timestamp() time.Time { return u.timestamp } +// Error returns the error associated with the timestampWithError, which is the most recent error in the current error +// streak. func (u *timestampWithError) Error() error { return u.err } @@ -48,9 +57,9 @@ type healthCheckerMutableState struct { // lock is used to synchronize access to mutable state fields. lock sync.Mutex - // mostRecentSuccess is the timestamp of the most recent successful health check. - // Access to mostRecentSuccess is synchronized. - mostRecentSuccess time.Time + // lastSuccessTimestamp is the timestamp of the most recent successful health check. + // Access to lastSuccessTimestamp is synchronized. + lastSuccessTimestamp time.Time // mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well // as the most recent error. It is updated on every error and reset every time the service sees a healthy response. @@ -58,6 +67,21 @@ type healthCheckerMutableState struct { // the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback. // Access to mostRecentFailureStreakError is synchronized. mostRecentFailureStreakError timestampWithError + + // timer triggers a health check poll for a health-checkable service. + timer *time.Timer + + // stopped indicates whether the health checker has been stopped. Additional health checks cannot be scheduled + // after the health checker has been stopped. + stopped bool +} + +// newHealthCheckerMutableState creates a new health checker mutable state scheduled to trigger a poll after the +// initial poll delay. +func newHealthCheckerMutableState(initialPollDelay time.Duration, pollFunc func()) *healthCheckerMutableState { + return &healthCheckerMutableState{ + timer: time.AfterFunc(initialPollDelay, pollFunc), + } } // ReportSuccess updates the health checker's mutable state to reflect a successful health check and schedules the next @@ -66,7 +90,7 @@ func (u *healthCheckerMutableState) ReportSuccess(now time.Time) { u.lock.Lock() defer u.lock.Unlock() - u.mostRecentSuccess = now + u.lastSuccessTimestamp = now // Whenever the service is healthy, reset the first failure in streak timestamp. u.mostRecentFailureStreakError.Reset() @@ -83,6 +107,35 @@ func (u *healthCheckerMutableState) ReportFailure(now time.Time, err error) time return now.Sub(u.mostRecentFailureStreakError.Timestamp()) } +// SchedulePoll schedules the next poll for the health-checkable service. If the service is stopped, the next poll +// will not be scheduled. This method is synchronized. +func (u *healthCheckerMutableState) SchedulePoll(nextPollDelay time.Duration) { + u.lock.Lock() + defer u.lock.Unlock() + + // Don't schedule a poll if the health checker has been stopped. + if u.stopped { + return + } + + // Schedule the next poll. + u.timer.Reset(nextPollDelay) +} + +// Stop stops the health checker. This method is synchronized. +func (u *healthCheckerMutableState) Stop() { + u.lock.Lock() + defer u.lock.Unlock() + + // Don't stop the health checker if it has already been stopped. + if u.stopped { + return + } + + u.timer.Stop() + u.stopped = true +} + // healthChecker encapsulates the logic for monitoring the health of a health-checkable service. 
type healthChecker struct { // mutableState is the mutable state of the health checker. Access to these fields is synchronized. @@ -94,9 +147,6 @@ type healthChecker struct { // pollFrequency is the frequency at which the health-checkable service is polled. pollFrequency time.Duration - // timer triggers a health check poll for a health-checkable service. - timer *time.Timer - // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health-checkable service to // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the // specified callback function. @@ -134,12 +184,12 @@ func (hc *healthChecker) Poll() { } // Schedule next poll. - hc.timer.Reset(hc.pollFrequency) + hc.mutableState.SchedulePoll(hc.pollFrequency) } // Stop stops the health checker. This method is not synchronized, as the timer does not need synchronization. func (hc *healthChecker) Stop() { - hc.timer.Stop() + hc.mutableState.Stop() } // StartNewHealthChecker creates and starts a new health checker for a health-checkable service. @@ -159,11 +209,10 @@ func StartNewHealthChecker( timeProvider: timeProvider, maxAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, logger: logger, - mutableState: &healthCheckerMutableState{}, } // The first poll is scheduled after the startup grace period to allow the service to initialize. - checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll) + checker.mutableState = newHealthCheckerMutableState(startupGracePeriod, checker.Poll) return checker } diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index 1cff5f5b0af..130398d18d7 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -13,6 +13,9 @@ import ( const ( // HealthCheckPollFrequency is the frequency at which the health-checkable service is polled. HealthCheckPollFrequency = 5 * time.Second + + // HealthMonitorLogModuleName is the module name used for logging within the health monitor. + HealthMonitorLogModuleName = "daemon-health-monitor" ) // healthMonitorMutableState tracks all mutable state associated with the health monitor. This state is gathered into @@ -132,7 +135,7 @@ func NewHealthMonitor( ) *HealthMonitor { return &HealthMonitor{ mutableState: newHealthMonitorMutableState(), - logger: logger.With(cosmoslog.ModuleKey, "health-monitor"), + logger: logger.With(cosmoslog.ModuleKey, HealthMonitorLogModuleName), startupGracePeriod: startupGracePeriod, pollingFrequency: pollingFrequency, } From b5e62a3d2c10ac4b117a39ceaf8fe0a97e45c7b3 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 10:45:16 -0800 Subject: [PATCH 09/15] Fix unit test failure. --- protocol/daemons/server/types/health_monitor_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index 82a0893e6ef..af991fd21ce 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -34,7 +34,7 @@ var ( // createTestMonitor creates a health monitor with a poll frequency of 10ms and a zero duration grace period. 
func createTestMonitor() (*types.HealthMonitor, *mocks.Logger) { logger := &mocks.Logger{} - logger.On("With", "module", "health-monitor").Return(logger).Once() + logger.On("With", "module", "daemon-health-monitor").Return(logger).Once() return types.NewHealthMonitor( ZeroDuration, 10*time.Millisecond, From 80c481b1c961d16b87f0e591e856a8c3be89de8f Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 11:57:34 -0800 Subject: [PATCH 10/15] Update protocol/daemons/server/types/health_checker.go Co-authored-by: ttl33 <19664986+ttl33@users.noreply.github.com> --- protocol/daemons/server/types/health_checker.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 835b0a5ad05..9f2fb2113f5 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -170,11 +170,9 @@ func (hc *healthChecker) Poll() { err := hc.healthCheckable.HealthCheck() now := hc.timeProvider.Now() - // Capture healthy response. - - if err == nil { + if err == nil { // Capture healthy response. hc.mutableState.ReportSuccess(now) - } else { + } else { // Capture unhealthy response. streakDuration := hc.mutableState.ReportFailure(now, err) // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the // callback function. From 21585a07aa551f2a5ea8cc702a796873fa7889f1 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 13:24:03 -0800 Subject: [PATCH 11/15] PR feedback. --- .../daemons/server/types/health_checker.go | 87 ++++++++++--------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 9f2fb2113f5..6c4d2727ab4 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -8,47 +8,44 @@ import ( "time" ) -// timestampWithError couples a timestamp and error to make it easier to update them in tandem. The -// timestampWithError will track the timestamp of the first error in a streak of errors, but keeps a record of the -// most recent error. This is useful for determining how long a service has been unhealthy, as well as the current -// state of the service. -type timestampWithError struct { - timestamp time.Time - err error -} - -// Update updates the timeStampWithError to reflect the current error. If the timestamp is zero, this is the first -// update, so set the timestamp. -func (u *timestampWithError) Update(timestamp time.Time, err error) { - // If the timestamp is zero, this is the first update, so set the timestamp. - if u.timestamp.IsZero() { - u.timestamp = timestamp +// errorStreak tracks two relevant statistics for an error streak returned by a HealthCheckable - the timestamp of the +// beginning of the error streak, and the most recent error. This is useful for determining how long a service has been +// unhealthy, as well as the current state of the service. +type errorStreak struct { + startOfStreak time.Time + mostRecentError error +} + +// UpdateLastError updates the errorStreak to reflect the current error. If the startOfStreak timestamp is zero, this +// error the first error in a new error streak, so the startOfStreak timestamp is set to the current timestamp. 
+func (u *errorStreak) UpdateLastError(timestamp time.Time, err error) { + // If the startOfStreak is zero, this is the first update, so set the startOfStreak. + if u.startOfStreak.IsZero() { + u.startOfStreak = timestamp } - u.err = err + u.mostRecentError = err } -// Reset resets the timestampWithError to its zero value, indicating that the service is healthy. -func (u *timestampWithError) Reset() { - u.timestamp = time.Time{} - u.err = nil +// Reset resets the errorStreak to its zero value, indicating that the service has no active error streak. +func (u *errorStreak) Reset() { + u.startOfStreak = time.Time{} + u.mostRecentError = nil } -// IsZero returns true if the timestampWithError is zero, indicating that the service is healthy. -func (u *timestampWithError) IsZero() bool { - return u.timestamp.IsZero() && u.err == nil +// IsUnset returns true if the errorStreak is unset, indicating that the service has no active error streak. +func (u *errorStreak) IsUnset() bool { + return u.startOfStreak.IsZero() && u.mostRecentError == nil } -// Timestamp returns the timestamp associated with the timestampWithError, which is the timestamp of the first error -// in the current error streak. -func (u *timestampWithError) Timestamp() time.Time { - return u.timestamp +// StartOfStreak returns the timestamp of th start of the most recent error streak. +func (u *errorStreak) StartOfStreak() time.Time { + return u.startOfStreak } -// Error returns the error associated with the timestampWithError, which is the most recent error in the current error -// streak. -func (u *timestampWithError) Error() error { - return u.err +// MostRecentError returns the most recent error associated with the current error streak. +func (u *errorStreak) MostRecentError() error { + return u.mostRecentError } // healthCheckerMutableState tracks the current health state of the HealthCheckable, encapsulating all mutable state @@ -57,16 +54,17 @@ type healthCheckerMutableState struct { // lock is used to synchronize access to mutable state fields. lock sync.Mutex - // lastSuccessTimestamp is the timestamp of the most recent successful health check. + // lastSuccessTimestamp is the startOfStreak of the most recent successful health check. // Access to lastSuccessTimestamp is synchronized. lastSuccessTimestamp time.Time - // mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well - // as the most recent error. It is updated on every error and reset every time the service sees a healthy response. - // This field is used to determine how long the daemon has been unhealthy. If this timestamp is nil, then either - // the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback. - // Access to mostRecentFailureStreakError is synchronized. - mostRecentFailureStreakError timestampWithError + // mostRecentErrorStreak tracks the beginning of the most recent streak, as well as the current error in the streak. + // It is updated on every error and reset every time the service sees a healthy response. + // This field is used to determine how long the daemon has been unhealthy. If the mostRecentErrorStreak is unset, + // then either the service has never been unhealthy, or the most recent error streak ended before it could trigger + // a callback. + // Access to mostRecentErrorStreak is synchronized. + mostRecentErrorStreak errorStreak // timer triggers a health check poll for a health-checkable service. 
timer *time.Timer @@ -92,8 +90,8 @@ func (u *healthCheckerMutableState) ReportSuccess(now time.Time) { u.lastSuccessTimestamp = now - // Whenever the service is healthy, reset the first failure in streak timestamp. - u.mostRecentFailureStreakError.Reset() + // Whenever the service is healthy, reset the first failure in streak startOfStreak. + u.mostRecentErrorStreak.Reset() } // ReportFailure updates the health checker's mutable state to reflect a failed health check and schedules the next @@ -102,9 +100,9 @@ func (u *healthCheckerMutableState) ReportFailure(now time.Time, err error) time u.lock.Lock() defer u.lock.Unlock() - u.mostRecentFailureStreakError.Update(now, err) + u.mostRecentErrorStreak.UpdateLastError(now, err) - return now.Sub(u.mostRecentFailureStreakError.Timestamp()) + return now.Sub(u.mostRecentErrorStreak.StartOfStreak()) } // SchedulePoll schedules the next poll for the health-checkable service. If the service is stopped, the next poll @@ -181,7 +179,10 @@ func (hc *healthChecker) Poll() { } } - // Schedule next poll. + // Schedule next poll. We schedule another poll whether the callback was invoked or not, as callbacks are not + // guaranteed to panic or otherwise halt the daemon. In such cases, we may end up invoking the callback several + // times once the service exceeds the maximum unhealthy duration. For example, a callback that emits error logs + // will continue to emit error logs every 5s until the service becomes healthy again. hc.mutableState.SchedulePoll(hc.pollFrequency) } From 433638885143639a4cd70e04d2d344fb6a370925 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 15:28:26 -0800 Subject: [PATCH 12/15] [CORE-668,CORE-669] - Add health monitor flags (#802) --- protocol/app/app.go | 22 ++++------ protocol/app/flags/flags.go | 1 - protocol/daemons/flags/flags.go | 36 ++++++++++++++-- protocol/daemons/flags/flags_test.go | 12 +++++- .../client/client_integration_test.go | 7 +++- .../daemons/server/types/health_checker.go | 21 +++++----- .../daemons/server/types/health_monitor.go | 37 ++++++++++++----- .../server/types/health_monitor_test.go | 41 ++++++++++++++++++- protocol/docker-compose.yml | 22 ++++++---- .../clob/client/cli/cancel_order_cli_test.go | 8 ++++ .../clob/client/cli/liquidations_cli_test.go | 8 ++++ .../x/clob/client/cli/place_order_cli_test.go | 8 ++++ 12 files changed, 173 insertions(+), 50 deletions(-) diff --git a/protocol/app/app.go b/protocol/app/app.go index a13d66b9dc5..6a6b04fb61b 100644 --- a/protocol/app/app.go +++ b/protocol/app/app.go @@ -183,10 +183,6 @@ import ( var ( // DefaultNodeHome default home directories for the application daemon DefaultNodeHome string - - // MaximumDaemonUnhealthyDuration is the maximum amount of time that a daemon can be unhealthy before the - // application panics. - MaximumDaemonUnhealthyDuration = 5 * time.Minute ) var ( @@ -599,6 +595,7 @@ func New( daemonservertypes.DaemonStartupGracePeriod, daemonservertypes.HealthCheckPollFrequency, app.Logger(), + daemonFlags.Shared.PanicOnDaemonFailureEnabled, ) // Create a closure for starting daemons and daemon server. Daemon services are delayed until after the gRPC // service is started because daemons depend on the gRPC service being available. If a node is initialized @@ -606,6 +603,7 @@ func New( // daemons will not be able to connect to the cosmos gRPC query service and finish initialization, and the daemon // monitoring service will panic. 
app.startDaemons = func() { + maxDaemonUnhealthyDuration := time.Duration(daemonFlags.Shared.MaxDaemonUnhealthySeconds) * time.Second // Start server for handling gRPC messages from daemons. go app.Server.Start() @@ -613,7 +611,7 @@ func New( if daemonFlags.Liquidation.Enabled { app.LiquidationsClient = liquidationclient.NewClient(logger) go func() { - app.RegisterDaemonWithHealthMonitor(app.LiquidationsClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.LiquidationsClient, maxDaemonUnhealthyDuration) if err := app.LiquidationsClient.Start( // The client will use `context.Background` so that it can have a different context from // the main application. @@ -645,17 +643,15 @@ func New( constants.StaticExchangeDetails, &pricefeedclient.SubTaskRunnerImpl{}, ) - app.RegisterDaemonWithHealthMonitor(app.PriceFeedClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.PriceFeedClient, maxDaemonUnhealthyDuration) } // Start Bridge Daemon. // Non-validating full-nodes have no need to run the bridge daemon. if !appFlags.NonValidatingFullNode && daemonFlags.Bridge.Enabled { - // TODO(CORE-582): Re-enable bridge daemon registration once the bridge daemon is fixed in local / CI - // environments. app.BridgeClient = bridgeclient.NewClient(logger) go func() { - app.RegisterDaemonWithHealthMonitor(app.BridgeClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.BridgeClient, maxDaemonUnhealthyDuration) if err := app.BridgeClient.Start( // The client will use `context.Background` so that it can have a different context from // the main application. @@ -1234,17 +1230,17 @@ func New( // the health of the daemon. If the daemon does not register, the method will panic. func (app *App) RegisterDaemonWithHealthMonitor( healthCheckableDaemon daemontypes.HealthCheckable, - maximumAcceptableUpdateDelay time.Duration, + maxDaemonUnhealthyDuration time.Duration, ) { - if err := app.DaemonHealthMonitor.RegisterService(healthCheckableDaemon, maximumAcceptableUpdateDelay); err != nil { + if err := app.DaemonHealthMonitor.RegisterService(healthCheckableDaemon, maxDaemonUnhealthyDuration); err != nil { app.Logger().Error( "Failed to register daemon service with update monitor", "error", err, "service", healthCheckableDaemon.ServiceName(), - "maximumAcceptableUpdateDelay", - maximumAcceptableUpdateDelay, + "maxDaemonUnhealthyDuration", + maxDaemonUnhealthyDuration, ) panic(err) } diff --git a/protocol/app/flags/flags.go b/protocol/app/flags/flags.go index 21333d95f15..06039978017 100644 --- a/protocol/app/flags/flags.go +++ b/protocol/app/flags/flags.go @@ -2,7 +2,6 @@ package flags import ( "fmt" - "github.com/cosmos/cosmos-sdk/server/config" servertypes "github.com/cosmos/cosmos-sdk/server/types" "github.com/spf13/cast" diff --git a/protocol/daemons/flags/flags.go b/protocol/daemons/flags/flags.go index eebdf7e6319..e386f379aba 100644 --- a/protocol/daemons/flags/flags.go +++ b/protocol/daemons/flags/flags.go @@ -9,7 +9,9 @@ import ( // List of CLI flags for Server and Client. 
const ( // Flag names - FlagUnixSocketAddress = "unix-socket-address" + FlagUnixSocketAddress = "unix-socket-address" + FlagPanicOnDaemonFailureEnabled = "panic-on-daemon-failure-enabled" + FlagMaxDaemonUnhealthySeconds = "max-daemon-unhealthy-seconds" FlagPriceDaemonEnabled = "price-daemon-enabled" FlagPriceDaemonLoopDelayMs = "price-daemon-loop-delay-ms" @@ -28,6 +30,10 @@ const ( type SharedFlags struct { // SocketAddress is the location of the unix socket to communicate with the daemon gRPC service. SocketAddress string + // PanicOnDaemonFailureEnabled toggles whether the daemon should panic on failure. + PanicOnDaemonFailureEnabled bool + // MaxDaemonUnhealthySeconds is the maximum allowable duration for which a daemon can be unhealthy. + MaxDaemonUnhealthySeconds uint32 } // BridgeFlags contains configuration flags for the Bridge Daemon. @@ -74,7 +80,9 @@ func GetDefaultDaemonFlags() DaemonFlags { if defaultDaemonFlags == nil { defaultDaemonFlags = &DaemonFlags{ Shared: SharedFlags{ - SocketAddress: "/tmp/daemons.sock", + SocketAddress: "/tmp/daemons.sock", + PanicOnDaemonFailureEnabled: true, + MaxDaemonUnhealthySeconds: 5 * 60, // 5 minutes. }, Bridge: BridgeFlags{ Enabled: true, @@ -109,8 +117,18 @@ func AddDaemonFlagsToCmd( cmd.Flags().String( FlagUnixSocketAddress, df.Shared.SocketAddress, - "Socket address for the price daemon to send updates to, if not set "+ - "will establish default location to ingest price updates from", + "Socket address for the daemons to send updates to, if not set "+ + "will establish default location to ingest daemon updates from", + ) + cmd.Flags().Bool( + FlagPanicOnDaemonFailureEnabled, + df.Shared.PanicOnDaemonFailureEnabled, + "Enables panicking when a daemon fails.", + ) + cmd.Flags().Uint32( + FlagMaxDaemonUnhealthySeconds, + df.Shared.MaxDaemonUnhealthySeconds, + "Maximum allowable duration for which a daemon can be unhealthy.", ) // Bridge Daemon. @@ -178,6 +196,16 @@ func GetDaemonFlagValuesFromOptions( result.Shared.SocketAddress = v } } + if option := appOpts.Get(FlagPanicOnDaemonFailureEnabled); option != nil { + if v, err := cast.ToBoolE(option); err == nil { + result.Shared.PanicOnDaemonFailureEnabled = v + } + } + if option := appOpts.Get(FlagMaxDaemonUnhealthySeconds); option != nil { + if v, err := cast.ToUint32E(option); err == nil { + result.Shared.MaxDaemonUnhealthySeconds = v + } + } // Bridge Daemon. if option := appOpts.Get(FlagBridgeDaemonEnabled); option != nil { diff --git a/protocol/daemons/flags/flags_test.go b/protocol/daemons/flags/flags_test.go index 5c79395a39a..04191032f60 100644 --- a/protocol/daemons/flags/flags_test.go +++ b/protocol/daemons/flags/flags_test.go @@ -17,6 +17,8 @@ func TestAddDaemonFlagsToCmd(t *testing.T) { flags.AddDaemonFlagsToCmd(&cmd) tests := []string{ flags.FlagUnixSocketAddress, + flags.FlagPanicOnDaemonFailureEnabled, + flags.FlagMaxDaemonUnhealthySeconds, flags.FlagBridgeDaemonEnabled, flags.FlagBridgeDaemonLoopDelayMs, @@ -41,6 +43,8 @@ func TestGetDaemonFlagValuesFromOptions_Custom(t *testing.T) { optsMap := make(map[string]interface{}) optsMap[flags.FlagUnixSocketAddress] = "test-socket-address" + optsMap[flags.FlagPanicOnDaemonFailureEnabled] = false + optsMap[flags.FlagMaxDaemonUnhealthySeconds] = uint32(1234) optsMap[flags.FlagBridgeDaemonEnabled] = true optsMap[flags.FlagBridgeDaemonLoopDelayMs] = uint32(1111) @@ -64,6 +68,12 @@ func TestGetDaemonFlagValuesFromOptions_Custom(t *testing.T) { // Shared. 
require.Equal(t, optsMap[flags.FlagUnixSocketAddress], r.Shared.SocketAddress) + require.Equal(t, optsMap[flags.FlagPanicOnDaemonFailureEnabled], r.Shared.PanicOnDaemonFailureEnabled) + require.Equal( + t, + optsMap[flags.FlagMaxDaemonUnhealthySeconds], + r.Shared.MaxDaemonUnhealthySeconds, + ) // Bridge Daemon. require.Equal(t, optsMap[flags.FlagBridgeDaemonEnabled], r.Bridge.Enabled) @@ -81,7 +91,7 @@ func TestGetDaemonFlagValuesFromOptions_Custom(t *testing.T) { require.Equal(t, optsMap[flags.FlagPriceDaemonLoopDelayMs], r.Price.LoopDelayMs) } -func TestGetDaemonFlagValuesFromOptions_Defaul(t *testing.T) { +func TestGetDaemonFlagValuesFromOptions_Default(t *testing.T) { mockOpts := mocks.AppOptions{} mockOpts.On("Get", mock.Anything). Return(func(key string) interface{} { diff --git a/protocol/daemons/pricefeed/client/client_integration_test.go b/protocol/daemons/pricefeed/client/client_integration_test.go index 70e2dfd75ca..45d32a28a45 100644 --- a/protocol/daemons/pricefeed/client/client_integration_test.go +++ b/protocol/daemons/pricefeed/client/client_integration_test.go @@ -5,7 +5,6 @@ package client_test import ( "fmt" "github.com/cometbft/cometbft/libs/log" - "github.com/dydxprotocol/v4-chain/protocol/app" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" "github.com/dydxprotocol/v4-chain/protocol/daemons/flags" "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/client" @@ -285,6 +284,7 @@ func (s *PriceDaemonIntegrationTestSuite) SetupTest() { servertypes.DaemonStartupGracePeriod, servertypes.HealthCheckPollFrequency, log.TestingLogger(), + flags.GetDefaultDaemonFlags().Shared.PanicOnDaemonFailureEnabled, // Use default behavior for testing ) s.exchangePriceCache = pricefeedserver_types.NewMarketToExchangePrices(pricefeed_types.MaxPriceAge) @@ -337,7 +337,10 @@ func (s *PriceDaemonIntegrationTestSuite) startClient() { testExchangeToQueryDetails, &client.SubTaskRunnerImpl{}, ) - err := s.healthMonitor.RegisterService(s.pricefeedDaemon, app.MaximumDaemonUnhealthyDuration) + err := s.healthMonitor.RegisterService( + s.pricefeedDaemon, + time.Duration(s.daemonFlags.Shared.MaxDaemonUnhealthySeconds)*time.Second, + ) s.Require().NoError(err) } diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 6c4d2727ab4..228a6dfd8d6 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -145,10 +145,10 @@ type healthChecker struct { // pollFrequency is the frequency at which the health-checkable service is polled. pollFrequency time.Duration - // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health-checkable service to + // maxUnhealthyDuration is the maximum acceptable duration for a health-checkable service to // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the // specified callback function. - maxAcceptableUnhealthyDuration time.Duration + maxUnhealthyDuration time.Duration // unhealthyCallback is the callback function to be executed if the health-checkable service remains // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. @@ -174,7 +174,7 @@ func (hc *healthChecker) Poll() { streakDuration := hc.mutableState.ReportFailure(now, err) // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the // callback function. 
- if streakDuration >= hc.maxAcceptableUnhealthyDuration { + if streakDuration >= hc.maxUnhealthyDuration { hc.unhealthyCallback(err) } } @@ -197,17 +197,18 @@ func StartNewHealthChecker( pollFrequency time.Duration, unhealthyCallback func(error), timeProvider libtime.TimeProvider, - maximumAcceptableUnhealthyDuration time.Duration, + maxUnhealthyDuration time.Duration, startupGracePeriod time.Duration, logger log.Logger, ) *healthChecker { checker := &healthChecker{ - healthCheckable: healthCheckable, - pollFrequency: pollFrequency, - unhealthyCallback: unhealthyCallback, - timeProvider: timeProvider, - maxAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, - logger: logger, + healthCheckable: healthCheckable, + pollFrequency: pollFrequency, + unhealthyCallback: unhealthyCallback, + timeProvider: timeProvider, + maxUnhealthyDuration: maxUnhealthyDuration, + logger: logger, + mutableState: &healthCheckerMutableState{}, } // The first poll is scheduled after the startup grace period to allow the service to initialize. diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index 130398d18d7..1ac31095892 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -122,9 +122,14 @@ type HealthMonitor struct { mutableState *healthMonitorMutableState // These fields are initialized in NewHealthMonitor and are not modified after initialization. - logger log.Logger + logger log.Logger + // startupGracePeriod is the grace period before the monitor starts polling the health-checkable services. startupGracePeriod time.Duration - pollingFrequency time.Duration + // pollingFrequency is the frequency at which the health-checkable services are polled. + pollingFrequency time.Duration + // enablePanics is used to toggle between panics or error logs when a daemon sustains an unhealthy state past the + // maximum allowable duration. + enablePanics bool } // NewHealthMonitor creates a new health monitor. @@ -132,12 +137,14 @@ func NewHealthMonitor( startupGracePeriod time.Duration, pollingFrequency time.Duration, logger log.Logger, + enablePanics bool, ) *HealthMonitor { return &HealthMonitor{ mutableState: newHealthMonitorMutableState(), logger: logger.With(cosmoslog.ModuleKey, HealthMonitorLogModuleName), startupGracePeriod: startupGracePeriod, pollingFrequency: pollingFrequency, + enablePanics: enablePanics, } } @@ -153,15 +160,15 @@ func (hm *HealthMonitor) DisableForTesting() { // health-checkable service before returning. 
func (hm *HealthMonitor) RegisterServiceWithCallback( hc types.HealthCheckable, - maximumAcceptableUnhealthyDuration time.Duration, + maxUnhealthyDuration time.Duration, callback func(error), ) error { - if maximumAcceptableUnhealthyDuration <= 0 { + if maxUnhealthyDuration <= 0 { return fmt.Errorf( "health check registration failure for service %v: "+ - "maximum acceptable unhealthy duration %v must be positive", + "maximum unhealthy duration %v must be positive", hc.ServiceName(), - maximumAcceptableUnhealthyDuration, + maxUnhealthyDuration, ) } @@ -171,7 +178,7 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( hm.pollingFrequency, callback, &libtime.TimeProviderImpl{}, - maximumAcceptableUnhealthyDuration, + maxUnhealthyDuration, hm.startupGracePeriod, hm.logger, ) @@ -202,18 +209,26 @@ func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) f // RegisterService registers a new health-checkable service with the health check monitor. If the service // is unhealthy every time it is polled for a duration greater than or equal to the maximum acceptable unhealthy -// duration, the monitor will panic. +// duration, the monitor will panic or log an error, depending on the app configuration via the +// `panic-on-daemon-failure-enabled` flag. // This method is synchronized. It returns an error if the service was already registered or the monitor has // already been stopped. If the monitor has been stopped, this method will proactively stop the health-checkable // service before returning. func (hm *HealthMonitor) RegisterService( hc types.HealthCheckable, - maximumAcceptableUnhealthyDuration time.Duration, + maxDaemonUnhealthyDuration time.Duration, ) error { + // If the monitor is configured to panic, use the panic callback. Otherwise, use the error log callback. + // This behavior is configured via flag and defaults to panicking on daemon failure. + callback := LogErrorServiceNotResponding(hc, hm.logger) + if hm.enablePanics { + callback = PanicServiceNotResponding(hc) + } + return hm.RegisterServiceWithCallback( hc, - maximumAcceptableUnhealthyDuration, - PanicServiceNotResponding(hc), + maxDaemonUnhealthyDuration, + callback, ) } diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index af991fd21ce..4a8e701901a 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -39,6 +39,7 @@ func createTestMonitor() (*types.HealthMonitor, *mocks.Logger) { ZeroDuration, 10*time.Millisecond, logger, + true, // enable panics here for stricter testing - a panic will definitely cause a test failure. ), logger } @@ -126,6 +127,44 @@ func TestRegisterServiceWithCallback_Mixed(t *testing.T) { } } +func TestHealthMonitor_DisablePanics_DoesNotPanic(t *testing.T) { + logger := &mocks.Logger{} + logger.On("With", "module", "daemon-health-monitor").Return(logger).Once() + logger.On( + "Error", + "health-checked service is unhealthy", + "service", + "test-service", + "error", + mock.Anything, + ).Return() + + hm := types.NewHealthMonitor( + ZeroDuration, + 10*time.Millisecond, + logger, + false, + ) + + hc := mockFailingHealthCheckerWithError("test-service", TestError1) + + err := hm.RegisterService(hc, 10*time.Millisecond) + require.NoError(t, err) + + defer func() { + hm.Stop() + }() + + // A 100ms sleep should be sufficient for the health monitor to detect the unhealthy service and trigger a callback. 
+ time.Sleep(100 * time.Millisecond) + + // Assert. + // This test is confirmed to panic when panics are not disabled - but because the panic occurs in a separate + // go-routine, it cannot be easily captured with an assert. Instead, we do not try to capture the panic, but + // assert that the logger was called with the expected arguments. + mock.AssertExpectationsForObjects(t, logger) +} + func TestRegisterServiceWithCallback_DoubleRegistrationFails(t *testing.T) { // Setup. ufm, logger := createTestMonitor() @@ -203,7 +242,7 @@ func TestRegisterValidResponseWithCallback_NegativeUnhealthyDuration(t *testing. ufm, _ := createTestMonitor() hc := mockFailingHealthCheckerWithError("test-service", TestError1) err := ufm.RegisterServiceWithCallback(hc, -50*time.Millisecond, func(error) {}) - require.ErrorContains(t, err, "maximum acceptable unhealthy duration -50ms must be positive") + require.ErrorContains(t, err, "maximum unhealthy duration -50ms must be positive") } func TestPanicServiceNotResponding(t *testing.T) { diff --git a/protocol/docker-compose.yml b/protocol/docker-compose.yml index 203f802f668..c3038011582 100644 --- a/protocol/docker-compose.yml +++ b/protocol/docker-compose.yml @@ -10,13 +10,15 @@ services: - --log_level # Note that only this validator has a log-level of `info`; other validators use `error` by default. # Change to `debug` for more verbose log-level. - - info + - info - --home - /dydxprotocol/chain/.alice - - --p2p.persistent_peers + - --p2p.persistent_peers - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" # Effectively disable the daemon monitor because bridge daemon is flaky in localnet. 
environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} @@ -28,7 +30,7 @@ services: - "26657:26657" - "9090:9090" - "1317:1317" - + dydxprotocold1: image: local:dydxprotocol entrypoint: @@ -39,10 +41,12 @@ services: - error - --home - /dydxprotocol/chain/.bob - - --p2p.persistent_peers + - --p2p.persistent_peers - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} @@ -52,7 +56,7 @@ services: - ./localnet/dydxprotocol1:/dydxprotocol/chain/.bob/data ports: - "26658:26657" - + dydxprotocold2: image: local:dydxprotocol entrypoint: @@ -67,6 +71,8 @@ services: - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} @@ -74,7 +80,7 @@ services: - DAEMON_HOME=/dydxprotocol/chain/.carl volumes: - ./localnet/dydxprotocol2:/dydxprotocol/chain/.carl/data - + dydxprotocold3: image: local:dydxprotocol entrypoint: @@ -85,10 +91,12 @@ services: - error - --home - /dydxprotocol/chain/.dave - - --p2p.persistent_peers + - --p2p.persistent_peers - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} diff --git a/protocol/x/clob/client/cli/cancel_order_cli_test.go b/protocol/x/clob/client/cli/cancel_order_cli_test.go index 9f148f36ba1..de7afeafe4c 100644 --- a/protocol/x/clob/client/cli/cancel_order_cli_test.go +++ b/protocol/x/clob/client/cli/cancel_order_cli_test.go @@ -8,6 +8,7 @@ import ( appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" daemonflags "github.com/dydxprotocol/v4-chain/protocol/daemons/flags" "github.com/dydxprotocol/v4-chain/protocol/testutil/appoptions" + "math" "math/big" "testing" @@ -73,6 +74,13 @@ func (s *CancelOrderIntegrationTestSuite) SetupTest() { appOptions.Set(daemonflags.FlagPriceDaemonEnabled, false) appOptions.Set(daemonflags.FlagBridgeDaemonEnabled, false) + // Effectively disable the health monitor panic timeout for these tests. This is necessary + // because all clob cli tests are running in the same process and the total time to run is >> 5 minutes + // on CI, causing the panic to trigger for liquidations daemon go routines that haven't been properly + // cleaned up after a test run. + // TODO(CORE-29): Remove this once the liquidations daemon is refactored to be stoppable. 
+ appOptions.Set(daemonflags.FlagMaxDaemonUnhealthySeconds, math.MaxUint32) + // Make sure the daemon is using the correct GRPC address. appOptions.Set(appflags.GrpcAddress, testval.AppConfig.GRPC.Address) }, diff --git a/protocol/x/clob/client/cli/liquidations_cli_test.go b/protocol/x/clob/client/cli/liquidations_cli_test.go index 01d02bf1a18..9c8223ae956 100644 --- a/protocol/x/clob/client/cli/liquidations_cli_test.go +++ b/protocol/x/clob/client/cli/liquidations_cli_test.go @@ -4,6 +4,7 @@ package cli_test import ( "fmt" + "math" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" "math/big" @@ -74,6 +75,13 @@ func TestLiquidationOrderIntegrationTestSuite(t *testing.T) { appOptions.Set(daemonflags.FlagPriceDaemonEnabled, false) appOptions.Set(daemonflags.FlagBridgeDaemonEnabled, false) + // Effectively disable the health monitor panic timeout for these tests. This is necessary + // because all clob cli tests are running in the same process and the total time to run is >> 5 minutes + // on CI, causing the panic to trigger for liquidations daemon go routines that haven't been properly + // cleaned up after a test run. + // TODO(CORE-29): Remove this once the liquidations daemon is refactored to be stoppable. + appOptions.Set(daemonflags.FlagMaxDaemonUnhealthySeconds, math.MaxUint32) + // Make sure the daemon is using the correct GRPC address. appOptions.Set(appflags.GrpcAddress, testval.AppConfig.GRPC.Address) diff --git a/protocol/x/clob/client/cli/place_order_cli_test.go b/protocol/x/clob/client/cli/place_order_cli_test.go index bf6832efe82..5bbd5511fcd 100644 --- a/protocol/x/clob/client/cli/place_order_cli_test.go +++ b/protocol/x/clob/client/cli/place_order_cli_test.go @@ -5,6 +5,7 @@ package cli_test import ( "fmt" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" + "math" "math/big" "testing" @@ -68,6 +69,13 @@ func TestPlaceOrderIntegrationTestSuite(t *testing.T) { appOptions.Set(daemonflags.FlagPriceDaemonEnabled, false) appOptions.Set(daemonflags.FlagBridgeDaemonEnabled, false) + // Effectively disable the health monitor panic timeout for these tests. This is necessary + // because all clob cli tests are running in the same process and the total time to run is >> 5 minutes + // on CI, causing the panic to trigger for liquidations daemon go routines that haven't been properly + // cleaned up after a test run. + // TODO(CORE-29): Remove this once the liquidations daemon is refactored to be stoppable. + appOptions.Set(daemonflags.FlagMaxDaemonUnhealthySeconds, math.MaxUint32) + // Make sure the daemon is using the correct GRPC address. appOptions.Set(appflags.GrpcAddress, testval.AppConfig.GRPC.Address) }, From abdeb97c3e8440302a9db49a9fa84426a08a7171 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 16:04:01 -0800 Subject: [PATCH 13/15] Fix merge mistake. --- protocol/daemons/server/types/health_checker.go | 1 - 1 file changed, 1 deletion(-) diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 228a6dfd8d6..c29bd4384d7 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -208,7 +208,6 @@ func StartNewHealthChecker( timeProvider: timeProvider, maxUnhealthyDuration: maxUnhealthyDuration, logger: logger, - mutableState: &healthCheckerMutableState{}, } // The first poll is scheduled after the startup grace period to allow the service to initialize. 
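The "4294967295" literal wired through docker-compose.yml and the math.MaxUint32 value set via daemonflags.FlagMaxDaemonUnhealthySeconds in the CLOB CLI tests above are the same number: the flag counts seconds, so the maximum uint32 value pushes the unhealthy-duration threshold far beyond any realistic run. A quick magnitude check, as a sketch that assumes the flag value is converted to a time.Duration in seconds (the conversion itself is not shown in this patch):

    package main

    import (
        "fmt"
        "math"
        "time"
    )

    func main() {
        // math.MaxUint32 == 4294967295, the literal passed via --max-daemon-unhealthy-seconds.
        maxUnhealthy := time.Duration(math.MaxUint32) * time.Second
        // Prints roughly 136 years, so the health monitor's panic threshold is never reached.
        fmt.Printf("%.0f years\n", maxUnhealthy.Hours()/(24*365))
    }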
From 9192baeccea1951cc02b315d1566848fd90bfea0 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 17:00:31 -0800 Subject: [PATCH 14/15] Fix nil pointer segfault. --- protocol/daemons/server/types/health_checker.go | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index c29bd4384d7..16e6f0b900b 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -74,14 +74,6 @@ type healthCheckerMutableState struct { stopped bool } -// newHealthCheckerMutableState creates a new health checker mutable state scheduled to trigger a poll after the -// initial poll delay. -func newHealthCheckerMutableState(initialPollDelay time.Duration, pollFunc func()) *healthCheckerMutableState { - return &healthCheckerMutableState{ - timer: time.AfterFunc(initialPollDelay, pollFunc), - } -} - // ReportSuccess updates the health checker's mutable state to reflect a successful health check and schedules the next // poll as an atomic operation. func (u *healthCheckerMutableState) ReportSuccess(now time.Time) { @@ -208,10 +200,11 @@ func StartNewHealthChecker( timeProvider: timeProvider, maxUnhealthyDuration: maxUnhealthyDuration, logger: logger, + mutableState: &healthCheckerMutableState{}, } // The first poll is scheduled after the startup grace period to allow the service to initialize. - checker.mutableState = newHealthCheckerMutableState(startupGracePeriod, checker.Poll) + checker.mutableState.timer = time.AfterFunc(startupGracePeriod, checker.Poll) return checker } From 72d09dccbaf37d8bb20cf1df325751b2604c7341 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 17:19:51 -0800 Subject: [PATCH 15/15] Take 2. --- protocol/daemons/server/types/health_monitor_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index 4a8e701901a..8ba07f748ba 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -28,7 +28,7 @@ var ( // TestLargeDuration is used to ensure that the health checker does not trigger a callback through the timer. TestLargeDuration = 5 * time.Minute - ZeroDuration = 0 * time.Second + SmallDuration = 10 * time.Millisecond ) // createTestMonitor creates a health monitor with a poll frequency of 10ms and a zero duration grace period. @@ -36,7 +36,7 @@ func createTestMonitor() (*types.HealthMonitor, *mocks.Logger) { logger := &mocks.Logger{} logger.On("With", "module", "daemon-health-monitor").Return(logger).Once() return types.NewHealthMonitor( - ZeroDuration, + SmallDuration, 10*time.Millisecond, logger, true, // enable panics here for stricter testing - a panic will definitely cause a test failure. @@ -140,7 +140,7 @@ func TestHealthMonitor_DisablePanics_DoesNotPanic(t *testing.T) { ).Return() hm := types.NewHealthMonitor( - ZeroDuration, + SmallDuration, 10*time.Millisecond, logger, false,
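The two commits above respond to a nil-pointer race at startup: once the struct-literal initialization of mutableState was removed, StartNewHealthChecker armed the first-poll timer inside newHealthCheckerMutableState before the assignment back to checker.mutableState completed, so a near-zero startup grace period let Poll run against a nil mutableState. The fix initializes the state in the struct literal and only then arms the timer with time.AfterFunc, and the follow-up test change swaps the zero grace period for a small 10ms one. A minimal self-contained sketch of the two orderings, using placeholder names rather than the protocol's actual types:

    package main

    import "time"

    type state struct{ timer *time.Timer }

    type checker struct{ mutableState *state }

    // Poll dereferences mutableState; it panics if the constructor has not assigned it yet.
    func (c *checker) Poll() {
        c.mutableState.timer.Reset(10 * time.Millisecond)
    }

    // Racy ordering (before the fix): a helper arms the timer, so the callback can fire
    // before the caller's assignment completes:
    //
    //     c := &checker{}
    //     c.mutableState = newStateWithTimer(gracePeriod, c.Poll) // timer armed inside the helper
    //
    // Safe ordering (after the fix): the state exists before the timer is armed.
    func newChecker(gracePeriod time.Duration) *checker {
        c := &checker{mutableState: &state{}}
        c.mutableState.timer = time.AfterFunc(gracePeriod, c.Poll)
        return c
    }

    func main() {
        _ = newChecker(10 * time.Millisecond)
        time.Sleep(50 * time.Millisecond) // let the first poll run once
    }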