From d04e77827c206a7037bd7a4340d3e5094bf88685 Mon Sep 17 00:00:00 2001
From: Crystal Lemire
Date: Wed, 22 Nov 2023 16:15:22 -0800
Subject: [PATCH 01/15] Update liquidations daemon to report success after
 every paginated gRPC query.

Update error for a more specific test.
PR comments.
Revert liquidations health reporting to occur only at task loop boundary.
Checkpoint.
checkpoint - implementation, moved methods to app. need to migrate / add testing.
Update test app.
Checkpoint for tests.
Finish test cases.
App unit test + cleanup.
Cleanup, rename.
cleanup.
Remove todo for refactoring metrics daemon.
Move monitoring call to before client start.
Update placement of monitor code for pricefeed client.
Remove defunct constants.
Revert ServiceName changes.
---
 protocol/app/app.go                           |  52 ++-
 protocol/app/app_test.go                      |  13 +
 .../client/client_integration_test.go         |  12 +-
 protocol/daemons/server/bridge.go             |  11 -
 protocol/daemons/server/liquidation.go        |  17 -
 protocol/daemons/server/metrics.go            |  15 -
 protocol/daemons/server/pricefeed.go          |  15 -
 protocol/daemons/server/server.go             |  53 ---
 protocol/daemons/server/server_test.go        |  44 ---
 .../daemons/server/types/health_monitor.go    | 277 +++++++++++++++
 .../server/types/health_monitor_test.go       | 318 ++++++++++++++++++
 .../daemons/server/types/update_monitor.go    | 169 ----------
 .../server/types/update_monitor_test.go       | 189 -----------
 protocol/daemons/server/types/util.go         |   9 -
 protocol/daemons/server/types/util_test.go    |  14 -
 protocol/daemons/types/health_checkable.go    |  11 +
 protocol/mocks/HealthCheckable.go             |  63 ++++
 protocol/mocks/Makefile                       |   1 +
 protocol/testutil/app/app.go                  |   2 +-
 19 files changed, 736 insertions(+), 549 deletions(-)
 delete mode 100644 protocol/daemons/server/metrics.go
 create mode 100644 protocol/daemons/server/types/health_monitor.go
 create mode 100644 protocol/daemons/server/types/health_monitor_test.go
 delete mode 100644 protocol/daemons/server/types/update_monitor.go
 delete mode 100644 protocol/daemons/server/types/update_monitor_test.go
 delete mode 100644 protocol/daemons/server/types/util.go
 delete mode 100644 protocol/daemons/server/types/util_test.go
 create mode 100644 protocol/mocks/HealthCheckable.go

diff --git a/protocol/app/app.go b/protocol/app/app.go
index f01876607bd..f1e256ed87e 100644
--- a/protocol/app/app.go
+++ b/protocol/app/app.go
@@ -5,13 +5,13 @@ import (
 	"encoding/json"
 	"errors"
 	"io"
-	"math"
 	"math/big"
 	"net/http"
 	"os"
 	"path/filepath"
 	"runtime/debug"
 	"sync"
+	"time"
 
 	autocliv1 "cosmossdk.io/api/cosmos/autocli/v1"
 	reflectionv1 "cosmossdk.io/api/cosmos/reflection/v1"
@@ -183,6 +183,10 @@ import (
 var (
 	// DefaultNodeHome default home directories for the application daemon
 	DefaultNodeHome string
+
+	// MaximumDaemonUnhealthyDuration is the maximum amount of time that a daemon can be unhealthy before the
+	// application panics.
+	MaximumDaemonUnhealthyDuration = 5 * time.Minute
 )
 
 var (
@@ -290,6 +294,8 @@ type App struct {
 	PriceFeedClient    *pricefeedclient.Client
 	LiquidationsClient *liquidationclient.Client
 	BridgeClient       *bridgeclient.Client
+
+	HealthMonitor *daemonservertypes.HealthMonitor
 }
 
 // assertAppPreconditions assert invariants required for an application to start.
@@ -589,6 +595,11 @@ func New(
 	bridgeEventManager := bridgedaemontypes.NewBridgeEventManager(timeProvider)
 	app.Server.WithBridgeEventManager(bridgeEventManager)
 
+	app.HealthMonitor = daemonservertypes.NewHealthMonitor(
+		daemonservertypes.DaemonStartupGracePeriod,
+		daemonservertypes.HealthCheckPollFrequency,
+		app.Logger(),
+	)
 	// Create a closure for starting daemons and daemon server. Daemon services are delayed until after the gRPC
 	// service is started because daemons depend on the gRPC service being available. If a node is initialized
 	// with a genesis time in the future, then the gRPC service will not be available until the genesis time, the
@@ -600,11 +611,9 @@ func New(
 
 		// Start liquidations client for sending potentially liquidatable subaccounts to the application.
 		if daemonFlags.Liquidation.Enabled {
-			app.Server.ExpectLiquidationsDaemon(
-				daemonservertypes.MaximumAcceptableUpdateDelay(daemonFlags.Liquidation.LoopDelayMs),
-			)
 			app.LiquidationsClient = liquidationclient.NewClient(logger)
 			go func() {
+				app.MonitorDaemon(app.LiquidationsClient, MaximumDaemonUnhealthyDuration)
 				if err := app.LiquidationsClient.Start(
 					// The client will use `context.Background` so that it can have a different context from
 					// the main application.
@@ -621,7 +630,6 @@ func New(
 		// Non-validating full-nodes have no need to run the price daemon.
 		if !appFlags.NonValidatingFullNode && daemonFlags.Price.Enabled {
 			exchangeQueryConfig := constants.StaticExchangeQueryConfig
-			app.Server.ExpectPricefeedDaemon(daemonservertypes.MaximumAcceptableUpdateDelay(daemonFlags.Price.LoopDelayMs))
 			// Start pricefeed client for sending prices for the pricefeed server to consume. These prices
 			// are retrieved via third-party APIs like Binance and then are encoded in-memory and
 			// periodically sent via gRPC to a shared socket with the server.
@@ -637,6 +645,7 @@ func New(
 				constants.StaticExchangeDetails,
 				&pricefeedclient.SubTaskRunnerImpl{},
 			)
+			app.MonitorDaemon(app.PriceFeedClient, MaximumDaemonUnhealthyDuration)
 		}
 
 		// Start Bridge Daemon.
@@ -644,9 +653,9 @@ func New(
 		if !appFlags.NonValidatingFullNode && daemonFlags.Bridge.Enabled {
 			// TODO(CORE-582): Re-enable bridge daemon registration once the bridge daemon is fixed in local / CI
 			// environments.
-			// app.Server.ExpectBridgeDaemon(daemonservertypes.MaximumAcceptableUpdateDelay(daemonFlags.Bridge.LoopDelayMs))
+			app.BridgeClient = bridgeclient.NewClient(logger)
 			go func() {
-				app.BridgeClient = bridgeclient.NewClient(logger)
+				app.MonitorDaemon(app.BridgeClient, MaximumDaemonUnhealthyDuration)
 				if err := app.BridgeClient.Start(
 					// The client will use `context.Background` so that it can have a different context from
 					// the main application.
@@ -675,10 +684,6 @@ func New(
 				)
 			}
 		}()
-		// Don't panic if metrics daemon loops are delayed. Use maximum value.
-		app.Server.ExpectMetricsDaemon(
-			daemonservertypes.MaximumAcceptableUpdateDelay(math.MaxUint32),
-		)
 		metricsclient.Start(
 			// The client will use `context.Background` so that it can have a different context from
 			// the main application.
@@ -1222,6 +1227,31 @@ func New(
 	return app
 }
 
+// MonitorDaemon registers a daemon service with the update monitor. If the daemon does not register, the method will
+// panic.
+func (app *App) MonitorDaemon(
+	healthCheckableDaemon daemontypes.HealthCheckable,
+	maximumAcceptableUpdateDelay time.Duration,
+) {
+	if err := app.HealthMonitor.RegisterService(healthCheckableDaemon, maximumAcceptableUpdateDelay); err != nil {
+		app.Logger().Error(
+			"Failed to register daemon service with update monitor",
+			"error",
+			err,
+			"service",
+			healthCheckableDaemon.ServiceName(),
+			"maximumAcceptableUpdateDelay",
+			maximumAcceptableUpdateDelay,
+		)
+		panic(err)
+	}
+}
+
+// DisableHealthMonitorForTesting disables the health monitor for testing.
+func (app *App) DisableHealthMonitorForTesting() {
+	app.HealthMonitor.DisableForTesting()
+}
+
 // hydrateMemStores hydrates the memStores used for caching state.
 func (app *App) hydrateMemStores() {
 	// Create an `uncachedCtx` where the underlying MultiStore is the `rootMultiStore`.
diff --git a/protocol/app/app_test.go b/protocol/app/app_test.go
index 42fc06dd7c1..b845cc71b95 100644
--- a/protocol/app/app_test.go
+++ b/protocol/app/app_test.go
@@ -1,10 +1,12 @@
 package app_test
 
 import (
+	"github.com/dydxprotocol/v4-chain/protocol/mocks"
 	"gopkg.in/typ.v4/slices"
 	"reflect"
 	"strings"
 	"testing"
+	"time"
 
 	delaymsgmodule "github.com/dydxprotocol/v4-chain/protocol/x/delaymsg"
 
@@ -222,3 +224,14 @@ func TestModuleBasics(t *testing.T) {
 	actualFieldTypes := getMapFieldsAndTypes(reflect.ValueOf(basic_manager.ModuleBasics))
 	require.Equal(t, expectedFieldTypes, actualFieldTypes, "Module basics does not match expected")
 }
+
+func TestMonitorDaemon_Panics(t *testing.T) {
+	app := testapp.DefaultTestApp(nil)
+	hc := &mocks.HealthCheckable{}
+	hc.On("ServiceName").Return("test-service")
+	hc.On("HealthCheck").Return(nil)
+
+	app.MonitorDaemon(hc, 5*time.Minute)
+	// The second registration should fail, causing a panic.
+ require.Panics(t, func() { app.MonitorDaemon(hc, 5*time.Minute) }) +} diff --git a/protocol/daemons/pricefeed/client/client_integration_test.go b/protocol/daemons/pricefeed/client/client_integration_test.go index 99f67b37aa4..70e2dfd75ca 100644 --- a/protocol/daemons/pricefeed/client/client_integration_test.go +++ b/protocol/daemons/pricefeed/client/client_integration_test.go @@ -5,6 +5,7 @@ package client_test import ( "fmt" "github.com/cometbft/cometbft/libs/log" + "github.com/dydxprotocol/v4-chain/protocol/app" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" "github.com/dydxprotocol/v4-chain/protocol/daemons/flags" "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/client" @@ -219,6 +220,7 @@ type PriceDaemonIntegrationTestSuite struct { exchangeServer *pricefeed.ExchangeServer daemonServer *daemonserver.Server exchangePriceCache *pricefeedserver_types.MarketToExchangePrices + healthMonitor *servertypes.HealthMonitor pricesMockQueryServer *mocks.QueryServer pricesGrpcServer *grpc.Server @@ -278,7 +280,13 @@ func (s *PriceDaemonIntegrationTestSuite) SetupTest() { &daemontypes.FileHandlerImpl{}, s.daemonFlags.Shared.SocketAddress, ) - s.daemonServer.ExpectPricefeedDaemon(servertypes.MaximumAcceptableUpdateDelay(s.daemonFlags.Price.LoopDelayMs)) + + s.healthMonitor = servertypes.NewHealthMonitor( + servertypes.DaemonStartupGracePeriod, + servertypes.HealthCheckPollFrequency, + log.TestingLogger(), + ) + s.exchangePriceCache = pricefeedserver_types.NewMarketToExchangePrices(pricefeed_types.MaxPriceAge) s.daemonServer.WithPriceFeedMarketToExchangePrices(s.exchangePriceCache) @@ -329,6 +337,8 @@ func (s *PriceDaemonIntegrationTestSuite) startClient() { testExchangeToQueryDetails, &client.SubTaskRunnerImpl{}, ) + err := s.healthMonitor.RegisterService(s.pricefeedDaemon, app.MaximumDaemonUnhealthyDuration) + s.Require().NoError(err) } // expectPricesWithTimeout waits for the exchange price cache to contain the expected prices, with a timeout. diff --git a/protocol/daemons/server/bridge.go b/protocol/daemons/server/bridge.go index 20fc68cd1ac..632d72ddb21 100644 --- a/protocol/daemons/server/bridge.go +++ b/protocol/daemons/server/bridge.go @@ -2,9 +2,6 @@ package server import ( "context" - "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" - "time" - "github.com/dydxprotocol/v4-chain/protocol/daemons/bridge/api" bdtypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types/bridge" ) @@ -23,14 +20,6 @@ func (server *Server) WithBridgeEventManager( return server } -// ExpectBridgeDaemon registers the bridge daemon with the server. This is required -// in order to ensure that the daemon service is called at least once during every -// maximumAcceptableUpdateDelay duration. It will cause the protocol to panic if the daemon does not -// respond within maximumAcceptableUpdateDelay duration. -func (server *Server) ExpectBridgeDaemon(maximumAcceptableUpdateDelay time.Duration) { - server.registerDaemon(types.BridgeDaemonServiceName, maximumAcceptableUpdateDelay) -} - // AddBridgeEvents stores any bridge events recognized by the daemon // in a go-routine safe slice. 
func (s *Server) AddBridgeEvents( diff --git a/protocol/daemons/server/liquidation.go b/protocol/daemons/server/liquidation.go index 9eb10450515..63c8f6b1134 100644 --- a/protocol/daemons/server/liquidation.go +++ b/protocol/daemons/server/liquidation.go @@ -2,9 +2,6 @@ package server import ( "context" - "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" - "time" - "github.com/cosmos/cosmos-sdk/telemetry" "github.com/dydxprotocol/v4-chain/protocol/daemons/liquidation/api" liquidationtypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types/liquidations" @@ -26,14 +23,6 @@ func (server *Server) WithLiquidatableSubaccountIds( return server } -// ExpectLiquidationsDaemon registers the liquidations daemon with the server. This is required -// in order to ensure that the daemon service is called at least once during every -// maximumAcceptableUpdateDelay duration. It will cause the protocol to panic if the daemon does not -// respond within maximumAcceptableUpdateDelay duration. -func (server *Server) ExpectLiquidationsDaemon(maximumAcceptableUpdateDelay time.Duration) { - server.registerDaemon(types.LiquidationsDaemonServiceName, maximumAcceptableUpdateDelay) -} - // LiquidateSubaccounts stores the list of potentially liquidatable subaccount ids // in a go-routine safe slice. func (s *Server) LiquidateSubaccounts( @@ -47,12 +36,6 @@ func (s *Server) LiquidateSubaccounts( metrics.Received, metrics.Count, ) - // If the daemon is unable to report a response, there is either an error in the registration of - // this daemon, or another one. In either case, the protocol should panic. - if err := s.reportResponse(types.LiquidationsDaemonServiceName); err != nil { - s.logger.Error("Failed to report liquidations response to update monitor", "error", err) - } - s.liquidatableSubaccountIds.UpdateSubaccountIds(req.SubaccountIds) return &api.LiquidateSubaccountsResponse{}, nil } diff --git a/protocol/daemons/server/metrics.go b/protocol/daemons/server/metrics.go deleted file mode 100644 index 2ab38263829..00000000000 --- a/protocol/daemons/server/metrics.go +++ /dev/null @@ -1,15 +0,0 @@ -package server - -import ( - "time" - - "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" -) - -// ExpectMetricsDaemon registers the periodic metrics daemon with the server. This is required -// in order to ensure that the daemon service is called at least once during every -// maximumAcceptableUpdateDelay duration. It will cause the protocol to panic if the daemon does not -// respond within maximumAcceptableUpdateDelay duration. -func (server *Server) ExpectMetricsDaemon(maximumAcceptableUpdateDelay time.Duration) { - server.registerDaemon(types.MetricsDaemonServiceName, maximumAcceptableUpdateDelay) -} diff --git a/protocol/daemons/server/pricefeed.go b/protocol/daemons/server/pricefeed.go index 30f6a2e6146..0337483fc38 100644 --- a/protocol/daemons/server/pricefeed.go +++ b/protocol/daemons/server/pricefeed.go @@ -3,7 +3,6 @@ package server import ( "context" errorsmod "cosmossdk.io/errors" - servertypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" "time" gometrics "github.com/armon/go-metrics" @@ -31,14 +30,6 @@ func (server *Server) WithPriceFeedMarketToExchangePrices( return server } -// ExpectPricefeedDaemon registers the pricefeed daemon with the server. This is required -// in order to ensure that the daemon service is called at least once during every -// maximumAcceptableUpdateDelay duration. 
It will cause the protocol to panic if the daemon does not -// respond within maximumAcceptableUpdateDelay duration. -func (server *Server) ExpectPricefeedDaemon(maximumAcceptableUpdateDelay time.Duration) { - server.registerDaemon(servertypes.PricefeedDaemonServiceName, maximumAcceptableUpdateDelay) -} - // UpdateMarketPrices updates prices from exchanges for each market provided. func (s *Server) UpdateMarketPrices( ctx context.Context, @@ -52,12 +43,6 @@ func (s *Server) UpdateMarketPrices( metrics.Latency, ) - // If the daemon is unable to report a response, there is either an error in the registration of - // this daemon, or another one. In either case, the protocol should panic. - if err := s.reportResponse(servertypes.PricefeedDaemonServiceName); err != nil { - panic(err) - } - if s.marketToExchange == nil { panic( errorsmod.Wrapf( diff --git a/protocol/daemons/server/server.go b/protocol/daemons/server/server.go index d146587731c..8c3cb42618d 100644 --- a/protocol/daemons/server/server.go +++ b/protocol/daemons/server/server.go @@ -1,19 +1,14 @@ package server import ( - gometrics "github.com/armon/go-metrics" "github.com/cometbft/cometbft/libs/log" - "github.com/cosmos/cosmos-sdk/telemetry" bridgeapi "github.com/dydxprotocol/v4-chain/protocol/daemons/bridge/api" "github.com/dydxprotocol/v4-chain/protocol/daemons/constants" liquidationapi "github.com/dydxprotocol/v4-chain/protocol/daemons/liquidation/api" pricefeedapi "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/api" - "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" daemontypes "github.com/dydxprotocol/v4-chain/protocol/daemons/types" - "github.com/dydxprotocol/v4-chain/protocol/lib/metrics" "net" "syscall" - "time" ) // Server struct defines the shared gRPC server for all daemons. @@ -26,8 +21,6 @@ type Server struct { fileHandler daemontypes.FileHandler socketAddress string - updateMonitor *types.UpdateMonitor - BridgeServer PriceFeedServer LiquidationServer @@ -47,60 +40,14 @@ func NewServer( gsrv: grpcServer, fileHandler: fileHandler, socketAddress: socketAddress, - updateMonitor: types.NewUpdateFrequencyMonitor(types.DaemonStartupGracePeriod, logger), } } // Stop stops the daemon server's gRPC service. func (server *Server) Stop() { - server.updateMonitor.Stop() server.gsrv.Stop() } -// DisableUpdateMonitoringForTesting disables the update monitor for testing purposes. This is needed in integration -// tests that do not run the full protocol. -func (server *Server) DisableUpdateMonitoringForTesting() { - server.updateMonitor.DisableForTesting() -} - -// registerDaemon registers a daemon service with the update monitor. -func (server *Server) registerDaemon( - daemonKey string, - maximumAcceptableUpdateDelay time.Duration, -) { - if err := server.updateMonitor.RegisterDaemonService(daemonKey, maximumAcceptableUpdateDelay); err != nil { - server.logger.Error( - "Failed to register daemon service with update monitor", - "error", - err, - "service", - daemonKey, - "maximumAcceptableUpdateDelay", - maximumAcceptableUpdateDelay, - ) - panic(err) - } -} - -// reportResponse reports a response from a daemon service with the update monitor. This is used to -// ensure that the daemon continues to operate. If the update monitor does not see a response from a -// registered daemon within the maximumAcceptableUpdateDelay, it will cause the protocol to panic. 
-func (server *Server) reportResponse( - daemonKey string, -) error { - telemetry.IncrCounterWithLabels( - []string{ - metrics.DaemonServer, - metrics.ValidResponse, - }, - 1, - []gometrics.Label{ - metrics.GetLabelForStringValue(metrics.Daemon, daemonKey), - }, - ) - return server.updateMonitor.RegisterValidResponse(daemonKey) -} - // Start clears the current socket and establishes a new socket connection // on the local filesystem. // See URL for more information: https://eli.thegreenplace.net/2019/unix-domain-sockets-in-go/ diff --git a/protocol/daemons/server/server_test.go b/protocol/daemons/server/server_test.go index 5188409d103..5d09b53d470 100644 --- a/protocol/daemons/server/server_test.go +++ b/protocol/daemons/server/server_test.go @@ -13,7 +13,6 @@ import ( "net" "os" "testing" - "time" ) const ( @@ -159,48 +158,6 @@ func TestStart_MixedInvalid(t *testing.T) { } } -func TestRegisterDaemon_DoesNotPanic(t *testing.T) { - grpcServer := &mocks.GrpcServer{} - grpcServer.On("Stop").Return().Once() - server := server.NewServer( - log.NewNopLogger(), - grpcServer, - &mocks.FileHandler{}, - grpc.SocketPath, - ) - defer server.Stop() - - require.NotPanics(t, func() { - server.ExpectPricefeedDaemon(5 * time.Second) - }) -} - -func TestRegisterDaemon_DoubleRegistrationPanics(t *testing.T) { - grpcServer := &mocks.GrpcServer{} - grpcServer.On("Stop").Return().Once() - server := server.NewServer( - log.NewNopLogger(), - grpcServer, - &mocks.FileHandler{}, - grpc.SocketPath, - ) - defer server.Stop() - - // First registration should not panic. - require.NotPanics(t, func() { - server.ExpectPricefeedDaemon(5 * time.Second) - }) - - // Second registration should panic. - require.PanicsWithError( - t, - "service pricefeed-daemon already registered", - func() { - server.ExpectPricefeedDaemon(5 * time.Second) - }, - ) -} - func createServerWithMocks( t testing.TB, mockGrpcServer *mocks.GrpcServer, @@ -214,7 +171,6 @@ func createServerWithMocks( ) mockGrpcServer.On("Stop").Return().Once() t.Cleanup(server.Stop) - server.DisableUpdateMonitoringForTesting() return server } diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go new file mode 100644 index 00000000000..e3dd3b95e53 --- /dev/null +++ b/protocol/daemons/server/types/health_monitor.go @@ -0,0 +1,277 @@ +package types + +import ( + cosmoslog "cosmossdk.io/log" + "fmt" + "github.com/cometbft/cometbft/libs/log" + "github.com/dydxprotocol/v4-chain/protocol/daemons/types" + libtime "github.com/dydxprotocol/v4-chain/protocol/lib/time" + "sync" + "time" +) + +const ( + HealthCheckPollFrequency = 5 * time.Second +) + +// timestampWithError couples a timestamp and error to make it easier to update them in tandem. +type timestampWithError struct { + timestamp time.Time + err error +} + +func (u *timestampWithError) Update(timestamp time.Time, err error) { + u.timestamp = timestamp + u.err = err +} + +func (u *timestampWithError) Reset() { + u.Update(time.Time{}, nil) +} + +func (u *timestampWithError) IsZero() bool { + return u.timestamp.IsZero() +} + +func (u *timestampWithError) Timestamp() time.Time { + return u.timestamp +} + +func (u *timestampWithError) Error() error { + return u.err +} + +// healthChecker encapsulates the logic for monitoring the health of a health checkable service. +type healthChecker struct { + // healthCheckable is the health checkable service to be monitored. 
+ healthCheckable types.HealthCheckable + + // timer triggers a health check poll for a health checkable service. + timer *time.Timer + + // pollFrequency is the frequency at which the health checkable service is polled. + pollFrequency time.Duration + + // mostRecentSuccess is the timestamp of the most recent successful health check. + mostRecentSuccess time.Time + + // firstFailureInStreak is the timestamp of the first error in the most recent streak of errors. It is set + // whenever the service toggles from healthy to an unhealthy state, and used to determine how long the daemon has + // been unhealthy. If this timestamp is nil, then the error streak ended before it could trigger a callback. + firstFailureInStreak timestampWithError + + // unhealthyCallback is the callback function to be executed if the health checkable service remains + // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. + // This callback function is executed with the error that caused the service to become unhealthy. + unhealthyCallback func(error) + + // timeProvider is used to get the current time. It is added as a field so that it can be mocked in tests. + timeProvider libtime.TimeProvider + + // maximumAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to + // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the + // specified callback function. + maximumAcceptableUnhealthyDuration time.Duration +} + +// Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the +// maximum acceptable unhealthy duration, the callback function is executed. +// This method is publicly exposed for testing. +func (hc *healthChecker) Poll() { + // Don't return an error if the monitor has been disabled. + err := hc.healthCheckable.HealthCheck() + now := hc.timeProvider.Now() + if err == nil { + hc.mostRecentSuccess = now + // Whenever the service is healthy, reset the first failure in streak timestamp. + hc.firstFailureInStreak.Reset() + } else if hc.firstFailureInStreak.IsZero() { + // Capture the timestamp of the first failure in a new streak. + hc.firstFailureInStreak.Update(now, err) + } + + // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the + // callback function. + if !hc.firstFailureInStreak.IsZero() && + now.Sub(hc.firstFailureInStreak.Timestamp()) >= hc.maximumAcceptableUnhealthyDuration { + hc.unhealthyCallback(hc.firstFailureInStreak.Error()) + } else { + // If we do not execute the callback, schedule the next poll. + hc.timer.Reset(hc.pollFrequency) + } +} + +func (hc *healthChecker) Stop() { + hc.timer.Stop() +} + +// StartNewHealthChecker creates and starts a new health checker for a health checkable service. +func StartNewHealthChecker( + healthCheckable types.HealthCheckable, + pollFrequency time.Duration, + unhealthyCallback func(error), + timeProvider libtime.TimeProvider, + maximumAcceptableUnhealthyDuration time.Duration, + startupGracePeriod time.Duration, +) *healthChecker { + checker := &healthChecker{ + healthCheckable: healthCheckable, + pollFrequency: pollFrequency, + unhealthyCallback: unhealthyCallback, + timeProvider: timeProvider, + maximumAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, + } + // The first poll is scheduled after the startup grace period to allow the service to initialize. 
+ checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll) + + return checker +} + +// HealthMonitor monitors the health of daemon services, which implement the HealthCheckable interface. If a +// registered health-checkable service sustains an unhealthy state for the maximum acceptable unhealthy duration, +// the monitor will execute a callback function. +type HealthMonitor struct { + // serviceToHealthChecker maps daemon service names to their update metadata. + serviceToHealthChecker map[string]*healthChecker + // stopped indicates whether the monitor has been stopped. Additional daemon services cannot be registered + // after the monitor has been stopped. + stopped bool + // disabled indicates whether the monitor has been disabled. This is used to disable the monitor in testApp + // tests, where app.New is not executed. + disabled bool + // lock is used to synchronize access to the monitor. + lock sync.Mutex + + // These fields are initialized in NewHealthMonitor and are not modified after initialization. + logger log.Logger + startupGracePeriod time.Duration + pollingFrequency time.Duration +} + +// NewHealthMonitor creates a new health monitor. +func NewHealthMonitor( + startupGracePeriod time.Duration, + pollingFrequency time.Duration, + logger log.Logger, +) *HealthMonitor { + return &HealthMonitor{ + serviceToHealthChecker: make(map[string]*healthChecker), + logger: logger.With(cosmoslog.ModuleKey, "health-monitor"), + startupGracePeriod: startupGracePeriod, + pollingFrequency: pollingFrequency, + } +} + +func (hm *HealthMonitor) DisableForTesting() { + hm.lock.Lock() + defer hm.lock.Unlock() + + hm.disabled = true +} + +// RegisterServiceWithCallback registers a HealthCheckable with the health monitor. If the service +// stays unhealthy every time it is polled during the maximum acceptable unhealthy duration, the monitor will +// execute the callback function. +// This method is synchronized. The method returns an error if the service was already registered or the +// monitor has already been stopped. +func (hm *HealthMonitor) RegisterServiceWithCallback( + hc types.HealthCheckable, + maximumAcceptableUnhealthyDuration time.Duration, + callback func(error), +) error { + hm.lock.Lock() + defer hm.lock.Unlock() + + if maximumAcceptableUnhealthyDuration <= 0 { + return fmt.Errorf( + "health check registration failure for service %v: "+ + "maximum acceptable unhealthy duration %v must be positive", + hc.ServiceName(), + maximumAcceptableUnhealthyDuration, + ) + } + + // Don't register daemon services if the monitor has been disabled. + if hm.disabled { + return nil + } + + // Don't register additional daemon services if the monitor has already been stopped. + // This could be a concern for short-running integration test cases, where the network + // stops before all daemon services have been registered. + if hm.stopped { + return fmt.Errorf( + "health check registration failure for service %v: monitor has been stopped", + hc.ServiceName(), + ) + } + + if _, ok := hm.serviceToHealthChecker[hc.ServiceName()]; ok { + return fmt.Errorf("service %v already registered", hc.ServiceName()) + } + + hm.serviceToHealthChecker[hc.ServiceName()] = StartNewHealthChecker( + hc, + hm.pollingFrequency, + callback, + &libtime.TimeProviderImpl{}, + maximumAcceptableUnhealthyDuration, + hm.startupGracePeriod, + ) + return nil +} + +// PanicServiceNotResponding returns a function that panics with a message indicating that the specified daemon +// service is not responding. 
This is ideal for creating a callback function when registering a daemon service. +func PanicServiceNotResponding(hc types.HealthCheckable) func(error) { + return func(err error) { + panic(fmt.Sprintf("%v unhealthy: %v", hc.ServiceName(), err)) + } +} + +// LogErrorServiceNotResponding returns a function that logs an error indicating that the specified service +// is not responding. This is ideal for creating a callback function when registering a health checkable service. +func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) func(error) { + return func(err error) { + logger.Error( + "health-checked service is unhealthy", + "service", + hc.ServiceName(), + "error", + err, + ) + } +} + +// RegisterService registers a new health checkable service with the health check monitor. If the service +// is unhealthy every time it is polled for a duration greater than or equal to the maximum acceptable unhealthy +// duration, the monitor will panic. +// This method is synchronized. It returns an error if the service was already registered or the monitor has +// already been stopped. +func (hm *HealthMonitor) RegisterService( + hc types.HealthCheckable, + maximumAcceptableUnhealthyDuration time.Duration, +) error { + return hm.RegisterServiceWithCallback( + hc, + maximumAcceptableUnhealthyDuration, + PanicServiceNotResponding(hc), + ) +} + +// Stop stops the update frequency monitor. This method is synchronized. +func (hm *HealthMonitor) Stop() { + hm.lock.Lock() + defer hm.lock.Unlock() + + // Don't stop the monitor if it has already been stopped. + if hm.stopped { + return + } + + for _, checker := range hm.serviceToHealthChecker { + checker.Stop() + } + + hm.stopped = true +} diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go new file mode 100644 index 00000000000..8fba05764ae --- /dev/null +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -0,0 +1,318 @@ +package types_test + +import ( + "fmt" + "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" + "github.com/dydxprotocol/v4-chain/protocol/mocks" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + "testing" + "time" +) + +var ( + TestError1 = fmt.Errorf("test error 1") + TestError2 = fmt.Errorf("test error 2") + + Time0 = time.Unix(0, 0) + Time1 = Time0.Add(time.Second) + Time2 = Time0.Add(2 * time.Second) + Time3 = Time0.Add(3 * time.Second) + Time4 = Time0.Add(4 * time.Second) + Time5 = Time0.Add(5 * time.Second) + + // Use a maximum unhealthy duration of 5 seconds for testing, simulating a poll frequency of 1s with timestamps. + TestMaximumUnhealthyDuration = 5 * time.Second + + // TestLargeDuration is used to ensure that the health checker does not trigger a callback through the timer. + TestLargeDuration = 5 * time.Minute + + ZeroDuration = 0 * time.Second +) + +// createTestMonitor creates a health monitor with a poll frequency of 10ms and a zero duration grace period. +func createTestMonitor() (*types.HealthMonitor, *mocks.Logger) { + logger := &mocks.Logger{} + logger.On("With", "module", "health-monitor").Return(logger).Once() + return types.NewHealthMonitor( + ZeroDuration, + 10*time.Millisecond, + logger, + ), logger +} + +// mockFailingHealthCheckerWithError creates a mock health checkable service that returns the given error. 
+func mockFailingHealthCheckerWithError(name string, err error) *mocks.HealthCheckable { + hc := &mocks.HealthCheckable{} + hc.On("ServiceName").Return(name) + hc.On("HealthCheck").Return(err) + return hc +} + +// callbackWithErrorPointer returns a callback function and an error pointer that tracks the error passed to the +// callback. This can be used to validate that a service was considered unhealthy for the maximum allowable duration. +func callbackWithErrorPointer() (func(error), *error) { + var callbackError error + callback := func(err error) { + callbackError = err + } + return callback, &callbackError +} + +func TestHealthChecker(t *testing.T) { + tests := map[string]struct { + healthCheckResponses []error + expectedUnhealthy error + }{ + "initialized: no callback triggered": { + healthCheckResponses: []error{}, + expectedUnhealthy: nil, + }, + "healthy, then unhealthy for < maximum unhealthy duration: no callback triggered": { + healthCheckResponses: []error{ + nil, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy for < maximum unhealthy duration: no callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy, healthy, unhealthy: no callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + nil, + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy for maximum unhealthy duration: callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: TestError1, + }, + "unhealthy with multiple errors: first error returned": { + healthCheckResponses: []error{ + TestError1, + TestError2, + TestError2, + TestError2, + TestError2, + TestError2, + }, + expectedUnhealthy: TestError1, + }, + } + for name, test := range tests { + t.Run(name, func(t *testing.T) { + // Setup. + // Set up callback to track error passed to it. + callback, callbackError := callbackWithErrorPointer() + + // Set up health checkable service. + checkable := &mocks.HealthCheckable{} + for _, response := range test.healthCheckResponses { + checkable.On("HealthCheck").Return(response).Once() + } + + // Set up time provider to return a sequence of timestamps one second apart starting at Time0. + timeProvider := &mocks.TimeProvider{} + for i := range test.healthCheckResponses { + timeProvider.On("Now").Return(Time0.Add(time.Duration(i) * time.Second)).Once() + } + + healthChecker := types.StartNewHealthChecker( + checkable, + TestLargeDuration, // set to a >> value so that poll is never auto-triggered by the timer + callback, + timeProvider, + TestMaximumUnhealthyDuration, + types.DaemonStartupGracePeriod, + ) + + // Cleanup. + defer func() { + healthChecker.Stop() + }() + + // Act - simulate the health checker polling for updates. + for i := 0; i < len(test.healthCheckResponses); i++ { + healthChecker.Poll() + } + + // Assert. + // Validate the expected polls occurred according to the mocks. + checkable.AssertExpectations(t) + timeProvider.AssertExpectations(t) + + // Validate the callback was called with the expected error. 
+ if test.expectedUnhealthy == nil { + require.NoError(t, *callbackError) + } else { + require.ErrorContains(t, *callbackError, test.expectedUnhealthy.Error()) + } + }) + } +} + +// The following tests may still intermittently fail or report false positives on an overloaded system as they rely +// on callbacks to execute before the termination of the `time.Sleep` call, which is not guaranteed. +func TestRegisterService_Healthy(t *testing.T) { + // Setup. + ufm, logger := createTestMonitor() + hc := mockFailingHealthCheckerWithError("test-service", nil) + + // Act. + err := ufm.RegisterService(hc, 50*time.Millisecond) + require.NoError(t, err) + + // Cleanup. + defer func() { + ufm.Stop() + }() + + // Give the monitor time to poll the health checkable service. Polls occur once every 10ms. + time.Sleep(100 * time.Millisecond) + + // Assert: no calls to the logger were made. + mock.AssertExpectationsForObjects(t, hc, logger) +} + +func TestRegisterServiceWithCallback_Mixed(t *testing.T) { + tests := map[string]struct { + healthCheckResponse error + }{ + "healthy: no callback triggered": { + healthCheckResponse: nil, + }, + "unhealthy: callback triggered": { + healthCheckResponse: TestError1, + }, + } + for name, test := range tests { + t.Run(name, func(t *testing.T) { + // Setup. + ufm, logger := createTestMonitor() + hc := mockFailingHealthCheckerWithError("test-service", test.healthCheckResponse) + callback, callbackError := callbackWithErrorPointer() + + // Act. + err := ufm.RegisterServiceWithCallback( + hc, + 50*time.Millisecond, + callback, + ) + require.NoError(t, err) + + // Cleanup. + defer func() { + ufm.Stop() + }() + + // Give the monitor time to poll the health checkable service. Polls occur once every 10ms. + time.Sleep(100 * time.Millisecond) + + // Assert: no calls to the logger were made. + mock.AssertExpectationsForObjects(t, hc, logger) + + // Assert: the callback was called or not called as expected. + require.Equal(t, test.healthCheckResponse, *callbackError) + }) + } +} + +func TestRegisterServiceWithCallback_DoubleRegistrationFails(t *testing.T) { + // Setup. + ufm, logger := createTestMonitor() + + hc := mockFailingHealthCheckerWithError("test-service", TestError1) + hc2 := mockFailingHealthCheckerWithError("test-service", TestError2) + + callback, callbackError := callbackWithErrorPointer() + + err := ufm.RegisterServiceWithCallback(hc, 50*time.Millisecond, callback) + require.NoError(t, err) + + // Register a service with the same name. This registration should fail. + err = ufm.RegisterServiceWithCallback(hc2, 50*time.Millisecond, callback) + require.ErrorContains(t, err, "service already registered") + + // Expect that the first service is still operating and will produce a callback after a sustained unhealthy period. + time.Sleep(100 * time.Millisecond) + ufm.Stop() + + // Assert no calls to the logger were made. + mock.AssertExpectationsForObjects(t, logger, hc) + hc2.AssertNotCalled(t, "HealthCheck") + + // Assert the callback was called with the expected error. + require.Equal(t, TestError1, *callbackError) +} + +func TestRegisterService_RegistrationFailsAfterStop(t *testing.T) { + ufm, logger := createTestMonitor() + ufm.Stop() + + hc := mockFailingHealthCheckerWithError("test-service", TestError1) + err := ufm.RegisterService(hc, 50*time.Millisecond) + require.ErrorContains(t, err, "monitor has been stopped") + + // Any scheduled functions with error logs that were not cleaned up should trigger before this sleep finishes. 
+ time.Sleep(100 * time.Millisecond) + mock.AssertExpectationsForObjects(t, logger) +} + +func TestRegisterValidResponseWithCallback_NegativeUpdateDuration(t *testing.T) { + ufm, _ := createTestMonitor() + hc := mockFailingHealthCheckerWithError("test-service", TestError1) + err := ufm.RegisterServiceWithCallback(hc, -50*time.Millisecond, func(error) {}) + require.ErrorContains(t, err, "maximum acceptable unhealthy duration -50ms must be positive") +} + +func TestPanicServiceNotResponding(t *testing.T) { + panicFunc := types.PanicServiceNotResponding(&mocks.HealthCheckable{}) + require.Panics(t, func() { + panicFunc(TestError1) + }) +} + +func TestLogErrorServiceNotResponding(t *testing.T) { + logger := &mocks.Logger{} + hc := &mocks.HealthCheckable{} + + hc.On("ServiceName").Return("test-service") + logger.On( + "Error", + "health-checked service is unhealthy", + "service", + "test-service", + "error", + TestError1, + ).Return() + logFunc := types.LogErrorServiceNotResponding(hc, logger) + logFunc(TestError1) + + // Assert: the logger was called with the expected arguments. + mock.AssertExpectationsForObjects(t, logger) +} diff --git a/protocol/daemons/server/types/update_monitor.go b/protocol/daemons/server/types/update_monitor.go deleted file mode 100644 index 7c437ff0d7c..00000000000 --- a/protocol/daemons/server/types/update_monitor.go +++ /dev/null @@ -1,169 +0,0 @@ -package types - -import ( - "fmt" - "github.com/cometbft/cometbft/libs/log" - "sync" - "time" -) - -type updateMetadata struct { - timer *time.Timer - updateFrequency time.Duration -} - -// UpdateMonitor monitors the update frequency of daemon services. If a daemon service does not respond within -// the maximum acceptable update delay set when the daemon is registered, the monitor will log an error and halt the -// protocol. This was judged to be the best solution for network performance because it prevents any validator from -// participating in the network at all if a daemon service is not responding. -type UpdateMonitor struct { - // serviceToUpdateMetadata maps daemon service names to their update metadata. - serviceToUpdateMetadata map[string]updateMetadata - // stopped indicates whether the monitor has been stopped. Additional daemon services cannot be registered - // after the monitor has been stopped. - stopped bool - // disabled indicates whether the monitor has been disabled. This is used to disable the monitor in testApp - // tests, where app.New is not executed. - disabled bool - // lock is used to synchronize access to the monitor. - lock sync.Mutex - - // These fields are initialized in NewUpdateFrequencyMonitor and are not modified after initialization. - logger log.Logger - daemonStartupGracePeriod time.Duration -} - -// NewUpdateFrequencyMonitor creates a new update frequency monitor. -func NewUpdateFrequencyMonitor(daemonStartupGracePeriod time.Duration, logger log.Logger) *UpdateMonitor { - return &UpdateMonitor{ - serviceToUpdateMetadata: make(map[string]updateMetadata), - logger: logger, - daemonStartupGracePeriod: daemonStartupGracePeriod, - } -} - -func (ufm *UpdateMonitor) DisableForTesting() { - ufm.lock.Lock() - defer ufm.lock.Unlock() - - ufm.disabled = true -} - -// RegisterDaemonServiceWithCallback registers a new daemon service with the update frequency monitor. If the daemon -// service fails to respond within the maximum acceptable update delay, the monitor will execute the callback function. -// This method is synchronized. 
The method returns an error if the daemon service was already registered or the -// monitor has already been stopped. -func (ufm *UpdateMonitor) RegisterDaemonServiceWithCallback( - service string, - maximumAcceptableUpdateDelay time.Duration, - callback func(), -) error { - ufm.lock.Lock() - defer ufm.lock.Unlock() - - if maximumAcceptableUpdateDelay <= 0 { - return fmt.Errorf( - "registration failure for service %v: maximum acceptable update delay %v must be positive", - service, - maximumAcceptableUpdateDelay, - ) - } - - // Don't register daemon services if the monitor has been disabled. - if ufm.disabled { - return nil - } - - // Don't register additional daemon services if the monitor has already been stopped. - // This could be a concern for short-running integration test cases, where the network - // stops before all daemon services have been registered. - if ufm.stopped { - return fmt.Errorf("registration failure for service %v: monitor has been stopped", service) - } - - if _, ok := ufm.serviceToUpdateMetadata[service]; ok { - return fmt.Errorf("service %v already registered", service) - } - - ufm.serviceToUpdateMetadata[service] = updateMetadata{ - timer: time.AfterFunc(ufm.daemonStartupGracePeriod+maximumAcceptableUpdateDelay, callback), - updateFrequency: maximumAcceptableUpdateDelay, - } - return nil -} - -// PanicServiceNotResponding returns a function that panics with a message indicating that the specified daemon -// service is not responding. This is ideal for creating a callback function when registering a daemon service. -func PanicServiceNotResponding(service string) func() { - return func() { - panic(fmt.Sprintf("%v daemon not responding", service)) - } -} - -// LogErrorServiceNotResponding returns a function that logs an error indicating that the specified daemon service -// is not responding. This is ideal for creating a callback function when registering a daemon service. -func LogErrorServiceNotResponding(service string, logger log.Logger) func() { - return func() { - logger.Error( - "daemon not responding", - "service", - service, - ) - } -} - -// RegisterDaemonService registers a new daemon service with the update frequency monitor. If the daemon service -// fails to respond within the maximum acceptable update delay, the monitor will log an error. -// This method is synchronized. The method an error if the daemon service was already registered or the monitor has -// already been stopped. -func (ufm *UpdateMonitor) RegisterDaemonService( - service string, - maximumAcceptableUpdateDelay time.Duration, -) error { - return ufm.RegisterDaemonServiceWithCallback( - service, - maximumAcceptableUpdateDelay, - LogErrorServiceNotResponding(service, ufm.logger), - ) -} - -// Stop stops the update frequency monitor. This method is synchronized. -func (ufm *UpdateMonitor) Stop() { - ufm.lock.Lock() - defer ufm.lock.Unlock() - - // Don't stop the monitor if it has already been stopped. - if ufm.stopped { - return - } - - for _, metadata := range ufm.serviceToUpdateMetadata { - metadata.timer.Stop() - } - ufm.stopped = true -} - -// RegisterValidResponse registers a valid response from the daemon service. This will reset the timer for the -// daemon service. This method is synchronized. -func (ufm *UpdateMonitor) RegisterValidResponse(service string) error { - ufm.lock.Lock() - defer ufm.lock.Unlock() - - // Don't return an error if the monitor has been disabled. 
- if ufm.disabled { - return nil - } - - // Don't bother to reset the timer if the monitor has already been stopped. - if ufm.stopped { - return nil - } - - metadata, ok := ufm.serviceToUpdateMetadata[service] - if !ok { - return fmt.Errorf("service %v not registered", service) - } - - metadata.timer.Reset(metadata.updateFrequency) - return nil -} diff --git a/protocol/daemons/server/types/update_monitor_test.go b/protocol/daemons/server/types/update_monitor_test.go deleted file mode 100644 index c9abf1094f3..00000000000 --- a/protocol/daemons/server/types/update_monitor_test.go +++ /dev/null @@ -1,189 +0,0 @@ -package types_test - -import ( - "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" - "github.com/dydxprotocol/v4-chain/protocol/mocks" - "github.com/stretchr/testify/mock" - "github.com/stretchr/testify/require" - "sync/atomic" - "testing" - "time" -) - -var ( - zeroDuration = 0 * time.Second -) - -func createTestMonitor() (*types.UpdateMonitor, *mocks.Logger) { - logger := &mocks.Logger{} - return types.NewUpdateFrequencyMonitor(zeroDuration, logger), logger -} - -// The following tests may still intermittently fail on an overloaded system as they rely -// on `time.Sleep`, which is not guaranteed to wake up after the specified amount of time. -func TestRegisterDaemonService_Success(t *testing.T) { - ufm, logger := createTestMonitor() - err := ufm.RegisterDaemonService("test-service", 200*time.Millisecond) - require.NoError(t, err) - - // As long as responses come in before the 200ms deadline, no errors should be logged. - time.Sleep(80 * time.Millisecond) - require.NoError(t, ufm.RegisterValidResponse("test-service")) - time.Sleep(80 * time.Millisecond) - require.NoError(t, ufm.RegisterValidResponse("test-service")) - time.Sleep(80 * time.Millisecond) - - ufm.Stop() - // Assert: no calls to the logger were made. - mock.AssertExpectationsForObjects(t, logger) -} - -func TestRegisterDaemonService_SuccessfullyLogsError(t *testing.T) { - ufm, logger := createTestMonitor() - logger.On("Error", "daemon not responding", "service", "test-service").Once().Return() - err := ufm.RegisterDaemonService("test-service", 1*time.Millisecond) - require.NoError(t, err) - time.Sleep(2 * time.Millisecond) - ufm.Stop() - - // Assert: the logger was called with the expected arguments. - mock.AssertExpectationsForObjects(t, logger) -} - -func TestRegisterDaemonServiceWithCallback_Success(t *testing.T) { - callbackCalled := atomic.Bool{} - - ufm, _ := createTestMonitor() - err := ufm.RegisterDaemonServiceWithCallback("test-service", 200*time.Millisecond, func() { - callbackCalled.Store(true) - }) - require.NoError(t, err) - - // As long as responses come in before the 200ms deadline, no panic should occur. - time.Sleep(80 * time.Millisecond) - require.NoError(t, ufm.RegisterValidResponse("test-service")) - time.Sleep(80 * time.Millisecond) - require.NoError(t, ufm.RegisterValidResponse("test-service")) - time.Sleep(80 * time.Millisecond) - - require.False(t, callbackCalled.Load()) - - ufm.Stop() -} - -func TestRegisterDaemonService_DoubleRegistrationFails(t *testing.T) { - ufm, logger := createTestMonitor() - - err := ufm.RegisterDaemonService("test-service", 200*time.Millisecond) - require.NoError(t, err) - - // Register the same daemon service again. This should fail, and 50ms update frequency should be ignored. 
- err = ufm.RegisterDaemonService("test-service", 50*time.Millisecond) - require.ErrorContains(t, err, "service already registered") - - // Confirm that the original 200ms update frequency is still in effect. 50ms would have triggered an error log. - // Note there is a possibility that 200ms will still cause an error log due to the semantics of Sleep, which is - // not guaranteed to sleep for exactly the specified duration. - time.Sleep(80 * time.Millisecond) - require.NoError(t, ufm.RegisterValidResponse("test-service")) - time.Sleep(80 * time.Millisecond) - ufm.Stop() - - // Assert no calls to the logger were made. - mock.AssertExpectationsForObjects(t, logger) -} - -func TestRegisterDaemonServiceWithCallback_DoubleRegistrationFails(t *testing.T) { - // lock synchronizes callback flags and was added to avoid race test failures. - callback1Called := atomic.Bool{} - callback2Called := atomic.Bool{} - - ufm, _ := createTestMonitor() - // First registration should succeed. - err := ufm.RegisterDaemonServiceWithCallback("test-service", 200*time.Millisecond, func() { - callback1Called.Store(true) - }) - require.NoError(t, err) - - // Register the same daemon service again. This should fail, and 50ms update frequency should be ignored. - err = ufm.RegisterDaemonServiceWithCallback("test-service", 50*time.Millisecond, func() { - callback2Called.Store(true) - }) - require.ErrorContains(t, err, "service already registered") - - // Validate that the original callback is still in effect for the original 200ms update frequency. - // The 50ms update frequency should have invoked a callback if it were applied. - time.Sleep(80 * time.Millisecond) - require.False(t, callback1Called.Load()) - require.False(t, callback2Called.Load()) - - // Validate no issues with RegisterValidResponse after a double registration was attempted. - require.NoError(t, ufm.RegisterValidResponse("test-service")) - - // Sleep until the callback should be called. - time.Sleep(250 * time.Millisecond) - require.True(t, callback1Called.Load()) - require.False(t, callback2Called.Load()) - - ufm.Stop() -} - -func TestRegisterDaemonService_RegistrationFailsAfterStop(t *testing.T) { - ufm, logger := createTestMonitor() - ufm.Stop() - err := ufm.RegisterDaemonService("test-service", 50*time.Millisecond) - require.ErrorContains(t, err, "monitor has been stopped") - - // Any scheduled functions with error logs that were not cleaned up should trigger before this sleep finishes. - time.Sleep(100 * time.Millisecond) - mock.AssertExpectationsForObjects(t, logger) -} - -func TestRegisterDaemonServiceWithCallback_RegistrationFailsAfterStop(t *testing.T) { - ufm, _ := createTestMonitor() - ufm.Stop() - - callbackCalled := atomic.Bool{} - - // Registering a daemon service with a callback should fail after the monitor has been stopped. - err := ufm.RegisterDaemonServiceWithCallback("test-service", 50*time.Millisecond, func() { - callbackCalled.Store(true) - }) - require.ErrorContains(t, err, "monitor has been stopped") - - // Wait until after the callback duration has expired. The callback should not be called. - time.Sleep(75 * time.Millisecond) - - // Validate that the callback was not called. 
- require.False(t, callbackCalled.Load()) -} - -func TestRegisterValidResponse_NegativeUpdateDelay(t *testing.T) { - ufm, logger := createTestMonitor() - err := ufm.RegisterDaemonService("test-service", -50*time.Millisecond) - require.ErrorContains(t, err, "update delay -50ms must be positive") - - // Sanity check: no calls to the logger should have been made. - mock.AssertExpectationsForObjects(t, logger) -} - -func TestRegisterValidResponseWithCallback_NegativeUpdateDelay(t *testing.T) { - ufm, _ := createTestMonitor() - err := ufm.RegisterDaemonServiceWithCallback("test-service", -50*time.Millisecond, func() {}) - require.ErrorContains(t, err, "update delay -50ms must be positive") -} - -func TestPanicServiceNotResponding(t *testing.T) { - panicFunc := types.PanicServiceNotResponding("test-service") - require.Panics(t, panicFunc) -} - -func TestLogErrorServiceNotResponding(t *testing.T) { - logger := &mocks.Logger{} - logger.On("Error", "daemon not responding", "service", "test-service").Return() - logFunc := types.LogErrorServiceNotResponding("test-service", logger) - logFunc() - - // Assert: the logger was called with the expected arguments. - mock.AssertExpectationsForObjects(t, logger) -} diff --git a/protocol/daemons/server/types/util.go b/protocol/daemons/server/types/util.go deleted file mode 100644 index 59c07dcb6fe..00000000000 --- a/protocol/daemons/server/types/util.go +++ /dev/null @@ -1,9 +0,0 @@ -package types - -import "time" - -// MaximumAcceptableUpdateDelay computes the maximum acceptable update delay for a daemon service as a -// multiple of the loop delay. -func MaximumAcceptableUpdateDelay(loopDelayMs uint32) time.Duration { - return MaximumLoopDelayMultiple * time.Duration(loopDelayMs) * time.Millisecond -} diff --git a/protocol/daemons/server/types/util_test.go b/protocol/daemons/server/types/util_test.go deleted file mode 100644 index 497faf19dd9..00000000000 --- a/protocol/daemons/server/types/util_test.go +++ /dev/null @@ -1,14 +0,0 @@ -package types - -import ( - "github.com/stretchr/testify/require" - "testing" - "time" -) - -func TestMaximumAcceptableUpdateDelay(t *testing.T) { - loopDelayMs := uint32(1000) - expected := time.Duration(MaximumLoopDelayMultiple*loopDelayMs) * time.Millisecond - actual := MaximumAcceptableUpdateDelay(loopDelayMs) - require.Equal(t, expected, actual) -} diff --git a/protocol/daemons/types/health_checkable.go b/protocol/daemons/types/health_checkable.go index da79405685e..fd5c2246b08 100644 --- a/protocol/daemons/types/health_checkable.go +++ b/protocol/daemons/types/health_checkable.go @@ -22,6 +22,8 @@ type HealthCheckable interface { ReportFailure(err error) // ReportSuccess records a successful update. ReportSuccess() + // ServiceName returns the name of the service being monitored. This name is expected to be unique. + ServiceName() string } // timestampWithError couples a timestamp and error to make it easier to update them in tandem. @@ -65,6 +67,9 @@ type timeBoundedHealthCheckable struct { // logger is the logger used to log errors. logger log.Logger + + // serviceName is the name of the service being monitored. This field is read-only and not synchronized. + serviceName string } // NewTimeBoundedHealthCheckable creates a new HealthCheckable instance. @@ -76,12 +81,18 @@ func NewTimeBoundedHealthCheckable( hc := &timeBoundedHealthCheckable{ timeProvider: timeProvider, logger: logger, + serviceName: serviceName, } // Initialize the timeBoudnedHealthCheckable to an unhealthy state by reporting an error. 
hc.ReportFailure(fmt.Errorf("%v is initializing", serviceName)) return hc } +// ServiceName returns the name of the service being monitored. +func (hc *timeBoundedHealthCheckable) ServiceName() string { + return hc.serviceName +} + // ReportSuccess records a successful update. This method is thread-safe. func (h *timeBoundedHealthCheckable) ReportSuccess() { h.Lock() diff --git a/protocol/mocks/HealthCheckable.go b/protocol/mocks/HealthCheckable.go new file mode 100644 index 00000000000..4d69fd74b2b --- /dev/null +++ b/protocol/mocks/HealthCheckable.go @@ -0,0 +1,63 @@ +// Code generated by mockery v2.14.0. DO NOT EDIT. + +package mocks + +import mock "github.com/stretchr/testify/mock" + +// HealthCheckable is an autogenerated mock type for the HealthCheckable type +type HealthCheckable struct { + mock.Mock +} + +// HealthCheck provides a mock function with given fields: +func (_m *HealthCheckable) HealthCheck() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// ReportFailure provides a mock function with given fields: err +func (_m *HealthCheckable) ReportFailure(err error) { + _m.Called(err) +} + +// ReportSuccess provides a mock function with given fields: +func (_m *HealthCheckable) ReportSuccess() { + _m.Called() +} + +// ServiceName provides a mock function with given fields: +func (_m *HealthCheckable) ServiceName() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +type mockConstructorTestingTNewHealthCheckable interface { + mock.TestingT + Cleanup(func()) +} + +// NewHealthCheckable creates a new instance of HealthCheckable. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. 
+func NewHealthCheckable(t mockConstructorTestingTNewHealthCheckable) *HealthCheckable { + mock := &HealthCheckable{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/protocol/mocks/Makefile b/protocol/mocks/Makefile index f7c549814c2..b47321d1bf3 100644 --- a/protocol/mocks/Makefile +++ b/protocol/mocks/Makefile @@ -21,6 +21,7 @@ mock-gen: @go run github.com/vektra/mockery/v2 --name=AppOptions --dir=$(GOPATH)/pkg/mod/github.com/dydxprotocol/cosmos-sdk@$(COSMOS_VERSION)/server/types --recursive --output=./mocks @go run github.com/vektra/mockery/v2 --name=Logger --dir=$(GOPATH)/pkg/mod/github.com/dydxprotocol/cometbft@$(COMETBFT_VERSION)/libs/log --filename=logger.go --recursive --output=./mocks @go run github.com/vektra/mockery/v2 --name=MsgRouter --dir=./lib --recursive --output=./mocks + @go run github.com/vektra/mockery/v2 --name=HealthCheckable --dir=./daemons/types --recursive --output=./mocks @go run github.com/vektra/mockery/v2 --name=PrepareBridgeKeeper --dir=./app/prepare --recursive --output=./mocks @go run github.com/vektra/mockery/v2 --name=PrepareClobKeeper --dir=./app/prepare --recursive --output=./mocks @go run github.com/vektra/mockery/v2 --name=PreparePerpetualsKeeper --dir=./app/prepare --recursive --output=./mocks diff --git a/protocol/testutil/app/app.go b/protocol/testutil/app/app.go index c228da42d45..fa902d41bf7 100644 --- a/protocol/testutil/app/app.go +++ b/protocol/testutil/app/app.go @@ -490,7 +490,7 @@ func (tApp *TestApp) initChainIfNeeded() { } if tApp.builder.usesDefaultAppConfig { - tApp.App.Server.DisableUpdateMonitoringForTesting() + tApp.App.DisableHealthMonitorForTesting() } if tApp.builder.enableNonDeterminismChecks { From bfbad7dbf26ad238832bad7db8e9c2c90ea89e1f Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Mon, 27 Nov 2023 16:04:29 -0800 Subject: [PATCH 02/15] PR Comments. --- protocol/app/app.go | 8 ++++---- protocol/daemons/server/bridge.go | 8 ++------ protocol/daemons/server/liquidation.go | 2 ++ protocol/daemons/server/pricefeed.go | 12 +++++++----- protocol/daemons/server/server.go | 17 +++++++++++++++++ protocol/daemons/server/types/health_monitor.go | 5 ++--- .../daemons/server/types/health_monitor_test.go | 2 +- 7 files changed, 35 insertions(+), 19 deletions(-) diff --git a/protocol/app/app.go b/protocol/app/app.go index f1e256ed87e..6a5e53b418d 100644 --- a/protocol/app/app.go +++ b/protocol/app/app.go @@ -295,7 +295,7 @@ type App struct { LiquidationsClient *liquidationclient.Client BridgeClient *bridgeclient.Client - HealthMonitor *daemonservertypes.HealthMonitor + DaemonHealthMonitor *daemonservertypes.HealthMonitor } // assertAppPreconditions assert invariants required for an application to start. 
@@ -595,7 +595,7 @@ func New( bridgeEventManager := bridgedaemontypes.NewBridgeEventManager(timeProvider) app.Server.WithBridgeEventManager(bridgeEventManager) - app.HealthMonitor = daemonservertypes.NewHealthMonitor( + app.DaemonHealthMonitor = daemonservertypes.NewHealthMonitor( daemonservertypes.DaemonStartupGracePeriod, daemonservertypes.HealthCheckPollFrequency, app.Logger(), @@ -1233,7 +1233,7 @@ func (app *App) MonitorDaemon( healthCheckableDaemon daemontypes.HealthCheckable, maximumAcceptableUpdateDelay time.Duration, ) { - if err := app.HealthMonitor.RegisterService(healthCheckableDaemon, maximumAcceptableUpdateDelay); err != nil { + if err := app.DaemonHealthMonitor.RegisterService(healthCheckableDaemon, maximumAcceptableUpdateDelay); err != nil { app.Logger().Error( "Failed to register daemon service with update monitor", "error", @@ -1249,7 +1249,7 @@ func (app *App) MonitorDaemon( // DisableHealthMonitorForTesting disables the health monitor for testing. func (app *App) DisableHealthMonitorForTesting() { - app.HealthMonitor.DisableForTesting() + app.DaemonHealthMonitor.DisableForTesting() } // hydrateMemStores hydrates the memStores used for caching state. diff --git a/protocol/daemons/server/bridge.go b/protocol/daemons/server/bridge.go index 632d72ddb21..a4d0de5683c 100644 --- a/protocol/daemons/server/bridge.go +++ b/protocol/daemons/server/bridge.go @@ -3,6 +3,7 @@ package server import ( "context" "github.com/dydxprotocol/v4-chain/protocol/daemons/bridge/api" + "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" bdtypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types/bridge" ) @@ -29,12 +30,7 @@ func (s *Server) AddBridgeEvents( *api.AddBridgeEventsResponse, error, ) { - // If the daemon is unable to report a response, there is either an error in the registration of - // this daemon, or another one. In either case, the protocol should panic. - // TODO(CORE-582): Re-enable this check once the bridge daemon is fixed in local / CI environments. 
- //if err := s.reportResponse(types.BridgeDaemonServiceName); err != nil { - // panic(err) - //} + s.reportResponse(types.BridgeDaemonServiceName) if err := s.bridgeEventManager.AddBridgeEvents(req.BridgeEvents); err != nil { return nil, err } diff --git a/protocol/daemons/server/liquidation.go b/protocol/daemons/server/liquidation.go index 63c8f6b1134..7cf8d351392 100644 --- a/protocol/daemons/server/liquidation.go +++ b/protocol/daemons/server/liquidation.go @@ -4,6 +4,7 @@ import ( "context" "github.com/cosmos/cosmos-sdk/telemetry" "github.com/dydxprotocol/v4-chain/protocol/daemons/liquidation/api" + "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" liquidationtypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types/liquidations" "github.com/dydxprotocol/v4-chain/protocol/lib/metrics" ) @@ -36,6 +37,7 @@ func (s *Server) LiquidateSubaccounts( metrics.Received, metrics.Count, ) + s.reportResponse(types.LiquidationsDaemonServiceName) s.liquidatableSubaccountIds.UpdateSubaccountIds(req.SubaccountIds) return &api.LiquidateSubaccountsResponse{}, nil } diff --git a/protocol/daemons/server/pricefeed.go b/protocol/daemons/server/pricefeed.go index 0337483fc38..1d43f984168 100644 --- a/protocol/daemons/server/pricefeed.go +++ b/protocol/daemons/server/pricefeed.go @@ -3,6 +3,7 @@ package server import ( "context" errorsmod "cosmossdk.io/errors" + "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" "time" gometrics "github.com/armon/go-metrics" @@ -11,7 +12,7 @@ import ( "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/api" pricefeedmetrics "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/metrics" pricefeedtypes "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types/pricefeed" - "github.com/dydxprotocol/v4-chain/protocol/daemons/types" + daemontypes "github.com/dydxprotocol/v4-chain/protocol/daemons/types" "github.com/dydxprotocol/v4-chain/protocol/lib/metrics" ) @@ -35,6 +36,7 @@ func (s *Server) UpdateMarketPrices( ctx context.Context, req *api.UpdateMarketPricesRequest, ) (*api.UpdateMarketPricesResponse, error) { + s.reportResponse(types.PricefeedDaemonServiceName) // Measure latency in ingesting and handling gRPC price update. defer telemetry.ModuleMeasureSince( metrics.PricefeedServer, @@ -46,7 +48,7 @@ func (s *Server) UpdateMarketPrices( if s.marketToExchange == nil { panic( errorsmod.Wrapf( - types.ErrServerNotInitializedCorrectly, + daemontypes.ErrServerNotInitializedCorrectly, "MarketToExchange not initialized", ), ) @@ -65,7 +67,7 @@ func (s *Server) UpdateMarketPrices( // validateMarketPricesUpdatesMessage validates a `UpdateMarketPricesRequest`. 
func validateMarketPricesUpdatesMessage(req *api.UpdateMarketPricesRequest) error { if len(req.MarketPriceUpdates) == 0 { - return types.ErrPriceFeedMarketPriceUpdateEmpty + return daemontypes.ErrPriceFeedMarketPriceUpdateEmpty } for _, mpu := range req.MarketPriceUpdates { @@ -95,7 +97,7 @@ func validateMarketPriceUpdate(mpu *api.MarketPriceUpdate) error { for _, ep := range mpu.ExchangePrices { if ep.Price == constants.DefaultPrice { return generateSdkErrorForExchangePrice( - types.ErrPriceFeedInvalidPrice, + daemontypes.ErrPriceFeedInvalidPrice, ep, mpu.MarketId, ) @@ -103,7 +105,7 @@ func validateMarketPriceUpdate(mpu *api.MarketPriceUpdate) error { if ep.LastUpdateTime == nil { return generateSdkErrorForExchangePrice( - types.ErrPriceFeedLastUpdateTimeNotSet, + daemontypes.ErrPriceFeedLastUpdateTimeNotSet, ep, mpu.MarketId, ) diff --git a/protocol/daemons/server/server.go b/protocol/daemons/server/server.go index 8c3cb42618d..28ef9b8fa8b 100644 --- a/protocol/daemons/server/server.go +++ b/protocol/daemons/server/server.go @@ -1,12 +1,15 @@ package server import ( + gometrics "github.com/armon/go-metrics" "github.com/cometbft/cometbft/libs/log" + "github.com/cosmos/cosmos-sdk/telemetry" bridgeapi "github.com/dydxprotocol/v4-chain/protocol/daemons/bridge/api" "github.com/dydxprotocol/v4-chain/protocol/daemons/constants" liquidationapi "github.com/dydxprotocol/v4-chain/protocol/daemons/liquidation/api" pricefeedapi "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/api" daemontypes "github.com/dydxprotocol/v4-chain/protocol/daemons/types" + "github.com/dydxprotocol/v4-chain/protocol/lib/metrics" "net" "syscall" ) @@ -48,6 +51,20 @@ func (server *Server) Stop() { server.gsrv.Stop() } +// reportResponse reports a response from a daemon service for metrics collection purposes. +func (server *Server) reportResponse(daemonKey string) { + telemetry.IncrCounterWithLabels( + []string{ + metrics.DaemonServer, + metrics.ValidResponse, + }, + 1, + []gometrics.Label{ + metrics.GetLabelForStringValue(metrics.Daemon, daemonKey), + }, + ) +} + // Start clears the current socket and establishes a new socket connection // on the local filesystem. // See URL for more information: https://eli.thegreenplace.net/2019/unix-domain-sockets-in-go/ diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index e3dd3b95e53..771af6f833c 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -95,10 +95,9 @@ func (hc *healthChecker) Poll() { if !hc.firstFailureInStreak.IsZero() && now.Sub(hc.firstFailureInStreak.Timestamp()) >= hc.maximumAcceptableUnhealthyDuration { hc.unhealthyCallback(hc.firstFailureInStreak.Error()) - } else { - // If we do not execute the callback, schedule the next poll. - hc.timer.Reset(hc.pollFrequency) } + // Schedule the next poll. 
+ hc.timer.Reset(hc.pollFrequency) } func (hc *healthChecker) Stop() { diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index 8fba05764ae..487cdd3368f 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -283,7 +283,7 @@ func TestRegisterService_RegistrationFailsAfterStop(t *testing.T) { mock.AssertExpectationsForObjects(t, logger) } -func TestRegisterValidResponseWithCallback_NegativeUpdateDuration(t *testing.T) { +func TestRegisterValidResponseWithCallback_NegativeUnhealthyDuration(t *testing.T) { ufm, _ := createTestMonitor() hc := mockFailingHealthCheckerWithError("test-service", TestError1) err := ufm.RegisterServiceWithCallback(hc, -50*time.Millisecond, func(error) {}) From d39f6c56c1dacae001331de83d0ba2e6dc6c03ae Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Mon, 27 Nov 2023 16:38:26 -0800 Subject: [PATCH 03/15] Add comment. --- protocol/daemons/server/types/health_monitor.go | 1 + 1 file changed, 1 insertion(+) diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index 771af6f833c..f412411849f 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -11,6 +11,7 @@ import ( ) const ( + // HealthCheckPollFrequency is the frequency at which the health checkable service is polled. HealthCheckPollFrequency = 5 * time.Second ) From bc90892f2fdbf187f700b1161dff02dd8d9a7c81 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Mon, 27 Nov 2023 16:52:23 -0800 Subject: [PATCH 04/15] Add mutex to health checker. --- .../daemons/server/types/health_monitor.go | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index f412411849f..b8ab2bc8813 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -44,41 +44,53 @@ func (u *timestampWithError) Error() error { // healthChecker encapsulates the logic for monitoring the health of a health checkable service. type healthChecker struct { + // The following fields are initialized in StartNewHealthChecker and are not modified after initialization. // healthCheckable is the health checkable service to be monitored. healthCheckable types.HealthCheckable - // timer triggers a health check poll for a health checkable service. - timer *time.Timer - // pollFrequency is the frequency at which the health checkable service is polled. pollFrequency time.Duration + // maximumAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to + // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the + // specified callback function. + maximumAcceptableUnhealthyDuration time.Duration + + // unhealthyCallback is the callback function to be executed if the health checkable service remains + // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. + // This callback function is executed with the error that caused the service to become unhealthy. + unhealthyCallback func(error) + + // lock is used to synchronize access to the health checker's dynamically updated fields. 
+ lock sync.Mutex + + // The following fields are dynamically updated by the health checker: + // timer triggers a health check poll for a health checkable service. + // Access to the timer is synchronized. + timer *time.Timer + // mostRecentSuccess is the timestamp of the most recent successful health check. + // Access to mostRecentSuccess is synchronized. mostRecentSuccess time.Time // firstFailureInStreak is the timestamp of the first error in the most recent streak of errors. It is set // whenever the service toggles from healthy to an unhealthy state, and used to determine how long the daemon has // been unhealthy. If this timestamp is nil, then the error streak ended before it could trigger a callback. + // Access to firstFailureInStreak is synchronized. firstFailureInStreak timestampWithError - // unhealthyCallback is the callback function to be executed if the health checkable service remains - // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. - // This callback function is executed with the error that caused the service to become unhealthy. - unhealthyCallback func(error) - // timeProvider is used to get the current time. It is added as a field so that it can be mocked in tests. + // Access to timeProvider is synchronized. timeProvider libtime.TimeProvider - - // maximumAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to - // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the - // specified callback function. - maximumAcceptableUnhealthyDuration time.Duration } // Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the // maximum acceptable unhealthy duration, the callback function is executed. -// This method is publicly exposed for testing. +// This method is publicly exposed for testing. This method is synchronized. func (hc *healthChecker) Poll() { + hc.lock.Lock() + defer hc.lock.Unlock() + // Don't return an error if the monitor has been disabled. err := hc.healthCheckable.HealthCheck() now := hc.timeProvider.Now() @@ -101,7 +113,11 @@ func (hc *healthChecker) Poll() { hc.timer.Reset(hc.pollFrequency) } +// Stop stops the health checker. This method is synchronized. func (hc *healthChecker) Stop() { + hc.lock.Lock() + defer hc.lock.Unlock() + hc.timer.Stop() } From 857d7042de32e53d0cf38bd7325fe2ea8d9892d1 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Tue, 28 Nov 2023 15:59:53 -0800 Subject: [PATCH 05/15] PR comments. --- protocol/app/app.go | 3 + protocol/app/app_test.go | 6 +- protocol/daemons/metrics/client/client.go | 3 + .../daemons/server/types/health_checker.go | 152 ++++++++++++++++++ .../server/types/health_checker_test.go | 127 +++++++++++++++ .../daemons/server/types/health_monitor.go | 141 ++-------------- .../server/types/health_monitor_test.go | 152 ++++-------------- protocol/daemons/server/types/stoppable.go | 7 + 8 files changed, 341 insertions(+), 250 deletions(-) create mode 100644 protocol/daemons/server/types/health_checker.go create mode 100644 protocol/daemons/server/types/health_checker_test.go create mode 100644 protocol/daemons/server/types/stoppable.go diff --git a/protocol/app/app.go b/protocol/app/app.go index 6a5e53b418d..ee947db019b 100644 --- a/protocol/app/app.go +++ b/protocol/app/app.go @@ -672,6 +672,9 @@ func New( // Start the Metrics Daemon. // The metrics daemon is purely used for observability. 
It should never bring the app down. // TODO(CLOB-960) Don't start this goroutine if telemetry is disabled + // Note: the metrics daemon is such a simple go-routine that we don't bother implementing a health-check + // for this service. The task loop does not produce any errors because the telemetry calls themselves are + // not error-returning, so in effect this daemon would never become unhealthy. go func() { defer func() { if r := recover(); r != nil { diff --git a/protocol/app/app_test.go b/protocol/app/app_test.go index b845cc71b95..6503d9c4190 100644 --- a/protocol/app/app_test.go +++ b/protocol/app/app_test.go @@ -233,5 +233,9 @@ func TestMonitorDaemon_Panics(t *testing.T) { app.MonitorDaemon(hc, 5*time.Minute) // The second registration should fail, causing a panic. - require.Panics(t, func() { app.MonitorDaemon(hc, 5*time.Minute) }) + require.PanicsWithError( + t, + "service test-service already registered", + func() { app.MonitorDaemon(hc, 5*time.Minute) }, + ) } diff --git a/protocol/daemons/metrics/client/client.go b/protocol/daemons/metrics/client/client.go index a714a637132..5e86debedda 100644 --- a/protocol/daemons/metrics/client/client.go +++ b/protocol/daemons/metrics/client/client.go @@ -19,6 +19,9 @@ var ( // Start begins a job that periodically: // 1) Emits metrics about app version and git commit. +// Note: the metrics daemon is such a simple go-routine that we don't bother implementing a health-check +// for this service. The task loop does not produce any errors because the telemetry calls themselves are +// not error-returning, so in effect this daemon would never become unhealthy. func Start( ctx context.Context, logger log.Logger, diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go new file mode 100644 index 00000000000..ef465566ada --- /dev/null +++ b/protocol/daemons/server/types/health_checker.go @@ -0,0 +1,152 @@ +package types + +import ( + "github.com/cometbft/cometbft/libs/log" + "github.com/dydxprotocol/v4-chain/protocol/daemons/types" + libtime "github.com/dydxprotocol/v4-chain/protocol/lib/time" + "sync" + "time" +) + +// timestampWithError couples a timestamp and error to make it easier to update them in tandem. The +// timestampWithError will track the timestamp of the first error in a streak of errors, but keeps a record of the +// most recent error. This is useful for determining how long a service has been unhealthy, and the current state +// of the service. +type timestampWithError struct { + timestamp time.Time + err error +} + +func (u *timestampWithError) Update(timestamp time.Time, err error) { + // If the timestamp is zero, this is the first update, so set the timestamp. + if u.timestamp.IsZero() { + u.timestamp = timestamp + } + u.err = err +} + +func (u *timestampWithError) Reset() { + u.timestamp = time.Time{} + u.err = nil +} + +func (u *timestampWithError) IsZero() bool { + return u.timestamp.IsZero() && u.err == nil +} + +func (u *timestampWithError) Timestamp() time.Time { + return u.timestamp +} + +func (u *timestampWithError) Error() error { + return u.err +} + +// healthChecker encapsulates the logic for monitoring the health of a health checkable service. +type healthChecker struct { + // The following fields are initialized in StartNewHealthChecker and are not modified after initialization. + // healthCheckable is the health checkable service to be monitored. 
+ healthCheckable types.HealthCheckable + + // pollFrequency is the frequency at which the health checkable service is polled. + pollFrequency time.Duration + + // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to + // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the + // specified callback function. + maxAcceptableUnhealthyDuration time.Duration + + // unhealthyCallback is the callback function to be executed if the health checkable service remains + // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. + // This callback function is executed with the error that caused the service to become unhealthy. + unhealthyCallback func(error) + + logger log.Logger + + // lock is used to synchronize access to the health checker's dynamically updated fields. + lock sync.Mutex + + // The following fields are dynamically updated by the health checker: + // timer triggers a health check poll for a health checkable service. + // Access to the timer is synchronized. + timer *time.Timer + + // mostRecentSuccess is the timestamp of the most recent successful health check. + // Access to mostRecentSuccess is synchronized. + mostRecentSuccess time.Time + + // mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well + // as the most recent error. It is updated on every error and reset every time the service sees a healthy response. + // This field is used to determine how long the daemon has been unhealthy. If this timestamp is nil, then either + // the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback. + // Access to mostRecentFailureStreakError is synchronized. + mostRecentFailureStreakError timestampWithError + + // timeProvider is used to get the current time. It is added as a field so that it can be mocked in tests. + // Access to timeProvider is synchronized. + timeProvider libtime.TimeProvider +} + +// Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the +// maximum acceptable unhealthy duration, the callback function is executed. +// This method is publicly exposed for testing. This method is synchronized. +func (hc *healthChecker) Poll() { + hc.lock.Lock() + defer hc.lock.Unlock() + + err := hc.healthCheckable.HealthCheck() + now := hc.timeProvider.Now() + + // Schedule the next poll + defer hc.timer.Reset(hc.pollFrequency) + + // Capture healthy response. + if err == nil { + hc.mostRecentSuccess = now + // Whenever the service is healthy, reset the first failure in streak timestamp. + hc.mostRecentFailureStreakError.Reset() + return + } + + hc.mostRecentFailureStreakError.Update(now, err) + + // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the + // callback function. + streakDuration := now.Sub(hc.mostRecentFailureStreakError.Timestamp()) + if !hc.mostRecentFailureStreakError.IsZero() && + streakDuration >= hc.maxAcceptableUnhealthyDuration { + hc.unhealthyCallback(hc.mostRecentFailureStreakError.Error()) + } +} + +// Stop stops the health checker. This method is synchronized. +func (hc *healthChecker) Stop() { + hc.lock.Lock() + defer hc.lock.Unlock() + + hc.timer.Stop() +} + +// StartNewHealthChecker creates and starts a new health checker for a health checkable service. 
+func StartNewHealthChecker( + healthCheckable types.HealthCheckable, + pollFrequency time.Duration, + unhealthyCallback func(error), + timeProvider libtime.TimeProvider, + maximumAcceptableUnhealthyDuration time.Duration, + startupGracePeriod time.Duration, + logger log.Logger, +) *healthChecker { + checker := &healthChecker{ + healthCheckable: healthCheckable, + pollFrequency: pollFrequency, + unhealthyCallback: unhealthyCallback, + timeProvider: timeProvider, + maxAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, + logger: logger, + } + // The first poll is scheduled after the startup grace period to allow the service to initialize. + checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll) + + return checker +} diff --git a/protocol/daemons/server/types/health_checker_test.go b/protocol/daemons/server/types/health_checker_test.go new file mode 100644 index 00000000000..fc37b356bcf --- /dev/null +++ b/protocol/daemons/server/types/health_checker_test.go @@ -0,0 +1,127 @@ +package types_test + +import ( + "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" + "github.com/dydxprotocol/v4-chain/protocol/mocks" + "github.com/stretchr/testify/require" + "testing" + "time" +) + +func TestHealthChecker(t *testing.T) { + tests := map[string]struct { + healthCheckResponses []error + expectedUnhealthy error + }{ + "initialized: no callback triggered": { + healthCheckResponses: []error{}, + expectedUnhealthy: nil, + }, + "healthy, then unhealthy for < maximum unhealthy duration: no callback triggered": { + healthCheckResponses: []error{ + nil, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy for < maximum unhealthy duration: no callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy, healthy, unhealthy: no callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + nil, + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: nil, + }, + "unhealthy for maximum unhealthy duration: callback triggered": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + }, + expectedUnhealthy: TestError1, + }, + "unhealthy with multiple errors: most recent error returned": { + healthCheckResponses: []error{ + TestError1, + TestError1, + TestError1, + TestError1, + TestError1, + TestError2, + }, + expectedUnhealthy: TestError2, + }, + } + for name, test := range tests { + t.Run(name, func(t *testing.T) { + // Setup. + // Set up callback to track error passed to it. + callback, callbackError := callbackWithErrorPointer() + + // Set up health checkable service. + checkable := &mocks.HealthCheckable{} + for _, response := range test.healthCheckResponses { + checkable.On("HealthCheck").Return(response).Once() + } + + // Set up time provider to return a sequence of timestamps one second apart starting at Time0. + timeProvider := &mocks.TimeProvider{} + for i := range test.healthCheckResponses { + timeProvider.On("Now").Return(Time0.Add(time.Duration(i) * time.Second)).Once() + } + + healthChecker := types.StartNewHealthChecker( + checkable, + TestLargeDuration, // set to a >> value so that poll is never auto-triggered by the timer + callback, + timeProvider, + TestMaximumUnhealthyDuration, + types.DaemonStartupGracePeriod, + &mocks.Logger{}, + ) + + // Cleanup. 
+ defer func() { + healthChecker.Stop() + }() + + // Act - simulate the health checker polling for updates. + for i := 0; i < len(test.healthCheckResponses); i++ { + healthChecker.Poll() + } + + // Assert. + // Validate the expected polls occurred according to the mocks. + checkable.AssertExpectations(t) + timeProvider.AssertExpectations(t) + + // Validate the callback was called with the expected error. + if test.expectedUnhealthy == nil { + require.NoError(t, *callbackError) + } else { + require.ErrorContains(t, *callbackError, test.expectedUnhealthy.Error()) + } + }) + } +} diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index b8ab2bc8813..28fcd34eccf 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -15,134 +15,6 @@ const ( HealthCheckPollFrequency = 5 * time.Second ) -// timestampWithError couples a timestamp and error to make it easier to update them in tandem. -type timestampWithError struct { - timestamp time.Time - err error -} - -func (u *timestampWithError) Update(timestamp time.Time, err error) { - u.timestamp = timestamp - u.err = err -} - -func (u *timestampWithError) Reset() { - u.Update(time.Time{}, nil) -} - -func (u *timestampWithError) IsZero() bool { - return u.timestamp.IsZero() -} - -func (u *timestampWithError) Timestamp() time.Time { - return u.timestamp -} - -func (u *timestampWithError) Error() error { - return u.err -} - -// healthChecker encapsulates the logic for monitoring the health of a health checkable service. -type healthChecker struct { - // The following fields are initialized in StartNewHealthChecker and are not modified after initialization. - // healthCheckable is the health checkable service to be monitored. - healthCheckable types.HealthCheckable - - // pollFrequency is the frequency at which the health checkable service is polled. - pollFrequency time.Duration - - // maximumAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to - // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the - // specified callback function. - maximumAcceptableUnhealthyDuration time.Duration - - // unhealthyCallback is the callback function to be executed if the health checkable service remains - // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. - // This callback function is executed with the error that caused the service to become unhealthy. - unhealthyCallback func(error) - - // lock is used to synchronize access to the health checker's dynamically updated fields. - lock sync.Mutex - - // The following fields are dynamically updated by the health checker: - // timer triggers a health check poll for a health checkable service. - // Access to the timer is synchronized. - timer *time.Timer - - // mostRecentSuccess is the timestamp of the most recent successful health check. - // Access to mostRecentSuccess is synchronized. - mostRecentSuccess time.Time - - // firstFailureInStreak is the timestamp of the first error in the most recent streak of errors. It is set - // whenever the service toggles from healthy to an unhealthy state, and used to determine how long the daemon has - // been unhealthy. If this timestamp is nil, then the error streak ended before it could trigger a callback. - // Access to firstFailureInStreak is synchronized. 
- firstFailureInStreak timestampWithError - - // timeProvider is used to get the current time. It is added as a field so that it can be mocked in tests. - // Access to timeProvider is synchronized. - timeProvider libtime.TimeProvider -} - -// Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the -// maximum acceptable unhealthy duration, the callback function is executed. -// This method is publicly exposed for testing. This method is synchronized. -func (hc *healthChecker) Poll() { - hc.lock.Lock() - defer hc.lock.Unlock() - - // Don't return an error if the monitor has been disabled. - err := hc.healthCheckable.HealthCheck() - now := hc.timeProvider.Now() - if err == nil { - hc.mostRecentSuccess = now - // Whenever the service is healthy, reset the first failure in streak timestamp. - hc.firstFailureInStreak.Reset() - } else if hc.firstFailureInStreak.IsZero() { - // Capture the timestamp of the first failure in a new streak. - hc.firstFailureInStreak.Update(now, err) - } - - // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the - // callback function. - if !hc.firstFailureInStreak.IsZero() && - now.Sub(hc.firstFailureInStreak.Timestamp()) >= hc.maximumAcceptableUnhealthyDuration { - hc.unhealthyCallback(hc.firstFailureInStreak.Error()) - } - // Schedule the next poll. - hc.timer.Reset(hc.pollFrequency) -} - -// Stop stops the health checker. This method is synchronized. -func (hc *healthChecker) Stop() { - hc.lock.Lock() - defer hc.lock.Unlock() - - hc.timer.Stop() -} - -// StartNewHealthChecker creates and starts a new health checker for a health checkable service. -func StartNewHealthChecker( - healthCheckable types.HealthCheckable, - pollFrequency time.Duration, - unhealthyCallback func(error), - timeProvider libtime.TimeProvider, - maximumAcceptableUnhealthyDuration time.Duration, - startupGracePeriod time.Duration, -) *healthChecker { - checker := &healthChecker{ - healthCheckable: healthCheckable, - pollFrequency: pollFrequency, - unhealthyCallback: unhealthyCallback, - timeProvider: timeProvider, - maximumAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, - } - // The first poll is scheduled after the startup grace period to allow the service to initialize. - checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll) - - return checker -} - // HealthMonitor monitors the health of daemon services, which implement the HealthCheckable interface. If a // registered health-checkable service sustains an unhealthy state for the maximum acceptable unhealthy duration, // the monitor will execute a callback function. @@ -189,7 +61,8 @@ func (hm *HealthMonitor) DisableForTesting() { // stays unhealthy every time it is polled during the maximum acceptable unhealthy duration, the monitor will // execute the callback function. // This method is synchronized. The method returns an error if the service was already registered or the -// monitor has already been stopped. +// monitor has already been stopped. If the monitor has been stopped, this method will proactively stop the +// health-checkable service before returning. 
func (hm *HealthMonitor) RegisterServiceWithCallback( hc types.HealthCheckable, maximumAcceptableUnhealthyDuration time.Duration, @@ -216,6 +89,12 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( // This could be a concern for short-running integration test cases, where the network // stops before all daemon services have been registered. if hm.stopped { + // If the service is stoppable, stop it. This helps us to clean up daemon services in test cases + // where the monitor is stopped before all daemon services have been registered. + if stoppable, ok := hc.(Stoppable); ok { + stoppable.Stop() + } + return fmt.Errorf( "health check registration failure for service %v: monitor has been stopped", hc.ServiceName(), @@ -233,6 +112,7 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( &libtime.TimeProviderImpl{}, maximumAcceptableUnhealthyDuration, hm.startupGracePeriod, + hm.logger, ) return nil } @@ -263,7 +143,8 @@ func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) f // is unhealthy every time it is polled for a duration greater than or equal to the maximum acceptable unhealthy // duration, the monitor will panic. // This method is synchronized. It returns an error if the service was already registered or the monitor has -// already been stopped. +// already been stopped. If the monitor has been stopped, this method will proactively stop the health-checkable +// service before returning. func (hm *HealthMonitor) RegisterService( hc types.HealthCheckable, maximumAcceptableUnhealthyDuration time.Duration, diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index 487cdd3368f..228988a201b 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -3,6 +3,7 @@ package types_test import ( "fmt" "github.com/dydxprotocol/v4-chain/protocol/daemons/server/types" + daemontypes "github.com/dydxprotocol/v4-chain/protocol/daemons/types" "github.com/dydxprotocol/v4-chain/protocol/mocks" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" @@ -59,123 +60,6 @@ func callbackWithErrorPointer() (func(error), *error) { return callback, &callbackError } -func TestHealthChecker(t *testing.T) { - tests := map[string]struct { - healthCheckResponses []error - expectedUnhealthy error - }{ - "initialized: no callback triggered": { - healthCheckResponses: []error{}, - expectedUnhealthy: nil, - }, - "healthy, then unhealthy for < maximum unhealthy duration: no callback triggered": { - healthCheckResponses: []error{ - nil, - TestError1, - }, - expectedUnhealthy: nil, - }, - "unhealthy for < maximum unhealthy duration: no callback triggered": { - healthCheckResponses: []error{ - TestError1, - TestError1, - TestError1, - TestError1, - TestError1, - }, - expectedUnhealthy: nil, - }, - "unhealthy, healthy, unhealthy: no callback triggered": { - healthCheckResponses: []error{ - TestError1, - TestError1, - TestError1, - TestError1, - TestError1, - nil, - TestError1, - TestError1, - TestError1, - TestError1, - TestError1, - }, - expectedUnhealthy: nil, - }, - "unhealthy for maximum unhealthy duration: callback triggered": { - healthCheckResponses: []error{ - TestError1, - TestError1, - TestError1, - TestError1, - TestError1, - TestError1, - }, - expectedUnhealthy: TestError1, - }, - "unhealthy with multiple errors: first error returned": { - healthCheckResponses: []error{ - TestError1, - TestError2, - TestError2, - TestError2, - 
TestError2, - TestError2, - }, - expectedUnhealthy: TestError1, - }, - } - for name, test := range tests { - t.Run(name, func(t *testing.T) { - // Setup. - // Set up callback to track error passed to it. - callback, callbackError := callbackWithErrorPointer() - - // Set up health checkable service. - checkable := &mocks.HealthCheckable{} - for _, response := range test.healthCheckResponses { - checkable.On("HealthCheck").Return(response).Once() - } - - // Set up time provider to return a sequence of timestamps one second apart starting at Time0. - timeProvider := &mocks.TimeProvider{} - for i := range test.healthCheckResponses { - timeProvider.On("Now").Return(Time0.Add(time.Duration(i) * time.Second)).Once() - } - - healthChecker := types.StartNewHealthChecker( - checkable, - TestLargeDuration, // set to a >> value so that poll is never auto-triggered by the timer - callback, - timeProvider, - TestMaximumUnhealthyDuration, - types.DaemonStartupGracePeriod, - ) - - // Cleanup. - defer func() { - healthChecker.Stop() - }() - - // Act - simulate the health checker polling for updates. - for i := 0; i < len(test.healthCheckResponses); i++ { - healthChecker.Poll() - } - - // Assert. - // Validate the expected polls occurred according to the mocks. - checkable.AssertExpectations(t) - timeProvider.AssertExpectations(t) - - // Validate the callback was called with the expected error. - if test.expectedUnhealthy == nil { - require.NoError(t, *callbackError) - } else { - require.ErrorContains(t, *callbackError, test.expectedUnhealthy.Error()) - } - }) - } -} - // The following tests may still intermittently fail or report false positives on an overloaded system as they rely // on callbacks to execute before the termination of the `time.Sleep` call, which is not guaranteed. func TestRegisterService_Healthy(t *testing.T) { @@ -270,17 +154,47 @@ func TestRegisterServiceWithCallback_DoubleRegistrationFails(t *testing.T) { require.Equal(t, TestError1, *callbackError) } +// Create a struct that implements HealthCheckable and Stoppable to check that the monitor stops the service. +type stoppableFakeHealthChecker struct { + stopped bool +} + +// Implement stub methods to conform to interfaces. +func (f *stoppableFakeHealthChecker) ServiceName() string { return "test-service" } +func (f *stoppableFakeHealthChecker) HealthCheck() error { return nil } +func (f *stoppableFakeHealthChecker) ReportSuccess() {} +func (f *stoppableFakeHealthChecker) ReportFailure(_ error) {} + +// Stop stub tracks whether the service was stopped. +func (f *stoppableFakeHealthChecker) Stop() { + f.stopped = true +} + +var _ types.Stoppable = (*stoppableFakeHealthChecker)(nil) +var _ daemontypes.HealthCheckable = (*stoppableFakeHealthChecker)(nil) + func TestRegisterService_RegistrationFailsAfterStop(t *testing.T) { ufm, logger := createTestMonitor() ufm.Stop() - hc := mockFailingHealthCheckerWithError("test-service", TestError1) - err := ufm.RegisterService(hc, 50*time.Millisecond) + stoppableHc := &stoppableFakeHealthChecker{} + hc2 := mockFailingHealthCheckerWithError("test-service-2", TestError1) + + // Register a stoppable service. + err := ufm.RegisterService(stoppableHc, 50*time.Millisecond) + require.ErrorContains(t, err, "monitor has been stopped") + + // Register a non-stoppable service. + err = ufm.RegisterService(hc2, 50*time.Millisecond) require.ErrorContains(t, err, "monitor has been stopped") // Any scheduled functions with error logs that were not cleaned up should trigger before this sleep finishes. 
time.Sleep(100 * time.Millisecond) mock.AssertExpectationsForObjects(t, logger) + + // Assert that the monitor proactively stops any stoppable service that was registered after the monitor was + // stopped. + require.True(t, stoppableHc.stopped) } func TestRegisterValidResponseWithCallback_NegativeUnhealthyDuration(t *testing.T) { diff --git a/protocol/daemons/server/types/stoppable.go b/protocol/daemons/server/types/stoppable.go new file mode 100644 index 00000000000..fda69d34a88 --- /dev/null +++ b/protocol/daemons/server/types/stoppable.go @@ -0,0 +1,7 @@ +package types + +// Stoppable is an interface for a service that can be stopped. +// This is used to stop services registered with the health monitor. +type Stoppable interface { + Stop() +} From 86fa362f68b2fc4eb0a412dea58ad4e12049b2c6 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Wed, 29 Nov 2023 12:31:58 -0800 Subject: [PATCH 06/15] PR comments. Remove superfluous methods. Comment nit. Method name fix. --- protocol/daemons/server/bridge.go | 15 +- protocol/daemons/server/liquidation.go | 14 +- protocol/daemons/server/pricefeed.go | 16 +- protocol/daemons/server/server.go | 4 +- .../daemons/server/types/health_checker.go | 111 +++++++----- .../daemons/server/types/health_monitor.go | 161 +++++++++++------- .../server/types/health_monitor_test.go | 22 +-- 7 files changed, 211 insertions(+), 132 deletions(-) diff --git a/protocol/daemons/server/bridge.go b/protocol/daemons/server/bridge.go index a4d0de5683c..ad76af6066e 100644 --- a/protocol/daemons/server/bridge.go +++ b/protocol/daemons/server/bridge.go @@ -27,11 +27,18 @@ func (s *Server) AddBridgeEvents( ctx context.Context, req *api.AddBridgeEventsRequest, ) ( - *api.AddBridgeEventsResponse, - error, + response *api.AddBridgeEventsResponse, + err error, ) { - s.reportResponse(types.BridgeDaemonServiceName) - if err := s.bridgeEventManager.AddBridgeEvents(req.BridgeEvents); err != nil { + // Capture valid responses in metrics. + defer func() { + if err == nil { + s.reportValidResponse(types.BridgeDaemonServiceName) + } + }() + + s.reportValidResponse(types.BridgeDaemonServiceName) + if err = s.bridgeEventManager.AddBridgeEvents(req.BridgeEvents); err != nil { return nil, err } return &api.AddBridgeEventsResponse{}, nil diff --git a/protocol/daemons/server/liquidation.go b/protocol/daemons/server/liquidation.go index 7cf8d351392..f418b6c251f 100644 --- a/protocol/daemons/server/liquidation.go +++ b/protocol/daemons/server/liquidation.go @@ -29,7 +29,17 @@ func (server *Server) WithLiquidatableSubaccountIds( func (s *Server) LiquidateSubaccounts( ctx context.Context, req *api.LiquidateSubaccountsRequest, -) (*api.LiquidateSubaccountsResponse, error) { +) ( + response *api.LiquidateSubaccountsResponse, + err error, +) { + // Capture valid responses in metrics.
+ defer func() { + if err == nil { + s.reportValidResponse(types.LiquidationsDaemonServiceName) + } + }() + telemetry.ModuleSetGauge( metrics.LiquidationDaemon, float32(len(req.SubaccountIds)), @@ -37,7 +47,7 @@ func (s *Server) LiquidateSubaccounts( metrics.Received, metrics.Count, ) - s.reportResponse(types.LiquidationsDaemonServiceName) + s.liquidatableSubaccountIds.UpdateSubaccountIds(req.SubaccountIds) return &api.LiquidateSubaccountsResponse{}, nil } diff --git a/protocol/daemons/server/pricefeed.go b/protocol/daemons/server/pricefeed.go index 1d43f984168..b4ef56df4cf 100644 --- a/protocol/daemons/server/pricefeed.go +++ b/protocol/daemons/server/pricefeed.go @@ -35,8 +35,17 @@ func (server *Server) WithPriceFeedMarketToExchangePrices( func (s *Server) UpdateMarketPrices( ctx context.Context, req *api.UpdateMarketPricesRequest, -) (*api.UpdateMarketPricesResponse, error) { - s.reportResponse(types.PricefeedDaemonServiceName) +) ( + response *api.UpdateMarketPricesResponse, + err error, +) { + // Capture valid responses in metrics. + defer func() { + if err == nil { + s.reportValidResponse(types.PricefeedDaemonServiceName) + } + }() + // Measure latency in ingesting and handling gRPC price update. defer telemetry.ModuleMeasureSince( metrics.PricefeedServer, @@ -54,13 +63,14 @@ func (s *Server) UpdateMarketPrices( ) } - if err := validateMarketPricesUpdatesMessage(req); err != nil { + if err = validateMarketPricesUpdatesMessage(req); err != nil { // Log if failure occurs during an update. s.logger.Error("Failed to validate price update message", "error", err) return nil, err } s.marketToExchange.UpdatePrices(req.MarketPriceUpdates) + return &api.UpdateMarketPricesResponse{}, nil } diff --git a/protocol/daemons/server/server.go b/protocol/daemons/server/server.go index 28ef9b8fa8b..60a34c1552c 100644 --- a/protocol/daemons/server/server.go +++ b/protocol/daemons/server/server.go @@ -51,8 +51,8 @@ func (server *Server) Stop() { server.gsrv.Stop() } -// reportResponse reports a response from a daemon service for metrics collection purposes. -func (server *Server) reportResponse(daemonKey string) { +// reportValidResponse reports a valid request/response from a daemon service for metrics collection purposes. +func (server *Server) reportValidResponse(daemonKey string) { telemetry.IncrCounterWithLabels( []string{ metrics.DaemonServer, diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index ef465566ada..352b0813e8c 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -10,8 +10,8 @@ import ( // timestampWithError couples a timestamp and error to make it easier to update them in tandem. The // timestampWithError will track the timestamp of the first error in a streak of errors, but keeps a record of the -// most recent error. This is useful for determining how long a service has been unhealthy, and the current state -// of the service. +// most recent error. This is useful for determining how long a service has been unhealthy, as well as the current +// state of the service. type timestampWithError struct { timestamp time.Time err error @@ -42,15 +42,61 @@ func (u *timestampWithError) Error() error { return u.err } +// healthCheckerMutableState tracks the current health state of the HealthCheckable, encapsulating all mutable state +// into a single struct for ease of synchronization.
+type healthCheckerMutableState struct { + // lock is used to synchronize access to mutable state fields. + lock sync.Mutex + + // mostRecentSuccess is the timestamp of the most recent successful health check. + // Access to mostRecentSuccess is synchronized. + mostRecentSuccess time.Time + + // mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well + // as the most recent error. It is updated on every error and reset every time the service sees a healthy response. + // This field is used to determine how long the daemon has been unhealthy. If this timestamp is nil, then either + // the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback. + // Access to mostRecentFailureStreakError is synchronized. + mostRecentFailureStreakError timestampWithError +} + +// ReportSuccess updates the health checker's mutable state to reflect a successful health check and schedules the next +// poll as an atomic operation. +func (u *healthCheckerMutableState) ReportSuccess(now time.Time) { + u.lock.Lock() + defer u.lock.Unlock() + + u.mostRecentSuccess = now + + // Whenever the service is healthy, reset the first failure in streak timestamp. + u.mostRecentFailureStreakError.Reset() +} + +// ReportFailure updates the health checker's mutable state to reflect a failed health check and schedules the next +// poll as an atomic operation. The method returns the duration of the current failure streak. +func (u *healthCheckerMutableState) ReportFailure(now time.Time, err error) time.Duration { + u.lock.Lock() + defer u.lock.Unlock() + + u.mostRecentFailureStreakError.Update(now, err) + + return now.Sub(u.mostRecentFailureStreakError.Timestamp()) +} + // healthChecker encapsulates the logic for monitoring the health of a health checkable service. type healthChecker struct { - // The following fields are initialized in StartNewHealthChecker and are not modified after initialization. + // mutableState is the mutable state of the health checker. Access to these fields is synchronized. + mutableState *healthCheckerMutableState + // healthCheckable is the health checkable service to be monitored. healthCheckable types.HealthCheckable // pollFrequency is the frequency at which the health checkable service is polled. pollFrequency time.Duration + // timer triggers a health check poll for a health checkable service. + timer *time.Timer + // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the // specified callback function. @@ -61,69 +107,38 @@ type healthChecker struct { // This callback function is executed with the error that caused the service to become unhealthy. unhealthyCallback func(error) - logger log.Logger - - // lock is used to synchronize access to the health checker's dynamically updated fields. - lock sync.Mutex - - // The following fields are dynamically updated by the health checker: - // timer triggers a health check poll for a health checkable service. - // Access to the timer is synchronized. - timer *time.Timer - - // mostRecentSuccess is the timestamp of the most recent successful health check. - // Access to mostRecentSuccess is synchronized. - mostRecentSuccess time.Time - - // mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well - // as the most recent error. 
It is updated on every error and reset every time the service sees a healthy response. - // This field is used to determine how long the daemon has been unhealthy. If this timestamp is nil, then either - // the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback. - // Access to mostRecentFailureStreakError is synchronized. - mostRecentFailureStreakError timestampWithError - // timeProvider is used to get the current time. It is added as a field so that it can be mocked in tests. - // Access to timeProvider is synchronized. timeProvider libtime.TimeProvider + + logger log.Logger } // Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the // maximum acceptable unhealthy duration, the callback function is executed. // This method is publicly exposed for testing. This method is synchronized. func (hc *healthChecker) Poll() { - hc.lock.Lock() - defer hc.lock.Unlock() - err := hc.healthCheckable.HealthCheck() now := hc.timeProvider.Now() - // Schedule the next poll - defer hc.timer.Reset(hc.pollFrequency) - // Capture healthy response. + if err == nil { - hc.mostRecentSuccess = now - // Whenever the service is healthy, reset the first failure in streak timestamp. - hc.mostRecentFailureStreakError.Reset() - return + hc.mutableState.ReportSuccess(now) + } else { + streakDuration := hc.mutableState.ReportFailure(now, err) + // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the + // callback function. + if streakDuration >= hc.maxAcceptableUnhealthyDuration { + hc.unhealthyCallback(err) + } } - hc.mostRecentFailureStreakError.Update(now, err) - - // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the - // callback function. - streakDuration := now.Sub(hc.mostRecentFailureStreakError.Timestamp()) - if !hc.mostRecentFailureStreakError.IsZero() && - streakDuration >= hc.maxAcceptableUnhealthyDuration { - hc.unhealthyCallback(hc.mostRecentFailureStreakError.Error()) - } + // Schedule next poll. + hc.timer.Reset(hc.pollFrequency) } // Stop stops the health checker. This method is synchronized. func (hc *healthChecker) Stop() { - hc.lock.Lock() - defer hc.lock.Unlock() - hc.timer.Stop() } @@ -144,7 +159,9 @@ func StartNewHealthChecker( timeProvider: timeProvider, maxAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, logger: logger, + mutableState: &healthCheckerMutableState{}, } + // The first poll is scheduled after the startup grace period to allow the service to initialize. checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll) diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index 28fcd34eccf..f248a306685 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -15,10 +15,11 @@ const ( HealthCheckPollFrequency = 5 * time.Second ) -// HealthMonitor monitors the health of daemon services, which implement the HealthCheckable interface. If a -// registered health-checkable service sustains an unhealthy state for the maximum acceptable unhealthy duration, -// the monitor will execute a callback function. -type HealthMonitor struct { +// healthMonitorMutableState tracks all mutable state associated with the health monitor. This state is gathered into +// a single struct for ease of synchronization. 
+type healthMonitorMutableState struct { + sync.Mutex + // serviceToHealthChecker maps daemon service names to their update metadata. serviceToHealthChecker map[string]*healthChecker // stopped indicates whether the monitor has been stopped. Additional daemon services cannot be registered @@ -27,8 +28,82 @@ type HealthMonitor struct { // disabled indicates whether the monitor has been disabled. This is used to disable the monitor in testApp // tests, where app.New is not executed. disabled bool - // lock is used to synchronize access to the monitor. - lock sync.Mutex +} + +// NewHealthMonitorMutableState creates a new health monitor mutable state. +func NewHealthMonitorMutableState() *healthMonitorMutableState { + return &healthMonitorMutableState{ + serviceToHealthChecker: make(map[string]*healthChecker), + } +} + +// DisableForTesting disables the health monitor mutable state from receiving updates. This prevents the monitor +// from registering services when called before app initialization and is used for testing. +func (ms *healthMonitorMutableState) DisableForTesting() { + ms.Lock() + defer ms.Unlock() + + ms.disabled = true +} + +// Stop stops the update frequency monitor. This method is synchronized. +func (ms *healthMonitorMutableState) Stop() { + ms.Lock() + defer ms.Unlock() + + // Don't stop the monitor if it has already been stopped. + if ms.stopped { + return + } + + // Stop all health checkers. + for _, checker := range ms.serviceToHealthChecker { + checker.Stop() + } + + ms.stopped = true +} + +// RegisterHealthChecker registers a new health checker for a health checkable with the health monitor. The health +// checker is lazily created using the provided function if needed. This method is synchronized. It returns an error if +// the service was already registered. +func (ms *healthMonitorMutableState) RegisterHealthChecker( + checkable types.HealthCheckable, + lazyHealthCheckerCreator func() *healthChecker, +) error { + ms.Lock() + defer ms.Unlock() + + // Don't register daemon services if the monitor has been disabled. + if ms.disabled { + return nil + } + + // Don't register additional daemon services if the monitor has already been stopped. + // This could be a concern for short-running integration test cases, where the network + // stops before all daemon services have been registered. + if ms.stopped { + // If the service is stoppable, stop it. This helps us to clean up daemon services in test cases + // where the monitor is stopped before all daemon services have been registered. + if stoppable, ok := checkable.(Stoppable); ok { + stoppable.Stop() + } + return nil + } + + if _, ok := ms.serviceToHealthChecker[checkable.ServiceName()]; ok { + return fmt.Errorf("service %v already registered", checkable.ServiceName()) + } + + ms.serviceToHealthChecker[checkable.ServiceName()] = lazyHealthCheckerCreator() + return nil +} + +// HealthMonitor monitors the health of daemon services, which implement the HealthCheckable interface. If a +// registered health-checkable service sustains an unhealthy state for the maximum acceptable unhealthy duration, +// the monitor will execute a callback function. +type HealthMonitor struct { + mutableState *healthMonitorMutableState // These fields are initialized in NewHealthMonitor and are not modified after initialization. 
logger log.Logger @@ -43,18 +118,15 @@ func NewHealthMonitor( logger log.Logger, ) *HealthMonitor { return &HealthMonitor{ - serviceToHealthChecker: make(map[string]*healthChecker), - logger: logger.With(cosmoslog.ModuleKey, "health-monitor"), - startupGracePeriod: startupGracePeriod, - pollingFrequency: pollingFrequency, + mutableState: NewHealthMonitorMutableState(), + logger: logger.With(cosmoslog.ModuleKey, "health-monitor"), + startupGracePeriod: startupGracePeriod, + pollingFrequency: pollingFrequency, } } func (hm *HealthMonitor) DisableForTesting() { - hm.lock.Lock() - defer hm.lock.Unlock() - - hm.disabled = true + hm.mutableState.DisableForTesting() } // RegisterServiceWithCallback registers a HealthCheckable with the health monitor. If the service @@ -68,9 +140,6 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( maximumAcceptableUnhealthyDuration time.Duration, callback func(error), ) error { - hm.lock.Lock() - defer hm.lock.Unlock() - if maximumAcceptableUnhealthyDuration <= 0 { return fmt.Errorf( "health check registration failure for service %v: "+ @@ -80,41 +149,17 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( ) } - // Don't register daemon services if the monitor has been disabled. - if hm.disabled { - return nil - } - - // Don't register additional daemon services if the monitor has already been stopped. - // This could be a concern for short-running integration test cases, where the network - // stops before all daemon services have been registered. - if hm.stopped { - // If the service is stoppable, stop it. This helps us to clean up daemon services in test cases - // where the monitor is stopped before all daemon services have been registered. - if stoppable, ok := hc.(Stoppable); ok { - stoppable.Stop() - } - - return fmt.Errorf( - "health check registration failure for service %v: monitor has been stopped", - hc.ServiceName(), + return hm.mutableState.RegisterHealthChecker(hc, func() *healthChecker { + return StartNewHealthChecker( + hc, + hm.pollingFrequency, + callback, + &libtime.TimeProviderImpl{}, + maximumAcceptableUnhealthyDuration, + hm.startupGracePeriod, + hm.logger, ) - } - - if _, ok := hm.serviceToHealthChecker[hc.ServiceName()]; ok { - return fmt.Errorf("service %v already registered", hc.ServiceName()) - } - - hm.serviceToHealthChecker[hc.ServiceName()] = StartNewHealthChecker( - hc, - hm.pollingFrequency, - callback, - &libtime.TimeProviderImpl{}, - maximumAcceptableUnhealthyDuration, - hm.startupGracePeriod, - hm.logger, - ) - return nil + }) } // PanicServiceNotResponding returns a function that panics with a message indicating that the specified daemon @@ -158,17 +203,5 @@ func (hm *HealthMonitor) RegisterService( // Stop stops the update frequency monitor. This method is synchronized. func (hm *HealthMonitor) Stop() { - hm.lock.Lock() - defer hm.lock.Unlock() - - // Don't stop the monitor if it has already been stopped. - if hm.stopped { - return - } - - for _, checker := range hm.serviceToHealthChecker { - checker.Stop() - } - - hm.stopped = true + hm.mutableState.Stop() } diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index 228988a201b..82a0893e6ef 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -161,7 +161,7 @@ type stoppableFakeHealthChecker struct { // Implement stub methods to conform to interfaces. 
func (f *stoppableFakeHealthChecker) ServiceName() string { return "test-service" } -func (f *stoppableFakeHealthChecker) HealthCheck() error { return nil } +func (f *stoppableFakeHealthChecker) HealthCheck() error { return fmt.Errorf("unhealthy") } func (f *stoppableFakeHealthChecker) ReportSuccess() {} func (f *stoppableFakeHealthChecker) ReportFailure(_ error) {} @@ -174,23 +174,25 @@ var _ types.Stoppable = (*stoppableFakeHealthChecker)(nil) var _ daemontypes.HealthCheckable = (*stoppableFakeHealthChecker)(nil) func TestRegisterService_RegistrationFailsAfterStop(t *testing.T) { - ufm, logger := createTestMonitor() + ufm, _ := createTestMonitor() ufm.Stop() stoppableHc := &stoppableFakeHealthChecker{} hc2 := mockFailingHealthCheckerWithError("test-service-2", TestError1) - // Register a stoppable service. - err := ufm.RegisterService(stoppableHc, 50*time.Millisecond) - require.ErrorContains(t, err, "monitor has been stopped") + // Register unhealthy services. These services are confirmed to trigger a panic if registered when the monitor is + // not stopped. + // Register a stoppable unhealthy service. + err := ufm.RegisterService(stoppableHc, 10*time.Millisecond) + require.Nil(t, err) - // Register a non-stoppable service. - err = ufm.RegisterService(hc2, 50*time.Millisecond) - require.ErrorContains(t, err, "monitor has been stopped") + // Register a non-stoppable unhealthy service. + err = ufm.RegisterService(hc2, 10*time.Millisecond) + require.Nil(t, err) - // Any scheduled functions with error logs that were not cleaned up should trigger before this sleep finishes. + // Since the max allowable unhealthy duration is 10ms, and the polling period is 10ms, 100ms is long enough to wait + // in order to trigger a panic if a service is polled. time.Sleep(100 * time.Millisecond) - mock.AssertExpectationsForObjects(t, logger) // Assert that the monitor proactively stops any stoppable service that was registered after the monitor was // stopped. From 7eb8519fcd9e7d255b28840e4d1399da18013f92 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Wed, 29 Nov 2023 15:48:03 -0800 Subject: [PATCH 07/15] Comment fixes, renames, reduce critical section for synchronized code. --- .../daemons/server/types/health_checker.go | 18 ++++----- .../daemons/server/types/health_monitor.go | 37 +++++++++++++------ 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 352b0813e8c..2f42c95cdda 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -83,26 +83,26 @@ func (u *healthCheckerMutableState) ReportFailure(now time.Time, err error) time return now.Sub(u.mostRecentFailureStreakError.Timestamp()) } -// healthChecker encapsulates the logic for monitoring the health of a health checkable service. +// healthChecker encapsulates the logic for monitoring the health of a health-checkable service. type healthChecker struct { // mutableState is the mutable state of the health checker. Access to these fields is synchronized. mutableState *healthCheckerMutableState - // healthCheckable is the health checkable service to be monitored. + // healthCheckable is the health-checkable service to be monitored. healthCheckable types.HealthCheckable - // pollFrequency is the frequency at which the health checkable service is polled. + // pollFrequency is the frequency at which the health-checkable service is polled. 
pollFrequency time.Duration - // timer triggers a health check poll for a health checkable service. + // timer triggers a health check poll for a health-checkable service. timer *time.Timer - // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health checkable service to + // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health-checkable service to // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the // specified callback function. maxAcceptableUnhealthyDuration time.Duration - // unhealthyCallback is the callback function to be executed if the health checkable service remains + // unhealthyCallback is the callback function to be executed if the health-checkable service remains // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. // This callback function is executed with the error that caused the service to become unhealthy. unhealthyCallback func(error) @@ -113,7 +113,7 @@ type healthChecker struct { logger log.Logger } -// Poll executes a health check for the health checkable service. If the service has been unhealthy for longer than the +// Poll executes a health check for the health-checkable service. If the service has been unhealthy for longer than the // maximum acceptable unhealthy duration, the callback function is executed. // This method is publicly exposed for testing. This method is synchronized. func (hc *healthChecker) Poll() { @@ -137,12 +137,12 @@ func (hc *healthChecker) Poll() { hc.timer.Reset(hc.pollFrequency) } -// Stop stops the health checker. This method is synchronized. +// Stop stops the health checker. This method is not synchronized, as the timer does not need synchronization. func (hc *healthChecker) Stop() { hc.timer.Stop() } -// StartNewHealthChecker creates and starts a new health checker for a health checkable service. +// StartNewHealthChecker creates and starts a new health checker for a health-checkable service. func StartNewHealthChecker( healthCheckable types.HealthCheckable, pollFrequency time.Duration, diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index f248a306685..1cff5f5b0af 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -11,7 +11,7 @@ import ( ) const ( - // HealthCheckPollFrequency is the frequency at which the health checkable service is polled. + // HealthCheckPollFrequency is the frequency at which the health-checkable service is polled. HealthCheckPollFrequency = 5 * time.Second ) @@ -30,8 +30,8 @@ type healthMonitorMutableState struct { disabled bool } -// NewHealthMonitorMutableState creates a new health monitor mutable state. -func NewHealthMonitorMutableState() *healthMonitorMutableState { +// newHealthMonitorMutableState creates a new health monitor mutable state. +func newHealthMonitorMutableState() *healthMonitorMutableState { return &healthMonitorMutableState{ serviceToHealthChecker: make(map[string]*healthChecker), } @@ -64,13 +64,29 @@ func (ms *healthMonitorMutableState) Stop() { ms.stopped = true } -// RegisterHealthChecker registers a new health checker for a health checkable with the health monitor. The health +// RegisterHealthChecker registers a new health checker for a health-checkable with the health monitor. The health // checker is lazily created using the provided function if needed. This method is synchronized. 
It returns an error if // the service was already registered. func (ms *healthMonitorMutableState) RegisterHealthChecker( checkable types.HealthCheckable, lazyHealthCheckerCreator func() *healthChecker, ) error { + stopService := false + + // If the monitor has already been stopped, we want to stop the checkable service before returning. + // However, we'd prefer not to stop the service within the critical section in order to prevent deadlocks. + // This defer will be called last, after the lock is released. + defer func() { + if stopService { + // If the service is stoppable, stop it. This helps us to clean up daemon services in test cases + // where the monitor is stopped before all daemon services have been registered. + if stoppable, ok := checkable.(Stoppable); ok { + stoppable.Stop() + } + } + }() + + // Enter into the critical section. ms.Lock() defer ms.Unlock() @@ -83,11 +99,8 @@ func (ms *healthMonitorMutableState) RegisterHealthChecker( // This could be a concern for short-running integration test cases, where the network // stops before all daemon services have been registered. if ms.stopped { - // If the service is stoppable, stop it. This helps us to clean up daemon services in test cases - // where the monitor is stopped before all daemon services have been registered. - if stoppable, ok := checkable.(Stoppable); ok { - stoppable.Stop() - } + // Toggle the stopService flag to true so that the service is stopped after the lock is released. + stopService = true return nil } @@ -118,7 +131,7 @@ func NewHealthMonitor( logger log.Logger, ) *HealthMonitor { return &HealthMonitor{ - mutableState: NewHealthMonitorMutableState(), + mutableState: newHealthMonitorMutableState(), logger: logger.With(cosmoslog.ModuleKey, "health-monitor"), startupGracePeriod: startupGracePeriod, pollingFrequency: pollingFrequency, @@ -171,7 +184,7 @@ func PanicServiceNotResponding(hc types.HealthCheckable) func(error) { } // LogErrorServiceNotResponding returns a function that logs an error indicating that the specified service -// is not responding. This is ideal for creating a callback function when registering a health checkable service. +// is not responding. This is ideal for creating a callback function when registering a health-checkable service. func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) func(error) { return func(err error) { logger.Error( @@ -184,7 +197,7 @@ func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) f } } -// RegisterService registers a new health checkable service with the health check monitor. If the service +// RegisterService registers a new health-checkable service with the health check monitor. If the service // is unhealthy every time it is polled for a duration greater than or equal to the maximum acceptable unhealthy // duration, the monitor will panic. // This method is synchronized. It returns an error if the service was already registered or the monitor has From f99694783ce7169099075345b7fa35987c2bbd60 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Fri, 1 Dec 2023 17:51:26 -0800 Subject: [PATCH 08/15] PR comments. 
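The RegisterHealthChecker rewrite above shrinks the critical section by separating the decision to stop a service from the act of stopping it: the stopService flag is set while the lock is held, but the Stop call itself runs in a deferred function that executes only after the mutex has been released. Below is a minimal, self-contained sketch of that ordering; the names (registry, stoppable, noopService) are illustrative stand-ins, not the types used in this patch.

package main

import (
	"fmt"
	"sync"
)

type stoppable interface {
	Stop()
}

type registry struct {
	mu      sync.Mutex
	stopped bool
	entries map[string]struct{}
}

func (r *registry) register(name string, svc stoppable) error {
	stopService := false
	// Deferred first, so it runs last: after the deferred Unlock below, i.e.
	// outside the critical section.
	defer func() {
		if stopService {
			svc.Stop()
		}
	}()

	r.mu.Lock()
	defer r.mu.Unlock()

	if r.stopped {
		// Record the decision here; perform the cleanup after unlocking.
		stopService = true
		return nil
	}
	if _, ok := r.entries[name]; ok {
		return fmt.Errorf("service %v already registered", name)
	}
	r.entries[name] = struct{}{}
	return nil
}

type noopService struct{}

func (noopService) Stop() { fmt.Println("stopped outside the critical section") }

func main() {
	r := &registry{entries: map[string]struct{}{}, stopped: true}
	_ = r.register("example", noopService{})
}

Because deferred functions run in last-in, first-out order, the Unlock registered after the cleanup defer always runs first, so the service is never stopped while the lock is held.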
--- protocol/app/app.go | 12 ++-- protocol/app/app_test.go | 6 +- protocol/daemons/server/bridge.go | 12 ++-- protocol/daemons/server/liquidation.go | 11 ++- protocol/daemons/server/pricefeed.go | 12 ++-- .../daemons/server/types/health_checker.go | 71 ++++++++++++++++--- .../daemons/server/types/health_monitor.go | 5 +- 7 files changed, 86 insertions(+), 43 deletions(-) diff --git a/protocol/app/app.go b/protocol/app/app.go index ee947db019b..a13d66b9dc5 100644 --- a/protocol/app/app.go +++ b/protocol/app/app.go @@ -613,7 +613,7 @@ func New( if daemonFlags.Liquidation.Enabled { app.LiquidationsClient = liquidationclient.NewClient(logger) go func() { - app.MonitorDaemon(app.LiquidationsClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.LiquidationsClient, MaximumDaemonUnhealthyDuration) if err := app.LiquidationsClient.Start( // The client will use `context.Background` so that it can have a different context from // the main application. @@ -645,7 +645,7 @@ func New( constants.StaticExchangeDetails, &pricefeedclient.SubTaskRunnerImpl{}, ) - app.MonitorDaemon(app.PriceFeedClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.PriceFeedClient, MaximumDaemonUnhealthyDuration) } // Start Bridge Daemon. @@ -655,7 +655,7 @@ func New( // environments. app.BridgeClient = bridgeclient.NewClient(logger) go func() { - app.MonitorDaemon(app.BridgeClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.BridgeClient, MaximumDaemonUnhealthyDuration) if err := app.BridgeClient.Start( // The client will use `context.Background` so that it can have a different context from // the main application. @@ -1230,9 +1230,9 @@ func New( return app } -// MonitorDaemon registers a daemon service with the update monitor. If the daemon does not register, the method will -// panic. -func (app *App) MonitorDaemon( +// RegisterDaemonWithHealthMonitor registers a daemon service with the update monitor, which will commence monitoring +// the health of the daemon. If the daemon does not register, the method will panic. +func (app *App) RegisterDaemonWithHealthMonitor( healthCheckableDaemon daemontypes.HealthCheckable, maximumAcceptableUpdateDelay time.Duration, ) { diff --git a/protocol/app/app_test.go b/protocol/app/app_test.go index 6503d9c4190..ded9ee8b8d8 100644 --- a/protocol/app/app_test.go +++ b/protocol/app/app_test.go @@ -225,17 +225,17 @@ func TestModuleBasics(t *testing.T) { require.Equal(t, expectedFieldTypes, actualFieldTypes, "Module basics does not match expected") } -func TestMonitorDaemon_Panics(t *testing.T) { +func TestRegisterDaemonWithHealthMonitor_Panics(t *testing.T) { app := testapp.DefaultTestApp(nil) hc := &mocks.HealthCheckable{} hc.On("ServiceName").Return("test-service") hc.On("HealthCheck").Return(nil) - app.MonitorDaemon(hc, 5*time.Minute) + app.RegisterDaemonWithHealthMonitor(hc, 5*time.Minute) // The second registration should fail, causing a panic. require.PanicsWithError( t, "service test-service already registered", - func() { app.MonitorDaemon(hc, 5*time.Minute) }, + func() { app.RegisterDaemonWithHealthMonitor(hc, 5*time.Minute) }, ) } diff --git a/protocol/daemons/server/bridge.go b/protocol/daemons/server/bridge.go index ad76af6066e..f794e0bb86a 100644 --- a/protocol/daemons/server/bridge.go +++ b/protocol/daemons/server/bridge.go @@ -30,16 +30,12 @@ func (s *Server) AddBridgeEvents( response *api.AddBridgeEventsResponse, err error, ) { - // Capture valid responses in metrics. 
- defer func() { - if err == nil { - s.reportValidResponse(types.PricefeedDaemonServiceName) - } - }() - - s.reportValidResponse(types.BridgeDaemonServiceName) if err = s.bridgeEventManager.AddBridgeEvents(req.BridgeEvents); err != nil { return nil, err } + + // Capture valid responses in metrics. + s.reportValidResponse(types.BridgeDaemonServiceName) + return &api.AddBridgeEventsResponse{}, nil } diff --git a/protocol/daemons/server/liquidation.go b/protocol/daemons/server/liquidation.go index f418b6c251f..4cb44acfa1c 100644 --- a/protocol/daemons/server/liquidation.go +++ b/protocol/daemons/server/liquidation.go @@ -33,13 +33,6 @@ func (s *Server) LiquidateSubaccounts( response *api.LiquidateSubaccountsResponse, err error, ) { - // Capture valid responses in metrics. - defer func() { - if err == nil { - s.reportValidResponse(types.PricefeedDaemonServiceName) - } - }() - telemetry.ModuleSetGauge( metrics.LiquidationDaemon, float32(len(req.SubaccountIds)), @@ -49,5 +42,9 @@ func (s *Server) LiquidateSubaccounts( ) s.liquidatableSubaccountIds.UpdateSubaccountIds(req.SubaccountIds) + + // Capture valid responses in metrics. + s.reportValidResponse(types.LiquidationsDaemonServiceName) + return &api.LiquidateSubaccountsResponse{}, nil } diff --git a/protocol/daemons/server/pricefeed.go b/protocol/daemons/server/pricefeed.go index b4ef56df4cf..d95af3dfb1a 100644 --- a/protocol/daemons/server/pricefeed.go +++ b/protocol/daemons/server/pricefeed.go @@ -39,13 +39,6 @@ func (s *Server) UpdateMarketPrices( response *api.UpdateMarketPricesResponse, err error, ) { - // Capture valid responses in metrics. - defer func() { - if err == nil { - s.reportValidResponse(types.PricefeedDaemonServiceName) - } - }() - // Measure latency in ingesting and handling gRPC price update. defer telemetry.ModuleMeasureSince( metrics.PricefeedServer, @@ -54,6 +47,8 @@ func (s *Server) UpdateMarketPrices( metrics.Latency, ) + // This panic is an unexpected condition because we initialize the market price cache in app initialization before + // starting the server or daemons. if s.marketToExchange == nil { panic( errorsmod.Wrapf( @@ -71,6 +66,9 @@ func (s *Server) UpdateMarketPrices( s.marketToExchange.UpdatePrices(req.MarketPriceUpdates) + // Capture valid responses in metrics. + s.reportValidResponse(types.PricefeedDaemonServiceName) + return &api.UpdateMarketPricesResponse{}, nil } diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 2f42c95cdda..835b0a5ad05 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -17,27 +17,36 @@ type timestampWithError struct { err error } +// Update updates the timeStampWithError to reflect the current error. If the timestamp is zero, this is the first +// update, so set the timestamp. func (u *timestampWithError) Update(timestamp time.Time, err error) { // If the timestamp is zero, this is the first update, so set the timestamp. if u.timestamp.IsZero() { u.timestamp = timestamp } + u.err = err } +// Reset resets the timestampWithError to its zero value, indicating that the service is healthy. func (u *timestampWithError) Reset() { u.timestamp = time.Time{} u.err = nil } +// IsZero returns true if the timestampWithError is zero, indicating that the service is healthy. 
func (u *timestampWithError) IsZero() bool { return u.timestamp.IsZero() && u.err == nil } +// Timestamp returns the timestamp associated with the timestampWithError, which is the timestamp of the first error +// in the current error streak. func (u *timestampWithError) Timestamp() time.Time { return u.timestamp } +// Error returns the error associated with the timestampWithError, which is the most recent error in the current error +// streak. func (u *timestampWithError) Error() error { return u.err } @@ -48,9 +57,9 @@ type healthCheckerMutableState struct { // lock is used to synchronize access to mutable state fields. lock sync.Mutex - // mostRecentSuccess is the timestamp of the most recent successful health check. - // Access to mostRecentSuccess is synchronized. - mostRecentSuccess time.Time + // lastSuccessTimestamp is the timestamp of the most recent successful health check. + // Access to lastSuccessTimestamp is synchronized. + lastSuccessTimestamp time.Time // mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well // as the most recent error. It is updated on every error and reset every time the service sees a healthy response. @@ -58,6 +67,21 @@ type healthCheckerMutableState struct { // the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback. // Access to mostRecentFailureStreakError is synchronized. mostRecentFailureStreakError timestampWithError + + // timer triggers a health check poll for a health-checkable service. + timer *time.Timer + + // stopped indicates whether the health checker has been stopped. Additional health checks cannot be scheduled + // after the health checker has been stopped. + stopped bool +} + +// newHealthCheckerMutableState creates a new health checker mutable state scheduled to trigger a poll after the +// initial poll delay. +func newHealthCheckerMutableState(initialPollDelay time.Duration, pollFunc func()) *healthCheckerMutableState { + return &healthCheckerMutableState{ + timer: time.AfterFunc(initialPollDelay, pollFunc), + } } // ReportSuccess updates the health checker's mutable state to reflect a successful health check and schedules the next @@ -66,7 +90,7 @@ func (u *healthCheckerMutableState) ReportSuccess(now time.Time) { u.lock.Lock() defer u.lock.Unlock() - u.mostRecentSuccess = now + u.lastSuccessTimestamp = now // Whenever the service is healthy, reset the first failure in streak timestamp. u.mostRecentFailureStreakError.Reset() @@ -83,6 +107,35 @@ func (u *healthCheckerMutableState) ReportFailure(now time.Time, err error) time return now.Sub(u.mostRecentFailureStreakError.Timestamp()) } +// SchedulePoll schedules the next poll for the health-checkable service. If the service is stopped, the next poll +// will not be scheduled. This method is synchronized. +func (u *healthCheckerMutableState) SchedulePoll(nextPollDelay time.Duration) { + u.lock.Lock() + defer u.lock.Unlock() + + // Don't schedule a poll if the health checker has been stopped. + if u.stopped { + return + } + + // Schedule the next poll. + u.timer.Reset(nextPollDelay) +} + +// Stop stops the health checker. This method is synchronized. +func (u *healthCheckerMutableState) Stop() { + u.lock.Lock() + defer u.lock.Unlock() + + // Don't stop the health checker if it has already been stopped. + if u.stopped { + return + } + + u.timer.Stop() + u.stopped = true +} + // healthChecker encapsulates the logic for monitoring the health of a health-checkable service. 
type healthChecker struct { // mutableState is the mutable state of the health checker. Access to these fields is synchronized. @@ -94,9 +147,6 @@ type healthChecker struct { // pollFrequency is the frequency at which the health-checkable service is polled. pollFrequency time.Duration - // timer triggers a health check poll for a health-checkable service. - timer *time.Timer - // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health-checkable service to // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the // specified callback function. @@ -134,12 +184,12 @@ func (hc *healthChecker) Poll() { } // Schedule next poll. - hc.timer.Reset(hc.pollFrequency) + hc.mutableState.SchedulePoll(hc.pollFrequency) } // Stop stops the health checker. This method is not synchronized, as the timer does not need synchronization. func (hc *healthChecker) Stop() { - hc.timer.Stop() + hc.mutableState.Stop() } // StartNewHealthChecker creates and starts a new health checker for a health-checkable service. @@ -159,11 +209,10 @@ func StartNewHealthChecker( timeProvider: timeProvider, maxAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, logger: logger, - mutableState: &healthCheckerMutableState{}, } // The first poll is scheduled after the startup grace period to allow the service to initialize. - checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll) + checker.mutableState = newHealthCheckerMutableState(startupGracePeriod, checker.Poll) return checker } diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index 1cff5f5b0af..130398d18d7 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -13,6 +13,9 @@ import ( const ( // HealthCheckPollFrequency is the frequency at which the health-checkable service is polled. HealthCheckPollFrequency = 5 * time.Second + + // HealthMonitorLogModuleName is the module name used for logging within the health monitor. + HealthMonitorLogModuleName = "daemon-health-monitor" ) // healthMonitorMutableState tracks all mutable state associated with the health monitor. This state is gathered into @@ -132,7 +135,7 @@ func NewHealthMonitor( ) *HealthMonitor { return &HealthMonitor{ mutableState: newHealthMonitorMutableState(), - logger: logger.With(cosmoslog.ModuleKey, "health-monitor"), + logger: logger.With(cosmoslog.ModuleKey, HealthMonitorLogModuleName), startupGracePeriod: startupGracePeriod, pollingFrequency: pollingFrequency, } From b5e62a3d2c10ac4b117a39ceaf8fe0a97e45c7b3 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 10:45:16 -0800 Subject: [PATCH 09/15] Fix unit test failure. --- protocol/daemons/server/types/health_monitor_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index 82a0893e6ef..af991fd21ce 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -34,7 +34,7 @@ var ( // createTestMonitor creates a health monitor with a poll frequency of 10ms and a zero duration grace period. 
func createTestMonitor() (*types.HealthMonitor, *mocks.Logger) { logger := &mocks.Logger{} - logger.On("With", "module", "health-monitor").Return(logger).Once() + logger.On("With", "module", "daemon-health-monitor").Return(logger).Once() return types.NewHealthMonitor( ZeroDuration, 10*time.Millisecond, From 80c481b1c961d16b87f0e591e856a8c3be89de8f Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 11:57:34 -0800 Subject: [PATCH 10/15] Update protocol/daemons/server/types/health_checker.go Co-authored-by: ttl33 <19664986+ttl33@users.noreply.github.com> --- protocol/daemons/server/types/health_checker.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 835b0a5ad05..9f2fb2113f5 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -170,11 +170,9 @@ func (hc *healthChecker) Poll() { err := hc.healthCheckable.HealthCheck() now := hc.timeProvider.Now() - // Capture healthy response. - - if err == nil { + if err == nil { // Capture healthy response. hc.mutableState.ReportSuccess(now) - } else { + } else { // Capture unhealthy response. streakDuration := hc.mutableState.ReportFailure(now, err) // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the // callback function. From 21585a07aa551f2a5ea8cc702a796873fa7889f1 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 13:24:03 -0800 Subject: [PATCH 11/15] PR feedback. --- .../daemons/server/types/health_checker.go | 87 ++++++++++--------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 9f2fb2113f5..6c4d2727ab4 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -8,47 +8,44 @@ import ( "time" ) -// timestampWithError couples a timestamp and error to make it easier to update them in tandem. The -// timestampWithError will track the timestamp of the first error in a streak of errors, but keeps a record of the -// most recent error. This is useful for determining how long a service has been unhealthy, as well as the current -// state of the service. -type timestampWithError struct { - timestamp time.Time - err error -} - -// Update updates the timeStampWithError to reflect the current error. If the timestamp is zero, this is the first -// update, so set the timestamp. -func (u *timestampWithError) Update(timestamp time.Time, err error) { - // If the timestamp is zero, this is the first update, so set the timestamp. - if u.timestamp.IsZero() { - u.timestamp = timestamp +// errorStreak tracks two relevant statistics for an error streak returned by a HealthCheckable - the timestamp of the +// beginning of the error streak, and the most recent error. This is useful for determining how long a service has been +// unhealthy, as well as the current state of the service. +type errorStreak struct { + startOfStreak time.Time + mostRecentError error +} + +// UpdateLastError updates the errorStreak to reflect the current error. If the startOfStreak timestamp is zero, this +// error the first error in a new error streak, so the startOfStreak timestamp is set to the current timestamp. 
+func (u *errorStreak) UpdateLastError(timestamp time.Time, err error) { + // If the startOfStreak is zero, this is the first update, so set the startOfStreak. + if u.startOfStreak.IsZero() { + u.startOfStreak = timestamp } - u.err = err + u.mostRecentError = err } -// Reset resets the timestampWithError to its zero value, indicating that the service is healthy. -func (u *timestampWithError) Reset() { - u.timestamp = time.Time{} - u.err = nil +// Reset resets the errorStreak to its zero value, indicating that the service has no active error streak. +func (u *errorStreak) Reset() { + u.startOfStreak = time.Time{} + u.mostRecentError = nil } -// IsZero returns true if the timestampWithError is zero, indicating that the service is healthy. -func (u *timestampWithError) IsZero() bool { - return u.timestamp.IsZero() && u.err == nil +// IsUnset returns true if the errorStreak is unset, indicating that the service has no active error streak. +func (u *errorStreak) IsUnset() bool { + return u.startOfStreak.IsZero() && u.mostRecentError == nil } -// Timestamp returns the timestamp associated with the timestampWithError, which is the timestamp of the first error -// in the current error streak. -func (u *timestampWithError) Timestamp() time.Time { - return u.timestamp +// StartOfStreak returns the timestamp of th start of the most recent error streak. +func (u *errorStreak) StartOfStreak() time.Time { + return u.startOfStreak } -// Error returns the error associated with the timestampWithError, which is the most recent error in the current error -// streak. -func (u *timestampWithError) Error() error { - return u.err +// MostRecentError returns the most recent error associated with the current error streak. +func (u *errorStreak) MostRecentError() error { + return u.mostRecentError } // healthCheckerMutableState tracks the current health state of the HealthCheckable, encapsulating all mutable state @@ -57,16 +54,17 @@ type healthCheckerMutableState struct { // lock is used to synchronize access to mutable state fields. lock sync.Mutex - // lastSuccessTimestamp is the timestamp of the most recent successful health check. + // lastSuccessTimestamp is the startOfStreak of the most recent successful health check. // Access to lastSuccessTimestamp is synchronized. lastSuccessTimestamp time.Time - // mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well - // as the most recent error. It is updated on every error and reset every time the service sees a healthy response. - // This field is used to determine how long the daemon has been unhealthy. If this timestamp is nil, then either - // the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback. - // Access to mostRecentFailureStreakError is synchronized. - mostRecentFailureStreakError timestampWithError + // mostRecentErrorStreak tracks the beginning of the most recent streak, as well as the current error in the streak. + // It is updated on every error and reset every time the service sees a healthy response. + // This field is used to determine how long the daemon has been unhealthy. If the mostRecentErrorStreak is unset, + // then either the service has never been unhealthy, or the most recent error streak ended before it could trigger + // a callback. + // Access to mostRecentErrorStreak is synchronized. + mostRecentErrorStreak errorStreak // timer triggers a health check poll for a health-checkable service. 
timer *time.Timer @@ -92,8 +90,8 @@ func (u *healthCheckerMutableState) ReportSuccess(now time.Time) { u.lastSuccessTimestamp = now - // Whenever the service is healthy, reset the first failure in streak timestamp. - u.mostRecentFailureStreakError.Reset() + // Whenever the service is healthy, reset the first failure in streak startOfStreak. + u.mostRecentErrorStreak.Reset() } // ReportFailure updates the health checker's mutable state to reflect a failed health check and schedules the next @@ -102,9 +100,9 @@ func (u *healthCheckerMutableState) ReportFailure(now time.Time, err error) time u.lock.Lock() defer u.lock.Unlock() - u.mostRecentFailureStreakError.Update(now, err) + u.mostRecentErrorStreak.UpdateLastError(now, err) - return now.Sub(u.mostRecentFailureStreakError.Timestamp()) + return now.Sub(u.mostRecentErrorStreak.StartOfStreak()) } // SchedulePoll schedules the next poll for the health-checkable service. If the service is stopped, the next poll @@ -181,7 +179,10 @@ func (hc *healthChecker) Poll() { } } - // Schedule next poll. + // Schedule next poll. We schedule another poll whether the callback was invoked or not, as callbacks are not + // guaranteed to panic or otherwise halt the daemon. In such cases, we may end up invoking the callback several + // times once the service exceeds the maximum unhealthy duration. For example, a callback that emits error logs + // will continue to emit error logs every 5s until the service becomes healthy again. hc.mutableState.SchedulePoll(hc.pollFrequency) } From 433638885143639a4cd70e04d2d344fb6a370925 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 15:28:26 -0800 Subject: [PATCH 12/15] [CORE-668,CORE-669] - Add health monitor flags (#802) --- protocol/app/app.go | 22 ++++------ protocol/app/flags/flags.go | 1 - protocol/daemons/flags/flags.go | 36 ++++++++++++++-- protocol/daemons/flags/flags_test.go | 12 +++++- .../client/client_integration_test.go | 7 +++- .../daemons/server/types/health_checker.go | 21 +++++----- .../daemons/server/types/health_monitor.go | 37 ++++++++++++----- .../server/types/health_monitor_test.go | 41 ++++++++++++++++++- protocol/docker-compose.yml | 22 ++++++---- .../clob/client/cli/cancel_order_cli_test.go | 8 ++++ .../clob/client/cli/liquidations_cli_test.go | 8 ++++ .../x/clob/client/cli/place_order_cli_test.go | 8 ++++ 12 files changed, 173 insertions(+), 50 deletions(-) diff --git a/protocol/app/app.go b/protocol/app/app.go index a13d66b9dc5..6a6b04fb61b 100644 --- a/protocol/app/app.go +++ b/protocol/app/app.go @@ -183,10 +183,6 @@ import ( var ( // DefaultNodeHome default home directories for the application daemon DefaultNodeHome string - - // MaximumDaemonUnhealthyDuration is the maximum amount of time that a daemon can be unhealthy before the - // application panics. - MaximumDaemonUnhealthyDuration = 5 * time.Minute ) var ( @@ -599,6 +595,7 @@ func New( daemonservertypes.DaemonStartupGracePeriod, daemonservertypes.HealthCheckPollFrequency, app.Logger(), + daemonFlags.Shared.PanicOnDaemonFailureEnabled, ) // Create a closure for starting daemons and daemon server. Daemon services are delayed until after the gRPC // service is started because daemons depend on the gRPC service being available. If a node is initialized @@ -606,6 +603,7 @@ func New( // daemons will not be able to connect to the cosmos gRPC query service and finish initialization, and the daemon // monitoring service will panic. 
app.startDaemons = func() { + maxDaemonUnhealthyDuration := time.Duration(daemonFlags.Shared.MaxDaemonUnhealthySeconds) * time.Second // Start server for handling gRPC messages from daemons. go app.Server.Start() @@ -613,7 +611,7 @@ func New( if daemonFlags.Liquidation.Enabled { app.LiquidationsClient = liquidationclient.NewClient(logger) go func() { - app.RegisterDaemonWithHealthMonitor(app.LiquidationsClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.LiquidationsClient, maxDaemonUnhealthyDuration) if err := app.LiquidationsClient.Start( // The client will use `context.Background` so that it can have a different context from // the main application. @@ -645,17 +643,15 @@ func New( constants.StaticExchangeDetails, &pricefeedclient.SubTaskRunnerImpl{}, ) - app.RegisterDaemonWithHealthMonitor(app.PriceFeedClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.PriceFeedClient, maxDaemonUnhealthyDuration) } // Start Bridge Daemon. // Non-validating full-nodes have no need to run the bridge daemon. if !appFlags.NonValidatingFullNode && daemonFlags.Bridge.Enabled { - // TODO(CORE-582): Re-enable bridge daemon registration once the bridge daemon is fixed in local / CI - // environments. app.BridgeClient = bridgeclient.NewClient(logger) go func() { - app.RegisterDaemonWithHealthMonitor(app.BridgeClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.BridgeClient, maxDaemonUnhealthyDuration) if err := app.BridgeClient.Start( // The client will use `context.Background` so that it can have a different context from // the main application. @@ -1234,17 +1230,17 @@ func New( // the health of the daemon. If the daemon does not register, the method will panic. func (app *App) RegisterDaemonWithHealthMonitor( healthCheckableDaemon daemontypes.HealthCheckable, - maximumAcceptableUpdateDelay time.Duration, + maxDaemonUnhealthyDuration time.Duration, ) { - if err := app.DaemonHealthMonitor.RegisterService(healthCheckableDaemon, maximumAcceptableUpdateDelay); err != nil { + if err := app.DaemonHealthMonitor.RegisterService(healthCheckableDaemon, maxDaemonUnhealthyDuration); err != nil { app.Logger().Error( "Failed to register daemon service with update monitor", "error", err, "service", healthCheckableDaemon.ServiceName(), - "maximumAcceptableUpdateDelay", - maximumAcceptableUpdateDelay, + "maxDaemonUnhealthyDuration", + maxDaemonUnhealthyDuration, ) panic(err) } diff --git a/protocol/app/flags/flags.go b/protocol/app/flags/flags.go index 21333d95f15..06039978017 100644 --- a/protocol/app/flags/flags.go +++ b/protocol/app/flags/flags.go @@ -2,7 +2,6 @@ package flags import ( "fmt" - "github.com/cosmos/cosmos-sdk/server/config" servertypes "github.com/cosmos/cosmos-sdk/server/types" "github.com/spf13/cast" diff --git a/protocol/daemons/flags/flags.go b/protocol/daemons/flags/flags.go index eebdf7e6319..e386f379aba 100644 --- a/protocol/daemons/flags/flags.go +++ b/protocol/daemons/flags/flags.go @@ -9,7 +9,9 @@ import ( // List of CLI flags for Server and Client. 
const ( // Flag names - FlagUnixSocketAddress = "unix-socket-address" + FlagUnixSocketAddress = "unix-socket-address" + FlagPanicOnDaemonFailureEnabled = "panic-on-daemon-failure-enabled" + FlagMaxDaemonUnhealthySeconds = "max-daemon-unhealthy-seconds" FlagPriceDaemonEnabled = "price-daemon-enabled" FlagPriceDaemonLoopDelayMs = "price-daemon-loop-delay-ms" @@ -28,6 +30,10 @@ const ( type SharedFlags struct { // SocketAddress is the location of the unix socket to communicate with the daemon gRPC service. SocketAddress string + // PanicOnDaemonFailureEnabled toggles whether the daemon should panic on failure. + PanicOnDaemonFailureEnabled bool + // MaxDaemonUnhealthySeconds is the maximum allowable duration for which a daemon can be unhealthy. + MaxDaemonUnhealthySeconds uint32 } // BridgeFlags contains configuration flags for the Bridge Daemon. @@ -74,7 +80,9 @@ func GetDefaultDaemonFlags() DaemonFlags { if defaultDaemonFlags == nil { defaultDaemonFlags = &DaemonFlags{ Shared: SharedFlags{ - SocketAddress: "/tmp/daemons.sock", + SocketAddress: "/tmp/daemons.sock", + PanicOnDaemonFailureEnabled: true, + MaxDaemonUnhealthySeconds: 5 * 60, // 5 minutes. }, Bridge: BridgeFlags{ Enabled: true, @@ -109,8 +117,18 @@ func AddDaemonFlagsToCmd( cmd.Flags().String( FlagUnixSocketAddress, df.Shared.SocketAddress, - "Socket address for the price daemon to send updates to, if not set "+ - "will establish default location to ingest price updates from", + "Socket address for the daemons to send updates to, if not set "+ + "will establish default location to ingest daemon updates from", + ) + cmd.Flags().Bool( + FlagPanicOnDaemonFailureEnabled, + df.Shared.PanicOnDaemonFailureEnabled, + "Enables panicking when a daemon fails.", + ) + cmd.Flags().Uint32( + FlagMaxDaemonUnhealthySeconds, + df.Shared.MaxDaemonUnhealthySeconds, + "Maximum allowable duration for which a daemon can be unhealthy.", ) // Bridge Daemon. @@ -178,6 +196,16 @@ func GetDaemonFlagValuesFromOptions( result.Shared.SocketAddress = v } } + if option := appOpts.Get(FlagPanicOnDaemonFailureEnabled); option != nil { + if v, err := cast.ToBoolE(option); err == nil { + result.Shared.PanicOnDaemonFailureEnabled = v + } + } + if option := appOpts.Get(FlagMaxDaemonUnhealthySeconds); option != nil { + if v, err := cast.ToUint32E(option); err == nil { + result.Shared.MaxDaemonUnhealthySeconds = v + } + } // Bridge Daemon. if option := appOpts.Get(FlagBridgeDaemonEnabled); option != nil { diff --git a/protocol/daemons/flags/flags_test.go b/protocol/daemons/flags/flags_test.go index 5c79395a39a..04191032f60 100644 --- a/protocol/daemons/flags/flags_test.go +++ b/protocol/daemons/flags/flags_test.go @@ -17,6 +17,8 @@ func TestAddDaemonFlagsToCmd(t *testing.T) { flags.AddDaemonFlagsToCmd(&cmd) tests := []string{ flags.FlagUnixSocketAddress, + flags.FlagPanicOnDaemonFailureEnabled, + flags.FlagMaxDaemonUnhealthySeconds, flags.FlagBridgeDaemonEnabled, flags.FlagBridgeDaemonLoopDelayMs, @@ -41,6 +43,8 @@ func TestGetDaemonFlagValuesFromOptions_Custom(t *testing.T) { optsMap := make(map[string]interface{}) optsMap[flags.FlagUnixSocketAddress] = "test-socket-address" + optsMap[flags.FlagPanicOnDaemonFailureEnabled] = false + optsMap[flags.FlagMaxDaemonUnhealthySeconds] = uint32(1234) optsMap[flags.FlagBridgeDaemonEnabled] = true optsMap[flags.FlagBridgeDaemonLoopDelayMs] = uint32(1111) @@ -64,6 +68,12 @@ func TestGetDaemonFlagValuesFromOptions_Custom(t *testing.T) { // Shared. 
require.Equal(t, optsMap[flags.FlagUnixSocketAddress], r.Shared.SocketAddress) + require.Equal(t, optsMap[flags.FlagPanicOnDaemonFailureEnabled], r.Shared.PanicOnDaemonFailureEnabled) + require.Equal( + t, + optsMap[flags.FlagMaxDaemonUnhealthySeconds], + r.Shared.MaxDaemonUnhealthySeconds, + ) // Bridge Daemon. require.Equal(t, optsMap[flags.FlagBridgeDaemonEnabled], r.Bridge.Enabled) @@ -81,7 +91,7 @@ func TestGetDaemonFlagValuesFromOptions_Custom(t *testing.T) { require.Equal(t, optsMap[flags.FlagPriceDaemonLoopDelayMs], r.Price.LoopDelayMs) } -func TestGetDaemonFlagValuesFromOptions_Defaul(t *testing.T) { +func TestGetDaemonFlagValuesFromOptions_Default(t *testing.T) { mockOpts := mocks.AppOptions{} mockOpts.On("Get", mock.Anything). Return(func(key string) interface{} { diff --git a/protocol/daemons/pricefeed/client/client_integration_test.go b/protocol/daemons/pricefeed/client/client_integration_test.go index 70e2dfd75ca..45d32a28a45 100644 --- a/protocol/daemons/pricefeed/client/client_integration_test.go +++ b/protocol/daemons/pricefeed/client/client_integration_test.go @@ -5,7 +5,6 @@ package client_test import ( "fmt" "github.com/cometbft/cometbft/libs/log" - "github.com/dydxprotocol/v4-chain/protocol/app" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" "github.com/dydxprotocol/v4-chain/protocol/daemons/flags" "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/client" @@ -285,6 +284,7 @@ func (s *PriceDaemonIntegrationTestSuite) SetupTest() { servertypes.DaemonStartupGracePeriod, servertypes.HealthCheckPollFrequency, log.TestingLogger(), + flags.GetDefaultDaemonFlags().Shared.PanicOnDaemonFailureEnabled, // Use default behavior for testing ) s.exchangePriceCache = pricefeedserver_types.NewMarketToExchangePrices(pricefeed_types.MaxPriceAge) @@ -337,7 +337,10 @@ func (s *PriceDaemonIntegrationTestSuite) startClient() { testExchangeToQueryDetails, &client.SubTaskRunnerImpl{}, ) - err := s.healthMonitor.RegisterService(s.pricefeedDaemon, app.MaximumDaemonUnhealthyDuration) + err := s.healthMonitor.RegisterService( + s.pricefeedDaemon, + time.Duration(s.daemonFlags.Shared.MaxDaemonUnhealthySeconds)*time.Second, + ) s.Require().NoError(err) } diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 6c4d2727ab4..228a6dfd8d6 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -145,10 +145,10 @@ type healthChecker struct { // pollFrequency is the frequency at which the health-checkable service is polled. pollFrequency time.Duration - // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health-checkable service to + // maxUnhealthyDuration is the maximum acceptable duration for a health-checkable service to // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the // specified callback function. - maxAcceptableUnhealthyDuration time.Duration + maxUnhealthyDuration time.Duration // unhealthyCallback is the callback function to be executed if the health-checkable service remains // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. @@ -174,7 +174,7 @@ func (hc *healthChecker) Poll() { streakDuration := hc.mutableState.ReportFailure(now, err) // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the // callback function. 
- if streakDuration >= hc.maxAcceptableUnhealthyDuration { + if streakDuration >= hc.maxUnhealthyDuration { hc.unhealthyCallback(err) } } @@ -197,17 +197,18 @@ func StartNewHealthChecker( pollFrequency time.Duration, unhealthyCallback func(error), timeProvider libtime.TimeProvider, - maximumAcceptableUnhealthyDuration time.Duration, + maxUnhealthyDuration time.Duration, startupGracePeriod time.Duration, logger log.Logger, ) *healthChecker { checker := &healthChecker{ - healthCheckable: healthCheckable, - pollFrequency: pollFrequency, - unhealthyCallback: unhealthyCallback, - timeProvider: timeProvider, - maxAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, - logger: logger, + healthCheckable: healthCheckable, + pollFrequency: pollFrequency, + unhealthyCallback: unhealthyCallback, + timeProvider: timeProvider, + maxUnhealthyDuration: maxUnhealthyDuration, + logger: logger, + mutableState: &healthCheckerMutableState{}, } // The first poll is scheduled after the startup grace period to allow the service to initialize. diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index 130398d18d7..1ac31095892 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -122,9 +122,14 @@ type HealthMonitor struct { mutableState *healthMonitorMutableState // These fields are initialized in NewHealthMonitor and are not modified after initialization. - logger log.Logger + logger log.Logger + // startupGracePeriod is the grace period before the monitor starts polling the health-checkable services. startupGracePeriod time.Duration - pollingFrequency time.Duration + // pollingFrequency is the frequency at which the health-checkable services are polled. + pollingFrequency time.Duration + // enablePanics is used to toggle between panics or error logs when a daemon sustains an unhealthy state past the + // maximum allowable duration. + enablePanics bool } // NewHealthMonitor creates a new health monitor. @@ -132,12 +137,14 @@ func NewHealthMonitor( startupGracePeriod time.Duration, pollingFrequency time.Duration, logger log.Logger, + enablePanics bool, ) *HealthMonitor { return &HealthMonitor{ mutableState: newHealthMonitorMutableState(), logger: logger.With(cosmoslog.ModuleKey, HealthMonitorLogModuleName), startupGracePeriod: startupGracePeriod, pollingFrequency: pollingFrequency, + enablePanics: enablePanics, } } @@ -153,15 +160,15 @@ func (hm *HealthMonitor) DisableForTesting() { // health-checkable service before returning. 
func (hm *HealthMonitor) RegisterServiceWithCallback( hc types.HealthCheckable, - maximumAcceptableUnhealthyDuration time.Duration, + maxUnhealthyDuration time.Duration, callback func(error), ) error { - if maximumAcceptableUnhealthyDuration <= 0 { + if maxUnhealthyDuration <= 0 { return fmt.Errorf( "health check registration failure for service %v: "+ - "maximum acceptable unhealthy duration %v must be positive", + "maximum unhealthy duration %v must be positive", hc.ServiceName(), - maximumAcceptableUnhealthyDuration, + maxUnhealthyDuration, ) } @@ -171,7 +178,7 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( hm.pollingFrequency, callback, &libtime.TimeProviderImpl{}, - maximumAcceptableUnhealthyDuration, + maxUnhealthyDuration, hm.startupGracePeriod, hm.logger, ) @@ -202,18 +209,26 @@ func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) f // RegisterService registers a new health-checkable service with the health check monitor. If the service // is unhealthy every time it is polled for a duration greater than or equal to the maximum acceptable unhealthy -// duration, the monitor will panic. +// duration, the monitor will panic or log an error, depending on the app configuration via the +// `panic-on-daemon-failure-enabled` flag. // This method is synchronized. It returns an error if the service was already registered or the monitor has // already been stopped. If the monitor has been stopped, this method will proactively stop the health-checkable // service before returning. func (hm *HealthMonitor) RegisterService( hc types.HealthCheckable, - maximumAcceptableUnhealthyDuration time.Duration, + maxDaemonUnhealthyDuration time.Duration, ) error { + // If the monitor is configured to panic, use the panic callback. Otherwise, use the error log callback. + // This behavior is configured via flag and defaults to panicking on daemon failure. + callback := LogErrorServiceNotResponding(hc, hm.logger) + if hm.enablePanics { + callback = PanicServiceNotResponding(hc) + } + return hm.RegisterServiceWithCallback( hc, - maximumAcceptableUnhealthyDuration, - PanicServiceNotResponding(hc), + maxDaemonUnhealthyDuration, + callback, ) } diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index af991fd21ce..4a8e701901a 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -39,6 +39,7 @@ func createTestMonitor() (*types.HealthMonitor, *mocks.Logger) { ZeroDuration, 10*time.Millisecond, logger, + true, // enable panics here for stricter testing - a panic will definitely cause a test failure. ), logger } @@ -126,6 +127,44 @@ func TestRegisterServiceWithCallback_Mixed(t *testing.T) { } } +func TestHealthMonitor_DisablePanics_DoesNotPanic(t *testing.T) { + logger := &mocks.Logger{} + logger.On("With", "module", "daemon-health-monitor").Return(logger).Once() + logger.On( + "Error", + "health-checked service is unhealthy", + "service", + "test-service", + "error", + mock.Anything, + ).Return() + + hm := types.NewHealthMonitor( + ZeroDuration, + 10*time.Millisecond, + logger, + false, + ) + + hc := mockFailingHealthCheckerWithError("test-service", TestError1) + + err := hm.RegisterService(hc, 10*time.Millisecond) + require.NoError(t, err) + + defer func() { + hm.Stop() + }() + + // A 100ms sleep should be sufficient for the health monitor to detect the unhealthy service and trigger a callback. 
+ time.Sleep(100 * time.Millisecond) + + // Assert. + // This test is confirmed to panic when panics are not disabled - but because the panic occurs in a separate + // go-routine, it cannot be easily captured with an assert. Instead, we do not try to capture the panic, but + // assert that the logger was called with the expected arguments. + mock.AssertExpectationsForObjects(t, logger) +} + func TestRegisterServiceWithCallback_DoubleRegistrationFails(t *testing.T) { // Setup. ufm, logger := createTestMonitor() @@ -203,7 +242,7 @@ func TestRegisterValidResponseWithCallback_NegativeUnhealthyDuration(t *testing. ufm, _ := createTestMonitor() hc := mockFailingHealthCheckerWithError("test-service", TestError1) err := ufm.RegisterServiceWithCallback(hc, -50*time.Millisecond, func(error) {}) - require.ErrorContains(t, err, "maximum acceptable unhealthy duration -50ms must be positive") + require.ErrorContains(t, err, "maximum unhealthy duration -50ms must be positive") } func TestPanicServiceNotResponding(t *testing.T) { diff --git a/protocol/docker-compose.yml b/protocol/docker-compose.yml index 203f802f668..c3038011582 100644 --- a/protocol/docker-compose.yml +++ b/protocol/docker-compose.yml @@ -10,13 +10,15 @@ services: - --log_level # Note that only this validator has a log-level of `info`; other validators use `error` by default. # Change to `debug` for more verbose log-level. - - info + - info - --home - /dydxprotocol/chain/.alice - - --p2p.persistent_peers + - --p2p.persistent_peers - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" # Effectively disable the daemon monitor because bridge daemon is flaky in localnet. 
environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} @@ -28,7 +30,7 @@ services: - "26657:26657" - "9090:9090" - "1317:1317" - + dydxprotocold1: image: local:dydxprotocol entrypoint: @@ -39,10 +41,12 @@ services: - error - --home - /dydxprotocol/chain/.bob - - --p2p.persistent_peers + - --p2p.persistent_peers - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} @@ -52,7 +56,7 @@ services: - ./localnet/dydxprotocol1:/dydxprotocol/chain/.bob/data ports: - "26658:26657" - + dydxprotocold2: image: local:dydxprotocol entrypoint: @@ -67,6 +71,8 @@ services: - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} @@ -74,7 +80,7 @@ services: - DAEMON_HOME=/dydxprotocol/chain/.carl volumes: - ./localnet/dydxprotocol2:/dydxprotocol/chain/.carl/data - + dydxprotocold3: image: local:dydxprotocol entrypoint: @@ -85,10 +91,12 @@ services: - error - --home - /dydxprotocol/chain/.dave - - --p2p.persistent_peers + - --p2p.persistent_peers - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} diff --git a/protocol/x/clob/client/cli/cancel_order_cli_test.go b/protocol/x/clob/client/cli/cancel_order_cli_test.go index 9f148f36ba1..de7afeafe4c 100644 --- a/protocol/x/clob/client/cli/cancel_order_cli_test.go +++ b/protocol/x/clob/client/cli/cancel_order_cli_test.go @@ -8,6 +8,7 @@ import ( appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" daemonflags "github.com/dydxprotocol/v4-chain/protocol/daemons/flags" "github.com/dydxprotocol/v4-chain/protocol/testutil/appoptions" + "math" "math/big" "testing" @@ -73,6 +74,13 @@ func (s *CancelOrderIntegrationTestSuite) SetupTest() { appOptions.Set(daemonflags.FlagPriceDaemonEnabled, false) appOptions.Set(daemonflags.FlagBridgeDaemonEnabled, false) + // Effectively disable the health monitor panic timeout for these tests. This is necessary + // because all clob cli tests are running in the same process and the total time to run is >> 5 minutes + // on CI, causing the panic to trigger for liquidations daemon go routines that haven't been properly + // cleaned up after a test run. + // TODO(CORE-29): Remove this once the liquidations daemon is refactored to be stoppable. 
+ appOptions.Set(daemonflags.FlagMaxDaemonUnhealthySeconds, math.MaxUint32) + // Make sure the daemon is using the correct GRPC address. appOptions.Set(appflags.GrpcAddress, testval.AppConfig.GRPC.Address) }, diff --git a/protocol/x/clob/client/cli/liquidations_cli_test.go b/protocol/x/clob/client/cli/liquidations_cli_test.go index 01d02bf1a18..9c8223ae956 100644 --- a/protocol/x/clob/client/cli/liquidations_cli_test.go +++ b/protocol/x/clob/client/cli/liquidations_cli_test.go @@ -4,6 +4,7 @@ package cli_test import ( "fmt" + "math" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" "math/big" @@ -74,6 +75,13 @@ func TestLiquidationOrderIntegrationTestSuite(t *testing.T) { appOptions.Set(daemonflags.FlagPriceDaemonEnabled, false) appOptions.Set(daemonflags.FlagBridgeDaemonEnabled, false) + // Effectively disable the health monitor panic timeout for these tests. This is necessary + // because all clob cli tests are running in the same process and the total time to run is >> 5 minutes + // on CI, causing the panic to trigger for liquidations daemon go routines that haven't been properly + // cleaned up after a test run. + // TODO(CORE-29): Remove this once the liquidations daemon is refactored to be stoppable. + appOptions.Set(daemonflags.FlagMaxDaemonUnhealthySeconds, math.MaxUint32) + // Make sure the daemon is using the correct GRPC address. appOptions.Set(appflags.GrpcAddress, testval.AppConfig.GRPC.Address) diff --git a/protocol/x/clob/client/cli/place_order_cli_test.go b/protocol/x/clob/client/cli/place_order_cli_test.go index bf6832efe82..5bbd5511fcd 100644 --- a/protocol/x/clob/client/cli/place_order_cli_test.go +++ b/protocol/x/clob/client/cli/place_order_cli_test.go @@ -5,6 +5,7 @@ package cli_test import ( "fmt" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" + "math" "math/big" "testing" @@ -68,6 +69,13 @@ func TestPlaceOrderIntegrationTestSuite(t *testing.T) { appOptions.Set(daemonflags.FlagPriceDaemonEnabled, false) appOptions.Set(daemonflags.FlagBridgeDaemonEnabled, false) + // Effectively disable the health monitor panic timeout for these tests. This is necessary + // because all clob cli tests are running in the same process and the total time to run is >> 5 minutes + // on CI, causing the panic to trigger for liquidations daemon go routines that haven't been properly + // cleaned up after a test run. + // TODO(CORE-29): Remove this once the liquidations daemon is refactored to be stoppable. + appOptions.Set(daemonflags.FlagMaxDaemonUnhealthySeconds, math.MaxUint32) + // Make sure the daemon is using the correct GRPC address. appOptions.Set(appflags.GrpcAddress, testval.AppConfig.GRPC.Address) }, From abdeb97c3e8440302a9db49a9fa84426a08a7171 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 16:04:01 -0800 Subject: [PATCH 13/15] Fix merge mistake. --- protocol/daemons/server/types/health_checker.go | 1 - 1 file changed, 1 deletion(-) diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 228a6dfd8d6..c29bd4384d7 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -208,7 +208,6 @@ func StartNewHealthChecker( timeProvider: timeProvider, maxUnhealthyDuration: maxUnhealthyDuration, logger: logger, - mutableState: &healthCheckerMutableState{}, } // The first poll is scheduled after the startup grace period to allow the service to initialize. 
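The "4294967295" literal wired through docker-compose.yml and the math.MaxUint32 value set via daemonflags.FlagMaxDaemonUnhealthySeconds in the CLOB CLI tests above are the same number: the flag counts seconds, so the maximum uint32 value pushes the unhealthy-duration threshold far beyond any realistic run. A quick magnitude check, as a sketch that assumes the flag value is converted to a time.Duration in seconds (the conversion itself is not shown in this patch):

    package main

    import (
        "fmt"
        "math"
        "time"
    )

    func main() {
        // math.MaxUint32 == 4294967295, the literal passed via --max-daemon-unhealthy-seconds.
        maxUnhealthy := time.Duration(math.MaxUint32) * time.Second
        // Prints roughly 136 years, so the health monitor's panic threshold is never reached.
        fmt.Printf("%.0f years\n", maxUnhealthy.Hours()/(24*365))
    }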
From 9192baeccea1951cc02b315d1566848fd90bfea0 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 17:00:31 -0800 Subject: [PATCH 14/15] Fix nil pointer segfault. --- protocol/daemons/server/types/health_checker.go | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index c29bd4384d7..16e6f0b900b 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -74,14 +74,6 @@ type healthCheckerMutableState struct { stopped bool } -// newHealthCheckerMutableState creates a new health checker mutable state scheduled to trigger a poll after the -// initial poll delay. -func newHealthCheckerMutableState(initialPollDelay time.Duration, pollFunc func()) *healthCheckerMutableState { - return &healthCheckerMutableState{ - timer: time.AfterFunc(initialPollDelay, pollFunc), - } -} - // ReportSuccess updates the health checker's mutable state to reflect a successful health check and schedules the next // poll as an atomic operation. func (u *healthCheckerMutableState) ReportSuccess(now time.Time) { @@ -208,10 +200,11 @@ func StartNewHealthChecker( timeProvider: timeProvider, maxUnhealthyDuration: maxUnhealthyDuration, logger: logger, + mutableState: &healthCheckerMutableState{}, } // The first poll is scheduled after the startup grace period to allow the service to initialize. - checker.mutableState = newHealthCheckerMutableState(startupGracePeriod, checker.Poll) + checker.mutableState.timer = time.AfterFunc(startupGracePeriod, checker.Poll) return checker } From 72d09dccbaf37d8bb20cf1df325751b2604c7341 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 17:19:51 -0800 Subject: [PATCH 15/15] Take 2. --- protocol/daemons/server/types/health_monitor_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index 4a8e701901a..8ba07f748ba 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -28,7 +28,7 @@ var ( // TestLargeDuration is used to ensure that the health checker does not trigger a callback through the timer. TestLargeDuration = 5 * time.Minute - ZeroDuration = 0 * time.Second + SmallDuration = 10 * time.Millisecond ) // createTestMonitor creates a health monitor with a poll frequency of 10ms and a zero duration grace period. @@ -36,7 +36,7 @@ func createTestMonitor() (*types.HealthMonitor, *mocks.Logger) { logger := &mocks.Logger{} logger.On("With", "module", "daemon-health-monitor").Return(logger).Once() return types.NewHealthMonitor( - ZeroDuration, + SmallDuration, 10*time.Millisecond, logger, true, // enable panics here for stricter testing - a panic will definitely cause a test failure. @@ -140,7 +140,7 @@ func TestHealthMonitor_DisablePanics_DoesNotPanic(t *testing.T) { ).Return() hm := types.NewHealthMonitor( - ZeroDuration, + SmallDuration, 10*time.Millisecond, logger, false,
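The two commits above respond to a nil-pointer race at startup: once the struct-literal initialization of mutableState was removed, StartNewHealthChecker armed the first-poll timer inside newHealthCheckerMutableState before the assignment back to checker.mutableState completed, so a near-zero startup grace period let Poll run against a nil mutableState. The fix initializes the state in the struct literal and only then arms the timer with time.AfterFunc, and the follow-up test change swaps the zero grace period for a small 10ms one. A minimal self-contained sketch of the two orderings, using placeholder names rather than the protocol's actual types:

    package main

    import "time"

    type state struct{ timer *time.Timer }

    type checker struct{ mutableState *state }

    // Poll dereferences mutableState; it panics if the constructor has not assigned it yet.
    func (c *checker) Poll() {
        c.mutableState.timer.Reset(10 * time.Millisecond)
    }

    // Racy ordering (before the fix): a helper arms the timer, so the callback can fire
    // before the caller's assignment completes:
    //
    //     c := &checker{}
    //     c.mutableState = newStateWithTimer(gracePeriod, c.Poll) // timer armed inside the helper
    //
    // Safe ordering (after the fix): the state exists before the timer is armed.
    func newChecker(gracePeriod time.Duration) *checker {
        c := &checker{mutableState: &state{}}
        c.mutableState.timer = time.AfterFunc(gracePeriod, c.Poll)
        return c
    }

    func main() {
        _ = newChecker(10 * time.Millisecond)
        time.Sleep(50 * time.Millisecond) // let the first poll run once
    }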