Skip to content

Add health utility #37

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions health/counter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package health

// CounterConfig holds the failure-count settings embedded in Counter.
type CounterConfig struct {
	// Threshold is the number of continuous failures that are tolerated;
	// unhealthy is reported once the failure count goes beyond it.
	Threshold uint64 `default:"60"`

	// Remind is the number of further failures between reminders while the
	// counter stays unrecovered.
	Remind uint64 `default:"60"`
}

// Counter is an error-tolerant health counter based on the number of
// continuous failures: a short run of failures is tolerated, and once the
// counter is unhealthy it is reminded periodically until a success resets it.
type Counter struct {
	// CounterConfig provides Threshold and Remind.
	CounterConfig

	// failures is the number of continuous failures since the last success.
	failures uint64
}

// NewCounter returns a Counter that uses the given config and has no failure
// recorded yet.
func NewCounter(config CounterConfig) *Counter {
	counter := new(Counter)
	counter.CounterConfig = config
	return counter
}

// IsSuccess reports whether no failure has been recorded since the last
// success.
func (counter *Counter) IsSuccess() bool {
	noFailures := counter.failures == 0
	return noFailures
}

// OnSuccess clears the failure status and returns recovery information, if
// any.
//
// `recovered`: true when the cleared failure count was beyond Threshold, i.e.
// the counter recovered from unhealthy status.
//
// `failures`: the number of failures recorded before this success (zero when
// the previous status was already success).
func (counter *Counter) OnSuccess() (recovered bool, failures uint64) {
	// Take the count and reset in one go. With no failure recorded, this
	// yields (false, 0) and leaves the counter untouched.
	failures, counter.failures = counter.failures, 0

	recovered = failures > counter.Threshold
	return recovered, failures
}

// OnFailure records one more failure and returns unhealthy information.
//
// `unhealthy`: true exactly once per failure streak, on the first failure
// beyond Threshold.
//
// `unrecovered`: true on every Remind-th failure after unhealthy was
// reported. A zero Remind disables reminders.
//
// `failures`: the number of continuous failures so far, including this one.
func (counter *Counter) OnFailure() (unhealthy bool, unrecovered bool, failures uint64) {
	counter.failures++
	failures = counter.failures

	// error tolerant in short time
	if failures <= counter.Threshold {
		return false, false, failures
	}

	// delta counts the failures since the one that crossed the threshold;
	// it cannot underflow because failures > Threshold here.
	delta := failures - counter.Threshold - 1

	switch {
	case delta == 0:
		unhealthy = true
	case counter.Remind > 0 && delta%counter.Remind == 0:
		// The Remind > 0 guard avoids an integer divide-by-zero panic when
		// the config leaves Remind unset.
		unrecovered = true
	}

	return unhealthy, unrecovered, failures
}
95 changes: 95 additions & 0 deletions health/counter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package health

import (
"testing"

"github.com/stretchr/testify/assert"
)

// newTestCounter returns the Counter shared by the tests: it tolerates 5
// failures and reminds every 10 failures after that.
func newTestCounter() *Counter {
	config := CounterConfig{Threshold: 5, Remind: 10}
	return NewCounter(config)
}

// TestCounterContinousSuccess checks that a success without any previous
// failure reports neither a recovery nor a failure count.
func TestCounterContinousSuccess(t *testing.T) {
	recovered, failures := newTestCounter().OnSuccess()

	assert.False(t, recovered)
	assert.Equal(t, uint64(0), failures)
}

// TestCounterFailedShortTime checks that a couple of failures below the
// threshold are tolerated and that the following success is not a recovery.
func TestCounterFailedShortTime(t *testing.T) {
	c := newTestCounter()

	// first failure, then one more continuous failure in short time
	for want := uint64(1); want <= 2; want++ {
		unhealthy, unrecovered, failures := c.OnFailure()
		assert.False(t, unhealthy)
		assert.False(t, unrecovered)
		assert.Equal(t, want, failures)
	}

	// success again: the failures are reported, but it is not a recovery
	recovered, failures := c.OnSuccess()
	assert.False(t, recovered)
	assert.Equal(t, uint64(2), failures)
}

// TestCounterThreshold checks that unhealthy is reported exactly once, on the
// first failure beyond the threshold, and that the next success is a recovery.
func TestCounterThreshold(t *testing.T) {
	c := newTestCounter()
	threshold := c.Threshold

	// continuous failures up to the threshold are tolerated
	for n := uint64(1); n <= threshold; n++ {
		unhealthy, unrecovered, failures := c.OnFailure()
		assert.False(t, unhealthy)
		assert.False(t, unrecovered)
		assert.Equal(t, n, failures)
	}

	// the first failure beyond the threshold turns the counter unhealthy
	unhealthy, unrecovered, failures := c.OnFailure()
	assert.True(t, unhealthy)
	assert.False(t, unrecovered)
	assert.Equal(t, threshold+1, failures)

	// one more failure: already reported, and the remind count is not reached
	unhealthy, unrecovered, failures = c.OnFailure()
	assert.False(t, unhealthy)
	assert.False(t, unrecovered)
	assert.Equal(t, threshold+2, failures)

	// recovered
	recovered, total := c.OnSuccess()
	assert.True(t, recovered)
	assert.Equal(t, threshold+2, total)
}

// TestCounterRemind checks that a reminder is raised once the failures reach
// the remind count after the threshold, and that the next success is a
// recovery.
func TestCounterRemind(t *testing.T) {
	c := newTestCounter()
	remindAt := c.Threshold + c.Remind + 1

	// no reminder before the remind count is reached
	for n := uint64(1); n < remindAt; n++ {
		_, unrecovered, failures := c.OnFailure()
		assert.False(t, unrecovered)
		assert.Equal(t, n, failures)
	}

	// continuous failure that reaches the remind count
	unhealthy, unrecovered, failures := c.OnFailure()
	assert.False(t, unhealthy)
	assert.True(t, unrecovered)
	assert.Equal(t, remindAt, failures)

	// recovered
	recovered, total := c.OnSuccess()
	assert.True(t, recovered)
	assert.Equal(t, remindAt, total)
}
96 changes: 96 additions & 0 deletions health/timed_counter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package health

import "time"

// TimedCounterConfig holds the duration settings embedded in TimedCounter.
type TimedCounterConfig struct {
	// Threshold is how long continuous failures are tolerated, measured from
	// the first failure; unhealthy is reported once it is exceeded.
	Threshold time.Duration `default:"1m"`

	// Remind is the interval between reminders while the counter stays
	// unrecovered.
	Remind time.Duration `default:"5m"`
}

// TimedCounter is an error-tolerant health counter based on elapsed time:
// failures within a short period are tolerated, and once the counter is
// unhealthy it is reminded periodically until a success resets it.
type TimedCounter struct {
	// TimedCounterConfig provides Threshold and Remind.
	TimedCounterConfig

	// failedAt is the time of the first failure; the zero time means that no
	// failure is recorded.
	failedAt time.Time

	// reports is the number of unhealthy notifications (the initial report
	// plus reminders) raised since the first failure.
	reports int
}

// NewTimedCounter returns a TimedCounter that uses the given config and has
// no failure recorded yet.
func NewTimedCounter(config TimedCounterConfig) *TimedCounter {
	counter := new(TimedCounter)
	counter.TimedCounterConfig = config
	return counter
}

// IsSuccess reports whether no failure has been recorded since the last
// success, i.e. whether the first failure time is unset.
func (counter *TimedCounter) IsSuccess() bool {
	firstFailure := counter.failedAt
	return firstFailure.IsZero()
}

// OnSuccess clears the failure status as of the current time and returns
// recovery information, if any.
//
// `recovered`: indicates whether the counter recovered from unhealthy status.
//
// `elapsed`: the duration since the first failure time.
func (counter *TimedCounter) OnSuccess() (recovered bool, elapsed time.Duration) {
	now := time.Now()
	recovered, elapsed = counter.onSuccessAt(now)
	return recovered, elapsed
}

// onSuccessAt is OnSuccess evaluated at the given time, which lets callers
// such as tests supply the clock.
//
// With no failure recorded it returns (false, 0) and changes nothing.
// Otherwise elapsed is the time from the first failure to now, recovered is
// true when elapsed exceeds Threshold, and the failure state is reset.
func (counter *TimedCounter) onSuccessAt(now time.Time) (recovered bool, elapsed time.Duration) {
	firstFailure := counter.failedAt

	// last time was success status
	if firstFailure.IsZero() {
		return false, 0
	}

	// failing for longer than the threshold means the counter was unhealthy
	elapsed = now.Sub(firstFailure)
	recovered = elapsed > counter.Threshold

	// reset
	counter.failedAt, counter.reports = time.Time{}, 0

	return recovered, elapsed
}

// OnFailure records a failure at the current time and returns unhealthy
// information.
//
// `unhealthy`: indicates continuous failures for a long time.
//
// `unrecovered`: indicates continuous failures that remain unrecovered for a
// long time.
//
// `elapsed`: the duration since the first failure time.
func (counter *TimedCounter) OnFailure() (unhealthy bool, unrecovered bool, elapsed time.Duration) {
	now := time.Now()
	unhealthy, unrecovered, elapsed = counter.onFailureAt(now)
	return unhealthy, unrecovered, elapsed
}

// onFailureAt is OnFailure evaluated at the given time, which lets callers
// such as tests supply the clock.
//
// The first failure of a streak records now as the first failure time.
// elapsed is always the time from that first failure to now.
//
// `unhealthy` is true once per streak, on the first failure seen after
// elapsed exceeds Threshold.
//
// `unrecovered` is true on the first failure seen after each further Remind
// interval, where intervals are measured from the moment Threshold was
// crossed. Intervals that passed without any call are skipped, so a gap
// between calls produces a single reminder rather than one reminder per
// missed interval on the following calls. When Remind is not positive, no
// intervals exist and every failure after the unhealthy report is flagged
// unrecovered.
func (counter *TimedCounter) onFailureAt(now time.Time) (unhealthy bool, unrecovered bool, elapsed time.Duration) {
	// record the first failure time
	if counter.failedAt.IsZero() {
		counter.failedAt = now
	}

	// error tolerant in short time
	if elapsed = now.Sub(counter.failedAt); elapsed <= counter.Threshold {
		return false, false, elapsed
	}

	switch {
	case counter.reports == 0:
		// become unhealthy
		unhealthy = true
	case elapsed > counter.Threshold+counter.Remind*time.Duration(counter.reports):
		// remind unhealthy
		unrecovered = true
	default:
		// remind time not reached
		return false, false, elapsed
	}

	// One notification was raised. Move reports past every remind interval
	// that has already elapsed, so the next reminder is due only after the
	// next interval boundary instead of on the very next call.
	counter.reports++
	if counter.Remind > 0 {
		if due := int((elapsed-counter.Threshold)/counter.Remind) + 1; due > counter.reports {
			counter.reports = due
		}
	}

	return unhealthy, unrecovered, elapsed
}
98 changes: 98 additions & 0 deletions health/timed_counter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package health

import (
"testing"
"time"

"github.com/stretchr/testify/assert"
)

// newTestTimedCounter returns the TimedCounter shared by the tests: it
// tolerates failures for one minute and reminds every five minutes after that.
func newTestTimedCounter() *TimedCounter {
	config := TimedCounterConfig{Threshold: time.Minute, Remind: 5 * time.Minute}
	return NewTimedCounter(config)
}

// TestTimedCounterContinousSuccess checks that a success without any previous
// failure reports neither a recovery nor an elapsed time, however late it
// arrives.
func TestTimedCounterContinousSuccess(t *testing.T) {
	c := newTestTimedCounter()
	later := time.Now().Add(c.Threshold + 1)

	recovered, elapsed := c.onSuccessAt(later)
	assert.False(t, recovered)
	assert.Equal(t, time.Duration(0), elapsed)
}

// TestTimedCounterFailedShortTime checks that failures within the threshold
// are tolerated and that a success within the threshold is not a recovery.
func TestTimedCounterFailedShortTime(t *testing.T) {
	c := newTestTimedCounter()
	start := time.Now()

	// expectTolerated records a failure at the given time and asserts that
	// nothing is reported and that the elapsed time matches.
	expectTolerated := func(at time.Time, want time.Duration) {
		unhealthy, unrecovered, elapsed := c.onFailureAt(at)
		assert.False(t, unhealthy)
		assert.False(t, unrecovered)
		assert.Equal(t, want, elapsed)
	}

	// first failure
	expectTolerated(start, time.Duration(0))

	// continuous failure in short time
	expectTolerated(start.Add(c.Threshold-2), c.Threshold-2)

	// success again before the threshold: not a recovery
	recovered, elapsed := c.onSuccessAt(start.Add(c.Threshold - 1))
	assert.False(t, recovered)
	assert.Equal(t, c.Threshold-1, elapsed)
}

// TestTimedCounterThreshold checks that unhealthy is reported on the first
// failure after the threshold has passed, and that the next success is a
// recovery.
func TestTimedCounterThreshold(t *testing.T) {
	c := newTestTimedCounter()
	start := time.Now()
	threshold := c.Threshold

	// first failure
	c.onFailureAt(start)

	// continuous failure in short time
	c.onFailureAt(start.Add(threshold - 1))

	// continuous failure after the threshold has passed
	afterThreshold := threshold + 1
	unhealthy, unrecovered, elapsed := c.onFailureAt(start.Add(afterThreshold))
	assert.True(t, unhealthy)
	assert.False(t, unrecovered)
	assert.Equal(t, afterThreshold, elapsed)

	// recovered
	recoveredAfter := threshold + 2
	recovered, total := c.onSuccessAt(start.Add(recoveredAfter))
	assert.True(t, recovered)
	assert.Equal(t, recoveredAfter, total)
}

// TestTimedCounterRemind checks that, after unhealthy has been reported, a
// reminder is raised only once the remind interval has passed, and that the
// next success is a recovery.
func TestTimedCounterRemind(t *testing.T) {
	c := newTestTimedCounter()
	start := time.Now()
	threshold, remind := c.Threshold, c.Remind

	// first failure
	c.onFailureAt(start)

	// continuous failure in short time
	c.onFailureAt(start.Add(threshold - 1))

	// continuous failure after the threshold has passed (reports unhealthy)
	c.onFailureAt(start.Add(threshold + 1))

	// still failing, but the remind time is not reached yet
	beforeRemind := threshold + 2
	unhealthy, unrecovered, elapsed := c.onFailureAt(start.Add(beforeRemind))
	assert.False(t, unhealthy)
	assert.False(t, unrecovered)
	assert.Equal(t, beforeRemind, elapsed)

	// still failing, and the remind time is reached
	afterRemind := threshold + 2 + remind
	unhealthy, unrecovered, elapsed = c.onFailureAt(start.Add(afterRemind))
	assert.False(t, unhealthy)
	assert.True(t, unrecovered)
	assert.Equal(t, afterRemind, elapsed)

	// recovered
	recoveredAfter := threshold + 3 + remind
	recovered, total := c.onSuccessAt(start.Add(recoveredAfter))
	assert.True(t, recovered)
	assert.Equal(t, recoveredAfter, total)
}
Loading