feat(gctl): #142 add indefinite retry option during metac startup (#143)
This commit closes #142

A new boolean flag named 'retry-indefinitely-to-start' is added that
lets metac retry indefinitely to start all the configured generic
controllers. When this flag is set to true, retries continue even when
one or more of the configured Kubernetes custom resource definitions
are not yet available in the cluster. Once these custom resource
definitions become available and are discovered by metac, the retries
stop.

When this flag is not set, which is also the default, the metac
binary panics (i.e. the metac container stops running) if any of the
configured controllers makes use of custom resources whose definitions
are not yet available in the cluster. The panic happens only after the
configured timeout (which defaults to 30 minutes) is exhausted.

This feature helps teams manage the transient phases during controller
upgrades. One such scenario is when the upgraded controllers make use
of new custom resource definition(s) that are yet to be applied to
the cluster.

Signed-off-by: AmitKumarDas <amit.das@mayadata.io>
Amit Kumar Das authored May 26, 2020
1 parent 55357e9 commit 370ef1a
Showing 12 changed files with 351 additions and 91 deletions.
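
As a rough illustration of how the pieces in the diffs below fit together, the following is a minimal sketch of wiring the new retry option through the config server; the mserver value, the workerCount value, and the error handling are assumptions made for illustration only:

	retry := true
	configServer := &server.ConfigServer{
		Server:                    mserver, // assumed metac server value built earlier
		ConfigPath:                "/etc/config/metac/",
		RetryIndefinitelyForStart: &retry,
	}
	stopServer, err := configServer.Start(workerCount)
	if err != nil {
		glog.Fatalf("Failed to start metac as config server: %v", err)
	}
	defer stopServer()
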
2 changes: 1 addition & 1 deletion Makefile
@@ -109,7 +109,7 @@ integration-dependencies: all
.PHONY: integration-test
integration-test: integration-dependencies
@go test ./test/integration/... \
-v -timeout=10m -args --logtostderr --alsologtostderr -v=1
-v -short -timeout=10m -args --logtostderr --alsologtostderr -v=1

# integration-test-crd-mode runs tests with metac loading
# metacontrollers as kubernetes custom resources
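
For context, the -short argument added to go test above makes the Go test runner skip any test that guards itself with testing.Short(); the guard below is an illustrative sketch and not code from this repository:

	func TestSomethingLongRunning(t *testing.T) {
		if testing.Short() {
			// skipped when the test binary is run with -short,
			// e.g. via the integration-test target above
			t.Skip("skipping long-running test in short mode")
		}
		// long-running assertions would go here
	}
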
150 changes: 100 additions & 50 deletions controller/generic/metacontroller.go
@@ -17,6 +17,8 @@ limitations under the License.
package generic

import (
"fmt"
"strings"
"sync"
"time"

@@ -82,19 +84,15 @@ type ConfigMetaController struct {
// controllers
Configs []*v1alpha1.GenericController

// Total timeout for any condition to succeed.
//
// NOTE:
// This is currently used to load config that is required
// to run Metac.
WaitTimeoutForCondition time.Duration
// When set to true, the start logic is retried
// indefinitely till all the watch controllers are started
RetryIndefinitelyUntilSucceed *bool

// Interval between retries for any condition to succeed.
//
// NOTE:
// This is currently used to load config that is required
// to run Metac
WaitIntervalForCondition time.Duration
// Maximum time to wait to start all watch controllers
WaitTimeoutForStartAttempt time.Duration

// Interval between retries to start all watch controllers
WaitIntervalBetweenRestarts time.Duration

opts []ConfigMetaControllerOption
err error
@@ -106,8 +104,29 @@ type ConfigMetaController struct {
// This follows **functional options** pattern
type ConfigMetaControllerOption func(*ConfigMetaController) error

// SetMetaControllerConfigLoadFn sets the config loader function
func SetMetaControllerConfigLoadFn(
// SetMetacConfigToRetryIndefinitelyForStart lets this
// controller retry indefinitely till all its watch controllers
// are started
//
// NOTE:
// Indefinite retry is set only when the provided flag is true
func SetMetacConfigToRetryIndefinitelyForStart(enabled *bool) ConfigMetaControllerOption {
return func(c *ConfigMetaController) error {
// indefinite retry is set only if enabled is true
if enabled == nil || !*enabled {
return nil
}
// We are not bothered about performance. We do not
// need to be fast. We can be lazy but not to the
// extent of sleeping for hours.
c.WaitIntervalBetweenRestarts = 1 * time.Minute
c.RetryIndefinitelyUntilSucceed = k8s.BoolPtr(true)
return nil
}
}
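
A minimal sketch of composing this new option with the existing ones when building the controller; the enable variable and the config path value are illustrative only:

	enable := true
	opts := []ConfigMetaControllerOption{
		SetMetacConfigPath("/etc/config/metac/"),
		SetMetacConfigToRetryIndefinitelyForStart(&enable),
	}
	// opts is then handed to NewConfigMetaController, shown further below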

// SetMetacConfigLoadFn sets the config loader function
func SetMetacConfigLoadFn(
fn func() ([]*v1alpha1.GenericController, error),
) ConfigMetaControllerOption {
return func(c *ConfigMetaController) error {
@@ -116,8 +135,8 @@ func SetMetaControllerConfigLoadFn(
}
}

// SetMetaControllerConfigPath sets the config path
func SetMetaControllerConfigPath(path string) ConfigMetaControllerOption {
// SetMetacConfigPath sets the config path
func SetMetacConfigPath(path string) ConfigMetaControllerOption {
return func(c *ConfigMetaController) error {
c.ConfigPath = path
return nil
@@ -134,9 +153,12 @@ func NewConfigMetaController(
) (*ConfigMetaController, error) {
// initialize with defaults & the provided values
ctl := &ConfigMetaController{
WaitTimeoutForCondition: 30 * time.Minute,
WaitIntervalForCondition: 1 * time.Second,
opts: opts,
// Default setting for retry
// - Retry times out in 30 minutes
WaitTimeoutForStartAttempt: 30 * time.Minute,
// - Interval between retries is 1 second
WaitIntervalBetweenRestarts: 1 * time.Second,
opts: opts,
BaseMetaController: BaseMetaController{
ResourceManager: resourceMgr,
DynClientset: dynClientset,
@@ -230,24 +252,27 @@ func (mc *ConfigMetaController) Start() {

glog.Infof("Starting %s", mc)

// we run this as a continuous process
// until all the configs are loaded
err := mc.wait(mc.startAllWatchControllers)
// Run this with retries until all the configs are loaded
// and started. In other words, this eventually starts all the
// generic controllers configured in the config file.
err := mc.startWithRetries(mc.startAllWatchControllers)
if err != nil {
glog.Fatalf("Failed to start %s: %+v", mc, err)
}
}()
}

// wait polls the condition until it's true, with a configured
// interval and timeout.
// startWithRetries polls the condition until it's true, with
// a configured interval and timeout.
//
// The condition function returns a bool indicating whether it
// is satisfied, as well as an error which should be non-nil if
// and only if the function was unable to determine whether or
// not the condition is satisfied (for example if the check
// involves calling a remote server and the request failed).
func (mc *ConfigMetaController) wait(condition func() (bool, error)) error {
func (mc *ConfigMetaController) startWithRetries(
condition func() (bool, error),
) error {
// mark the start time
start := time.Now()
for {
@@ -257,31 +282,38 @@ func (mc *ConfigMetaController) wait(condition func() (bool, error)) error {
// returning nil implies the condition has completed
return nil
}
if time.Since(start) > mc.WaitTimeoutForCondition {
return errors.Wrapf(
err,
"Condition timed out after %s: %s",
mc.WaitTimeoutForCondition,
mc,
)
if time.Since(start) > mc.WaitTimeoutForStartAttempt &&
(mc.RetryIndefinitelyUntilSucceed == nil ||
!*mc.RetryIndefinitelyUntilSucceed) {
return errors.Errorf(
"Condition timed out after %s: %+v: %s",
mc.WaitTimeoutForStartAttempt,
err,
mc,
)
}
if err != nil {
// Log error, but keep trying until timeout.
// condition resulted in error
// keep trying until timeout
glog.V(7).Infof(
"Condition failed: Will retry after %s: %s: %v",
mc.WaitIntervalForCondition,
mc,
"Condition failed: Will retry after %s: %+v: %s",
mc.WaitIntervalBetweenRestarts,
err,
mc,
)
} else {
// condition did not pass
// keep trying until timeout
glog.V(7).Infof(
"Waiting for condition to succeed: Will retry after %s: %s",
mc.WaitIntervalForCondition,
mc.WaitIntervalBetweenRestarts,
mc,
)
}
// wait & then continue retrying
time.Sleep(mc.WaitIntervalForCondition)
time.Sleep(mc.WaitIntervalBetweenRestarts)
}
}
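
To make the condition contract described above concrete, the following is a hedged sketch of a condition handed to startWithRetries; lookupWatchCRD is a hypothetical helper and mc is assumed to be a *ConfigMetaController already in scope:

	condition := func() (bool, error) {
		found, err := lookupWatchCRD() // hypothetical check for a required CRD
		if err != nil {
			// the check itself could not be completed
			return false, err
		}
		// false with a nil error means "not satisfied yet, retry later"
		return found, nil
	}
	if err := mc.startWithRetries(condition); err != nil {
		glog.Fatalf("Failed to start %s: %v", mc, err)
	}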

@@ -290,38 +322,56 @@ func (mc *ConfigMetaController) wait(condition func() (bool, error)) error {
// binary
//
// NOTE:
// This method is used as a condition and hence can be invoked
// more than once.
// This method is used as a condition and is executed
// repeatedly in a loop till the condition is met.
func (mc *ConfigMetaController) startAllWatchControllers() (bool, error) {
// In this metacontroller, we are only responsible for
// starting/stopping the relevant watch based controllers
var errs []string
// This logic is responsible for starting all the generic
// controllers configured in the config file
for _, conf := range mc.Configs {
key := conf.AsNamespaceNameKey()
if _, ok := mc.WatchControllers[key]; ok {
// Already added; perhaps during earlier condition
// checks
continue
}
// watch controller i.e. a controller based on the resource
// specified in the **watch field** of GenericController
// Initialize the GenericController
wc, err := NewWatchController(
mc.ResourceManager,
mc.DynClientset,
mc.DynInformerFactory,
conf,
)
if err != nil {
return false, errors.Wrapf(
err,
"Failed to init %s: %s",
key,
mc,
errs = append(
errs,
fmt.Sprintf(
"Failed to init gctl %s: %s",
key,
err.Error(),
),
)
// continue to initialise & start remaining controllers
//
// NOTE:
// This lets healthy controllers operate independently
// of other controllers that are currently experiencing
// issues. Most of the time these issues are temporary
// in nature.
continue
}
// start this watch controller
// start this controller
wc.Start(mc.WorkerCount)
mc.WatchControllers[key] = wc
}
if len(errs) != 0 {
return false, errors.Errorf(
"Failed to start all gctl controllers: %d errors found: %s: %s",
len(errs),
strings.Join(errs, ": "),
mc,
)
}
return true, nil
}

18 changes: 9 additions & 9 deletions controller/generic/metacontroller_test.go
@@ -159,16 +159,16 @@ func TestConfigMetaControllerIsDuplicateConfig(t *testing.T) {
}
}

func TestConfigMetaControllerWait(t *testing.T) {
func TestConfigMetaControllerStartWithRetry(t *testing.T) {
var tests = map[string]struct {
controller *ConfigMetaController
condition func() (bool, error)
isErr bool
}{
"cond returns error": {
controller: &ConfigMetaController{
WaitIntervalForCondition: 1 * time.Second,
WaitTimeoutForCondition: 5 * time.Second,
WaitIntervalBetweenRestarts: 1 * time.Second,
WaitTimeoutForStartAttempt: 5 * time.Second,
},
condition: func() (bool, error) {
return false, errors.Errorf("Err")
@@ -177,8 +177,8 @@ },
},
"cond returns true": {
controller: &ConfigMetaController{
WaitIntervalForCondition: 1 * time.Second,
WaitTimeoutForCondition: 5 * time.Second,
WaitIntervalBetweenRestarts: 1 * time.Second,
WaitTimeoutForStartAttempt: 5 * time.Second,
},
condition: func() (bool, error) {
return true, nil
@@ -187,21 +187,21 @@ },
},
"cond returns false": {
controller: &ConfigMetaController{
WaitIntervalForCondition: 1 * time.Second,
WaitTimeoutForCondition: 5 * time.Second,
WaitIntervalBetweenRestarts: 1 * time.Second,
WaitTimeoutForStartAttempt: 5 * time.Second,
},
condition: func() (bool, error) {
return false, nil
},
isErr: false,
isErr: true,
},
}
for name, mock := range tests {
name := name
mock := mock
t.Run(name, func(t *testing.T) {
ctl := mock.controller
err := ctl.wait(mock.condition)
err := ctl.startWithRetries(mock.condition)
if mock.isErr && err == nil {
t.Fatalf("Expected error got none")
}
1 change: 0 additions & 1 deletion go.mod
@@ -6,7 +6,6 @@ go 1.13

require (
contrib.go.opencensus.io/exporter/prometheus v0.1.0
github.com/coreos/etcd v3.3.15+incompatible // indirect
github.com/evanphx/json-patch v4.5.0+incompatible // indirect
github.com/ghodss/yaml v1.0.0
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
11 changes: 7 additions & 4 deletions server/server.go
@@ -240,6 +240,10 @@ type ConfigServer struct {
// ConfigPath has higher priority
GenericControllerConfigLoadFn func() ([]*v1alpha1.GenericController, error)

// When set to true, the start logic is retried
// indefinitely till all the watch controllers are started
RetryIndefinitelyForStart *bool

// Number of workers per watch controller
workerCount int
}
@@ -273,10 +277,9 @@ func (s *ConfigServer) Start(workerCount int) (stop func(), err error) {

// various generic meta controller options to setup meta controller
configOpts := []generic.ConfigMetaControllerOption{
generic.SetMetaControllerConfigLoadFn(
s.GenericControllerConfigLoadFn,
),
generic.SetMetaControllerConfigPath(s.ConfigPath),
generic.SetMetacConfigLoadFn(s.GenericControllerConfigLoadFn),
generic.SetMetacConfigPath(s.ConfigPath),
generic.SetMetacConfigToRetryIndefinitelyForStart(s.RetryIndefinitelyForStart),
}

genericMetac, err := generic.NewConfigMetaController(
13 changes: 10 additions & 3 deletions start/start.go
@@ -88,7 +88,13 @@ var (
"metac-config-path",
"/etc/config/metac/",
`Path to metac config file to let metac run as a self contained binary.
Needs run-as-local set to true`,
Applicable if run-as-local is set to true`,
)
retryIndefinitelyToStart = flag.Bool(
"retry-indefinitely-to-start",
false,
`When true, lets metac retry continuously till all its controllers are started.
Applicable if run-as-local is set to true`,
)
)

@@ -145,8 +151,9 @@ func Start() {
// looking up various MetaController resources as
// config files
configServer := &server.ConfigServer{
Server: mserver,
ConfigPath: *metacConfigPath,
Server: mserver,
ConfigPath: *metacConfigPath,
RetryIndefinitelyForStart: retryIndefinitelyToStart,
}
stopServer, err = configServer.Start(*workerCount)
} else {
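
With the flags above, running metac as a self-contained binary with indefinite retry enabled could look like the command below; the binary name is an assumption, while the flag names are the ones defined or referenced in this diff:

	./metac --run-as-local=true \
		--metac-config-path=/etc/config/metac/ \
		--retry-indefinitely-to-start=true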