Skip to content

Commit

Permalink
fix: allow communication with instance managers in an unknown state
Browse files Browse the repository at this point in the history
Longhorn 6552

Signed-off-by: Eric Weber <eric.weber@suse.com>
(cherry picked from commit 72e2445)
  • Loading branch information
ejweber committed Sep 11, 2024
1 parent d10edae commit bab8861
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 16 deletions.
10 changes: 5 additions & 5 deletions controller/engine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ func (ec *EngineController) CreateInstance(obj interface{}) (*longhorn.InstanceP
return nil, err
}

c, err := engineapi.NewInstanceManagerClient(im)
c, err := engineapi.NewInstanceManagerClient(im, false)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -588,7 +588,7 @@ func (ec *EngineController) DeleteInstance(obj interface{}) (err error) {
return nil
}

c, err := engineapi.NewInstanceManagerClient(im)
c, err := engineapi.NewInstanceManagerClient(im, true)
if err != nil {
return err
}
Expand Down Expand Up @@ -678,7 +678,7 @@ func (ec *EngineController) GetInstance(obj interface{}) (*longhorn.InstanceProc
return nil, err
}
}
c, err := engineapi.NewInstanceManagerClient(im)
c, err := engineapi.NewInstanceManagerClient(im, false)
if err != nil {
return nil, err
}
Expand All @@ -698,7 +698,7 @@ func (ec *EngineController) LogInstance(ctx context.Context, obj interface{}) (*
return nil, nil, err
}

c, err := engineapi.NewInstanceManagerClient(im)
c, err := engineapi.NewInstanceManagerClient(im, false)
if err != nil {
return nil, nil, err
}
Expand Down Expand Up @@ -2061,7 +2061,7 @@ func (ec *EngineController) UpgradeEngineInstance(e *longhorn.Engine, log *logru
return err
}

c, err := engineapi.NewInstanceManagerClient(im)
c, err := engineapi.NewInstanceManagerClient(im, false)
if err != nil {
return err
}
Expand Down
4 changes: 2 additions & 2 deletions controller/instance_manager_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ type InstanceManagerMonitor struct {
}

func updateInstanceManagerVersion(im *longhorn.InstanceManager) error {
cli, err := engineapi.NewInstanceManagerClient(im)
cli, err := engineapi.NewInstanceManagerClient(im, false)
if err != nil {
return err
}
Expand Down Expand Up @@ -1426,7 +1426,7 @@ func (imc *InstanceManagerController) startMonitoring(im *longhorn.InstanceManag
}

// TODO: #2441 refactor this when we do the resource monitoring refactor
client, err := engineapi.NewInstanceManagerClient(im)
client, err := engineapi.NewInstanceManagerClient(im, false)
if err != nil {
log.WithError(err).Errorf("Failed to initialize im client to %v before monitoring", im.Name)
return
Expand Down
8 changes: 4 additions & 4 deletions controller/replica_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ func (rc *ReplicaController) CreateInstance(obj interface{}) (*longhorn.Instance
return nil, err
}

c, err := engineapi.NewInstanceManagerClient(im)
c, err := engineapi.NewInstanceManagerClient(im, false)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -551,7 +551,7 @@ func (rc *ReplicaController) DeleteInstance(obj interface{}) error {
return nil
}

c, err := engineapi.NewInstanceManagerClient(im)
c, err := engineapi.NewInstanceManagerClient(im, false)
if err != nil {
return err
}
Expand Down Expand Up @@ -674,7 +674,7 @@ func (rc *ReplicaController) GetInstance(obj interface{}) (*longhorn.InstancePro
}
}

c, err := engineapi.NewInstanceManagerClient(im)
c, err := engineapi.NewInstanceManagerClient(im, false)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -705,7 +705,7 @@ func (rc *ReplicaController) LogInstance(ctx context.Context, obj interface{}) (
return nil, nil, err
}

c, err := engineapi.NewInstanceManagerClient(im)
c, err := engineapi.NewInstanceManagerClient(im, false)
if err != nil {
return nil, nil, err
}
Expand Down
21 changes: 16 additions & 5 deletions engineapi/instance_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,22 @@ func CheckInstanceManagerProxySupport(im *longhorn.InstanceManager) error {
return nil
}

// NewInstanceManagerClient creates a new instance manager client
func NewInstanceManagerClient(im *longhorn.InstanceManager) (*InstanceManagerClient, error) {
// Do not check the major version here. Since IM cannot get the major version without using this client to call VersionGet().
if im.Status.CurrentState != longhorn.InstanceManagerStateRunning || im.Status.IP == "" {
return nil, fmt.Errorf("invalid Instance Manager %v, state: %v, IP: %v", im.Name, im.Status.CurrentState, im.Status.IP)
// NewInstanceManagerClient creates a new instance manager client. Usually, we only want to attempt to communicate with
// an instance manager in state running. However, sometimes it makes sense to make a best effort attempt to communicate
// with an instance manager in state unknown. It should be safe to do so, since Kubernetes should not reassign the pod's
// IP address until it is at least terminating, which puts the instance manager in state error. However, there is an
// increased chance of failure.
func NewInstanceManagerClient(im *longhorn.InstanceManager, allowUnknown bool) (*InstanceManagerClient, error) {
// Do not check the major version here since IM cannot get the major version without using this client to call
// VersionGet().

if im.Status.IP == "" {
return nil, fmt.Errorf("invalid instance manager %v, state %v, IP %v", im.Name, im.Status.CurrentState, im.Status.IP)
}
if im.Status.CurrentState == longhorn.InstanceManagerStateUnknown && allowUnknown {
logrus.Warnf("Communicating with instance manager %v, state %v, IP %v", im.Name, im.Status.CurrentState, im.Status.IP)
} else if im.Status.CurrentState != longhorn.InstanceManagerStateRunning {
return nil, fmt.Errorf("invalid instance manager %v, state %v, IP %v", im.Name, im.Status.CurrentState, im.Status.IP)
}

// TODO: Initialize the following gRPC clients are similar. This can be simplified via factory method.
Expand Down

0 comments on commit bab8861

Please sign in to comment.