diff --git a/controller/backup_target_controller.go b/controller/backup_target_controller.go index 363e84615e..22972c2303 100644 --- a/controller/backup_target_controller.go +++ b/controller/backup_target_controller.go @@ -237,9 +237,9 @@ func getBackupTarget(controllerID string, backupTarget *longhorn.BackupTarget, d return nil, nil, errors.Wrap(err, "failed to get available data engine for getting backup target") } - instanceManager, err := ds.GetDefaultInstanceManagerByNodeRO(controllerID, dataEngine) + instanceManager, err := ds.GetRunningInstanceManagerByNodeRO(controllerID, dataEngine) if err != nil { - return nil, nil, errors.Wrap(err, "failed to get default engine instance manager for proxy client") + return nil, nil, errors.Wrap(err, "failed to get running instance manager for proxy client") } engineClientProxy, err = engineapi.NewEngineClientProxy(instanceManager, log, proxyConnCounter) @@ -635,10 +635,12 @@ func (btc *BackupTargetController) isResponsibleFor(bt *longhorn.BackupTarget, d return false, err } - if instanceManager, err := btc.ds.GetDefaultInstanceManagerByNodeRO(btc.controllerID, ""); err != nil { + instanceManager, err := btc.ds.GetRunningInstanceManagerByNodeRO(btc.controllerID, "") + if err != nil { return false, err - } else if instanceManager == nil || instanceManager.Status.CurrentState != longhorn.InstanceManagerStateRunning { - return false, errors.New("failed to get default running instance manager") + } + if instanceManager == nil { + return false, errors.New("failed to get running instance manager") } isPreferredOwner := currentNodeEngineAvailable && isResponsible diff --git a/controller/node_controller.go b/controller/node_controller.go index 8c70389c55..62be9e4470 100644 --- a/controller/node_controller.go +++ b/controller/node_controller.go @@ -1500,7 +1500,7 @@ func (nc *NodeController) alignDiskSpecAndStatus(node *longhorn.Node) { if diskInstanceName == "" { diskInstanceName = diskName } - if err := nc.deleteDisk(node, diskStatus.Type, diskInstanceName, diskStatus.DiskUUID, diskStatus.DiskPath, string(diskStatus.DiskDriver)); err != nil { + if err := nc.deleteDisk(diskStatus.Type, diskInstanceName, diskStatus.DiskUUID, diskStatus.DiskPath, string(diskStatus.DiskDriver)); err != nil { nc.logger.WithError(err).Warnf("Failed to delete disk %v", diskInstanceName) } delete(node.Status.DiskStatus, diskName) @@ -1508,7 +1508,7 @@ func (nc *NodeController) alignDiskSpecAndStatus(node *longhorn.Node) { } } -func (nc *NodeController) deleteDisk(node *longhorn.Node, diskType longhorn.DiskType, diskName, diskUUID, diskPath, diskDriver string) error { +func (nc *NodeController) deleteDisk(diskType longhorn.DiskType, diskName, diskUUID, diskPath, diskDriver string) error { if diskUUID == "" { log.Infof("Disk %v has no diskUUID, skip deleting", diskName) return nil @@ -1516,9 +1516,9 @@ func (nc *NodeController) deleteDisk(node *longhorn.Node, diskType longhorn.Disk dataEngine := util.GetDataEngineForDiskType(diskType) - im, err := nc.ds.GetDefaultInstanceManagerByNodeRO(nc.controllerID, dataEngine) + im, err := nc.ds.GetRunningInstanceManagerByNodeRO(nc.controllerID, dataEngine) if err != nil { - return errors.Wrapf(err, "failed to get default instance manager") + return errors.Wrapf(err, "failed to get running instance manager for data engine %v", dataEngine) } diskServiceClient, err := engineapi.NewDiskServiceClient(im, nc.logger) diff --git a/controller/orphan_controller.go b/controller/orphan_controller.go index 04819c8328..d21aa63ddb 100644 --- a/controller/orphan_controller.go +++ b/controller/orphan_controller.go @@ -327,22 +327,22 @@ func (oc *OrphanController) deleteOrphanedReplica(orphan *longhorn.Orphan) error err := lhns.DeletePath(filepath.Join(diskPath, "replicas", replicaDirectoryName)) return errors.Wrapf(err, "failed to delete orphan replica directory %v in disk %v", replicaDirectoryName, diskPath) case longhorn.DiskTypeBlock: - return oc.DeleteSpdkReplicaInstance(orphan.Spec.Parameters[longhorn.OrphanDiskName], orphan.Spec.Parameters[longhorn.OrphanDiskUUID], "", orphan.Spec.Parameters[longhorn.OrphanDataName]) + return oc.DeleteV2ReplicaInstance(orphan.Spec.Parameters[longhorn.OrphanDiskName], orphan.Spec.Parameters[longhorn.OrphanDiskUUID], "", orphan.Spec.Parameters[longhorn.OrphanDataName]) default: return fmt.Errorf("unknown disk type %v for orphan %v", diskType, orphan.Name) } } -func (oc *OrphanController) DeleteSpdkReplicaInstance(diskName, diskUUID, diskDriver, replicaInstanceName string) (err error) { +func (oc *OrphanController) DeleteV2ReplicaInstance(diskName, diskUUID, diskDriver, replicaInstanceName string) (err error) { logrus.Infof("Deleting SPDK replica instance %v on disk %v on node %v", replicaInstanceName, diskUUID, oc.controllerID) defer func() { - err = errors.Wrapf(err, "cannot delete SPDK replica instance %v", replicaInstanceName) + err = errors.Wrapf(err, "cannot delete v2 replica instance %v", replicaInstanceName) }() - im, err := oc.ds.GetDefaultInstanceManagerByNodeRO(oc.controllerID, longhorn.DataEngineTypeV2) + im, err := oc.ds.GetRunningInstanceManagerByNodeRO(oc.controllerID, longhorn.DataEngineTypeV2) if err != nil { - return errors.Wrapf(err, "failed to get instance manager for node %v for deleting SPDK replica instance %v", oc.controllerID, replicaInstanceName) + return errors.Wrapf(err, "failed to get running instance manager for node %v for deleting v2 replica instance %v", oc.controllerID, replicaInstanceName) } c, err := engineapi.NewDiskServiceClient(im, oc.logger) diff --git a/datastore/longhorn.go b/datastore/longhorn.go index fec7944db9..df75590446 100644 --- a/datastore/longhorn.go +++ b/datastore/longhorn.go @@ -5061,3 +5061,30 @@ func (s *DataStore) ListBackupBackingImages() (map[string]*longhorn.BackupBackin func (s *DataStore) ListBackupBackingImagesRO() ([]*longhorn.BackupBackingImage, error) { return s.backupBackingImageLister.BackupBackingImages(s.namespace).List(labels.Everything()) } + +// GetRunningInstanceManagerByNodeRO returns the running instance manager for the given node and data engine +func (s *DataStore) GetRunningInstanceManagerByNodeRO(node string, dataEngine longhorn.DataEngineType) (*longhorn.InstanceManager, error) { + // Trying to get the default instance manager first. + // If the default instance manager is not running, then try to get another running instance manager. + im, err := s.GetDefaultInstanceManagerByNodeRO(node, dataEngine) + if err == nil { + if im.Status.CurrentState == longhorn.InstanceManagerStateRunning { + return im, nil + } + } + + logrus.WithError(err).Warnf("Failed to get the default instance manager for node %v and data engine %v, trying to get another running instance manager", node, dataEngine) + + ims, err := s.ListInstanceManagersByNodeRO(node, longhorn.InstanceManagerTypeAllInOne, dataEngine) + if err != nil { + return nil, errors.Wrapf(err, "failed to list instance managers for node %v", node) + } + + for _, im := range ims { + if im.Status.CurrentState == longhorn.InstanceManagerStateRunning { + return im, nil + } + } + + return nil, fmt.Errorf("failed to find a running instance manager for node %v", node) +}