Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Longhorn Online Rebuilding (backport #2995) #2999

Merged
merged 6 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions controller/engine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1948,6 +1948,12 @@ func (ec *EngineController) startRebuilding(e *longhorn.Engine, replicaName, add
"Start rebuilding replica %v with Address %v for normal engine %v and volume %v", replicaName, addr, e.Name, e.Spec.VolumeName)
err = engineClientProxy.ReplicaAdd(e, replicaName, replicaURL, false, fastReplicaRebuild, localSync, fileSyncHTTPClientTimeout, grpcTimeoutSeconds)
}

// For v2 engine, the rebuilding is an async call. We need to wait here for the rebuilding to start and then to complete
if err == nil && types.IsDataEngineV2(e.Spec.DataEngine) {
err = ec.waitForV2EngineRebuild(e, replicaName, grpcTimeoutSeconds)
}

if err != nil {
replicaRebuildErrMsg := err.Error()

Expand Down Expand Up @@ -2121,6 +2127,64 @@ func getReplicaRebuildFailedReasonFromError(errMsg string) (string, longhorn.Con
}
}

// waitForV2EngineRebuild blocks until the rebuild of replicaName on the v2
// engine e completes, fails, or the timeout (in seconds) elapses.
//
// Adding a replica to a v2 engine is asynchronous, so this polls the engine
// and replica objects every EnginePollInterval. It returns nil when e is not a
// v2 engine, when the replica reaches mode RW, or when the rebuild status
// reports ProcessStateComplete. It returns an error when the engine is not
// found, the replica is missing from the engine spec, the replica cannot be
// fetched or is not running, the replica is in mode ERR, the rebuild status
// reports a failure, or the timeout fires.
func (ec *EngineController) waitForV2EngineRebuild(e *longhorn.Engine, replicaName string, timeout int64) (err error) {
	if !types.IsDataEngineV2(e.Spec.DataEngine) {
		return nil
	}

	// Keep the name in a local and never overwrite the parameter e. If a lookup
	// fails with a transient error the getter may hand back a nil engine
	// (NOTE(review): typical for lister-backed getters — confirm), and reusing
	// that result for the next lookup would dereference nil.
	engineName := e.Name

	ticker := time.NewTicker(EnginePollInterval)
	defer ticker.Stop()
	timer := time.NewTimer(time.Duration(timeout) * time.Second)
	defer timer.Stop()

	for {
		select {
		case <-ticker.C:
			engine, getErr := ec.ds.GetEngineRO(engineName)
			if getErr != nil {
				// There is no need to continue if the engine is not found
				if apierrors.IsNotFound(getErr) {
					return getErr
				}
				// There may be something wrong with the indexer or the API server, will retry
				continue
			}
			if engine.Spec.ReplicaAddressMap[replicaName] == "" {
				return fmt.Errorf("unknown replica %v for engine", replicaName)
			}

			// There is no need to continue when the replica is not found or the replica is not in a valid state for rebuilding
			replica, getErr := ec.ds.GetReplicaRO(replicaName)
			if getErr != nil {
				return getErr
			}
			if replica.Status.CurrentState != longhorn.InstanceStateRunning {
				return fmt.Errorf("replica %v is state %s, which is invalid for rebuilding", replicaName, replica.Status.CurrentState)
			}

			switch engine.Status.ReplicaModeMap[replicaName] {
			case longhorn.ReplicaModeRW:
				return nil
			case longhorn.ReplicaModeERR:
				return fmt.Errorf("replica %v is in ERR mode, which is invalid for rebuilding", replicaName)
			case "":
				// The engine has not reported a mode for this replica yet; poll again.
				continue
			}

			// For a rebuilding replica (with mode WO), there should be a corresponding rebuilding status
			replicaURL := engineapi.GetBackendReplicaURL(engine.Status.CurrentReplicaAddressMap[replicaName])
			rebuildingStatus := engine.Status.RebuildStatus[replicaURL]
			if rebuildingStatus == nil {
				continue
			}
			if rebuildingStatus.State == engineapi.ProcessStateError || rebuildingStatus.Error != "" {
				if rebuildingStatus.Error == "" {
					// Never hand the caller an error with an empty message.
					return fmt.Errorf("rebuilding of replica %v failed without an error message", replicaName)
				}
				// The reported message is data, not a format string.
				return fmt.Errorf("%s", rebuildingStatus.Error)
			}
			if rebuildingStatus.State == engineapi.ProcessStateComplete {
				return nil
			}
		case <-timer.C:
			return fmt.Errorf("timeout waiting for replica %v to be rebuilt", replicaName)
		}
	}
}

func (ec *EngineController) Upgrade(e *longhorn.Engine, log *logrus.Entry) (err error) {
defer func() {
err = errors.Wrapf(err, "failed to live upgrade image for %v", e.Name)
Expand Down
1 change: 0 additions & 1 deletion controller/replica_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,6 @@ func (rc *ReplicaController) CreateInstance(obj interface{}) (*longhorn.Instance
DataPath: dataPath,
BackingImagePath: backingImagePath,
DataLocality: v.Spec.DataLocality,
ExposeRequired: true,
ImIP: im.Status.IP,
EngineCLIAPIVersion: cliAPIVersion,
})
Expand Down
10 changes: 4 additions & 6 deletions controller/volume_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -3640,12 +3640,6 @@ func (c *VolumeController) createReplica(replica *longhorn.Replica, v *longhorn.

if isRebuildingReplica {
// TODO: reuse failed replica for replica rebuilding of SPDK volumes
if types.IsDataEngineV2(v.Spec.DataEngine) {
if !v.Spec.DisableFrontend {
log.Tracef("Online replica rebuilding for replica %v is not supported for SPDK volumes", replica.Name)
return nil
}
}

log.Infof("A new replica %v will be replenished during rebuilding", replica.Name)
// Prevent this new replica from being reused after rebuilding failure.
Expand Down Expand Up @@ -4842,6 +4836,10 @@ func (c *VolumeController) shouldCleanUpFailedReplica(v *longhorn.Volume, r *lon
log.Warnf("Replica %v failed to rebuild too many times", r.Name)
return true
}
// TODO: Remove it once we can reuse failed replicas during v2 rebuilding
if types.IsDataEngineV2(v.Spec.DataEngine) {
return true
}
// Failed too long ago to be useful during a rebuild.
if v.Spec.StaleReplicaTimeout > 0 &&
util.TimestampAfterTimeout(r.Spec.FailedAt, time.Duration(v.Spec.StaleReplicaTimeout)*time.Minute) {
Expand Down
6 changes: 2 additions & 4 deletions engineapi/instance_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,6 @@ type ReplicaInstanceCreateRequest struct {
DataPath string
BackingImagePath string
DataLocality longhorn.DataLocality
ExposeRequired bool
ImIP string
EngineCLIAPIVersion int
}
Expand Down Expand Up @@ -562,9 +561,8 @@ func (c *InstanceManagerClient) ReplicaInstanceCreate(req *ReplicaInstanceCreate
BinaryArgs: args,

Replica: imclient.ReplicaCreateRequest{
DiskName: req.DiskName,
DiskUUID: req.Replica.Spec.DiskID,
ExposeRequired: req.ExposeRequired,
DiskName: req.DiskName,
DiskUUID: req.Replica.Spec.DiskID,
},
})
if err != nil {
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ require (
github.com/longhorn/backupstore v0.0.0-20240720163059-56c90cd23634
github.com/longhorn/go-common-libs v0.0.0-20240720044518-32fc527fe868
github.com/longhorn/go-iscsi-helper v0.0.0-20240720064937-c6ce82d67032
github.com/longhorn/go-spdk-helper v0.0.0-20240720064915-d2ce0846d2a7
github.com/longhorn/go-spdk-helper v0.0.0-20240723045100-85e5737da4b5
github.com/longhorn/longhorn-engine v1.7.0-rc1
github.com/longhorn/longhorn-instance-manager v1.7.0-rc1
github.com/longhorn/longhorn-instance-manager v1.7.0-rc1.0.20240724055354-86e7fa84ef3c
github.com/longhorn/longhorn-share-manager v1.7.0-rc1
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.18.0
Expand Down Expand Up @@ -103,7 +103,7 @@ require (
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/pprof v0.0.0-20231023181126-ff6d637d2a7b // indirect
github.com/jonboulle/clockwork v0.4.0 // indirect
github.com/longhorn/types v0.0.0-20240706151541-33cb010c3544 // indirect
github.com/longhorn/types v0.0.0-20240723142222-56701c990023 // indirect
github.com/mitchellh/go-ps v1.0.0 // indirect
github.com/moby/term v0.0.0-20221205130635-1aeaba878587 // indirect
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
Expand Down
12 changes: 6 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1228,16 +1228,16 @@ github.com/longhorn/go-common-libs v0.0.0-20240720044518-32fc527fe868 h1:B1L6EGH
github.com/longhorn/go-common-libs v0.0.0-20240720044518-32fc527fe868/go.mod h1:o01gaAiKE5NCd8+5i6csJSU4ELlW0Yn8GQ9U7pbXG9w=
github.com/longhorn/go-iscsi-helper v0.0.0-20240720064937-c6ce82d67032 h1:93y5fe2y/TeP58k6VDsVrGHTy+dbaJ28DaNRGhrnYs8=
github.com/longhorn/go-iscsi-helper v0.0.0-20240720064937-c6ce82d67032/go.mod h1:r72HLu6d9YRW5u5ovmw8IkUFQoy9zHObx8yn9DCFp/w=
github.com/longhorn/go-spdk-helper v0.0.0-20240720064915-d2ce0846d2a7 h1:bwieBy2bUVi6sxFi2UbpXt292RykALG31yuChEYo66w=
github.com/longhorn/go-spdk-helper v0.0.0-20240720064915-d2ce0846d2a7/go.mod h1:BxsYHvCYn/1rfohn5021Wt8tFYQb+unq1jItx8+htHY=
github.com/longhorn/go-spdk-helper v0.0.0-20240723045100-85e5737da4b5 h1:zz0gXSR3pjZ+iBPym+rfpsu+tMGd5TfgOsgvue867cc=
github.com/longhorn/go-spdk-helper v0.0.0-20240723045100-85e5737da4b5/go.mod h1:lt/eCeOltUM9QbMxS6qDJRr/+wvavxbLGTzkKrqyaSI=
github.com/longhorn/longhorn-engine v1.7.0-rc1 h1:kQ4BwCvBemWsQvVnLdRsbiCXR3zmQBMA1J9Ezysh2q0=
github.com/longhorn/longhorn-engine v1.7.0-rc1/go.mod h1:u0TZ1221YusDYA+ExdVLjLid1Ps6JuJXgh9185l5D9Y=
github.com/longhorn/longhorn-instance-manager v1.7.0-rc1 h1:9hugpEQEmK6tMa1zY87GX1xrdq1Wb3TDqNl5LQzCWhM=
github.com/longhorn/longhorn-instance-manager v1.7.0-rc1/go.mod h1:HIo3UCiH81EtTPwOujkKZIIQbvmJ1A3OeRHShHlAEV0=
github.com/longhorn/longhorn-instance-manager v1.7.0-rc1.0.20240724055354-86e7fa84ef3c h1:MxpZfeUWHvUfZ3jyI3wqDyoVk46qGdxTP37zhYABKaE=
github.com/longhorn/longhorn-instance-manager v1.7.0-rc1.0.20240724055354-86e7fa84ef3c/go.mod h1:gLVS19NOiRcfxJoGBTVVdY6TntbXi/Ks6nWofoJZQsM=
github.com/longhorn/longhorn-share-manager v1.7.0-rc1 h1:LsSkSajhG8tCfORKKfwK+8XHVrT/8rI9DRWb7fuoVls=
github.com/longhorn/longhorn-share-manager v1.7.0-rc1/go.mod h1:R6+NscPU4lAV5ueO7//lBCAO3en0aDbZi5KkkOSUJvk=
github.com/longhorn/types v0.0.0-20240706151541-33cb010c3544 h1:U08l+0SbxCsododsraBHB5PdXrQme3TEh9iaREhRLQs=
github.com/longhorn/types v0.0.0-20240706151541-33cb010c3544/go.mod h1:KlJuZB8NfHchWshYxYgV9pPIxBKC04Vq05G2TfgMf7w=
github.com/longhorn/types v0.0.0-20240723142222-56701c990023 h1:6BYl+2rZhXjYZiBeLnWZPUgOiEoXGFA9CjcIfETdP/o=
github.com/longhorn/types v0.0.0-20240723142222-56701c990023/go.mod h1:KlJuZB8NfHchWshYxYgV9pPIxBKC04Vq05G2TfgMf7w=
github.com/lyft/protoc-gen-star v0.6.0/go.mod h1:TGAoBVkt8w7MPG72TrKIu85MIdXwDuzJYeZuUPFPNwA=
github.com/lyft/protoc-gen-star v0.6.1/go.mod h1:TGAoBVkt8w7MPG72TrKIu85MIdXwDuzJYeZuUPFPNwA=
github.com/lyft/protoc-gen-star/v2 v2.0.1/go.mod h1:RcCdONR2ScXaYnQC5tUzxzlpA3WVYF7/opLeUgcQs/o=
Expand Down
10 changes: 10 additions & 0 deletions scheduler/replica_scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,11 @@ func filterActiveReplicas(replicas map[string]*longhorn.Replica) map[string]*lon
}

func (rcs *ReplicaScheduler) CheckAndReuseFailedReplica(replicas map[string]*longhorn.Replica, volume *longhorn.Volume, hardNodeAffinity string) (*longhorn.Replica, error) {
// TODO: Remove it once we can reuse failed replicas during v2 rebuilding
if types.IsDataEngineV2(volume.Spec.DataEngine) {
return nil, nil
}

replicas = filterActiveReplicas(replicas)

allNodesInfo, err := rcs.getNodeInfo()
Expand Down Expand Up @@ -654,6 +659,11 @@ func (rcs *ReplicaScheduler) RequireNewReplica(replicas map[string]*longhorn.Rep
return 0
}

// TODO: Remove it once we can reuse failed replicas during v2 rebuilding
if types.IsDataEngineV2(volume.Spec.DataEngine) {
return 0
}

timeUntilNext, timeOfNext, err := rcs.timeToReplacementReplica(volume)
if err != nil {
msg := "Failed to get time until replica replacement, will directly replenish a new replica"
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading