From b4ccf749f4fc2beb0ef9b41977bd6c066fdc4992 Mon Sep 17 00:00:00 2001 From: Martin Necas Date: Wed, 11 Dec 2024 13:31:55 +0100 Subject: [PATCH] Move waiting for tasks to separate phase to unblock process Issue: When the Forklift creates the snapshot we need to wait for the task to finish. Right now we are using task.WaitForResult, this causes the whole process to wait for the snapshot creation and blocks other VM migrations. Same problem we have with snapshot removal, if the ESXi host is busy and we start the snapshot removal the snapshots can take longer than the reconcile cycle (3s) and we can fail due to it. Fix: Instead of using the task.WaitForResult the forklift will start querying for the latest tasks per VM, by default it's 10 tasks. This querying will be done in a separate phase then the creation/deletion. So we will have WaitFor phases for each of the object manipulations. We find the specific task for the creation/deletion and check its status. This has the advantage that we are not only getting the status of the task but in addition also the results of the task, so we can propagate them to the user, in case the creation/deletion fails. Ref: - https://issues.redhat.com/browse/MTV-1753 - https://issues.redhat.com/browse/MTV-1775 Signed-off-by: Martin Necas --- .../forkliftcontroller/defaults/main.yml | 1 - .../controller/deployment-controller.yml.j2 | 4 - pkg/controller/plan/adapter/base/doc.go | 4 +- pkg/controller/plan/adapter/ocp/client.go | 7 +- .../plan/adapter/openstack/client.go | 7 +- pkg/controller/plan/adapter/ova/client.go | 7 +- pkg/controller/plan/adapter/ovirt/client.go | 7 +- .../plan/adapter/vsphere/BUILD.bazel | 1 + pkg/controller/plan/adapter/vsphere/client.go | 87 +++++++++++++++---- pkg/controller/plan/migration.go | 39 ++++----- pkg/settings/migration.go | 44 ++++------ 11 files changed, 135 insertions(+), 73 deletions(-) diff --git a/operator/roles/forkliftcontroller/defaults/main.yml b/operator/roles/forkliftcontroller/defaults/main.yml index 86f7c8b39..92db25c0f 100644 --- a/operator/roles/forkliftcontroller/defaults/main.yml +++ b/operator/roles/forkliftcontroller/defaults/main.yml @@ -35,7 +35,6 @@ controller_snapshot_removal_timeout_minuts: 120 controller_snapshot_status_check_rate_seconds: 10 controller_cleanup_retries: 10 controller_dv_status_check_retries: 10 -controller_snapshot_removal_check_retries: 20 controller_vsphere_incremental_backup: true controller_ovirt_warm_migration: true controller_retain_precopy_importer_pods: false diff --git a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 index 28bfbcc77..84cc6d480 100644 --- a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 +++ b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 @@ -85,10 +85,6 @@ spec: - name: DV_STATUS_CHECK_RETRIES value: "{{ controller_dv_status_check_retries }}" {% endif %} -{% if controller_snapshot_removal_check_retries is number %} - - name: SNAPSHOT_REMOVAL_CHECK_RETRIES - value: "{{ controller_snapshot_removal_check_retries }}" -{% endif %} {% if controller_max_vm_inflight is number %} - name: MAX_VM_INFLIGHT value: "{{ controller_max_vm_inflight }}" diff --git a/pkg/controller/plan/adapter/base/doc.go b/pkg/controller/plan/adapter/base/doc.go index 0900f3397..c08f75464 100644 --- a/pkg/controller/plan/adapter/base/doc.go +++ b/pkg/controller/plan/adapter/base/doc.go @@ -108,7 +108,9 @@ type Client interface { // Remove a snapshot. RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) error // Check if a snapshot is ready to transfer. - CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) + CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) + // Check if a snapshot is removed. + CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (ready bool, err error) // Set DataVolume checkpoints. SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) (err error) // Close connections to the provider API. diff --git a/pkg/controller/plan/adapter/ocp/client.go b/pkg/controller/plan/adapter/ocp/client.go index c17cf49a8..951264857 100644 --- a/pkg/controller/plan/adapter/ocp/client.go +++ b/pkg/controller/plan/adapter/ocp/client.go @@ -25,7 +25,12 @@ type Client struct { } // CheckSnapshotReady implements base.Client -func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (bool, error) { +func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) { + return +} + +// CheckSnapshotRemoved implements base.Client +func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) { return false, nil } diff --git a/pkg/controller/plan/adapter/openstack/client.go b/pkg/controller/plan/adapter/openstack/client.go index e8e7a89da..6291a095c 100644 --- a/pkg/controller/plan/adapter/openstack/client.go +++ b/pkg/controller/plan/adapter/openstack/client.go @@ -115,10 +115,15 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (imageI } // Check if a snapshot is ready to transfer. -func (r *Client) CheckSnapshotReady(vmRef ref.Ref, imageID string) (ready bool, err error) { +func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) { return } +// CheckSnapshotRemoved implements base.Client +func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) { + return false, nil +} + // Set DataVolume checkpoints. func (r *Client) SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) error { return nil diff --git a/pkg/controller/plan/adapter/ova/client.go b/pkg/controller/plan/adapter/ova/client.go index c1ef2838d..c0a676e1c 100644 --- a/pkg/controller/plan/adapter/ova/client.go +++ b/pkg/controller/plan/adapter/ova/client.go @@ -61,10 +61,15 @@ func (r *Client) GetSnapshotDeltas(vmRef ref.Ref, snapshot string, hostsFunc uti } // Check if a snapshot is ready to transfer, to avoid importer restarts. -func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) { +func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) { return } +// CheckSnapshotRemoved implements base.Client +func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) { + return false, nil +} + // Set DataVolume checkpoints. func (r *Client) SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) (err error) { return diff --git a/pkg/controller/plan/adapter/ovirt/client.go b/pkg/controller/plan/adapter/ovirt/client.go index 4a7a7fec5..11d404bdc 100644 --- a/pkg/controller/plan/adapter/ovirt/client.go +++ b/pkg/controller/plan/adapter/ovirt/client.go @@ -75,7 +75,7 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapsh } // Check if a snapshot is ready to transfer, to avoid importer restarts. -func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) { +func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) { correlationID, err := r.getSnapshotCorrelationID(vmRef, &snapshot) if err != nil { err = liberr.Wrap(err) @@ -104,6 +104,11 @@ func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, return } +// CheckSnapshotRemoved implements base.Client +func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) { + return false, nil +} + // Remove a VM snapshot. No-op for this provider. func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (err error) { return diff --git a/pkg/controller/plan/adapter/vsphere/BUILD.bazel b/pkg/controller/plan/adapter/vsphere/BUILD.bazel index c744b9e7c..68d93a55a 100644 --- a/pkg/controller/plan/adapter/vsphere/BUILD.bazel +++ b/pkg/controller/plan/adapter/vsphere/BUILD.bazel @@ -36,6 +36,7 @@ go_library( "//vendor/github.com/vmware/govmomi/find", "//vendor/github.com/vmware/govmomi/object", "//vendor/github.com/vmware/govmomi/session", + "//vendor/github.com/vmware/govmomi/task", "//vendor/github.com/vmware/govmomi/vim25", "//vendor/github.com/vmware/govmomi/vim25/mo", "//vendor/github.com/vmware/govmomi/vim25/soap", diff --git a/pkg/controller/plan/adapter/vsphere/client.go b/pkg/controller/plan/adapter/vsphere/client.go index a48af1af5..6715f1a72 100644 --- a/pkg/controller/plan/adapter/vsphere/client.go +++ b/pkg/controller/plan/adapter/vsphere/client.go @@ -16,6 +16,7 @@ import ( "github.com/vmware/govmomi" "github.com/vmware/govmomi/object" "github.com/vmware/govmomi/session" + "github.com/vmware/govmomi/task" "github.com/vmware/govmomi/vim25" "github.com/vmware/govmomi/vim25/mo" "github.com/vmware/govmomi/vim25/soap" @@ -27,8 +28,11 @@ import ( ) const ( - snapshotName = "forklift-migration-precopy" - snapshotDesc = "Forklift Operator warm migration precopy" + snapshotName = "forklift-migration-precopy" + snapshotDesc = "Forklift Operator warm migration precopy" + VirtualMachine = "VirtualMachine" + CreateSnapshotTask = "CreateSnapshot_Task" + RemoveSnapshotTask = "RemoveSnapshot_Task" ) // vSphere VM Client @@ -39,9 +43,9 @@ type Client struct { } // Create a VM snapshot and return its ID. -func (r *Client) CreateSnapshot(vmRef ref.Ref, hosts util.HostsFunc) (id string, err error) { +func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (id string, err error) { r.Log.V(1).Info("Creating snapshot", "vmRef", vmRef) - vm, err := r.getVM(vmRef, hosts) + vm, err := r.getVM(vmRef, hostsFunc) if err != nil { return } @@ -50,28 +54,81 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hosts util.HostsFunc) (id string, err = liberr.Wrap(err) return } - res, err := task.WaitForResult(context.TODO(), nil) + return task.Common.Reference().Value, nil +} + +// Check if a snapshot is ready to transfer. +func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) { + taskInfo, err := r.getLatestTaskByName(vmRef, CreateSnapshotTask) if err != nil { - err = liberr.Wrap(err) - return + return false, "", liberr.Wrap(err) } - id = res.Result.(types.ManagedObjectReference).Value - r.Log.Info("Created snapshot", "vmRef", vmRef, "id", id) + ready, err = r.checkTaskStatus(taskInfo) + if err != nil { + return false, "", liberr.Wrap(err) + } + if ready { + return true, taskInfo.Result.(types.ManagedObjectReference).Value, nil + } else { + // The snapshot is not ready, retry the check + return false, "", nil + } +} - return +// Check if a snapshot is removed. +func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (ready bool, err error) { + taskInfo, err := r.getLatestTaskByName(vmRef, RemoveSnapshotTask) + if err != nil { + return false, liberr.Wrap(err) + } + return r.checkTaskStatus(taskInfo) } -// Check if a snapshot is ready to transfer. -func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) { - return true, nil +func (r *Client) checkTaskStatus(taskInfo *types.TaskInfo) (ready bool, err error) { + r.Log.Info("Snapshot task", "task", taskInfo.Task.Value, "name", taskInfo.Name, "status", taskInfo.State) + switch taskInfo.State { + case types.TaskInfoStateSuccess: + return true, nil + case types.TaskInfoStateError: + return false, fmt.Errorf(taskInfo.Error.LocalizedMessage) + default: + return false, nil + } +} + +func (r *Client) getLatestTaskByName(vmRef ref.Ref, taskName string) (*types.TaskInfo, error) { + taskManager := task.NewManager(r.client.Client) + taskCollector, err := taskManager.CreateCollectorForTasks(context.TODO(), types.TaskFilterSpec{ + Entity: &types.TaskFilterSpecByEntity{ + Entity: types.ManagedObjectReference{ + Type: VirtualMachine, + Value: vmRef.ID, + }, + Recursion: types.TaskFilterSpecRecursionOptionSelf, + }, + }) + if err != nil { + return nil, err + } + defer taskCollector.Destroy(context.Background()) + tasks, err := taskCollector.LatestPage(context.TODO()) + if err != nil { + return nil, err + } + for _, taskInfo := range tasks { + if taskInfo.Name == taskName { + return &taskInfo, nil + } + } + return nil, fmt.Errorf("no task found with name %s, vmRef %v", taskName, vmRef) } // Remove a VM snapshot. -func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hosts util.HostsFunc) (err error) { +func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (err error) { r.Log.V(1).Info("RemoveSnapshot", "vmRef", vmRef, "snapshot", snapshot) - err = r.removeSnapshot(vmRef, snapshot, false, hosts) + err = r.removeSnapshot(vmRef, snapshot, false, hostsFunc) return } diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 979afe3bd..47440a382 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -108,7 +108,6 @@ const ( TransferCompleted = "Transfer completed." PopulatorPodPrefix = "populate-" DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries" - SnapshotRemovalCheckRetries = "snapshotRemovalCheckRetries" ) var ( @@ -1021,27 +1020,15 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm))) break } - // FIXME: This is just temporary timeout to unblock the migrations which get stuck on issue https://issues.redhat.com/browse/MTV-1753 - // This should be fixed properly by adding the task manager inside the inventory and monitor the task status - // from the main controller. - var retries int - retriesAnnotation := step.Annotations[SnapshotRemovalCheckRetries] - if retriesAnnotation == "" { - step.Annotations[SnapshotRemovalCheckRetries] = "1" - } else { - retries, err = strconv.Atoi(retriesAnnotation) - if err != nil { - step.AddError(err.Error()) - err = nil - break - } - if retries >= settings.Settings.SnapshotRemovalCheckRetries { - vm.Phase = r.next(vm.Phase) - // Reset for next precopy - step.Annotations[SnapshotRemovalCheckRetries] = "1" - } else { - step.Annotations[SnapshotRemovalCheckRetries] = strconv.Itoa(retries + 1) - } + snapshot := vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot + ready, err := r.provider.CheckSnapshotRemoved(vm.Ref, snapshot) + if err != nil { + step.AddError(err.Error()) + err = nil + break + } + if ready { + vm.Phase = r.next(vm.Phase) } case CreateInitialSnapshot, CreateSnapshot, CreateFinalSnapshot: step, found := vm.FindStep(r.step(vm)) @@ -1070,12 +1057,18 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { break } snapshot := vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot - ready, err := r.provider.CheckSnapshotReady(vm.Ref, snapshot) + ready, snapshotId, err := r.provider.CheckSnapshotReady(vm.Ref, snapshot) if err != nil { step.AddError(err.Error()) err = nil break } + // If the provider does not directly create the snapshot, but we need to wait for the snapshot to be created + // We start the creation task in CreateSnapshot, set the task ID as a snapshot id which needs to be replaced + // by the snapshot id after the task finishes. + if snapshotId != "" { + vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot = snapshotId + } if ready { vm.Phase = r.next(vm.Phase) } diff --git a/pkg/settings/migration.go b/pkg/settings/migration.go index 354eadd18..0b8096156 100644 --- a/pkg/settings/migration.go +++ b/pkg/settings/migration.go @@ -12,26 +12,25 @@ import ( // Environment variables. const ( - MaxVmInFlight = "MAX_VM_INFLIGHT" - HookRetry = "HOOK_RETRY" - ImporterRetry = "IMPORTER_RETRY" - VirtV2vImage = "VIRT_V2V_IMAGE" - PrecopyInterval = "PRECOPY_INTERVAL" - VirtV2vDontRequestKVM = "VIRT_V2V_DONT_REQUEST_KVM" - SnapshotRemovalTimeout = "SNAPSHOT_REMOVAL_TIMEOUT" - SnapshotStatusCheckRate = "SNAPSHOT_STATUS_CHECK_RATE" - CDIExportTokenTTL = "CDI_EXPORT_TOKEN_TTL" - FileSystemOverhead = "FILESYSTEM_OVERHEAD" - BlockOverhead = "BLOCK_OVERHEAD" - CleanupRetries = "CLEANUP_RETRIES" - DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES" - SnapshotRemovalCheckRetries = "SNAPSHOT_REMOVAL_CHECK_RETRIES" - OvirtOsConfigMap = "OVIRT_OS_MAP" - VsphereOsConfigMap = "VSPHERE_OS_MAP" - VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP" - VddkJobActiveDeadline = "VDDK_JOB_ACTIVE_DEADLINE" - VirtV2vExtraArgs = "VIRT_V2V_EXTRA_ARGS" - VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP" + MaxVmInFlight = "MAX_VM_INFLIGHT" + HookRetry = "HOOK_RETRY" + ImporterRetry = "IMPORTER_RETRY" + VirtV2vImage = "VIRT_V2V_IMAGE" + PrecopyInterval = "PRECOPY_INTERVAL" + VirtV2vDontRequestKVM = "VIRT_V2V_DONT_REQUEST_KVM" + SnapshotRemovalTimeout = "SNAPSHOT_REMOVAL_TIMEOUT" + SnapshotStatusCheckRate = "SNAPSHOT_STATUS_CHECK_RATE" + CDIExportTokenTTL = "CDI_EXPORT_TOKEN_TTL" + FileSystemOverhead = "FILESYSTEM_OVERHEAD" + BlockOverhead = "BLOCK_OVERHEAD" + CleanupRetries = "CLEANUP_RETRIES" + DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES" + OvirtOsConfigMap = "OVIRT_OS_MAP" + VsphereOsConfigMap = "VSPHERE_OS_MAP" + VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP" + VddkJobActiveDeadline = "VDDK_JOB_ACTIVE_DEADLINE" + VirtV2vExtraArgs = "VIRT_V2V_EXTRA_ARGS" + VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP" ) // Migration settings @@ -62,8 +61,6 @@ type Migration struct { CleanupRetries int // DvStatusCheckRetries retries DvStatusCheckRetries int - // SnapshotRemovalCheckRetries retries - SnapshotRemovalCheckRetries int // oVirt OS config map name OvirtOsConfigMap string // vSphere OS config map name @@ -109,9 +106,6 @@ func (r *Migration) Load() (err error) { if r.DvStatusCheckRetries, err = getPositiveEnvLimit(DvStatusCheckRetries, 10); err != nil { return liberr.Wrap(err) } - if r.SnapshotRemovalCheckRetries, err = getPositiveEnvLimit(SnapshotRemovalCheckRetries, 20); err != nil { - return liberr.Wrap(err) - } if virtV2vImage, ok := os.LookupEnv(VirtV2vImage); ok { r.VirtV2vImage = virtV2vImage } else if Settings.Role.Has(MainRole) {