Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport to 2.7.6 #1254

Merged
merged 2 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions operator/roles/forkliftcontroller/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ controller_precopy_interval: 60
controller_snapshot_removal_timeout_minuts: 120
controller_snapshot_status_check_rate_seconds: 10
controller_cleanup_retries: 10
controller_dv_status_check_retries: 10
controller_snapshot_removal_check_retries: 20
controller_vsphere_incremental_backup: true
controller_ovirt_warm_migration: true
controller_max_vm_inflight: 20
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ spec:
- name: CLEANUP_RETRIES
value: "{{ controller_cleanup_retries }}"
{% endif %}
{% if controller_dv_status_check_retries is number %}
- name: DV_STATUS_CHECK_RETRIES
value: "{{ controller_dv_status_check_retries }}"
{% endif %}
{% if controller_snapshot_removal_check_retries is number %}
- name: SNAPSHOT_REMOVAL_CHECK_RETRIES
value: "{{ controller_snapshot_removal_check_retries }}"
{% endif %}
{% if controller_max_vm_inflight is number %}
- name: MAX_VM_INFLIGHT
value: "{{ controller_max_vm_inflight }}"
Expand Down
166 changes: 130 additions & 36 deletions pkg/controller/plan/migration.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,36 +55,41 @@ var (

// Phases.
const (
Started = "Started"
PreHook = "PreHook"
StorePowerState = "StorePowerState"
PowerOffSource = "PowerOffSource"
WaitForPowerOff = "WaitForPowerOff"
CreateDataVolumes = "CreateDataVolumes"
CreateVM = "CreateVM"
CopyDisks = "CopyDisks"
AllocateDisks = "AllocateDisks"
CopyingPaused = "CopyingPaused"
AddCheckpoint = "AddCheckpoint"
AddFinalCheckpoint = "AddFinalCheckpoint"
CreateSnapshot = "CreateSnapshot"
CreateInitialSnapshot = "CreateInitialSnapshot"
CreateFinalSnapshot = "CreateFinalSnapshot"
Finalize = "Finalize"
CreateGuestConversionPod = "CreateGuestConversionPod"
ConvertGuest = "ConvertGuest"
CopyDisksVirtV2V = "CopyDisksVirtV2V"
PostHook = "PostHook"
Completed = "Completed"
WaitForSnapshot = "WaitForSnapshot"
WaitForInitialSnapshot = "WaitForInitialSnapshot"
WaitForFinalSnapshot = "WaitForFinalSnapshot"
ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot"
StoreSnapshotDeltas = "StoreSnapshotDeltas"
StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas"
RemovePreviousSnapshot = "RemovePreviousSnapshot"
RemovePenultimateSnapshot = "RemovePenultimateSnapshot"
RemoveFinalSnapshot = "RemoveFinalSnapshot"
Started = "Started"
PreHook = "PreHook"
StorePowerState = "StorePowerState"
PowerOffSource = "PowerOffSource"
WaitForPowerOff = "WaitForPowerOff"
CreateDataVolumes = "CreateDataVolumes"
WaitForDataVolumesStatus = "WaitForDataVolumesStatus"
WaitForFinalDataVolumesStatus = "WaitForFinalDataVolumesStatus"
CreateVM = "CreateVM"
CopyDisks = "CopyDisks"
AllocateDisks = "AllocateDisks"
CopyingPaused = "CopyingPaused"
AddCheckpoint = "AddCheckpoint"
AddFinalCheckpoint = "AddFinalCheckpoint"
CreateSnapshot = "CreateSnapshot"
CreateInitialSnapshot = "CreateInitialSnapshot"
CreateFinalSnapshot = "CreateFinalSnapshot"
Finalize = "Finalize"
CreateGuestConversionPod = "CreateGuestConversionPod"
ConvertGuest = "ConvertGuest"
CopyDisksVirtV2V = "CopyDisksVirtV2V"
PostHook = "PostHook"
Completed = "Completed"
WaitForSnapshot = "WaitForSnapshot"
WaitForInitialSnapshot = "WaitForInitialSnapshot"
WaitForFinalSnapshot = "WaitForFinalSnapshot"
ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot"
StoreSnapshotDeltas = "StoreSnapshotDeltas"
StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas"
RemovePreviousSnapshot = "RemovePreviousSnapshot"
RemovePenultimateSnapshot = "RemovePenultimateSnapshot"
RemoveFinalSnapshot = "RemoveFinalSnapshot"
WaitForFinalSnapshotRemoval = "WaitForFinalSnapshotRemoval"
WaitForPreviousSnapshotRemoval = "WaitForPreviousSnapshotRemoval"
WaitForPenultimateSnapshotRemoval = "WaitForPenultimateSnapshotRemoval"
)

// Steps.
Expand All @@ -100,8 +105,10 @@ const (
)

const (
TransferCompleted = "Transfer completed."
PopulatorPodPrefix = "populate-"
TransferCompleted = "Transfer completed."
PopulatorPodPrefix = "populate-"
DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries"
SnapshotRemovalCheckRetries = "snapshotRemovalCheckRetries"
)

var (
Expand Down Expand Up @@ -134,9 +141,11 @@ var (
{Name: WaitForInitialSnapshot},
{Name: StoreInitialSnapshotDeltas, All: VSphere},
{Name: CreateDataVolumes},
{Name: WaitForDataVolumesStatus},
{Name: CopyDisks},
{Name: CopyingPaused},
{Name: RemovePreviousSnapshot, All: VSphere},
{Name: WaitForPreviousSnapshotRemoval, All: VSphere},
{Name: CreateSnapshot},
{Name: WaitForSnapshot},
{Name: StoreSnapshotDeltas, All: VSphere},
Expand All @@ -145,11 +154,14 @@ var (
{Name: PowerOffSource},
{Name: WaitForPowerOff},
{Name: RemovePenultimateSnapshot, All: VSphere},
{Name: WaitForPenultimateSnapshotRemoval, All: VSphere},
{Name: CreateFinalSnapshot},
{Name: WaitForFinalSnapshot},
{Name: AddFinalCheckpoint},
{Name: WaitForFinalDataVolumesStatus},
{Name: Finalize},
{Name: RemoveFinalSnapshot, All: VSphere},
{Name: WaitForFinalSnapshotRemoval, All: VSphere},
{Name: CreateGuestConversionPod, All: RequiresConversion},
{Name: ConvertGuest, All: RequiresConversion},
{Name: CreateVM},
Expand Down Expand Up @@ -662,9 +674,9 @@ func (r *Migration) step(vm *plan.VMStatus) (step string) {
step = Initialize
case AllocateDisks:
step = DiskAllocation
case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot:
case CopyDisks, CopyingPaused, RemovePreviousSnapshot, WaitForPreviousSnapshotRemoval, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot, WaitForDataVolumesStatus:
step = DiskTransfer
case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot:
case RemovePenultimateSnapshot, WaitForPenultimateSnapshotRemoval, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot, WaitForFinalSnapshotRemoval, WaitForFinalDataVolumesStatus:
step = Cutover
case CreateGuestConversionPod, ConvertGuest:
step = ImageConversion
Expand Down Expand Up @@ -1003,6 +1015,34 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {
break
}
vm.Phase = r.next(vm.Phase)
case WaitForPreviousSnapshotRemoval, WaitForPenultimateSnapshotRemoval, WaitForFinalSnapshotRemoval:
step, found := vm.FindStep(r.step(vm))
if !found {
vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm)))
break
}
// FIXME: This is only a temporary timeout to unblock migrations that get stuck because of https://issues.redhat.com/browse/MTV-1753.
// The proper fix is to add a task manager to the inventory and to monitor the task status
// from the main controller.
var retries int
retriesAnnotation := step.Annotations[SnapshotRemovalCheckRetries]
if retriesAnnotation == "" {
step.Annotations[SnapshotRemovalCheckRetries] = "1"
} else {
retries, err = strconv.Atoi(retriesAnnotation)
if err != nil {
step.AddError(err.Error())
err = nil
break
}
if retries >= settings.Settings.SnapshotRemovalCheckRetries {
vm.Phase = r.next(vm.Phase)
// Reset for next precopy
step.Annotations[SnapshotRemovalCheckRetries] = "1"
} else {
step.Annotations[SnapshotRemovalCheckRetries] = strconv.Itoa(retries + 1)
}
}
case CreateInitialSnapshot, CreateSnapshot, CreateFinalSnapshot:
step, found := vm.FindStep(r.step(vm))
if !found {
Expand Down Expand Up @@ -1039,6 +1079,51 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {
if ready {
vm.Phase = r.next(vm.Phase)
}
case WaitForDataVolumesStatus, WaitForFinalDataVolumesStatus:
step, found := vm.FindStep(r.step(vm))
if !found {
vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm)))
break
}

dvs, err := r.kubevirt.getDVs(vm)
if err != nil {
step.AddError(err.Error())
err = nil
break
}
if !r.hasPausedDv(dvs) {
vm.Phase = r.next(vm.Phase)
// Reset for next precopy
step.Annotations[DvStatusCheckRetriesAnnotation] = "1"
} else {
var retries int
retriesAnnotation := step.Annotations[DvStatusCheckRetriesAnnotation]
if retriesAnnotation == "" {
step.Annotations[DvStatusCheckRetriesAnnotation] = "1"
} else {
retries, err = strconv.Atoi(retriesAnnotation)
if err != nil {
step.AddError(err.Error())
err = nil
break
}
if retries >= settings.Settings.DvStatusCheckRetries {
// Do not fail the step: this can happen when the user runs a warm migration but the VM is already shut down.
// In that case we don't create any delta and don't change the CDI DV status.
r.Log.Info(
"DataVolume status check exceeded the retry limit."+
"If this causes the problems with the snapshot removal in the CDI please bump the controller_dv_status_check_retries.",
"vm",
vm.String())
vm.Phase = r.next(vm.Phase)
// Reset for next precopy
step.Annotations[DvStatusCheckRetriesAnnotation] = "1"
} else {
step.Annotations[DvStatusCheckRetriesAnnotation] = strconv.Itoa(retries + 1)
}
}
}
case StoreInitialSnapshotDeltas, StoreSnapshotDeltas:
step, found := vm.FindStep(r.step(vm))
if !found {
Expand Down Expand Up @@ -1073,9 +1158,9 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {

switch vm.Phase {
case AddCheckpoint:
vm.Phase = CopyDisks
vm.Phase = WaitForDataVolumesStatus
case AddFinalCheckpoint:
vm.Phase = Finalize
vm.Phase = WaitForFinalDataVolumesStatus
}
case StorePowerState:
step, found := vm.FindStep(r.step(vm))
Expand Down Expand Up @@ -1259,6 +1344,15 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {
return
}

// hasPausedDv reports whether at least one of the supplied DataVolumes is
// currently in the Paused phase. An empty or nil slice yields false.
func (r *Migration) hasPausedDv(dvs []ExtendedDataVolume) bool {
	for idx := 0; idx < len(dvs); idx++ {
		// Stop at the first paused volume; the remaining ones do not matter.
		if phase := dvs[idx].Status.Phase; phase == Paused {
			return true
		}
	}
	return false
}

func (r *Migration) resetPrecopyTasks(vm *plan.VMStatus, step *plan.Step) {
step.Completed = nil
for _, task := range step.Tasks {
Expand Down
48 changes: 30 additions & 18 deletions pkg/settings/migration.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,26 @@ import (

// Environment variables.
const (
MaxVmInFlight = "MAX_VM_INFLIGHT"
HookRetry = "HOOK_RETRY"
ImporterRetry = "IMPORTER_RETRY"
VirtV2vImage = "VIRT_V2V_IMAGE"
PrecopyInterval = "PRECOPY_INTERVAL"
VirtV2vDontRequestKVM = "VIRT_V2V_DONT_REQUEST_KVM"
SnapshotRemovalTimeout = "SNAPSHOT_REMOVAL_TIMEOUT"
SnapshotStatusCheckRate = "SNAPSHOT_STATUS_CHECK_RATE"
CDIExportTokenTTL = "CDI_EXPORT_TOKEN_TTL"
FileSystemOverhead = "FILESYSTEM_OVERHEAD"
BlockOverhead = "BLOCK_OVERHEAD"
CleanupRetries = "CLEANUP_RETRIES"
OvirtOsConfigMap = "OVIRT_OS_MAP"
VsphereOsConfigMap = "VSPHERE_OS_MAP"
VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP"
VddkJobActiveDeadline = "VDDK_JOB_ACTIVE_DEADLINE"
VirtV2vExtraArgs = "VIRT_V2V_EXTRA_ARGS"
VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP"
MaxVmInFlight = "MAX_VM_INFLIGHT"
HookRetry = "HOOK_RETRY"
ImporterRetry = "IMPORTER_RETRY"
VirtV2vImage = "VIRT_V2V_IMAGE"
PrecopyInterval = "PRECOPY_INTERVAL"
VirtV2vDontRequestKVM = "VIRT_V2V_DONT_REQUEST_KVM"
SnapshotRemovalTimeout = "SNAPSHOT_REMOVAL_TIMEOUT"
SnapshotStatusCheckRate = "SNAPSHOT_STATUS_CHECK_RATE"
CDIExportTokenTTL = "CDI_EXPORT_TOKEN_TTL"
FileSystemOverhead = "FILESYSTEM_OVERHEAD"
BlockOverhead = "BLOCK_OVERHEAD"
CleanupRetries = "CLEANUP_RETRIES"
DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES"
SnapshotRemovalCheckRetries = "SNAPSHOT_REMOVAL_CHECK_RETRIES"
OvirtOsConfigMap = "OVIRT_OS_MAP"
VsphereOsConfigMap = "VSPHERE_OS_MAP"
VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP"
VddkJobActiveDeadline = "VDDK_JOB_ACTIVE_DEADLINE"
VirtV2vExtraArgs = "VIRT_V2V_EXTRA_ARGS"
VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP"
)

// Migration settings
Expand Down Expand Up @@ -58,6 +60,10 @@ type Migration struct {
BlockOverhead int64
// Cleanup retries
CleanupRetries int
// Number of DataVolume status checks to attempt before moving on to the next phase
DvStatusCheckRetries int
// Number of snapshot removal checks to attempt before moving on to the next phase
SnapshotRemovalCheckRetries int
// oVirt OS config map name
OvirtOsConfigMap string
// vSphere OS config map name
Expand Down Expand Up @@ -100,6 +106,12 @@ func (r *Migration) Load() (err error) {
if r.CleanupRetries, err = getPositiveEnvLimit(CleanupRetries, 10); err != nil {
return liberr.Wrap(err)
}
if r.DvStatusCheckRetries, err = getPositiveEnvLimit(DvStatusCheckRetries, 10); err != nil {
return liberr.Wrap(err)
}
if r.SnapshotRemovalCheckRetries, err = getPositiveEnvLimit(SnapshotRemovalCheckRetries, 20); err != nil {
return liberr.Wrap(err)
}
if virtV2vImage, ok := os.LookupEnv(VirtV2vImage); ok {
r.VirtV2vImage = virtV2vImage
} else if Settings.Role.Has(MainRole) {
Expand Down
Loading