Skip to content

Commit e603cd9

Browse files
Merge pull request #30296 from hongkailiu/OTA-1637-reboot
OTA-1637: ClusterOperators should not go Progressing only for a node reboot
2 parents b60bbfe + 7d824bf commit e603cd9

File tree

2 files changed

+201
-10
lines changed

2 files changed

+201
-10
lines changed

pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
4545
isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{})
4646
if isUpgrade {
4747
junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
48+
junits = append(junits, clusterOperatorIsNotProgressingWhenMachineConfigIs(finalIntervals)...)
4849
} else {
4950
junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
5051
}

pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go

Lines changed: 200 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -6,21 +6,21 @@ import (
66
"strings"
77
"time"
88

9-
"github.com/openshift/origin/pkg/monitortestlibrary/utility"
9+
configv1 "github.com/openshift/api/config/v1"
10+
clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
11+
"github.com/sirupsen/logrus"
1012
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13+
"k8s.io/apimachinery/pkg/util/sets"
1114
"k8s.io/client-go/kubernetes"
15+
"k8s.io/client-go/rest"
1216

13-
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
14-
"github.com/sirupsen/logrus"
15-
16-
configv1 "github.com/openshift/api/config/v1"
17-
clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
1817
"github.com/openshift/origin/pkg/monitor/monitorapi"
1918
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
2019
platformidentification2 "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
20+
"github.com/openshift/origin/pkg/monitortestlibrary/utility"
21+
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
2122
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
2223
exutil "github.com/openshift/origin/test/extended/util"
23-
"k8s.io/client-go/rest"
2424
)
2525

2626
// exceptionCallback consumes a suspicious condition and returns an
@@ -516,9 +516,6 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
516516
for _, conditionType := range conditionTypes {
517517
for _, operatorName := range platformidentification.KnownOperators.List() {
518518
bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName)
519-
if bzComponent == "Unknown" {
520-
bzComponent = operatorName
521-
}
522519
testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should not change condition/%v", bzComponent, operatorName, conditionType)
523520
operatorEvents := eventsByOperator[operatorName]
524521
if len(operatorEvents) == 0 {
@@ -586,6 +583,9 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
586583
}
587584

588585
if len(fatal) > 0 || len(excepted) > 0 {
586+
// add a failure so we
587+
// either flake (or pass) in case len(fatal) == 0 by adding a success to the same test
588+
// or fail in case len(fatal) > 0 by leaving the failure as the only output for the test
589589
ret = append(ret, &junitapi.JUnitTestCase{
590590
Name: testName,
591591
Duration: duration,
@@ -597,7 +597,197 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
597597
}
598598

599599
if len(fatal) == 0 {
600+
if len(excepted) > 0 {
601+
// add a success so we flake (or pass) and don't fail
602+
ret = append(ret, &junitapi.JUnitTestCase{Name: testName, SystemOut: "Passing the case to make the overall test case flake as the previous failure is expected"})
603+
} else {
604+
ret = append(ret, &junitapi.JUnitTestCase{Name: testName})
605+
}
606+
}
607+
}
608+
}
609+
610+
return ret
611+
}
612+
613+
func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
614+
var ret []*junitapi.JUnitTestCase
615+
upgradeWindows := getUpgradeWindows(events)
616+
617+
var machineConfigProgressingStart time.Time
618+
var eventsInUpgradeWindows monitorapi.Intervals
619+
620+
var start, stop time.Time
621+
for _, event := range events {
622+
if !isInUpgradeWindow(upgradeWindows, event) {
623+
continue
624+
}
625+
eventsInUpgradeWindows = append(eventsInUpgradeWindows, event)
626+
if start.IsZero() || event.From.Before(start) {
627+
start = event.From
628+
}
629+
if stop.IsZero() || event.To.After(stop) {
630+
stop = event.To
631+
}
632+
}
633+
duration := stop.Sub(start).Seconds()
634+
635+
eventsByOperator := getEventsByOperator(eventsInUpgradeWindows)
636+
for _, mcEvent := range eventsByOperator["machine-config"] {
637+
condition := monitorapi.GetOperatorConditionStatus(mcEvent)
638+
if condition == nil {
639+
continue // ignore non-condition intervals
640+
}
641+
if condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionTrue {
642+
machineConfigProgressingStart = mcEvent.To
643+
break
644+
}
645+
}
646+
647+
mcTestCase := &junitapi.JUnitTestCase{
648+
Name: fmt.Sprintf("[bz-Machine Config Operator] clusteroperator/machine-config must go Progressing=True during an upgrade test"),
649+
Duration: duration,
650+
}
651+
if machineConfigProgressingStart.IsZero() {
652+
mcTestCase.FailureOutput = &junitapi.FailureOutput{
653+
Output: fmt.Sprintf("machine-config was never Progressing=True during the upgrade window from %s to %s", start.Format(time.RFC3339), stop.Format(time.RFC3339)),
654+
}
655+
return []*junitapi.JUnitTestCase{mcTestCase}
656+
} else {
657+
mcTestCase.SystemOut = fmt.Sprintf("machine-config became Progressing=True at %s during the upgrade window from %s to %s", machineConfigProgressingStart.Format(time.RFC3339), start.Format(time.RFC3339), stop.Format(time.RFC3339))
658+
}
659+
ret = append(ret, mcTestCase)
660+
661+
for _, operatorName := range platformidentification.KnownOperators.Difference(sets.NewString("machine-config")).List() {
662+
bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName)
663+
testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should stay Progressing=False while MCO is Progressing=True", bzComponent, operatorName)
664+
operatorEvents := eventsByOperator[operatorName]
665+
if len(operatorEvents) == 0 {
666+
ret = append(ret, &junitapi.JUnitTestCase{
667+
Name: testName,
668+
Duration: duration,
669+
})
670+
continue
671+
}
672+
673+
except := func(co string, reason string) string {
674+
switch co {
675+
case "csi-snapshot-controller":
676+
if reason == "CSISnapshotController_Deploying" {
677+
return "https://issues.redhat.com/browse/OCPBUGS-62624"
678+
}
679+
case "dns":
680+
if reason == "DNSReportsProgressingIsTrue" {
681+
return "https://issues.redhat.com/browse/OCPBUGS-62623"
682+
}
683+
case "image-registry":
684+
if reason == "NodeCADaemonUnavailable::Ready" || reason == "DeploymentNotCompleted" {
685+
return "https://issues.redhat.com/browse/OCPBUGS-62626"
686+
}
687+
case "ingress":
688+
if reason == "Reconciling" {
689+
return "https://issues.redhat.com/browse/OCPBUGS-62627"
690+
}
691+
case "kube-storage-version-migrator":
692+
if reason == "KubeStorageVersionMigrator_Deploying" {
693+
return "https://issues.redhat.com/browse/OCPBUGS-62629"
694+
}
695+
case "network":
696+
if reason == "Deploying" {
697+
return "https://issues.redhat.com/browse/OCPBUGS-62630"
698+
}
699+
case "node-tuning":
700+
if reason == "Reconciling" {
701+
return "https://issues.redhat.com/browse/OCPBUGS-62632"
702+
}
703+
case "openshift-controller-manager":
704+
if reason == "_DesiredStateNotYetAchieved" {
705+
return "https://issues.redhat.com/browse/OCPBUGS-63116"
706+
}
707+
case "service-ca":
708+
if reason == "_ManagedDeploymentsAvailable" {
709+
return "https://issues.redhat.com/browse/OCPBUGS-62633"
710+
}
711+
case "storage":
712+
// GCPPDCSIDriverOperatorCR_GCPPDDriverControllerServiceController_Deploying
713+
// GCPPDCSIDriverOperatorCR_GCPPDDriverNodeServiceController_Deploying
714+
// AWSEBSCSIDriverOperatorCR_AWSEBSDriverNodeServiceController_Deploying
715+
// VolumeDataSourceValidatorDeploymentController_Deploying
716+
if strings.HasSuffix(reason, "Controller_Deploying") ||
717+
reason == "GCPPD_Deploying" {
718+
return "https://issues.redhat.com/browse/OCPBUGS-62634"
719+
}
720+
case "olm":
721+
// CatalogdDeploymentCatalogdControllerManager_Deploying
722+
// OperatorcontrollerDeploymentOperatorControllerControllerManager_Deploying
723+
if strings.HasSuffix(reason, "ControllerManager_Deploying") {
724+
return "https://issues.redhat.com/browse/OCPBUGS-62635"
725+
}
726+
}
727+
return ""
728+
}
729+
730+
var excepted, fatal []string
731+
for _, operatorEvent := range operatorEvents {
732+
if operatorEvent.From.Before(machineConfigProgressingStart) {
733+
continue
734+
}
735+
condition := monitorapi.GetOperatorConditionStatus(operatorEvent)
736+
if condition == nil {
737+
continue // ignore non-condition intervals
738+
}
739+
if condition.Type == "" {
740+
fatal = append(fatal, fmt.Sprintf("failed to convert %v into a condition with a type", operatorEvent))
741+
continue
742+
}
743+
744+
if condition.Type != configv1.OperatorProgressing || condition.Status == configv1.ConditionFalse {
745+
continue
746+
}
747+
748+
// if there was any switch, it was wrong/unexpected at some point
749+
failure := fmt.Sprintf("%v", operatorEvent)
750+
751+
exception := except(operatorName, condition.Reason)
752+
if exception == "" {
753+
fatal = append(fatal, failure)
754+
} else {
755+
excepted = append(excepted, fmt.Sprintf("%s (exception: %s)", failure, exception))
756+
}
757+
}
758+
759+
output := fmt.Sprintf("%d (out of %d) unexpected clusteroperator state transitions while machine-config is progressing during the upgrade window from %s to %s", len(fatal), len(operatorEvents), start.Format(time.RFC3339), stop.Format(time.RFC3339))
760+
if len(fatal) > 0 {
761+
output = fmt.Sprintf("%s. These did not match any known exceptions, so they cause this test-case to fail:\n\n%v\n", output, strings.Join(fatal, "\n"))
762+
} else {
763+
output = fmt.Sprintf("%s, as desired.", output)
764+
}
765+
output = fmt.Sprintf("%s\n%d unwelcome but acceptable clusteroperator state transitions while machine-config is progressing during the upgrade window from %s to %s", output, len(excepted), start.Format(time.RFC3339), stop.Format(time.RFC3339))
766+
if len(excepted) > 0 {
767+
output = fmt.Sprintf("%s. These should not happen, but because they are tied to exceptions, the fact that they did happen is not sufficient to cause this test-case to fail:\n\n%v\n", output, strings.Join(excepted, "\n"))
768+
} else {
769+
output = fmt.Sprintf("%s, as desired.", output)
770+
}
771+
772+
if len(fatal) > 0 || len(excepted) > 0 {
773+
// add a failure so we
774+
// either flake (or pass) in case len(fatal) == 0 by adding a success to the same test
775+
// or fail in case len(fatal) > 0 by leaving the failure as the only output for the test
776+
ret = append(ret, &junitapi.JUnitTestCase{
777+
Name: testName,
778+
Duration: duration,
779+
SystemOut: output,
780+
FailureOutput: &junitapi.FailureOutput{
781+
Output: output,
782+
},
783+
})
784+
}
785+
786+
if len(fatal) == 0 {
787+
if len(excepted) > 0 {
600788
// add a success so we flake (or pass) and don't fail
789+
ret = append(ret, &junitapi.JUnitTestCase{Name: testName, SystemOut: "Passing the case to make the overall test case flake as the previous failure is expected"})
790+
} else {
601791
ret = append(ret, &junitapi.JUnitTestCase{Name: testName})
602792
}
603793
}

0 commit comments

Comments
 (0)