From 9ba91f391f95339fbcf8163b1d247ee3d50bbe8e Mon Sep 17 00:00:00 2001 From: Roelof Kuijpers Date: Wed, 9 Apr 2025 09:52:35 +0200 Subject: [PATCH 1/8] fix: stuck at 'Progressing' #15317 Signed-off-by: Roelof Kuijpers --- pkg/health/health_pod.go | 34 +++++--- ...-restart-never-with-ignore-annotation.yaml | 79 +++++++++++++++++++ 2 files changed, 103 insertions(+), 10 deletions(-) create mode 100644 pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index 9ebcef558..c106a1f5e 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -12,6 +12,10 @@ import ( "github.com/argoproj/gitops-engine/pkg/utils/kube" ) +const ( + AnnotationIgnoreRestartPolicy = "argocd.argoproj.io/ignore-restart-policy" +) + func getPodHealth(obj *unstructured.Unstructured) (*HealthStatus, error) { gvk := obj.GroupVersionKind() switch gvk { @@ -93,9 +97,9 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { } return &HealthStatus{Status: HealthStatusDegraded, Message: ""}, nil + case corev1.PodRunning: - switch pod.Spec.RestartPolicy { - case corev1.RestartPolicyAlways: + getHealthStatus := func(pod *corev1.Pod) (*HealthStatus, error) { // if pod is ready, it is automatically healthy if podutils.IsPodReady(pod) { return &HealthStatus{ @@ -117,14 +121,24 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { Status: HealthStatusProgressing, Message: pod.Status.Message, }, nil - case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever: - // pods set with a restart policy of OnFailure or Never, have a finite life. - // These pods are typically resource hooks. Thus, we consider these as Progressing - // instead of healthy. - return &HealthStatus{ - Status: HealthStatusProgressing, - Message: pod.Status.Message, - }, nil + } + if _, hook := pod.Annotations[AnnotationIgnoreRestartPolicy]; hook { + return getHealthStatus(pod) + } else { + switch pod.Spec.RestartPolicy { + case corev1.RestartPolicyAlways: + return getHealthStatus(pod) + case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever: + // Most pods set with a restart policy of OnFailure or Never, have a finite life. + // These pods are typically resource hooks. Thus, we consider these as Progressing + // instead of healthy. If this is unwanted, e.g., when the pod is managed by an + // operator and therefore has a restart policy of OnFailure or Never, then use the + // the AnnotationIgnoreRestartPolicy annotation. + return &HealthStatus{ + Status: HealthStatusProgressing, + Message: pod.Status.Message, + }, nil + } } } return &HealthStatus{ diff --git a/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml b/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml new file mode 100644 index 000000000..432473dbd --- /dev/null +++ b/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml @@ -0,0 +1,79 @@ +apiVersion: v1 +kind: Pod +metadata: + creationTimestamp: 2018-12-02T09:15:16Z + name: my-pod + namespace: argocd + resourceVersion: "151053" + selfLink: /api/v1/namespaces/argocd/pods/my-pod + uid: c86e909c-f612-11e8-a057-fe5f49266390 + annotations: + argocd.argoproj.io/ignore-restart-policy: "true" +spec: + containers: + - command: + - sh + - -c + - sleep 10 + image: alpine:latest + imagePullPolicy: Always + name: main + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: default-token-f9jvj + readOnly: true + dnsPolicy: ClusterFirst + nodeName: minikube + restartPolicy: Never + schedulerName: default-scheduler + securityContext: {} + serviceAccount: default + serviceAccountName: default + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - name: default-token-f9jvj + secret: + defaultMode: 420 + secretName: default-token-f9jvj +status: + conditions: + - lastProbeTime: null + lastTransitionTime: 2018-12-02T09:15:16Z + status: "True" + type: Initialized + - lastProbeTime: null + lastTransitionTime: 2018-12-02T09:15:19Z + status: "True" + type: Ready + - lastProbeTime: null + lastTransitionTime: 2018-12-02T09:15:16Z + status: "True" + type: PodScheduled + containerStatuses: + - containerID: docker://acfb261d6c1fe8c543438a202de62cb06c137fa93a2d59262d764470e96f3195 + image: alpine:latest + imageID: docker-pullable://alpine@sha256:621c2f39f8133acb8e64023a94dbdf0d5ca81896102b9e57c0dc184cadaf5528 + lastState: {} + name: main + ready: true + restartCount: 0 + state: + running: + startedAt: 2018-12-02T09:15:19Z + hostIP: 192.168.64.41 + phase: Running + podIP: 172.17.0.9 + qosClass: BestEffort + startTime: 2018-12-02T09:15:16Z From b0a321932629acf3f802e8c2e246ec528ea03b26 Mon Sep 17 00:00:00 2001 From: RoelofKuijpers Date: Wed, 9 Apr 2025 10:22:16 +0200 Subject: [PATCH 2/8] Update health_test.go Signed-off-by: Roelof Kuijpers --- pkg/health/health_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go index ef945eb46..20ddc5f5c 100644 --- a/pkg/health/health_test.go +++ b/pkg/health/health_test.go @@ -103,6 +103,7 @@ func TestPod(t *testing.T) { assertAppHealth(t, "./testdata/pod-error.yaml", HealthStatusDegraded) assertAppHealth(t, "./testdata/pod-running-restart-always.yaml", HealthStatusHealthy) assertAppHealth(t, "./testdata/pod-running-restart-never.yaml", HealthStatusProgressing) + assertAppHealth(t, "./testdata/pod-running-restart-never-with-ignore-annotation.yaml", HealthStatusHealthy) assertAppHealth(t, "./testdata/pod-running-restart-onfailure.yaml", HealthStatusProgressing) assertAppHealth(t, "./testdata/pod-failed.yaml", HealthStatusDegraded) assertAppHealth(t, "./testdata/pod-succeeded.yaml", HealthStatusHealthy) From 07cb6a0dec7bb9581ad516cda751779e7ee4ffed Mon Sep 17 00:00:00 2001 From: RoelofKuijpers Date: Wed, 9 Apr 2025 12:06:40 +0200 Subject: [PATCH 3/8] Update pod-running-restart-never-with-ignore-annotation.yaml so it passes the checks of the Quality Gate Signed-off-by: Roelof Kuijpers --- ...ng-restart-never-with-ignore-annotation.yaml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml b/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml index 432473dbd..4c2b3682a 100644 --- a/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml +++ b/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml @@ -15,10 +15,16 @@ spec: - sh - -c - sleep 10 - image: alpine:latest + image: alpine:3.21 imagePullPolicy: Always name: main - resources: {} + resources: + requests: + memory: "128Mi" + cpu: "250m" + limits: + memory: "256Mi" + cpu: "500m" terminationMessagePath: /dev/termination-log terminationMessagePolicy: File volumeMounts: @@ -32,6 +38,7 @@ spec: securityContext: {} serviceAccount: default serviceAccountName: default + automountServiceAccountToken: false terminationGracePeriodSeconds: 30 tolerations: - effect: NoExecute @@ -62,9 +69,9 @@ status: status: "True" type: PodScheduled containerStatuses: - - containerID: docker://acfb261d6c1fe8c543438a202de62cb06c137fa93a2d59262d764470e96f3195 - image: alpine:latest - imageID: docker-pullable://alpine@sha256:621c2f39f8133acb8e64023a94dbdf0d5ca81896102b9e57c0dc184cadaf5528 + - containerID: containerd://adc73c2c0ae3f1fd9bf294abd834e740042ee375de680c0cfcdd90d863a73b8b + image: alpine:3.21 + imageID: docker.io/library/alpine@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c lastState: {} name: main ready: true From 9a9eb190e9a077a408d25810d210610e8964020e Mon Sep 17 00:00:00 2001 From: Roelof Kuijpers Date: Wed, 9 Apr 2025 13:57:34 +0200 Subject: [PATCH 4/8] add storage request Signed-off-by: Roelof Kuijpers --- .../pod-running-restart-never-with-ignore-annotation.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml b/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml index 4c2b3682a..1cc27e5f1 100644 --- a/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml +++ b/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml @@ -20,6 +20,7 @@ spec: name: main resources: requests: + ephemeral-storage: "100Mi" memory: "128Mi" cpu: "250m" limits: From 75c2fe7edb3d41f634e596402c486b28ebc4125a Mon Sep 17 00:00:00 2001 From: RoelofKuijpers Date: Mon, 21 Apr 2025 18:13:20 +0200 Subject: [PATCH 5/8] Update pkg/health/health_pod.go improve code readability Co-authored-by: sivchari Signed-off-by: Roelof Kuijpers --- pkg/health/health_pod.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index c106a1f5e..7d874f284 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -122,13 +122,12 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { Message: pod.Status.Message, }, nil } - if _, hook := pod.Annotations[AnnotationIgnoreRestartPolicy]; hook { + policy := pod.Spec.RestartPolicy + if _, ok := pod.Annotations[AnnotationIgnoreRestartPolicy]; ok || policy == corev1.RestartPolicyAlways { return getHealthStatus(pod) - } else { - switch pod.Spec.RestartPolicy { - case corev1.RestartPolicyAlways: - return getHealthStatus(pod) - case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever: + } + + if policy == corev1.RestartPolicyOnFailure || policy == corev1.RestartPolicyNever { // Most pods set with a restart policy of OnFailure or Never, have a finite life. // These pods are typically resource hooks. Thus, we consider these as Progressing // instead of healthy. If this is unwanted, e.g., when the pod is managed by an From 2c955b613a6e91c51f4a04862678a5d2051dbe2a Mon Sep 17 00:00:00 2001 From: Roelof Kuijpers Date: Mon, 21 Apr 2025 18:30:38 +0200 Subject: [PATCH 6/8] Improve code readability fix Signed-off-by: Roelof Kuijpers --- pkg/health/health_pod.go | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index 7d874f284..d54c52cb0 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -122,22 +122,21 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { Message: pod.Status.Message, }, nil } - policy := pod.Spec.RestartPolicy + policy := pod.Spec.RestartPolicy if _, ok := pod.Annotations[AnnotationIgnoreRestartPolicy]; ok || policy == corev1.RestartPolicyAlways { return getHealthStatus(pod) } - - if policy == corev1.RestartPolicyOnFailure || policy == corev1.RestartPolicyNever { - // Most pods set with a restart policy of OnFailure or Never, have a finite life. - // These pods are typically resource hooks. Thus, we consider these as Progressing - // instead of healthy. If this is unwanted, e.g., when the pod is managed by an - // operator and therefore has a restart policy of OnFailure or Never, then use the - // the AnnotationIgnoreRestartPolicy annotation. - return &HealthStatus{ - Status: HealthStatusProgressing, - Message: pod.Status.Message, - }, nil - } + + if policy == corev1.RestartPolicyOnFailure || policy == corev1.RestartPolicyNever { + // Most pods set with a restart policy of OnFailure or Never, have a finite life. + // These pods are typically resource hooks. Thus, we consider these as Progressing + // instead of healthy. If this is unwanted, e.g., when the pod is managed by an + // operator and therefore has a restart policy of OnFailure or Never, then use the + // the AnnotationIgnoreRestartPolicy annotation. + return &HealthStatus{ + Status: HealthStatusProgressing, + Message: pod.Status.Message, + }, nil } } return &HealthStatus{ From 8ef020ab8235f540912341d3770d690caf19b3dd Mon Sep 17 00:00:00 2001 From: Roelof Kuijpers Date: Fri, 4 Jul 2025 00:47:56 +0200 Subject: [PATCH 7/8] Move annotation into common package Signed-off-by: Roelof Kuijpers --- pkg/health/health_pod.go | 7 ++----- pkg/sync/common/types.go | 3 +++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index d54c52cb0..cd697d119 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -9,13 +9,10 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/kubectl/pkg/util/podutils" + "github.com/argoproj/gitops-engine/pkg/sync/common" "github.com/argoproj/gitops-engine/pkg/utils/kube" ) -const ( - AnnotationIgnoreRestartPolicy = "argocd.argoproj.io/ignore-restart-policy" -) - func getPodHealth(obj *unstructured.Unstructured) (*HealthStatus, error) { gvk := obj.GroupVersionKind() switch gvk { @@ -123,7 +120,7 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { }, nil } policy := pod.Spec.RestartPolicy - if _, ok := pod.Annotations[AnnotationIgnoreRestartPolicy]; ok || policy == corev1.RestartPolicyAlways { + if _, ok := pod.Annotations[common.AnnotationIgnoreRestartPolicy]; ok || policy == corev1.RestartPolicyAlways { return getHealthStatus(pod) } diff --git a/pkg/sync/common/types.go b/pkg/sync/common/types.go index 002bb23da..5981b9753 100644 --- a/pkg/sync/common/types.go +++ b/pkg/sync/common/types.go @@ -17,6 +17,9 @@ const ( // AnnotationKeyHookDeletePolicy is the policy of deleting a hook AnnotationKeyHookDeletePolicy = "argocd.argoproj.io/hook-delete-policy" AnnotationDeletionApproved = "argocd.argoproj.io/deletion-approved" + // AnnotationIgnoreRestartPolicy ignores restart policy, useful for operator-managed + // pods to be considered healthy + AnnotationIgnoreRestartPolicy = "argocd.argoproj.io/ignore-restart-policy" // Sync option that disables dry run in resource is missing in the cluster SyncOptionSkipDryRunOnMissingResource = "SkipDryRunOnMissingResource=true" From b216058d04d1ab71b34e72a120ba5ca9894307f1 Mon Sep 17 00:00:00 2001 From: Roelof Kuijpers Date: Tue, 29 Jul 2025 14:10:27 +0200 Subject: [PATCH 8/8] Add documentation Signed-off-by: Roelof Kuijpers --- pkg/health/health_pod.go | 2 ++ pkg/sync/doc.go | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index cd697d119..837886554 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -120,6 +120,8 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { }, nil } policy := pod.Spec.RestartPolicy + // If the pod has the AnnotationIgnoreRestartPolicy annotation or its restart policy is Always, + // then treat it as a long-running pod and check its health status. if _, ok := pod.Annotations[common.AnnotationIgnoreRestartPolicy]; ok || policy == corev1.RestartPolicyAlways { return getHealthStatus(pod) } diff --git a/pkg/sync/doc.go b/pkg/sync/doc.go index f4f5d8725..0b6bdbc19 100644 --- a/pkg/sync/doc.go +++ b/pkg/sync/doc.go @@ -65,6 +65,14 @@ The following policies define when the hook will be deleted. - HookFailed - the hook resource is deleted after the hook failed. - BeforeHookCreation - any existing hook resource is deleted before the new one is created +**Pods with restartPolicy Never or OnFailure** + +During synchronization, the application may show the resource health as *Progressing* when it deploys a Pod that has the `restartPolicy` set to `Never` or `OnFailure` (see Kubernetes docs for restart policy). Generally, these resources behave like a Job and are expected to complete. This is intended behavior, since Jobs are commonly used for sync hooks and must finish before an application is considered *Healthy*. + +A workaround is to use the annotation: argocd.argoproj.io/ignore-restart-policy: "true". + +When this annotation is set on the Pod resource, the controller will ignore the `restartPolicy` and consider the Pod *Running* as a valid healthy state. + # Sync Waves The waves allow to group sync execution of syncing process into batches when each batch is executed sequentially one after