From b69e975128ac9a511542f9f1d245a6d4c3f91112 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 5 Sep 2024 09:15:50 -0700 Subject: [PATCH] fix: Fix spurious transport errors (v1.0.x) (#1642) --- .../karpenter.kwok.sh_kwoknodeclasses.yaml | 9 ++------- kwok/charts/crds/karpenter.sh_nodeclaims.yaml | 20 +++---------------- kwok/charts/crds/karpenter.sh_nodepools.yaml | 20 +++---------------- pkg/apis/crds/karpenter.sh_nodeclaims.yaml | 20 +++---------------- pkg/apis/crds/karpenter.sh_nodepools.yaml | 20 +++---------------- pkg/controllers/disruption/validation.go | 2 +- .../nodeclaim/consistency/controller.go | 3 ++- .../karpenter.test.sh_testnodeclasses.yaml | 9 ++------- pkg/webhooks/webhooks.go | 9 ++++++++- 9 files changed, 27 insertions(+), 85 deletions(-) diff --git a/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml b/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml index 2ed31b19e9..79b9db76fc 100644 --- a/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml +++ b/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.2 name: kwoknodeclasses.karpenter.kwok.sh spec: group: karpenter.kwok.sh @@ -96,12 +96,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/kwok/charts/crds/karpenter.sh_nodeclaims.yaml b/kwok/charts/crds/karpenter.sh_nodeclaims.yaml index e10563ab7b..6e43e5640d 100644 --- a/kwok/charts/crds/karpenter.sh_nodeclaims.yaml +++ b/kwok/charts/crds/karpenter.sh_nodeclaims.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.2 name: nodeclaims.karpenter.sh spec: group: karpenter.sh @@ -262,19 +262,15 @@ spec: description: |- TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. - Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. - Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. - The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained. pattern: ^([0-9]+(s|m|h))+$ @@ -350,12 +346,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -798,12 +789,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/kwok/charts/crds/karpenter.sh_nodepools.yaml b/kwok/charts/crds/karpenter.sh_nodepools.yaml index cbfcf0ab42..53e868df8f 100644 --- a/kwok/charts/crds/karpenter.sh_nodepools.yaml +++ b/kwok/charts/crds/karpenter.sh_nodepools.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.2 name: nodepools.karpenter.sh spec: group: karpenter.sh @@ -390,19 +390,15 @@ spec: description: |- TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. - Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. - Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. - The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained. pattern: ^([0-9]+(s|m|h))+$ @@ -474,12 +470,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -1043,12 +1034,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml index 5f5bff9ae0..4e48290c73 100644 --- a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml +++ b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.2 name: nodeclaims.karpenter.sh spec: group: karpenter.sh @@ -260,19 +260,15 @@ spec: description: |- TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. - Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. - Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. - The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained. pattern: ^([0-9]+(s|m|h))+$ @@ -348,12 +344,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -794,12 +785,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index cd2b99ed11..f186c4c5f7 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.2 name: nodepools.karpenter.sh spec: group: karpenter.sh @@ -388,19 +388,15 @@ spec: description: |- TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. - Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. - Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. - The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained. pattern: ^([0-9]+(s|m|h))+$ @@ -472,12 +468,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -1039,12 +1030,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/pkg/controllers/disruption/validation.go b/pkg/controllers/disruption/validation.go index bcf47a996f..49004b2ecc 100644 --- a/pkg/controllers/disruption/validation.go +++ b/pkg/controllers/disruption/validation.go @@ -163,7 +163,7 @@ func (v *Validation) ValidateCommand(ctx context.Context, cmd Command, candidate return fmt.Errorf("simluating scheduling, %w", err) } if !results.AllNonPendingPodsScheduled() { - return NewValidationError(fmt.Errorf(results.NonPendingPodSchedulingErrors())) + return NewValidationError(errors.New(results.NonPendingPodSchedulingErrors())) } // We want to ensure that the re-simulated scheduling using the current cluster state produces the same result. diff --git a/pkg/controllers/nodeclaim/consistency/controller.go b/pkg/controllers/nodeclaim/consistency/controller.go index 42a2165b46..fd818d4b74 100644 --- a/pkg/controllers/nodeclaim/consistency/controller.go +++ b/pkg/controllers/nodeclaim/consistency/controller.go @@ -18,6 +18,7 @@ package consistency import ( "context" + stderrors "errors" "fmt" "time" @@ -122,7 +123,7 @@ func (c *Controller) checkConsistency(ctx context.Context, nodeClaim *v1.NodeCla return fmt.Errorf("checking node with %T, %w", check, err) } for _, issue := range issues { - log.FromContext(ctx).Error(fmt.Errorf(string(issue)), "consistency error") + log.FromContext(ctx).Error(stderrors.New(string(issue)), "consistency error") c.recorder.Publish(FailedConsistencyCheckEvent(nodeClaim, string(issue))) } hasIssues = hasIssues || (len(issues) > 0) diff --git a/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml b/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml index 3907c2cafd..fb3c15c292 100644 --- a/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml +++ b/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.2 name: testnodeclasses.karpenter.test.sh spec: group: karpenter.test.sh @@ -84,12 +84,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/pkg/webhooks/webhooks.go b/pkg/webhooks/webhooks.go index 5d936c2318..af8c5a34a6 100644 --- a/pkg/webhooks/webhooks.go +++ b/pkg/webhooks/webhooks.go @@ -18,6 +18,7 @@ package webhooks import ( "context" + "crypto/tls" "errors" "fmt" "io" @@ -157,10 +158,16 @@ func Start(ctx context.Context, cfg *rest.Config, ctors ...knativeinjection.Cont } func HealthProbe(ctx context.Context) healthz.Checker { + // Create new transport that doesn't validate the TLS certificate + // This transport is just polling so validating the server certificate isn't necessary + transport := http.DefaultTransport.(*http.Transport).Clone() + transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true} // nolint:gosec + client := &http.Client{Transport: transport} + // TODO: Add knative health check port for webhooks when health port can be configured // Issue: https://github.com/knative/pkg/issues/2765 return func(req *http.Request) (err error) { - res, err := http.Get(fmt.Sprintf("http://localhost:%d", options.FromContext(ctx).WebhookPort)) + res, err := client.Get(fmt.Sprintf("https://localhost:%d", options.FromContext(ctx).WebhookPort)) // If the webhook connection errors out, liveness/readiness should fail if err != nil { return err