Skip to content

Commit df0ce2d

Browse files
authored
Merge pull request #6336 from qianlei90/fix-kwok-provider
fix(kwok): prevent quitting when scaling down node group
2 parents d31e1cf + e71a123 commit df0ce2d

File tree

6 files changed

+192
-41
lines changed

6 files changed

+192
-41
lines changed

cluster-autoscaler/cloudprovider/builder/builder_kwok.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ import (
2323
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
2424
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/kwok"
2525
"k8s.io/autoscaler/cluster-autoscaler/config"
26+
27+
"k8s.io/client-go/informers"
2628
)
2729

2830
// AvailableCloudProviders supported by the cloud provider builder.
@@ -33,10 +35,10 @@ var AvailableCloudProviders = []string{
3335
// DefaultCloudProvider for Kwok-only build is Kwok.
3436
const DefaultCloudProvider = cloudprovider.KwokProviderName
3537

36-
func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider {
38+
func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter, informerFactory informers.SharedInformerFactory) cloudprovider.CloudProvider {
3739
switch opts.CloudProviderName {
3840
case cloudprovider.KwokProviderName:
39-
return kwok.BuildKwokCloudProvider(opts, do, rl)(opts, do, rl)
41+
return kwok.BuildKwok(opts, do, rl, informerFactory)
4042
}
4143

4244
return nil

cluster-autoscaler/cloudprovider/kwok/kwok_helpers.go

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,19 +25,19 @@ import (
2525
"log"
2626
"strconv"
2727
"strings"
28-
"time"
28+
29+
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
2930

3031
apiv1 "k8s.io/api/core/v1"
3132
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3233
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3334
"k8s.io/apimachinery/pkg/runtime"
3435
"k8s.io/apimachinery/pkg/runtime/serializer"
3536
"k8s.io/apimachinery/pkg/util/yaml"
36-
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
3737
"k8s.io/client-go/kubernetes"
3838
clientscheme "k8s.io/client-go/kubernetes/scheme"
3939
v1lister "k8s.io/client-go/listers/core/v1"
40-
klog "k8s.io/klog/v2"
40+
"k8s.io/klog/v2"
4141
)
4242

4343
const (
@@ -155,13 +155,19 @@ func createNodegroups(nodes []*apiv1.Node, kubeClient kubernetes.Interface, kc *
155155
}
156156

157157
ngName := getNGName(nodes[i], kc)
158+
if ngName == "" {
159+
klog.Fatalf("%s '%s' for node '%s' not present in the manifest",
160+
kc.status.groupNodesBy, kc.status.key,
161+
nodes[i].GetName())
162+
}
163+
158164
if ngs[ngName] != nil {
159165
ngs[ngName].targetSize += 1
160166
continue
161167
}
162168

163169
ng := parseAnnotations(nodes[i], kc)
164-
ng.name = getNGName(nodes[i], kc)
170+
ng.name = ngName
165171
sanitizeNode(nodes[i])
166172
prepareNode(nodes[i], ng.name)
167173
ng.nodeTemplate = nodes[i]
@@ -250,6 +256,8 @@ func parseAnnotations(no *apiv1.Node, kc *KwokProviderConfig) *NodeGroup {
250256
}
251257
}
252258

259+
// getNGName returns the node group name of the given k8s node object.
260+
// Return empty string if no node group is found.
253261
func getNGName(no *apiv1.Node, kc *KwokProviderConfig) string {
254262

255263
if no.GetAnnotations()[NGNameAnnotation] != "" {
@@ -263,16 +271,8 @@ func getNGName(no *apiv1.Node, kc *KwokProviderConfig) string {
263271
case "label":
264272
ngName = no.GetLabels()[kc.status.key]
265273
default:
266-
klog.Fatal("grouping criteria for nodes is not set (expected: 'annotation' or 'label')")
267-
}
268-
269-
if ngName == "" {
270-
klog.Fatalf("%s '%s' for node '%s' not present in the manifest",
271-
kc.status.groupNodesBy, kc.status.key,
272-
no.GetName())
274+
klog.Warning("grouping criteria for nodes is not set (expected: 'annotation' or 'label')")
273275
}
274276

275-
ngName = fmt.Sprintf("%s-%v", ngName, time.Now().Unix())
276-
277277
return ngName
278278
}

cluster-autoscaler/cloudprovider/kwok/kwok_nodegroups.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,9 @@ func (nodeGroup *NodeGroup) IncreaseSize(delta int) error {
8181
if err != nil {
8282
return fmt.Errorf("couldn't create new node '%s': %v", node.Name, err)
8383
}
84+
nodeGroup.targetSize += 1
8485
}
8586

86-
nodeGroup.targetSize = newSize
87-
8887
return nil
8988
}
9089

@@ -111,6 +110,7 @@ func (nodeGroup *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
111110
if err != nil {
112111
return err
113112
}
113+
nodeGroup.targetSize -= 1
114114
}
115115
return nil
116116
}

cluster-autoscaler/cloudprovider/kwok/kwok_provider.go

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,21 @@ import (
2222
"os"
2323
"strings"
2424

25-
apiv1 "k8s.io/api/core/v1"
26-
"k8s.io/apimachinery/pkg/api/resource"
27-
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2825
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
2926
"k8s.io/autoscaler/cluster-autoscaler/config"
3027
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
3128
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
3229
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
30+
31+
apiv1 "k8s.io/api/core/v1"
32+
"k8s.io/apimachinery/pkg/api/resource"
33+
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
34+
"k8s.io/apimachinery/pkg/labels"
3335
"k8s.io/client-go/informers"
3436
kubeclient "k8s.io/client-go/kubernetes"
3537
"k8s.io/client-go/rest"
3638
"k8s.io/client-go/tools/clientcmd"
37-
klog "k8s.io/klog/v2"
39+
"k8s.io/klog/v2"
3840
)
3941

4042
// Name returns name of the cloud provider.
@@ -123,24 +125,28 @@ func (kwok *KwokCloudProvider) GetNodeGpuConfig(node *apiv1.Node) *cloudprovider
123125

124126
// Refresh is called before every main loop and can be used to dynamically update cloud provider state.
125127
// In particular the list of node groups returned by NodeGroups can change as a result of CloudProvider.Refresh().
126-
// TODO(vadasambar): implement this
127128
func (kwok *KwokCloudProvider) Refresh() error {
128129

129-
// TODO(vadasambar): causes CA to not recognize kwok nodegroups
130-
// needs better implementation
131-
// nodeList, err := kwok.lister.List()
132-
// if err != nil {
133-
// return err
134-
// }
130+
allNodes, err := kwok.allNodesLister.List(labels.Everything())
131+
if err != nil {
132+
klog.ErrorS(err, "failed to list all nodes from lister")
133+
return err
134+
}
135+
136+
targetSizeInCluster := make(map[string]int)
135137

136-
// ngs := []*NodeGroup{}
137-
// for _, no := range nodeList {
138-
// ng := parseAnnotationsToNodegroup(no)
139-
// ng.kubeClient = kwok.kubeClient
140-
// ngs = append(ngs, ng)
141-
// }
138+
for _, node := range allNodes {
139+
ngName := getNGName(node, kwok.config)
140+
if ngName == "" {
141+
continue
142+
}
142143

143-
// kwok.nodeGroups = ngs
144+
targetSizeInCluster[ngName] += 1
145+
}
146+
147+
for _, ng := range kwok.nodeGroups {
148+
ng.targetSize = targetSizeInCluster[ng.Id()]
149+
}
144150

145151
return nil
146152
}
@@ -245,6 +251,7 @@ func BuildKwokProvider(ko *kwokOptions) (*KwokCloudProvider, error) {
245251
kubeClient: ko.kubeClient,
246252
resourceLimiter: ko.resourceLimiter,
247253
config: kwokConfig,
254+
allNodesLister: ko.allNodesLister,
248255
}, nil
249256
}
250257

cluster-autoscaler/cloudprovider/kwok/kwok_provider_test.go

Lines changed: 143 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,18 @@ import (
2020
"os"
2121
"testing"
2222

23+
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
24+
"k8s.io/autoscaler/cluster-autoscaler/config"
25+
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
26+
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
27+
2328
"github.com/stretchr/testify/assert"
2429
apiv1 "k8s.io/api/core/v1"
2530
"k8s.io/apimachinery/pkg/api/errors"
2631
"k8s.io/apimachinery/pkg/api/resource"
2732
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2833
"k8s.io/apimachinery/pkg/labels"
2934
"k8s.io/apimachinery/pkg/runtime"
30-
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
31-
"k8s.io/autoscaler/cluster-autoscaler/config"
32-
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
33-
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
3435
"k8s.io/client-go/kubernetes/fake"
3536
v1lister "k8s.io/client-go/listers/core/v1"
3637
core "k8s.io/client-go/testing"
@@ -156,6 +157,110 @@ func TestNodeGroups(t *testing.T) {
156157
})
157158
}
158159

160+
func TestRefresh(t *testing.T) {
161+
fakeClient := &fake.Clientset{}
162+
var nodesFrom string
163+
fakeClient.Fake.AddReactor("get", "configmaps", func(action core.Action) (bool, runtime.Object, error) {
164+
getAction := action.(core.GetAction)
165+
166+
if getAction == nil {
167+
return false, nil, nil
168+
}
169+
170+
if getAction.GetName() == defaultConfigName {
171+
if nodesFrom == "configmap" {
172+
return true, &apiv1.ConfigMap{
173+
Data: map[string]string{
174+
configKey: testConfig,
175+
},
176+
}, nil
177+
}
178+
179+
return true, &apiv1.ConfigMap{
180+
Data: map[string]string{
181+
configKey: testConfigDynamicTemplates,
182+
},
183+
}, nil
184+
185+
}
186+
187+
if getAction.GetName() == defaultTemplatesConfigName {
188+
if nodesFrom == "configmap" {
189+
return true, &apiv1.ConfigMap{
190+
Data: map[string]string{
191+
templatesKey: testTemplates,
192+
},
193+
}, nil
194+
}
195+
}
196+
197+
return true, nil, errors.NewNotFound(apiv1.Resource("configmaps"), "whatever")
198+
})
199+
200+
os.Setenv("POD_NAMESPACE", "kube-system")
201+
202+
t.Run("refresh nodegroup target size", func(t *testing.T) {
203+
nodesFrom = "configmap"
204+
ngName := "kind-worker"
205+
fakeNodeLister := newTestAllNodeLister(map[string]*apiv1.Node{
206+
"node1": {
207+
ObjectMeta: metav1.ObjectMeta{
208+
Name: "node1",
209+
Labels: map[string]string{
210+
"kwok-nodegroup": ngName,
211+
},
212+
},
213+
},
214+
"node2": {
215+
ObjectMeta: metav1.ObjectMeta{
216+
Name: "node2",
217+
Labels: map[string]string{
218+
"kwok-nodegroup": ngName,
219+
},
220+
},
221+
},
222+
"node3": {
223+
ObjectMeta: metav1.ObjectMeta{
224+
Name: "node3",
225+
Labels: map[string]string{
226+
"kwok-nodegroup": ngName,
227+
},
228+
},
229+
},
230+
"node4": {
231+
ObjectMeta: metav1.ObjectMeta{
232+
Name: "node4",
233+
},
234+
},
235+
})
236+
237+
ko := &kwokOptions{
238+
kubeClient: fakeClient,
239+
autoscalingOpts: &config.AutoscalingOptions{},
240+
discoveryOpts: &cloudprovider.NodeGroupDiscoveryOptions{},
241+
resourceLimiter: cloudprovider.NewResourceLimiter(
242+
map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000},
243+
map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000}),
244+
allNodesLister: fakeNodeLister,
245+
ngNodeListerFn: testNodeLister,
246+
}
247+
248+
p, err := BuildKwokProvider(ko)
249+
assert.NoError(t, err)
250+
assert.NotNil(t, p)
251+
252+
err = p.Refresh()
253+
assert.Nil(t, err)
254+
for _, ng := range p.NodeGroups() {
255+
if ng.Id() == ngName {
256+
targetSize, err := ng.TargetSize()
257+
assert.Nil(t, err)
258+
assert.Equal(t, 3, targetSize)
259+
}
260+
}
261+
})
262+
}
263+
159264
func TestGetResourceLimiter(t *testing.T) {
160265
fakeClient := &fake.Clientset{}
161266
fakeClient.Fake.AddReactor("get", "configmaps", func(action core.Action) (bool, runtime.Object, error) {
@@ -639,6 +744,40 @@ func TestNodeGroupForNode(t *testing.T) {
639744
assert.Contains(t, ng.Id(), "kind-worker")
640745
})
641746

747+
t.Run("empty nodegroup name for node", func(t *testing.T) {
748+
nodesFrom = "configmap"
749+
fakeNodeLister := newTestAllNodeLister(map[string]*apiv1.Node{})
750+
751+
ko := &kwokOptions{
752+
kubeClient: fakeClient,
753+
autoscalingOpts: &config.AutoscalingOptions{},
754+
discoveryOpts: &cloudprovider.NodeGroupDiscoveryOptions{},
755+
resourceLimiter: cloudprovider.NewResourceLimiter(
756+
map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000},
757+
map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000}),
758+
allNodesLister: fakeNodeLister,
759+
ngNodeListerFn: testNodeLister,
760+
}
761+
762+
p, err := BuildKwokProvider(ko)
763+
assert.NoError(t, err)
764+
assert.NotNil(t, p)
765+
assert.Len(t, p.nodeGroups, 1)
766+
assert.Contains(t, p.nodeGroups[0].Id(), "kind-worker")
767+
768+
testNode := &apiv1.Node{
769+
ObjectMeta: metav1.ObjectMeta{
770+
Name: "node-without-labels",
771+
},
772+
Spec: apiv1.NodeSpec{
773+
ProviderID: "kwok:random-instance-id",
774+
},
775+
}
776+
ng, err := p.NodeGroupForNode(testNode)
777+
assert.NoError(t, err)
778+
assert.Nil(t, ng)
779+
})
780+
642781
}
643782

644783
func TestBuildKwokProvider(t *testing.T) {

cluster-autoscaler/cloudprovider/kwok/kwok_types.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,12 @@ package kwok
1818

1919
import (
2020
apiv1 "k8s.io/api/core/v1"
21+
"k8s.io/client-go/kubernetes"
22+
listersv1 "k8s.io/client-go/listers/core/v1"
23+
2124
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
2225
"k8s.io/autoscaler/cluster-autoscaler/config"
2326
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
24-
"k8s.io/client-go/kubernetes"
25-
listersv1 "k8s.io/client-go/listers/core/v1"
2627
)
2728

2829
// KwokCloudProvider implements CloudProvider interface for kwok
@@ -32,6 +33,8 @@ type KwokCloudProvider struct {
3233
resourceLimiter *cloudprovider.ResourceLimiter
3334
// kubeClient is to be used only for create, delete and update
3435
kubeClient kubernetes.Interface
36+
//allNodesLister is a lister to list all nodes in cluster
37+
allNodesLister listersv1.NodeLister
3538
}
3639

3740
type kwokOptions struct {

0 commit comments

Comments
 (0)