Skip to content

Commit c8e4721

Browse files
authored
Merge pull request #6470 from gandhipr/prachigandhi/azure-getvmss-spot
use getvmss api for spot instances in azure
2 parents 8dbe8ac + b69fd45 commit c8e4721

File tree

4 files changed

+132
-55
lines changed

4 files changed

+132
-55
lines changed

cluster-autoscaler/cloudprovider/azure/azure_config.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ const (
5656
rateLimitWriteQPSEnvVar = "RATE_LIMIT_WRITE_QPS"
5757
rateLimitWriteBucketsEnvVar = "RATE_LIMIT_WRITE_BUCKETS"
5858

59+
// VmssSizeRefreshPeriodDefault in seconds
60+
VmssSizeRefreshPeriodDefault = 30
61+
5962
// auth methods
6063
authMethodPrincipal = "principal"
6164
authMethodCLI = "cli"
@@ -128,6 +131,9 @@ type Config struct {
128131
// Jitter in seconds subtracted from the VMSS cache TTL before the first refresh
129132
VmssVmsCacheJitter int `json:"vmssVmsCacheJitter" yaml:"vmssVmsCacheJitter"`
130133

134+
// GetVmssSizeRefreshPeriod (seconds) defines how frequently to call GET VMSS API to fetch VMSS info per nodegroup instance
135+
GetVmssSizeRefreshPeriod int `json:"getVmssSizeRefreshPeriod" yaml:"getVmssSizeRefreshPeriod"`
136+
131137
// number of latest deployments that will not be deleted
132138
MaxDeploymentsCount int64 `json:"maxDeploymentsCount" yaml:"maxDeploymentsCount"`
133139

@@ -256,6 +262,15 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
256262
cfg.EnableDynamicInstanceList = dynamicInstanceListDefault
257263
}
258264

265+
if getVmssSizeRefreshPeriod := os.Getenv("AZURE_GET_VMSS_SIZE_REFRESH_PERIOD"); getVmssSizeRefreshPeriod != "" {
266+
cfg.GetVmssSizeRefreshPeriod, err = strconv.Atoi(getVmssSizeRefreshPeriod)
267+
if err != nil {
268+
return nil, fmt.Errorf("failed to parse AZURE_GET_VMSS_SIZE_REFRESH_PERIOD %q: %v", getVmssSizeRefreshPeriod, err)
269+
}
270+
} else {
271+
cfg.GetVmssSizeRefreshPeriod = VmssSizeRefreshPeriodDefault
272+
}
273+
259274
if enableVmssFlex := os.Getenv("AZURE_ENABLE_VMSS_FLEX"); enableVmssFlex != "" {
260275
cfg.EnableVmssFlex, err = strconv.ParseBool(enableVmssFlex)
261276
if err != nil {

cluster-autoscaler/cloudprovider/azure/azure_manager_test.go

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) {
366366
VmssCacheTTL: 100,
367367
VmssVmsCacheTTL: 110,
368368
VmssVmsCacheJitter: 90,
369+
GetVmssSizeRefreshPeriod: 30,
369370
MaxDeploymentsCount: 8,
370371
CloudProviderBackoff: true,
371372
CloudProviderBackoffRetries: 1,
@@ -480,6 +481,14 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) {
480481
assert.Equal(t, expectedErr, err, "Return err does not match, expected: %v, actual: %v", expectedErr, err)
481482
})
482483

484+
t.Run("invalid int for AZURE_GET_VMSS_SIZE_REFRESH_PERIOD", func(t *testing.T) {
485+
t.Setenv("AZURE_GET_VMSS_SIZE_REFRESH_PERIOD", "invalidint")
486+
manager, err := createAzureManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient)
487+
expectedErr := fmt.Errorf("failed to parse AZURE_GET_VMSS_SIZE_REFRESH_PERIOD \"invalidint\": strconv.Atoi: parsing \"invalidint\": invalid syntax")
488+
assert.Nil(t, manager)
489+
assert.Equal(t, expectedErr, err, "Return err does not match, expected: %v, actual: %v", expectedErr, err)
490+
})
491+
483492
t.Run("invalid int for AZURE_MAX_DEPLOYMENT_COUNT", func(t *testing.T) {
484493
t.Setenv("AZURE_MAX_DEPLOYMENT_COUNT", "invalidint")
485494
manager, err := createAzureManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient)
@@ -685,13 +694,14 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) {
685694
azureRef: azureRef{
686695
Name: vmssName,
687696
},
688-
minSize: minVal,
689-
maxSize: maxVal,
690-
manager: manager,
691-
enableForceDelete: manager.config.EnableForceDelete,
692-
curSize: 3,
693-
sizeRefreshPeriod: manager.azureCache.refreshInterval,
694-
instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
697+
minSize: minVal,
698+
maxSize: maxVal,
699+
manager: manager,
700+
enableForceDelete: manager.config.EnableForceDelete,
701+
curSize: 3,
702+
sizeRefreshPeriod: manager.azureCache.refreshInterval,
703+
instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
704+
getVmssSizeRefreshPeriod: time.Duration(VmssSizeRefreshPeriodDefault) * time.Second,
695705
}}
696706
assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs)
697707
}
@@ -732,13 +742,14 @@ func TestGetFilteredAutoscalingGroupsVmssWithConfiguredSizes(t *testing.T) {
732742
azureRef: azureRef{
733743
Name: vmssName,
734744
},
735-
minSize: minVal,
736-
maxSize: maxVal,
737-
manager: manager,
738-
enableForceDelete: manager.config.EnableForceDelete,
739-
curSize: 3,
740-
sizeRefreshPeriod: manager.azureCache.refreshInterval,
741-
instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
745+
minSize: minVal,
746+
maxSize: maxVal,
747+
manager: manager,
748+
enableForceDelete: manager.config.EnableForceDelete,
749+
curSize: 3,
750+
sizeRefreshPeriod: manager.azureCache.refreshInterval,
751+
instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
752+
getVmssSizeRefreshPeriod: time.Duration(VmssSizeRefreshPeriodDefault) * time.Second,
742753
}}
743754
assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs)
744755
}

cluster-autoscaler/cloudprovider/azure/azure_scale_set.go

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -58,19 +58,29 @@ type ScaleSet struct {
5858
minSize int
5959
maxSize int
6060

61-
enableForceDelete bool
62-
63-
sizeMutex sync.Mutex
64-
curSize int64
65-
61+
enableForceDelete bool
6662
enableDynamicInstanceList bool
6763

68-
lastSizeRefresh time.Time
64+
// curSize tracks (and caches) the number of VMs in this ScaleSet.
65+
// It is periodically updated from vmss.Sku.Capacity, with VMSS itself coming
66+
// either from azure.Cache (which periodically does VMSS.List)
67+
// or from direct VMSS.Get (used for Spot).
68+
curSize int64
69+
// lastSizeRefresh is the time curSize was last refreshed from vmss.Sku.Capacity.
70+
// Together with sizeRefreshPeriod, it is used to determine if it is time to refresh curSize.
71+
lastSizeRefresh time.Time
72+
// sizeRefreshPeriod is how often curSize is refreshed from vmss.Sku.Capacity.
73+
// (Set from azureCache.refreshInterval = VmssCacheTTL or [defaultMetadataCache]refreshInterval = 1min)
6974
sizeRefreshPeriod time.Duration
75+
// getVmssSizeRefreshPeriod is how often curSize should be refreshed in case VMSS.Get call is used (only spot instances).
76+
// (Set from GetVmssSizeRefreshPeriod, if specified = get-vmss-size-refresh-period = 30s,
77+
// or override from autoscalerProfile.GetVmssSizeRefreshPeriod)
78+
getVmssSizeRefreshPeriod time.Duration
7079

7180
instancesRefreshPeriod time.Duration
7281
instancesRefreshJitter int
7382

83+
sizeMutex sync.Mutex
7484
instanceMutex sync.Mutex
7585
instanceCache []cloudprovider.Instance
7686
lastInstanceRefresh time.Time
@@ -98,6 +108,12 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) (
98108
scaleSet.instancesRefreshPeriod = defaultVmssInstancesRefreshPeriod
99109
}
100110

111+
if az.config.GetVmssSizeRefreshPeriod != 0 {
112+
scaleSet.getVmssSizeRefreshPeriod = time.Duration(az.config.GetVmssSizeRefreshPeriod) * time.Second
113+
} else {
114+
scaleSet.getVmssSizeRefreshPeriod = time.Duration(VmssSizeRefreshPeriodDefault) * time.Second
115+
}
116+
101117
return scaleSet, nil
102118
}
103119

@@ -157,17 +173,43 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
157173
scaleSet.sizeMutex.Lock()
158174
defer scaleSet.sizeMutex.Unlock()
159175

160-
if scaleSet.lastSizeRefresh.Add(scaleSet.sizeRefreshPeriod).After(time.Now()) {
161-
klog.V(3).Infof("VMSS: %s, returning in-memory size: %d", scaleSet.Name, scaleSet.curSize)
162-
return scaleSet.curSize, nil
163-
}
164-
165176
set, err := scaleSet.getVMSSFromCache()
166177
if err != nil {
167178
klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err)
168179
return -1, err
169180
}
170181

182+
effectiveSizeRefreshPeriod := scaleSet.sizeRefreshPeriod
183+
184+
// If the scale set is Spot, we want to have a more fresh view of the Sku.Capacity field.
185+
// This is because evictions can happen at any given point in time,
186+
// even before VMs are materialized as nodes. We should be able to
187+
// react to those and have the autoscaler readjust the goal again to force restoration.
188+
// Taking into account only if orchestrationMode == Uniform because flex mode can have
189+
// combination of spot and regular vms
190+
if isSpot(&set) {
191+
effectiveSizeRefreshPeriod = scaleSet.getVmssSizeRefreshPeriod
192+
}
193+
194+
if scaleSet.lastSizeRefresh.Add(effectiveSizeRefreshPeriod).After(time.Now()) {
195+
klog.V(3).Infof("VMSS: %s, returning in-memory size: %d", scaleSet.Name, scaleSet.curSize)
196+
return scaleSet.curSize, nil
197+
}
198+
199+
// If the scale set is on Spot, make a GET VMSS call to fetch more updated fresh info
200+
if isSpot(&set) {
201+
ctx, cancel := getContextWithCancel()
202+
defer cancel()
203+
204+
var rerr *retry.Error
205+
set, rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.Get(ctx, scaleSet.manager.config.ResourceGroup,
206+
scaleSet.Name)
207+
if rerr != nil {
208+
klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, rerr)
209+
return -1, err
210+
}
211+
}
212+
171213
vmssSizeMutex.Lock()
172214
curSize := *set.Sku.Capacity
173215
vmssSizeMutex.Unlock()
@@ -184,6 +226,12 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
184226
return scaleSet.curSize, nil
185227
}
186228

229+
func isSpot(vmss *compute.VirtualMachineScaleSet) bool {
230+
return vmss != nil && vmss.VirtualMachineScaleSetProperties != nil &&
231+
vmss.VirtualMachineScaleSetProperties.VirtualMachineProfile != nil &&
232+
vmss.VirtualMachineScaleSetProperties.VirtualMachineProfile.Priority == compute.Spot
233+
}
234+
187235
// GetScaleSetSize gets Scale Set size.
188236
func (scaleSet *ScaleSet) GetScaleSetSize() (int64, error) {
189237
return scaleSet.getCurSize()

cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go

Lines changed: 33 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -168,44 +168,47 @@ func TestTargetSize(t *testing.T) {
168168
ctrl := gomock.NewController(t)
169169
defer ctrl.Finish()
170170

171-
orchestrationModes := [2]compute.OrchestrationMode{compute.Uniform, compute.Flexible}
172-
173171
expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", compute.Uniform)
174-
expectedVMSSVMs := newTestVMSSVMList(3)
175-
expectedVMs := newTestVMList(3)
172+
spotScaleSet := newTestVMSSList(5, "spot-vmss", "eastus", compute.Uniform)[0]
173+
spotScaleSet.VirtualMachineProfile = &compute.VirtualMachineScaleSetVMProfile{
174+
Priority: compute.Spot,
175+
}
176+
expectedScaleSets = append(expectedScaleSets, spotScaleSet)
176177

177-
for _, orchMode := range orchestrationModes {
178-
provider := newTestProvider(t)
179-
mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
180-
mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes()
181-
provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
182-
mockVMClient := mockvmclient.NewMockInterface(ctrl)
183-
mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return([]compute.VirtualMachine{}, nil).AnyTimes()
184-
provider.azureManager.azClient.virtualMachinesClient = mockVMClient
178+
provider := newTestProvider(t)
179+
mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
180+
mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes()
181+
provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
182+
mockVMClient := mockvmclient.NewMockInterface(ctrl)
183+
mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return([]compute.VirtualMachine{}, nil).AnyTimes()
184+
provider.azureManager.azClient.virtualMachinesClient = mockVMClient
185185

186-
if orchMode == compute.Uniform {
186+
err := provider.azureManager.forceRefresh()
187+
assert.NoError(t, err)
187188

188-
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
189-
mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
190-
provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
189+
// non-spot nodepool
190+
registered := provider.azureManager.RegisterNodeGroup(
191+
newTestScaleSet(provider.azureManager, "test-asg"))
192+
assert.True(t, registered)
193+
assert.Equal(t, len(provider.NodeGroups()), 1)
191194

192-
} else {
193-
provider.azureManager.config.EnableVmssFlex = true
194-
mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes()
195-
}
195+
targetSize, err := provider.NodeGroups()[0].TargetSize()
196+
assert.NoError(t, err)
197+
assert.Equal(t, 3, targetSize)
196198

197-
err := provider.azureManager.forceRefresh()
198-
assert.NoError(t, err)
199+
// Register a spot nodepool
200+
spotregistered := provider.azureManager.RegisterNodeGroup(
201+
newTestScaleSet(provider.azureManager, "spot-vmss"))
202+
assert.True(t, spotregistered)
203+
assert.Equal(t, len(provider.NodeGroups()), 2)
199204

200-
registered := provider.azureManager.RegisterNodeGroup(
201-
newTestScaleSet(provider.azureManager, "test-asg"))
202-
assert.True(t, registered)
203-
assert.Equal(t, len(provider.NodeGroups()), 1)
205+
// mock getvmss call for spotnode pool returning different capacity
206+
spotScaleSet.Sku.Capacity = to.Int64Ptr(1)
207+
mockVMSSClient.EXPECT().Get(gomock.Any(), provider.azureManager.config.ResourceGroup, "spot-vmss").Return(spotScaleSet, nil).Times(1)
204208

205-
targetSize, err := provider.NodeGroups()[0].TargetSize()
206-
assert.NoError(t, err)
207-
assert.Equal(t, 3, targetSize)
208-
}
209+
targetSize, err = provider.NodeGroups()[1].TargetSize()
210+
assert.NoError(t, err)
211+
assert.Equal(t, 1, targetSize)
209212
}
210213

211214
func TestIncreaseSize(t *testing.T) {

0 commit comments

Comments
 (0)