Skip to content

Commit 06fa717

Browse files
authored
Merge pull request #6584 from vbhargav875/delay_retries
Delay force refresh by DefaultInterval when OCI GetNodePool call returns 404
2 parents c58e3fd + 2bf403e commit 06fa717

File tree

2 files changed

+30
-16
lines changed

2 files changed

+30
-16
lines changed

cluster-autoscaler/cloudprovider/oci/nodepools/cache.go

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -40,24 +40,33 @@ func (c *nodePoolCache) nodePools() map[string]*oke.NodePool {
4040
return result
4141
}
4242

43-
func (c *nodePoolCache) rebuild(staticNodePools map[string]NodePool) error {
43+
func (c *nodePoolCache) rebuild(staticNodePools map[string]NodePool, maxGetNodepoolRetries int) (httpStatusCode int, err error) {
4444
klog.Infof("rebuilding cache")
45+
var resp oke.GetNodePoolResponse
46+
var statusCode int
4547
for id := range staticNodePools {
46-
// prevent us from getting a node pool at the same time that we're performing delete actions on the node pool.
47-
c.mu.Lock()
48-
resp, err := c.okeClient.GetNodePool(context.Background(), oke.GetNodePoolRequest{
49-
NodePoolId: common.String(id),
50-
})
51-
c.mu.Unlock()
52-
48+
for i := 1; i <= maxGetNodepoolRetries; i++ {
49+
// prevent us from getting a node pool at the same time that we're performing delete actions on the node pool.
50+
c.mu.Lock()
51+
resp, err = c.okeClient.GetNodePool(context.Background(), oke.GetNodePoolRequest{
52+
NodePoolId: common.String(id),
53+
})
54+
c.mu.Unlock()
55+
httpResp := resp.HTTPResponse()
56+
statusCode = httpResp.StatusCode
57+
if err != nil {
58+
klog.Errorf("Failed to fetch the nodepool : %v. Retries available : %v", id, maxGetNodepoolRetries-i)
59+
} else {
60+
break
61+
}
62+
}
5363
if err != nil {
54-
return err
64+
klog.Errorf("Failed to fetch the nodepool : %v", id)
65+
return statusCode, err
5566
}
56-
5767
c.set(&resp.NodePool)
5868
}
59-
60-
return nil
69+
return statusCode, nil
6170
}
6271

6372
// removeInstance tries to remove the instance from the node pool.

cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,8 @@ import (
3232
)
3333

3434
const (
35-
maxAddTaintRetries = 5
35+
maxAddTaintRetries = 5
36+
maxGetNodepoolRetries = 3
3637
)
3738

3839
var (
@@ -249,11 +250,15 @@ func (m *ociManagerImpl) TaintToPreventFurtherSchedulingOnRestart(nodes []*apiv1
249250
}
250251

251252
func (m *ociManagerImpl) forceRefresh() error {
252-
err := m.nodePoolCache.rebuild(m.staticNodePools)
253+
httpStatusCode, err := m.nodePoolCache.rebuild(m.staticNodePools, maxGetNodepoolRetries)
253254
if err != nil {
255+
if httpStatusCode == 404 {
256+
m.lastRefresh = time.Now()
257+
klog.Errorf("Failed to fetch the nodepools. Retrying after %v", m.lastRefresh.Add(m.cfg.Global.RefreshInterval))
258+
return err
259+
}
254260
return err
255261
}
256-
257262
m.lastRefresh = time.Now()
258263
klog.Infof("Refreshed NodePool list, next refresh after %v", m.lastRefresh.Add(m.cfg.Global.RefreshInterval))
259264
return nil
@@ -441,7 +446,7 @@ func (m *ociManagerImpl) GetNodePoolForInstance(instance ocicommon.OciRef) (Node
441446

442447
np, found := m.staticNodePools[instance.NodePoolID]
443448
if !found {
444-
klog.Infof("did not find node pool for reference: %+v", instance)
449+
klog.V(4).Infof("did not find node pool for reference: %+v", instance)
445450
return nil, errInstanceNodePoolNotFound
446451
}
447452

0 commit comments

Comments
 (0)