File tree Expand file tree Collapse file tree 2 files changed +30
-16
lines changed
cluster-autoscaler/cloudprovider/oci/nodepools Expand file tree Collapse file tree 2 files changed +30
-16
lines changed Original file line number Diff line number Diff line change @@ -40,24 +40,33 @@ func (c *nodePoolCache) nodePools() map[string]*oke.NodePool {
40
40
return result
41
41
}
42
42
43
- func (c * nodePoolCache ) rebuild (staticNodePools map [string ]NodePool ) error {
43
+ func (c * nodePoolCache ) rebuild (staticNodePools map [string ]NodePool , maxGetNodepoolRetries int ) ( httpStatusCode int , err error ) {
44
44
klog .Infof ("rebuilding cache" )
45
+ var resp oke.GetNodePoolResponse
46
+ var statusCode int
45
47
for id := range staticNodePools {
46
- // prevent us from getting a node pool at the same time that we're performing delete actions on the node pool.
47
- c .mu .Lock ()
48
- resp , err := c .okeClient .GetNodePool (context .Background (), oke.GetNodePoolRequest {
49
- NodePoolId : common .String (id ),
50
- })
51
- c .mu .Unlock ()
52
-
48
+ for i := 1 ; i <= maxGetNodepoolRetries ; i ++ {
49
+ // prevent us from getting a node pool at the same time that we're performing delete actions on the node pool.
50
+ c .mu .Lock ()
51
+ resp , err = c .okeClient .GetNodePool (context .Background (), oke.GetNodePoolRequest {
52
+ NodePoolId : common .String (id ),
53
+ })
54
+ c .mu .Unlock ()
55
+ httpResp := resp .HTTPResponse ()
56
+ statusCode = httpResp .StatusCode
57
+ if err != nil {
58
+ klog .Errorf ("Failed to fetch the nodepool : %v. Retries available : %v" , id , maxGetNodepoolRetries - i )
59
+ } else {
60
+ break
61
+ }
62
+ }
53
63
if err != nil {
54
- return err
64
+ klog .Errorf ("Failed to fetch the nodepool : %v" , id )
65
+ return statusCode , err
55
66
}
56
-
57
67
c .set (& resp .NodePool )
58
68
}
59
-
60
- return nil
69
+ return statusCode , nil
61
70
}
62
71
63
72
// removeInstance tries to remove the instance from the node pool.
Original file line number Diff line number Diff line change @@ -32,7 +32,8 @@ import (
32
32
)
33
33
34
34
// Retry budgets used by the OCI manager.
// maxGetNodepoolRetries is passed by forceRefresh to nodePoolCache.rebuild,
// where it bounds the number of GetNodePool attempts per node pool.
// NOTE(review): maxAddTaintRetries presumably bounds taint-add attempts; its
// use is not visible in this hunk — confirm against the rest of the file.
const (
35
- maxAddTaintRetries = 5
35
+ maxAddTaintRetries = 5
36
+ maxGetNodepoolRetries = 3
36
37
)
37
38
38
39
var (
@@ -249,11 +250,15 @@ func (m *ociManagerImpl) TaintToPreventFurtherSchedulingOnRestart(nodes []*apiv1
249
250
}
250
251
251
252
func (m * ociManagerImpl ) forceRefresh () error {
252
- err := m .nodePoolCache .rebuild (m .staticNodePools )
253
+ httpStatusCode , err := m .nodePoolCache .rebuild (m .staticNodePools , maxGetNodepoolRetries )
253
254
if err != nil {
255
+ if httpStatusCode == 404 {
256
+ m .lastRefresh = time .Now ()
257
+ klog .Errorf ("Failed to fetch the nodepools. Retrying after %v" , m .lastRefresh .Add (m .cfg .Global .RefreshInterval ))
258
+ return err
259
+ }
254
260
return err
255
261
}
256
-
257
262
m .lastRefresh = time .Now ()
258
263
klog .Infof ("Refreshed NodePool list, next refresh after %v" , m .lastRefresh .Add (m .cfg .Global .RefreshInterval ))
259
264
return nil
@@ -441,7 +446,7 @@ func (m *ociManagerImpl) GetNodePoolForInstance(instance ocicommon.OciRef) (Node
441
446
442
447
np , found := m .staticNodePools [instance .NodePoolID ]
443
448
if ! found {
444
- klog .Infof ("did not find node pool for reference: %+v" , instance )
449
+ klog .V ( 4 ). Infof ("did not find node pool for reference: %+v" , instance )
445
450
return nil , errInstanceNodePoolNotFound
446
451
}
447
452
You can’t perform that action at this time.
0 commit comments