@@ -39,12 +39,13 @@ import (
39
39
)
40
40
41
41
const (
42
- resourceClaimFinalizer = "gpu.nvidia.com/finalizer.multiNodeEnvironment"
43
- imexDeviceClass = "imex.nvidia.com"
42
+ multiNodeEnvironmentFinalizer = "gpu.nvidia.com/finalizer.multiNodeEnvironment"
43
+ imexDeviceClass = "imex.nvidia.com"
44
44
45
45
MultiNodeEnvironmentAddEvent = "onMultiNodeEnvironmentAddEvent"
46
46
MultiNodeEnvironmentDeleteEvent = "onMultiNodeEnvironmentDeleteEvent"
47
47
ResourceClaimAddEvent = "ResourceClaimAddEvent"
48
+ DeviceClassAddEvent = "DeviceClassAddEvent"
48
49
)
49
50
50
51
type WorkItem struct {
@@ -58,24 +59,30 @@ type MultiNodeEnvironmentManager struct {
58
59
59
60
multiNodeEnvironmentLister nvlisters.MultiNodeEnvironmentLister
60
61
resourceClaimLister resourcelisters.ResourceClaimLister
62
+ deviceClassLister resourcelisters.DeviceClassLister
61
63
}
62
64
63
65
// StartManager starts a MultiNodeEnvironmentManager.
64
66
func StartMultiNodeEnvironmentManager (ctx context.Context , config * Config ) (* MultiNodeEnvironmentManager , error ) {
65
67
queue := workqueue .New (workqueue .DefaultControllerRateLimiter ())
66
68
67
- mneInformerFactory := nvinformers .NewSharedInformerFactory (config .clientsets .Nvidia , 30 * time .Second )
68
- mneInformer := mneInformerFactory .Gpu ().V1alpha1 ().MultiNodeEnvironments ().Informer ()
69
+ nvInformerFactory := nvinformers .NewSharedInformerFactory (config .clientsets .Nvidia , 30 * time .Second )
70
+ coreInformerFactory := informers .NewSharedInformerFactory (config .clientsets .Core , 30 * time .Second )
71
+
72
+ mneInformer := nvInformerFactory .Gpu ().V1alpha1 ().MultiNodeEnvironments ().Informer ()
69
73
mneLister := nvlisters .NewMultiNodeEnvironmentLister (mneInformer .GetIndexer ())
70
74
71
- rcInformerFactory := informers .NewSharedInformerFactory (config .clientsets .Core , 30 * time .Second )
72
- rcInformer := rcInformerFactory .Resource ().V1beta1 ().ResourceClaims ().Informer ()
75
+ rcInformer := coreInformerFactory .Resource ().V1beta1 ().ResourceClaims ().Informer ()
73
76
rcLister := resourcelisters .NewResourceClaimLister (rcInformer .GetIndexer ())
74
77
78
+ dcInformer := coreInformerFactory .Resource ().V1beta1 ().DeviceClasses ().Informer ()
79
+ dcLister := resourcelisters .NewDeviceClassLister (dcInformer .GetIndexer ())
80
+
75
81
m := & MultiNodeEnvironmentManager {
76
82
clientsets : config .clientsets ,
77
83
multiNodeEnvironmentLister : mneLister ,
78
84
resourceClaimLister : rcLister ,
85
+ deviceClassLister : dcLister ,
79
86
}
80
87
81
88
mneInformer .AddEventHandler (cache.ResourceEventHandlerFuncs {
@@ -87,21 +94,25 @@ func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*Mul
87
94
AddFunc : func (obj any ) { queue .Enqueue (obj , m .onResourceClaimAdd ) },
88
95
})
89
96
97
+ dcInformer .AddEventHandler (cache.ResourceEventHandlerFuncs {
98
+ AddFunc : func (obj any ) { queue .Enqueue (obj , m .onDeviceClassAdd ) },
99
+ })
100
+
90
101
m .waitGroup .Add (3 )
91
102
go func () {
92
103
defer m .waitGroup .Done ()
93
- rcInformerFactory .Start (ctx .Done ())
104
+ nvInformerFactory .Start (ctx .Done ())
94
105
}()
95
106
go func () {
96
107
defer m .waitGroup .Done ()
97
- mneInformerFactory .Start (ctx .Done ())
108
+ coreInformerFactory .Start (ctx .Done ())
98
109
}()
99
110
go func () {
100
111
defer m .waitGroup .Done ()
101
112
queue .Run (ctx .Done ())
102
113
}()
103
114
104
- if ! cache .WaitForCacheSync (ctx .Done (), mneInformer .HasSynced , rcInformer .HasSynced ) {
115
+ if ! cache .WaitForCacheSync (ctx .Done (), mneInformer .HasSynced , rcInformer .HasSynced , dcInformer . HasSynced ) {
105
116
klog .Warning ("Cache sync failed; retrying in 5 seconds" )
106
117
time .Sleep (5 * time .Second )
107
118
if ! cache .WaitForCacheSync (ctx .Done (), mneInformer .HasSynced , rcInformer .HasSynced ) {
@@ -157,7 +168,7 @@ func (m *MultiNodeEnvironmentManager) onMultiNodeEnvironmentAdd(obj any) error {
157
168
Name : mne .Spec .ResourceClaimName ,
158
169
Namespace : mne .Namespace ,
159
170
OwnerReferences : []metav1.OwnerReference {ownerReference },
160
- Finalizers : []string {resourceClaimFinalizer },
171
+ Finalizers : []string {multiNodeEnvironmentFinalizer },
161
172
},
162
173
Spec : resourceapi.ResourceClaimSpec {
163
174
Devices : resourceapi.DeviceClaim {
@@ -222,6 +233,37 @@ func (m *MultiNodeEnvironmentManager) onResourceClaimAdd(obj any) error {
222
233
return nil
223
234
}
224
235
236
+ func (m * MultiNodeEnvironmentManager ) onDeviceClassAdd (obj any ) error {
237
+ dc , ok := obj .(* resourceapi.DeviceClass )
238
+ if ! ok {
239
+ return fmt .Errorf ("failed to cast to DeviceClass" )
240
+ }
241
+
242
+ klog .Infof ("Processing added DeviceClass: %s/%s" , dc .Namespace , dc .Name )
243
+
244
+ if len (dc .OwnerReferences ) != 1 {
245
+ return nil
246
+ }
247
+
248
+ if dc .OwnerReferences [0 ].Kind != nvapi .MultiNodeEnvironmentKind {
249
+ return nil
250
+ }
251
+
252
+ _ , err := m .multiNodeEnvironmentLister .MultiNodeEnvironments (dc .Namespace ).Get (dc .OwnerReferences [0 ].Name )
253
+ if err == nil {
254
+ return nil
255
+ }
256
+ if ! errors .IsNotFound (err ) {
257
+ return fmt .Errorf ("error retrieving DeviceClass's OwnerReference '%s': %w" , dc .OwnerReferences [0 ].Name , err )
258
+ }
259
+
260
+ if err := m .removeDeviceClassFinalizer (dc .Name ); err != nil {
261
+ return fmt .Errorf ("error removing finalizer on DeviceClass '%s': %w" , dc .Name , err )
262
+ }
263
+
264
+ return nil
265
+ }
266
+
225
267
func (m * MultiNodeEnvironmentManager ) removeResourceClaimFinalizer (namespace , name string ) error {
226
268
rc , err := m .resourceClaimLister .ResourceClaims (namespace ).Get (name )
227
269
if err != nil && errors .IsNotFound (err ) {
@@ -235,7 +277,7 @@ func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, na
235
277
236
278
newRC .Finalizers = []string {}
237
279
for _ , f := range rc .Finalizers {
238
- if f != resourceClaimFinalizer {
280
+ if f != multiNodeEnvironmentFinalizer {
239
281
newRC .Finalizers = append (newRC .Finalizers , f )
240
282
}
241
283
}
@@ -247,3 +289,29 @@ func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, na
247
289
248
290
return nil
249
291
}
292
+
293
+ func (m * MultiNodeEnvironmentManager ) removeDeviceClassFinalizer (name string ) error {
294
+ dc , err := m .deviceClassLister .Get (name )
295
+ if err != nil && errors .IsNotFound (err ) {
296
+ return fmt .Errorf ("DeviceClass not found" )
297
+ }
298
+ if err != nil {
299
+ return fmt .Errorf ("error retrieving DeviceClass: %w" , err )
300
+ }
301
+
302
+ newDC := dc .DeepCopy ()
303
+
304
+ newDC .Finalizers = []string {}
305
+ for _ , f := range dc .Finalizers {
306
+ if f != multiNodeEnvironmentFinalizer {
307
+ newDC .Finalizers = append (newDC .Finalizers , f )
308
+ }
309
+ }
310
+
311
+ _ , err = m .clientsets .Core .ResourceV1beta1 ().DeviceClasses ().Update (context .Background (), newDC , metav1.UpdateOptions {})
312
+ if err != nil {
313
+ return fmt .Errorf ("failed to update DeviceClass: %w" , err )
314
+ }
315
+
316
+ return nil
317
+ }
0 commit comments