Skip to content

Commit 55b81c0

Browse files
committed
WIP
Signed-off-by: Kevin Klues <kklues@nvidia.com>
1 parent 72cffb1 commit 55b81c0

File tree

1 file changed

+79
-11
lines changed

1 file changed

+79
-11
lines changed

cmd/nvidia-dra-controller/mnenv.go

Lines changed: 79 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,13 @@ import (
3939
)
4040

4141
const (
42-
resourceClaimFinalizer = "gpu.nvidia.com/finalizer.multiNodeEnvironment"
43-
imexDeviceClass = "imex.nvidia.com"
42+
// multiNodeEnvironmentFinalizer is the finalizer string placed on the
// ResourceClaim built in onMultiNodeEnvironmentAdd and filtered out again by
// removeResourceClaimFinalizer and removeDeviceClassFinalizer.
multiNodeEnvironmentFinalizer = "gpu.nvidia.com/finalizer.multiNodeEnvironment"
43+
// NOTE(review): presumably the name of the DeviceClass used for IMEX devices;
// its use is not visible in this part of the file - confirm.
imexDeviceClass = "imex.nvidia.com"
4444

4545
// Event name constants. NOTE(review): the code consuming these names is not
// visible here; the two MultiNodeEnvironment values carry an "on" prefix while
// the ResourceClaim and DeviceClass values do not - confirm this is intended.
MultiNodeEnvironmentAddEvent = "onMultiNodeEnvironmentAddEvent"
4646
MultiNodeEnvironmentDeleteEvent = "onMultiNodeEnvironmentDeleteEvent"
4747
ResourceClaimAddEvent = "ResourceClaimAddEvent"
48+
DeviceClassAddEvent = "DeviceClassAddEvent"
4849
)
4950

5051
type WorkItem struct {
@@ -58,24 +59,30 @@ type MultiNodeEnvironmentManager struct {
5859

5960
multiNodeEnvironmentLister nvlisters.MultiNodeEnvironmentLister
6061
resourceClaimLister resourcelisters.ResourceClaimLister
62+
deviceClassLister resourcelisters.DeviceClassLister
6163
}
6264

6365
// StartManager starts a MultiNodeEnvironmentManager.
6466
func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*MultiNodeEnvironmentManager, error) {
6567
queue := workqueue.New(workqueue.DefaultControllerRateLimiter())
6668

67-
mneInformerFactory := nvinformers.NewSharedInformerFactory(config.clientsets.Nvidia, 30*time.Second)
68-
mneInformer := mneInformerFactory.Gpu().V1alpha1().MultiNodeEnvironments().Informer()
69+
nvInformerFactory := nvinformers.NewSharedInformerFactory(config.clientsets.Nvidia, 30*time.Second)
70+
coreInformerFactory := informers.NewSharedInformerFactory(config.clientsets.Core, 30*time.Second)
71+
72+
mneInformer := nvInformerFactory.Gpu().V1alpha1().MultiNodeEnvironments().Informer()
6973
mneLister := nvlisters.NewMultiNodeEnvironmentLister(mneInformer.GetIndexer())
7074

71-
rcInformerFactory := informers.NewSharedInformerFactory(config.clientsets.Core, 30*time.Second)
72-
rcInformer := rcInformerFactory.Resource().V1beta1().ResourceClaims().Informer()
75+
rcInformer := coreInformerFactory.Resource().V1beta1().ResourceClaims().Informer()
7376
rcLister := resourcelisters.NewResourceClaimLister(rcInformer.GetIndexer())
7477

78+
dcInformer := coreInformerFactory.Resource().V1beta1().DeviceClasses().Informer()
79+
dcLister := resourcelisters.NewDeviceClassLister(dcInformer.GetIndexer())
80+
7581
m := &MultiNodeEnvironmentManager{
7682
clientsets: config.clientsets,
7783
multiNodeEnvironmentLister: mneLister,
7884
resourceClaimLister: rcLister,
85+
deviceClassLister: dcLister,
7986
}
8087

8188
mneInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
@@ -87,21 +94,25 @@ func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*Mul
8794
AddFunc: func(obj any) { queue.Enqueue(obj, m.onResourceClaimAdd) },
8895
})
8996

97+
dcInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
98+
AddFunc: func(obj any) { queue.Enqueue(obj, m.onDeviceClassAdd) },
99+
})
100+
90101
m.waitGroup.Add(3)
91102
go func() {
92103
defer m.waitGroup.Done()
93-
rcInformerFactory.Start(ctx.Done())
104+
nvInformerFactory.Start(ctx.Done())
94105
}()
95106
go func() {
96107
defer m.waitGroup.Done()
97-
mneInformerFactory.Start(ctx.Done())
108+
coreInformerFactory.Start(ctx.Done())
98109
}()
99110
go func() {
100111
defer m.waitGroup.Done()
101112
queue.Run(ctx.Done())
102113
}()
103114

104-
if !cache.WaitForCacheSync(ctx.Done(), mneInformer.HasSynced, rcInformer.HasSynced) {
115+
if !cache.WaitForCacheSync(ctx.Done(), mneInformer.HasSynced, rcInformer.HasSynced, dcInformer.HasSynced) {
105116
klog.Warning("Cache sync failed; retrying in 5 seconds")
106117
time.Sleep(5 * time.Second)
107118
if !cache.WaitForCacheSync(ctx.Done(), mneInformer.HasSynced, rcInformer.HasSynced) {
@@ -157,7 +168,7 @@ func (m *MultiNodeEnvironmentManager) onMultiNodeEnvironmentAdd(obj any) error {
157168
Name: mne.Spec.ResourceClaimName,
158169
Namespace: mne.Namespace,
159170
OwnerReferences: []metav1.OwnerReference{ownerReference},
160-
Finalizers: []string{resourceClaimFinalizer},
171+
Finalizers: []string{multiNodeEnvironmentFinalizer},
161172
},
162173
Spec: resourceapi.ResourceClaimSpec{
163174
Devices: resourceapi.DeviceClaim{
@@ -222,6 +233,37 @@ func (m *MultiNodeEnvironmentManager) onResourceClaimAdd(obj any) error {
222233
return nil
223234
}
224235

236+
func (m *MultiNodeEnvironmentManager) onDeviceClassAdd(obj any) error {
237+
dc, ok := obj.(*resourceapi.DeviceClass)
238+
if !ok {
239+
return fmt.Errorf("failed to cast to DeviceClass")
240+
}
241+
242+
klog.Infof("Processing added DeviceClass: %s/%s", dc.Namespace, dc.Name)
243+
244+
if len(dc.OwnerReferences) != 1 {
245+
return nil
246+
}
247+
248+
if dc.OwnerReferences[0].Kind != nvapi.MultiNodeEnvironmentKind {
249+
return nil
250+
}
251+
252+
_, err := m.multiNodeEnvironmentLister.MultiNodeEnvironments(dc.Namespace).Get(dc.OwnerReferences[0].Name)
253+
if err == nil {
254+
return nil
255+
}
256+
if !errors.IsNotFound(err) {
257+
return fmt.Errorf("error retrieving DeviceClass's OwnerReference '%s': %w", dc.OwnerReferences[0].Name, err)
258+
}
259+
260+
if err := m.removeDeviceClassFinalizer(dc.Name); err != nil {
261+
return fmt.Errorf("error removing finalizer on DeviceClass '%s': %w", dc.Name, err)
262+
}
263+
264+
return nil
265+
}
266+
225267
func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, name string) error {
226268
rc, err := m.resourceClaimLister.ResourceClaims(namespace).Get(name)
227269
if err != nil && errors.IsNotFound(err) {
@@ -235,7 +277,7 @@ func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, na
235277

236278
newRC.Finalizers = []string{}
237279
for _, f := range rc.Finalizers {
238-
if f != resourceClaimFinalizer {
280+
if f != multiNodeEnvironmentFinalizer {
239281
newRC.Finalizers = append(newRC.Finalizers, f)
240282
}
241283
}
@@ -247,3 +289,29 @@ func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, na
247289

248290
return nil
249291
}
292+
293+
func (m *MultiNodeEnvironmentManager) removeDeviceClassFinalizer(name string) error {
294+
dc, err := m.deviceClassLister.Get(name)
295+
if err != nil && errors.IsNotFound(err) {
296+
return fmt.Errorf("DeviceClass not found")
297+
}
298+
if err != nil {
299+
return fmt.Errorf("error retrieving DeviceClass: %w", err)
300+
}
301+
302+
newDC := dc.DeepCopy()
303+
304+
newDC.Finalizers = []string{}
305+
for _, f := range dc.Finalizers {
306+
if f != multiNodeEnvironmentFinalizer {
307+
newDC.Finalizers = append(newDC.Finalizers, f)
308+
}
309+
}
310+
311+
_, err = m.clientsets.Core.ResourceV1beta1().DeviceClasses().Update(context.Background(), newDC, metav1.UpdateOptions{})
312+
if err != nil {
313+
return fmt.Errorf("failed to update DeviceClass: %w", err)
314+
}
315+
316+
return nil
317+
}

0 commit comments

Comments
 (0)