Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM --platform=$BUILDPLATFORM golang:1.22.1 AS common-builder
FROM --platform=$BUILDPLATFORM golang:1.24.4 AS common-builder
WORKDIR $GOPATH/src/github.com/run-ai/fake-gpu-operator
COPY go.mod .
COPY go.sum .
Expand All @@ -7,6 +7,11 @@ COPY Makefile .
COPY internal/common ./internal/common
ARG TARGETOS TARGETARCH

FROM common-builder AS compute-domain-controller-builder
COPY ./cmd/compute-domain-controller/ ./cmd/compute-domain-controller/
COPY ./pkg/compute-domain/ ./pkg/compute-domain/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=compute-domain-controller

FROM common-builder AS device-plugin-builder
COPY ./cmd/device-plugin/ ./cmd/device-plugin/
COPY ./internal/deviceplugin/ ./internal/deviceplugin/
Expand Down Expand Up @@ -59,6 +64,10 @@ FROM ubuntu AS status-updater
COPY --from=status-updater-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-updater /bin/
ENTRYPOINT ["/bin/status-updater"]

FROM ubuntu AS compute-domain-controller
COPY --from=compute-domain-controller-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/compute-domain-controller /bin/
ENTRYPOINT ["/bin/compute-domain-controller"]

FROM ubuntu AS status-exporter
COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter /bin/
ENTRYPOINT ["/bin/status-exporter"]
Expand Down
19 changes: 17 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
BUILD_DIR=$(shell pwd)/bin
COMPONENTS?=device-plugin status-updater kwok-gpu-device-plugin status-exporter topology-server mig-faker jupyter-notebook
COMPONENTS?=device-plugin status-updater kwok-gpu-device-plugin status-exporter topology-server mig-faker jupyter-notebook compute-domain-controller

DOCKER_REPO_BASE=gcr.io/run-ai-lab/fake-gpu-operator
DOCKER_REPO_BASE?=gcr.io/run-ai-lab/fake-gpu-operator
DOCKER_TAG?=0.0.0-dev
NAMESPACE=gpu-operator

Expand Down Expand Up @@ -34,6 +34,21 @@ image:
done
.PHONY: image

docker-build-local:
for component in $(COMPONENTS); do \
docker build -t ${DOCKER_REPO_BASE}/$$component:${DOCKER_TAG} --target $$component .; \
done
@echo "Images built and loaded to local Docker daemon"
.PHONY: docker-build-local

# Build the images locally (via docker-build-local), then push them to the registry
docker-push: docker-build-local
for component in $(COMPONENTS); do \
docker push ${DOCKER_REPO_BASE}/$$component:${DOCKER_TAG}; \
done
@echo "Images pushed to registry successfully"
.PHONY: docker-push

test: ginkgo
$(GINKGO) -r --procs=1 --output-dir=/tmp/artifacts/test-results/service-tests --compilers=1 --randomize-all --randomize-suites --fail-on-pending --keep-going --timeout=5m --race --trace --json-report=report.json
.PHONY: test
Expand Down
207 changes: 207 additions & 0 deletions cmd/compute-domain-controller/computedomain_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
/*
* Copyright 2025 The Kubernetes Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
	"context"
	"encoding/json"
	"fmt"

	resourceapi "k8s.io/api/resource/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
	"sigs.k8s.io/controller-runtime/pkg/log"

	computedomainv1beta1 "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
	"github.com/run-ai/fake-gpu-operator/pkg/compute-domain/consts"
)

// DefaultComputeDomainAllocationMode is the allocation mode written into the
// channel configuration when a ComputeDomain does not set one itself.
const DefaultComputeDomainAllocationMode = "Single"

// ComputeDomainReconciler watches ComputeDomain resources and keeps the
// associated ResourceClaimTemplates in sync.
//
// For every live ComputeDomain it makes sure a finalizer is set and that a
// ResourceClaimTemplate named after the domain exists; for a domain that is
// being deleted it removes that template and then releases the finalizer.
type ComputeDomainReconciler struct {
// Client is embedded so the reconciler can call Get/Create/Update/Delete
// directly on itself.
client.Client
// Scheme is handed to controllerutil.SetControllerReference when the
// owner reference on a created template is set.
Scheme *runtime.Scheme
}

//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/finalizers,verbs=update
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch;create;update;patch;delete

// Reconcile brings the cluster in line with a single ComputeDomain.
//
// A domain that can no longer be found is treated as success. A domain with
// a deletion timestamp is passed to handleDeletion. Any other domain gets
// its finalizer and its ResourceClaimTemplate ensured, in that order. Errors
// from any step are returned unchanged; the returned Result is always empty.
func (r *ComputeDomainReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	domain := new(computedomainv1beta1.ComputeDomain)
	if getErr := r.Get(ctx, req.NamespacedName, domain); getErr != nil {
		return ctrl.Result{}, client.IgnoreNotFound(getErr)
	}

	// Deletion path: clean up and stop, nothing else to ensure.
	if !domain.DeletionTimestamp.IsZero() {
		return ctrl.Result{}, r.handleDeletion(ctx, domain)
	}

	// Live object: the finalizer goes on before any dependent object is created.
	if finErr := r.ensureFinalizer(ctx, domain); finErr != nil {
		return ctrl.Result{}, finErr
	}
	if tmplErr := r.ensureResourceClaimTemplates(ctx, domain); tmplErr != nil {
		return ctrl.Result{}, tmplErr
	}

	logger.V(4).Info("reconciled ComputeDomain", "namespace", domain.Namespace, "name", domain.Name)
	return ctrl.Result{}, nil
}

// ensureFinalizer adds consts.ComputeDomainFinalizer to the domain and
// persists the change with Update. It is a no-op, returning nil, when the
// finalizer is already present.
func (r *ComputeDomainReconciler) ensureFinalizer(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
	alreadySet := controllerutil.ContainsFinalizer(domain, consts.ComputeDomainFinalizer)
	if !alreadySet {
		controllerutil.AddFinalizer(domain, consts.ComputeDomainFinalizer)
		return r.Update(ctx, domain)
	}
	return nil
}

// handleDeletion runs the cleanup for a ComputeDomain that is being deleted.
//
// When the domain still carries consts.ComputeDomainFinalizer, it calls
// deleteResourceClaimTemplates and, only if that succeeds, removes the
// finalizer and persists the domain with Update. Without the finalizer
// there is nothing to do and nil is returned.
func (r *ComputeDomainReconciler) handleDeletion(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
	if controllerutil.ContainsFinalizer(domain, consts.ComputeDomainFinalizer) {
		if cleanupErr := r.deleteResourceClaimTemplates(ctx, domain); cleanupErr != nil {
			return cleanupErr
		}

		controllerutil.RemoveFinalizer(domain, consts.ComputeDomainFinalizer)
		return r.Update(ctx, domain)
	}
	return nil
}

// ensureResourceClaimTemplates makes sure the domain's "workload"
// ResourceClaimTemplate exists. The template is named after the domain and
// requests a device of class consts.ComputeDomainWorkloadDeviceClass.
func (r *ComputeDomainReconciler) ensureResourceClaimTemplates(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
	templateName := domain.Name
	deviceClass := consts.ComputeDomainWorkloadDeviceClass
	return r.ensureTemplate(ctx, domain, templateName, deviceClass, "workload")
}

// getAllocationMode returns the channel allocation mode configured on the
// domain, or DefaultComputeDomainAllocationMode when the domain has no
// channel section or the mode in it is empty.
func (r *ComputeDomainReconciler) getAllocationMode(domain *computedomainv1beta1.ComputeDomain) string {
	channel := domain.Spec.Channel
	if channel == nil || channel.AllocationMode == "" {
		return DefaultComputeDomainAllocationMode
	}
	return channel.AllocationMode
}

// ensureTemplate creates the ResourceClaimTemplate called name in the
// domain's namespace if it does not exist yet.
//
// Parameters:
//   - domain: the owning ComputeDomain; it supplies the namespace, the labels,
//     the domainID (its UID) and the controller owner reference.
//   - name: name of the template to look up / create.
//   - deviceClass: DeviceClassName used by the single "channel" request.
//   - templateType: value of the "resource.nvidia.com/computeDomainTarget" label.
//
// A template that already exists under that name is left untouched: its
// contents are not compared against the desired state. A concurrent creation
// (AlreadyExists) is treated as success. Every other error is returned,
// wrapped with the step that failed.
func (r *ComputeDomainReconciler) ensureTemplate(
	ctx context.Context,
	domain *computedomainv1beta1.ComputeDomain,
	name string,
	deviceClass string,
	templateType string,
) error {
	key := client.ObjectKey{Namespace: domain.Namespace, Name: name}
	existing := &resourceapi.ResourceClaimTemplate{}
	err := r.Get(ctx, key, existing)
	if err == nil {
		return nil
	}
	if !apierrors.IsNotFound(err) {
		return fmt.Errorf("getting ResourceClaimTemplate %s: %w", key, err)
	}

	// Encode the opaque driver configuration with encoding/json rather than
	// string formatting, so that every value is escaped correctly and the
	// payload is always valid JSON. Map keys are emitted in sorted order,
	// which keeps the output deterministic.
	params, err := json.Marshal(map[string]string{
		"allocationMode": r.getAllocationMode(domain),
		"apiVersion":     "resource.nvidia.com/v1beta1",
		"domainID":       string(domain.UID),
		"kind":           "ComputeDomainChannelConfig",
	})
	if err != nil {
		return fmt.Errorf("encoding channel config for ResourceClaimTemplate %s: %w", key, err)
	}

	template := &resourceapi.ResourceClaimTemplate{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: domain.Namespace,
			Labels: map[string]string{
				"resource.nvidia.com/computeDomain":       domain.Name,
				"resource.nvidia.com/computeDomainTarget": templateType,
			},
			Finalizers: []string{
				"resource.nvidia.com/computeDomain",
			},
		},
		Spec: resourceapi.ResourceClaimTemplateSpec{
			// Labels copied onto every ResourceClaim stamped out from this template.
			ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{
					"nvidia.com/computeDomain": domain.Name,
				},
			},
			Spec: resourceapi.ResourceClaimSpec{
				Devices: resourceapi.DeviceClaim{
					Config: []resourceapi.DeviceClaimConfiguration{
						{
							DeviceConfiguration: resourceapi.DeviceConfiguration{
								Opaque: &resourceapi.OpaqueDeviceConfiguration{
									Driver: consts.ComputeDomainDriverName,
									Parameters: runtime.RawExtension{
										Raw: params,
									},
								},
							},
						},
					},
					Requests: []resourceapi.DeviceRequest{
						{
							Name: "channel",
							Exactly: &resourceapi.ExactDeviceRequest{
								AllocationMode:  resourceapi.DeviceAllocationModeExactCount,
								Count:           1,
								DeviceClassName: deviceClass,
							},
						},
					},
				},
			},
		},
	}

	if err := controllerutil.SetControllerReference(domain, template, r.Scheme); err != nil {
		return fmt.Errorf("setting owner reference on ResourceClaimTemplate %s: %w", key, err)
	}

	if err := client.IgnoreAlreadyExists(r.Create(ctx, template)); err != nil {
		return fmt.Errorf("creating ResourceClaimTemplate %s: %w", key, err)
	}
	return nil
}

// deleteResourceClaimTemplates removes the ResourceClaimTemplate that is
// named after the domain and lives in the domain's namespace.
//
// The template may carry the "resource.nvidia.com/computeDomain" finalizer.
// While that finalizer is present a Delete call only marks the object for
// deletion and it stays in Terminating, so the finalizer is stripped first
// and the object is deleted afterwards.
//
// A template that is already gone (at any step) counts as success. Any other
// API error is returned unchanged.
func (r *ComputeDomainReconciler) deleteResourceClaimTemplates(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
	const templateFinalizer = "resource.nvidia.com/computeDomain"

	key := client.ObjectKey{Namespace: domain.Namespace, Name: domain.Name}
	template := &resourceapi.ResourceClaimTemplate{}
	if err := r.Get(ctx, key, template); err != nil {
		return client.IgnoreNotFound(err)
	}

	if controllerutil.ContainsFinalizer(template, templateFinalizer) {
		controllerutil.RemoveFinalizer(template, templateFinalizer)
		if err := r.Update(ctx, template); err != nil {
			return client.IgnoreNotFound(err)
		}
	}

	// If the template was already terminating, dropping the finalizer lets
	// the API server remove it, and this Delete then reports NotFound.
	if err := r.Delete(ctx, template); err != nil && !apierrors.IsNotFound(err) {
		return err
	}
	return nil
}

// SetupWithManager registers the reconciler with mgr. The controller is
// declared For ComputeDomain objects and Owns ResourceClaimTemplate objects.
func (r *ComputeDomainReconciler) SetupWithManager(mgr ctrl.Manager) error {
	bldr := ctrl.NewControllerManagedBy(mgr).
		For(&computedomainv1beta1.ComputeDomain{}).
		Owns(&resourceapi.ResourceClaimTemplate{})
	return bldr.Complete(r)
}
Loading
Loading