Skip to content
This repository was archived by the owner on Feb 15, 2025. It is now read-only.

feat(backend): add k3d gpu image builder #797

Merged
merged 31 commits into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5ff5164
feat(backend): add k3d gpu image builder
gphorvath Jul 18, 2024
df654d0
link make commands to root Makefile
gphorvath Jul 18, 2024
b327bea
ensure k3d-gpu Make commands run relative to the directory of the Mak…
gphorvath Jul 18, 2024
2cd65c6
update readme and use kubectl controls the Kubernetes cluster manager.
gphorvath Jul 18, 2024
31b3f13
updates to Makefile
gphorvath Jul 19, 2024
58d55f4
wip: github workflow
gphorvath Jul 19, 2024
27b282c
fix registry variable names and make targets
gphorvath Jul 19, 2024
e7a67d3
use jq for synchronizing versions across Make, Dockerfile, and GitHub…
gphorvath Jul 19, 2024
4d1fe75
working on workflow
gphorvath Jul 19, 2024
8eb2aa8
attempting to figure out why jq can't find config.json
gphorvath Jul 19, 2024
0f1a697
updating Makefile to be more directory aware
gphorvath Jul 19, 2024
bdcc4d0
WIP: trying to figure out jq targeting in GitHub workflow
gphorvath Jul 19, 2024
128d47d
WIP: trying to figure out jq targeting in GitHub workflow
gphorvath Jul 19, 2024
4eb7cb4
figured out jq target in pipeline... maybe?
gphorvath Jul 19, 2024
1d9fb14
wip: zarf package
gphorvath Jul 19, 2024
29bd9e5
adding a the test
gphorvath Jul 19, 2024
ad83b0b
cleanup old stuff now that Zarf package is done
gphorvath Jul 19, 2024
3550eda
working on bundle/package/image
gphorvath Jul 22, 2024
8edb8e5
Merge branch 'main' into justin-the-law
gphorvath Jul 22, 2024
aa06af3
update zarf version in pipeline
gphorvath Jul 22, 2024
f6b7a0f
something wrong with docker copy
gphorvath Jul 22, 2024
4225a00
update relative path
gphorvath Jul 22, 2024
3ceec59
working out relative paths
gphorvath Jul 22, 2024
8d427e8
add uds cli to workflow
gphorvath Jul 22, 2024
61bf05e
fix a typo
gphorvath Jul 22, 2024
a65ddbf
switching from zarf back to make
gphorvath Jul 22, 2024
d171516
make depends
gphorvath Jul 22, 2024
1c0909c
Merge branch 'main' into justin-the-law
gphorvath Jul 22, 2024
996f5da
update workflow to only push on tag
gphorvath Jul 22, 2024
631b479
fix a couple name updates
gphorvath Jul 22, 2024
9b3718c
move image publish to release pipeline
gphorvath Jul 23, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/workflows/build-images.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Builds and publishes supporting container images (currently only k3d-gpu)
# to the GitHub Container Registry.
name: Build Images

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  pull_request:
    types:
      - ready_for_review
      - review_requested
      - synchronize

permissions:
  contents: read
  packages: write  # required to push images to ghcr.io

jobs:
  build-and-publish-images:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Repo
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

      - name: Login to GitHub Container Registry
        uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): this pushes the mutable `latest` tag on every PR
      # `synchronize` event, so unreviewed branches can overwrite `latest`.
      # Consider gating the `docker push` on tags or the default branch.
      # K3S_TAG / CUDA_TAG are duplicated in packages/k3d-gpu/Makefile and
      # packages/k3d-gpu/Dockerfile — keep all three in sync.
      - name: Build and Publish k3d-gpu
        run: |
          docker build \
            --platform linux/amd64 \
            --build-arg K3S_TAG=v1.28.8-k3s1 \
            --build-arg CUDA_TAG=12.4.1-base-ubuntu22.04 \
            -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \
            -f packages/k3d-gpu/Dockerfile .
          docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ gen-python: ## Generate the protobufs for the OpenAI typing within the leapfroga
local-registry: ## Start up a local container registry. Errors in this target are ignored.
	-docker run -d -p ${REG_PORT}:5000 --restart=always --name registry registry:2

# Stop and remove the local registry.
# NOTE(review): REGISTRY_NAME is defined in packages/k3d-gpu/Makefile, which is
# included at the bottom of this file — confirm it stays included, otherwise
# this target expands to an empty name.
clean-registry: ## Stop and remove the local container registry. Errors are ignored.
	@echo "Cleaning up..."
	@docker stop ${REGISTRY_NAME} || true
	@docker rm ${REGISTRY_NAME} || true

sdk-wheel: ## build wheels for the leapfrogai_sdk package as a dependency for other lfai components
docker build --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${LOCAL_VERSION} -f src/leapfrogai_sdk/Dockerfile .

Expand Down Expand Up @@ -151,3 +157,5 @@ build-gpu: build-supabase build-api build-ui build-vllm build-text-embeddings bu
build-all: build-cpu build-gpu ## Build all of the LFAI packages

include tests/make-tests.mk

include packages/k3d-gpu/Makefile
34 changes: 34 additions & 0 deletions packages/k3d-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# syntax=docker/dockerfile:1.7-labs
# The labs syntax directive above is required for the `COPY --exclude` flag
# used below; without it the build fails on standard BuildKit releases.

# Pinned upstream versions; keep in sync with packages/k3d-gpu/Makefile and
# .github/workflows/build-images.yaml.
ARG K3S_TAG="v1.28.8-k3s1"
ARG CUDA_TAG="12.4.1-base-ubuntu22.04"

# Stage 1: source of the k3s root filesystem and binaries.
FROM rancher/k3s:$K3S_TAG AS k3s
# Stage 2: CUDA-enabled Ubuntu base that the final image is built on.
FROM nvidia/cuda:$CUDA_TAG

# Install the NVIDIA container toolkit and register the nvidia runtime with
# containerd. The apt lists are removed afterwards to keep the layer small.
RUN apt-get update && apt-get install -y curl \
    && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
    && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
    tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
    && apt-get update && apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux \
    && nvidia-ctk runtime configure --runtime=containerd \
    && rm -rf /var/lib/apt/lists/*

# Overlay the k3s root filesystem onto the CUDA base. /bin is excluded from
# the bulk copy and copied separately, mirroring the upstream k3d CUDA guide.
# COPY flags must precede the src/dest arguments.
COPY --from=k3s --exclude=/bin/ / /
COPY --from=k3s /bin /bin

# Deploy the nvidia device plugin (and its RuntimeClass) on cluster startup
# via the k3s auto-deploying manifests directory.
COPY plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml

VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log

# DIFF vs upstream k3s image: raise inotify limits to resolve fsnotify
# "too many open files" issues.
# NOTE(review): `sysctl -w` at build time modifies the build container's
# kernel only and is not persisted into the image — confirm these are
# effective, or set them on the host / via k3d args instead.
RUN sysctl -w fs.inotify.max_user_watches=100000 \
    && sysctl -w fs.inotify.max_user_instances=100000

ENV PATH="$PATH:/bin/aux"

ENTRYPOINT ["/bin/k3s"]
CMD ["agent"]
39 changes: 39 additions & 0 deletions packages/k3d-gpu/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
REGISTRY_NAME := registry
REGISTRY_PORT := 5000
ORGANIZATION := defenseunicorns
PLATFORM := linux/amd64
TAG := latest
K3D_CLUSTER_NAME := k3d-core-slim-dev:0.24.0
K3S_TAG := v1.28.8-k3s1
CUDA_TAG := 12.4.1-base-ubuntu22.04

# Create local Docker registry
local-registry:
@echo "Creating local Docker registry..."
-@docker run -d -p ${REGISTRY_PORT}:5000 --name ${REGISTRY_NAME} registry:2
@echo "Local registry created at localhost:${REGISTRY_PORT}"

build-k3d-gpu: local-registry
@docker build \
--platform=${PLATFORM} \
--build-arg K3S_TAG=${K3S_TAG} \
--build-arg CUDA_TAG=${CUDA_TAG} \
-t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} \
-f ${MAKEFILE_DIR}/Dockerfile ${MAKEFILE_DIR}
@docker tag ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}

push-k3d-gpu: local-registry build-k3d-gpu
@docker push localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}

uds-gpu-cluster: local-registry build-k3d-gpu push-k3d-gpu
uds deploy ${K3D_CLUSTER_NAME} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm

test-k3d-gpu:
@uds zarf tools kubectl apply -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml
@uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
@uds zarf tools kubectl logs -l app=gpu-pod
@uds zarf tools kubectl delete -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml

.PHONY: all local-registry build-k3d-gpu push-k3d-gpu test-k3d-gpu
28 changes: 28 additions & 0 deletions packages/k3d-gpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# K3D GPU

Prepares `k3s` + `nvidia/cuda` base image that enables a K3D cluster to have access to your host machine's NVIDIA, CUDA-capable GPU(s).

## Prerequisites

* Docker: https://www.docker.com/
* K3D: https://k3d.io/
* UDS-CLI: https://github.com/defenseunicorns/uds-cli
* Modern NVIDIA GPU with CUDA cores and drivers must be present. Additionally, the CUDA toolkit and NVIDIA container toolkit must be installed.

## Usage

Check out the Make targets for the various options.

### Local

```shell
make push-k3d-gpu # build and push image to a local registry

make uds-gpu-cluster # create a uds cluster equipped with the k3d-gpu image

make test-k3d-gpu # deploy a test gpu pod to see if everything is working
```

## References

* https://k3d.io/v5.7.2/usage/advanced/cuda/
61 changes: 61 additions & 0 deletions packages/k3d-gpu/plugin/device-plugin-daemonset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# RuntimeClass so pods can explicitly request the nvidia container runtime
# configured by nvidia-ctk in the image build.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
# NVIDIA device plugin, which advertises nvidia.com/gpu resources to the
# kubelet on every node.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-daemonset
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-daemonset
    spec:
      runtimeClassName: nvidia  # Explicitly request the nvidia runtime
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        # NOTE(review): this pins a release candidate; prefer the stable
        # v0.15.0 release once validated.
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
          name: nvidia-device-plugin-ctr
          env:
            - name: PASS_DEVICE_SPECS
              value: "true"
            - name: FAIL_ON_INIT_ERROR
              value: "true"
            - name: DEVICE_LIST_STRATEGY
              value: envvar
            - name: DEVICE_ID_STRATEGY
              value: uuid
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: MPS_ROOT
              value: /run/nvidia/mps
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
21 changes: 21 additions & 0 deletions packages/k3d-gpu/test/cuda-vector-add.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Smoke-test pod: runs NVIDIA's CUDA vector-add sample to completion to verify
# that the cluster can schedule and execute GPU workloads.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
  labels:
    app: gpu-pod
spec:
  runtimeClassName: nvidia
  restartPolicy: Never
  containers:
    - name: cuda-container
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
      resources:
        limits:
          nvidia.com/gpu: "1"  # requesting 1 GPU
          cpu: "1"
          memory: 512Mi  # canonical form of 0.5Gi
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
Loading