Skip to content
This repository was archived by the owner on Feb 15, 2025. It is now read-only.

feat(backend): add k3d gpu image builder #797

Merged
merged 31 commits into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5ff5164
feat(backend): add k3d gpu image builder
gphorvath Jul 18, 2024
df654d0
link make commands to root Makefile
gphorvath Jul 18, 2024
b327bea
ensure k3d-gpu Make commands run relative to the directory of the Mak…
gphorvath Jul 18, 2024
2cd65c6
update readme and use kubectl controls the Kubernetes cluster manager.
gphorvath Jul 18, 2024
31b3f13
updates to Makefile
gphorvath Jul 19, 2024
58d55f4
wip: github workflow
gphorvath Jul 19, 2024
27b282c
fix registry variable names and make targets
gphorvath Jul 19, 2024
e7a67d3
use jq for synchronizing versions across Make, Dockerfile, and GitHub…
gphorvath Jul 19, 2024
4d1fe75
working on workflow
gphorvath Jul 19, 2024
8eb2aa8
attempting to figure out why jq can't find config.json
gphorvath Jul 19, 2024
0f1a697
updating Makefile to be more directory aware
gphorvath Jul 19, 2024
bdcc4d0
WIP: trying to figure out jq targeting in GitHub workflow
gphorvath Jul 19, 2024
128d47d
WIP: trying to figure out jq targeting in GitHub workflow
gphorvath Jul 19, 2024
4eb7cb4
figured out jq target in pipeline... maybe?
gphorvath Jul 19, 2024
1d9fb14
wip: zarf package
gphorvath Jul 19, 2024
29bd9e5
adding a the test
gphorvath Jul 19, 2024
ad83b0b
cleanup old stuff now that Zarf package is done
gphorvath Jul 19, 2024
3550eda
working on bundle/package/image
gphorvath Jul 22, 2024
8edb8e5
Merge branch 'main' into justin-the-law
gphorvath Jul 22, 2024
aa06af3
update zarf version in pipeline
gphorvath Jul 22, 2024
f6b7a0f
something wrong with docker copy
gphorvath Jul 22, 2024
4225a00
update relative path
gphorvath Jul 22, 2024
3ceec59
working out relative paths
gphorvath Jul 22, 2024
8d427e8
add uds cli to workflow
gphorvath Jul 22, 2024
61bf05e
fix a typo
gphorvath Jul 22, 2024
a65ddbf
switching from zarf back to make
gphorvath Jul 22, 2024
d171516
make depends
gphorvath Jul 22, 2024
1c0909c
Merge branch 'main' into justin-the-law
gphorvath Jul 22, 2024
996f5da
update workflow to only push on tag
gphorvath Jul 22, 2024
631b479
fix a couple name updates
gphorvath Jul 22, 2024
9b3718c
move image publish to release pipeline
gphorvath Jul 23, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/workflows/build-images.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Builds and publishes supporting container images (currently only k3d-gpu)
# to the GitHub Container Registry.
name: Build Images

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  pull_request:
    types:
      - ready_for_review
      - review_requested
      - synchronize

permissions:
  contents: read
  packages: write  # required to push images to ghcr.io

jobs:
  build-and-publish-images:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Repo
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

      - name: Login to GitHub Container Registry
        uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): this pushes the mutable `latest` tag on every PR
      # `synchronize` event, so unreviewed branches can overwrite `latest`.
      # Consider gating the `docker push` on tags or the default branch.
      # K3S_TAG / CUDA_TAG are duplicated in packages/k3d-gpu/Makefile and
      # packages/k3d-gpu/Dockerfile — keep all three in sync.
      - name: Build and Publish k3d-gpu
        run: |
          docker build \
            --platform linux/amd64 \
            --build-arg K3S_TAG=v1.28.8-k3s1 \
            --build-arg CUDA_TAG=12.4.1-base-ubuntu22.04 \
            -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \
            -f packages/k3d-gpu/Dockerfile .
          docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ gen-python: ## Generate the protobufs for the OpenAI typing within the leapfroga
local-registry: ## Start up a local container registry. Errors in this target are ignored.
	-docker run -d -p ${REG_PORT}:5000 --restart=always --name registry registry:2

# Stop and remove the local registry.
# NOTE(review): REGISTRY_NAME is defined in packages/k3d-gpu/Makefile, which is
# included at the bottom of this file — confirm it stays included, otherwise
# this target expands to an empty name.
clean-registry: ## Stop and remove the local container registry. Errors are ignored.
	@echo "Cleaning up..."
	@docker stop ${REGISTRY_NAME} || true
	@docker rm ${REGISTRY_NAME} || true

sdk-wheel: ## build wheels for the leapfrogai_sdk package as a dependency for other lfai components
docker build --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${LOCAL_VERSION} -f src/leapfrogai_sdk/Dockerfile .

Expand Down Expand Up @@ -151,3 +157,5 @@ build-gpu: build-supabase build-api build-ui build-vllm build-text-embeddings bu
build-all: build-cpu build-gpu ## Build all of the LFAI packages

include tests/make-tests.mk

include packages/k3d-gpu/Makefile
34 changes: 34 additions & 0 deletions packages/k3d-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# syntax=docker/dockerfile:1.7-labs
# The labs syntax directive above is required for the `COPY --exclude` flag
# used below; without it the build fails on standard BuildKit releases.

# Pinned upstream versions; keep in sync with packages/k3d-gpu/Makefile and
# .github/workflows/build-images.yaml.
ARG K3S_TAG="v1.28.8-k3s1"
ARG CUDA_TAG="12.4.1-base-ubuntu22.04"

# Stage 1: source of the k3s root filesystem and binaries.
FROM rancher/k3s:$K3S_TAG AS k3s
# Stage 2: CUDA-enabled Ubuntu base that the final image is built on.
FROM nvidia/cuda:$CUDA_TAG

# Install the NVIDIA container toolkit and register the nvidia runtime with
# containerd. The apt lists are removed afterwards to keep the layer small.
RUN apt-get update && apt-get install -y curl \
    && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
    && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
    tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
    && apt-get update && apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux \
    && nvidia-ctk runtime configure --runtime=containerd \
    && rm -rf /var/lib/apt/lists/*

# Overlay the k3s root filesystem onto the CUDA base. /bin is excluded from
# the bulk copy and copied separately, mirroring the upstream k3d CUDA guide.
# COPY flags must precede the src/dest arguments.
COPY --from=k3s --exclude=/bin/ / /
COPY --from=k3s /bin /bin

# Deploy the nvidia device plugin (and its RuntimeClass) on cluster startup
# via the k3s auto-deploying manifests directory.
COPY plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml

VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log

# DIFF vs upstream k3s image: raise inotify limits to resolve fsnotify
# "too many open files" issues.
# NOTE(review): `sysctl -w` at build time modifies the build container's
# kernel only and is not persisted into the image — confirm these are
# effective, or set them on the host / via k3d args instead.
RUN sysctl -w fs.inotify.max_user_watches=100000 \
    && sysctl -w fs.inotify.max_user_instances=100000

ENV PATH="$PATH:/bin/aux"

ENTRYPOINT ["/bin/k3s"]
CMD ["agent"]
39 changes: 39 additions & 0 deletions packages/k3d-gpu/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
REGISTRY_NAME := registry
REGISTRY_PORT := 5000
ORGANIZATION := defenseunicorns
PLATFORM := linux/amd64
TAG := latest
K3D_CLUSTER_NAME := k3d-core-slim-dev:0.24.0
K3S_TAG := v1.28.8-k3s1
CUDA_TAG := 12.4.1-base-ubuntu22.04

# Create local Docker registry
local-registry:
@echo "Creating local Docker registry..."
-@docker run -d -p ${REGISTRY_PORT}:5000 --name ${REGISTRY_NAME} registry:2
@echo "Local registry created at localhost:${REGISTRY_PORT}"

build-k3d-gpu: local-registry
@docker build \
--platform=${PLATFORM} \
--build-arg K3S_TAG=${K3S_TAG} \
--build-arg CUDA_TAG=${CUDA_TAG} \
-t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} \
-f ${MAKEFILE_DIR}/Dockerfile ${MAKEFILE_DIR}
@docker tag ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}

push-k3d-gpu: local-registry build-k3d-gpu
@docker push localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}

uds-gpu-cluster: local-registry build-k3d-gpu push-k3d-gpu
uds deploy ${K3D_CLUSTER_NAME} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm

test-k3d-gpu:
@uds zarf tools kubectl apply -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml
@uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
@uds zarf tools kubectl logs -l app=gpu-pod
@uds zarf tools kubectl delete -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml

.PHONY: all local-registry build-k3d-gpu push-k3d-gpu test-k3d-gpu
28 changes: 28 additions & 0 deletions packages/k3d-gpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# K3D GPU

Prepares `k3s` + `nvidia/cuda` base image that enables a K3D cluster to have access to your host machine's NVIDIA, CUDA-capable GPU(s).

## Prerequisites

* Docker: https://www.docker.com/
* K3D: https://k3d.io/
* UDS-CLI: https://github.com/defenseunicorns/uds-cli
* Modern NVIDIA GPU with CUDA cores and drivers must be present. Additionally, the CUDA toolkit and NVIDIA container toolkit must be installed.

## Usage

Check out the Make targets for the various options.

### Local

```shell
make push-k3d-gpu # build and push image to a local registry

make uds-gpu-cluster # create a uds cluster equipped with the k3d-gpu image

make test-k3d-gpu # deploy a test gpu pod to see if everything is working
```

## References

* https://k3d.io/v5.7.2/usage/advanced/cuda/
61 changes: 61 additions & 0 deletions packages/k3d-gpu/plugin/device-plugin-daemonset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# RuntimeClass so pods can explicitly request the nvidia container runtime
# configured by nvidia-ctk in the image build.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
# NVIDIA device plugin, which advertises nvidia.com/gpu resources to the
# kubelet on every node.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-daemonset
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-daemonset
    spec:
      runtimeClassName: nvidia  # Explicitly request the nvidia runtime
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        # NOTE(review): this pins a release candidate; prefer the stable
        # v0.15.0 release once validated.
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
          name: nvidia-device-plugin-ctr
          env:
            - name: PASS_DEVICE_SPECS
              value: "true"
            - name: FAIL_ON_INIT_ERROR
              value: "true"
            - name: DEVICE_LIST_STRATEGY
              value: envvar
            - name: DEVICE_ID_STRATEGY
              value: uuid
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: MPS_ROOT
              value: /run/nvidia/mps
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
21 changes: 21 additions & 0 deletions packages/k3d-gpu/test/cuda-vector-add.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Smoke-test pod: runs NVIDIA's CUDA vector-add sample to completion to verify
# that the cluster can schedule and execute GPU workloads.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
  labels:
    app: gpu-pod
spec:
  runtimeClassName: nvidia
  restartPolicy: Never
  containers:
    - name: cuda-container
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
      resources:
        limits:
          nvidia.com/gpu: "1"  # requesting 1 GPU
          cpu: "1"
          memory: 512Mi  # canonical form of 0.5Gi
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
Loading