From 5ff5164db2e58b745eec4ace9df4570228db8f2d Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Thu, 18 Jul 2024 16:18:59 -0400 Subject: [PATCH 01/29] feat(backend): add k3d gpu image builder --- packages/k3d-gpu/Dockerfile | 34 +++++++++++ packages/k3d-gpu/Makefile | 38 ++++++++++++ packages/k3d-gpu/README.md | 38 ++++++++++++ .../plugin/device-plugin-daemonset.yaml | 61 +++++++++++++++++++ packages/k3d-gpu/test/cuda-vector-add.yaml | 21 +++++++ 5 files changed, 192 insertions(+) create mode 100644 packages/k3d-gpu/Dockerfile create mode 100644 packages/k3d-gpu/Makefile create mode 100644 packages/k3d-gpu/README.md create mode 100644 packages/k3d-gpu/plugin/device-plugin-daemonset.yaml create mode 100644 packages/k3d-gpu/test/cuda-vector-add.yaml diff --git a/packages/k3d-gpu/Dockerfile b/packages/k3d-gpu/Dockerfile new file mode 100644 index 000000000..de269aff1 --- /dev/null +++ b/packages/k3d-gpu/Dockerfile @@ -0,0 +1,34 @@ +ARG K3S_TAG="v1.28.8-k3s1" +ARG CUDA_TAG="12.4.1-base-ubuntu22.04" + +FROM rancher/k3s:$K3S_TAG AS k3s +FROM nvidia/cuda:$CUDA_TAG + +# Install the NVIDIA container toolkit +RUN apt-get update && apt-get install -y curl \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ + && apt-get update && apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux \ + && nvidia-ctk runtime configure --runtime=containerd + +COPY --from=k3s / / --exclude=/bin/ +COPY --from=k3s /bin /bin + +# Deploy the nvidia driver plugin on startup +COPY plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml + +VOLUME /var/lib/kubelet +VOLUME /var/lib/rancher/k3s +VOLUME /var/lib/cni +VOLUME /var/log + +# DIFF: resolve fsnotify issues +RUN sysctl -w fs.inotify.max_user_watches=100000 +RUN sysctl -w fs.inotify.max_user_instances=100000 + +ENV PATH="$PATH:/bin/aux" + +ENTRYPOINT ["/bin/k3s"] +CMD ["agent"] diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile new file mode 100644 index 000000000..40d1bcd62 --- /dev/null +++ b/packages/k3d-gpu/Makefile @@ -0,0 +1,38 @@ + +REGISTRY_NAME := registry +REGISTRY_PORT := 5000 +SHELL_SCRIPT := build.sh +ORGANIZATION := defenseunicorns +PLATFORM := linux/amd64 +TAG := latest +K3D_CLUSTER_NAME := k3d-core-slim-dev:0.24.0 + + +# Create local Docker registry +local-registry: + @echo "Creating local Docker registry..." + -@docker run -d -p ${REGISTRY_PORT}:5000 --name ${REGISTRY_NAME} registry:2 + @echo "Local registry created at localhost:${REGISTRY_PORT}" + +build-k3d: local-registry + @docker build --platform=${PLATFORM} -t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} . 
+ @docker tag ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} + +push-k3d: local-registry build-k3d + @docker push localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} + +uds: local-registry build-k3d push-k3d + uds deploy ${K3D_CLUSTER_NAME} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm + +test: + @kubectl apply -f test/cuda-vector-add.yaml + @kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod + @kubectl logs -l app=gpu-pod + +# Clean up: Stop and remove the local registry +clean-registry: + @echo "Cleaning up..." + @docker stop ${REGISTRY_NAME} || true + @docker rm ${REGISTRY_NAME} || true + +.PHONY: all local-registry build-k3d push-k3d clean-registry diff --git a/packages/k3d-gpu/README.md b/packages/k3d-gpu/README.md new file mode 100644 index 000000000..b564e83b5 --- /dev/null +++ b/packages/k3d-gpu/README.md @@ -0,0 +1,38 @@ +# K3D GPU + +Prepares `k3s` + `nvidia/cuda` base image that enables a K3D cluster to have access to your host machine's NVIDIA, CUDA-capable GPU(s). + +## Pre-Requisites + +Access to GitHub and GitHub Container Registry. Please follow the [GitHub Container Registry instructions](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry). + +Docker and all of its dependencies must be installed. + +For the container GPU test, a NVIDIA GPU with CUDA cores and drivers must be present. Additionally, the CUDA toolkit and NVIDIA container toolkit must be installed. + +For Kubernetes testing and pre-requisites, please see [Kubernetes Deployment](#kubernetes-deployment) for details. + +## Usage + +### Building and Pushing the Image + +Check out the Make targets for the various options. + +## Kubernetes Deployment + +Follow the instructions in the [zarf-package-k3d-airgap](https://github.com/defenseunicorns/zarf-package-k3d-airgap) repository for bootstrapping a K3d cluster that can access your NVIDIA GPUs. + +You can also a use more abstracted version of the above Kubernetes deployment by following the instructions in the [uds-leapfrogai](https://github.com/defenseunicorns/uds-leapfrogai/tree/main/bundles/gpu) bundle repository. + +## Test + +Run: + +```shell +kubectl apply -f test/cuda-vector-add.yaml +kubectl logs cuda-vector-add +``` + +## References + +* https://k3d.io/v5.7.2/usage/advanced/cuda/ diff --git a/packages/k3d-gpu/plugin/device-plugin-daemonset.yaml b/packages/k3d-gpu/plugin/device-plugin-daemonset.yaml new file mode 100644 index 000000000..202280341 --- /dev/null +++ b/packages/k3d-gpu/plugin/device-plugin-daemonset.yaml @@ -0,0 +1,61 @@ +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: nvidia-device-plugin-daemonset + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-device-plugin-daemonset + spec: + runtimeClassName: nvidia # Explicitly request the runtime + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. 
+ # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + containers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 + name: nvidia-device-plugin-ctr + env: + - name: PASS_DEVICE_SPECS + value: "true" + - name: FAIL_ON_INIT_ERROR + value: "true" + - name: DEVICE_LIST_STRATEGY + value: envvar + - name: DEVICE_ID_STRATEGY + value: uuid + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: all + - name: MPS_ROOT + value: /run/nvidia/mps + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins diff --git a/packages/k3d-gpu/test/cuda-vector-add.yaml b/packages/k3d-gpu/test/cuda-vector-add.yaml new file mode 100644 index 000000000..019881296 --- /dev/null +++ b/packages/k3d-gpu/test/cuda-vector-add.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod + labels: + app: gpu-pod +spec: + runtimeClassName: nvidia + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2 + resources: + limits: + nvidia.com/gpu: "1" # requesting 1 GPU + cpu: "1" + memory: 0.5Gi + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule From df654d03482cd907e2099bf1d8cb8874d4812d44 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Thu, 18 Jul 2024 16:25:53 -0400 Subject: [PATCH 02/29] link make commands to root Makefile --- Makefile | 8 ++++++++ packages/k3d-gpu/Makefile | 17 +++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index e83e76d5f..b13b0ab43 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,12 @@ gen-python: ## Generate the protobufs for the OpenAI typing within the leapfroga local-registry: ## Start up a local container registry. Errors in this target are ignored. -docker run -d -p ${REG_PORT}:5000 --restart=always --name registry registry:2 +# Clean up: Stop and remove the local registry +clean-registry: + @echo "Cleaning up..." + @docker stop ${REGISTRY_NAME} || true + @docker rm ${REGISTRY_NAME} || true + sdk-wheel: ## build wheels for the leapfrogai_sdk package as a dependency for other lfai components docker build --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${LOCAL_VERSION} -f src/leapfrogai_sdk/Dockerfile . @@ -151,3 +157,5 @@ build-gpu: build-supabase build-api build-ui build-vllm build-text-embeddings bu build-all: build-cpu build-gpu ## Build all of the LFAI packages include tests/make-tests.mk + +include packages/k3d-gpu/Makefile diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index 40d1bcd62..3ed565fe0 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -7,32 +7,25 @@ PLATFORM := linux/amd64 TAG := latest K3D_CLUSTER_NAME := k3d-core-slim-dev:0.24.0 - # Create local Docker registry local-registry: @echo "Creating local Docker registry..." -@docker run -d -p ${REGISTRY_PORT}:5000 --name ${REGISTRY_NAME} registry:2 @echo "Local registry created at localhost:${REGISTRY_PORT}" -build-k3d: local-registry +build-k3d-gpu: local-registry @docker build --platform=${PLATFORM} -t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} . 
 	@docker tag ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}
 
-push-k3d: local-registry build-k3d
+push-k3d-gpu: local-registry build-k3d-gpu
 	@docker push localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}
 
-uds: local-registry build-k3d push-k3d
+uds-gpu-cluster: local-registry build-k3d-gpu push-k3d-gpu
 	uds deploy ${K3D_CLUSTER_NAME} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm
 
-test:
+test-k3d-gpu:
 	@kubectl apply -f test/cuda-vector-add.yaml
 	@kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
 	@kubectl logs -l app=gpu-pod
 
-# Clean up: Stop and remove the local registry
-clean-registry:
-	@echo "Cleaning up..."
-	@docker stop ${REGISTRY_NAME} || true
-	@docker rm ${REGISTRY_NAME} || true
-
-.PHONY: all local-registry build-k3d push-k3d clean-registry
+.PHONY: all local-registry build-k3d-gpu push-k3d-gpu test-k3d-gpu

From b327bea2f83ffe1f8cec7bbd83736d0193725c33 Mon Sep 17 00:00:00 2001
From: Gregory Horvath
Date: Thu, 18 Jul 2024 16:34:41 -0400
Subject: [PATCH 03/29] ensure k3d-gpu Make commands run relative to the directory of the Makefile

---
 packages/k3d-gpu/Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile
index 3ed565fe0..4aacc6411 100644
--- a/packages/k3d-gpu/Makefile
+++ b/packages/k3d-gpu/Makefile
@@ -1,4 +1,5 @@
 
+MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 REGISTRY_NAME := registry
 REGISTRY_PORT := 5000
 SHELL_SCRIPT := build.sh
@@ -14,7 +15,7 @@ local-registry:
 	@echo "Local registry created at localhost:${REGISTRY_PORT}"
 
 build-k3d-gpu: local-registry
-	@docker build --platform=${PLATFORM} -t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} .
+	@docker build --platform=${PLATFORM} -t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} -f ${MAKEFILE_DIR}/Dockerfile ${MAKEFILE_DIR}
 	@docker tag ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}
 
 push-k3d-gpu: local-registry build-k3d-gpu
@@ -24,7 +25,7 @@ uds-gpu-cluster: local-registry build-k3d-gpu push-k3d-gpu
 	uds deploy ${K3D_CLUSTER_NAME} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm
 
 test-k3d-gpu:
-	@kubectl apply -f test/cuda-vector-add.yaml
+	@kubectl apply -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml
 	@kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
 	@kubectl logs -l app=gpu-pod
 
From 2cd65c627a7c9a6d6937455c20d19b57f0c2b5b6 Mon Sep 17 00:00:00 2001
From: Gregory Horvath
Date: Thu, 18 Jul 2024 16:52:49 -0400
Subject: [PATCH 04/29] update readme and use `uds zarf tools kubectl` instead of raw `kubectl`
--- packages/k3d-gpu/Makefile | 7 ++++--- packages/k3d-gpu/README.md | 28 +++++++++------------------- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index 4aacc6411..6a8ea10d3 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -25,8 +25,9 @@ uds-gpu-cluster: local-registry build-k3d-gpu push-k3d-gpu uds deploy ${K3D_CLUSTER_NAME} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm test-k3d-gpu: - @kubectl apply -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml - @kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod - @kubectl logs -l app=gpu-pod + @uds zarf tools kubectl apply -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml + @uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod + @uds zarf tools kubectl logs -l app=gpu-pod + @uds zarf tools kubectl delete -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml .PHONY: all local-registry build-k3d-gpu push-k3d-gpu test-k3d-gpu diff --git a/packages/k3d-gpu/README.md b/packages/k3d-gpu/README.md index b564e83b5..f35ac09d1 100644 --- a/packages/k3d-gpu/README.md +++ b/packages/k3d-gpu/README.md @@ -4,33 +4,23 @@ Prepares `k3s` + `nvidia/cuda` base image that enables a K3D cluster to have acc ## Pre-Requisites -Access to GitHub and GitHub Container Registry. Please follow the [GitHub Container Registry instructions](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry). - -Docker and all of its dependencies must be installed. - -For the container GPU test, a NVIDIA GPU with CUDA cores and drivers must be present. Additionally, the CUDA toolkit and NVIDIA container toolkit must be installed. - -For Kubernetes testing and pre-requisites, please see [Kubernetes Deployment](#kubernetes-deployment) for details. +* Docker: https://www.docker.com/ +* K3D: https://k3d.io/ +* UDS-CLI: https://github.com/defenseunicorns/uds-cli +* Modern NVIDIA GPU with CUDA cores and drivers must be present. Additionally, the CUDA toolkit and NVIDIA container toolkit must be installed. ## Usage -### Building and Pushing the Image - Check out the Make targets for the various options. -## Kubernetes Deployment +### Local -Follow the instructions in the [zarf-package-k3d-airgap](https://github.com/defenseunicorns/zarf-package-k3d-airgap) repository for bootstrapping a K3d cluster that can access your NVIDIA GPUs. - -You can also a use more abstracted version of the above Kubernetes deployment by following the instructions in the [uds-leapfrogai](https://github.com/defenseunicorns/uds-leapfrogai/tree/main/bundles/gpu) bundle repository. 
- -## Test +```shell +make push-k3d-gpu # build and push image to a local registry -Run: +make uds-gpu-cluster # create a uds cluster equipped with the k3d-gpu image -```shell -kubectl apply -f test/cuda-vector-add.yaml -kubectl logs cuda-vector-add +make test-k3d-gpu # deploy a test gpu pod to see if everything is working ``` ## References From 31b3f13e8a025c8170627ae46e11729c1948770b Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Thu, 18 Jul 2024 21:32:41 -0400 Subject: [PATCH 05/29] updates to Makefile --- packages/k3d-gpu/Makefile | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index 6a8ea10d3..9bea9be82 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -2,11 +2,12 @@ MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) REGISTRY_NAME := registry REGISTRY_PORT := 5000 -SHELL_SCRIPT := build.sh ORGANIZATION := defenseunicorns PLATFORM := linux/amd64 TAG := latest K3D_CLUSTER_NAME := k3d-core-slim-dev:0.24.0 +K3S_TAG := v1.28.8-k3s1 +CUDA_TAG := 12.4.1-base-ubuntu22.04 # Create local Docker registry local-registry: @@ -15,7 +16,12 @@ local-registry: @echo "Local registry created at localhost:${REGISTRY_PORT}" build-k3d-gpu: local-registry - @docker build --platform=${PLATFORM} -t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} -f ${MAKEFILE_DIR}/Dockerfile ${MAKEFILE_DIR} + @docker build \ + --platform=${PLATFORM} \ + --build-arg K3S_TAG=${K3S_TAG} \ + --build-arg CUDA_TAG=${CUDA_TAG} \ + -t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} \ + -f ${MAKEFILE_DIR}/Dockerfile ${MAKEFILE_DIR} @docker tag ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} push-k3d-gpu: local-registry build-k3d-gpu From 58d55f406b5529f81f3e677f24e5fdba6d7ef222 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Thu, 18 Jul 2024 21:43:06 -0400 Subject: [PATCH 06/29] wip: github workflow --- .github/workflows/build-images.yaml | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/workflows/build-images.yaml diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml new file mode 100644 index 000000000..0bdbc107c --- /dev/null +++ b/.github/workflows/build-images.yaml @@ -0,0 +1,38 @@ +name: Build Images + +on: + workflow_dispatch: + pull_request: + types: + - ready_for_review + - review_requested + - synchronize + +permissions: + contents: read + packages: write + +jobs: + build-and-publish-images: + runs-on: ubuntu-latest + + steps: + - name: Checkout Repo + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: Login to GitHub Container Registry + uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and Publish k3d-gpu + run: | + docker build \ + --platform linux/amd64 \ + --build-arg K3S_TAG=v1.28.8-k3s1 \ + --build-arg CUDA_TAG=12.4.1-base-ubuntu22.04 \ + -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \ + -f packages/k3d-gpu/Dockerfile . 
+ docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest From 27b282c8693519b30ffa9777dd4e49385ba4d75c Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 10:18:37 -0400 Subject: [PATCH 07/29] fix registry variable names and make targets --- Makefile | 10 +++++++--- packages/k3d-gpu/Makefile | 18 +++++++++--------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index b13b0ab43..5ae01e870 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ ARCH ?= amd64 KEY ?= "" REG_PORT ?= 5000 +REG_NAME ?= registry VERSION ?= $(shell git describe --abbrev=0 --tags) LOCAL_VERSION ?= $(shell git rev-parse --short HEAD) @@ -33,13 +34,16 @@ gen-python: ## Generate the protobufs for the OpenAI typing within the leapfroga src/leapfrogai_sdk/proto/leapfrogai_sdk/**/*.proto local-registry: ## Start up a local container registry. Errors in this target are ignored. - -docker run -d -p ${REG_PORT}:5000 --restart=always --name registry registry:2 + @echo "Creating local Docker registry..." + -@docker run -d -p ${REG_PORT}:5000 --restart=always --name ${REG_NAME} registry:2 + @echo "Local registry created at localhost:${REG_PORT}" + # Clean up: Stop and remove the local registry clean-registry: @echo "Cleaning up..." - @docker stop ${REGISTRY_NAME} || true - @docker rm ${REGISTRY_NAME} || true + @docker stop registry + @docker rm ${REG_NAME} sdk-wheel: ## build wheels for the leapfrogai_sdk package as a dependency for other lfai components docker build --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${LOCAL_VERSION} -f src/leapfrogai_sdk/Dockerfile . diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index 9bea9be82..d0513381a 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -1,7 +1,8 @@ +REG_PORT ?= 5000 +REG_NAME ?= registry + MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -REGISTRY_NAME := registry -REGISTRY_PORT := 5000 ORGANIZATION := defenseunicorns PLATFORM := linux/amd64 TAG := latest @@ -9,11 +10,10 @@ K3D_CLUSTER_NAME := k3d-core-slim-dev:0.24.0 K3S_TAG := v1.28.8-k3s1 CUDA_TAG := 12.4.1-base-ubuntu22.04 -# Create local Docker registry -local-registry: +local-registry: ## Start up a local container registry. Errors in this target are ignored. @echo "Creating local Docker registry..." 
- -@docker run -d -p ${REGISTRY_PORT}:5000 --name ${REGISTRY_NAME} registry:2 - @echo "Local registry created at localhost:${REGISTRY_PORT}" + -@docker run -d -p ${REG_PORT}:5000 --restart=always --name ${REG_NAME} registry:2 + @echo "Local registry created at localhost:${REG_PORT}" build-k3d-gpu: local-registry @docker build \ @@ -22,13 +22,13 @@ build-k3d-gpu: local-registry --build-arg CUDA_TAG=${CUDA_TAG} \ -t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} \ -f ${MAKEFILE_DIR}/Dockerfile ${MAKEFILE_DIR} - @docker tag ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} + @docker tag ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} localhost:${REG_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} push-k3d-gpu: local-registry build-k3d-gpu - @docker push localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} + @docker push localhost:${REG_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} uds-gpu-cluster: local-registry build-k3d-gpu push-k3d-gpu - uds deploy ${K3D_CLUSTER_NAME} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REGISTRY_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm + uds deploy ${K3D_CLUSTER_NAME} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REG_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm test-k3d-gpu: @uds zarf tools kubectl apply -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml From e7a67d395a4f60a3c47df6dadf17e48b42f56fb9 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 10:39:55 -0400 Subject: [PATCH 08/29] use jq for synchronizing versions across Make, Dockerfile, and GitHub workflow --- .github/workflows/build-images.yaml | 13 +++++++++++-- packages/k3d-gpu/Dockerfile | 6 +++--- packages/k3d-gpu/Makefile | 4 ++-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index 0bdbc107c..8449d6fae 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -20,6 +20,15 @@ jobs: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: Install jq + run: sudo apt-get install jq + + - name: Read tags from config + id: read_tags + run: | + echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/config.json)" >> $GITHUB_OUTPUT + echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/config.json)" >> $GITHUB_OUTPUT + - name: Login to GitHub Container Registry uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 with: @@ -31,8 +40,8 @@ jobs: run: | docker build \ --platform linux/amd64 \ - --build-arg K3S_TAG=v1.28.8-k3s1 \ - --build-arg CUDA_TAG=12.4.1-base-ubuntu22.04 \ + --build-arg K3S_TAG=${{ steps.read_tags.outputs.K3S_TAG }} \ + --build-arg CUDA_TAG=${{ steps.read_tags.outputs.CUDA_TAG }} \ -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \ -f packages/k3d-gpu/Dockerfile . 
docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest diff --git a/packages/k3d-gpu/Dockerfile b/packages/k3d-gpu/Dockerfile index de269aff1..a07850f57 100644 --- a/packages/k3d-gpu/Dockerfile +++ b/packages/k3d-gpu/Dockerfile @@ -1,5 +1,5 @@ -ARG K3S_TAG="v1.28.8-k3s1" -ARG CUDA_TAG="12.4.1-base-ubuntu22.04" +ARG K3S_TAG +ARG CUDA_TAG FROM rancher/k3s:$K3S_TAG AS k3s FROM nvidia/cuda:$CUDA_TAG @@ -17,7 +17,7 @@ COPY --from=k3s / / --exclude=/bin/ COPY --from=k3s /bin /bin # Deploy the nvidia driver plugin on startup -COPY plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml +COPY packages/k3d-gpu/plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml VOLUME /var/lib/kubelet VOLUME /var/lib/rancher/k3s diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index d0513381a..379fad266 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -7,8 +7,8 @@ ORGANIZATION := defenseunicorns PLATFORM := linux/amd64 TAG := latest K3D_CLUSTER_NAME := k3d-core-slim-dev:0.24.0 -K3S_TAG := v1.28.8-k3s1 -CUDA_TAG := 12.4.1-base-ubuntu22.04 +K3S_TAG := $(shell jq -r .k3s_tag ${MAKEFILE_DIR}/config.json) +CUDA_TAG := $(shell jq -r .cuda_tag ${MAKEFILE_DIR}/config.json) local-registry: ## Start up a local container registry. Errors in this target are ignored. @echo "Creating local Docker registry..." From 4d1fe75e515b7a21ac20321a01248cc197635343 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 10:45:16 -0400 Subject: [PATCH 09/29] working on workflow --- .github/workflows/build-images.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index 8449d6fae..a5f8e0f62 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -12,6 +12,10 @@ permissions: contents: read packages: write +env: + K3S_TAG: "" + CUDA_TAG: "" + jobs: build-and-publish-images: runs-on: ubuntu-latest @@ -26,8 +30,8 @@ jobs: - name: Read tags from config id: read_tags run: | - echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/config.json)" >> $GITHUB_OUTPUT - echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/config.json)" >> $GITHUB_OUTPUT + echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV + echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV - name: Login to GitHub Container Registry uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 @@ -40,8 +44,8 @@ jobs: run: | docker build \ --platform linux/amd64 \ - --build-arg K3S_TAG=${{ steps.read_tags.outputs.K3S_TAG }} \ - --build-arg CUDA_TAG=${{ steps.read_tags.outputs.CUDA_TAG }} \ + --build-arg K3S_TAG=${{ env.K3S_TAG }} \ + --build-arg CUDA_TAG=${{ env.CUDA_TAG }} \ -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \ -f packages/k3d-gpu/Dockerfile . 
docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest From 8eb2aa848c73cade514525973f2982bab4ff83d6 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 10:55:10 -0400 Subject: [PATCH 10/29] attempting to figure out why jq can't find config.json --- .github/workflows/build-images.yaml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index a5f8e0f62..1e73a4d1d 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -24,14 +24,11 @@ jobs: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: Install jq - run: sudo apt-get install jq - - name: Read tags from config id: read_tags run: | - echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV - echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV + echo "K3S_TAG=$(jq -r .k3s_tag ./packages/k3d-gpu/config.json)" >> $GITHUB_ENV + echo "CUDA_TAG=$(jq -r .cuda_tag ./packages/k3d-gpu/config.json)" >> $GITHUB_ENV - name: Login to GitHub Container Registry uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 From 0f1a69788020f91b34a533cdf4c82692e9311c8a Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 11:14:49 -0400 Subject: [PATCH 11/29] updating Makefile to be more directory aware --- .github/workflows/build-images.yaml | 4 ++-- packages/k3d-gpu/Makefile | 14 +++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index 1e73a4d1d..8bd403797 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -27,8 +27,8 @@ jobs: - name: Read tags from config id: read_tags run: | - echo "K3S_TAG=$(jq -r .k3s_tag ./packages/k3d-gpu/config.json)" >> $GITHUB_ENV - echo "CUDA_TAG=$(jq -r .cuda_tag ./packages/k3d-gpu/config.json)" >> $GITHUB_ENV + echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV + echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV - name: Login to GitHub Container Registry uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index 379fad266..bb8518eed 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -10,18 +10,26 @@ K3D_CLUSTER_NAME := k3d-core-slim-dev:0.24.0 K3S_TAG := $(shell jq -r .k3s_tag ${MAKEFILE_DIR}/config.json) CUDA_TAG := $(shell jq -r .cuda_tag ${MAKEFILE_DIR}/config.json) +echo-config: + @echo "DIR: ${MAKEFILE_DIR}" + @echo "K3S_TAG: ${K3S_TAG}" + @echo "CUDA_TAG: ${CUDA_TAG}" + local-registry: ## Start up a local container registry. Errors in this target are ignored. @echo "Creating local Docker registry..." -@docker run -d -p ${REG_PORT}:5000 --restart=always --name ${REG_NAME} registry:2 @echo "Local registry created at localhost:${REG_PORT}" -build-k3d-gpu: local-registry - @docker build \ +build-k3d-gpu: local-registry echo-config ## Build the k3d-gpu-support image +# Change to the root of the repository and build the image + @cd ${MAKEFILE_DIR}/../.. && \ + docker build \ --platform=${PLATFORM} \ --build-arg K3S_TAG=${K3S_TAG} \ --build-arg CUDA_TAG=${CUDA_TAG} \ -t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} \ - -f ${MAKEFILE_DIR}/Dockerfile ${MAKEFILE_DIR} + -f ${MAKEFILE_DIR}/Dockerfile . 
+# Tag the image for the local registry @docker tag ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} localhost:${REG_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} push-k3d-gpu: local-registry build-k3d-gpu From bdcc4d04dcd0207aa3c60e68ba860f472be6fcb0 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 11:23:45 -0400 Subject: [PATCH 12/29] WIP: trying to figure out jq targeting in GitHub workflow --- .github/workflows/build-images.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index 8bd403797..249a023b8 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -30,6 +30,20 @@ jobs: echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV + - name: Display directory structure and find config.json + run: | + echo "Directory structure:" + tree ${{ github.workspace }} + + echo -e "\nSearching for config.json:" + CONFIG_PATH=$(find ${{ github.workspace }} -name config.json) + if [ -z "$CONFIG_PATH" ]; then + echo "Error: config.json not found" + exit 1 + fi + echo "Config file found at: $CONFIG_PATH" + echo "CONFIG_PATH=$CONFIG_PATH" >> $GITHUB_ENV + - name: Login to GitHub Container Registry uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 with: From 128d47dc3f103df8e86deedc3becb2459c1af04b Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 11:25:16 -0400 Subject: [PATCH 13/29] WIP: trying to figure out jq targeting in GitHub workflow --- .github/workflows/build-images.yaml | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index 249a023b8..aeab944fc 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -24,25 +24,27 @@ jobs: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: Display directory structure and find config.json + run: | + echo "Directory structure:" + tree ${{ github.workspace }} + + echo -e "\nSearching for config.json:" + CONFIG_PATH=$(find ${{ github.workspace }} -name config.json) + if [ -z "$CONFIG_PATH" ]; then + echo "Error: config.json not found" + exit 1 + fi + echo "Config file found at: $CONFIG_PATH" + echo "CONFIG_PATH=$CONFIG_PATH" >> $GITHUB_ENV + - name: Read tags from config id: read_tags run: | echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV - - name: Display directory structure and find config.json - run: | - echo "Directory structure:" - tree ${{ github.workspace }} - - echo -e "\nSearching for config.json:" - CONFIG_PATH=$(find ${{ github.workspace }} -name config.json) - if [ -z "$CONFIG_PATH" ]; then - echo "Error: config.json not found" - exit 1 - fi - echo "Config file found at: $CONFIG_PATH" - echo "CONFIG_PATH=$CONFIG_PATH" >> $GITHUB_ENV + - name: Login to GitHub Container Registry uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 From 4eb7cb472c1957919afc3d4de39c29747e89df8a Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 11:35:54 -0400 Subject: [PATCH 14/29] figured out jq target in pipeline... maybe? 
--- .github/workflows/build-images.yaml | 20 ++------------------ packages/k3d-gpu/Makefile | 6 +++--- packages/k3d-gpu/version_config.json | 4 ++++ 3 files changed, 9 insertions(+), 21 deletions(-) create mode 100644 packages/k3d-gpu/version_config.json diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index aeab944fc..e49673b33 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -24,27 +24,11 @@ jobs: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: Display directory structure and find config.json - run: | - echo "Directory structure:" - tree ${{ github.workspace }} - - echo -e "\nSearching for config.json:" - CONFIG_PATH=$(find ${{ github.workspace }} -name config.json) - if [ -z "$CONFIG_PATH" ]; then - echo "Error: config.json not found" - exit 1 - fi - echo "Config file found at: $CONFIG_PATH" - echo "CONFIG_PATH=$CONFIG_PATH" >> $GITHUB_ENV - - name: Read tags from config id: read_tags run: | - echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV - echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/config.json)" >> $GITHUB_ENV - - + echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/version_config.json)" >> $GITHUB_ENV + echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/version_config.json)" >> $GITHUB_ENV - name: Login to GitHub Container Registry uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index bb8518eed..e13e1df21 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -7,8 +7,8 @@ ORGANIZATION := defenseunicorns PLATFORM := linux/amd64 TAG := latest K3D_CLUSTER_NAME := k3d-core-slim-dev:0.24.0 -K3S_TAG := $(shell jq -r .k3s_tag ${MAKEFILE_DIR}/config.json) -CUDA_TAG := $(shell jq -r .cuda_tag ${MAKEFILE_DIR}/config.json) +K3S_TAG := $(shell jq -r .k3s_tag ${MAKEFILE_DIR}/version_config.json) +CUDA_TAG := $(shell jq -r .cuda_tag ${MAKEFILE_DIR}/version_config.json) echo-config: @echo "DIR: ${MAKEFILE_DIR}" @@ -22,7 +22,7 @@ local-registry: ## Start up a local container registry. Errors in this target ar build-k3d-gpu: local-registry echo-config ## Build the k3d-gpu-support image # Change to the root of the repository and build the image - @cd ${MAKEFILE_DIR}/../.. && \ + @cd ${MAKEFILE_DIR}/../.. && \ docker build \ --platform=${PLATFORM} \ --build-arg K3S_TAG=${K3S_TAG} \ diff --git a/packages/k3d-gpu/version_config.json b/packages/k3d-gpu/version_config.json new file mode 100644 index 000000000..0d2934299 --- /dev/null +++ b/packages/k3d-gpu/version_config.json @@ -0,0 +1,4 @@ +{ + "k3s_tag": "v1.28.8-k3s1", + "cuda_tag": "12.4.1-base-ubuntu22.04" + } From 1d9fb14e563f9488ca49768046990b8b7d5fab6c Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 18:40:39 -0400 Subject: [PATCH 15/29] wip: zarf package --- Makefile | 2 +- packages/k3d-gpu/Dockerfile | 2 +- packages/k3d-gpu/Makefile | 4 +- packages/k3d-gpu/zarf-config.yaml | 7 +++ packages/k3d-gpu/zarf.yaml | 91 +++++++++++++++++++++++++++++++ 5 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 packages/k3d-gpu/zarf-config.yaml create mode 100644 packages/k3d-gpu/zarf.yaml diff --git a/Makefile b/Makefile index 5ae01e870..bff055321 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ local-registry: ## Start up a local container registry. 
Errors in this target ar # Clean up: Stop and remove the local registry clean-registry: @echo "Cleaning up..." - @docker stop registry + @docker stop ${REG_NAME} @docker rm ${REG_NAME} sdk-wheel: ## build wheels for the leapfrogai_sdk package as a dependency for other lfai components diff --git a/packages/k3d-gpu/Dockerfile b/packages/k3d-gpu/Dockerfile index a07850f57..3e3ad7c7e 100644 --- a/packages/k3d-gpu/Dockerfile +++ b/packages/k3d-gpu/Dockerfile @@ -17,7 +17,7 @@ COPY --from=k3s / / --exclude=/bin/ COPY --from=k3s /bin /bin # Deploy the nvidia driver plugin on startup -COPY packages/k3d-gpu/plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml +COPY ./plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml VOLUME /var/lib/kubelet VOLUME /var/lib/rancher/k3s diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index e13e1df21..006521fe7 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -6,7 +6,7 @@ MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) ORGANIZATION := defenseunicorns PLATFORM := linux/amd64 TAG := latest -K3D_CLUSTER_NAME := k3d-core-slim-dev:0.24.0 +UDS_CORE := k3d-core-slim-dev:0.24.0 K3S_TAG := $(shell jq -r .k3s_tag ${MAKEFILE_DIR}/version_config.json) CUDA_TAG := $(shell jq -r .cuda_tag ${MAKEFILE_DIR}/version_config.json) @@ -36,7 +36,7 @@ push-k3d-gpu: local-registry build-k3d-gpu @docker push localhost:${REG_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} uds-gpu-cluster: local-registry build-k3d-gpu push-k3d-gpu - uds deploy ${K3D_CLUSTER_NAME} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REG_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm + uds deploy ${UDS_CORE} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REG_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm test-k3d-gpu: @uds zarf tools kubectl apply -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml diff --git a/packages/k3d-gpu/zarf-config.yaml b/packages/k3d-gpu/zarf-config.yaml new file mode 100644 index 000000000..23da3d424 --- /dev/null +++ b/packages/k3d-gpu/zarf-config.yaml @@ -0,0 +1,7 @@ +package: + create: + set: + # x-release-please-start-version + version: 0.9.1 + # x-release-please-end + reg_name: registry diff --git a/packages/k3d-gpu/zarf.yaml b/packages/k3d-gpu/zarf.yaml new file mode 100644 index 000000000..9e41ac16b --- /dev/null +++ b/packages/k3d-gpu/zarf.yaml @@ -0,0 +1,91 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/zarf/main/zarf.schema.json + +kind: ZarfPackageConfig +metadata: + name: "k3d-gpu" + version: '###ZARF_PKG_TMPL_VERSION###' + description: > + k3d base image with GPU support + +variables: + - name: REG_PORT + description: "Local registry port" + default: "5000" + - name: UDS_CORE + description: "UDS Core version to use" + default: "k3d-core-slim-dev:0.24.0" + - name: K3S_TAG + description: "K3s version to use" + default: "v1.28.8-k3s1" + - name: CUDA_TAG + description: "CUDA version to use" + default: "12.4.1-base-ubuntu22.04" + +components: + - name: create-local-registry + required: true + actions: + onDeploy: + before: + - cmd: | + set +e + docker run -d --name ###ZARF_PKG_TMPL_REG_NAME### -p ${ZARF_VAR_REG_PORT}:5000 registry:2 + set -e + description: "Start the local registry" + mute: true + onRemove: + before: + - cmd: | + set +e + docker stop ###ZARF_PKG_TMPL_REG_NAME### + docker rm ###ZARF_PKG_TMPL_REG_NAME### + set -e + description: "Stop and 
remove the local registry" + mute: true + + - name: build-image + required: true + files: + - source: Dockerfile + target: Dockerfile + - source: plugin/device-plugin-daemonset.yaml + target: plugin/device-plugin-daemonset.yaml + actions: + onDeploy: + before: + - cmd: | + docker build \ + --platform linux/amd64 \ + --build-arg K3S_TAG=${ZARF_VAR_K3S_TAG} \ + --build-arg CUDA_TAG=${ZARF_VAR_CUDA_TAG} \ + -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \ + -f ./Dockerfile . + + docker tag \ + ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \ + localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### + description: "Build the k3d-gpu image" + # mute: true + after: + - cmd: + docker push localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### + description: "Push the image to the local registry" + mute: true + + - name: create-cluster + required: true + actions: + onDeploy: + before: + - cmd: | + uds deploy ${ZARF_VAR_UDS_CORE} \ + --set K3D_EXTRA_ARGS="--gpus=all \ + --image=localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###" \ + --no-progress --insecure --confirm + description: "Create a k3d cluster with GPU support" + onRemove: + before: + - cmd: | + k3d cluster delete uds + description: "Delete the k3d cluster" + mute: true From 29bd9e517812cb580523ae2670f02ffd1eca86ab Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 18:51:12 -0400 Subject: [PATCH 16/29] adding a the test --- packages/k3d-gpu/zarf.yaml | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/packages/k3d-gpu/zarf.yaml b/packages/k3d-gpu/zarf.yaml index 9e41ac16b..05c52cb43 100644 --- a/packages/k3d-gpu/zarf.yaml +++ b/packages/k3d-gpu/zarf.yaml @@ -32,7 +32,6 @@ components: docker run -d --name ###ZARF_PKG_TMPL_REG_NAME### -p ${ZARF_VAR_REG_PORT}:5000 registry:2 set -e description: "Start the local registry" - mute: true onRemove: before: - cmd: | @@ -41,7 +40,6 @@ components: docker rm ###ZARF_PKG_TMPL_REG_NAME### set -e description: "Stop and remove the local registry" - mute: true - name: build-image required: true @@ -65,12 +63,10 @@ components: ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \ localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### description: "Build the k3d-gpu image" - # mute: true after: - cmd: docker push localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### description: "Push the image to the local registry" - mute: true - name: create-cluster required: true @@ -88,4 +84,21 @@ components: - cmd: | k3d cluster delete uds description: "Delete the k3d cluster" - mute: true + + - name: test-cluster + required: true + files: + - source: test/cuda-vector-add.yaml + target: test/cuda-vector-add.yaml + actions: + onDeploy: + before: + - cmd: | + uds zarf tools kubectl apply -f ./test/cuda-vector-add.yaml + uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod + uds zarf tools kubectl logs -l app=gpu-pod + description: "Run the test pod" + after: + - cmd: | + uds zarf tools kubectl delete -f ./test/cuda-vector-add.yaml + description: "Delete the test pod" From ad83b0b106e3c4db7a56c82449c23423877e60a3 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Fri, 19 Jul 2024 19:06:02 -0400 Subject: [PATCH 17/29] cleanup old stuff now that Zarf 
package is done --- packages/k3d-gpu/Makefile | 50 ++++------------------------ packages/k3d-gpu/version_config.json | 4 --- 2 files changed, 7 insertions(+), 47 deletions(-) delete mode 100644 packages/k3d-gpu/version_config.json diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index 006521fe7..6bbcbbf14 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -1,47 +1,11 @@ - -REG_PORT ?= 5000 -REG_NAME ?= registry - MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -ORGANIZATION := defenseunicorns -PLATFORM := linux/amd64 -TAG := latest -UDS_CORE := k3d-core-slim-dev:0.24.0 -K3S_TAG := $(shell jq -r .k3s_tag ${MAKEFILE_DIR}/version_config.json) -CUDA_TAG := $(shell jq -r .cuda_tag ${MAKEFILE_DIR}/version_config.json) - -echo-config: - @echo "DIR: ${MAKEFILE_DIR}" - @echo "K3S_TAG: ${K3S_TAG}" - @echo "CUDA_TAG: ${CUDA_TAG}" - -local-registry: ## Start up a local container registry. Errors in this target are ignored. - @echo "Creating local Docker registry..." - -@docker run -d -p ${REG_PORT}:5000 --restart=always --name ${REG_NAME} registry:2 - @echo "Local registry created at localhost:${REG_PORT}" - -build-k3d-gpu: local-registry echo-config ## Build the k3d-gpu-support image -# Change to the root of the repository and build the image - @cd ${MAKEFILE_DIR}/../.. && \ - docker build \ - --platform=${PLATFORM} \ - --build-arg K3S_TAG=${K3S_TAG} \ - --build-arg CUDA_TAG=${CUDA_TAG} \ - -t ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} \ - -f ${MAKEFILE_DIR}/Dockerfile . -# Tag the image for the local registry - @docker tag ghcr.io/${ORGANIZATION}/k3d-gpu-support:${TAG} localhost:${REG_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} - -push-k3d-gpu: local-registry build-k3d-gpu - @docker push localhost:${REG_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG} -uds-gpu-cluster: local-registry build-k3d-gpu push-k3d-gpu - uds deploy ${UDS_CORE} --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${REG_PORT}/${ORGANIZATION}/k3d-gpu-support:${TAG}" --confirm +k3d-gpu-package: + @cd ${MAKEFILE_DIR} && \ + uds zarf package create -test-k3d-gpu: - @uds zarf tools kubectl apply -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml - @uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod - @uds zarf tools kubectl logs -l app=gpu-pod - @uds zarf tools kubectl delete -f ${MAKEFILE_DIR}/test/cuda-vector-add.yaml +create-uds-gpu-cluster: + @cd ${MAKEFILE_DIR} && \ + uds zarf package deploy zarf-package-k3d-gpu-amd64-*.tar.zst --confirm -.PHONY: all local-registry build-k3d-gpu push-k3d-gpu test-k3d-gpu +.PHONY: k3d-gpu-package create-uds-gpu-cluster diff --git a/packages/k3d-gpu/version_config.json b/packages/k3d-gpu/version_config.json deleted file mode 100644 index 0d2934299..000000000 --- a/packages/k3d-gpu/version_config.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "k3s_tag": "v1.28.8-k3s1", - "cuda_tag": "12.4.1-base-ubuntu22.04" - } From 3550eda243a6703111df3f12d54541c07e42f71d Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 12:19:59 -0400 Subject: [PATCH 18/29] working on bundle/package/image --- .github/workflows/build-images.yaml | 29 ++++++------ packages/k3d-gpu/Dockerfile | 4 +- packages/k3d-gpu/Makefile | 2 +- packages/k3d-gpu/zarf-config.yaml | 4 +- packages/k3d-gpu/zarf.yaml | 69 ++++------------------------- 5 files changed, 30 insertions(+), 78 deletions(-) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index e49673b33..d68129657 100644 --- 
a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -12,10 +12,6 @@ permissions: contents: read packages: write -env: - K3S_TAG: "" - CUDA_TAG: "" - jobs: build-and-publish-images: runs-on: ubuntu-latest @@ -24,11 +20,9 @@ jobs: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: Read tags from config - id: read_tags - run: | - echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/version_config.json)" >> $GITHUB_ENV - echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/version_config.json)" >> $GITHUB_ENV + - name: Version + id: get_version + uses: battila7/get-version-action@90eb8fc70f6dfcf3f9b95ed8f164d2c05038e729 # v2.2.1 - name: Login to GitHub Container Registry uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 @@ -37,12 +31,19 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Build and Publish k3d-gpu + - name: Install Zarf + uses: defenseunicorns/setup-zarf@f95763914e20e493bb5d45d63e30e17138f981d6 # v1.0.0 + + - name: Build and Publish k3d-gpu image run: | docker build \ --platform linux/amd64 \ - --build-arg K3S_TAG=${{ env.K3S_TAG }} \ - --build-arg CUDA_TAG=${{ env.CUDA_TAG }} \ - -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \ + -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} \ -f packages/k3d-gpu/Dockerfile . - docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest + docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} + + - name: Build and Publish k3d-gpu package + run: | + zarf package create packages/k3d-gpu --set=VERSION=${{ steps.get_version.outputs.version-without-v }} --confirm + zarf package publish zarf-package-leapfrogai-k3d-gpu-amd64-${{ steps.get_version.outputs.version-without-v }}.tar.zst \ + oci://ghcr.io/defenseunicorns/packages/leapfrogai diff --git a/packages/k3d-gpu/Dockerfile b/packages/k3d-gpu/Dockerfile index 3e3ad7c7e..86ff5afc0 100644 --- a/packages/k3d-gpu/Dockerfile +++ b/packages/k3d-gpu/Dockerfile @@ -1,5 +1,5 @@ -ARG K3S_TAG -ARG CUDA_TAG +ARG K3S_TAG=v1.28.8-k3s1 +ARG CUDA_TAG=12.4.1-base-ubuntu22.04 FROM rancher/k3s:$K3S_TAG AS k3s FROM nvidia/cuda:$CUDA_TAG diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index 6bbcbbf14..31b8c3780 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -1,6 +1,6 @@ MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -k3d-gpu-package: +build-k3d-gpu: @cd ${MAKEFILE_DIR} && \ uds zarf package create diff --git a/packages/k3d-gpu/zarf-config.yaml b/packages/k3d-gpu/zarf-config.yaml index 23da3d424..b1d4c4023 100644 --- a/packages/k3d-gpu/zarf-config.yaml +++ b/packages/k3d-gpu/zarf-config.yaml @@ -4,4 +4,6 @@ package: # x-release-please-start-version version: 0.9.1 # x-release-please-end - reg_name: registry + uds_flavor: k3d-core-slim-dev + uds_version: 0.24.0 + arch: amd64 diff --git a/packages/k3d-gpu/zarf.yaml b/packages/k3d-gpu/zarf.yaml index 05c52cb43..9d9671778 100644 --- a/packages/k3d-gpu/zarf.yaml +++ b/packages/k3d-gpu/zarf.yaml @@ -7,77 +7,26 @@ metadata: description: > k3d base image with GPU support -variables: - - name: REG_PORT - description: "Local registry port" - default: "5000" - - name: UDS_CORE - description: "UDS Core version to use" - default: "k3d-core-slim-dev:0.24.0" - - name: K3S_TAG - description: "K3s version to use" - default: "v1.28.8-k3s1" - - name: CUDA_TAG - description: "CUDA 
version to use" - default: "12.4.1-base-ubuntu22.04" - components: - - name: create-local-registry + - name: create-uds-cluster required: true actions: - onDeploy: - before: - - cmd: | - set +e - docker run -d --name ###ZARF_PKG_TMPL_REG_NAME### -p ${ZARF_VAR_REG_PORT}:5000 registry:2 - set -e - description: "Start the local registry" - onRemove: + onCreate: before: - cmd: | - set +e - docker stop ###ZARF_PKG_TMPL_REG_NAME### - docker rm ###ZARF_PKG_TMPL_REG_NAME### - set -e - description: "Stop and remove the local registry" + uds pull ghcr.io/defenseunicorns/packages/uds/bundles/###ZARF_PKG_TMPL_UDS_FLAVOR###:###ZARF_PKG_TMPL_UDS_VERSION### -a ###ZARF_PKG_TMPL_ARCH### + description: "Pull the UDS K3D slim dev bundle" - - name: build-image - required: true - files: - - source: Dockerfile - target: Dockerfile - - source: plugin/device-plugin-daemonset.yaml - target: plugin/device-plugin-daemonset.yaml - actions: - onDeploy: - before: - cmd: | - docker build \ - --platform linux/amd64 \ - --build-arg K3S_TAG=${ZARF_VAR_K3S_TAG} \ - --build-arg CUDA_TAG=${ZARF_VAR_CUDA_TAG} \ - -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \ - -f ./Dockerfile . - - docker tag \ - ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \ - localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### - description: "Build the k3d-gpu image" - after: - - cmd: - docker push localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### - description: "Push the image to the local registry" - - - name: create-cluster - required: true - actions: + docker pull ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### + description: "Pull the k3d-gpu image" onDeploy: before: - cmd: | - uds deploy ${ZARF_VAR_UDS_CORE} \ + uds deploy uds-bundle-###ZARF_PKG_TMPL_UDS_FLAVOR###-###ZARF_PKG_TMPL_ARCH###-###ZARF_PKG_TMPL_UDS_VERSION###.tar.zst \ --set K3D_EXTRA_ARGS="--gpus=all \ - --image=localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###" \ - --no-progress --insecure --confirm + --image=ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###" \ + --no-progress --confirm description: "Create a k3d cluster with GPU support" onRemove: before: From aa06af38a801cbc74a76642b090028afb63ed899 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 12:28:36 -0400 Subject: [PATCH 19/29] update zarf version in pipeline --- .github/workflows/build-images.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index d68129657..8fdb739a0 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -32,7 +32,7 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Install Zarf - uses: defenseunicorns/setup-zarf@f95763914e20e493bb5d45d63e30e17138f981d6 # v1.0.0 + uses: defenseunicorns/setup-zarf@10e539efed02f75ec39eb8823e22a5c795f492ae #v1.0.1 - name: Build and Publish k3d-gpu image run: | From f6b7a0f0bf968bd5a086750ed1505531a9fbbfd5 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 12:31:15 -0400 Subject: [PATCH 20/29] something wrong with docker copy --- packages/k3d-gpu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/k3d-gpu/Dockerfile b/packages/k3d-gpu/Dockerfile index 86ff5afc0..e2d14614a 100644 --- a/packages/k3d-gpu/Dockerfile +++ b/packages/k3d-gpu/Dockerfile @@ 
-17,7 +17,7 @@ COPY --from=k3s / / --exclude=/bin/ COPY --from=k3s /bin /bin # Deploy the nvidia driver plugin on startup -COPY ./plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml +COPY plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml VOLUME /var/lib/kubelet VOLUME /var/lib/rancher/k3s From 4225a00d219e2f42c54281a3c3aa3fc17a7b2cd5 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 12:35:39 -0400 Subject: [PATCH 21/29] update relative path --- packages/k3d-gpu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/k3d-gpu/Dockerfile b/packages/k3d-gpu/Dockerfile index e2d14614a..f82d20f5e 100644 --- a/packages/k3d-gpu/Dockerfile +++ b/packages/k3d-gpu/Dockerfile @@ -17,7 +17,7 @@ COPY --from=k3s / / --exclude=/bin/ COPY --from=k3s /bin /bin # Deploy the nvidia driver plugin on startup -COPY plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml +COPY packages/k3d-gpu/plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml VOLUME /var/lib/kubelet VOLUME /var/lib/rancher/k3s From 3ceec59f2791da1d8fd1b8baec4e6893e1ffe8dc Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 12:40:08 -0400 Subject: [PATCH 22/29] working out relative paths --- .github/workflows/build-images.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index 8fdb739a0..27507ef83 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -44,6 +44,7 @@ jobs: - name: Build and Publish k3d-gpu package run: | - zarf package create packages/k3d-gpu --set=VERSION=${{ steps.get_version.outputs.version-without-v }} --confirm + cd packages/k3d-gpu + zarf package create --set=VERSION=${{ steps.get_version.outputs.version-without-v }} --confirm zarf package publish zarf-package-leapfrogai-k3d-gpu-amd64-${{ steps.get_version.outputs.version-without-v }}.tar.zst \ oci://ghcr.io/defenseunicorns/packages/leapfrogai From 8d427e8e24ee7c3cedb3d11e8381b13a060dc167 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 12:51:18 -0400 Subject: [PATCH 23/29] add uds cli to workflow --- .github/workflows/build-images.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index 27507ef83..afee54b79 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -45,6 +45,9 @@ jobs: - name: Build and Publish k3d-gpu package run: | cd packages/k3d-gpu + wget https://github.com/defenseunicorns/uds-cli/releases/download/v0.13.1/uds-cli_v0.13.1_Linux_amd64 + sudo chmod +x uds-cli_v0.13.1_Linux_amd64 + sudo mv uds-cli_v0.13.1_Linux_amd64 /usr/local/bin/uds zarf package create --set=VERSION=${{ steps.get_version.outputs.version-without-v }} --confirm zarf package publish zarf-package-leapfrogai-k3d-gpu-amd64-${{ steps.get_version.outputs.version-without-v }}.tar.zst \ oci://ghcr.io/defenseunicorns/packages/leapfrogai From 61bf05ec165bb191e4ad89f744e84920cbeeae1a Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 12:53:54 -0400 Subject: [PATCH 24/29] fix a typo --- .github/workflows/build-images.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index afee54b79..205f66073 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -49,5 +49,5 @@ jobs: sudo chmod +x uds-cli_v0.13.1_Linux_amd64 sudo mv uds-cli_v0.13.1_Linux_amd64 /usr/local/bin/uds zarf package create --set=VERSION=${{ steps.get_version.outputs.version-without-v }} --confirm - zarf package publish zarf-package-leapfrogai-k3d-gpu-amd64-${{ steps.get_version.outputs.version-without-v }}.tar.zst \ + zarf package publish zarf-package-k3d-gpu-amd64-${{ steps.get_version.outputs.version-without-v }}.tar.zst \ oci://ghcr.io/defenseunicorns/packages/leapfrogai From a65ddbfa037dcb657b37236ead43041d5829555b Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 14:09:28 -0400 Subject: [PATCH 25/29] switching from zarf back to make --- .github/workflows/build-images.yaml | 14 ++------ packages/k3d-gpu/Dockerfile | 2 +- packages/k3d-gpu/Makefile | 19 +++++++++-- packages/k3d-gpu/zarf-config.yaml | 9 ----- packages/k3d-gpu/zarf.yaml | 53 ----------------------------- 5 files changed, 20 insertions(+), 77 deletions(-) delete mode 100644 packages/k3d-gpu/zarf-config.yaml delete mode 100644 packages/k3d-gpu/zarf.yaml diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index 205f66073..8902a4a29 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -36,18 +36,8 @@ jobs: - name: Build and Publish k3d-gpu image run: | + cd packages/k3d-gpu docker build \ --platform linux/amd64 \ - -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} \ - -f packages/k3d-gpu/Dockerfile . + -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} . 
docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} - - - name: Build and Publish k3d-gpu package - run: | - cd packages/k3d-gpu - wget https://github.com/defenseunicorns/uds-cli/releases/download/v0.13.1/uds-cli_v0.13.1_Linux_amd64 - sudo chmod +x uds-cli_v0.13.1_Linux_amd64 - sudo mv uds-cli_v0.13.1_Linux_amd64 /usr/local/bin/uds - zarf package create --set=VERSION=${{ steps.get_version.outputs.version-without-v }} --confirm - zarf package publish zarf-package-k3d-gpu-amd64-${{ steps.get_version.outputs.version-without-v }}.tar.zst \ - oci://ghcr.io/defenseunicorns/packages/leapfrogai diff --git a/packages/k3d-gpu/Dockerfile b/packages/k3d-gpu/Dockerfile index f82d20f5e..e2d14614a 100644 --- a/packages/k3d-gpu/Dockerfile +++ b/packages/k3d-gpu/Dockerfile @@ -17,7 +17,7 @@ COPY --from=k3s / / --exclude=/bin/ COPY --from=k3s /bin /bin # Deploy the nvidia driver plugin on startup -COPY packages/k3d-gpu/plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml +COPY plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml VOLUME /var/lib/kubelet VOLUME /var/lib/rancher/k3s diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index 31b8c3780..f121cb03b 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -1,11 +1,26 @@ MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +UDS_VERSION := 0.24.1 +LOCAL_VERSION ?= $(shell git rev-parse --short HEAD) + + build-k3d-gpu: @cd ${MAKEFILE_DIR} && \ - uds zarf package create + docker build \ + --platform linux/amd64 \ + -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${LOCAL_VERSION} . create-uds-gpu-cluster: + @uds deploy k3d-core-slim-dev:${UDS_VERSION} \ + --set K3D_EXTRA_ARGS="--gpus=all \ + --image=ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${LOCAL_VERSION}" --confirm + +test-uds-gpu-cluster: + @cd ${MAKEFILE_DIR} && \ + uds zarf tools kubectl apply -f ./test/cuda-vector-add.yaml + @uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod + @uds zarf tools kubectl logs -l app=gpu-pod @cd ${MAKEFILE_DIR} && \ - uds zarf package deploy zarf-package-k3d-gpu-amd64-*.tar.zst --confirm + uds zarf tools kubectl delete -f ./test/cuda-vector-add.yaml .PHONY: k3d-gpu-package create-uds-gpu-cluster diff --git a/packages/k3d-gpu/zarf-config.yaml b/packages/k3d-gpu/zarf-config.yaml deleted file mode 100644 index b1d4c4023..000000000 --- a/packages/k3d-gpu/zarf-config.yaml +++ /dev/null @@ -1,9 +0,0 @@ -package: - create: - set: - # x-release-please-start-version - version: 0.9.1 - # x-release-please-end - uds_flavor: k3d-core-slim-dev - uds_version: 0.24.0 - arch: amd64 diff --git a/packages/k3d-gpu/zarf.yaml b/packages/k3d-gpu/zarf.yaml deleted file mode 100644 index 9d9671778..000000000 --- a/packages/k3d-gpu/zarf.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/zarf/main/zarf.schema.json - -kind: ZarfPackageConfig -metadata: - name: "k3d-gpu" - version: '###ZARF_PKG_TMPL_VERSION###' - description: > - k3d base image with GPU support - -components: - - name: create-uds-cluster - required: true - actions: - onCreate: - before: - - cmd: | - uds pull ghcr.io/defenseunicorns/packages/uds/bundles/###ZARF_PKG_TMPL_UDS_FLAVOR###:###ZARF_PKG_TMPL_UDS_VERSION### -a ###ZARF_PKG_TMPL_ARCH### - description: "Pull the UDS K3D slim dev bundle" - - - cmd: | - 
docker pull ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### - description: "Pull the k3d-gpu image" - onDeploy: - before: - - cmd: | - uds deploy uds-bundle-###ZARF_PKG_TMPL_UDS_FLAVOR###-###ZARF_PKG_TMPL_ARCH###-###ZARF_PKG_TMPL_UDS_VERSION###.tar.zst \ - --set K3D_EXTRA_ARGS="--gpus=all \ - --image=ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###" \ - --no-progress --confirm - description: "Create a k3d cluster with GPU support" - onRemove: - before: - - cmd: | - k3d cluster delete uds - description: "Delete the k3d cluster" - - - name: test-cluster - required: true - files: - - source: test/cuda-vector-add.yaml - target: test/cuda-vector-add.yaml - actions: - onDeploy: - before: - - cmd: | - uds zarf tools kubectl apply -f ./test/cuda-vector-add.yaml - uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod - uds zarf tools kubectl logs -l app=gpu-pod - description: "Run the test pod" - after: - - cmd: | - uds zarf tools kubectl delete -f ./test/cuda-vector-add.yaml - description: "Delete the test pod" From d171516b4ad5c5bb8fb1d34cd964c10bce2904d7 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 14:23:01 -0400 Subject: [PATCH 26/29] make depends --- packages/k3d-gpu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index f121cb03b..cd5ccc7f3 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -10,7 +10,7 @@ build-k3d-gpu: --platform linux/amd64 \ -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${LOCAL_VERSION} . -create-uds-gpu-cluster: +create-uds-gpu-cluster: build-k3d-gpu @uds deploy k3d-core-slim-dev:${UDS_VERSION} \ --set K3D_EXTRA_ARGS="--gpus=all \ --image=ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${LOCAL_VERSION}" --confirm From 996f5da580d8812e37cbe79b673c2e574618fa7a Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 14:25:22 -0400 Subject: [PATCH 27/29] update workflow to only push on tag --- .github/workflows/build-images.yaml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index 8902a4a29..bfc11040a 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -1,12 +1,9 @@ name: Build Images on: - workflow_dispatch: - pull_request: - types: - - ready_for_review - - review_requested - - synchronize + push: + tags: + - "*" permissions: contents: read From 631b4793e8dd54571708746f1d60c87fe55ea311 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Mon, 22 Jul 2024 14:46:01 -0400 Subject: [PATCH 28/29] fix a couple name updates --- packages/k3d-gpu/Makefile | 2 +- packages/k3d-gpu/README.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile index cd5ccc7f3..7dfc7e5e9 100644 --- a/packages/k3d-gpu/Makefile +++ b/packages/k3d-gpu/Makefile @@ -23,4 +23,4 @@ test-uds-gpu-cluster: @cd ${MAKEFILE_DIR} && \ uds zarf tools kubectl delete -f ./test/cuda-vector-add.yaml -.PHONY: k3d-gpu-package create-uds-gpu-cluster +.PHONY: build-k3d-gpu create-uds-gpu-cluster test-uds-gpu-cluster diff --git a/packages/k3d-gpu/README.md b/packages/k3d-gpu/README.md index f35ac09d1..dbe1e534a 100644 --- a/packages/k3d-gpu/README.md +++ b/packages/k3d-gpu/README.md @@ -16,11 +16,11 @@ Check out the Make targets for the various options. 
### Local ```shell -make push-k3d-gpu # build and push image to a local registry +make build-k3d-gpu # build the image -make uds-gpu-cluster # create a uds cluster equipped with the k3d-gpu image +make create-uds-gpu-cluster # create a uds cluster equipped with the k3d-gpu image -make test-k3d-gpu # deploy a test gpu pod to see if everything is working +make test-uds-gpu-cluster # deploy a test gpu pod to see if everything is working ``` ## References From 9b3718cf6239aa0ea652c7b899c79487c2051d1c Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Tue, 23 Jul 2024 12:22:00 -0400 Subject: [PATCH 29/29] move image publish to release pipeline --- .github/workflows/build-images.yaml | 40 ----------------------------- .github/workflows/release.yaml | 9 +++++++ 2 files changed, 9 insertions(+), 40 deletions(-) delete mode 100644 .github/workflows/build-images.yaml diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml deleted file mode 100644 index bfc11040a..000000000 --- a/.github/workflows/build-images.yaml +++ /dev/null @@ -1,40 +0,0 @@ -name: Build Images - -on: - push: - tags: - - "*" - -permissions: - contents: read - packages: write - -jobs: - build-and-publish-images: - runs-on: ubuntu-latest - - steps: - - name: Checkout Repo - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Version - id: get_version - uses: battila7/get-version-action@90eb8fc70f6dfcf3f9b95ed8f164d2c05038e729 # v2.2.1 - - - name: Login to GitHub Container Registry - uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Install Zarf - uses: defenseunicorns/setup-zarf@10e539efed02f75ec39eb8823e22a5c795f492ae #v1.0.1 - - - name: Build and Publish k3d-gpu image - run: | - cd packages/k3d-gpu - docker build \ - --platform linux/amd64 \ - -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} . - docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 4f39ac676..0f891a599 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -40,6 +40,15 @@ jobs: with: python-version-file: 'pyproject.toml' + - name: Build and Publish k3d-gpu image + run: | + cd packages/k3d-gpu + docker build \ + --platform linux/amd64 \ + -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} . + docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} + cd ../.. + - name: Download Python Wheels and Publish Builder Image run: | docker buildx build --platform amd64,arm64 -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${{ steps.get_version.outputs.version-without-v }} --push -f src/leapfrogai_sdk/Dockerfile .
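As a quick usage sketch consistent with the README and Makefile changes in the patches above (an assumption, not part of the patch series itself: it presumes Docker, the `uds` CLI, and NVIDIA drivers plus the NVIDIA container toolkit are already installed on the host):

```shell
cd packages/k3d-gpu

# create-uds-gpu-cluster depends on build-k3d-gpu, so this first builds the
# ghcr.io/defenseunicorns/leapfrogai/k3d-gpu image tagged with the short git SHA,
# then deploys k3d-core-slim-dev:${UDS_VERSION} with
# K3D_EXTRA_ARGS="--gpus=all --image=ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:<sha>"
make create-uds-gpu-cluster

# applies test/cuda-vector-add.yaml, waits up to 15s for the app=gpu-pod pod to
# reach Succeeded, prints its logs, then deletes the test pod
make test-uds-gpu-cluster
```

Because `LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)`, a fixed tag can be substituted if desired, e.g. `LOCAL_VERSION=dev make create-uds-gpu-cluster`.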