Skip to content

Commit a9a12ec

Browse files
committed
e2e: add GPU test
This adds an E2E test for GPU use on Contrast. It currently runs on the GPU-enabled bare-metal SNP runner. For now, the test only verifies that the GPU is available via nvidia-smi, which also confirms that the driver and CUDA work correctly.
1 parent 0ff18ff commit a9a12ec

File tree

7 files changed

+183
-29
lines changed

7 files changed

+183
-29
lines changed

.github/workflows/e2e_manual.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ on:
1010
options:
1111
- genpolicy
1212
- getdents
13+
- gpu
1314
- openssl
1415
- policy
1516
- regression
@@ -24,6 +25,7 @@ on:
2425
options:
2526
- AKS-CLH-SNP
2627
- K3s-QEMU-SNP
28+
- K3s-QEMU-SNP-GPU
2729
- K3s-QEMU-TDX
2830
skip-undeploy:
2931
description: "Skip undeploy"

.github/workflows/e2e_nightly.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,16 @@ jobs:
1919
- name: K3s-QEMU-TDX
2020
runner: TDX
2121
self-hosted: true
22+
- name: K3s-QEMU-SNP-GPU
23+
runner: SNP
24+
self-hosted: true
2225
test-name: [servicemesh, openssl, policy, workloadsecret, volumestatefulset]
26+
include:
27+
- platform:
28+
name: K3s-QEMU-SNP-GPU
29+
runner: SNP
30+
self-hosted: true
31+
test-name: [gpu]
2332
fail-fast: false
2433
name: "${{ matrix.platform.name }}"
2534
uses: ./.github/workflows/e2e.yml

e2e/gpu/gpu_test.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// Copyright 2024 Edgeless Systems GmbH
2+
// SPDX-License-Identifier: AGPL-3.0-only
3+
4+
//go:build e2e
5+
6+
package gpu
7+
8+
import (
9+
"bytes"
10+
"context"
11+
"flag"
12+
"os"
13+
"testing"
14+
"time"
15+
16+
"github.com/edgelesssys/contrast/e2e/internal/contrasttest"
17+
"github.com/edgelesssys/contrast/internal/kuberesource"
18+
"github.com/edgelesssys/contrast/internal/manifest"
19+
"github.com/edgelesssys/contrast/internal/platforms"
20+
"github.com/stretchr/testify/require"
21+
)
22+
23+
const (
24+
gpuPodName = "gpu-pod"
25+
gpuName = "NVIDIA H100 PCIe"
26+
)
27+
28+
// TestGPU runs e2e tests on an GPU-enabled Contrast.
29+
func TestGPU(t *testing.T) {
30+
platform, err := platforms.FromString(contrasttest.Flags.PlatformStr)
31+
require.NoError(t, err)
32+
ct := contrasttest.New(t)
33+
34+
runtimeHandler, err := manifest.RuntimeHandler(platform)
35+
require.NoError(t, err)
36+
37+
resources := kuberesource.OpenSSL()
38+
coordinator := kuberesource.CoordinatorBundle()
39+
40+
resources = append(resources, coordinator...)
41+
42+
resources = kuberesource.PatchRuntimeHandlers(resources, runtimeHandler)
43+
44+
resources = kuberesource.AddPortForwarders(resources)
45+
46+
ct.Init(t, resources)
47+
require.True(t, t.Run("generate", ct.Generate), "contrast generate needs to succeed for subsequent tests")
48+
49+
require.True(t, t.Run("apply", ct.Apply), "Kubernetes resources need to be applied for subsequent tests")
50+
51+
require.True(t, t.Run("set", ct.Set), "contrast set needs to succeed for subsequent tests")
52+
53+
require.True(t, t.Run("contrast verify", ct.Verify), "contrast verify needs to succeed for subsequent tests")
54+
55+
applyGPUPod := func(t *testing.T) {
56+
yaml, err := os.ReadFile("./e2e/gpu/testdata/gpu-pod.yaml")
57+
require.NoError(t, err)
58+
59+
yaml = bytes.ReplaceAll(
60+
bytes.ReplaceAll(yaml, []byte("@@REPLACE_NAMESPACE@@"), []byte(ct.Namespace)),
61+
[]byte("@@REPLACE_RUNTIME@@"), []byte(ct.RuntimeClassName),
62+
)
63+
64+
ct.ApplyFromYAML(t, yaml)
65+
}
66+
67+
require.True(t, t.Run("apply GPU pod", applyGPUPod), "GPU pod needs to deploy successfully for subsequent tests")
68+
69+
t.Run("check GPU availability", func(t *testing.T) {
70+
ctx, cancel := context.WithTimeout(context.Background(), ct.FactorPlatformTimeout(5*time.Minute))
71+
defer cancel()
72+
73+
require := require.New(t)
74+
75+
err := ct.Kubeclient.WaitForPod(ctx, ct.Namespace, gpuPodName)
76+
require.NoError(err, "GPU pod %s did not start", gpuPodName)
77+
78+
argv := []string{"/bin/sh", "-c", "nvidia-smi"}
79+
stdout, stderr, err := ct.Kubeclient.Exec(ctx, ct.Namespace, gpuPodName, argv)
80+
require.NoError(err, "stderr: %q", stderr)
81+
82+
require.Contains(stdout, gpuName, "nvidia-smi output should contain %s", gpuName)
83+
})
84+
}
85+
86+
func TestMain(m *testing.M) {
87+
contrasttest.RegisterFlags()
88+
flag.Parse()
89+
90+
os.Exit(m.Run())
91+
}

e2e/gpu/testdata/gpu-pod.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# TODO(msanft): Move this to internal/kuberesource/sets.go as soon as genpolicy
2+
# support for GPU pods is added.
3+
apiVersion: v1
4+
kind: Pod
5+
metadata:
6+
name: gpu-pod
7+
namespace: "@@REPLACE_NAMESPACE@@"
8+
annotations:
9+
# Allow-all policy
10+
# TODO(msanft): Generate a policy dynamically once we support policy generation for GPU pods.
11+
io.katacontainers.config.agent.policy: IyBDb3B5cmlnaHQgKGMpIDIwMjMgTWljcm9zb2Z0IENvcnBvcmF0aW9uCiMKIyBTUERYLUxpY2Vuc2UtSWRlbnRpZmllcjogQXBhY2hlLTIuMAojCgpwYWNrYWdlIGFnZW50X3BvbGljeQoKZGVmYXVsdCBBZGRBUlBOZWlnaGJvcnNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBBZGRTd2FwUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ2xvc2VTdGRpblJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IENvcHlGaWxlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlU2FuZGJveFJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IERlc3Ryb3lTYW5kYm94UmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgRXhlY1Byb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHZXRNZXRyaWNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgR2V0T09NRXZlbnRSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHdWVzdERldGFpbHNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBMaXN0SW50ZXJmYWNlc1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IExpc3RSb3V0ZXNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBNZW1Ib3RwbHVnQnlQcm9iZVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IE9ubGluZUNQVU1lbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFBhdXNlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUHVsbEltYWdlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUmVhZFN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZVN0YWxlVmlydGlvZnNTaGFyZU1vdW50c1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc2VlZFJhbmRvbURldlJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc3VtZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFNldEd1ZXN0RGF0ZVRpbWVSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTZXRQb2xpY3lSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTaWduYWxQcm9jZXNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgU3RhcnRDb250YWluZXJSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGFydFRyYWNpbmdSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGF0c0NvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFN0b3BUcmFjaW5nUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVHR5V2luUmVzaXplUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlRXBoZW1lcmFsTW91bnRzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlSW50ZXJmYWNlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlUm91dGVzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgV2FpdFByb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBXcml0ZVN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQo=
12+
io.katacontainers.config.hypervisor.default_memory: "15258"
13+
cdi.k8s.io/gpu: "nvidia.com/pgpu=0"
14+
spec:
15+
runtimeClassName: "@@REPLACE_RUNTIME@@"
16+
restartPolicy: OnFailure
17+
containers:
18+
- name: vllm
19+
image: ghcr.io/edgelesssys/contrast/ubuntu:24.04
20+
env:
21+
- name: NVIDIA_VISIBLE_DEVICES
22+
value: all
23+
resources:
24+
limits:
25+
"nvidia.com/GH100_H100_PCIE": 1

e2e/internal/contrasttest/contrasttest.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ type ContrastTest struct {
6161
ImageReplacementsFile string
6262
Platform platforms.Platform
6363
NamespaceFile string
64+
RuntimeClassName string
6465
Kubeclient *kubeclient.Kubeclient
6566

6667
// outputs of contrast subcommands
@@ -70,15 +71,21 @@ type ContrastTest struct {
7071

7172
// New creates a new contrasttest.T object bound to the given test.
7273
func New(t *testing.T) *ContrastTest {
74+
require := require.New(t)
75+
7376
platform, err := platforms.FromString(Flags.PlatformStr)
74-
require.NoError(t, err)
77+
require.NoError(err)
78+
79+
runtimeClass, err := kuberesource.ContrastRuntimeClass(platform)
80+
require.NoError(err)
7581

7682
return &ContrastTest{
7783
Namespace: MakeNamespace(t, Flags.NamespaceSuffix),
7884
WorkDir: t.TempDir(),
7985
ImageReplacementsFile: Flags.ImageReplacementsFile,
8086
Platform: platform,
8187
NamespaceFile: Flags.NamespaceFile,
88+
RuntimeClassName: *runtimeClass.Handler,
8289
Kubeclient: kubeclient.NewForTest(t),
8390
}
8491
}
@@ -283,9 +290,15 @@ func patchReferenceValues(k *kubeclient.Kubeclient, platform platforms.Platform)
283290
// Apply the generated resources to the Kubernetes test environment.
284291
func (ct *ContrastTest) Apply(t *testing.T) {
285292
require := require.New(t)
286-
287293
yaml, err := os.ReadFile(path.Join(ct.WorkDir, "resources.yml"))
288294
require.NoError(err)
295+
ct.ApplyFromYAML(t, yaml)
296+
}
297+
298+
// ApplyFromYAML applies the given YAML to the Kubernetes test environment.
299+
func (ct *ContrastTest) ApplyFromYAML(t *testing.T, yaml []byte) {
300+
require := require.New(t)
301+
289302
objects, err := kubeapi.UnmarshalUnstructuredK8SResource(yaml)
290303
require.NoError(err)
291304

packages/by-name/contrast/package.nix

Lines changed: 36 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ let
3232
subPackages = [
3333
"e2e/genpolicy"
3434
"e2e/getdents"
35+
"e2e/gpu"
3536
"e2e/openssl"
3637
"e2e/servicemesh"
3738
"e2e/release"
@@ -81,35 +82,43 @@ let
8182
];
8283
};
8384

84-
snpRefVals = {
85-
snp =
86-
let
87-
launch-digest =
88-
if kata.contrast-node-installer-image.debugRuntime then
89-
kata.snp-launch-digest.override { debug = true; }
90-
else
91-
kata.snp-launch-digest;
92-
in
93-
[
94-
{
95-
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/milan.hex");
96-
productName = "Milan";
97-
}
98-
{
99-
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/genoa.hex");
100-
productName = "Genoa";
101-
}
102-
];
103-
};
85+
snpRefValsWith =
86+
{ gpu }:
87+
{
88+
snp =
89+
let
90+
os-image =
91+
if gpu then
92+
kata.contrast-node-installer-image.gpu.os-image
93+
else
94+
kata.contrast-node-installer-image.os-image;
95+
launch-digest = kata.snp-launch-digest.override {
96+
inherit os-image;
97+
debug = kata.contrast-node-installer-image.debugRuntime;
98+
};
99+
in
100+
[
101+
{
102+
trustedMeasurement = builtins.readFile "${launch-digest}/milan.hex";
103+
productName = "Milan";
104+
}
105+
{
106+
trustedMeasurement = builtins.readFile "${launch-digest}/genoa.hex";
107+
productName = "Genoa";
108+
}
109+
];
110+
};
111+
112+
snpRefVals = snpRefValsWith { gpu = false; };
113+
snpGpuRefVals = snpRefValsWith { gpu = true; };
114+
104115
tdxRefVals = {
105116
tdx = [
106117
(
107118
let
108-
launch-digests =
109-
if kata.contrast-node-installer-image.debugRuntime then
110-
kata.tdx-launch-digests.override { debug = true; }
111-
else
112-
kata.tdx-launch-digests;
119+
launch-digests = kata.tdx-launch-digests.override {
120+
debug = kata.contrast-node-installer-image.debugRuntime;
121+
};
113122
in
114123
{
115124
mrTd = builtins.readFile "${launch-digests}/mrtd.hex";
@@ -135,9 +144,9 @@ let
135144
"${k3s-qemu-tdx-handler}" = tdxRefVals;
136145
"${rke2-qemu-tdx-handler}" = tdxRefVals;
137146
"${metal-qemu-snp-handler}" = snpRefVals;
138-
"${metal-qemu-snp-gpu-handler}" = snpRefVals;
147+
"${metal-qemu-snp-gpu-handler}" = snpGpuRefVals;
139148
"${k3s-qemu-snp-handler}" = snpRefVals;
140-
"${k3s-qemu-snp-gpu-handler}" = snpRefVals;
149+
"${k3s-qemu-snp-gpu-handler}" = snpGpuRefVals;
141150
}
142151
);
143152

packages/by-name/kata/snp-launch-digest/package.nix

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,5 +52,10 @@ stdenvNoCC.mkDerivation {
5252
--initrd ${initrd} \
5353
--append "${cmdline}" \
5454
--output-format hex > $out/genoa.hex
55+
56+
# strip the trailing newline from each digest file (truncate drops the last byte)
57+
for file in $out/*.hex; do
58+
truncate -s -1 "$file"
59+
done
5560
'';
5661
}

0 commit comments

Comments
 (0)