From 2fd580cc34e2621c13ebdaa53aa7d932728f1b9f Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 26 Sep 2024 11:45:54 -0700 Subject: [PATCH] tests: Added Gaudi HCCL Demo L2 Test Case Signed-off-by: Chaitanya Kulkarni --- tests/gaudi/l2/README.md | 36 +++++++++++++++++++++++++ tests/gaudi/l2/hccl_build.yaml | 42 ++++++++++++++++++++++++++++++++++ tests/gaudi/l2/hccl_job.yaml | 29 +++++++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 tests/gaudi/l2/README.md create mode 100644 tests/gaudi/l2/hccl_build.yaml create mode 100644 tests/gaudi/l2/hccl_job.yaml diff --git a/tests/gaudi/l2/README.md b/tests/gaudi/l2/README.md new file mode 100644 index 00000000..a32fb9fb --- /dev/null +++ b/tests/gaudi/l2/README.md @@ -0,0 +1,36 @@ +# Verify Intel® Gaudi® AI Accelerator Provisioning + +## HCCL +HCCL (Habana Collective Communication Library) demo is a program that demonstrates HCCL usage and supports communication via Gaudi based scale-out or Host NIC scale-out. Refer to [HCCL Demo](https://github.com/HabanaAI/hccl_demo/tree/main?tab=readme-ov-file#hccl-demo) for more details. + +Build the workload container image: +``` +$ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/gaudi/l2/hccl_build.yaml +``` +Deploy and execute the workload: +``` +$ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/gaudi/l2/hccl_job.yaml +``` + +Verify Output: +``` +$ oc get pods +NAME READY STATUS RESTARTS AGE +hccl-demo-workload-1-build 0/1 Completed 0 23m +hccl-demo-workload-wq8mx 0/1 Completed 0 10m +``` +``` +$ oc logs hccl-demo-workload-wq8mx +Affinity: Numa mapping directory: /tmp/affinity_topology_output +Affinity: Script has not been executed before, going to execute... +Affinity: Script has finished successfully +Welcome to HCCL demo +. +. +.
+#################################################################################################### +[BENCHMARK] hcclAllReduce(src!=dst, data_size=33554432, count=8388608, dtype=float, iterations=1000) +[BENCHMARK] NW Bandwidth : 258.209121 GB/s +[BENCHMARK] Algo Bandwidth : 147.548069 GB/s +#################################################################################################### +``` \ No newline at end of file diff --git a/tests/gaudi/l2/hccl_build.yaml b/tests/gaudi/l2/hccl_build.yaml new file mode 100644 index 00000000..cd549876 --- /dev/null +++ b/tests/gaudi/l2/hccl_build.yaml @@ -0,0 +1,42 @@ +apiVersion: image.openshift.io/v1 +kind: ImageStream +metadata: + name: hccl-demo-workload + namespace: hccl-demo +--- +kind: BuildConfig +apiVersion: build.openshift.io/v1 +metadata: + name: hccl-demo-workload + namespace: hccl-demo +spec: + output: + to: + kind: ImageStreamTag + name: 'hccl-demo-workload:latest' + strategy: + type: Docker + source: + type: Dockerfile + dockerfile: | + ARG BUILDER=vault.habana.ai/gaudi-docker/1.17.1/rhel9.4/habanalabs/pytorch-installer-2.3.1:1.17.1-40 + FROM ${BUILDER} AS builder + + WORKDIR / + RUN git clone https://github.com/HabanaAI/hccl_demo.git \ + && cd hccl_demo \ + && make + + WORKDIR / + RUN git clone https://github.com/HabanaAI/hccl_ofi_wrapper.git \ + && export LIBFABRIC_ROOT=/opt/habanalabs/libfabric-1.20.0 \ + && cd hccl_ofi_wrapper \ + && make \ + && cp libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so \ + && ldconfig \ + && export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/habanalabs/ + + WORKDIR /hccl_demo + triggers: + - type: ConfigChange + runPolicy: Serial \ No newline at end of file diff --git a/tests/gaudi/l2/hccl_job.yaml b/tests/gaudi/l2/hccl_job.yaml new file mode 100644 index 00000000..cfd1651a --- /dev/null +++ b/tests/gaudi/l2/hccl_job.yaml @@ -0,0 +1,29 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: hccl-demo-workload + namespace: hccl-demo +spec: + template: + 
metadata: + spec: + restartPolicy: Never + serviceAccountName: hccl-demo-anyuid-sa + containers: + - name: hccl-demo-workload + image: image-registry.openshift-image-registry.svc:5000/hccl-demo/hccl-demo-workload:latest + workingDir: "/hccl_demo" + command: ["/bin/bash", "-c", "--"] + ## sleep for 20 seconds to avoid race condition + args: + - | + sleep 20 + python3 run_hccl_demo.py --nranks 8 --node_id 0 --size 32m --test all_reduce --loop 1000 --ranks_per_node 8 + sleep 20 + env: + - name: HCCL_COMM_ID + value: '127.0.0.1:5555' + resources: + limits: + habana.ai/gaudi: 8 + imagePullPolicy: IfNotPresent \ No newline at end of file