UCT: Merge branch 'master' into uct-perftest-device-cuda

iyastreb · iyastreb · commit 7a8e4da5db28 · 2025-09-08T09:55:05.000Z
diff --git a/Makefile.am b/Makefile.am
@@ -34,6 +34,7 @@ dist_perftest__DATA = \
 	contrib/ucx_perftest_config/test_types_ucp_rma \
 	contrib/ucx_perftest_config/test_types_ucp_amo \
 	contrib/ucx_perftest_config/test_types_ucp_daemon \
+	contrib/ucx_perftest_config/test_types_ucp_device_cuda \
 	contrib/ucx_perftest_config/transports
 
 SUBDIRS = \
diff --git a/buildlib/az-helpers.sh b/buildlib/az-helpers.sh
@@ -140,11 +140,16 @@ function az_module_unload() {
     module unload "${module}" || true
 }
 
+get_num_gpus() {
+    # Print how many lines of `nvidia-smi -L` output mention "GPU"; always succeed
+    nvidia-smi -L | grep -c GPU || true
+}
+
 # Ensure that GPU is present
 check_gpu() {
     name=$1
     if [ "$name" == "gpu" ]; then
-        if ! nvidia-smi -L |& grep -q GPU; then
+        if [ "$(get_num_gpus)" -eq 0 ]; then
             azure_log_error "No GPU device found on $(hostname -s)"
             exit 1
         fi
@@ -185,7 +190,7 @@ try_load_cuda_env() {
 
     # Check number of available GPUs
     nvidia-smi -a || true
-    num_gpus=$(nvidia-smi -L | grep GPU | wc -l)
+    num_gpus=$(get_num_gpus)
     [ "${num_gpus}" -gt 0 ] || return 0
 
     # Check cuda env module
diff --git a/buildlib/tools/common.sh b/buildlib/tools/common.sh
@@ -70,6 +70,10 @@ make_clean() {
 	$MAKEP ${1:-clean}
 }
 
+has_gpunetio_devel() {
+	test -d /opt/mellanox/doca # same path build() passes to --with-doca-gpunetio
+}
+
 #
 # Configure and build
 #   $1 - mode (devel|release)
@@ -82,6 +86,11 @@ build() {
 	if [ "X$have_cuda" == "Xyes" ]
 	then
 		config_args+=" --with-iodemo-cuda"
+
+		if has_gpunetio_devel
+		then
+			config_args+=" --with-doca-gpunetio=/opt/mellanox/doca"
+		fi
 	fi
 
 	../contrib/configure-${mode} ${config_args} "$@"
diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh
@@ -633,6 +633,36 @@ run_ucx_perftest_with_daemon() {
 	done
 }
 
+#
+# Run ucx_perftest CUDA device tests (skipped without CUDA, DOCA or a GPU)
+#
+run_ucx_perftest_cuda_device() {
+	if [ "X$have_cuda" != "Xyes" ]; then
+		echo "==== CUDA not available, skipping CUDA device tests ===="
+		return 0
+	fi
+
+	if ! has_gpunetio_devel; then
+		echo "==== DOCA not available, skipping CUDA device tests ===="
+		return 0
+	fi
+
+	if [ "$(get_num_gpus)" -eq 0 ]; then
+		echo "==== No NVIDIA GPUs found, skipping CUDA device tests ===="
+		return 0
+	fi
+
+	echo "==== Running ucx_perftest with cuda kernel ===="
+	ucx_inst_ptest=$ucx_inst/share/ucx/perftest
+	ucx_perftest="$ucx_inst/bin/ucx_perftest"
+	ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda"
+
+	# TODO: Run on all GPUs
+	ucp_client_args="-a cuda $(hostname)"
+
+	run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
+}
+
 #
 # Test malloc hooks with mpi
 #
@@ -1208,6 +1238,7 @@ run_tests() {
 	do_distributed_task 3 4 run_ucp_client_server
 	do_distributed_task 0 4 test_no_cuda_context
 	do_distributed_task 1 4 run_ucx_perftest_with_daemon
+	do_distributed_task 1 4 run_ucx_perftest_cuda_device
 
 	# long devel tests
 	do_distributed_task 0 4 run_ucp_hello
diff --git a/contrib/ucx_perftest_config/test_types_ucp_device_cuda b/contrib/ucx_perftest_config/test_types_ucp_device_cuda
@@ -0,0 +1,7 @@
+#
+# Basic UCP CUDA device tests: put_multi bandwidth and latency
+#
+ucp_device_cuda_bw_1k_1thread         -t ucp_put_multi_bw -m cuda -s 1024 -n 10000
+ucp_device_cuda_bw_1k_128threads      -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 128
+ucp_device_cuda_lat_1k_1thread        -t ucp_put_multi_lat -m cuda -s 1024 -n 10000
+ucp_device_cuda_lat_1k_128threads     -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 128
diff --git a/src/tools/perf/Makefile.am b/src/tools/perf/Makefile.am
@@ -43,6 +43,7 @@ dist_perftest_DATA = \
 	$(top_srcdir)/contrib/ucx_perftest_config/test_types_uct \
 	$(top_srcdir)/contrib/ucx_perftest_config/test_types_ucp \
 	$(top_srcdir)/contrib/ucx_perftest_config/test_types_ucp_daemon \
+	$(top_srcdir)/contrib/ucx_perftest_config/test_types_ucp_device_cuda \
 	$(top_srcdir)/contrib/ucx_perftest_config/transports