From 12310627ff20c55e0f1af1114edab7f8c9148e8f Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Tue, 28 Jan 2025 12:45:21 -0500 Subject: [PATCH] [release-test] Fix GPU memory usage detection (#2576) --- .github/workflows/userbenchmark-a100.yml | 6 +++--- userbenchmark/release-test/monitor_proc.sh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/userbenchmark-a100.yml b/.github/workflows/userbenchmark-a100.yml index 9b7ac76e91..bc6c56beef 100644 --- a/.github/workflows/userbenchmark-a100.yml +++ b/.github/workflows/userbenchmark-a100.yml @@ -27,9 +27,9 @@ jobs: - name: Install Conda run: | bash ./.ci/torchbench/install-conda.sh - - name: Install TorchBench - run: | - bash ./.ci/torchbench/install.sh +# - name: Install TorchBench +# run: | +# bash ./.ci/torchbench/install.sh - name: Run user benchmark run: | set -x diff --git a/userbenchmark/release-test/monitor_proc.sh b/userbenchmark/release-test/monitor_proc.sh index 3594f8caf4..fd4ca94d65 100644 --- a/userbenchmark/release-test/monitor_proc.sh +++ b/userbenchmark/release-test/monitor_proc.sh @@ -16,12 +16,12 @@ get_gpu_max_memory_usage_cuda() { local my_pid=$1 local max=$2 local curr - # Some processes might not use the GPU - if ! nvidia-smi pmon -s m -c 1 -o T | grep "${my_pid}" >/dev/null 2>/dev/null; then + curr=$(nvidia-smi dmon -s m -c 1 -o T -i 0 | tail -n +3 | awk '{print $3}' | sort -n | tail -1 | grep -o "[0-9.]*") + # Some processes might not use the GPU, then memory usage should be 0 + if [ "${curr}" -eq 0 ] ; then echo "${max}" return fi - curr=$(nvidia-smi pmon -s m -c 1 -o T | grep "${my_pid}" | awk '{print $5}' | sort | tail -1 | grep -o "[0-9.]*") max "${curr}" "${max}" }