From fc5d62809fae59d5a7470f70b98de872730fd66a Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 2 Feb 2026 18:35:27 -0600 Subject: [PATCH 1/6] DEBUG:HIP device. --- .../integration_test_8gpu_autoparallel.yaml | 16 ++++++++ .../debug_hip_device_name_ci.cpp | 41 +++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp diff --git a/.github/workflows/integration_test_8gpu_autoparallel.yaml b/.github/workflows/integration_test_8gpu_autoparallel.yaml index 7ce391f71a..3f32ab209a 100644 --- a/.github/workflows/integration_test_8gpu_autoparallel.yaml +++ b/.github/workflows/integration_test_8gpu_autoparallel.yaml @@ -64,6 +64,22 @@ jobs: rocminfo || true fi + # DEBUG: HIP Device name + if [[ "${{ matrix.gpu-arch-type }}" == "rocm" ]]; then + echo "::group::DEBUG HIP device-name" + # Search for hipcc + HIPCC="$(find / -type f -name hipcc -perm -111 2>/dev/null | head -n 1)" + if [[ -z "$HIPCC" ]]; then + echo "ERROR: hipcc not found on filesystem" + exit 1 + fi + echo "Found hipcc at: $HIPCC" + # Compile and run the checked-in C++ program + "$HIPCC" -O2 torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp -o "$RUNNER_TEMP/hip_device_name" + "$RUNNER_TEMP/hip_device_name" + echo "::endgroup::" + fi + pip config --user set global.progress_bar off python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} diff --git a/torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp b/torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp new file mode 100644 index 0000000000..af8a42b62c --- /dev/null +++ b/torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp @@ -0,0 +1,41 @@ +// How to compile and run: +// hipcc hip_device_name.cpp -o hip_device_name +// ./hip_device_name +#include <hip/hip_runtime.h> +#include <stdio.h> + +#define HIP_CHECK(call) \ + do { \ + hipError_t err = call; 
\ + if (err != hipSuccess) { \ + fprintf(stderr, "HIP error at %s:%d: %s\n", __FILE__, __LINE__, \ + hipGetErrorString(err)); \ + return 1; \ + } \ + } while (0) + +// Simulates torch.cuda.get_device_name(None) behavior +void print_current_device_name() { + int current_device; + if (hipGetDevice(&current_device) != hipSuccess) { // This is what torch.cuda.current_device() calls + fprintf(stderr, "Failed to get current device\n"); + return; + } + + hipDeviceProp_t prop; + if (hipGetDeviceProperties(&prop, current_device) != hipSuccess) { + fprintf(stderr, "Failed to get device properties\n"); + return; + } + + printf("Current device %d: %s\n", current_device, prop.name); +} + +int main(int argc, char* argv[]) { + // Demonstrate hipGetDevice / hipSetDevice + // This is what torch.cuda.get_device_name(None) does internally + printf("=== Simulating torch.cuda.get_device_name(None) ===\n"); + print_current_device_name(); + + return 0; +} From 77fb283b06635728120ba0e242362a474ecbb6af Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 2 Feb 2026 19:44:35 -0600 Subject: [PATCH 2/6] DEBUG: fix finding hipcc. 
--- .github/workflows/integration_test_8gpu_autoparallel.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_autoparallel.yaml b/.github/workflows/integration_test_8gpu_autoparallel.yaml index 3f32ab209a..15f553d566 100644 --- a/.github/workflows/integration_test_8gpu_autoparallel.yaml +++ b/.github/workflows/integration_test_8gpu_autoparallel.yaml @@ -68,7 +68,7 @@ jobs: if [[ "${{ matrix.gpu-arch-type }}" == "rocm" ]]; then echo "::group::DEBUG HIP device-name" # Search for hipcc - HIPCC="$(find / -type f -name hipcc -perm -111 2>/dev/null | head -n 1)" + HIPCC="$(find / -type f -name hipcc -perm -111 -print -quit 2>/dev/null || true)" if [[ -z "$HIPCC" ]]; then echo "ERROR: hipcc not found on filesystem" exit 1 From c321b739e07ae08b807e71597e20e63481cb923c Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 2 Feb 2026 20:24:59 -0600 Subject: [PATCH 3/6] Saving results in temporary folder. --- .github/workflows/integration_test_8gpu_autoparallel.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_autoparallel.yaml b/.github/workflows/integration_test_8gpu_autoparallel.yaml index 15f553d566..f8762660ed 100644 --- a/.github/workflows/integration_test_8gpu_autoparallel.yaml +++ b/.github/workflows/integration_test_8gpu_autoparallel.yaml @@ -75,7 +75,9 @@ jobs: fi echo "Found hipcc at: $HIPCC" # Compile and run the checked-in C++ program - "$HIPCC" -O2 torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp -o "$RUNNER_TEMP/hip_device_name" + OUTDIR="./hip_tmp" + mkdir -p "$OUTDIR" + "$HIPCC" -O2 torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp -o "$OUTDIR/hip_device_name" "$RUNNER_TEMP/hip_device_name" echo "::endgroup::" fi From 059867500e6a78c504d5920c58a4dc0eab30a9ec Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 2 Feb 2026 20:42:05 -0600 Subject: [PATCH 4/6] Using sudo. 
--- .github/workflows/integration_test_8gpu_autoparallel.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_autoparallel.yaml b/.github/workflows/integration_test_8gpu_autoparallel.yaml index f8762660ed..7d16df5b6b 100644 --- a/.github/workflows/integration_test_8gpu_autoparallel.yaml +++ b/.github/workflows/integration_test_8gpu_autoparallel.yaml @@ -76,9 +76,10 @@ jobs: echo "Found hipcc at: $HIPCC" # Compile and run the checked-in C++ program OUTDIR="./hip_tmp" - mkdir -p "$OUTDIR" + sudo mkdir -p "$OUTDIR" + sudo chown -R $(id -u):$(id -g) "$OUTDIR" "$HIPCC" -O2 torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp -o "$OUTDIR/hip_device_name" - "$RUNNER_TEMP/hip_device_name" + "$OUTDIR/hip_device_name" echo "::endgroup::" fi From 6f947b20092b3be46573f404cb54732f4e61f0b8 Mon Sep 17 00:00:00 2001 From: Wen Chen Date: Tue, 3 Feb 2026 05:02:25 +0000 Subject: [PATCH 5/6] Added torch test. --- .github/workflows/integration_test_8gpu_autoparallel.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration_test_8gpu_autoparallel.yaml b/.github/workflows/integration_test_8gpu_autoparallel.yaml index 7d16df5b6b..4d60f8aa14 100644 --- a/.github/workflows/integration_test_8gpu_autoparallel.yaml +++ b/.github/workflows/integration_test_8gpu_autoparallel.yaml @@ -87,6 +87,9 @@ jobs: python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + echo "Getting device name:" + python -c "import torch; print(torch.cuda.get_device_name(None))" + # Install autoparallel - required dependency for autoparallel experiment python -m pip install git+https://github.com/meta-pytorch/autoparallel.git From b3002550d09ace530f932205c5e527d53156ff79 Mon Sep 17 00:00:00 2001 From: Wen Chen Date: Wed, 4 Feb 2026 04:33:54 +0000 Subject: [PATCH 6/6] Fix issue with out-of-date amdgpu.ids. 
--- .github/workflows/integration_test_8gpu_autoparallel.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration_test_8gpu_autoparallel.yaml b/.github/workflows/integration_test_8gpu_autoparallel.yaml index 4d60f8aa14..77a80cb882 100644 --- a/.github/workflows/integration_test_8gpu_autoparallel.yaml +++ b/.github/workflows/integration_test_8gpu_autoparallel.yaml @@ -87,6 +87,9 @@ jobs: python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + sudo curl -L -o /usr/share/libdrm/amdgpu.ids https://gitlab.freedesktop.org/mesa/libdrm/-/raw/main/data/amdgpu.ids + sudo mkdir -p /opt/amdgpu/share/libdrm + sudo ln -sf /usr/share/libdrm/amdgpu.ids /opt/amdgpu/share/libdrm/amdgpu.ids echo "Getting device name:" python -c "import torch; print(torch.cuda.get_device_name(None))"