diff --git a/.github/workflows/integration_test_8gpu_autoparallel.yaml b/.github/workflows/integration_test_8gpu_autoparallel.yaml index 7ce391f71a..77a80cb882 100644 --- a/.github/workflows/integration_test_8gpu_autoparallel.yaml +++ b/.github/workflows/integration_test_8gpu_autoparallel.yaml @@ -64,10 +64,35 @@ jobs: rocminfo || true fi + # DEBUG: HIP Device name + if [[ "${{ matrix.gpu-arch-type }}" == "rocm" ]]; then + echo "::group::DEBUG HIP device-name" + # Search for hipcc + HIPCC="$(find / -type f -name hipcc -perm -111 -print -quit 2>/dev/null || true)" + if [[ -z "$HIPCC" ]]; then + echo "ERROR: hipcc not found on filesystem" + exit 1 + fi + echo "Found hipcc at: $HIPCC" + # Compile and run the checked-in C++ program + OUTDIR="./hip_tmp" + sudo mkdir -p "$OUTDIR" + sudo chown -R $(id -u):$(id -g) "$OUTDIR" + "$HIPCC" -O2 torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp -o "$OUTDIR/hip_device_name" + "$OUTDIR/hip_device_name" + echo "::endgroup::" + fi + pip config --user set global.progress_bar off python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + sudo curl -L -o /usr/share/libdrm/amdgpu.ids https://gitlab.freedesktop.org/mesa/libdrm/-/raw/main/data/amdgpu.ids + sudo mkdir -p /opt/amdgpu/share/libdrm + sudo ln -sf /usr/share/libdrm/amdgpu.ids /opt/amdgpu/share/libdrm/amdgpu.ids + echo "Getting device name:" + python -c "import torch; print(torch.cuda.get_device_name(None))" + # Install autoparallel - required dependency for autoparallel experiment python -m pip install git+https://github.com/meta-pytorch/autoparallel.git diff --git a/torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp b/torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp new file mode 100644 index 0000000000..af8a42b62c --- /dev/null +++ b/torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp @@ -0,0 +1,41 @@ +// How to compile and 
run:
+// hipcc debug_hip_device_name_ci.cpp -o hip_device_name
+// ./hip_device_name
+#include <hip/hip_runtime.h>
+#include <cstdio>
+
+#define HIP_CHECK(call) \
+  do { \
+    hipError_t err = call; \
+    if (err != hipSuccess) { \
+      fprintf(stderr, "HIP error at %s:%d: %s\n", __FILE__, __LINE__, \
+              hipGetErrorString(err)); \
+      return 1; \
+    } \
+  } while (0)
+
+// Simulates torch.cuda.get_device_name(None) behavior
+void print_current_device_name() {
+  int current_device;
+  if (hipGetDevice(&current_device) != hipSuccess) { // This is what torch.cuda.current_device() calls
+    fprintf(stderr, "Failed to get current device\n");
+    return;
+  }
+
+  hipDeviceProp_t prop;
+  if (hipGetDeviceProperties(&prop, current_device) != hipSuccess) {
+    fprintf(stderr, "Failed to get device properties\n");
+    return;
+  }
+
+  printf("Current device %d: %s\n", current_device, prop.name);
+}
+
+int main() {
+  // Demonstrate hipGetDevice / hipSetDevice
+  // This is what torch.cuda.get_device_name(None) does internally
+  printf("=== Simulating torch.cuda.get_device_name(None) ===\n");
+  print_current_device_name();
+
+  return 0;
+}