Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/integration_test_8gpu_autoparallel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,35 @@ jobs:
rocminfo || true
fi

# DEBUG: HIP device name
# On ROCm runners only: locate hipcc, build the checked-in HIP debug program,
# and run it so its output is captured in a collapsible log group.
if [[ "${{ matrix.gpu-arch-type }}" == "rocm" ]]; then
  echo "::group::DEBUG HIP device-name"
  # Prefer the hipcc that is on PATH; only fall back to a (slow) scan of the
  # whole filesystem when PATH does not provide one. Failures of either lookup
  # are tolerated here on purpose and handled by the emptiness check below.
  HIPCC="$(command -v hipcc || true)"
  if [[ -z "$HIPCC" ]]; then
    HIPCC="$(find / -type f -name hipcc -perm -111 -print -quit 2>/dev/null || true)"
  fi
  if [[ -z "$HIPCC" ]]; then
    echo "ERROR: hipcc not found on filesystem" >&2
    # Close the log group before aborting so the failure is not left inside
    # an unterminated group.
    echo "::endgroup::"
    exit 1
  fi
  echo "Found hipcc at: $HIPCC"
  # Compile and run the checked-in C++ program
  OUTDIR="./hip_tmp"
  sudo mkdir -p "$OUTDIR"
  # The directory is created via sudo, so hand ownership back to the current
  # user before the unprivileged compile writes into it.
  sudo chown -R "$(id -u):$(id -g)" "$OUTDIR"
  "$HIPCC" -O2 torchtitan/experiments/autoparallel/debug_ci_script/debug_hip_device_name_ci.cpp -o "$OUTDIR/hip_device_name"
  "$OUTDIR/hip_device_name"
  echo "::endgroup::"
fi

# Turn off pip's progress bar for the installs below.
pip config --user set global.progress_bar off

# Index URL comes from the job matrix; quoted so it is always passed as a
# single argument.
python -m pip install --force-reinstall --pre torch --index-url "${{ matrix.index-url }}"

# Fetch libdrm's amdgpu.ids table and expose it under /opt/amdgpu as well.
# NOTE(review): presumably needed so the device-name lookup below can resolve
# the GPU's name - confirm.
# Both target directories are created first so curl/ln cannot fail on a missing
# parent, and -f makes curl fail on an HTTP error instead of saving the error
# page as amdgpu.ids.
sudo mkdir -p /usr/share/libdrm /opt/amdgpu/share/libdrm
sudo curl -fL -o /usr/share/libdrm/amdgpu.ids https://gitlab.freedesktop.org/mesa/libdrm/-/raw/main/data/amdgpu.ids
sudo ln -sf /usr/share/libdrm/amdgpu.ids /opt/amdgpu/share/libdrm/amdgpu.ids
echo "Getting device name:"
python -c "import torch; print(torch.cuda.get_device_name(None))"

# Install autoparallel - required dependency for autoparallel experiment
python -m pip install git+https://github.com/meta-pytorch/autoparallel.git

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// How to compile and run:
// hipcc debug_hip_device_name_ci.cpp -o hip_device_name
// ./hip_device_name
#include <hip/hip_runtime.h>
#include <stdio.h>

// Evaluates a HIP runtime call and compares its result against hipSuccess.
// On failure it prints the source file, line number, and the
// hipGetErrorString() text to stderr, then executes `return 1` in the
// enclosing function - so it may only be used inside a function whose return
// type accepts an int. The do { ... } while (0) wrapper makes the expansion a
// single statement that can be followed by a semicolon.
#define HIP_CHECK(call) \
do { \
hipError_t err = call; \
if (err != hipSuccess) { \
fprintf(stderr, "HIP error at %s:%d: %s\n", __FILE__, __LINE__, \
hipGetErrorString(err)); \
return 1; \
} \
} while (0)

// Simulates torch.cuda.get_device_name(None) behavior
void print_current_device_name() {
int current_device;
if (hipGetDevice(&current_device) != hipSuccess) { // This is what torch.cuda.current_device() calls
fprintf(stderr, "Failed to get current device\n");
return;
}

hipDeviceProp_t prop;
if (hipGetDeviceProperties(&prop, current_device) != hipSuccess) {
fprintf(stderr, "Failed to get device properties\n");
return;
}

printf("Current device %d: %s\n", current_device, prop.name);
}

int main(int argc, char* argv[]) {
// Demonstrate hipGetDevice / hipSetDevice
// This is what torch.cuda.get_device_name(None) does internally
printf("=== Simulating torch.cuda.get_device_name(None) ===\n");
print_current_device_name();

return 0;
}
Loading