diff --git a/.github/workflows/integration_test_8gpu_autoparallel.yaml b/.github/workflows/integration_test_8gpu_autoparallel.yaml index 7ce391f71a..7e9b6ac366 100644 --- a/.github/workflows/integration_test_8gpu_autoparallel.yaml +++ b/.github/workflows/integration_test_8gpu_autoparallel.yaml @@ -74,4 +74,12 @@ jobs: sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded" sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded" + # HIP runtime relies on the file /opt/amdgpu/share/libdrm/amdgpu.ids to look up the product name of AMDGPU. + # The docker image only has /usr/share/libdrm/amdgpu.ids and it is out of date. So this is the workaround. + if [[ "${{ matrix.gpu-arch-type }}" == "rocm" ]]; then + sudo curl -L -o /usr/share/libdrm/amdgpu.ids https://gitlab.freedesktop.org/mesa/libdrm/-/raw/main/data/amdgpu.ids + sudo mkdir -p /opt/amdgpu/share/libdrm + sudo ln -sf /usr/share/libdrm/amdgpu.ids /opt/amdgpu/share/libdrm/amdgpu.ids + fi + python -m torchtitan.experiments.autoparallel.tests.integration_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 4 diff --git a/torchtitan/experiments/autoparallel/tests/integration_tests.py b/torchtitan/experiments/autoparallel/tests/integration_tests.py index 4db3f4180d..5aa4e62797 100644 --- a/torchtitan/experiments/autoparallel/tests/integration_tests.py +++ b/torchtitan/experiments/autoparallel/tests/integration_tests.py @@ -30,7 +30,6 @@ def build_autoparallel_test_list() -> list[OverrideDefinitions]: "llama3 AutoParallel FSDP+TP", "llama3_autoparallel_fsdp_tp", ngpu=4, - skip_rocm_test=True, ), # TODO: Re-enable this once we fix the test # deepseek_v3 tests