From 5c2de37658741195c1994c93dab2499e9ee128a9 Mon Sep 17 00:00:00 2001
From: Huanyu He
Date: Wed, 22 Jan 2025 15:37:37 -0800
Subject: [PATCH] set LD_LIBRARY_PATH for fbgemm in validate_binaries.sh (#2696)

Summary:
# context
* to address the error below when running the github test (the CI log interleaves the inner `python -c` import traceback with the outer `run_with_env_secrets.py` wrapper's traceback):
```
+++ conda run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec'
+++ local cmd=run
+++ case "$cmd" in
+++ __conda_exe run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec'
+++ /opt/conda/bin/conda run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec'
ERROR:root:Could not load the library 'fbgemm_gpu_tbe_index_select.so': /lib64/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_tbe_index_select.so)
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 62, in <module>
    _load_library(f"{library}.so")
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 21, in _load_library
    raise error
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 17, in _load_library
    main()
  File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main
    run_cmd_or_die(f"docker exec -t {container_name} /exec")
  File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die
    raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}")
RuntimeError: Command docker exec -t d5cfe23625bf3b1538b808a1344090ae72ff3977990bc1f780c7a46435a384ec /exec failed with exit code 1
    torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/torch/_ops.py", line 1357, in load_library
    ctypes.CDLL(path)
  File "/opt/conda/envs/build_binary/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_tbe_index_select.so)
```
* the issue was fixed before by D67949409 ([#2671](https://github.com/pytorch/torchrec/pull/2671)) for another test
* this diff applies the same fix to the validate_binaries test (a quick way to confirm the underlying libstdc++ mismatch is sketched below)
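The underlying issue is a libstdc++ version mismatch: the fbgemm_gpu extension needs the `GLIBCXX_3.4.29` symbol version, the system `/lib64/libstdc++.so.6` on the runner does not provide it, and the conda env typically ships a newer libstdc++ under `${CONDA_PREFIX}/lib`. Prepending the env's `lib` directory to `LD_LIBRARY_PATH` lets the loader pick up that newer copy. A rough way to confirm the mismatch by hand (a sketch only, not part of the patch; it assumes `strings` and `ldd` are available on the runner and reuses the paths from the error above):
```
# GLIBCXX symbol versions exported by the system libstdc++ (3.4.29 is missing here)
strings /lib64/libstdc++.so.6 | grep '^GLIBCXX_' | sort -V | tail -n 3

# GLIBCXX symbol versions exported by the conda env's libstdc++ (expected to include 3.4.29)
conda run -n build_binary bash -c 'strings "${CONDA_PREFIX}/lib/libstdc++.so.6" | grep "^GLIBCXX_" | sort -V | tail -n 3'

# which libstdc++ the failing extension actually resolves to at load time
ldd /opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_tbe_index_select.so | grep libstdc++
```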
# details
* previous failures {F1974496108}

Differential Revision: D68511145
---
 .github/scripts/validate_binaries.sh    | 72 ++++++++++++++++---------
 .github/workflows/validate-binaries.yml |  6 +++
 2 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh
index 85ad0de47..118273869 100755
--- a/.github/scripts/validate_binaries.sh
+++ b/.github/scripts/validate_binaries.sh
@@ -7,8 +7,9 @@
 export PYTORCH_CUDA_PKG=""
+export CONDA_ENV="build_binary"
 
-conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
+conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}"
 
 conda run -n build_binary python --version
 
@@ -49,41 +50,64 @@ elif [[ ${MATRIX_CHANNEL} = 'release' ]]; then
     export PYTORCH_URL="https://download.pytorch.org/whl/${CUDA_VERSION}"
 fi
 
+
+echo "CU_VERSION: ${CUDA_VERSION}"
+echo "MATRIX_CHANNEL: ${MATRIX_CHANNEL}"
+echo "CONDA_ENV: ${CONDA_ENV}"
+
+# shellcheck disable=SC2155
+export CONDA_PREFIX=$(conda run -n "${CONDA_ENV}" printenv CONDA_PREFIX)
+
+find / -name *cuda*
+
+if [[ $CUDA_VERSION = cu* ]]; then
+    # Setting LD_LIBRARY_PATH fixes the runtime error with fbgemm_gpu not
+    # being able to locate libnvrtc.so
+    echo "[NOVA] Setting LD_LIBRARY_PATH ..."
+    conda env config vars set -n ${CONDA_ENV} \
+        LD_LIBRARY_PATH="/usr/local/lib:/usr/lib64:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
+else
+    echo "[NOVA] Setting LD_LIBRARY_PATH ..."
+    conda env config vars set -p ${CONDA_ENV} \
+        LD_LIBRARY_PATH="/usr/local/lib:${CONDA_ENV}/lib:${LD_LIBRARY_PATH}"
+fi
+
+
 # install pytorch
 # switch back to conda once torch nightly is fixed
 # if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
 #     export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}"
 # fi
-conda run -n build_binary pip install torch --index-url "$PYTORCH_URL"
+conda run -n "${CONDA_ENV}" pip install torch --index-url "$PYTORCH_URL"
 
 # install fbgemm
-conda run -n build_binary pip install fbgemm-gpu --index-url "$PYTORCH_URL"
+conda run -n "${CONDA_ENV}" pip install fbgemm-gpu --index-url "$PYTORCH_URL"
 
 # install requirements from pypi
-conda run -n build_binary pip install torchmetrics==1.0.3
+conda run -n "${CONDA_ENV}" pip install torchmetrics==1.0.3
 
 # install torchrec
-conda run -n build_binary pip install torchrec --index-url "$PYTORCH_URL"
+conda run -n "${CONDA_ENV}" pip install torchrec --index-url "$PYTORCH_URL"
 
 # Run small import test
-conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec"
+conda run -n "${CONDA_ENV}" python -c "import torch; import fbgemm_gpu; import torchrec"
 
 # check directory
 ls -R
 
 # check if cuda available
-conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
+conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())"
 
 # check cuda version
-conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
+conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)"
 
 # Finally run smoke test
 # python 3.11 needs torchx-nightly
-conda run -n build_binary pip install torchx-nightly iopath
+conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath
 if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
-    conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
+    conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
 else
-    conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
+    conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
 fi
 
@@ -93,8 +117,8 @@ if [[ ${MATRIX_CHANNEL} != 'release' ]]; then
     exit 0
 else
     # Check version matches only for release binaries
-    torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2)
-    fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
+    torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2)
+    fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
 
     if [ "$torchrec_version" != "$fbgemm_version" ]; then
         echo "Error: TorchRec package version does not match FBGEMM package version"
@@ -102,22 +126,22 @@ else
     fi
 fi
 
-conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
+conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}"
 
-conda run -n build_binary python --version
+conda run -n "${CONDA_ENV}" python --version
 
 if [[ ${MATRIX_GPU_ARCH_VERSION} != '12.4' ]]; then
     exit 0
 fi
 
 echo "checking pypi release"
-conda run -n build_binary pip install torch
-conda run -n build_binary pip install fbgemm-gpu
-conda run -n build_binary pip install torchrec
+conda run -n "${CONDA_ENV}" pip install torch
+conda run -n "${CONDA_ENV}" pip install fbgemm-gpu
+conda run -n "${CONDA_ENV}" pip install torchrec
 
 # Check version matching again for PyPI
-torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2)
-fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
+torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2)
+fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
 
 if [ "$torchrec_version" != "$fbgemm_version" ]; then
     echo "Error: TorchRec package version does not match FBGEMM package version"
@@ -128,13 +152,13 @@ fi
 ls -R
 
 # check if cuda available
-conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
+conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())"
 
 # check cuda version
-conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
+conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)"
 
 # python 3.11 needs torchx-nightly
-conda run -n build_binary pip install torchx-nightly iopath
+conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath
 
 # Finally run smoke test
-conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
+conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py

diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml
index 248857214..98d69d721 100644
--- a/.github/workflows/validate-binaries.yml
+++ b/.github/workflows/validate-binaries.yml
@@ -1,6 +1,12 @@
 name: Validate binaries
 
 on:
+  pull_request:
+    paths-ignore:
+      - "docs/*"
+      - "third_party/*"
+      - .gitignore
+      - "*.md"
   workflow_call:
     inputs:
      channel:
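For anyone reproducing this locally, a quick sanity check after the change (a sketch, not part of the patch; it assumes the `build_binary` env created by the script above and a conda version new enough to support `conda env config vars`):
```
# show the env-scoped variables recorded by `conda env config vars set`,
# including the LD_LIBRARY_PATH entry added above
conda env config vars list -n build_binary

# re-run the same import test the script uses; with ${CONDA_PREFIX}/lib on
# LD_LIBRARY_PATH the GLIBCXX_3.4.29 error should no longer appear
conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec"
```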