Skip to content

Commit

Permalink
set LD_LIBRARY_PATH for fbgemm in validate_binaries.sh (#2696)
Browse files Browse the repository at this point in the history
Summary:

# context
* to address the following error when running the GitHub test:
```
+++ conda run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec'
+++ local cmd=run
+++ case "$cmd" in
+++ __conda_exe run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec'
+++ /opt/conda/bin/conda run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec'
ERROR:root:Could not load the library 'fbgemm_gpu_tbe_index_select.so': /lib64/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_tbe_index_select.so)
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 62, in <module>
    _load_library(f"{library}.so")
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 21, in _load_library
    raise error
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 17, in _load_library
    main()
  File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main
    run_cmd_or_die(f"docker exec -t {container_name} /exec")
  File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die
    raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}")
RuntimeError: Command docker exec -t d5cfe23625bf3b1538b808a1344090ae72ff3977990bc1f780c7a46435a384ec /exec failed with exit code 1
    torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/torch/_ops.py", line 1357, in load_library
    ctypes.CDLL(path)
  File "/opt/conda/envs/build_binary/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_tbe_index_select.so)
```
* the issue was previously fixed by D67949409 ([#2671](#2671)) for another test
* this diff applies the same fix to the validate_binaries test.

# details
* previous failures
{F1974496108}

Differential Revision: D68511145
  • Loading branch information
TroyGarden authored and facebook-github-bot committed Jan 22, 2025
1 parent 519f193 commit 5c2de37
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 24 deletions.
72 changes: 48 additions & 24 deletions .github/scripts/validate_binaries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@


export PYTORCH_CUDA_PKG=""
export CONDA_ENV="build_binary"

conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}"

conda run -n build_binary python --version

Expand Down Expand Up @@ -49,41 +50,64 @@ elif [[ ${MATRIX_CHANNEL} = 'release' ]]; then
export PYTORCH_URL="https://download.pytorch.org/whl/${CUDA_VERSION}"
fi


echo "CU_VERSION: ${CUDA_VERSION}"
echo "MATRIX_CHANNEL: ${MATRIX_CHANNEL}"
echo "CONDA_ENV: ${CONDA_ENV}"

# Resolve the install prefix of the ${CONDA_ENV} environment by asking conda to
# print CONDA_PREFIX from inside it. Assignment and export are kept separate so
# that a failing `conda run` is not masked by the exit status of `export`.
CONDA_PREFIX=$(conda run -n "${CONDA_ENV}" printenv CONDA_PREFIX)
export CONDA_PREFIX

# List every path on the machine whose name contains "cuda" (debug aid).
# The pattern is quoted so that find receives it verbatim instead of the shell
# expanding it against files in the current directory.
find / -name '*cuda*'

# Store LD_LIBRARY_PATH as a config variable of the ${CONDA_ENV} environment.
# Both branches address the environment by name (-n) and point at the
# environment's own lib directory through ${CONDA_PREFIX}; the CUDA branch
# additionally searches /usr/lib64.
# ${LD_LIBRARY_PATH:+:...} appends the caller's existing value only when it is
# non-empty, so the resulting path never ends in a dangling ':'.
if [[ ${CUDA_VERSION} = cu* ]]; then
    # Setting LD_LIBRARY_PATH fixes the runtime error with fbgemm_gpu not
    # being able to locate libnvrtc.so
    echo "[NOVA] Setting LD_LIBRARY_PATH ..."
    conda env config vars set -n "${CONDA_ENV}" \
        LD_LIBRARY_PATH="/usr/local/lib:/usr/lib64:${CONDA_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
else
    echo "[NOVA] Setting LD_LIBRARY_PATH ..."
    conda env config vars set -n "${CONDA_ENV}" \
        LD_LIBRARY_PATH="/usr/local/lib:${CONDA_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
fi


# install pytorch
# switch back to conda once torch nightly is fixed
# if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
# export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}"
# fi
conda run -n build_binary pip install torch --index-url "$PYTORCH_URL"
conda run -n "${CONDA_ENV}" pip install torch --index-url "$PYTORCH_URL"

# install fbgemm
conda run -n build_binary pip install fbgemm-gpu --index-url "$PYTORCH_URL"
conda run -n "${CONDA_ENV}" pip install fbgemm-gpu --index-url "$PYTORCH_URL"

# install requirements from pypi
conda run -n build_binary pip install torchmetrics==1.0.3
conda run -n "${CONDA_ENV}" pip install torchmetrics==1.0.3

# install torchrec
conda run -n build_binary pip install torchrec --index-url "$PYTORCH_URL"
conda run -n "${CONDA_ENV}" pip install torchrec --index-url "$PYTORCH_URL"

# Run small import test
conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec"
conda run -n "${CONDA_ENV}" python -c "import torch; import fbgemm_gpu; import torchrec"

# check directory
ls -R

# check if cuda available
conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())"

# check cuda version
conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)"

# Finally run smoke test
# python 3.11 needs torchx-nightly
conda run -n build_binary pip install torchx-nightly iopath
conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath
if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
else
conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
fi


Expand All @@ -93,31 +117,31 @@ if [[ ${MATRIX_CHANNEL} != 'release' ]]; then
exit 0
else
# Check version matches only for release binaries
torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2)
fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2)
fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2)

if [ "$torchrec_version" != "$fbgemm_version" ]; then
echo "Error: TorchRec package version does not match FBGEMM package version"
exit 1
fi
fi

conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}"

conda run -n build_binary python --version
conda run -n "${CONDA_ENV}" python --version

if [[ ${MATRIX_GPU_ARCH_VERSION} != '12.4' ]]; then
exit 0
fi

echo "checking pypi release"
conda run -n build_binary pip install torch
conda run -n build_binary pip install fbgemm-gpu
conda run -n build_binary pip install torchrec
conda run -n "${CONDA_ENV}" pip install torch
conda run -n "${CONDA_ENV}" pip install fbgemm-gpu
conda run -n "${CONDA_ENV}" pip install torchrec

# Check version matching again for PyPI
torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2)
fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2)
fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2)

if [ "$torchrec_version" != "$fbgemm_version" ]; then
echo "Error: TorchRec package version does not match FBGEMM package version"
Expand All @@ -128,13 +152,13 @@ fi
ls -R

# check if cuda available
conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())"

# check cuda version
conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)"

# python 3.11 needs torchx-nightly
conda run -n build_binary pip install torchx-nightly iopath
conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath

# Finally run smoke test
conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
6 changes: 6 additions & 0 deletions .github/workflows/validate-binaries.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
name: Validate binaries

on:
pull_request:
paths-ignore:
- "docs/*"
- "third_party/*"
- .gitignore
- "*.md"
workflow_call:
inputs:
channel:
Expand Down

0 comments on commit 5c2de37

Please sign in to comment.