From 5c2de37658741195c1994c93dab2499e9ee128a9 Mon Sep 17 00:00:00 2001
From: Huanyu He
Date: Wed, 22 Jan 2025 15:37:37 -0800
Subject: [PATCH] set LD_LIBRARY_PATH for fbgemm in validate_binaries.sh (#2696)

Summary:
# context
* to address the error below when running the github test (the CI log interleaves the inner `python -c` import traceback with the outer `run_with_env_secrets.py` wrapper's traceback):
```
+++ conda run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec'
+++ local cmd=run
+++ case "$cmd" in
+++ __conda_exe run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec'
+++ /opt/conda/bin/conda run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec'
ERROR:root:Could not load the library 'fbgemm_gpu_tbe_index_select.so': /lib64/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_tbe_index_select.so)
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 62, in <module>
    _load_library(f"{library}.so")
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 21, in _load_library
    raise error
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 17, in _load_library
    main()
  File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main
    run_cmd_or_die(f"docker exec -t {container_name} /exec")
  File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die
    raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}")
RuntimeError: Command docker exec -t d5cfe23625bf3b1538b808a1344090ae72ff3977990bc1f780c7a46435a384ec /exec failed with exit code 1
    torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))
  File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/torch/_ops.py", line 1357, in load_library
    ctypes.CDLL(path)
  File "/opt/conda/envs/build_binary/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_tbe_index_select.so)
```
* the issue was fixed before by D67949409 ([#2671](https://github.com/pytorch/torchrec/pull/2671)) for another test
* this diff applies the same fix to the validate_binaries test (a quick way to confirm the underlying libstdc++ mismatch is sketched below)
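The underlying issue is a libstdc++ version mismatch: the fbgemm_gpu extension needs the `GLIBCXX_3.4.29` symbol version, the system `/lib64/libstdc++.so.6` on the runner does not provide it, and the conda env typically ships a newer libstdc++ under `${CONDA_PREFIX}/lib`. Prepending the env's `lib` directory to `LD_LIBRARY_PATH` lets the loader pick up that newer copy. A rough way to confirm the mismatch by hand (a sketch only, not part of the patch; it assumes `strings` and `ldd` are available on the runner and reuses the paths from the error above):
```
# GLIBCXX symbol versions exported by the system libstdc++ (3.4.29 is missing here)
strings /lib64/libstdc++.so.6 | grep '^GLIBCXX_' | sort -V | tail -n 3

# GLIBCXX symbol versions exported by the conda env's libstdc++ (expected to include 3.4.29)
conda run -n build_binary bash -c 'strings "${CONDA_PREFIX}/lib/libstdc++.so.6" | grep "^GLIBCXX_" | sort -V | tail -n 3'

# which libstdc++ the failing extension actually resolves to at load time
ldd /opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_tbe_index_select.so | grep libstdc++
```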
# details
* previous failures {F1974496108}

Differential Revision: D68511145
---
 .github/scripts/validate_binaries.sh    | 72 ++++++++++++++++---------
 .github/workflows/validate-binaries.yml |  6 +++
 2 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh
index 85ad0de47..118273869 100755
--- a/.github/scripts/validate_binaries.sh
+++ b/.github/scripts/validate_binaries.sh
@@ -7,8 +7,9 @@
 export PYTORCH_CUDA_PKG=""
+export CONDA_ENV="build_binary"
 
-conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
+conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}"
 
 conda run -n build_binary python --version
 
@@ -49,41 +50,64 @@ elif [[ ${MATRIX_CHANNEL} = 'release' ]]; then
     export PYTORCH_URL="https://download.pytorch.org/whl/${CUDA_VERSION}"
 fi
 
+
+echo "CU_VERSION: ${CUDA_VERSION}"
+echo "MATRIX_CHANNEL: ${MATRIX_CHANNEL}"
+echo "CONDA_ENV: ${CONDA_ENV}"
+
+# shellcheck disable=SC2155
+export CONDA_PREFIX=$(conda run -n "${CONDA_ENV}" printenv CONDA_PREFIX)
+
+find / -name *cuda*
+
+if [[ $CUDA_VERSION = cu* ]]; then
+    # Setting LD_LIBRARY_PATH fixes the runtime error with fbgemm_gpu not
+    # being able to locate libnvrtc.so
+    echo "[NOVA] Setting LD_LIBRARY_PATH ..."
+    conda env config vars set -n ${CONDA_ENV} \
+        LD_LIBRARY_PATH="/usr/local/lib:/usr/lib64:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
+else
+    echo "[NOVA] Setting LD_LIBRARY_PATH ..."
+    conda env config vars set -p ${CONDA_ENV} \
+        LD_LIBRARY_PATH="/usr/local/lib:${CONDA_ENV}/lib:${LD_LIBRARY_PATH}"
+fi
+
+
 # install pytorch
 # switch back to conda once torch nightly is fixed
 # if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
 #     export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}"
 # fi
-conda run -n build_binary pip install torch --index-url "$PYTORCH_URL"
+conda run -n "${CONDA_ENV}" pip install torch --index-url "$PYTORCH_URL"
 
 # install fbgemm
-conda run -n build_binary pip install fbgemm-gpu --index-url "$PYTORCH_URL"
+conda run -n "${CONDA_ENV}" pip install fbgemm-gpu --index-url "$PYTORCH_URL"
 
 # install requirements from pypi
-conda run -n build_binary pip install torchmetrics==1.0.3
+conda run -n "${CONDA_ENV}" pip install torchmetrics==1.0.3
 
 # install torchrec
-conda run -n build_binary pip install torchrec --index-url "$PYTORCH_URL"
+conda run -n "${CONDA_ENV}" pip install torchrec --index-url "$PYTORCH_URL"
 
 # Run small import test
-conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec"
+conda run -n "${CONDA_ENV}" python -c "import torch; import fbgemm_gpu; import torchrec"
 
 # check directory
 ls -R
 
 # check if cuda available
-conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
+conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())"
 
 # check cuda version
-conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
+conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)"
 
 # Finally run smoke test
 # python 3.11 needs torchx-nightly
-conda run -n build_binary pip install torchx-nightly iopath
+conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath
 if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
-    conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
+    conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
 else
-    conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
+    conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
 fi
 
@@ -93,8 +117,8 @@ if [[ ${MATRIX_CHANNEL} != 'release' ]]; then
     exit 0
 else
     # Check version matches only for release binaries
-    torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2)
-    fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
+    torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2)
+    fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
 
     if [ "$torchrec_version" != "$fbgemm_version" ]; then
         echo "Error: TorchRec package version does not match FBGEMM package version"
@@ -102,22 +126,22 @@ else
     fi
 fi
 
-conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
+conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}"
 
-conda run -n build_binary python --version
+conda run -n "${CONDA_ENV}" python --version
 
 if [[ ${MATRIX_GPU_ARCH_VERSION} != '12.4' ]]; then
     exit 0
 fi
 
 echo "checking pypi release"
-conda run -n build_binary pip install torch
-conda run -n build_binary pip install fbgemm-gpu
-conda run -n build_binary pip install torchrec
+conda run -n "${CONDA_ENV}" pip install torch
+conda run -n "${CONDA_ENV}" pip install fbgemm-gpu
+conda run -n "${CONDA_ENV}" pip install torchrec
 
 # Check version matching again for PyPI
-torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2)
-fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
+torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2)
+fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
 
 if [ "$torchrec_version" != "$fbgemm_version" ]; then
     echo "Error: TorchRec package version does not match FBGEMM package version"
@@ -128,13 +152,13 @@ fi
 ls -R
 
 # check if cuda available
-conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
+conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())"
 
 # check cuda version
-conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
+conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)"
 
 # python 3.11 needs torchx-nightly
-conda run -n build_binary pip install torchx-nightly iopath
+conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath
 
 # Finally run smoke test
-conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
+conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py

diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml
index 248857214..98d69d721 100644
--- a/.github/workflows/validate-binaries.yml
+++ b/.github/workflows/validate-binaries.yml
@@ -1,6 +1,12 @@
 name: Validate binaries
 
 on:
+  pull_request:
+    paths-ignore:
+      - "docs/*"
+      - "third_party/*"
+      - .gitignore
+      - "*.md"
   workflow_call:
     inputs:
      channel:
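For anyone reproducing this locally, a quick sanity check after the change (a sketch, not part of the patch; it assumes the `build_binary` env created by the script above and a conda version new enough to support `conda env config vars`):
```
# show the env-scoped variables recorded by `conda env config vars set`,
# including the LD_LIBRARY_PATH entry added above
conda env config vars list -n build_binary

# re-run the same import test the script uses; with ${CONDA_PREFIX}/lib on
# LD_LIBRARY_PATH the GLIBCXX_3.4.29 error should no longer appear
conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec"
```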