Release v1.13

NVIDIA · Dec 9, 2024 · e5edd6c · e5edd6c
2 parents 7a7225c + ccd7a0c
commit e5edd6c
Show file tree

Hide file tree

Showing 132 changed files with 12,494 additions and 4,209 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -76,7 +76,7 @@ jobs:
     name: 'PaddlePaddle'
     runs-on: ubuntu-latest
     container:
-      image: nvcr.io/nvidia/paddlepaddle:24.07-py3
+      image: nvcr.io/nvidia/paddlepaddle:24.10-py3
       options: --user root
     steps:
       - name: 'Checkout'

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -17,8 +17,8 @@ jobs:
         uses: actions/checkout@v3
       - name: 'Install dependencies'
         run: |
-          pip install sphinx==5.1.1 sphinx_rtd_theme==1.0.0 nbsphinx==0.8.10 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==2.15.7
-          pip install breathe==4.34.0 sphinx-autoapi==2.0.1
+          pip install sphinx==8.1.3 sphinx_rtd_theme==3.0.1 nbsphinx==0.9.5 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==3.3.2
+          pip install breathe==4.35.0 sphinx-autoapi==3.3.2
           sudo apt-get install -y pandoc graphviz doxygen
           export GIT_SHA=$(git show-ref --hash HEAD)
       - name: 'Build docs'

diff --git a/.github/workflows/trigger-ci.yml b/.github/workflows/trigger-ci.yml
@@ -39,6 +39,7 @@ jobs:
            || github.actor == 'pggPL'
            || github.actor == 'vasunvidia'
            || github.actor == 'erhoo82'
+           || github.actor == 'kocchop'
          )
     steps:
       - name: Check if comment is issued by authorized person

diff --git a/.gitignore b/.gitignore
@@ -22,9 +22,7 @@ __pycache__
 .hypothesis
 .devcontainer.json
 tests/cpp/build/
-docs/_build
 .ipynb_checkpoints
-docs/doxygen
 *.log
 CMakeFiles/CMakeSystem.cmake
 sdist/
@@ -40,3 +38,4 @@ dist/
 downloads/
 .pytest_cache/
 compile_commands.json
+.nfs
diff --git a/build_tools/VERSION.txt b/build_tools/VERSION.txt
@@ -1 +1 @@
-1.12.0
+1.13.0
diff --git a/build_tools/paddle.py b/build_tools/paddle.py
@@ -25,7 +25,7 @@ def setup_paddle_extension(
     # Source files
     csrc_source_files = Path(csrc_source_files)
     sources = [
-        csrc_source_files / "extensions.cu",
+        csrc_source_files / "extensions.cpp",
         csrc_source_files / "common.cpp",
         csrc_source_files / "custom_ops.cu",
     ]

diff --git a/build_tools/pytorch.py b/build_tools/pytorch.py
@@ -11,7 +11,6 @@
 from .utils import (
     all_files_in_dir,
     cuda_archs,
-    cuda_path,
     cuda_version,
 )
 
@@ -27,11 +26,8 @@ def setup_pytorch_extension(
     csrc_source_files = Path(csrc_source_files)
     extensions_dir = csrc_source_files / "extensions"
     sources = [
-        csrc_source_files / "common.cu",
+        csrc_source_files / "common.cpp",
         csrc_source_files / "ts_fp8_op.cpp",
-        csrc_source_files / "userbuffers" / "ipcsocket.cc",
-        csrc_source_files / "userbuffers" / "userbuffers.cu",
-        csrc_source_files / "userbuffers" / "userbuffers-host.cpp",
     ] + all_files_in_dir(extensions_dir)
 
     # Header files
@@ -85,19 +81,14 @@ def setup_pytorch_extension(
                 continue  # Already handled
             nvcc_flags.extend(["-gencode", f"arch=compute_{arch},code=sm_{arch}"])
 
-    # Libraries
-    library_dirs = []
-    libraries = []
-    if bool(int(os.getenv("NVTE_UB_WITH_MPI", 0))):
+    if bool(int(os.getenv("NVTE_UB_WITH_MPI", "0"))):
         assert (
             os.getenv("MPI_HOME") is not None
-        ), "MPI_HOME must be set when compiling with NVTE_UB_WITH_MPI=1"
-        mpi_home = Path(os.getenv("MPI_HOME"))
-        include_dirs.append(mpi_home / "include")
+        ), "MPI_HOME=/path/to/mpi must be set when compiling with NVTE_UB_WITH_MPI=1!"
+        mpi_path = Path(os.getenv("MPI_HOME"))
+        include_dirs.append(mpi_path / "include")
         cxx_flags.append("-DNVTE_UB_WITH_MPI")
         nvcc_flags.append("-DNVTE_UB_WITH_MPI")
-        library_dirs.append(mpi_home / "lib")
-        libraries.append("mpi")
 
     # Construct PyTorch CUDA extension
     sources = [str(path) for path in sources]
@@ -112,6 +103,4 @@ def setup_pytorch_extension(
             "cxx": cxx_flags,
             "nvcc": nvcc_flags,
         },
-        libraries=[str(lib) for lib in libraries],
-        library_dirs=[str(lib_dir) for lib_dir in library_dirs],
     )
diff --git a/docs/.gitignore b/docs/.gitignore
@@ -0,0 +1,3 @@
+_build
+doxygen
+sphinx_rtd_theme
diff --git a/docs/Makefile b/docs/Makefile
@@ -16,5 +16,10 @@ help:
 
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+%: Makefile sphinx_rtd_theme
+	PYTHONPATH=sphinx_rtd_theme:$(PYTHONPATH) $(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+# Patch Sphinx RTD theme 3.0.1 to add version selector in sidebar
+sphinx_rtd_theme:
+	git clone --depth=1 -b 3.0.1 --single-branch https://github.com/readthedocs/sphinx_rtd_theme.git
+	bash -c "cd sphinx_rtd_theme; git apply ../version_select.patch"
diff --git a/docs/api/pytorch.rst b/docs/api/pytorch.rst
@@ -51,3 +51,7 @@ pyTorch
 .. autoapifunction:: transformer_engine.pytorch.moe_permute
 
 .. autoapifunction:: transformer_engine.pytorch.moe_unpermute
+
+.. autoapifunction:: transformer_engine.pytorch.initialize_ub
+
+.. autoapifunction:: transformer_engine.pytorch.destroy_ub
diff --git a/docs/conf.py b/docs/conf.py
@@ -2,38 +2,30 @@
 #
 # See LICENSE for license information.
 
+import datetime
 import os
-import sys
-import sphinx_rtd_theme
-from sphinx.ext.autodoc.mock import mock
-from sphinx.ext.autodoc import between, ClassDocumenter, AttributeDocumenter
-from sphinx.util import inspect
-from builtins import str
-from enum import Enum
-import re
+import pathlib
 import subprocess
-from pathlib import Path
-from datetime import date
-
-te_path = os.path.dirname(os.path.realpath(__file__))
+from builtins import str
 
-with open(te_path + "/../build_tools/VERSION.txt", "r") as f:
-    te_version = f.readline().strip()
+# Basic project info
+project = "Transformer Engine"
+author = "NVIDIA CORPORATION & AFFILIATES"
 
+# Copyright statement
 release_year = 2022
-
-current_year = date.today().year
+current_year = datetime.date.today().year
 if current_year == release_year:
     copyright_year = release_year
 else:
     copyright_year = str(release_year) + "-" + str(current_year)
+copyright = f"{copyright_year}, NVIDIA CORPORATION & AFFILIATES. All rights reserved."
 
-project = "Transformer Engine"
-copyright = "{}, NVIDIA CORPORATION & AFFILIATES. All rights reserved.".format(copyright_year)
-author = "NVIDIA CORPORATION & AFFILIATES"
+# Transformer Engine root directory
+root_path = pathlib.Path(__file__).resolve().parent.parent
 
+# Git hash
 git_sha = os.getenv("GIT_SHA")
-
 if not git_sha:
     try:
         git_sha = (
@@ -44,31 +36,16 @@
         )
     except:
         git_sha = "0000000"
-
 git_sha = git_sha[:7] if len(git_sha) > 7 else git_sha
 
-if "dev" in te_version:
-    version = str(te_version + "-" + git_sha)
+# Version
+with open(root_path / "build_tools" / "VERSION.txt", "r") as f:
+    _raw_version = f.readline().strip()
+if "dev" in _raw_version:
+    version = str(_raw_version + "-" + git_sha)
 else:
-    version = str(te_version)
-release = te_version
-
-# hack: version is used for html creation, so put the version picker
-# link here as well:
-option_on = " selected"
-option_off = ""
-release_opt = option_on
-option_nr = 0
-version = (
-    version
-    + """<br/>
-Version select: <select onChange="window.location.href = this.value" onFocus="this.selectedIndex = {0}">
-    <option value="https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html"{1}>Current release</option>
-    <option value="https://docs.nvidia.com/deeplearning/transformer-engine/documentation-archive.html">Older releases</option>
-</select>""".format(
-        option_nr, release_opt
-    )
-)
+    version = str(_raw_version)
+release = _raw_version
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -92,12 +69,10 @@
 
 pygments_style = "sphinx"
 
-
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
 html_theme = "sphinx_rtd_theme"
-html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
 html_static_path = ["_static"]
 html_show_sphinx = False
 
@@ -106,7 +81,12 @@
     "css/nvidia_footer.css",
 ]
 
-html_theme_options = {"display_version": True, "collapse_navigation": False, "logo_only": False}
+html_theme_options = {
+    "collapse_navigation": False,
+    "logo_only": False,
+    "version_selector": False,
+    "language_selector": False,
+}
 
 napoleon_custom_sections = [
     ("Parallelism parameters", "params_style"),
@@ -116,8 +96,8 @@
     ("FP8-related parameters", "params_style"),
 ]
 
-breathe_projects = {"TransformerEngine": os.path.abspath("doxygen/xml/")}
+breathe_projects = {"TransformerEngine": root_path / "docs" / "doxygen" / "xml"}
 breathe_default_project = "TransformerEngine"
 
 autoapi_generate_api_docs = False
-autoapi_dirs = ["../transformer_engine"]
+autoapi_dirs = [root_path / "transformer_engine"]
diff --git a/docs/version_select.patch b/docs/version_select.patch
@@ -0,0 +1,21 @@
+diff --git a/sphinx_rtd_theme/layout.html b/sphinx_rtd_theme/layout.html
+index e6a38b1..579eaec 100644
+--- a/sphinx_rtd_theme/layout.html
++++ b/sphinx_rtd_theme/layout.html
+@@ -124,6 +124,16 @@
+             {%- endif %}
+           </a>
+
++          {# Show TE version and version selector #}
++          <div class="version">
++            {{ version }}
++            <br>
++            Version select: <select onChange="window.location.href = this.value" onFocus="this.selectedIndex = {0}">
++              <option value="https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html"{1}>Current release</option>
++              <option value="https://docs.nvidia.com/deeplearning/transformer-engine/documentation-archive.html">Older releases</option>
++            </select>
++          </div>
++
+           {%- if READTHEDOCS or DEBUG %}
+             {%- if theme_version_selector or theme_language_selector %}
+               <div class="switch-menus">
diff --git a/qa/L0_jax_unittest/test.sh b/qa/L0_jax_unittest/test.sh
@@ -18,5 +18,7 @@ pip install -r $TE_PATH/examples/jax/encoder/requirements.txt
 
 pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/mnist
 
+# Make encoder tests to have run-to-run deterministic to have the stable CI results
+export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_deterministic_ops"
 pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder --ignore=$TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py
 pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py
diff --git a/qa/L1_jax_distributed_unittest/test.sh b/qa/L1_jax_distributed_unittest/test.sh
@@ -5,4 +5,11 @@
 set -xe
 
 : ${TE_PATH:=/opt/transformerengine}
-pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/tests/jax/test_distributed_*
+
+# Skip ring attention tests since they need fixed environment vars
+pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/tests/jax/test_distributed_* -k 'not test_context_parallel_ring_attn'
+
+# Test ring attention with and without scan loop
+NVTE_FUSED_RING_ATTENTION_USE_SCAN=0 pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/tests/jax/test_distributed_fused_attn.py -k test_context_parallel_ring_attn
+NVTE_FUSED_RING_ATTENTION_USE_SCAN=1 XLA_FLAGS="--xla_experimental_ignore_channel_id" \
+    pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/tests/jax/test_distributed_fused_attn.py -k test_context_parallel_ring_attn
diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -10,4 +10,5 @@ pip install pytest==8.2.1
 pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py
 pytest -v -s $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py
 pytest -v -s $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py
 pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
diff --git a/qa/L1_pytorch_mcore_integration/test.sh b/qa/L1_pytorch_mcore_integration/test.sh
@@ -0,0 +1,58 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -e
+
+# Paths
+: ${TE_PATH:=/opt/transformerengine}
+: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
+
+# Download Megatron-LM if needed
+if [ ! -d "${MCORE_PATH}" ]; then
+    pushd $(dirname ${MCORE_PATH})
+    git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
+    popd
+fi
+
+# Megatron-LM invocation
+COMMAND="
+NVTE_TORCH_COMPILE=0
+NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
+NVTE_FLASH_ATTN=1
+NVTE_FWD_LAYERNORM_SM_MARGIN=0
+NVTE_BWD_LAYERNORM_SM_MARGIN=0
+CUDA_DEVICE_MAX_CONNECTIONS=1
+NVTE_BIAS_GELU_NVFUSION=0
+NVTE_BIAS_DROPOUT_FUSION=0
+
+python
+-m torch.distributed.launch
+--use_env
+--nnodes=1
+--nproc_per_node=1
+
+${MCORE_PATH}/pretrain_gpt.py
+--tensor-model-parallel-size 1
+--pipeline-model-parallel-size 1
+--use-cpu-initialization
+--num-layers 2
+--hidden-size 128
+--num-attention-heads 8
+--seq-length 128
+--max-position-embeddings 2048
+--micro-batch-size 1
+--global-batch-size 8
+--train-iters 10
+--eval-iters 10
+--lr 1e-4
+--mock-data
+--vocab-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-vocab.json
+--merge-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-merges.txt
+--transformer-impl transformer_engine
+--fp8-format hybrid
+"
+COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
+
+# Launch Megatron-LM
+bash -c "${COMMAND}"
diff --git a/qa/L3_pytorch_FA_versions_test/test.sh b/qa/L3_pytorch_FA_versions_test/test.sh
@@ -12,7 +12,7 @@ pip install pytest==8.2.1
 export MAX_JOBS=4
 
 # Iterate over Flash Attention versions
-FA_versions=(2.1.1 2.3.0 2.4.0.post1 2.4.1 2.5.7 2.6.3 3.0.0b1)
+FA_versions=(2.1.1 2.3.0 2.4.1 2.5.7 2.6.3 3.0.0b1)
 for fa_version in "${FA_versions[@]}"
 do
 

diff --git a/setup.py b/setup.py
@@ -57,13 +57,20 @@ def run(self):
 
 def setup_common_extension() -> CMakeExtension:
     """Setup CMake extension for common library"""
+    cmake_flags = ["-DCMAKE_CUDA_ARCHITECTURES={}".format(cuda_archs())]
+    if bool(int(os.getenv("NVTE_UB_WITH_MPI", "0"))):
+        assert (
+            os.getenv("MPI_HOME") is not None
+        ), "MPI_HOME must be set when compiling with NVTE_UB_WITH_MPI=1"
+        cmake_flags.append("-DNVTE_UB_WITH_MPI=ON")
+
     # Project directory root
     root_path = Path(__file__).resolve().parent
 
     return CMakeExtension(
         name="transformer_engine",
         cmake_path=root_path / Path("transformer_engine/common"),
-        cmake_flags=["-DCMAKE_CUDA_ARCHITECTURES={}".format(cuda_archs())],
+        cmake_flags=cmake_flags,
     )