Skip to content

Commit

Permalink
2024-08-13 nightly release (9f17b23)
Browse files Browse the repository at this point in the history
  • Loading branch information
pytorchbot committed Aug 13, 2024
1 parent dfc47f0 commit d7dd14e
Show file tree
Hide file tree
Showing 18 changed files with 1,367 additions and 1,037 deletions.
2 changes: 1 addition & 1 deletion .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ prepare_fbgemm_gpu_build () {
(exec_with_retries 3 conda run --no-capture-output ${env_prefix} python -m pip install -r requirements.txt) || return 1

# BUILD_VARIANT is provided by the github workflow file
if [ "$BUILD_VARIANT" == "cuda" ]; then
if [ "$BUILD_VARIANT" == "cuda" ] || [ "$BUILD_VARIANT" == "genai" ]; then
(install_triton_pip "${env_name}") || return 1
fi

Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_nightly_cpu_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}.whl
path: fbgemm_gpu/dist/fbgemm_gpu_nightly_cpu-*.whl
path: fbgemm_gpu/dist/*.whl
if-no-files-found: error


# Download the built artifact from GHA, test on GPU, and push to PyPI
Expand Down Expand Up @@ -190,4 +191,4 @@ jobs:
if: ${{ (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true')) && matrix.compiler == 'gcc' }}
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly_cpu-*.whl "$PYPI_TOKEN"
run: . $PRELUDE; publish_to_pypi $BUILD_ENV *.whl "$PYPI_TOKEN"
5 changes: 3 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ jobs:
uses: actions/upload-artifact@v3
with:
name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/fbgemm_gpu_nightly-*.whl
path: fbgemm_gpu/dist/*.whl
if-no-files-found: error


# Download the built artifact from GHA, test on GPU, and push to PyPI
Expand Down Expand Up @@ -211,4 +212,4 @@ jobs:
if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly-*.whl "$PYPI_TOKEN"
run: . $PRELUDE; publish_to_pypi $BUILD_ENV *.whl "$PYPI_TOKEN"
215 changes: 215 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_genai.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# CI for the FBGEMM_GPU-GenAI variant, plus nightly builds of
# FBGEMM_GPU-GenAI against PyTorch-CUDA Nightly.
name: FBGEMM_GPU-GenAI CI

on:
  # PR trigger (enabled for regression checks and debugging)
  #
  pull_request:
    branches:
      - main

  # Push trigger (enabled to catch errors coming out of multiple merges)
  #
  push:
    branches:
      - main

  # Cron trigger (UTC)
  #
  # Based on the Conda page for PyTorch-nightly, the GPU nightly releases
  # appear around 02:30 PST every day (roughly 2 hours after the CPU releases)
  #
  schedule:
    - cron: '45 12 * * *'

  # Manual trigger
  #
  workflow_dispatch:
    inputs:
      publish_to_pypi:
        description: Publish Artifact to PyPI
        type: boolean
        required: false
        default: false

concurrency:
  # Cancel previous runs in the PR if a new commit is pushed
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Build on CPU hosts and upload the wheel to GHA
  build_artifact:
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: amazonlinux:2023
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      BUILD_VARIANT: cuda
    continue-on-error: true
    strategy:
      # Don't fast-fail all the other builds if one of the them fails
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.24xlarge" },
        ]
        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
        compiler: [ "gcc", "clang" ]

    steps:
      - name: Setup Build Container
        run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

      - name: Checkout the Repository
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Display System Info
        run: . $PRELUDE; print_system_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install C/C++ Compilers
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}

      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV

      - name: Install CUDA
        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

      # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
      - name: Install PyTorch Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}

      - name: Collect PyTorch Environment Info
        if: ${{ success() || failure() }}
        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

      - name: Install cuDNN
        run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Build FBGEMM_GPU Wheel
        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly genai

      - name: Upload Built Wheel as GHA Artifact
        # Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
        uses: actions/upload-artifact@v3
        with:
          name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
          path: fbgemm_gpu/dist/*.whl
          if-no-files-found: error


  # Download the built artifact from GHA, test on GPU, and push to PyPI
  test_and_publish_artifact:
    # runs-on: linux.4xlarge.nvidia.gpu
    # Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
    runs-on: ${{ matrix.host-machine.instance }}
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      BUILD_VARIANT: genai
      ENFORCE_CUDA_DEVICE: 1
    strategy:
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
          # TODO: Enable when A100 machine queues are reasonably small enough for doing per-PR CI
          # https://hud.pytorch.org/metrics
          # { arch: x86, instance: "linux.gcp.a100" },
        ]
        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
        # Specify exactly ONE CUDA version for artifact publish
        cuda-version-publish: [ "12.1.1" ]
        compiler: [ "gcc", "clang" ]
    needs: build_artifact

    steps:
      # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
      - name: Checkout the Repository
        uses: actions/checkout@v3
        with:
          submodules: true

      - name: Download Wheel Artifact from GHA
        # Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
        uses: actions/download-artifact@v3
        with:
          name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

      # Use PyTorch test infrastructure action - https://github.com/pytorch/test-infra/blob/main/.github/actions/setup-nvidia/action.yml
      - name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main

      - name: Display System Info
        run: . $PRELUDE; print_system_info; print_ec2_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install C/C++ Compilers for Updated LIBGCC
        # Install clang libraries to enable building and install triton
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV clang

      - name: Install CUDA
        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

      # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
      - name: Install PyTorch Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}

      - name: Collect PyTorch Environment Info
        if: ${{ success() || failure() }}
        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Install FBGEMM_GPU Wheel
        run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

      - name: Test with PyTest
        timeout-minutes: 30
        run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV

      - name: Push Wheel to PyPI
        if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}
        env:
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: . $PRELUDE; publish_to_pypi $BUILD_ENV *.whl "$PYPI_TOKEN"
3 changes: 2 additions & 1 deletion .github/workflows/fbgemm_gpu_ci_rocm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_nightly_rocm_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_rocm${{ matrix.rocm-version }}.whl
path: fbgemm_gpu/dist/fbgemm_gpu_nightly_rocm-*.whl
path: fbgemm_gpu/dist/*.whl
if-no-files-found: error


# Download the built artifact from GHA, test on GPU, and push to PyPI
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/fbgemm_gpu_release_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_release_cpu_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}.whl
path: fbgemm_gpu/dist/fbgemm_gpu_cpu-*.whl
path: fbgemm_gpu/dist/*.whl
if-no-files-found: error


# Download the built artifact from GHA, test on GPU, and push to PyPI
Expand Down Expand Up @@ -182,4 +183,4 @@ jobs:
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' }}
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_cpu-*.whl "$PYPI_TOKEN"
run: . $PRELUDE; publish_to_pypi $BUILD_ENV *.whl "$PYPI_TOKEN"
5 changes: 3 additions & 2 deletions .github/workflows/fbgemm_gpu_release_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,8 @@ jobs:
uses: actions/upload-artifact@v3
with:
name: fbgemm_gpu_release_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/fbgemm_gpu-*.whl
path: fbgemm_gpu/dist/*.whl
if-no-files-found: error


# Download the built artifact from GHA, test on GPU, and push to PyPI
Expand Down Expand Up @@ -192,4 +193,4 @@ jobs:
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }}
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu-*.whl "$PYPI_TOKEN"
run: . $PRELUDE; publish_to_pypi $BUILD_ENV *.whl "$PYPI_TOKEN"
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# General
.DS_Store
*~
.hypothesis/

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
7 changes: 4 additions & 3 deletions fbgemm_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ set(CMAKE_VERBOSE_MAKEFILE on)
# FBGEMM_GPU Build Options
################################################################################

option(FBGEMM_CPU_ONLY "Build FBGEMM_GPU without GPU support" OFF)
option(USE_ROCM "Build FBGEMM_GPU for ROCm" OFF)
option(FBGEMM_GENAI_ONLY "Build FBGEMM_GPU with GEN AI only support" OFF)
option(FBGEMM_CPU_ONLY "Build FBGEMM_GPU without GPU support" OFF)
option(USE_ROCM "Build FBGEMM_GPU for ROCm" OFF)
option(FBGEMM_GENAI_ONLY "Build FBGEMM_GPU with GEN AI only support" OFF)
option(USE_FB_ONLY "Build FBGEMM_GPU FB only operators" OFF)

if((NOT FBGEMM_CPU_ONLY) AND
((EXISTS "/opt/rocm/") OR (EXISTS $ENV{ROCM_PATH})) AND
Expand Down
11 changes: 11 additions & 0 deletions fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,14 @@ if(USE_ROCM)
else()
set(quantize_ops_sources
src/quantize/cutlass_extensions.cu
src/quantize/cutlass_extensions/f8f8bf16.cu
src/quantize/cutlass_extensions/f8f8bf16_blockwise.cu
src/quantize/cutlass_extensions/f8f8bf16_cublas.cu
src/quantize/cutlass_extensions/f8f8bf16_rowwise.cu
src/quantize/cutlass_extensions/i8i8bf16.cu
src/quantize/cutlass_extensions/f8i4bf16_rowwise.cu
src/quantize/cutlass_extensions/i8i8bf16_dynamic.cu
src/quantize/cutlass_extensions/bf16i4bf16_rowwise.cu
src/quantize/quantize.cu
src/quantize/quantize.cpp)
endif()
Expand All @@ -58,6 +61,14 @@ set(experimental_gen_ai_cpp_source_files
${quantize_ops_sources}
${comm_ops_sources})

# Set the source file for FB only CPP
if(USE_FB_ONLY)
file(GLOB fb_only_ops_sources
fb/src/*/*.cu
fb/src/*/*.cpp)
list(APPEND experimental_gen_ai_cpp_source_files ${fb_only_ops_sources})
endif()

set_source_files_properties(${experimental_gen_ai_cpp_source_files}
PROPERTIES INCLUDE_DIRECTORIES
"${fbgemm_sources_include_directories}")
Expand Down
3 changes: 3 additions & 0 deletions fbgemm_gpu/experimental/gen_ai/gen_ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
torch.ops.load_library(
os.path.join(os.path.dirname(__file__), "fbgemm_gpu_experimental_gen_ai_py.so")
)
torch.classes.load_library(
os.path.join(os.path.dirname(__file__), "fbgemm_gpu_experimental_gen_ai_py.so")
)
else:
torch.ops.load_library(
"//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:attention_ops"
Expand Down
Loading

0 comments on commit d7dd14e

Please sign in to comment.