Ifu 2023 12 06 #49

Merged: 74 commits, Dec 7, 2023

Commits
b61ae74
Meta functions for dynamic_shapes for block_bucketize_sparse_features…
Nov 3, 2023
002d72b
Namespace doesn't need to be followed by semicolon
r-barnes Nov 6, 2023
6000a5c
Revert D51029740: Namespace doesn't need to be followed by semicolon
Nov 6, 2023
174d473
Revert D50941763: Multisect successfully blamed "D50941763: [fbgemm_g…
Nov 6, 2023
80990a6
Add BF16 support for reorder_batched_ad_indices (#2116)
lequytra Nov 6, 2023
21d0c95
Fix OSS issues with D50741802 (#2117)
q10 Nov 7, 2023
b6bdf04
Update impl_abstract_pystub to be less boilerplatey
zou3519 Nov 7, 2023
b8f6beb
Remove test_aot_dispatch_static* tests from opcheck tests (#2118)
zou3519 Nov 7, 2023
308ffdc
Revert D50972148: Update impl_abstract_pystub to be less boilerplatey
zou3519 Nov 7, 2023
0457bb7
Derandomize all tests in fbgemm_gpu
ezyang Nov 7, 2023
0def4d8
Update impl_abstract_pystub to be less boilerplatey
zou3519 Nov 8, 2023
9b13b5a
SymIntify {}_embedding{}_codegen_forward_{}{}_cuda meta function
ezyang Nov 8, 2023
e5236c8
SymIntify {}_embedding{}_codegen_forward_{}{}_cuda autograd function
ezyang Nov 8, 2023
bd2d5fc
Fix meta function for merge_pooled_embeddings
Nov 8, 2023
79d1729
Refactor `embedding_inplace_update` (#2112)
q10 Nov 9, 2023
5bddcac
Back out "Refactor `embedding_inplace_update`" (#2125)
q10 Nov 9, 2023
293e500
Refactor `embedding_inplace_update` (#2127)
q10 Nov 9, 2023
2117dd3
add pt2_compliant tag to some ops (#2119)
zou3519 Nov 10, 2023
09ab470
Move permute_sparse_features tests and abstract impl to fbgemm (#2129)
williamwen42 Nov 14, 2023
975cb01
Add impl_abstract to segment_sum_csr (#2132)
Microve Nov 15, 2023
abb59a3
Fix fbgemm CI for segment_sum_csr (#2137)
tissue3 Nov 15, 2023
2eb3eb4
Support variable bucket size for block_bucketize_sparse_features (#2107)
tissue3 Nov 16, 2023
528f24d
Add test for fbgemm ops. (#2136)
tissue3 Nov 16, 2023
5da5a16
Re-organize layout_transform_ops (#2133)
q10 Nov 16, 2023
f1bbb60
Add opcheck tests to parts of quantize_ops_test.py (#2139)
williamwen42 Nov 16, 2023
a142e20
Add an auto-vectorization implementation for int4 CPU TBE kernel (#2077)
excelle08 Nov 17, 2023
ad2aca2
Fix illegal memory acesss error on fp8 quantize kernel (#2131)
spcyppt Nov 18, 2023
92388c1
RW Dist change to support uneven sharding (#2142)
gnahzg Nov 18, 2023
4c0fad5
Re-organize ssd_split_embeddings_cache (#2141)
q10 Nov 19, 2023
f65d7e2
Benchmark block_bucketize_sparse_features uneven sharding (#2140)
tissue3 Nov 20, 2023
54340d4
Add variable batch per feature support to EBC (tw/cw) (#1986)
joshuadeng Nov 20, 2023
a5edc61
Back out "Add an auto-vectorization implementation for int4 CPU TBE k…
Nov 21, 2023
8a015d3
Update GitHub checkout actions (#2146)
q10 Nov 21, 2023
2b3f861
Back out "RW Dist change to support uneven sharding"
gnahzg Nov 21, 2023
e778793
Re-organize int8_ops (#2145)
q10 Nov 21, 2023
5436320
Update GitHub checkout actions, pt2 (#2149)
q10 Nov 21, 2023
37111f5
Recompute linear_cache_indices for pipeline prefetching (#2147)
sryap Nov 21, 2023
f49dea6
Disable @optests.generate_opcheck_tests in TBE unit tests (#2152)
sryap Nov 22, 2023
84c7b27
Refactor embedding_bounds_check (#2155)
q10 Nov 23, 2023
b40f419
Back out "Refactor embedding_bounds_check" (#2156)
Nov 24, 2023
934d881
Remove unused pyre-ignore in TBE tests (#2162)
sryap Nov 28, 2023
753bc10
Fix run-lint issue in OSS (#2161)
sryap Nov 28, 2023
ba2b921
Annotate unused params (#2158)
q10 Nov 28, 2023
18af2b2
Add/modify LXU cache lookup ops for pipeline prefetching (#2154)
sryap Nov 28, 2023
ca1da75
Add unit test for unique cache lookup (#2160)
sryap Nov 28, 2023
035ed1f
Use unique cache locations in backward for pipeline prefetching (#2151)
sryap Nov 28, 2023
e62a5e2
use memcpy for cpu emb inplace update (#2166)
842974287 Nov 28, 2023
de731af
Preparatory fixes & lint suppressions for c10::optional->std::optiona…
swolchok Nov 28, 2023
49e7536
Refactor embedding_bounds_check (#2165)
q10 Nov 28, 2023
91a600a
Add warmup_runs to TBE benchmarks and run at least 1 warmup iter (#2163)
sryap Nov 29, 2023
cb7357a
Stop using excess memory in generate_opcheck_tests, re-enable fbgemm …
zou3519 Nov 29, 2023
886bf42
Revert D51607319: Refactor embedding_bounds_check
Nov 29, 2023
c6e3fa2
RW Dist change to support uneven sharding [1] FBGEMM changes (#2168)
gnahzg Nov 29, 2023
98932d6
Make fbgemm::masked_select_jagged_1d pt2_compliant (#2174)
zou3519 Nov 29, 2023
2457605
Add generate_opcheck_tests to input_combine_test (#2173)
zou3519 Nov 29, 2023
71e496a
Make fbgemm::tbe_input_combine pt2_compliant (#2172)
zou3519 Nov 29, 2023
90bb32d
Mark some more ops as pt2_compliant_tag (#2171)
zou3519 Nov 29, 2023
911aec4
suppress errors in `deeplearning/fbgemm/fbgemm_gpu` (#2159)
grievejia Nov 29, 2023
6c29dbd
Initialize empty values for fp8 quantize op (#2176)
spcyppt Nov 30, 2023
48120da
set strict as default typing mode in `deeplearning/fbgemm/fbgemm_gpu`…
Nov 30, 2023
453c80e
Update AVX2 and AVX512 flags (#2167)
q10 Nov 30, 2023
63d1198
Revert D51647391: Multisect successfully blamed "D51647391: Mark some…
Nov 30, 2023
1c40928
Make fbgemm::jagged_index_select pt2_compliant (#2170)
zou3519 Nov 30, 2023
1c93072
Make fbgemm::permute_1D_sparse_data, permute_2D_sparse_data pt2_compl…
zou3519 Nov 30, 2023
3d477a0
Mark some more ops as pt2_compliant (#2181)
zou3519 Nov 30, 2023
121c20b
set strict as default typing mode in `deeplearning/fbgemm/fbgemm_gpu`…
Nov 30, 2023
327dcf9
Get fbgemm::FloatToFP8RowwiseQuantized opcheck tests passing
williamwen42 Nov 30, 2023
bf980a7
Refactor embedding_bounds_check (#2178)
q10 Nov 30, 2023
c58679a
Revert D51688407: Refactor embedding_bounds_check
Dec 1, 2023
0fc0d4e
Remove indices and offsets copying from prefetch (#2186)
sryap Dec 1, 2023
d5cdefd
Add GPU kernel to support variable bucket size in block_bucketize_spa…
gnahzg Dec 1, 2023
88fc6e7
Fix block_bucketize_features with variable bucket size when using tor…
gnahzg Dec 1, 2023
dbc3157
Benchmark block_bucketize_sparse_features uneven sharding for GPU (#2…
gnahzg Dec 4, 2023
5a2c43e
Merge remote-tracking branch 'upstream/main' into IFU-2023-12-06
liligwu Dec 6, 2023
1 change: 1 addition & 0 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -347,6 +347,7 @@ build_fbgemm_gpu_package () {
--package_name="${package_name}" \
--python-tag="${python_tag}" \
--plat-name="${plat_name}" \
--verbose \
"${build_args[@]}"

# Run checks on the built libraries
8 changes: 4 additions & 4 deletions .github/workflows/fbgemm_ci.yml
@@ -48,7 +48,7 @@ jobs:
git config --global --add safe.directory '*'

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

@@ -86,7 +86,7 @@ jobs:

steps:
- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

@@ -127,7 +127,7 @@ jobs:
git config --global --add safe.directory '*'

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

@@ -159,7 +159,7 @@ jobs:

steps:
- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_ci.yml
@@ -57,7 +57,7 @@ jobs:
git config --global --add safe.directory '*'

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

@@ -126,7 +126,7 @@ jobs:
git config --global --add safe.directory '*'

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

@@ -191,7 +191,7 @@ jobs:
git config --global --add safe.directory '*'

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_cpu_nightly.yml
@@ -71,7 +71,7 @@ jobs:
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

@@ -136,7 +136,7 @@ jobs:
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_cpu_release.yml
@@ -68,7 +68,7 @@ jobs:
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

@@ -133,7 +133,7 @@ jobs:
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

3 changes: 2 additions & 1 deletion .github/workflows/fbgemm_gpu_cuda_nightly.yml
@@ -70,7 +70,7 @@ jobs:
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

@@ -140,6 +140,7 @@ jobs:
needs: build_artifact

steps:
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
- name: Checkout the Repository
uses: actions/checkout@v3
with:
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_cuda_release.yml
@@ -74,7 +74,7 @@ jobs:
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_docs.yml
@@ -44,7 +44,7 @@ jobs:
run: yum update -y; yum install -y binutils findutils git pciutils rsync sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true

2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_lint.yml
@@ -39,7 +39,7 @@ jobs:

steps:
- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda
5 changes: 3 additions & 2 deletions .github/workflows/fbgemm_gpu_pip.yml
@@ -66,7 +66,7 @@ jobs:
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Display System Info
run: . $PRELUDE; print_system_info; print_ec2_info
@@ -116,6 +116,7 @@ jobs:
cuda-version-publish: [ "11.8.0" ]

steps:
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
- name: Checkout the Repository
uses: actions/checkout@v3

@@ -182,7 +183,7 @@ jobs:
git config --global --add safe.directory '*'

- name: Checkout the Repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Display System Info
run: . $PRELUDE; print_system_info
4 changes: 4 additions & 0 deletions .gitignore
@@ -8,6 +8,10 @@
# found in:
# https://github.com/github/gitignore/

# General
.DS_Store
*~

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
53 changes: 34 additions & 19 deletions fbgemm_gpu/CMakeLists.txt
@@ -432,10 +432,22 @@ else()
DEPENDS "${optimizer_codegen_dependencies}")
endif()

set(AVX2_FLAGS "-mavx2;-mf16c;-mfma;-fopenmp")
if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
# NVCC in WSL complains about unknown -mavx options
# https://github.com/pytorch/FBGEMM/issues/2135
set(AVX2_FLAGS "-Xcompiler;-mavx;-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-fopenmp")
endif()

set(AVX512_FLAGS "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-fopenmp")
if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
set(AVX512_FLAGS "-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-Xcompiler;-mavx512f;-Xcompiler;-mavx512bw;-Xcompiler;-mavx512dq;-Xcompiler;-mavx512vl;-fopenmp")
endif()

if(CXX_AVX2_FOUND)
set_source_files_properties(${gen_cpu_source_files}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma;-fopenmp")
"${AVX2_FLAGS}")
else()
set_source_files_properties(${gen_cpu_source_files}
PROPERTIES COMPILE_OPTIONS
@@ -504,13 +516,13 @@ set(fbgemm_sources_avx512
if(CXX_AVX2_FOUND)
set_source_files_properties(${fbgemm_sources_avx2}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma")
"${AVX2_FLAGS}")
endif()

if(CXX_AVX512_FOUND)
set_source_files_properties(${fbgemm_sources_avx512}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl")
"${AVX512_FLAGS}")
endif()

set(fbgemm_sources ${fbgemm_sources_normal})
@@ -561,19 +573,20 @@ set(fbgemm_gpu_sources_static_cpu
codegen/embedding_forward_quantized_host_cpu.cpp
codegen/embedding_backward_dense_host_cpu.cpp
codegen/embedding_bounds_check_host_cpu.cpp
src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp
src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp
src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp
src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
src/input_combine_cpu.cpp
src/layout_transform_ops_cpu.cpp
src/input_combine_ops/input_combine_cpu.cpp
src/layout_transform_ops/layout_transform_ops_cpu.cpp
src/quantize_ops/quantize_ops_cpu.cpp
src/quantize_ops/quantize_ops_meta.cpp
src/sparse_ops/sparse_ops_cpu.cpp
src/sparse_ops/sparse_ops_meta.cpp
src/embedding_inplace_update_cpu.cpp
src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp
src/split_embeddings_cache/linearize_cache_indices.cpp
src/split_embeddings_cache/lfu_cache_populate_byte.cpp
src/split_embeddings_cache/lru_cache_populate_byte.cpp
@@ -588,16 +601,16 @@ if(NOT FBGEMM_CPU_ONLY)
codegen/embedding_bounds_check_host.cpp
src/memory_utils/memory_utils.cpp
src/memory_utils/memory_utils_ops.cpp
src/layout_transform_ops_gpu.cpp
src/layout_transform_ops/layout_transform_ops_gpu.cpp
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp
src/quantize_ops/quantize_ops_gpu.cpp
src/sparse_ops/sparse_ops_gpu.cpp
src/split_embeddings_utils.cpp
src/split_embeddings_utils/split_embeddings_utils.cpp
src/split_embeddings_cache/split_embeddings_cache_ops.cu
src/metric_ops_host.cpp
src/embedding_inplace_update_gpu.cpp
src/input_combine_gpu.cpp
src/metric_ops/metric_ops_host.cpp
src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp
src/input_combine_ops/input_combine_gpu.cpp
codegen/batch_index_select_dim0_host.cpp)

if(NVML_LIB_PATH)
@@ -607,8 +620,7 @@ if(NOT FBGEMM_CPU_ONLY)
if(NVML_LIB_PATH OR USE_ROCM)
message(STATUS "Adding merge_pooled_embeddings sources")
list(APPEND fbgemm_gpu_sources_static_cpu
src/merge_pooled_embeddings_cpu.cpp
src/merge_pooled_embeddings_gpu.cpp
src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp
src/topology_utils.cpp)
else()
message(STATUS "Skipping merge_pooled_embeddings sources")
@@ -618,7 +630,7 @@ endif()
if(CXX_AVX2_FOUND)
set_source_files_properties(${fbgemm_gpu_sources_static_cpu}
PROPERTIES COMPILE_OPTIONS
"-mavx;-mf16c;-mfma;-mavx2;-fopenmp")
"${AVX2_FLAGS}")
else()
set_source_files_properties(${fbgemm_gpu_sources_static_cpu}
PROPERTIES COMPILE_OPTIONS
@@ -631,9 +643,9 @@ if(NOT FBGEMM_CPU_ONLY)
codegen/embedding_forward_quantized_split_lookup.cu
src/memory_utils/memory_utils.cu
src/memory_utils/memory_utils_ops.cu
src/embedding_inplace_update.cu
src/embedding_inplace_ops/embedding_inplace_update.cu
src/histogram_binning_calibration_ops.cu
src/input_combine.cu
src/input_combine_ops/input_combine.cu
src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
src/jagged_tensor_ops/dense_to_jagged_forward.cu
@@ -651,8 +663,8 @@ if(NOT FBGEMM_CPU_ONLY)
src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu
src/jagged_tensor_ops/jagged_unique_indices.cu
src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
src/layout_transform_ops.cu
src/metric_ops.cu
src/layout_transform_ops/layout_transform_ops.cu
src/metric_ops/metric_ops.cu
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu
src/quantize_ops/quantize_bfloat16.cu
@@ -691,7 +703,10 @@ if(NOT FBGEMM_CPU_ONLY)
src/split_embeddings_cache/lxu_cache.cu
src/split_embeddings_cache/linearize_cache_indices.cu
src/split_embeddings_cache/reset_weight_momentum.cu
src/split_embeddings_utils.cu)
src/split_embeddings_utils/generate_vbe_metadata.cu
src/split_embeddings_utils/get_infos_metadata.cu
src/split_embeddings_utils/radix_sort_pairs.cu
src/split_embeddings_utils/transpose_embedding_input.cu)

set_source_files_properties(${fbgemm_gpu_sources_static_gpu}
PROPERTIES COMPILE_OPTIONS
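A note on the AVX2_FLAGS/AVX512_FLAGS hunk near the top of this file: under WSL with the CUDA build enabled, nvcc rejects host-compiler options such as -mavx2 unless each one is forwarded with -Xcompiler, so the flag lists are hard-coded in their wrapped form. The sketch below is purely illustrative (plain Python, with wrap_for_nvcc as a made-up helper name) and shows the same transformation programmatically:

# Illustrative only: wrap each host-compiler flag with -Xcompiler so nvcc
# forwards it to the underlying C++ compiler instead of rejecting it.
def wrap_for_nvcc(host_flags):
    wrapped = []
    for flag in host_flags:
        wrapped += ["-Xcompiler", flag]
    return wrapped

avx2_flags = ["-mavx", "-mavx2", "-mf16c", "-mfma"]
print(";".join(wrap_for_nvcc(avx2_flags) + ["-fopenmp"]))
# -Xcompiler;-mavx;-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-fopenmp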
7 changes: 5 additions & 2 deletions fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py
@@ -4,7 +4,6 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
import functools
from math import sqrt
from typing import List, Tuple
@@ -29,7 +28,10 @@


def generate_unary_feature(
batch_size: int, num_embeddings: int
batch_size: int,
num_embeddings: int
# pyre-fixme[24]: Generic type `list` expects 1 type parameter, use
# `typing.List[<element type>]` to avoid runtime subscripting errors.
) -> Tuple[List, List, List]:
lengths = []
offsets = []
@@ -90,6 +92,7 @@ def forward(
@click.option("--num-tables", default=2)
@click.option("--num-tasks", default=3)
@click.option("--repeats", default=100)
# pyre-fixme[2]: Parameter must be annotated.
def main(batch_size, num_tables, num_tasks, repeats) -> None:
device = torch.device("cuda", 0)
torch.cuda.set_device(device)
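The pyre-fixme above flags the bare List types in the return annotation of generate_unary_feature. A minimal sketch of a fully parameterized signature, assuming the three lists hold plain integer lengths, offsets, and indices (the element types are an assumption inferred from the benchmark, not stated in this diff):

from typing import List, Tuple

def generate_unary_feature(
    batch_size: int,
    num_embeddings: int,
) -> Tuple[List[int], List[int], List[int]]:
    # Hypothetical refinement: concrete element types resolve pyre-fixme[24]
    # without changing runtime behavior.
    lengths: List[int] = []
    offsets: List[int] = []
    indices: List[int] = []
    return lengths, offsets, indices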
10 changes: 7 additions & 3 deletions fbgemm_gpu/bench/bench_utils.py
@@ -41,13 +41,13 @@ def benchmark_torch_function( # noqa: C901
copy_f_for_multi_thread_test: bool = False,
) -> Tuple[float, torch.Tensor]:
logging.info(f"Start to benchmark {name}...")
if device != "" and device != "cuda":
if device != "cpu" and device != "" and device != "cuda":
torch.cuda.set_device(device)
for _ in range(num_warmups):
output = f(*args)

assert num_threads > 0
if torch.cuda.is_available() and (num_threads == 1):
if device != "cpu" and torch.cuda.is_available() and (num_threads == 1):
cache = torch.empty(
int(flush_gpu_cache_size_mb * 1024 * 1024 // 4),
dtype=torch.float,
@@ -69,7 +69,7 @@ def benchmark_torch_function( # noqa: C901
[s.elapsed_time(e) for s, e in zip(start_event, end_event)]
)
elapsed_time = torch.mean(times).item() * 1.0e-3
elif torch.cuda.is_available() and (num_threads > 1):
elif device != "cpu" and torch.cuda.is_available() and (num_threads > 1):
cache = torch.empty(
int(flush_gpu_cache_size_mb * 1024 * 1024 // 4),
dtype=torch.float,
@@ -156,6 +156,10 @@ def benchmark_requests(
) -> float:
times = []

# Run at least one warmup iteration to avoid the long cudaLaunchKernel time
# for the first kernel
num_warmups = num_warmups + 1 if num_warmups >= 0 else 1

if num_warmups > 0:
indices, offsets, weights = requests[0]
for _ in range(num_warmups):
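Taken together, the bench_utils.py hunks above make the benchmark helpers usable on CPU-only hosts (the torch.cuda calls are now gated on device != "cpu") and force at least one warmup iteration so the one-time cost of the first kernel launch is excluded from the measured time. A minimal sketch of the same pattern, where bench, f, args, and device are illustrative stand-ins rather than the actual fbgemm_gpu API:

import time
import torch

def bench(f, args, device: str = "cpu", num_warmups: int = 0) -> float:
    # Only touch the CUDA runtime when we are not benchmarking on CPU.
    use_cuda = device != "cpu" and torch.cuda.is_available()
    # Always run at least one warmup so one-time launch costs are excluded.
    num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
    for _ in range(num_warmups):
        f(*args)
    if use_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()
    f(*args)
    if use_cuda:
        torch.cuda.synchronize()
    return time.perf_counter() - start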