Skip to content

Commit 5a2c43e

Browse files
committed
Merge remote-tracking branch 'upstream/main' into IFU-2023-12-06
2 parents 06879b1 + dbc3157 commit 5a2c43e

File tree

100 files changed

+2848
-963
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

100 files changed

+2848
-963
lines changed

.github/scripts/fbgemm_gpu_build.bash

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,7 @@ build_fbgemm_gpu_package () {
347347
--package_name="${package_name}" \
348348
--python-tag="${python_tag}" \
349349
--plat-name="${plat_name}" \
350+
--verbose \
350351
"${build_args[@]}"
351352

352353
# Run checks on the built libraries

.github/workflows/fbgemm_ci.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ jobs:
4848
git config --global --add safe.directory '*'
4949
5050
- name: Checkout the Repository
51-
uses: actions/checkout@v3
51+
uses: actions/checkout@v4
5252
with:
5353
submodules: true
5454

@@ -86,7 +86,7 @@ jobs:
8686

8787
steps:
8888
- name: Checkout the Repository
89-
uses: actions/checkout@v3
89+
uses: actions/checkout@v4
9090
with:
9191
submodules: true
9292

@@ -127,7 +127,7 @@ jobs:
127127
git config --global --add safe.directory '*'
128128
129129
- name: Checkout the Repository
130-
uses: actions/checkout@v3
130+
uses: actions/checkout@v4
131131
with:
132132
submodules: true
133133

@@ -159,7 +159,7 @@ jobs:
159159

160160
steps:
161161
- name: Checkout the Repository
162-
uses: actions/checkout@v3
162+
uses: actions/checkout@v4
163163
with:
164164
submodules: true
165165

.github/workflows/fbgemm_gpu_ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ jobs:
5757
git config --global --add safe.directory '*'
5858
5959
- name: Checkout the Repository
60-
uses: actions/checkout@v3
60+
uses: actions/checkout@v4
6161
with:
6262
submodules: true
6363

@@ -126,7 +126,7 @@ jobs:
126126
git config --global --add safe.directory '*'
127127
128128
- name: Checkout the Repository
129-
uses: actions/checkout@v3
129+
uses: actions/checkout@v4
130130
with:
131131
submodules: true
132132

@@ -191,7 +191,7 @@ jobs:
191191
git config --global --add safe.directory '*'
192192
193193
- name: Checkout the Repository
194-
uses: actions/checkout@v3
194+
uses: actions/checkout@v4
195195
with:
196196
submodules: true
197197

.github/workflows/fbgemm_gpu_cpu_nightly.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ jobs:
7171
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which
7272

7373
- name: Checkout the Repository
74-
uses: actions/checkout@v3
74+
uses: actions/checkout@v4
7575
with:
7676
submodules: true
7777

@@ -136,7 +136,7 @@ jobs:
136136
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which
137137

138138
- name: Checkout the Repository
139-
uses: actions/checkout@v3
139+
uses: actions/checkout@v4
140140
with:
141141
submodules: true
142142

.github/workflows/fbgemm_gpu_cpu_release.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ jobs:
6868
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which
6969

7070
- name: Checkout the Repository
71-
uses: actions/checkout@v3
71+
uses: actions/checkout@v4
7272
with:
7373
submodules: true
7474

@@ -133,7 +133,7 @@ jobs:
133133
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which
134134

135135
- name: Checkout the Repository
136-
uses: actions/checkout@v3
136+
uses: actions/checkout@v4
137137
with:
138138
submodules: true
139139

.github/workflows/fbgemm_gpu_cuda_nightly.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ jobs:
7070
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which
7171

7272
- name: Checkout the Repository
73-
uses: actions/checkout@v3
73+
uses: actions/checkout@v4
7474
with:
7575
submodules: true
7676

@@ -140,6 +140,7 @@ jobs:
140140
needs: build_artifact
141141

142142
steps:
143+
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
143144
- name: Checkout the Repository
144145
uses: actions/checkout@v3
145146
with:

.github/workflows/fbgemm_gpu_cuda_release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ jobs:
7474
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which
7575

7676
- name: Checkout the Repository
77-
uses: actions/checkout@v3
77+
uses: actions/checkout@v4
7878
with:
7979
submodules: true
8080

.github/workflows/fbgemm_gpu_docs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ jobs:
4444
run: yum update -y; yum install -y binutils findutils git pciutils rsync sudo tar wget which
4545

4646
- name: Checkout the Repository
47-
uses: actions/checkout@v3
47+
uses: actions/checkout@v4
4848
with:
4949
submodules: true
5050

.github/workflows/fbgemm_gpu_lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ jobs:
3939

4040
steps:
4141
- name: Checkout the Repository
42-
uses: actions/checkout@v3
42+
uses: actions/checkout@v4
4343

4444
- name: Setup Miniconda
4545
run: . $PRELUDE; setup_miniconda $HOME/miniconda

.github/workflows/fbgemm_gpu_pip.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ jobs:
6666
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which
6767

6868
- name: Checkout the Repository
69-
uses: actions/checkout@v3
69+
uses: actions/checkout@v4
7070

7171
- name: Display System Info
7272
run: . $PRELUDE; print_system_info; print_ec2_info
@@ -116,6 +116,7 @@ jobs:
116116
cuda-version-publish: [ "11.8.0" ]
117117

118118
steps:
119+
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
119120
- name: Checkout the Repository
120121
uses: actions/checkout@v3
121122

@@ -182,7 +183,7 @@ jobs:
182183
git config --global --add safe.directory '*'
183184
184185
- name: Checkout the Repository
185-
uses: actions/checkout@v3
186+
uses: actions/checkout@v4
186187

187188
- name: Display System Info
188189
run: . $PRELUDE; print_system_info

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
# found in:
99
# https://github.com/github/gitignore/
1010

11+
# General
12+
.DS_Store
13+
*~
14+
1115
# Byte-compiled / optimized / DLL files
1216
__pycache__/
1317
*.py[cod]

fbgemm_gpu/CMakeLists.txt

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -432,10 +432,22 @@ else()
432432
DEPENDS "${optimizer_codegen_dependencies}")
433433
endif()
434434

435+
set(AVX2_FLAGS "-mavx2;-mf16c;-mfma;-fopenmp")
436+
if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
437+
# NVCC in WSL complains about unknown -mavx options
438+
# https://github.com/pytorch/FBGEMM/issues/2135
439+
set(AVX2_FLAGS "-Xcompiler;-mavx;-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-fopenmp")
440+
endif()
441+
442+
set(AVX512_FLAGS "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-fopenmp")
443+
if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
444+
set(AVX512_FLAGS "-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-Xcompiler;-mavx512f;-Xcompiler;-mavx512bw;-Xcompiler;-mavx512dq;-Xcompiler;-mavx512vl;-fopenmp")
445+
endif()
446+
435447
if(CXX_AVX2_FOUND)
436448
set_source_files_properties(${gen_cpu_source_files}
437449
PROPERTIES COMPILE_OPTIONS
438-
"-mavx2;-mf16c;-mfma;-fopenmp")
450+
"${AVX2_FLAGS}")
439451
else()
440452
set_source_files_properties(${gen_cpu_source_files}
441453
PROPERTIES COMPILE_OPTIONS
@@ -504,13 +516,13 @@ set(fbgemm_sources_avx512
504516
if(CXX_AVX2_FOUND)
505517
set_source_files_properties(${fbgemm_sources_avx2}
506518
PROPERTIES COMPILE_OPTIONS
507-
"-mavx2;-mf16c;-mfma")
519+
"${AVX2_FLAGS}")
508520
endif()
509521

510522
if(CXX_AVX512_FOUND)
511523
set_source_files_properties(${fbgemm_sources_avx512}
512524
PROPERTIES COMPILE_OPTIONS
513-
"-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl")
525+
"${AVX512_FLAGS}")
514526
endif()
515527

516528
set(fbgemm_sources ${fbgemm_sources_normal})
@@ -561,19 +573,20 @@ set(fbgemm_gpu_sources_static_cpu
561573
codegen/embedding_forward_quantized_host_cpu.cpp
562574
codegen/embedding_backward_dense_host_cpu.cpp
563575
codegen/embedding_bounds_check_host_cpu.cpp
576+
src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp
564577
src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp
565578
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
566579
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp
567580
src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp
568581
src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
569582
src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
570-
src/input_combine_cpu.cpp
571-
src/layout_transform_ops_cpu.cpp
583+
src/input_combine_ops/input_combine_cpu.cpp
584+
src/layout_transform_ops/layout_transform_ops_cpu.cpp
572585
src/quantize_ops/quantize_ops_cpu.cpp
573586
src/quantize_ops/quantize_ops_meta.cpp
574587
src/sparse_ops/sparse_ops_cpu.cpp
575588
src/sparse_ops/sparse_ops_meta.cpp
576-
src/embedding_inplace_update_cpu.cpp
589+
src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp
577590
src/split_embeddings_cache/linearize_cache_indices.cpp
578591
src/split_embeddings_cache/lfu_cache_populate_byte.cpp
579592
src/split_embeddings_cache/lru_cache_populate_byte.cpp
@@ -588,16 +601,16 @@ if(NOT FBGEMM_CPU_ONLY)
588601
codegen/embedding_bounds_check_host.cpp
589602
src/memory_utils/memory_utils.cpp
590603
src/memory_utils/memory_utils_ops.cpp
591-
src/layout_transform_ops_gpu.cpp
604+
src/layout_transform_ops/layout_transform_ops_gpu.cpp
592605
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp
593606
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp
594607
src/quantize_ops/quantize_ops_gpu.cpp
595608
src/sparse_ops/sparse_ops_gpu.cpp
596-
src/split_embeddings_utils.cpp
609+
src/split_embeddings_utils/split_embeddings_utils.cpp
597610
src/split_embeddings_cache/split_embeddings_cache_ops.cu
598-
src/metric_ops_host.cpp
599-
src/embedding_inplace_update_gpu.cpp
600-
src/input_combine_gpu.cpp
611+
src/metric_ops/metric_ops_host.cpp
612+
src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp
613+
src/input_combine_ops/input_combine_gpu.cpp
601614
codegen/batch_index_select_dim0_host.cpp)
602615

603616
if(NVML_LIB_PATH)
@@ -607,8 +620,7 @@ if(NOT FBGEMM_CPU_ONLY)
607620
if(NVML_LIB_PATH OR USE_ROCM)
608621
message(STATUS "Adding merge_pooled_embeddings sources")
609622
list(APPEND fbgemm_gpu_sources_static_cpu
610-
src/merge_pooled_embeddings_cpu.cpp
611-
src/merge_pooled_embeddings_gpu.cpp
623+
src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp
612624
src/topology_utils.cpp)
613625
else()
614626
message(STATUS "Skipping merge_pooled_embeddings sources")
@@ -618,7 +630,7 @@ endif()
618630
if(CXX_AVX2_FOUND)
619631
set_source_files_properties(${fbgemm_gpu_sources_static_cpu}
620632
PROPERTIES COMPILE_OPTIONS
621-
"-mavx;-mf16c;-mfma;-mavx2;-fopenmp")
633+
"${AVX2_FLAGS}")
622634
else()
623635
set_source_files_properties(${fbgemm_gpu_sources_static_cpu}
624636
PROPERTIES COMPILE_OPTIONS
@@ -631,9 +643,9 @@ if(NOT FBGEMM_CPU_ONLY)
631643
codegen/embedding_forward_quantized_split_lookup.cu
632644
src/memory_utils/memory_utils.cu
633645
src/memory_utils/memory_utils_ops.cu
634-
src/embedding_inplace_update.cu
646+
src/embedding_inplace_ops/embedding_inplace_update.cu
635647
src/histogram_binning_calibration_ops.cu
636-
src/input_combine.cu
648+
src/input_combine_ops/input_combine.cu
637649
src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
638650
src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
639651
src/jagged_tensor_ops/dense_to_jagged_forward.cu
@@ -651,8 +663,8 @@ if(NOT FBGEMM_CPU_ONLY)
651663
src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu
652664
src/jagged_tensor_ops/jagged_unique_indices.cu
653665
src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
654-
src/layout_transform_ops.cu
655-
src/metric_ops.cu
666+
src/layout_transform_ops/layout_transform_ops.cu
667+
src/metric_ops/metric_ops.cu
656668
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu
657669
src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu
658670
src/quantize_ops/quantize_bfloat16.cu
@@ -691,7 +703,10 @@ if(NOT FBGEMM_CPU_ONLY)
691703
src/split_embeddings_cache/lxu_cache.cu
692704
src/split_embeddings_cache/linearize_cache_indices.cu
693705
src/split_embeddings_cache/reset_weight_momentum.cu
694-
src/split_embeddings_utils.cu)
706+
src/split_embeddings_utils/generate_vbe_metadata.cu
707+
src/split_embeddings_utils/get_infos_metadata.cu
708+
src/split_embeddings_utils/radix_sort_pairs.cu
709+
src/split_embeddings_utils/transpose_embedding_input.cu)
695710

696711
set_source_files_properties(${fbgemm_gpu_sources_static_gpu}
697712
PROPERTIES COMPILE_OPTIONS

fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
# pyre-unsafe
87
import functools
98
from math import sqrt
109
from typing import List, Tuple
@@ -29,7 +28,10 @@
2928

3029

3130
def generate_unary_feature(
32-
batch_size: int, num_embeddings: int
31+
batch_size: int,
32+
num_embeddings: int
33+
# pyre-fixme[24]: Generic type `list` expects 1 type parameter, use
34+
# `typing.List[<element type>]` to avoid runtime subscripting errors.
3335
) -> Tuple[List, List, List]:
3436
lengths = []
3537
offsets = []
@@ -90,6 +92,7 @@ def forward(
9092
@click.option("--num-tables", default=2)
9193
@click.option("--num-tasks", default=3)
9294
@click.option("--repeats", default=100)
95+
# pyre-fixme[2]: Parameter must be annotated.
9396
def main(batch_size, num_tables, num_tasks, repeats) -> None:
9497
device = torch.device("cuda", 0)
9598
torch.cuda.set_device(device)

fbgemm_gpu/bench/bench_utils.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,13 @@ def benchmark_torch_function( # noqa: C901
4141
copy_f_for_multi_thread_test: bool = False,
4242
) -> Tuple[float, torch.Tensor]:
4343
logging.info(f"Start to benchmark {name}...")
44-
if device != "" and device != "cuda":
44+
if device != "cpu" and device != "" and device != "cuda":
4545
torch.cuda.set_device(device)
4646
for _ in range(num_warmups):
4747
output = f(*args)
4848

4949
assert num_threads > 0
50-
if torch.cuda.is_available() and (num_threads == 1):
50+
if device != "cpu" and torch.cuda.is_available() and (num_threads == 1):
5151
cache = torch.empty(
5252
int(flush_gpu_cache_size_mb * 1024 * 1024 // 4),
5353
dtype=torch.float,
@@ -69,7 +69,7 @@ def benchmark_torch_function( # noqa: C901
6969
[s.elapsed_time(e) for s, e in zip(start_event, end_event)]
7070
)
7171
elapsed_time = torch.mean(times).item() * 1.0e-3
72-
elif torch.cuda.is_available() and (num_threads > 1):
72+
elif device != "cpu" and torch.cuda.is_available() and (num_threads > 1):
7373
cache = torch.empty(
7474
int(flush_gpu_cache_size_mb * 1024 * 1024 // 4),
7575
dtype=torch.float,
@@ -156,6 +156,10 @@ def benchmark_requests(
156156
) -> float:
157157
times = []
158158

159+
# Run at least one warmup iteration to avoid the long cudaLaunchKernel time
160+
# for the first kernel
161+
num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
162+
159163
if num_warmups > 0:
160164
indices, offsets, weights = requests[0]
161165
for _ in range(num_warmups):

0 commit comments

Comments
 (0)