From 1bf307b30066e3d25b79d5a6a637bacc9c140147 Mon Sep 17 00:00:00 2001 From: Jianhui Dai Date: Thu, 4 Dec 2025 09:49:21 +0800 Subject: [PATCH 1/4] [webgpu] Add `CeilDiv` into webgpu utils --- .../webgpu/quantization/matmul_nbits.cc | 13 +++-------- .../core/providers/webgpu/math/matmul.cc | 3 ++- .../core/providers/webgpu/nn/im2col_matmul.cc | 22 ++++++------------- .../core/providers/webgpu/tensor/transpose.cc | 18 +++++---------- .../core/providers/webgpu/webgpu_utils.h | 5 +++++ 5 files changed, 22 insertions(+), 39 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index eab61db4e43b4..ec44ac366136c 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -19,14 +19,7 @@ namespace contrib { namespace webgpu { namespace { - constexpr unsigned int kMinMForTileOptimization = 4; - -template -inline T ceil_div(T numerator, T denominator) { - return (numerator + denominator - 1) / denominator; -} - } // namespace ONNX_OPERATOR_KERNEL_EX( @@ -246,8 +239,8 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales, constexpr uint32_t workgroup_size = 128; constexpr uint32_t tile_m = workgroup_size / 8; constexpr uint32_t tile_n = workgroup_size; - const uint32_t num_N_tile = ceil_div(N, tile_n); - const uint32_t num_M_tile = ceil_div(M, tile_m); + const uint32_t num_N_tile = CeilDiv(N, tile_n); + const uint32_t num_M_tile = CeilDiv(M, tile_m); MatMulNBitsWideTileProgram program{has_zero_points, has_bias, has_weight_idx, tile_m, tile_n, static_cast(nbits)}; program.SetWorkgroupSize(workgroup_size); @@ -268,7 +261,7 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales, if (has_zero_points) { program.AddInput({zero_points, ProgramTensorMetadataDependency::TypeAndRank, - {ceil_div(zero_points->Shape().Size(), static_cast(4))}, + {CeilDiv(zero_points->Shape().Size(), static_cast(4))}, 4}); } if (has_bias) { diff --git a/onnxruntime/core/providers/webgpu/math/matmul.cc b/onnxruntime/core/providers/webgpu/math/matmul.cc index 72dd235eb820a..43342aa7a7df6 100644 --- a/onnxruntime/core/providers/webgpu/math/matmul.cc +++ b/onnxruntime/core/providers/webgpu/math/matmul.cc @@ -8,6 +8,7 @@ #include "core/providers/webgpu/webgpu_supported_types.h" #include "core/providers/webgpu/nn/fuse_utils.h" #include "core/providers/webgpu/data_transfer.h" +#include "core/providers/webgpu/webgpu_utils.h" namespace onnxruntime { namespace webgpu { @@ -147,7 +148,7 @@ Status MatMul::ComputeInternal(ComputeContext& context) const { } program .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::None, output_shape_shader, components}}) - .SetDispatchGroupSize((output_size + 63) / 64) // Integer ceiling division + .SetDispatchGroupSize(CeilDiv(output_size, 64u)) .AddIndices(outer_dims) .AddUniformVariables({{output_size}, {m}, {n}, {k}}); diff --git a/onnxruntime/core/providers/webgpu/nn/im2col_matmul.cc b/onnxruntime/core/providers/webgpu/nn/im2col_matmul.cc index 685324884abeb..fb2bd9bba7450 100644 --- a/onnxruntime/core/providers/webgpu/nn/im2col_matmul.cc +++ b/onnxruntime/core/providers/webgpu/nn/im2col_matmul.cc @@ -10,15 +10,7 @@ namespace onnxruntime { namespace webgpu { - namespace { - -// TODO: move to common header. -template -inline T ceil_div(T numerator, T denominator) { - return (numerator + denominator - 1) / denominator; -} - // Chooses the optimal tile size (M, N) for the im2col operation. // This tile size is performance-tuned and varies depending on the target device. std::pair ChooseTileSize(uint32_t im2col_m, uint32_t im2col_n) { @@ -32,8 +24,8 @@ std::pair ChooseTileSize(uint32_t im2col_m, uint32_t im2col_ const uint32_t tile_m = tile_pair.first; const uint32_t tile_n = tile_pair.second; - const uint32_t dispatch_m = ceil_div(im2col_m, tile_m); - const uint32_t dispatch_n = ceil_div(im2col_n, tile_n); + const uint32_t dispatch_m = CeilDiv(im2col_m, tile_m); + const uint32_t dispatch_n = CeilDiv(im2col_n, tile_n); const uint32_t dispatch = dispatch_m * dispatch_n; if (dispatch >= 128) { @@ -115,7 +107,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context, OIHW2OHWIProgram transpose_program{}; transpose_program.SetWorkgroupSize(64); - const uint32_t Ci_tiles = ceil_div(channel_input, 64u); + const uint32_t Ci_tiles = CeilDiv(channel_input, 64u); transpose_program.SetDispatchGroupSize(channel_output, Ci_tiles); transpose_program.AddInput({weight, @@ -127,7 +119,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context, {kernel_height}, {kernel_width}, {Ci_tiles}, - {ceil_div(kernel_height * kernel_height, 4u)}}); + {CeilDiv(kernel_height * kernel_height, 4u)}}); ORT_RETURN_IF_ERROR(context.RunProgram(transpose_program)); // im2col-matmul @@ -156,8 +148,8 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context, Im2ColMatMulProgram im2col_mm_program{has_bias, tile_m, tile_n, use_subgroup}; im2col_mm_program.SetWorkgroupSize(workgroup_size); - const uint32_t M_tiles = ceil_div(im2col_m, tile_m); - const uint32_t N_tiles = ceil_div(im2col_n, tile_n); + const uint32_t M_tiles = CeilDiv(im2col_m, tile_m); + const uint32_t N_tiles = CeilDiv(im2col_n, tile_n); im2col_mm_program.SetDispatchGroupSize(M_tiles, N_tiles, batch); im2col_mm_program.AddInput({src, @@ -185,7 +177,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context, {im2col_n}, {M_tiles}, {N_tiles}, - {ceil_div(ceil_div(im2col_k, 4u), 4u)}, + {CeilDiv(CeilDiv(im2col_k, 4u), 4u)}, {dilations}, {pads}, {strides}}); diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index 5415d4a5ead5b..aff1be69dde8a 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -2,23 +2,15 @@ // Licensed under the MIT License. #include "core/common/inlined_containers.h" -#include "core/providers/webgpu/tensor/transpose.h" #include "core/providers/cpu/tensor/utils.h" +#include "core/providers/webgpu/tensor/transpose.h" #include "core/providers/webgpu/shader_variable.h" #include "core/providers/webgpu/shader_helper.h" #include "core/providers/webgpu/webgpu_supported_types.h" - -namespace { - -inline uint32_t ceil_div(int64_t numerator, int32_t denominator) { - return static_cast((numerator + denominator - 1) / denominator); -} - -} // namespace +#include "core/providers/webgpu/webgpu_utils.h" namespace onnxruntime { namespace webgpu { - ONNX_OPERATOR_VERSIONED_KERNEL_EX( Transpose, kOnnxDomain, @@ -155,9 +147,9 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContextBase& context, program.SetDispatchGroupSize(static_cast((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE), static_cast(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE))); } else { - program.SetWorkgroupSize(WORKGROUP_SIZE); + program.SetWorkgroupSize(64u); - uint32_t dispatch_x = ceil_div(output_size, WORKGROUP_SIZE); + uint32_t dispatch_x = CeilDiv(output_size, 64u); uint32_t dispatch_y = 1; uint32_t dispatch_z = 1; @@ -171,7 +163,7 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContextBase& context, uint32_t dispatch_size = dispatch_x; dispatch_x = 4; dispatch_y = 8; - dispatch_z = ceil_div(dispatch_size, dispatch_x * dispatch_y); + dispatch_z = CeilDiv(dispatch_size, dispatch_x * dispatch_y); } program.SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z); } diff --git a/onnxruntime/core/providers/webgpu/webgpu_utils.h b/onnxruntime/core/providers/webgpu/webgpu_utils.h index 0aa47371f6752..8a098d9f8c4be 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_utils.h +++ b/onnxruntime/core/providers/webgpu/webgpu_utils.h @@ -12,6 +12,11 @@ namespace onnxruntime { namespace webgpu { +template + +inline T CeilDiv(T numerator, T denominator) { + return (numerator + denominator - 1) / denominator; +} class ShaderVariableHelper; From 73084ab451307fb8ca09336fbedea15959d86997 Mon Sep 17 00:00:00 2001 From: Jianhui Dai Date: Thu, 4 Dec 2025 11:15:19 +0800 Subject: [PATCH 2/4] Fix `output_size` --- onnxruntime/core/providers/webgpu/tensor/transpose.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index aff1be69dde8a..7b1c1d8888a19 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -131,16 +131,14 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContextBase& context, new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]}); } - uint32_t output_size = onnxruntime::narrow(input_shape.Size()); + uint32_t output_size = onnxruntime::narrow(input_shape.Size()); TransposeProgram program{permutations, use_shared}; program .CacheHint(absl::StrJoin(permutations, "-")) .AddInputs({{&input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}}) .AddOutputs({{&output, ProgramTensorMetadataDependency::None, new_output_shape, 1}}) - .AddUniformVariables({ - {static_cast(output_size)}, - }); + .AddUniformVariables({{output_size}}); if (use_shared) { program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1); From 2413e5a4b2ce26142f6ef6a0a62c5635c5f72056 Mon Sep 17 00:00:00 2001 From: Jianhui Dai Date: Sat, 6 Dec 2025 07:37:11 +0800 Subject: [PATCH 3/4] Remove empty line --- onnxruntime/core/providers/webgpu/webgpu_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/webgpu/webgpu_utils.h b/onnxruntime/core/providers/webgpu/webgpu_utils.h index 8a098d9f8c4be..2baa45d13e492 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_utils.h +++ b/onnxruntime/core/providers/webgpu/webgpu_utils.h @@ -12,8 +12,8 @@ namespace onnxruntime { namespace webgpu { -template +template inline T CeilDiv(T numerator, T denominator) { return (numerator + denominator - 1) / denominator; } From 74de672202a495921330d91a9d6a1d27b1caceee Mon Sep 17 00:00:00 2001 From: Jianhui Dai Date: Sat, 6 Dec 2025 09:46:55 +0800 Subject: [PATCH 4/4] rebase --- onnxruntime/core/providers/webgpu/webgpu_utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/webgpu_utils.h b/onnxruntime/core/providers/webgpu/webgpu_utils.h index 2baa45d13e492..744e6f53aaba2 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_utils.h +++ b/onnxruntime/core/providers/webgpu/webgpu_utils.h @@ -13,13 +13,13 @@ namespace onnxruntime { namespace webgpu { +class ShaderVariableHelper; + template inline T CeilDiv(T numerator, T denominator) { return (numerator + denominator - 1) / denominator; } -class ShaderVariableHelper; - /** * Returns the maximum number of components `N` to be used as `vecN` for the given size. */