Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,7 @@ namespace contrib {
namespace webgpu {

namespace {

constexpr unsigned int kMinMForTileOptimization = 4;

template <typename T>
inline T ceil_div(T numerator, T denominator) {
return (numerator + denominator - 1) / denominator;
}

} // namespace

ONNX_OPERATOR_KERNEL_EX(
Expand Down Expand Up @@ -246,8 +239,8 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
constexpr uint32_t workgroup_size = 128;
constexpr uint32_t tile_m = workgroup_size / 8;
constexpr uint32_t tile_n = workgroup_size;
const uint32_t num_N_tile = ceil_div(N, tile_n);
const uint32_t num_M_tile = ceil_div(M, tile_m);
const uint32_t num_N_tile = CeilDiv(N, tile_n);
const uint32_t num_M_tile = CeilDiv(M, tile_m);

MatMulNBitsWideTileProgram program{has_zero_points, has_bias, has_weight_idx, tile_m, tile_n, static_cast<uint32_t>(nbits)};
program.SetWorkgroupSize(workgroup_size);
Expand All @@ -268,7 +261,7 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
if (has_zero_points) {
program.AddInput({zero_points,
ProgramTensorMetadataDependency::TypeAndRank,
{ceil_div(zero_points->Shape().Size(), static_cast<int64_t>(4))},
{CeilDiv(zero_points->Shape().Size(), static_cast<int64_t>(4))},
4});
}
if (has_bias) {
Expand Down
3 changes: 2 additions & 1 deletion onnxruntime/core/providers/webgpu/math/matmul.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "core/providers/webgpu/webgpu_supported_types.h"
#include "core/providers/webgpu/nn/fuse_utils.h"
#include "core/providers/webgpu/data_transfer.h"
#include "core/providers/webgpu/webgpu_utils.h"

namespace onnxruntime {
namespace webgpu {
Expand Down Expand Up @@ -147,7 +148,7 @@ Status MatMul::ComputeInternal(ComputeContext& context) const {
}
program
.AddOutputs({{output_tensor, ProgramTensorMetadataDependency::None, output_shape_shader, components}})
.SetDispatchGroupSize((output_size + 63) / 64) // Integer ceiling division
.SetDispatchGroupSize(CeilDiv(output_size, 64u))
.AddIndices(outer_dims)
.AddUniformVariables({{output_size}, {m}, {n}, {k}});

Expand Down
22 changes: 7 additions & 15 deletions onnxruntime/core/providers/webgpu/nn/im2col_matmul.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,7 @@

namespace onnxruntime {
namespace webgpu {

namespace {

// TODO: move to common header.
template <typename T>
inline T ceil_div(T numerator, T denominator) {
return (numerator + denominator - 1) / denominator;
}

// Chooses the optimal tile size (M, N) for the im2col operation.
// This tile size is performance-tuned and varies depending on the target device.
std::pair<uint32_t, uint32_t> ChooseTileSize(uint32_t im2col_m, uint32_t im2col_n) {
Expand All @@ -32,8 +24,8 @@ std::pair<uint32_t, uint32_t> ChooseTileSize(uint32_t im2col_m, uint32_t im2col_
const uint32_t tile_m = tile_pair.first;
const uint32_t tile_n = tile_pair.second;

const uint32_t dispatch_m = ceil_div(im2col_m, tile_m);
const uint32_t dispatch_n = ceil_div(im2col_n, tile_n);
const uint32_t dispatch_m = CeilDiv(im2col_m, tile_m);
const uint32_t dispatch_n = CeilDiv(im2col_n, tile_n);
const uint32_t dispatch = dispatch_m * dispatch_n;

if (dispatch >= 128) {
Expand Down Expand Up @@ -115,7 +107,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
OIHW2OHWIProgram transpose_program{};
transpose_program.SetWorkgroupSize(64);

const uint32_t Ci_tiles = ceil_div(channel_input, 64u);
const uint32_t Ci_tiles = CeilDiv(channel_input, 64u);
transpose_program.SetDispatchGroupSize(channel_output, Ci_tiles);

transpose_program.AddInput({weight,
Expand All @@ -127,7 +119,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
{kernel_height},
{kernel_width},
{Ci_tiles},
{ceil_div(kernel_height * kernel_height, 4u)}});
{CeilDiv(kernel_height * kernel_height, 4u)}});
ORT_RETURN_IF_ERROR(context.RunProgram(transpose_program));

// im2col-matmul
Expand Down Expand Up @@ -156,8 +148,8 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
Im2ColMatMulProgram im2col_mm_program{has_bias, tile_m, tile_n, use_subgroup};
im2col_mm_program.SetWorkgroupSize(workgroup_size);

const uint32_t M_tiles = ceil_div(im2col_m, tile_m);
const uint32_t N_tiles = ceil_div(im2col_n, tile_n);
const uint32_t M_tiles = CeilDiv(im2col_m, tile_m);
const uint32_t N_tiles = CeilDiv(im2col_n, tile_n);
im2col_mm_program.SetDispatchGroupSize(M_tiles, N_tiles, batch);

im2col_mm_program.AddInput({src,
Expand Down Expand Up @@ -185,7 +177,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
{im2col_n},
{M_tiles},
{N_tiles},
{ceil_div(ceil_div(im2col_k, 4u), 4u)},
{CeilDiv(CeilDiv(im2col_k, 4u), 4u)},
{dilations},
{pads},
{strides}});
Expand Down
24 changes: 7 additions & 17 deletions onnxruntime/core/providers/webgpu/tensor/transpose.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,15 @@
// Licensed under the MIT License.

#include "core/common/inlined_containers.h"
#include "core/providers/webgpu/tensor/transpose.h"
#include "core/providers/cpu/tensor/utils.h"
#include "core/providers/webgpu/tensor/transpose.h"
#include "core/providers/webgpu/shader_variable.h"
#include "core/providers/webgpu/shader_helper.h"
#include "core/providers/webgpu/webgpu_supported_types.h"

namespace {

inline uint32_t ceil_div(int64_t numerator, int32_t denominator) {
return static_cast<uint32_t>((numerator + denominator - 1) / denominator);
}

} // namespace
#include "core/providers/webgpu/webgpu_utils.h"

namespace onnxruntime {
namespace webgpu {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Transpose,
kOnnxDomain,
Expand Down Expand Up @@ -139,25 +131,23 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContextBase& context,
new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]});
}

uint32_t output_size = onnxruntime::narrow<int32_t>(input_shape.Size());
uint32_t output_size = onnxruntime::narrow<uint32_t>(input_shape.Size());
TransposeProgram program{permutations, use_shared};

program
.CacheHint(absl::StrJoin(permutations, "-"))
.AddInputs({{&input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
.AddOutputs({{&output, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
.AddUniformVariables({
{static_cast<uint32_t>(output_size)},
});
.AddUniformVariables({{output_size}});

if (use_shared) {
program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1);
program.SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)));
} else {
program.SetWorkgroupSize(WORKGROUP_SIZE);
program.SetWorkgroupSize(64u);

uint32_t dispatch_x = ceil_div(output_size, WORKGROUP_SIZE);
uint32_t dispatch_x = CeilDiv(output_size, 64u);
uint32_t dispatch_y = 1;
uint32_t dispatch_z = 1;

Expand All @@ -171,7 +161,7 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContextBase& context,
uint32_t dispatch_size = dispatch_x;
dispatch_x = 4;
dispatch_y = 8;
dispatch_z = ceil_div(dispatch_size, dispatch_x * dispatch_y);
dispatch_z = CeilDiv(dispatch_size, dispatch_x * dispatch_y);
}
program.SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z);
}
Expand Down
5 changes: 5 additions & 0 deletions onnxruntime/core/providers/webgpu/webgpu_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ namespace webgpu {

class ShaderVariableHelper;

// Computes ceil(numerator / denominator) for integer types.
// Uses `n / d + (n % d != 0)` instead of the naive `(n + d - 1) / d`, which
// wraps around (unsigned) or is UB (signed) when `numerator` is within
// `denominator - 1` of the maximum value of T — e.g. the naive form returns 0
// for CeilDiv(UINT32_MAX, 16u).
// Requires: denominator > 0 and numerator >= 0.
template <typename T>
inline constexpr T CeilDiv(T numerator, T denominator) {
  return numerator / denominator + static_cast<T>(numerator % denominator != 0);
}

/**
* Returns the maximum number of components `N` to be used as `vecN` for the given size.
*/
Expand Down
Loading