From 9a854a7eb2e3b3e6df365b60638b3e77621ecb5c Mon Sep 17 00:00:00 2001 From: Jeremy Arnold Date: Sat, 21 Dec 2024 02:18:09 +0000 Subject: [PATCH] Updated fused_moe configs for MI325X with Triton 3.2 --- ...AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json | 38 +++++++------- ...36,device_name=AMD_Instinct_MI325_OAM.json | 34 ++++++------- ...AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json | 14 +++--- ...84,device_name=AMD_Instinct_MI325_OAM.json | 26 +++++----- ...AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json | 30 +++++------ ...92,device_name=AMD_Instinct_MI325_OAM.json | 32 ++++++------ ...AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json | 36 ++++++------- ...48,device_name=AMD_Instinct_MI325_OAM.json | 48 +++++++++--------- ...AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json | 50 +++++++++---------- ...84,device_name=AMD_Instinct_MI325_OAM.json | 50 +++++++++---------- ...AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json | 40 +++++++-------- ...96,device_name=AMD_Instinct_MI325_OAM.json | 38 +++++++------- ...AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json | 44 ++++++++-------- ...68,device_name=AMD_Instinct_MI325_OAM.json | 50 +++++++++---------- ...AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json | 14 +++--- ...92,device_name=AMD_Instinct_MI325_OAM.json | 32 ++++++------ 16 files changed, 288 insertions(+), 288 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json index 68ca2cdb374d5..f245285bd8217 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json @@ -1,16 +1,16 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -19,10 +19,10 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -31,12 +31,12 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, @@ -45,11 +45,11 @@ "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, @@ -58,23 +58,23 @@ "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 1, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "48": { - "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "64": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 2, @@ -127,8 +127,8 @@ }, "1536": { "BLOCK_SIZE_M": 256, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, @@ -144,9 +144,9 @@ "waves_per_eu": 0 }, "3072": { - "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325_OAM.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325_OAM.json index a03a4d6f95404..3918c93b160a3 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325_OAM.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325_OAM.json @@ -34,10 +34,10 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 1, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -56,25 +56,25 @@ }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 1, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "32": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "48": { "BLOCK_SIZE_M": 16, @@ -96,7 +96,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "96": { "BLOCK_SIZE_M": 32, @@ -117,13 +117,13 @@ "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 16, + "matrix_instr_nonkdim": 32, "kpack": 2 }, "256": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, @@ -132,10 +132,10 @@ "kpack": 2 }, "512": { - "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, @@ -144,7 +144,7 @@ }, "1024": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, @@ -188,13 +188,13 @@ }, "4096": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json index 0130d54b6d02b..16e0a91baf31d 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json @@ -19,7 +19,7 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, @@ -126,27 +126,27 @@ "waves_per_eu": 0 }, "1536": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, "2048": { - "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, "3072": { - "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325_OAM.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325_OAM.json index 15abb3386df32..d766fc062ddcc 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325_OAM.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325_OAM.json @@ -4,7 +4,7 @@ "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -44,7 +44,7 @@ "kpack": 2 }, "16": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, @@ -56,8 +56,8 @@ }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, @@ -118,7 +118,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "256": { "BLOCK_SIZE_M": 128, @@ -136,7 +136,7 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -166,20 +166,20 @@ }, "2048": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "3072": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 8, + "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, @@ -187,10 +187,10 @@ "kpack": 2 }, "4096": { - "BLOCK_SIZE_M": 256, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 8, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json index 7fd3329d44adc..6d5b1ae5b15fc 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json @@ -19,10 +19,10 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -40,7 +40,7 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -67,7 +67,7 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -83,8 +83,8 @@ "96": { "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -101,7 +101,7 @@ "256": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, @@ -109,10 +109,10 @@ }, "512": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -135,27 +135,27 @@ "waves_per_eu": 0 }, "2048": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "3072": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, "4096": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325_OAM.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325_OAM.json index e4be7a84d9668..ffc1b23ea90dd 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325_OAM.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325_OAM.json @@ -2,7 +2,7 @@ "1": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, @@ -15,11 +15,11 @@ "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "4": { "BLOCK_SIZE_M": 16, @@ -44,7 +44,7 @@ "kpack": 1 }, "16": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -85,14 +85,14 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "64": { "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, - "num_warps": 8, + "GROUP_SIZE_M": 1, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -100,18 +100,18 @@ }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, @@ -121,15 +121,15 @@ "kpack": 2 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "512": { "BLOCK_SIZE_M": 64, @@ -147,7 +147,7 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -156,7 +156,7 @@ "1536": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json index b89eb5fa2bc88..2758e48fc4065 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json @@ -4,7 +4,7 @@ "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -28,10 +28,10 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -81,8 +81,8 @@ "waves_per_eu": 0 }, "96": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -91,17 +91,17 @@ }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, - "num_warps": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, @@ -126,26 +126,26 @@ "waves_per_eu": 0 }, "1536": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "2048": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325_OAM.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325_OAM.json index a90d4a86048c2..fc31215cbae8a 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325_OAM.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325_OAM.json @@ -1,10 +1,10 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -23,10 +23,10 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -48,7 +48,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -59,7 +59,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -74,23 +74,23 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "48": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "64": { "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, @@ -110,21 +110,21 @@ "kpack": 2 }, "128": { - "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 8, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 16, + "matrix_instr_nonkdim": 32, "kpack": 2 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, @@ -133,19 +133,19 @@ }, "512": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 16, + "matrix_instr_nonkdim": 32, "kpack": 2 }, "1024": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, @@ -156,7 +156,7 @@ "1536": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json index ab78a9ac06715..6cb80f48329f5 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json @@ -19,8 +19,8 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, @@ -40,7 +40,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -49,7 +49,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, @@ -58,16 +58,16 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -76,34 +76,34 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "96": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, + "GROUP_SIZE_M": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, "256": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, - "num_warps": 8, + "GROUP_SIZE_M": 1, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -117,9 +117,9 @@ "waves_per_eu": 0 }, "1024": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, @@ -137,9 +137,9 @@ "2048": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, @@ -153,9 +153,9 @@ "waves_per_eu": 0 }, "4096": { - "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325_OAM.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325_OAM.json index 36b2e1062ec33..de9d0aba75a7e 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325_OAM.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325_OAM.json @@ -1,14 +1,14 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "2": { "BLOCK_SIZE_M": 16, @@ -45,14 +45,14 @@ }, "16": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 1, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "24": { "BLOCK_SIZE_M": 16, @@ -67,25 +67,25 @@ }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "48": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "64": { "BLOCK_SIZE_M": 32, @@ -107,45 +107,45 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "512": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 16, + "matrix_instr_nonkdim": 32, "kpack": 2 }, "1024": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json index a3537dcc7468a..2c49f359c22ac 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json @@ -27,8 +27,8 @@ "waves_per_eu": 0 }, "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, @@ -36,18 +36,18 @@ "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, @@ -63,11 +63,11 @@ "waves_per_eu": 0 }, "48": { - "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -81,11 +81,11 @@ "waves_per_eu": 0 }, "96": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, @@ -108,7 +108,7 @@ "waves_per_eu": 0 }, "512": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, @@ -117,9 +117,9 @@ "waves_per_eu": 0 }, "1024": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, @@ -136,17 +136,17 @@ }, "2048": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "3072": { - "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325_OAM.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325_OAM.json index 734184460d2bd..c7db6c0cbd3fa 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325_OAM.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325_OAM.json @@ -4,7 +4,7 @@ "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 1, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -13,7 +13,7 @@ "2": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, @@ -34,21 +34,21 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "16": { - "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -68,13 +68,13 @@ "32": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "48": { "BLOCK_SIZE_M": 32, @@ -85,7 +85,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "64": { "BLOCK_SIZE_M": 32, @@ -107,7 +107,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "128": { "BLOCK_SIZE_M": 64, @@ -121,20 +121,20 @@ "kpack": 2 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, - "num_warps": 2, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json index 75ea6c7da5800..7a07bbf414195 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json @@ -19,10 +19,10 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, @@ -36,46 +36,46 @@ "waves_per_eu": 0 }, "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, "48": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, "64": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 @@ -91,10 +91,10 @@ }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0 }, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325_OAM.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325_OAM.json index 122bec0f37b37..3a3268cc17a84 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325_OAM.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325_OAM.json @@ -34,14 +34,14 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 1, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "16": { "BLOCK_SIZE_M": 16, @@ -52,12 +52,12 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, @@ -67,32 +67,32 @@ }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "64": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -118,7 +118,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { "BLOCK_SIZE_M": 128, @@ -128,14 +128,14 @@ "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 16, + "matrix_instr_nonkdim": 32, "kpack": 2 }, "512": { - "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, @@ -147,11 +147,11 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1536": { "BLOCK_SIZE_M": 128, @@ -169,11 +169,11 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "3072": { "BLOCK_SIZE_M": 128, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json index 2538bca64e941..c27ca0a36594f 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json @@ -1,7 +1,7 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, @@ -40,7 +40,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0 }, @@ -58,7 +58,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0 }, @@ -127,8 +127,8 @@ }, "1536": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, @@ -144,9 +144,9 @@ "waves_per_eu": 0 }, "3072": { - "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325_OAM.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325_OAM.json index 947ff5654d42f..da477b1fb15eb 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325_OAM.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325_OAM.json @@ -8,7 +8,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2": { "BLOCK_SIZE_M": 16, @@ -26,7 +26,7 @@ "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -45,10 +45,10 @@ }, "16": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -63,7 +63,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "32": { "BLOCK_SIZE_M": 16, @@ -74,7 +74,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "48": { "BLOCK_SIZE_M": 32, @@ -85,7 +85,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "64": { "BLOCK_SIZE_M": 32, @@ -96,7 +96,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "96": { "BLOCK_SIZE_M": 32, @@ -112,7 +112,7 @@ "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, @@ -125,11 +125,11 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, - "num_warps": 2, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 128, @@ -158,22 +158,22 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2048": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "3072": { "BLOCK_SIZE_M": 128, @@ -191,7 +191,7 @@ "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16,