diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index 995f86ef51ad..e42ba2ddaa7d 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -135,6 +135,7 @@ static std::optional getMmaScheduleFromProblemAndTarget( GPUMMAHeuristicSeeds seeds; assert(problem.aType == problem.bType && "expected the same aType and bType."); + int64_t inBitWidth = problem.aType.getIntOrFloatBitWidth(); // Note that the following heuristic seeds are just placeholder values. // We need to clean it up and make it adjusting to different targets. @@ -147,14 +148,23 @@ static std::optional getMmaScheduleFromProblemAndTarget( // and a larger bestKTileCountPerSubgroup. seeds = {/*bestSubgroupCountPerWorkgroup=*/4, /*bestMNTileCountPerSubgroup=*/4, - /*bestKTileCountPerSubgroup=*/8}; + /*bestKTileCountPerSubgroup=*/8, + /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits * 4 / + inBitWidth}; } else { seeds = {/*bestSubgroupCountPerWorkgroup=*/4, - /*bestMNTileCountPerSubgroup=*/8, - /*bestKTileCountPerSubgroup=*/4}; - } - - int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes(); + /*bestMNTileCountPerSubgroup=*/16, + /*bestKTileCountPerSubgroup=*/4, + /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits * 2 / + inBitWidth}; + } + + // We target slightly below the full available shared Memory to leave room for + // `GPUReduceBankConflictsPass` that will pad shared memory without keeping + // track of usage. We can drop this after fixing + // https://github.com/iree-org/iree/issues/19675 + int64_t maxSharedMemoryBytes = + target.getWgp().getMaxWorkgroupMemoryBytes() - 64 * inBitWidth; // First try to find a schedule with an exactly matching intrinsic. std::optional schedule = deduceMMASchedule(