Skip to content

Commit 379700f

Browse files
committed
Merge branch 'main' into zhewen_ctrl_overlay
2 parents 35ef38f + f76c245 commit 379700f

18 files changed

+324
-106
lines changed

build_tools/ci/cpu_comparison/run.py

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1835,6 +1835,21 @@ def __init__(self):
18351835
"transpose_a": True,
18361836
"transpose_b": False,
18371837
},
1838+
# Test where the compute is omitted; this should help triangulate
1839+
# how much performance gain can be obtained with better matmul
1840+
# on core vs data movement.
1841+
{
1842+
"M": 4096,
1843+
"N": 512,
1844+
"K": 512,
1845+
"use_ukernel": False,
1846+
"peano_opt_level": 3,
1847+
"outline": True,
1848+
"outline_to_empty_function": True,
1849+
"transpose_a": False,
1850+
"transpose_b": False,
1851+
"skip_numerics": True,
1852+
},
18381853
]
18391854

18401855
# Some bf16 Performance tests:
@@ -1851,15 +1866,29 @@ def __init__(self):
18511866
outlining_string = "--iree-amdaie-enable-function-outlining=" + str(
18521867
int(outline)
18531868
)
1869+
18541870
peano_opt_level_string = f'"-O{peano_opt_level}"'
18551871
aie_compilation_flags = [
18561872
outlining_string,
18571873
f"--iree-amd-aie-additional-peano-opt-flags={peano_opt_level_string}",
18581874
]
18591875

1876+
outline_to_empty_function = False
1877+
empty_key = "outline_to_empty_function"
1878+
if empty_key in test and test[empty_key] == True:
1879+
outline_to_empty_function = True
1880+
1881+
if outline_to_empty_function:
1882+
aie_compilation_flags.append(
1883+
"--iree-amdaie-replace-outlined-functions-with-empty"
1884+
)
1885+
18601886
name_suffix = "O" + str(peano_opt_level)
18611887
if outline:
1862-
name_suffix += "_outline"
1888+
if outline_to_empty_function:
1889+
name_suffix += "_outline_empty"
1890+
else:
1891+
name_suffix += "_outline"
18631892

18641893
if (transpose_a, transpose_b) == (False, False):
18651894
NumericTestClass = Matmul
@@ -1873,20 +1902,25 @@ def __init__(self):
18731902
else:
18741903
raise ValueError("Transposing both LHS and RHS is not supported.")
18751904

1876-
self.register(
1877-
NumericTestClass(
1878-
M,
1879-
N,
1880-
K,
1881-
"bf16",
1882-
"f32",
1883-
use_ukernel=use_ukernel,
1884-
n_repeats=2,
1885-
aie_compilation_flags=aie_compilation_flags,
1886-
name_suffix=name_suffix,
1887-
additional_labels=["PerformanceCorrectness"],
1905+
# This should only be the case for benchmark tests which we expect
1906+
# to not pass numerically.
1907+
if "skip_numerics" in test and test["skip_numerics"]:
1908+
pass
1909+
else:
1910+
self.register(
1911+
NumericTestClass(
1912+
M,
1913+
N,
1914+
K,
1915+
"bf16",
1916+
"f32",
1917+
use_ukernel=use_ukernel,
1918+
n_repeats=2,
1919+
aie_compilation_flags=aie_compilation_flags,
1920+
name_suffix=name_suffix,
1921+
additional_labels=["PerformanceCorrectness"],
1922+
)
18881923
)
1889-
)
18901924

18911925
self.register(
18921926
BenchmarkTestClass(

compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,13 +229,15 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend {
229229

230230
void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr,
231231
OpPassManager &passManager) override {
232+
232233
buildAMDAIETransformPassPipeline(
233234
passManager, options.AMDAIETargetDevice, options.AMDAIENumRows,
234235
options.AMDAIENumCols, options.useTilePipeline,
235236
options.useLowerToAIEPipeline, options.matmulElementwiseFusion,
236237
options.enableVectorizationPasses, options.pathToUkernels,
237238
options.enablePacketFlow, options.enableCoalescingLoops,
238239
options.enableCollapsingUnitDims, options.enableFunctionOutlining,
240+
options.replaceOutlinedFunctionsWithEmpty,
239241
options.insertLoopAroundCoreBlock);
240242
}
241243

compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ struct AMDAIEOptions {
5757
bool enableCoalescingLoops{false};
5858
bool enableCollapsingUnitDims{false};
5959
bool enableFunctionOutlining{true};
60+
bool replaceOutlinedFunctionsWithEmpty{false};
6061
bool insertLoopAroundCoreBlock{false};
6162
bool matmulElementwiseFusion{false};
6263
AMDAIEDevice AMDAIETargetDevice{AMDAIEDevice::npu1_4col};
@@ -195,6 +196,13 @@ struct AMDAIEOptions {
195196
llvm::cl::desc("Flag to enable/disable linalg-function-outlining pass."
196197
"It is intended for development purposes only."));
197198

199+
binder.opt<bool>(
200+
"iree-amdaie-replace-outlined-functions-with-empty",
201+
replaceOutlinedFunctionsWithEmpty, llvm::cl::cat(category),
202+
llvm::cl::desc(
203+
"Flag to enable/disable replacing outlined functions with "
204+
"empty functions. For development purposes only."));
205+
198206
binder.opt<bool>(
199207
"iree-amdaie-enable-infinite-loop-around-core-block",
200208
insertLoopAroundCoreBlock, llvm::cl::cat(category),

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,14 @@ LogicalResult assignChannels(AMDAIE::WorkgroupOp workgroupOp) {
7070
assert(tileToGeneratorMap.contains(tile) &&
7171
"no channel generator found for tile");
7272
std::optional<uint8_t> maybeChannel =
73-
tileToGeneratorMap[tile].getAndAssignProducerDMAChannel(isPacketFlow);
73+
tileToGeneratorMap[tile].getProducerDMAChannel();
7474
if (!maybeChannel) {
7575
return connectionOp.emitOpError()
7676
<< "no producer DMA channel available";
7777
}
78+
// Only assign the channel if it is for circuit flow.
79+
if (!isPacketFlow)
80+
tileToGeneratorMap[tile].assignProducerDMAChannel(maybeChannel.value());
7881
auto channelOp = rewriter.create<AMDAIE::ChannelOp>(
7982
rewriter.getUnknownLoc(), tile, maybeChannel.value(),
8083
StrmSwPortType::DMA, AMDAIE::DMAChannelDir::MM2S);
@@ -85,11 +88,14 @@ LogicalResult assignChannels(AMDAIE::WorkgroupOp workgroupOp) {
8588
assert(tileToGeneratorMap.contains(tile) &&
8689
"no channel generator found for tile");
8790
std::optional<uint8_t> maybeChannel =
88-
tileToGeneratorMap[tile].getAndAssignConsumerDMAChannel(isPacketFlow);
91+
tileToGeneratorMap[tile].getConsumerDMAChannel();
8992
if (!maybeChannel) {
9093
return connectionOp.emitOpError()
9194
<< "no consumer DMA channel available";
9295
}
96+
// Only assign the channel if it is for circuit flow.
97+
if (!isPacketFlow)
98+
tileToGeneratorMap[tile].assignConsumerDMAChannel(maybeChannel.value());
9399
auto channelOp = rewriter.create<AMDAIE::ChannelOp>(
94100
rewriter.getUnknownLoc(), tile, maybeChannel.value(),
95101
StrmSwPortType::DMA, AMDAIE::DMAChannelDir::S2MM);

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEGenerateColumnControlOverlay.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ LogicalResult generateColumnControlOverlay(AMDAIE::WorkgroupOp workgroupOp,
113113
// shared across multiple packet flows as needed.
114114
std::optional<uint8_t> maybeChannel =
115115
shimTileToGeneratorMap[shimTileOp.getResult()]
116-
.getAndAssignProducerDMAChannel(/*isPacketFlow*/ true);
116+
.getProducerDMAChannel();
117117
if (!maybeChannel) {
118118
shimTileOp.emitOpError() << "no producer DMA channel available";
119119
return WalkResult::interrupt();

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELinalgFunctionOutlining.cpp

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@ class AMDAIELinalgFunctionOutliningPass
8989
AMDAIELinalgFunctionOutliningPass> {
9090
public:
9191
AMDAIELinalgFunctionOutliningPass() = default;
92+
AMDAIELinalgFunctionOutliningPass(const AMDAIELinalgFunctionOutliningPass &) {
93+
}
94+
AMDAIELinalgFunctionOutliningPass(
95+
const AMDAIELinalgFunctionOutliningOptions &opts)
96+
: AMDAIELinalgFunctionOutliningBase(opts) {}
97+
9298
void getDependentDialects(DialectRegistry &registry) const override {
9399
registry.insert<linalg::LinalgDialect>();
94100
}
@@ -169,6 +175,7 @@ void AMDAIELinalgFunctionOutliningPass::runOnOperation() {
169175
if (failed(maybeFunc)) return WalkResult::interrupt();
170176
func::FuncOp func = maybeFunc.value();
171177

178+
172179
rewriter.setInsertionPoint(computeOp);
173180
rewriter.create<func::CallOp>(computeOp.getLoc(), func,
174181
computeOp->getOperands());
@@ -182,11 +189,28 @@ void AMDAIELinalgFunctionOutliningPass::runOnOperation() {
182189
op->dropAllUses();
183190
rewriter.eraseOp(op);
184191
}
192+
193+
// If the option is set to true, make the body of all outlined functions
194+
// empty, so that only the return remains. This option to 'do no compute'
195+
// is useful for benchmarking purposes.
196+
if (emptyFunctions) {
197+
for (auto &nameAndFuncOp : computeOpToOutlinedFuncMap) {
198+
Region &region = nameAndFuncOp.second.getBody();
199+
Block &block = region.front();
200+
uint64_t nOperations = block.getOperations().size();
201+
assert(nOperations > 0 && "expected terminator");
202+
for (uint64_t i = 0; i < nOperations - 1; ++i) {
203+
Operation *frontOp = &block.front();
204+
rewriter.eraseOp(frontOp);
205+
}
206+
}
207+
}
185208
}
186209

187210
} // namespace
188211

189-
std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass() {
190-
return std::make_unique<AMDAIELinalgFunctionOutliningPass>();
212+
std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass(
213+
AMDAIELinalgFunctionOutliningOptions options) {
214+
return std::make_unique<AMDAIELinalgFunctionOutliningPass>(options);
191215
}
192216
} // namespace mlir::iree_compiler::AMDAIE

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,19 @@ static LogicalResult setRootConfigForPackPeelPipeline(
377377
outerPermVec.push_back(2);
378378
}
379379
SmallVector<SmallVector<int64_t>> outerPerm = {outerPermVec, outerPermVec};
380+
if (isObjectFifo) {
381+
// Add outer permutation for unpack. NOTE: This currently fails for some
382+
// tests in the AIR pipeline.
383+
transposePackIndices.push_back(2);
384+
unpackEmpty.push_back(true);
385+
innerPerm.push_back({0, 1});
386+
if (isa<linalg::BatchMatmulOp>(linalgOp)) {
387+
outerPerm.push_back({0, 2, 1});
388+
} else {
389+
outerPerm.push_back({1, 0});
390+
}
391+
}
392+
380393
auto packingConfigLevel0Attr = getPackingConfigPackingLevelAttr(
381394
context, packedSizesL0, transposePackIndices, unpackEmpty, innerPerm,
382395
outerPerm);

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -497,7 +497,7 @@ void buildAMDAIETransformPassPipeline(
497497
bool enableVectorizationPasses, const std::string &pathToUkernels,
498498
bool enablePacketFlow, bool enableCoalescingLoops,
499499
bool enableCollapsingUnitDims, bool enableFunctionOutlining,
500-
bool insertLoopAroundCoreBlock) {
500+
bool replaceOutlinedFunctionsWithEmpty, bool insertLoopAroundCoreBlock) {
501501
OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
502502
{
503503
FunctionLikeNest funcPassManager(modulePassManager);
@@ -529,7 +529,7 @@ void buildAMDAIETransformPassPipeline(
529529
modulePassManager, enablePacketFlow, useTilePipeline,
530530
enableVectorizationPasses, enableCoalescingLoops,
531531
enableCollapsingUnitDims, enableFunctionOutlining,
532-
insertLoopAroundCoreBlock, numCols);
532+
replaceOutlinedFunctionsWithEmpty, insertLoopAroundCoreBlock, numCols);
533533
} else if (useLowerToAIEPipeline == LowerToAIEPassPipeline::AIR) {
534534
addMLIRAIRLoweringPasses(modulePassManager, device, useTilePipeline,
535535
matmulElementwiseFusion,
@@ -553,8 +553,8 @@ void addAMDAIEObjectFifoLoweringPasses(
553553
OpPassManager &passManager, bool enablePacketFlow,
554554
TilePassPipeline useTilePipeline, bool enableVectorizationPasses,
555555
bool enableCoalescingLoops, bool enableCollapsingUnitDims,
556-
bool enableFunctionOutlining, bool insertLoopAroundCoreBlock,
557-
uint32_t numCols) {
556+
bool enableFunctionOutlining, bool replaceOutlinedFunctionsWithEmpty,
557+
bool insertLoopAroundCoreBlock, uint32_t numCols) {
558558
passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
559559
passManager.addPass(memref::createFoldMemRefAliasOpsPass());
560560

@@ -579,8 +579,19 @@ void addAMDAIEObjectFifoLoweringPasses(
579579

580580
passManager.addPass(createAMDAIENormalizeLoopBoundsPass());
581581
passManager.addPass(createAMDAIEInsertCoresPass());
582-
if (enableFunctionOutlining)
583-
passManager.addPass(createAMDAIELinalgFunctionOutliningPass());
582+
583+
if (enableFunctionOutlining) {
584+
// Set up the function-outlining pass options before adding the pass.
585+
AMDAIELinalgFunctionOutliningOptions options;
586+
if (replaceOutlinedFunctionsWithEmpty) {
587+
options.emptyFunctions = true;
588+
}
589+
passManager.addPass(createAMDAIELinalgFunctionOutliningPass(options));
590+
} else {
591+
assert(!replaceOutlinedFunctionsWithEmpty &&
592+
"`replaceOutlinedFunctionsWithEmpty` is only valid when "
593+
"`enableFunctionOutlining` is true.");
594+
}
584595

585596
{
586597
// Vectorization passes

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ void addAMDAIEObjectFifoLoweringPasses(
1919
OpPassManager &passManager, bool enablePacketFlow,
2020
TilePassPipeline useTilePipeline, bool enableVectorizationPasses,
2121
bool enableCoalescingLoops, bool enableCollapsingUnitDims,
22-
bool enableFunctionOutlining, bool insertLoopAroundCoreBlock,
23-
uint32_t numCols);
22+
bool enableFunctionOutlining, bool replaceOutlinedFunctionsWithEmpty,
23+
bool insertLoopAroundCoreBlock, uint32_t numCols);
2424

2525
/// Add passes to lower from MLIR-AIR through AIE. This is
2626
/// currently the default passes used for lowering after IREEs tiling.
@@ -43,7 +43,7 @@ void buildAMDAIETransformPassPipeline(
4343
bool enableVectorizationPasses, const std::string &pathToUkernels,
4444
bool enablePacketFlow, bool enableCoalescingLoops,
4545
bool enableCollapsingUnitDims, bool enableFunctionOutlining,
46-
bool insertLoopAroundCoreBlock);
46+
bool replaceOutlinedFunctionsWithEmpty, bool insertLoopAroundCoreBlock);
4747

4848
/// Populates passes needed to lower the IR via a Pack-Peel based approach.
4949
void addPackPeelBasedPassPipeline(OpPassManager &oassManager,
@@ -185,7 +185,8 @@ std::unique_ptr<Pass> createAMDAIEDmaToCircularDmaPass();
185185
std::unique_ptr<Pass> createAMDAIEFlattenLogicalObjectFifoPass();
186186

187187
/// Create a pass for function outlining.
188-
std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass();
188+
std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass(
189+
AMDAIELinalgFunctionOutliningOptions = {});
189190

190191
/// Create a pass to fuse the consumer op into the innermost last scf loop.
191192
std::unique_ptr<Pass> createAMDAIEFuseConsumerIntoLoopPass(

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,14 @@ def AMDAIELinalgFunctionOutlining :
284284
repeated codes.
285285
}];
286286
let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELinalgFunctionOutliningPass()";
287+
let options = [
288+
Option<"emptyFunctions", "empty-functions", "bool", /*default=*/"false",
289+
"A developer only option that results in incorrect numerics. "
290+
"Replace all outlined functions with a function that does nothing, "
291+
"i.e. it just returns. Useful for measuring the performance of data "
292+
"movement to/from the device -- by doing zero compute, all time is spent "
293+
"moving data to/from the AIE cores.">
294+
];
287295
}
288296

289297
def AMDAIEFoldDmaWaits :

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ iree_lit_test_suite(
4444
"fold_dma_waits.mlir"
4545
"flatten_logical_objectfifo.mlir"
4646
"linalg_function_outlining.mlir"
47+
"linalg_function_outlining_to_empty.mlir"
4748
"fuse_consumer_into_loop.mlir"
4849
"fuse_fill_into_forall.mlir"
4950
"fuse_pack_into_loop.mlir"

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/linalg_function_outlining.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// RUN: iree-opt --split-input-file --iree-amdaie-linalg-function-outlining --verify-diagnostics --split-input-file %s | FileCheck %s
22

3-
// Test demonstrating multiple Matmul using different SSAs.
3+
// Test demonstrating multiple matmuls using different SSAs.
44

55
// CHECK-LABEL: func.func private @generic_matmul_0_outlined
66
// CHECK-SAME: (%[[LHS:.*]]: memref<4x8xbf16>,
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-linalg-function-outlining{empty-functions=true})" --verify-diagnostics --split-input-file %s | FileCheck %s --check-prefix=EMPTY
2+
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-linalg-function-outlining{empty-functions=false})" --verify-diagnostics --split-input-file %s | FileCheck %s --check-prefix=NOT_EMPTY
3+
4+
func.func @reduction(%A: memref<4xbf16>, %B: memref<bf16>) {
5+
%c2 = arith.constant 2 : index
6+
%tile = amdaie.tile(%c2, %c2)
7+
%1 = amdaie.core(%tile, in : [], out : []) {
8+
linalg.generic {
9+
indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>],
10+
iterator_types = ["reduction"]
11+
} ins(%A: memref<4xbf16>) outs(%B : memref<bf16>) {
12+
^bb0(%in: bf16, %out: bf16):
13+
linalg.yield %in : bf16
14+
}
15+
amdaie.end
16+
}
17+
return
18+
}
19+
20+
// In the (default) case where empty-functions is false, outlining works as usual.
21+
// NOT_EMPTY: func.func private
22+
// NOT_EMPTY: linalg.generic
23+
// NOT_EMPTY: return
24+
25+
// When empty-functions=true, the outlined function shouldn't contain compute.
26+
// EMPTY: func.func private
27+
// EMPTY-NOT: linalg.generic
28+
// EMPTY: return

0 commit comments

Comments
 (0)