Skip to content

Commit

Permalink
[Benchmarking option] Make outlined functions do no compute (#1010)
Browse files Browse the repository at this point in the history
This PR adds an experiment that checks how much performance loss is due
to compute vs data movement. It adds an option to elide/omit the
computation of the outlined function entirely, basically replacing the
matmul with a noop. This gives us a lower bound answer to the question:
how fast would we be if the kernel running on the AIE core was 100%
efficient? Scraping the result from CI (see the copy-and-pasted output
below) we see that replacing the matmul with a noop takes the run time
from 1932 us to 1859 us -- a 4% speedup. So the actual speed-up we'd get
from a 100% efficient kernel is in the range 0% - 4%, i.e. not very much.

With matmul:
```
matmul_benchmark_4096_512_512_bf16_f32_O3_outline.mlir
--------------------------------------------------------------------------------------------------
Benchmark                                        Time             CPU   Iterations UserCounters...
--------------------------------------------------------------------------------------------------
BM_matmul/process_time/real_time_mean         1932 us         94.5 us            5 items_per_second=517.698/s
BM_matmul/process_time/real_time_median       1933 us         91.1 us            5 items_per_second=517.365/s
BM_matmul/process_time/real_time_stddev       7.62 us         7.53 us            5 items_per_second=2.04055/s
--------------------------------------------------------------------------------------------------
The largest program memory size (read from byte 72 of elf files) is 8928 bytes
```


With noop replacing matmul:
```
matmul_benchmark_4096_512_512_bf16_f32_O3_outline_empty.mlir
--------------------------------------------------------------------------------------------------
Benchmark                                        Time             CPU   Iterations UserCounters...
--------------------------------------------------------------------------------------------------
BM_matmul/process_time/real_time_mean         1859 us         90.4 us            5 items_per_second=537.837/s
BM_matmul/process_time/real_time_median       1854 us         86.6 us            5 items_per_second=539.255/s
BM_matmul/process_time/real_time_stddev       19.6 us         13.9 us            5 items_per_second=5.60814/s
--------------------------------------------------------------------------------------------------
The largest program memory size (read from byte 72 of elf files) is 2816 bytes
```

---------

Co-authored-by: Jorn Tuyls <jtuyls@users.noreply.github.com>
  • Loading branch information
newling and jtuyls authored Jan 8, 2025
1 parent a7f0bef commit 756cac1
Show file tree
Hide file tree
Showing 10 changed files with 144 additions and 27 deletions.
62 changes: 48 additions & 14 deletions build_tools/ci/cpu_comparison/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -1835,6 +1835,21 @@ def __init__(self):
"transpose_a": True,
"transpose_b": False,
},
# Test where the compute is omitted. This should help triangulate
# how much performance gain can be obtained from a better matmul
# on core, as opposed to from better data movement.
{
"M": 4096,
"N": 512,
"K": 512,
"use_ukernel": False,
"peano_opt_level": 3,
"outline": True,
"outline_to_empty_function": True,
"transpose_a": False,
"transpose_b": False,
"skip_numerics": True,
},
]

# Some bf16 Performance tests:
Expand All @@ -1851,15 +1866,29 @@ def __init__(self):
outlining_string = "--iree-amdaie-enable-function-outlining=" + str(
int(outline)
)

peano_opt_level_string = f'"-O{peano_opt_level}"'
aie_compilation_flags = [
outlining_string,
f"--iree-amd-aie-additional-peano-opt-flags={peano_opt_level_string}",
]

outline_to_empty_function = False
empty_key = "outline_to_empty_function"
if empty_key in test and test[empty_key] == True:
outline_to_empty_function = True

if outline_to_empty_function:
aie_compilation_flags.append(
"--iree-amdaie-replace-outlined-functions-with-empty"
)

name_suffix = "O" + str(peano_opt_level)
if outline:
name_suffix += "_outline"
if outline_to_empty_function:
name_suffix += "_outline_empty"
else:
name_suffix += "_outline"

if (transpose_a, transpose_b) == (False, False):
NumericTestClass = Matmul
Expand All @@ -1873,20 +1902,25 @@ def __init__(self):
else:
raise ValueError("Transposing both LHS and RHS is not supported.")

self.register(
NumericTestClass(
M,
N,
K,
"bf16",
"f32",
use_ukernel=use_ukernel,
n_repeats=2,
aie_compilation_flags=aie_compilation_flags,
name_suffix=name_suffix,
additional_labels=["PerformanceCorrectness"],
# Skip registering the numerical test when "skip_numerics" is set. This
# should only be the case for benchmark tests which we expect to not
# pass numerically.
if "skip_numerics" in test and test["skip_numerics"]:
pass
else:
self.register(
NumericTestClass(
M,
N,
K,
"bf16",
"f32",
use_ukernel=use_ukernel,
n_repeats=2,
aie_compilation_flags=aie_compilation_flags,
name_suffix=name_suffix,
additional_labels=["PerformanceCorrectness"],
)
)
)

self.register(
BenchmarkTestClass(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -229,13 +229,15 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend {

// Builds the translation pass pipeline by forwarding `passManager` and the
// fields of `options` to buildAMDAIETransformPassPipeline. The
// ExecutableTargetAttr parameter is unnamed and not used here.
void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr,
OpPassManager &passManager) override {

// The arguments below are passed positionally (several adjacent ones are
// plain option fields), so their order must stay in sync with the
// parameter order of buildAMDAIETransformPassPipeline.
buildAMDAIETransformPassPipeline(
passManager, options.AMDAIETargetDevice, options.AMDAIENumRows,
options.AMDAIENumCols, options.useTilePipeline,
options.useLowerToAIEPipeline, options.matmulElementwiseFusion,
options.enableVectorizationPasses, options.pathToUkernels,
options.enablePacketFlow, options.enableCoalescingLoops,
options.enableCollapsingUnitDims, options.enableFunctionOutlining,
options.replaceOutlinedFunctionsWithEmpty,
options.insertLoopAroundCoreBlock);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ struct AMDAIEOptions {
bool enableCoalescingLoops{false};
bool enableCollapsingUnitDims{false};
bool enableFunctionOutlining{true};
bool replaceOutlinedFunctionsWithEmpty{false};
bool insertLoopAroundCoreBlock{false};
bool matmulElementwiseFusion{false};
AMDAIEDevice AMDAIETargetDevice{AMDAIEDevice::npu1_4col};
Expand Down Expand Up @@ -195,6 +196,13 @@ struct AMDAIEOptions {
llvm::cl::desc("Flag to enable/disable linalg-function-outlining pass."
"It is intended for development purposes only."));

binder.opt<bool>(
"iree-amdaie-replace-outlined-functions-with-empty",
replaceOutlinedFunctionsWithEmpty, llvm::cl::cat(category),
llvm::cl::desc(
"Flag to enable/disable replacing outlined functions with "
"empty functions. For development purposes only."));

binder.opt<bool>(
"iree-amdaie-enable-infinite-loop-around-core-block",
insertLoopAroundCoreBlock, llvm::cl::cat(category),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ class AMDAIELinalgFunctionOutliningPass
AMDAIELinalgFunctionOutliningPass> {
public:
AMDAIELinalgFunctionOutliningPass() = default;
// Copy constructor with an intentionally empty body and no initializer list:
// nothing is copied from the source pass, and the base class is
// default-constructed.
// NOTE(review): presumably provided so the pass stays copyable (e.g. for
// cloning) now that another constructor is declared — confirm.
AMDAIELinalgFunctionOutliningPass(const AMDAIELinalgFunctionOutliningPass &) {
}
// Constructs the pass from an options struct by forwarding `opts` to the
// AMDAIELinalgFunctionOutliningBase constructor.
AMDAIELinalgFunctionOutliningPass(
const AMDAIELinalgFunctionOutliningOptions &opts)
: AMDAIELinalgFunctionOutliningBase(opts) {}

// Registers the linalg dialect in `registry` as a dependency of this pass.
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<linalg::LinalgDialect>();
}
Expand Down Expand Up @@ -169,6 +175,7 @@ void AMDAIELinalgFunctionOutliningPass::runOnOperation() {
if (failed(maybeFunc)) return WalkResult::interrupt();
func::FuncOp func = maybeFunc.value();


rewriter.setInsertionPoint(computeOp);
rewriter.create<func::CallOp>(computeOp.getLoc(), func,
computeOp->getOperands());
Expand All @@ -182,11 +189,28 @@ void AMDAIELinalgFunctionOutliningPass::runOnOperation() {
op->dropAllUses();
rewriter.eraseOp(op);
}

// If the option is set to true, make the body of all outlined functions
// empty, so that only the return remains. This option to 'do no compute'
// is useful for benchmarking purposes.
if (emptyFunctions) {
for (auto &nameAndFuncOp : computeOpToOutlinedFuncMap) {
Region &region = nameAndFuncOp.second.getBody();
Block &block = region.front();
uint64_t nOperations = block.getOperations().size();
assert(nOperations > 0 && "expected terminator");
for (uint64_t i = 0; i < nOperations - 1; ++i) {
Operation *frontOp = &block.front();
rewriter.eraseOp(frontOp);
}
}
}
}

} // namespace

std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass() {
return std::make_unique<AMDAIELinalgFunctionOutliningPass>();
// Factory for the linalg function outlining pass. `options` is passed on to
// the pass constructor; the returned pointer owns the newly created pass.
std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass(
AMDAIELinalgFunctionOutliningOptions options) {
return std::make_unique<AMDAIELinalgFunctionOutliningPass>(options);
}
} // namespace mlir::iree_compiler::AMDAIE
23 changes: 17 additions & 6 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ void buildAMDAIETransformPassPipeline(
bool enableVectorizationPasses, const std::string &pathToUkernels,
bool enablePacketFlow, bool enableCoalescingLoops,
bool enableCollapsingUnitDims, bool enableFunctionOutlining,
bool insertLoopAroundCoreBlock) {
bool replaceOutlinedFunctionsWithEmpty, bool insertLoopAroundCoreBlock) {
OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
{
FunctionLikeNest funcPassManager(modulePassManager);
Expand Down Expand Up @@ -529,7 +529,7 @@ void buildAMDAIETransformPassPipeline(
modulePassManager, enablePacketFlow, useTilePipeline,
enableVectorizationPasses, enableCoalescingLoops,
enableCollapsingUnitDims, enableFunctionOutlining,
insertLoopAroundCoreBlock, numCols);
replaceOutlinedFunctionsWithEmpty, insertLoopAroundCoreBlock, numCols);
} else if (useLowerToAIEPipeline == LowerToAIEPassPipeline::AIR) {
addMLIRAIRLoweringPasses(modulePassManager, device, useTilePipeline,
matmulElementwiseFusion,
Expand All @@ -553,8 +553,8 @@ void addAMDAIEObjectFifoLoweringPasses(
OpPassManager &passManager, bool enablePacketFlow,
TilePassPipeline useTilePipeline, bool enableVectorizationPasses,
bool enableCoalescingLoops, bool enableCollapsingUnitDims,
bool enableFunctionOutlining, bool insertLoopAroundCoreBlock,
uint32_t numCols) {
bool enableFunctionOutlining, bool replaceOutlinedFunctionsWithEmpty,
bool insertLoopAroundCoreBlock, uint32_t numCols) {
passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
passManager.addPass(memref::createFoldMemRefAliasOpsPass());

Expand All @@ -579,8 +579,19 @@ void addAMDAIEObjectFifoLoweringPasses(

passManager.addPass(createAMDAIENormalizeLoopBoundsPass());
passManager.addPass(createAMDAIEInsertCoresPass());
if (enableFunctionOutlining)
passManager.addPass(createAMDAIELinalgFunctionOutliningPass());

if (enableFunctionOutlining) {
// Build the options for the function outlining pass, forwarding the
// 'replace outlined functions with empty' setting to it.
AMDAIELinalgFunctionOutliningOptions options;
if (replaceOutlinedFunctionsWithEmpty) {
options.emptyFunctions = true;
}
passManager.addPass(createAMDAIELinalgFunctionOutliningPass(options));
} else {
assert(!replaceOutlinedFunctionsWithEmpty &&
"`replaceOutlinedFunctionsWithEmpty` is only valid when "
"`enableFunctionOutlining` is true.");
}

{
// Vectorization passes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ void addAMDAIEObjectFifoLoweringPasses(
OpPassManager &passManager, bool enablePacketFlow,
TilePassPipeline useTilePipeline, bool enableVectorizationPasses,
bool enableCoalescingLoops, bool enableCollapsingUnitDims,
bool enableFunctionOutlining, bool insertLoopAroundCoreBlock,
uint32_t numCols);
bool enableFunctionOutlining, bool replaceOutlinedFunctionsWithEmpty,
bool insertLoopAroundCoreBlock, uint32_t numCols);

/// Add passes to lower from MLIR-AIR through AIE. This is
/// currently the default passes used for lowering after IREEs tiling.
Expand All @@ -43,7 +43,7 @@ void buildAMDAIETransformPassPipeline(
bool enableVectorizationPasses, const std::string &pathToUkernels,
bool enablePacketFlow, bool enableCoalescingLoops,
bool enableCollapsingUnitDims, bool enableFunctionOutlining,
bool insertLoopAroundCoreBlock);
bool replaceOutlinedFunctionsWithEmpty, bool insertLoopAroundCoreBlock);

/// Populates passes needed to lower the IR via a Pack-Peel based approach.
void addPackPeelBasedPassPipeline(OpPassManager &oassManager,
Expand Down Expand Up @@ -185,7 +185,8 @@ std::unique_ptr<Pass> createAMDAIEDmaToCircularDmaPass();
std::unique_ptr<Pass> createAMDAIEFlattenLogicalObjectFifoPass();

/// Create a pass for function outlining.
std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass();
std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass(
AMDAIELinalgFunctionOutliningOptions = {});

/// Create a pass to fuse the consumer op into the innermost last scf loop.
std::unique_ptr<Pass> createAMDAIEFuseConsumerIntoLoopPass(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,14 @@ def AMDAIELinalgFunctionOutlining :
repeated codes.
}];
let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELinalgFunctionOutliningPass()";
let options = [
Option<"emptyFunctions", "empty-functions", "bool", /*default=*/"false",
"A developer only option that results in incorrect numerics. "
"Replace all outlined functions with a function that does nothing, "
"i.e. it just returns. Useful for measuring the performance of data "
"movement to/from the device -- by doing zero compute, all time is spent "
"moving data to/from the AIE cores.">
];
}

def AMDAIEFoldDmaWaits :
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ iree_lit_test_suite(
"fold_dma_waits.mlir"
"flatten_logical_objectfifo.mlir"
"linalg_function_outlining.mlir"
"linalg_function_outlining_to_empty.mlir"
"fuse_consumer_into_loop.mlir"
"fuse_fill_into_forall.mlir"
"fuse_pack_into_loop.mlir"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: iree-opt --split-input-file --iree-amdaie-linalg-function-outlining --verify-diagnostics --split-input-file %s | FileCheck %s

// Test demonstrating multiple Matmul using different SSAs.
// Test demonstrating multiple matmuls using different SSAs.

// CHECK-LABEL: func.func private @generic_matmul_0_outlined
// CHECK-SAME: (%[[LHS:.*]]: memref<4x8xbf16>,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-linalg-function-outlining{empty-functions=true})" --verify-diagnostics --split-input-file %s | FileCheck %s --check-prefix=EMPTY
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-linalg-function-outlining{empty-functions=false})" --verify-diagnostics --split-input-file %s | FileCheck %s --check-prefix=NOT_EMPTY

// Test input: a function whose amdaie.core body holds a single linalg.generic
// with a "reduction" iterator that reads %A and writes %B. This is the linalg
// op that the checks below look for (or expect to be absent) in the outlined
// function.
func.func @reduction(%A: memref<4xbf16>, %B: memref<bf16>) {
%c2 = arith.constant 2 : index
%tile = amdaie.tile(%c2, %c2)
%1 = amdaie.core(%tile, in : [], out : []) {
linalg.generic {
indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>],
iterator_types = ["reduction"]
} ins(%A: memref<4xbf16>) outs(%B : memref<bf16>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
amdaie.end
}
return
}

// The (default) case where empty-functions is false, outlining works as usual.
// NOT_EMPTY: func.func private
// NOT_EMPTY: linalg.generic
// NOT_EMPTY: return

// When empty-functions=true, the outlined function shouldn't contain compute.
// EMPTY: func.func private
// EMPTY-NOT: linalg.generic
// EMPTY: return

0 comments on commit 756cac1

Please sign in to comment.