Commit ef2efb7: squash commit

newling committed Aug 18, 2024
1 parent 9109a1f
Showing 12 changed files with 116 additions and 202 deletions.
57 changes: 30 additions & 27 deletions compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp
@@ -10,7 +10,6 @@
#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Format.h"
#include "mlir/IR/AsmState.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
@@ -512,33 +511,37 @@ struct AMDAIEDmaToNpuPass : mlir::OperationPass<DeviceOp> {

instructions[2] = count;
instructions[3] = instructions.size() * sizeof(uint32_t);
ArrayRef<uint32_t> instsArrRef(instructions.data(), instructions.size());
device->setAttr(
"npu_instructions",
DenseUI32ResourceElementsAttr::get(
RankedTensorType::get(
instsArrRef.size(),
IntegerType::get(&getContext(), 32, IntegerType::Unsigned)),
"npu_instructions",
HeapAsmResourceBlob::allocateAndCopyInferAlign(instsArrRef)));
// The LX instructions for the entry point function are already generated by
// the pass, hence we can safely delete the function as it is of no use to
// us. A reason to do this is that otherwise it is unnecessarily lowered to
// LLVM, where it has a chance to crash in case the argument list is not
// lowerable, for reasons such as memrefs with dynamic offsets.
auto symName = dyn_cast_or_null<StringAttr>(device->getAttr("sym_name"));

// Note on use of 'DenseIntElementsAttr' below:
// We cannot use DenseUI32ResourceElementsAttr because it does not survive
// serialization and deserialization, and we (currently) serialize the module
// before running aie2xclbin.
{
ArrayRef<uint32_t> instsArrRef(instructions.data(), instructions.size());
ShapedType type =
RankedTensorType::get({static_cast<int64_t>(instsArrRef.size())},
IntegerType::get(&getContext(), 32));
auto instsAttr = DenseIntElementsAttr::get(type, instsArrRef);
device->setAttr("npu_instructions", instsAttr);
}

SmallVector<RuntimeSequenceOp> seqOps;
device->walk([&](RuntimeSequenceOp seqOp) {
// If the deviceOp has a symbol name attached to it, we look for the
// sequence op that partially matches that symbol; if not, we collect all
// sequenceOps.
if (!symName ||
symName.str().find(seqOp.getSymName()->str()) != std::string::npos)
seqOps.push_back(seqOp);
});
// If exactly one entry point function is found we can delete it. For any
// other result we do not make any change.
if (seqOps.size() == 1) seqOps[0].erase();
device->walk([&](RuntimeSequenceOp seqOp) { seqOps.push_back(seqOp); });

if (seqOps.size() > 1) {
device->emitOpError("has ")
<< seqOps.size()
<< " aiex.runtime_sequence ops. Expected no more than 1.";
signalPassFailure();
}

if (seqOps.size() == 1) {
auto seqOp = seqOps[0];
StringRef name = seqOp.getSymName().value();
device->setAttr("runtime_sequence_name",
StringAttr::get(&getContext(), name));
seqOp.erase();
}
}
};
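
Aside (not part of this commit's diff): a minimal sketch of reading the npu_instructions attribute back after the module has been serialized and re-parsed, which is the round-trip property motivating the switch to DenseIntElementsAttr above. The helper name readNpuInstructions is hypothetical; the reader this commit actually adds, getNpuInstructions in XCLBinGen.cpp below, follows the same pattern.

// Hypothetical reader, assuming npu_instructions is a DenseIntElementsAttr
// of 32-bit integers as set by the AMDAIEDmaToNpu pass above.
static mlir::FailureOr<llvm::SmallVector<uint32_t>> readNpuInstructions(
    mlir::Operation *device) {
  auto attr =
      device->getAttrOfType<mlir::DenseIntElementsAttr>("npu_instructions");
  if (!attr) return mlir::failure();
  llvm::SmallVector<uint32_t> words;
  words.reserve(attr.getNumElements());
  for (llvm::APInt v : attr.getValues<llvm::APInt>())
    words.push_back(static_cast<uint32_t>(v.getZExtValue()));
  return words;
}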

19 changes: 6 additions & 13 deletions compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir
@@ -1,19 +1,12 @@
// RUN: iree-opt --amdaie-dma-to-npu %s | FileCheck %s

// CHECK-LABEL: aie.device(npu1_4col) {
// CHECK: memref.global "public" @of_toMem : memref<32xi32>
// CHECK: memref.global "public" @of_fromMem : memref<32xi32>
// CHECK: aie.shim_dma_allocation @of_fromMem(MM2S, 0, 0)
// CHECK: aie.shim_dma_allocation @of_toMem(S2MM, 0, 0)
// CHECK: } {npu_instructions = dense_resource<npu_instructions> : tensor<64xui32>}

// CHECK: {-#
// CHECK: dialect_resources: {
// CHECK: builtin: {
// CHECK: npu_instructions: "0x0400000000010306050100000600000000010000010000000000000020D0010030000000200000000000000000000000000000000000008000000000000000000000000281000000300000000000000000000000000000000000000024D001000000000002000000000000000000000000000000000000000000000004D20100000000000100008018000000010000000000000000D001003000000020000000800000000000000000008000070020800F000000000000000000000281000000300000000000000000000000000000000000000004D001000000000000000000000000008000000000000000000000000000000014D20100000000000000000018000000"
// CHECK: }
// CHECK: }
// CHECK: #-}
// CHECK-LABEL: aie.device(npu1_4col) {
// CHECK: memref.global "public" @of_toMem : memref<32xi32>
// CHECK: memref.global "public" @of_fromMem : memref<32xi32>
// CHECK: aie.shim_dma_allocation @of_fromMem(MM2S, 0, 0)
// CHECK: aie.shim_dma_allocation @of_toMem(S2MM, 0, 0)
// CHECK: } {npu_instructions = dense<[100860160, 261, 6, 256, 1, 0, 118816, 48, 32, 0, 0, 0, -2147483648, 0, 0, 33554432, 129, 48, 0, 0, 0, 0, 118820, 0, 2, 0, 0, 0, 0, 0, 119300, 0, -2147483647, 24, 1, 0, 118784, 48, 32, 128, 0, 8388608, -2145386489, 15, 0, 33554432, 129, 48, 0, 0, 0, 0, 118788, 0, 0, 0, 128, 0, 0, 0, 119316, 0, 0, 24]> : tensor<64xi32>, runtime_sequence_name = "sequence"}


module {
23 changes: 0 additions & 23 deletions compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir
@@ -77,26 +77,3 @@ module {
} {sym_name = "explicit_sym_name_0"}
}

// -----

// CHECK-LABEL: aie.device(npu1_4col) {
// CHECK: memref.global "public" @toMem : memref<16xi32>
// CHECK: func.func @pretend_microkernel
// CHECK: aiex.runtime_sequence @explicit_sym_name
// CHECK: aie.shim_dma_allocation @toMem(MM2S, 1, 1)

module {
aie.device(npu1_4col) {
memref.global "public" @toMem : memref<16xi32>
func.func @pretend_microkernel(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
return
}

aiex.runtime_sequence @explicit_sym_name(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
aiex.npu.dma_wait {symbol = @toMem}
}
aie.shim_dma_allocation @toMem (MM2S, 1, 1)
} {sym_name = "wrong_sym_name"}
}

13 changes: 4 additions & 9 deletions compiler/plugins/target/AMD-AIE/aie/test/push_to_queue.mlir
@@ -3,16 +3,11 @@

// CHECK: module {
// CHECK: aie.device(npu1_4col) {
// CHECK: } {npu_instructions = dense_resource<npu_instructions> : tensor<16xui32>}
// CHECK: }

// CHECK: {-#
// CHECK: dialect_resources: {
// CHECK: builtin: {
// CHECK: npu_instructions: "0x040000000001030605010000020000004000000000000000000000000CD20100000000000300008018000000000000000000000014D20104000000000200030018000000"
// CHECK: func.func @sequence() {
// CHECK: return
// CHECK: }
// CHECK: }
// CHECK: #-}
// CHECK: } {npu_instructions = dense<[100860160, 261, 2, 64, 0, 0, 119308, 0, -2147483645, 24, 0, 0, 67228180, 0, 196610, 24]> : tensor<16xi32>}
// CHECK: }

module {
aie.device(npu1_4col) {
60 changes: 24 additions & 36 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
@@ -16,7 +16,6 @@
#include "aievec/XLLVMDialect.h"
#include "air/Dialect/AIR/AIRDialect.h"
#include "air/Dialect/AIRRt/AIRRtDialect.h"
#include "iree-amd-aie/IR/AMDAIEAttrs.h"
#include "iree-amd-aie/IR/AMDAIEDialect.h"
#include "iree-amd-aie/Transforms/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
@@ -28,8 +27,12 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/ToolOutputFile.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/Passes.h"
#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h"
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/EmitC/IR/EmitC.h"
#include "mlir/Dialect/Func/Extensions/AllExtensions.h"
@@ -75,42 +78,27 @@ static llvm::cl::opt<std::string> clEnableAMDAIEUkernels(
"unprefixed microkernels to enable, e.g. `matmul`."),
llvm::cl::init("none"));

// Utility to find aie.device Op corresponding to the export Op.
// For example, we have
// hal.executable.variant {
// hal.executable.export symbol1
// hal.executable.export symbol2
// module {
// aie.device {
// ...
// aiex.runtime_sequence symbol1
// }
// aie.device {
// ...
// aiex.runtime_sequence symbol2
// }
// }
// }
// Hence we need to find the aiex.runtime_sequence that corresponds to the export
// op symbol and return its parent aie.device Op. This is what we will pass to
// the `aie2xclbin` tool for artifact generation per entry point.
static xilinx::AIE::DeviceOp getDeviceOpFromEntryPoint(ModuleOp moduleOp,
StringRef exportOpName) {
static xilinx::AIE::DeviceOp getDeviceOpWithName(ModuleOp moduleOp,
StringRef targetName) {
xilinx::AIE::DeviceOp deviceOp;

moduleOp.walk([&](xilinx::AIEX::RuntimeSequenceOp sequenceOp) {
if (sequenceOp.getSymName() == exportOpName) {
deviceOp =
dyn_cast_or_null<xilinx::AIE::DeviceOp>(sequenceOp->getParentOp());
return WalkResult::interrupt();
}
return WalkResult::advance();
uint32_t nDeviceOpsVisited = 0;
moduleOp.walk([&](xilinx::AIE::DeviceOp d) {
++nDeviceOpsVisited;
// This attribute should've been set in the dma-to-npu pass.
auto maybeName = d->getAttrOfType<StringAttr>("runtime_sequence_name");
if (!maybeName) return WalkResult::advance();
auto name = maybeName.getValue();
if (name != targetName) return WalkResult::advance();
deviceOp = d;
return WalkResult::interrupt();
});
if (!deviceOp) {
moduleOp.emitError()
<< "failed to find aie.device containing func.func with symbol "
<< exportOpName;
}

if (!deviceOp)
moduleOp.emitError() << "visited " << nDeviceOpsVisited
<< " aie.device ops, and failed to find one with name "
<< targetName;

return deviceOp;
}

@@ -291,7 +279,7 @@ LogicalResult AIETargetBackend::serializeExecutable(
}

StringRef exportOpName = exportOp.getSymName();
deviceOps.push_back(getDeviceOpFromEntryPoint(moduleOp, exportOpName));
deviceOps.push_back(getDeviceOpWithName(moduleOp, exportOpName));

// The xclbin kernel name, appended with instance name suffix (`:MLIRAIEV1`,
// 10 chars) is required by the xclbinutil to have a length smaller or equal
44 changes: 33 additions & 11 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp
@@ -1101,6 +1101,35 @@ static LogicalResult generateUnifiedObject(
return success();
}

FailureOr<SmallVector<uint32_t>> getNpuInstructions(ModuleOp moduleOp) {
auto ctx = moduleOp.getContext();
SmallVector<uint32_t> npuInstructions;
auto deviceOps = moduleOp.getOps<AIE::DeviceOp>();
auto nDeviceOps = std::distance(deviceOps.begin(), deviceOps.end());
if (nDeviceOps != 1)
return emitError(UnknownLoc::get(ctx),
"Expected exactly one AIE::DeviceOp in the module");

auto deviceOp = *deviceOps.begin();
auto maybeNpuInstructions = deviceOp->getAttr("npu_instructions");
if (!maybeNpuInstructions)
return emitError(UnknownLoc::get(ctx),
"Expected npu_instructions attribute on aie.device. ");

auto npuInstructionsAttr =
dyn_cast<DenseIntElementsAttr>(maybeNpuInstructions);

if (!npuInstructionsAttr) {
return emitError(UnknownLoc::get(ctx),
"failed to cast to DenseUI32ResourceElementsAttr");
}

for (auto c : npuInstructionsAttr) {
npuInstructions.push_back(c.getSExtValue());
}
return npuInstructions;
}

LogicalResult aie2xclbin(
MLIRContext *ctx, ModuleOp moduleOp, const std::string &outputNPU,
const std::string &outputXCLBin, bool printIRBeforeAll,
@@ -1115,19 +1144,12 @@
PassManager pm(ctx, mlir::ModuleOp::getOperationName());
applyConfigToPassManager(pm, printIRBeforeAll, printIRAfterAll,
printIRModuleScope, timing);
// generateNPUInstructions
pm.addNestedPass<AIE::DeviceOp>(
mlir::iree_compiler::AMDAIE::createAMDAIEDmaToNpuPass());
if (failed(pm.run(moduleOp)))
return moduleOp.emitOpError(": NPU Instruction pipeline failed");

std::optional<ArrayRef<uint32_t>> npuInstructions =
cast<DenseUI32ResourceElementsAttr>(
(*moduleOp.getOps<xilinx::AIE::DeviceOp>().begin())
->getAttr("npu_instructions"))
.tryGetAsArrayRef();
if (!npuInstructions)
return moduleOp.emitOpError(": No NPU instructions in device op");
auto maybeNpuInstructions = getNpuInstructions(moduleOp);
if (failed(maybeNpuInstructions)) return failure();
auto npuInstructions = maybeNpuInstructions.value();

std::string errorMessage;
auto output = openOutputFile(outputNPU, &errorMessage);
@@ -1136,7 +1158,7 @@
<< errorMessage;
return failure();
}
for (auto w : *npuInstructions) output->os() << llvm::format("%08X\n", w);
for (auto w : npuInstructions) output->os() << llvm::format("%08X\n", w);
output->keep();

Path unifiedObj = Path(tempDir) / "input.o";
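
Aside (not part of this commit's diff): the llvm::format("%08X\n", w) call above writes each 32-bit instruction word as eight uppercase hex digits on its own line. A self-contained sketch of the same formatting, using as sample data the first few words of the npu_instructions tensor from the aiert_insts.mlir test above (100860160 is 0x06030100, so it prints as 06030100):

#include <cstdint>
#include <cstdio>

int main() {
  // Sample words copied from the aiert_insts.mlir CHECK line above.
  uint32_t words[] = {100860160u, 261u, 6u, 256u};
  // Same "%08X\n" format as the output loop in aie2xclbin.
  for (uint32_t w : words) std::printf("%08X\n", w);
  // Prints 06030100, 00000105, 00000006, 00000100 (one word per line).
  return 0;
}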
Original file line number Diff line number Diff line change
@@ -13,7 +13,6 @@
#include "iree-amd-aie/IR/AMDAIEAttrs.h"
#include "iree-dialects/Dialect/LinalgTransform/Passes.h"
#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Utils/PassUtils.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
@@ -618,8 +617,6 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) {
passManager.addPass(createAMDAIELowerToAIEPass());
passManager.addPass(createCanonicalizerPass());

passManager.addPass(createConvertLinalgToLoopsPass());

// Now lower using the AIE passes from MLIR-AIE.
addMLIRAIELoweringPasses(passManager);
}
@@ -785,10 +782,13 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) {
}

void addMLIRAIELoweringPasses(OpPassManager &passManager) {
passManager.addPass(createLowerAffinePass());
OpPassManager &devicePM = passManager.nest<xilinx::AIE::DeviceOp>();
devicePM.addPass(createAMDAIEAssignLockIDsPass());
devicePM.addPass(createAMDAIEObjectFifoStatefulTransformPass());
devicePM.addPass(createAMDAIEDmaToNpuPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createConvertLinalgToLoopsPass());
passManager.addPass(createLowerAffinePass());
devicePM.addPass(createAMDAIEAssignLockIDsPass());
devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass());
devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass());
devicePM.addPass(createAMDAIEPathfinderPass());
10 changes: 0 additions & 10 deletions tests/samples/conv_pipeline_e2e.mlir
@@ -13,11 +13,6 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xi32>, %arg1: tensor<3x3x32
// CHECK: aie.shim_dma_allocation
// CHECK: aie.shim_dma_allocation
// CHECK: aie.shim_dma_allocation
// CHECK: aiex.runtime_sequence @conv_2d_nhwc_hwcf_dispatch_0_conv_2d_nhwc_hwcf_2x12x12x64x3x3x32_i32(%arg0: memref<2x14x14x32xi32>, %arg1: memref<3x3x32x64xi32>, %arg2: memref<2x12x12x64xi32>)
// CHECK: aiex.npu.dma_memcpy_nd
// CHECK: aiex.npu.dma_memcpy_nd
// CHECK: aiex.npu.dma_memcpy_nd
// CHECK: aiex.npu.sync

// -----

@@ -34,8 +29,3 @@
// CHECK: aie.shim_dma_allocation
// CHECK: aie.shim_dma_allocation
// CHECK: aie.shim_dma_allocation
// CHECK: aiex.runtime_sequence @conv_2d_nhwc_hwcf_q_dispatch_0_conv_2d_nhwc_hwcf_q_2x12x12x64x3x3x32_i8xi8xi32xi32xi32(%arg0: memref<3136xi32>, %arg1: memref<4608xi32>, %arg2: memref<2x12x12x64xi32>)
// CHECK: aiex.npu.dma_memcpy_nd
// CHECK: aiex.npu.dma_memcpy_nd
// CHECK: aiex.npu.dma_memcpy_nd
// CHECK: aiex.npu.sync
17 changes: 7 additions & 10 deletions tests/samples/matmul_peeled_objectfifo_e2e.mlir
@@ -1,6 +1,8 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --iree-amdaie-target-device=npu1_4col %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-lower-to-aie-pipeline=objectFifo --iree-amdaie-tile-pipeline=pack-peel --split-input-file | FileCheck %s

// CHECK-LABEL: hal.executable.export public @matmul_i32_dispatch_0_matmul_128x128x256_i32

// CHECK: aie.device(npu1_4col) {
// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2)
// CHECK-DAG: %[[TILE_0_3:.+]] = aie.tile(0, 3)
// CHECK-DAG: %[[TILE_1_2:.+]] = aie.tile(1, 2)
@@ -11,21 +13,16 @@
// CHECK-DAG: aie.core(%[[TILE_1_2]])
// CHECK-DAG: aie.core(%[[TILE_0_3]])
// CHECK-DAG: aie.core(%[[TILE_1_3]])
// CHECK-DAG: aiex.runtime_sequence @matmul_i32_dispatch_0_matmul_128x128x256_i32(%[[ARG0:.+]]: memref<128x256xi32>, %[[ARG1:.+]]: memref<256x128xi32>, %[[ARG2:.+]]: memref<128x128xi32>)
// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][2, 2, 64, 64][8192, 64, 128, 1]) {id = 0 : i64, issue_token = true, metadata = @[[OBJ10:.+]]}
// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 8, 64, 32][0, 32, 256, 1]) {id = 1 : i64, issue_token = true, metadata = @[[OBJ0:.+]]}
// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ0]]}
// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][8, 2, 32, 32][4096, 32, 128, 1]) {id = 2 : i64, issue_token = true, metadata = @[[OBJ1:.+]]}
// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ1]]}
// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ10]]}
// CHECK-DAG: aie.shim_dma_allocation @[[OBJ0]](MM2S, 0, 0)
// CHECK-DAG: aie.shim_dma_allocation @[[OBJ1]](MM2S, 1, 0)
// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0)
// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0)
// CHECK-DAG: aie.memtile_dma(%[[TILE_0_1]])
// CHECK-DAG: aie.mem(%[[TILE_0_2]])
// CHECK-DAG: aie.mem(%[[TILE_0_3]])
// CHECK-DAG: aie.mem(%[[TILE_1_2]])
// CHECK-DAG: aie.mem(%[[TILE_1_3]])
// CHECK-DAG: aie.shim_dma_allocation @[[OBJ10]](S2MM, 0, 0)
// CHECK-DAG: aie.shim_dma_allocation {{.*}}(S2MM, 0, 0)
// CHECK: {npu_instructions =
// CHECK-SAME: runtime_sequence_name = "matmul_i32_dispatch_0_matmul_128x128x256_i32"
func.func @matmul_i32(%lhs: tensor<128x256xi32>, %rhs: tensor<256x128xi32>) -> tensor<128x128xi32>
{
%cst = arith.constant 0 : i32