diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp
index f15990681..d63cac489 100644
--- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp
+++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp
@@ -10,7 +10,6 @@
 #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/Support/Format.h"
 #include "mlir/IR/AsmState.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -512,33 +511,37 @@ struct AMDAIEDmaToNpuPass : mlir::OperationPass<DeviceOp> {
 
     instructions[2] = count;
     instructions[3] = instructions.size() * sizeof(uint32_t);
-    ArrayRef<uint32_t> instsArrRef(instructions.data(), instructions.size());
-    device->setAttr(
-        "npu_instructions",
-        DenseUI32ResourceElementsAttr::get(
-            RankedTensorType::get(
-                instsArrRef.size(),
-                IntegerType::get(&getContext(), 32, IntegerType::Unsigned)),
-            "npu_instructions",
-            HeapAsmResourceBlob::allocateAndCopyInferAlign(instsArrRef)));
-    // The LX instructions for the entry point function are already generated by
-    // the pass hence we can safely delete the function as it is of no use to
-    // us. A reason to do this is that otherwise it is unceseccarily lowered to
-    // llvm where it can have a chance to crash in case the argument list is not
-    // lowerable for reasons such as memref's with dynamic offsets.
-    auto symName = dyn_cast_or_null<StringAttr>(device->getAttr("sym_name"));
+
+    // Note on use of 'DenseIntElementsAttr' below:
+    // We cannot use DenseUI32ResourceElementsAttr because it doesn't
+    // serialize-deserialize, and we (currently) serialize the module before
+    // running aie2xclbin.
+    {
+      ArrayRef<uint32_t> instsArrRef(instructions.data(), instructions.size());
+      ShapedType type =
+          RankedTensorType::get({static_cast<int64_t>(instsArrRef.size())},
+                                IntegerType::get(&getContext(), 32));
+      auto instsAttr = DenseIntElementsAttr::get(type, instsArrRef);
+      device->setAttr("npu_instructions", instsAttr);
+    }
+
     SmallVector<RuntimeSequenceOp> seqOps;
-    device->walk([&](RuntimeSequenceOp seqOp) {
-      // if the deviceOp has a symbol name attached to it we look for the
-      // sequence op that partically matches that symbol, if not we collect all
-      // sequenceOps.
-      if (!symName ||
-          symName.str().find(seqOp.getSymName()->str()) != std::string::npos)
-        seqOps.push_back(seqOp);
-    });
-    // If exactly one entry point function is found we can delete it. For any
-    // other result we do not make any change.
-    if (seqOps.size() == 1) seqOps[0].erase();
+    device->walk([&](RuntimeSequenceOp seqOp) { seqOps.push_back(seqOp); });
+
+    if (seqOps.size() > 1) {
+      device->emitOpError("has ")
+          << seqOps.size()
+          << " aiex.runtime_sequence ops. Expected no more than 1.";
+      signalPassFailure();
+    }
+
+    if (seqOps.size() == 1) {
+      auto seqOp = seqOps[0];
+      StringRef name = seqOp.getSymName().value();
+      device->setAttr("runtime_sequence_name",
+                      StringAttr::get(&getContext(), name));
+      seqOp.erase();
+    }
   }
 };
 
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir b/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir
index cb82fcd22..7fa0c7b3b 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir
@@ -1,19 +1,12 @@
 // RUN: iree-opt --amdaie-dma-to-npu %s | FileCheck %s
 
-// CHECK-LABEL:   aie.device(npu1_4col) {
-// CHECK:           memref.global "public" @of_toMem : memref<32xi32>
-// CHECK:           memref.global "public" @of_fromMem : memref<32xi32>
-// CHECK:           aie.shim_dma_allocation @of_fromMem(MM2S, 0, 0)
-// CHECK:           aie.shim_dma_allocation @of_toMem(S2MM, 0, 0)
-// CHECK:         } {npu_instructions = dense_resource<npu_instructions> : tensor<64xui32>}
 
-// CHECK:         {-#
-// CHECK:           dialect_resources: {
-// CHECK:             builtin: {
-// CHECK:               npu_instructions: "0x0400000000010306050100000600000000010000010000000000000020D0010030000000200000000000000000000000000000000000008000000000000000000000000281000000300000000000000000000000000000000000000024D001000000000002000000000000000000000000000000000000000000000004D20100000000000100008018000000010000000000000000D001003000000020000000800000000000000000008000070020800F000000000000000000000281000000300000000000000000000000000000000000000004D001000000000000000000000000008000000000000000000000000000000014D20100000000000000000018000000"
-// CHECK:             }
-// CHECK:           }
-// CHECK:         #-}
+// CHECK-LABEL:  aie.device(npu1_4col) {
+// CHECK:        memref.global "public" @of_toMem : memref<32xi32>
+// CHECK:        memref.global "public" @of_fromMem : memref<32xi32>
+// CHECK:        aie.shim_dma_allocation @of_fromMem(MM2S, 0, 0)
+// CHECK:        aie.shim_dma_allocation @of_toMem(S2MM, 0, 0)
+// CHECK:      } {npu_instructions = dense<[100860160, 261, 6, 256, 1, 0, 118816, 48, 32, 0, 0, 0, -2147483648, 0, 0, 33554432, 129, 48, 0, 0, 0, 0, 118820, 0, 2, 0, 0, 0, 0, 0, 119300, 0, -2147483647, 24, 1, 0, 118784, 48, 32, 128, 0, 8388608, -2145386489, 15, 0, 33554432, 129, 48, 0, 0, 0, 0, 118788, 0, 0, 0, 128, 0, 0, 0, 119316, 0, 0, 24]> : tensor<64xi32>, runtime_sequence_name = "sequence"}
 
 
 module {
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir b/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir
index f4cfd5647..3a78c854c 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir
@@ -77,26 +77,3 @@ module  {
   } {sym_name = "explicit_sym_name_0"}
 }
 
-// -----
-
-// CHECK-LABEL:   aie.device(npu1_4col) {
-// CHECK:           memref.global "public" @toMem : memref<16xi32>
-// CHECK:           func.func @pretend_microkernel
-// CHECK:           aiex.runtime_sequence  @explicit_sym_name
-// CHECK:           aie.shim_dma_allocation @toMem(MM2S, 1, 1)
-
-module  {
-  aie.device(npu1_4col) {
-    memref.global "public" @toMem : memref<16xi32>
-    func.func @pretend_microkernel(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      return
-    }
-
-    aiex.runtime_sequence @explicit_sym_name(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
-      aiex.npu.dma_wait {symbol = @toMem}
-    }
-    aie.shim_dma_allocation @toMem (MM2S, 1, 1)
-  } {sym_name = "wrong_sym_name"}
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/push_to_queue.mlir b/compiler/plugins/target/AMD-AIE/aie/test/push_to_queue.mlir
index 72a5ade0e..7b726e581 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/push_to_queue.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/push_to_queue.mlir
@@ -3,16 +3,11 @@
 
 // CHECK: module {
 // CHECK:   aie.device(npu1_4col) {
-// CHECK:   } {npu_instructions = dense_resource<npu_instructions> : tensor<16xui32>}
-// CHECK: }
-
-// CHECK: {-#
-// CHECK:   dialect_resources: {
-// CHECK:     builtin: {
-// CHECK:       npu_instructions: "0x040000000001030605010000020000004000000000000000000000000CD20100000000000300008018000000000000000000000014D20104000000000200030018000000"
+// CHECK:     func.func @sequence() {
+// CHECK:       return
 // CHECK:     }
-// CHECK:   }
-// CHECK: #-}
+// CHECK:   } {npu_instructions = dense<[100860160, 261, 2, 64, 0, 0, 119308, 0, -2147483645, 24, 0, 0, 67228180, 0, 196610, 24]> : tensor<16xi32>}
+// CHECK: }
 
 module {
   aie.device(npu1_4col) {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
index 8f764732a..e62db71a7 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
@@ -16,7 +16,6 @@
 #include "aievec/XLLVMDialect.h"
 #include "air/Dialect/AIR/AIRDialect.h"
 #include "air/Dialect/AIRRt/AIRRtDialect.h"
-#include "iree-amd-aie/IR/AMDAIEAttrs.h"
 #include "iree-amd-aie/IR/AMDAIEDialect.h"
 #include "iree-amd-aie/Transforms/Passes.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
@@ -28,8 +27,12 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/ToolOutputFile.h"
+#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
+#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
-#include "mlir/Conversion/Passes.h"
+#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h"
+#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
+#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/EmitC/IR/EmitC.h"
 #include "mlir/Dialect/Func/Extensions/AllExtensions.h"
@@ -75,42 +78,27 @@ static llvm::cl::opt<std::string> clEnableAMDAIEUkernels(
                    "unprefixed microkernels to enable, e.g. `matmul`."),
     llvm::cl::init("none"));
 
-// Utility to find aie.device Op corresponding to the export Op.
-// For example, we have
-// hal.executable.variant {
-//   hal.executable.export symbol1
-//   hal.executable.export symbol2
-//   module {
-//     aie.device {
-//       ...
-//       aiex.runtime_sequence symbol1
-//     }
-//     aie.device {
-//       ...
-//       aiex.runtime_sequence symbol2
-//     }
-//   }
-// }
-// Hence we need to find the aiex.runtime_sequence that coresponds to the export
-// op symbol and return its parent aie.device Op. This is what we will pass to
-// the `aie2xclbin` tool for artifact generation per entry point.
-static xilinx::AIE::DeviceOp getDeviceOpFromEntryPoint(ModuleOp moduleOp,
-                                                       StringRef exportOpName) {
+/// Returns the aie.device op whose `runtime_sequence_name` attribute (set by
+/// the dma-to-npu pass) is `targetName`; on no match, errors and returns null.
+static xilinx::AIE::DeviceOp getDeviceOpWithName(ModuleOp moduleOp,
+                                                 StringRef targetName) {
   xilinx::AIE::DeviceOp deviceOp;
 
-  moduleOp.walk([&](xilinx::AIEX::RuntimeSequenceOp sequenceOp) {
-    if (sequenceOp.getSymName() == exportOpName) {
-      deviceOp =
-          dyn_cast_or_null<xilinx::AIE::DeviceOp>(sequenceOp->getParentOp());
-      return WalkResult::interrupt();
-    }
-    return WalkResult::advance();
+  uint32_t nDeviceOpsVisited = 0;
+  moduleOp.walk([&](xilinx::AIE::DeviceOp d) {
+    ++nDeviceOpsVisited;
+    auto nameAttr = d->getAttrOfType<StringAttr>("runtime_sequence_name");
+    if (!nameAttr || nameAttr.getValue() != targetName)
+      return WalkResult::advance();
+    deviceOp = d;
+    return WalkResult::interrupt();
   });
-  if (!deviceOp) {
-    moduleOp.emitError()
-        << "failed to find aie.device containing func.func with symbol "
-        << exportOpName;
-  }
+
+  if (!deviceOp)
+    moduleOp.emitError() << "visited " << nDeviceOpsVisited
+                         << " aie.device ops, and failed to find one with name "
+                         << targetName;
+
   return deviceOp;
 }
 
@@ -291,7 +279,7 @@ LogicalResult AIETargetBackend::serializeExecutable(
     }
 
     StringRef exportOpName = exportOp.getSymName();
-    deviceOps.push_back(getDeviceOpFromEntryPoint(moduleOp, exportOpName));
+    deviceOps.push_back(getDeviceOpWithName(moduleOp, exportOpName));
 
     // The xclbin kernel name, appended with instance name suffix (`:MLIRAIEV1`,
     // 10 chars) is required by the xclbinutil to have a length smaller or equal
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp
index a9bf83a60..9cbec2f71 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp
@@ -1101,6 +1101,35 @@ static LogicalResult generateUnifiedObject(
   return success();
 }
 
+/// Returns the words of the `npu_instructions` attribute on the module's only
+/// aie.device op. Emits an error and fails if there is not exactly one device
+/// op, or if the attribute is missing or is not a DenseIntElementsAttr.
+FailureOr<SmallVector<uint32_t>> getNpuInstructions(ModuleOp moduleOp) {
+  auto ctx = moduleOp.getContext();
+  auto deviceOps = moduleOp.getOps<AIE::DeviceOp>();
+  auto nDeviceOps = std::distance(deviceOps.begin(), deviceOps.end());
+  if (nDeviceOps != 1)
+    return emitError(UnknownLoc::get(ctx),
+                     "Expected exactly one AIE::DeviceOp in the module");
+
+  auto deviceOp = *deviceOps.begin();
+  auto maybeNpuInstructions = deviceOp->getAttr("npu_instructions");
+  if (!maybeNpuInstructions)
+    return emitError(UnknownLoc::get(ctx),
+                     "Expected npu_instructions attribute on aie.device. ");
+
+  auto npuInstructionsAttr =
+      dyn_cast<DenseIntElementsAttr>(maybeNpuInstructions);
+  if (!npuInstructionsAttr)
+    return emitError(UnknownLoc::get(ctx),
+                     "failed to cast npu_instructions to DenseIntElementsAttr");
+
+  SmallVector<uint32_t> npuInstructions;
+  for (auto c : npuInstructionsAttr)
+    npuInstructions.push_back(c.getSExtValue());
+  return npuInstructions;
+}
+
 LogicalResult aie2xclbin(
     MLIRContext *ctx, ModuleOp moduleOp, const std::string &outputNPU,
     const std::string &outputXCLBin, bool printIRBeforeAll,
@@ -1115,19 +1144,12 @@ LogicalResult aie2xclbin(
   PassManager pm(ctx, mlir::ModuleOp::getOperationName());
   applyConfigToPassManager(pm, printIRBeforeAll, printIRAfterAll,
                            printIRModuleScope, timing);
-  // generateNPUInstructions
-  pm.addNestedPass<AIE::DeviceOp>(
-      mlir::iree_compiler::AMDAIE::createAMDAIEDmaToNpuPass());
   if (failed(pm.run(moduleOp)))
     return moduleOp.emitOpError(": NPU Instruction pipeline failed");
 
-  std::optional<ArrayRef<uint32_t>> npuInstructions =
-      cast<DenseUI32ResourceElementsAttr>(
-          (*moduleOp.getOps<xilinx::AIE::DeviceOp>().begin())
-              ->getAttr("npu_instructions"))
-          .tryGetAsArrayRef();
-  if (!npuInstructions)
-    return moduleOp.emitOpError(": No NPU instructions in device op");
+  auto maybeNpuInstructions = getNpuInstructions(moduleOp);
+  if (failed(maybeNpuInstructions)) return failure();
+  auto npuInstructions = maybeNpuInstructions.value();
 
   std::string errorMessage;
   auto output = openOutputFile(outputNPU, &errorMessage);
@@ -1136,7 +1158,7 @@ LogicalResult aie2xclbin(
                  << errorMessage;
     return failure();
   }
-  for (auto w : *npuInstructions) output->os() << llvm::format("%08X\n", w);
+  for (auto w : npuInstructions) output->os() << llvm::format("%08X\n", w);
   output->keep();
 
   Path unifiedObj = Path(tempDir) / "input.o";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index b0c8f799c..3f7935844 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -13,7 +13,6 @@
 #include "iree-amd-aie/IR/AMDAIEAttrs.h"
 #include "iree-dialects/Dialect/LinalgTransform/Passes.h"
 #include "iree/compiler/Codegen/Common/Passes.h"
-#include "iree/compiler/Utils/PassUtils.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
 #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
@@ -618,8 +617,6 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) {
   passManager.addPass(createAMDAIELowerToAIEPass());
   passManager.addPass(createCanonicalizerPass());
 
-  passManager.addPass(createConvertLinalgToLoopsPass());
-
   // Now lower using the AIE passes from MLIR-AIE.
   addMLIRAIELoweringPasses(passManager);
 }
@@ -785,10 +782,13 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) {
 }
 
 void addMLIRAIELoweringPasses(OpPassManager &passManager) {
-  passManager.addPass(createLowerAffinePass());
   OpPassManager &devicePM = passManager.nest<xilinx::AIE::DeviceOp>();
-  devicePM.addPass(createAMDAIEAssignLockIDsPass());
   devicePM.addPass(createAMDAIEObjectFifoStatefulTransformPass());
+  devicePM.addPass(createAMDAIEDmaToNpuPass());
+  passManager.addPass(createCanonicalizerPass());
+  passManager.addPass(createConvertLinalgToLoopsPass());
+  passManager.addPass(createLowerAffinePass());
+  devicePM.addPass(createAMDAIEAssignLockIDsPass());
   devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass());
   devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass());
   devicePM.addPass(createAMDAIEPathfinderPass());
diff --git a/tests/samples/conv_pipeline_e2e.mlir b/tests/samples/conv_pipeline_e2e.mlir
index 71b1442b8..7c6957017 100644
--- a/tests/samples/conv_pipeline_e2e.mlir
+++ b/tests/samples/conv_pipeline_e2e.mlir
@@ -13,11 +13,6 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xi32>, %arg1: tensor<3x3x32
 //       CHECK:    aie.shim_dma_allocation
 //       CHECK:    aie.shim_dma_allocation
 //       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aiex.runtime_sequence @conv_2d_nhwc_hwcf_dispatch_0_conv_2d_nhwc_hwcf_2x12x12x64x3x3x32_i32(%arg0: memref<2x14x14x32xi32>, %arg1: memref<3x3x32x64xi32>, %arg2: memref<2x12x12x64xi32>)
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.sync
 
 // -----
 
@@ -34,8 +29,3 @@ func.func @conv_2d_nhwc_hwcf_q(%arg0: tensor<2x14x14x32xi8>, %arg1: tensor<3x3x3
 //       CHECK:    aie.shim_dma_allocation
 //       CHECK:    aie.shim_dma_allocation
 //       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aiex.runtime_sequence @conv_2d_nhwc_hwcf_q_dispatch_0_conv_2d_nhwc_hwcf_q_2x12x12x64x3x3x32_i8xi8xi32xi32xi32(%arg0: memref<3136xi32>, %arg1: memref<4608xi32>, %arg2: memref<2x12x12x64xi32>)
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.sync
diff --git a/tests/samples/matmul_peeled_objectfifo_e2e.mlir b/tests/samples/matmul_peeled_objectfifo_e2e.mlir
index 386214f58..484494045 100644
--- a/tests/samples/matmul_peeled_objectfifo_e2e.mlir
+++ b/tests/samples/matmul_peeled_objectfifo_e2e.mlir
@@ -1,6 +1,8 @@
 // RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --iree-amdaie-target-device=npu1_4col %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-lower-to-aie-pipeline=objectFifo --iree-amdaie-tile-pipeline=pack-peel --split-input-file | FileCheck %s
 
 // CHECK-LABEL: hal.executable.export public @matmul_i32_dispatch_0_matmul_128x128x256_i32
+
+// CHECK:       aie.device(npu1_4col) {
 // CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
 // CHECK-DAG:   %[[TILE_0_3:.+]] = aie.tile(0, 3)
 // CHECK-DAG:   %[[TILE_1_2:.+]] = aie.tile(1, 2)
@@ -11,21 +13,16 @@
 // CHECK-DAG:   aie.core(%[[TILE_1_2]])
 // CHECK-DAG:   aie.core(%[[TILE_0_3]])
 // CHECK-DAG:   aie.core(%[[TILE_1_3]])
-// CHECK-DAG:   aiex.runtime_sequence @matmul_i32_dispatch_0_matmul_128x128x256_i32(%[[ARG0:.+]]: memref<128x256xi32>, %[[ARG1:.+]]: memref<256x128xi32>, %[[ARG2:.+]]: memref<128x128xi32>)
-// CHECK-DAG:     aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][2, 2, 64, 64][8192, 64, 128, 1]) {id = 0 : i64, issue_token = true, metadata = @[[OBJ10:.+]]}
-// CHECK-DAG:     aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 8, 64, 32][0, 32, 256, 1]) {id = 1 : i64, issue_token = true, metadata = @[[OBJ0:.+]]}
-// CHECK-DAG:     aiex.npu.dma_wait {symbol = @[[OBJ0]]}
-// CHECK-DAG:     aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][8, 2, 32, 32][4096, 32, 128, 1]) {id = 2 : i64, issue_token = true, metadata = @[[OBJ1:.+]]}
-// CHECK-DAG:     aiex.npu.dma_wait {symbol = @[[OBJ1]]}
-// CHECK-DAG:     aiex.npu.dma_wait {symbol = @[[OBJ10]]}
-// CHECK-DAG:   aie.shim_dma_allocation @[[OBJ0]](MM2S, 0, 0)
-// CHECK-DAG:   aie.shim_dma_allocation @[[OBJ1]](MM2S, 1, 0)
+// CHECK-DAG:   aie.shim_dma_allocation {{.*}}(MM2S, 0, 0)
+// CHECK-DAG:   aie.shim_dma_allocation {{.*}}(MM2S, 1, 0)
 // CHECK-DAG:   aie.memtile_dma(%[[TILE_0_1]])
 // CHECK-DAG:   aie.mem(%[[TILE_0_2]])
 // CHECK-DAG:   aie.mem(%[[TILE_0_3]])
 // CHECK-DAG:   aie.mem(%[[TILE_1_2]])
 // CHECK-DAG:   aie.mem(%[[TILE_1_3]])
-// CHECK-DAG:   aie.shim_dma_allocation @[[OBJ10]](S2MM, 0, 0)
+// CHECK-DAG:   aie.shim_dma_allocation {{.*}}(S2MM, 0, 0)
+// CHECK:       {npu_instructions = 
+// CHECK-SAME:   runtime_sequence_name = "matmul_i32_dispatch_0_matmul_128x128x256_i32"
 func.func @matmul_i32(%lhs: tensor<128x256xi32>, %rhs: tensor<256x128xi32>) -> tensor<128x128xi32>
 {
   %cst = arith.constant 0 : i32
diff --git a/tests/samples/pack_peel_pipeline_matmul.mlir b/tests/samples/pack_peel_pipeline_matmul.mlir
index 344c34e5d..a626a2132 100644
--- a/tests/samples/pack_peel_pipeline_matmul.mlir
+++ b/tests/samples/pack_peel_pipeline_matmul.mlir
@@ -15,11 +15,6 @@ func.func @matmul_i8_i32(%lhs: tensor<32x16xi8>, %rhs: tensor<16x32xi8>) -> tens
 //       CHECK:    aie.shim_dma_allocation
 //       CHECK:    aie.shim_dma_allocation
 //       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aiex.runtime_sequence @matmul_i8_i32_dispatch_0_matmul_32x32x16_i8xi8xi32(%arg0: memref<128xi32>, %arg1: memref<128xi32>, %arg2: memref<32x32xi32>)
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.sync
 
 // -----
 
@@ -38,8 +33,3 @@ func.func @matmul_bf16(%lhs: tensor<16x32xbf16>, %rhs: tensor<32x16xbf16>) -> te
 //       CHECK:    aie.shim_dma_allocation
 //       CHECK:    aie.shim_dma_allocation
 //       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aiex.runtime_sequence @matmul_bf16_dispatch_0_matmul_16x16x32_bf16(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<128xi32>)
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.sync
diff --git a/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir b/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir
index 9c2cbf935..c99b3b269 100644
--- a/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir
+++ b/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir
@@ -20,15 +20,8 @@ func.func @matmul_elementwise_i32(%lhs: tensor<1024x512xi32>, %rhs: tensor<512x1
 }
 
 // CHECK-LABEL: hal.executable.export public @matmul_elementwise_i32_dispatch_0_matmul_1024x1024x512_i32
-//       CHECK:    aie.device(npu1_4col)
-//       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aiex.runtime_sequence @matmul_elementwise_i32_dispatch_0_matmul_1024x1024x512_i32(%arg0: memref<1024x512xi32>, %arg1: memref<512x1024xi32>, %arg2: memref<1024x1024xi32>, %arg3: memref<1024x1024xi32>)
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.sync
+// CHECK:           aie.device(npu1_4col)
+// CHECK-COUNT-3:   aie.shim_dma_allocation
 
 // -----
 
@@ -52,15 +45,8 @@ func.func @matmul_elementwise_bf16_f32(%arg0: tensor<1024x512xbf16>, %arg1: tens
 }
 
 // CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_f32_dispatch_0_matmul_1024x1024x512_bf16xbf16xf32
-//       CHECK:    aie.device(npu1_4col)
-//       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aiex.runtime_sequence @matmul_elementwise_bf16_f32_dispatch_0_matmul_1024x1024x512_bf16xbf16xf32(%arg0: memref<262144xi32>, %arg1: memref<262144xi32>, %arg2: memref<1024xf32>, %arg3: memref<1024x1024xf32>)
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.sync
+// CHECK:           aie.device(npu1_4col)
+// CHECK-COUNT-3:   aie.shim_dma_allocation
 
 // -----
 func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<512x16384xbf16>, %arg2: tensor<512xf32>) -> tensor<512x16384xbf16> {
@@ -78,12 +64,6 @@ func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<51
   return %11 : tensor<512x16384xbf16>
 }
 
-// CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32
-//       CHECK:    aie.device(npu1_4col)
-//       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aie.shim_dma_allocation
-//       CHECK:    aiex.runtime_sequence @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32(%arg0: memref<131072xi32>, %arg1: memref<4194304xi32>, %arg2: memref<512xf32>, %arg3: memref<4194304xi32>)
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
-//       CHECK:      aiex.npu.dma_memcpy_nd
+// CHECK-LABEL:    hal.executable.export public @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32
+// CHECK:          aie.device(npu1_4col)
+// CHECK-COUNT-3:  aie.shim_dma_allocation
diff --git a/tests/samples/pad_pack_pipeline_e2e.mlir b/tests/samples/pad_pack_pipeline_e2e.mlir
index 18d9d8708..14bdcb04c 100644
--- a/tests/samples/pad_pack_pipeline_e2e.mlir
+++ b/tests/samples/pad_pack_pipeline_e2e.mlir
@@ -7,11 +7,6 @@
 //       CPP:    aie.shim_dma_allocation
 //       CPP:    aie.shim_dma_allocation
 //       CPP:    aie.shim_dma_allocation
-//       CPP:    aiex.runtime_sequence @matmul_small_dispatch_0_matmul_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<16x32xi32>, %arg2: memref<8x32xi32>)
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.sync
 func.func @matmul_small(%lhs : tensor<8x16xi32>,
     %rhs : tensor<16x32xi32>) -> tensor<8x32xi32> {
   %empty = tensor.empty() : tensor<8x32xi32>
@@ -29,12 +24,6 @@ func.func @matmul_small(%lhs : tensor<8x16xi32>,
 //       CPP:    aie.shim_dma_allocation
 //       CPP:    aie.shim_dma_allocation
 //       CPP:    aie.shim_dma_allocation
-//       CPP:    aiex.runtime_sequence @matmul_large_dispatch_0_matmul_2048x2048x2048_i32(%arg0: memref<2048x2048xi32>, %arg1: memref<2048x2048xi32>, %arg2: memref<2048x2048xi32>)
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.sync
-
 func.func @matmul_large(%lhs: tensor<2048x2048xi32>, %rhs: tensor<2048x2048xi32>) -> tensor<2048x2048xi32> {
   %empty = tensor.empty() : tensor<2048x2048xi32>
   %cst = arith.constant 0 : i32
@@ -54,11 +43,6 @@ func.func @matmul_large(%lhs: tensor<2048x2048xi32>, %rhs: tensor<2048x2048xi32>
 //       CPP:    aie.shim_dma_allocation
 //       CPP:    aie.shim_dma_allocation
 //       CPP:    aie.shim_dma_allocation
-//       CPP:    aiex.runtime_sequence @generic_matmul_transpose_static_dispatch_0_matmul_like_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<32x16xi32>, %arg2: memref<8x32xi32>)
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.sync
 func.func @generic_matmul_transpose_static(%lhs : tensor<8x16xi32>,
     %rhs : tensor<32x16xi32>) -> tensor<8x32xi32> {
   %cst = arith.constant 0 : i32
@@ -82,11 +66,6 @@ func.func @generic_matmul_transpose_static(%lhs : tensor<8x16xi32>,
 //       CPP:    aie.shim_dma_allocation
 //       CPP:    aie.shim_dma_allocation
 //       CPP:    aie.shim_dma_allocation
-//       CPP:    aiex.runtime_sequence @matmul_transpose_b_static_dispatch_0_matmul_transpose_b_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<32x16xi32>, %arg2: memref<8x32xi32>)
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.dma_memcpy_nd
-//       CPP:      aiex.npu.sync
 func.func @matmul_transpose_b_static(%lhs : tensor<8x16xi32>,
     %rhs : tensor<32x16xi32>) -> tensor<8x32xi32> {
   %cst = arith.constant 0 : i32