diff --git a/aie_kernels/aie2p/softmax.cc b/aie_kernels/aie2p/softmax.cc index 7d480354..64cca202 100644 --- a/aie_kernels/aie2p/softmax.cc +++ b/aie_kernels/aie2p/softmax.cc @@ -177,4 +177,12 @@ void partial_softmax_bf16(bfloat16 *restrict input, partial_softmax_alias_bf16(input, output, scale_buffer, input_size, row_idx, num_rows, scale); } +void mask_bf16(bfloat16 *inout, const int32 unmasked_size, const int32 total_size) +{ + // TODO: Optimize this to use vector code + for (int32 i = unmasked_size; i < total_size; i++) { + inout[i] = (bfloat16)(-INFINITY); + } +} + } // extern "C" \ No newline at end of file diff --git a/aie_kernels/generic/mv.cc b/aie_kernels/generic/mv.cc index 34da4550..f632e8f0 100644 --- a/aie_kernels/generic/mv.cc +++ b/aie_kernels/generic/mv.cc @@ -15,6 +15,10 @@ #include <aie_api/aie.hpp> +#ifndef VEC_SIZE +#define VEC_SIZE 64 +#endif + void matvec_scalar(uint32_t m, uint32_t k, const bfloat16 *__restrict a, @@ -40,22 +44,17 @@ Matrix-vector multiplication kernel - c: Pointer to the output vector - r: Vector size; data from the matrix and vector will be loaded in and processed in chunks of this size */ -template <uint32_t r> -void matvec_vectorized(uint32_t m, - uint32_t k, - const bfloat16 *__restrict a, - const bfloat16 *__restrict b, - bfloat16 *__restrict c) +template <uint32_t k = DIM_K, uint32_t r = VEC_SIZE> +void matvec_vectorized(uint32_t m, const bfloat16 *__restrict a, const bfloat16 *__restrict b, bfloat16 *__restrict c) { ::aie::set_rounding(aie::rounding_mode::conv_even); bfloat16 *c_end = c + m; const bfloat16 *b_end = b + k; for (; c < c_end; c++) { aie::accum<accfloat, r> acc = aie::zeros<accfloat, r>(); - // The following two pragmas enable pipelining the zero-overhead loop, but they do assume that k is at least - // two. This assumption should hold for any useful use of this function; if k were one, this would be a simple - // scalar multiplication of a vector. - AIE_LOOP_MIN_ITERATION_COUNT(2) + // The following two pragmas enable pipelining the zero-overhead loop, but they do assume that there are at + // least two iterations of the loop, i.e. k >= 2*r. This pragma will break the code if that is not the case! + AIE_LOOP_MIN_ITERATION_COUNT(k / VEC_SIZE) for (const bfloat16 *__restrict b_cur = b; b_cur < b_end; b_cur += r, a += r) { aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a); aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b_cur); @@ -72,25 +71,23 @@ extern "C" { * `c`. 
*/ void matvec_scalar_bf16_bf16(uint32_t m, - uint32_t k, uint32_t row_offset, const bfloat16 *__restrict a_in, const bfloat16 *__restrict b_in, bfloat16 *__restrict c_out) { c_out += row_offset; - matvec_scalar(m, k, a_in, b_in, c_out); + matvec_scalar(m, DIM_K, a_in, b_in, c_out); } void matvec_vectorized_bf16_bf16(uint32_t m, - uint32_t k, uint32_t row_offset, const bfloat16 *__restrict a_in, const bfloat16 *__restrict b_in, bfloat16 *__restrict c_out) { c_out += row_offset; - matvec_vectorized<64>(m, k, a_in, b_in, c_out); + matvec_vectorized(m, a_in, b_in, c_out); } } // extern "C" \ No newline at end of file diff --git a/conftest.py b/conftest.py index 2f4ab726..e9269f09 100644 --- a/conftest.py +++ b/conftest.py @@ -16,7 +16,9 @@ @pytest.fixture def aie_context(): """Create a fresh AIEContext for each test""" - return AIEContext() + ctx = AIEContext() + yield ctx + ctx.device_manager.reset() def pytest_addoption(parser): @@ -166,3 +168,8 @@ def pytest_generate_tests(metafunc): if iterations > 1: metafunc.fixturenames.append("_iteration") metafunc.parametrize("_iteration", range(iterations), ids=lambda i: f"iter{i}") + + +def pytest_make_parametrize_id(config, val, argname): + """Automatically generate test IDs with parameter names""" + return f"{argname}_{val}" diff --git a/iron/applications/llama_3.2_1b/src/block/feed_forward.py b/iron/applications/llama_3.2_1b/src/block/feed_forward.py index 8bae36ec..b7dc8cf2 100644 --- a/iron/applications/llama_3.2_1b/src/block/feed_forward.py +++ b/iron/applications/llama_3.2_1b/src/block/feed_forward.py @@ -116,7 +116,7 @@ def __init__( ) if self.cfg["use_kv_cache"] and self.cfg["use_aie_ffn_gemv"]: - aie_gemv_config = {"num_aie_columns": 8, "is_mv": False} + aie_gemv_config = {"num_aie_columns": 8} # FC1 and FC2: emb_dim -> hidden_dim self.aie_fc1_gemv = AIEGEMV( M=self.hidden_dim, diff --git a/iron/applications/llama_3.2_1b/src/block/gqa.py b/iron/applications/llama_3.2_1b/src/block/gqa.py index 1a712ff9..2267cd28 100644 --- a/iron/applications/llama_3.2_1b/src/block/gqa.py +++ b/iron/applications/llama_3.2_1b/src/block/gqa.py @@ -133,7 +133,6 @@ def __init__( aie_gemv_config = { "num_aie_columns": 8, - "is_mv": False, "use_static_weight": True, } self.aie_query_gemv = AIEGEMV( diff --git a/iron/applications/llama_3.2_1b/src/block/transformer.py b/iron/applications/llama_3.2_1b/src/block/transformer.py index f2b46cdf..fd6f9e58 100644 --- a/iron/applications/llama_3.2_1b/src/block/transformer.py +++ b/iron/applications/llama_3.2_1b/src/block/transformer.py @@ -104,7 +104,6 @@ def __init__( self.aie_residual_add_prefill = AIEElementwiseAdd( size=max_prefill_size, num_aie_columns=8, - num_channels=2, tile_size=cfg["emb_dim"], ) @@ -114,7 +113,6 @@ def __init__( self.aie_residual_add_decode = AIEElementwiseAdd( size=decode_size, num_aie_columns=1, - num_channels=2, tile_size=cfg["emb_dim"], ) else: diff --git a/iron/applications/llama_3.2_1b/src/model_with_json.py b/iron/applications/llama_3.2_1b/src/model_with_json.py index 856fb048..ba240ffc 100644 --- a/iron/applications/llama_3.2_1b/src/model_with_json.py +++ b/iron/applications/llama_3.2_1b/src/model_with_json.py @@ -197,9 +197,7 @@ def __init__( ) aie_gemv_config = { "num_aie_columns": 8, - "is_mv": True, "use_static_weight": True, - "num_aie_columns": 8, "tile_size_input": 4, "tile_size_output": 32, } diff --git a/iron/common/__init__.py b/iron/common/__init__.py index 4fa9ae3b..45013acf 100644 --- a/iron/common/__init__.py +++ b/iron/common/__init__.py @@ -3,8 +3,15 @@ """Common 
utilities and base classes for IRON operators.""" -from .aie_base import AIEOperatorBase, AIEOperatorConstraintError -from .aie_context import AIEContext +from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor +from .base import ( + AIEOperatorBase, + MLIROperator, + CompositeOperator, + CompositeCallable, + AIERuntimeArgSpec, +) +from .context import AIEContext from .compilation import ( XclbinArtifact, InstsBinArtifact, @@ -13,4 +20,4 @@ SourceArtifact, PythonGeneratedMLIRArtifact, ) -from .aie_device_manager import AIEDeviceManager +from .device_manager import AIEDeviceManager diff --git a/iron/common/aie_base.py b/iron/common/aie_base.py deleted file mode 100644 index 5238f6f5..00000000 --- a/iron/common/aie_base.py +++ /dev/null @@ -1,229 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -import numpy as np -import os -from pathlib import Path -from abc import ABC, abstractmethod -import logging -import time -import torch -from ml_dtypes import bfloat16 - -import aie.utils.config -from . import compilation as comp -from .aie_context import AIEContext -from .aie_device_manager import AIEDeviceManager, pyxrt -from .utils import numpy_to_torch, torch_to_numpy - - -class AIEOperatorBase(ABC): - """Base class for AIE-accelerated operations""" - - @classmethod - def get_default_context(cls): - """One global 'default' context if none is specified""" - if not hasattr(AIEOperatorBase, "_default_context"): - AIEOperatorBase._default_context = AIEContext() - return AIEOperatorBase._default_context - - def __init__(self, context=None): - self.artifacts = ( - [] - ) # CompilationArtifact objects are uniqued within the context - self.kernels = {} # Name -> (xclbin_path, xclbin_kernel_name, insts_path) - self.buffers = {} # Name -> required buffer size in bytes - self.buffer_static_data = {} - self.runlist = ( - [] - ) # List of (kernel_name, buffers_name, buffer_name...), will be executed in sequence - - # AIE runtime state - self.buffer_bos = {} # Buffer name -> buffer object - self.xrt_kernels = ( - {} - ) # Kernel name -> (XRT context, XRT kernel object, instruction buffer object, instruction length) - self.xrt_runlist = None - - if context is None: - context = self.get_default_context() - context.register_operator(self) - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - - def add_kernel( - self, - name: str, - xclbin_artifact: comp.XclbinArtifact, - xclbin_kernel_name: str, - insts_artifact: comp.InstsBinArtifact, - ): - assert name not in self.kernels - self.kernels[name] = (xclbin_artifact, xclbin_kernel_name, insts_artifact) - - def add_buffer(self, name, count, dtype=bfloat16, static_data=None): - assert name not in self.buffers - self.buffers[name] = count * np.dtype(dtype).itemsize - if static_data is not None: - assert ( - static_data.nbytes <= self.buffers[name] - ), f"Static data for buffer {name} exceeds allocated size: expected {self.buffers[name]} bytes, got {static_data.nbytes} bytes." 
- static_data_bytes = static_data.flatten().view(np.uint8).tobytes() - if static_data_bytes not in self.context.static_data_pool: - self.context.static_data_pool[static_data_bytes] = None - self.buffer_static_data[name] = next( - k - for k, v in self.context.static_data_pool.items() - if k == static_data_bytes - ) - - def add_to_runlist(self, kernel_name, *args): - if kernel_name not in self.kernels: - raise RuntimeError(f"No such kernel: {kernel_name}") - for arg in args: - if arg not in self.buffers: - raise RuntimeError(f"No such buffer: {arg}") - self.runlist.append((kernel_name, *args)) - - def get_bo(self, buffer_name): - return self.buffer_bos[buffer_name] - - def read_buffer(self, buffer_name, shape, copy=False, dtype=bfloat16): - """Read buffer and return values as a numpy array""" - # Create a byte accessible memory view of the buffer object - mv = self.get_bo(buffer_name).map() - - # Interpret the buffer as a 1-dimensional array then change its view to the expected shape - arr = np.frombuffer(mv, dtype=dtype, count=np.prod(shape)).reshape(shape) - - # Return an independent copy of the array if needed - return arr.copy() if copy else arr - - def read_buffer_as_torch(self, buffer_name, shape, dtype=bfloat16): - return numpy_to_torch(self.read_buffer(buffer_name, shape, dtype)) - - def write_buffer(self, buffer_name, array): - """Write buffer from a numpy array into a XRT buffer object""" - if buffer_name in self.buffer_static_data: - raise RuntimeError(f"Cannot write to static buffer: {buffer_name}") - - # Normalize the source - if isinstance(array, torch.Tensor): - src = torch_to_numpy(array) - else: - src = np.asarray(array) - - # Create a flattened 1D byte view of the source - src_bytes = src.ravel().view(np.uint8) - - bo = self.get_bo(buffer_name) - mv = bo.map() # byte accessible memory view - # Interpret the buffer as a 1-dimensional array - dst_bytes = np.frombuffer(mv, dtype=np.uint8, count=bo.size()) - - # The BO is an existing array, so copyto() can be called, which doesn't create a new array - np.copyto(dst_bytes[: src_bytes.size], src_bytes, casting="no") - - @abstractmethod - def set_up_artifacts(self): - """ - Subclasses should overwrite this method to set up their required dependenices and runtime runlist, kernels and buffers with calls to add_artifacts(), add_kernel(), add_buffer(), and add_to_runlist(). - Note: This method should only *describe* the required artifacts and runtime buffers, and not yet do any computation or compilation. - Compilation will be handled automatically based on the provided description. - """ - pass - - @abstractmethod - def set_up_runtime(self): - pass - - def compile(self, dry_run=None): - """ - Set up the operator and compile any necessary artifacts. - Subclasses are expected to overwrite set_up(); they may register any artifacts that they need to be compiled there. 
- """ - context = self.context - self.set_up_artifacts() - self._move_artifact_paths() - work_list = comp.get_work_list(self.artifacts) - compilation_rules = [ - comp.GenerateMLIRFromPythonCompilationRule(dry_run=dry_run), - comp.PeanoCompilationRule( - context.peano_dir, context.mlir_aie_dir, dry_run=dry_run - ), - comp.ArchiveCompilationRule(context.peano_dir, dry_run=dry_run), - comp.AieccCompilationRule( - context.build_dir, - context.peano_dir, - context.mlir_aie_dir, - dry_run=dry_run, - ), - ] - if work_list: - logging.info( - f"Compiling {len(work_list)} new artifacts for AIE operator {self.__class__.__name__}: {', '.join(str(artifact.path.name) for artifact in work_list)}" - ) - comp.compile(compilation_rules, work_list) - - def add_artifacts(self, artifacts): - self.artifacts.extend(artifacts) - - def _move_artifact_paths(self): - """Make all artifacts paths point into the build directory (source artifacts into the ironclad source directory). This doesn't phyisically move files; this function is called before artifact generation.""" - context = self.context - todo = self.artifacts.copy() - while todo: - artifact = todo[0] - todo.pop(0) - if isinstance(artifact, comp.SourceArtifact): - artifact.set_path(context.base_dir / artifact.path) - else: - artifact.set_path(context.build_dir / artifact.path) - todo.extend(artifact.depends) - - def run_runlist(self): - elapsed = 0.0 - if self.xrt_runlist is None: - # Execute as separate xclbin kernel invocations - for i, (kernel_name, *buffer_args) in enumerate(self.runlist): - context, xrt_kernel, insts_bo, insts_len = self.xrt_kernels[kernel_name] - insts_bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bos = [self.buffer_bos[buffer_arg] for buffer_arg in buffer_args] - for bo in bos: - bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - opcode = 3 - start = time.perf_counter() - run = xrt_kernel(opcode, insts_bo, insts_len, *bos) - result = run.wait() - stop = time.perf_counter() - elapsed += stop - start - if result != pyxrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED: - raise RuntimeError( - f"Kernel {kernel_name} did not complete correctly: {result}" - ) - for bo in bos: - bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - else: - bos = set( - self.buffer_bos[buffer_arg] - for _, *buffer_args in self.runlist - for buffer_arg in buffer_args - ) - insts_bos = set( - self.xrt_kernels[kernel_name][2] for (kernel_name, *_) in self.runlist - ) - for bo in bos | insts_bos: - bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - start = time.perf_counter() - self.xrt_runlist.execute() - self.xrt_runlist.wait() - stop = time.perf_counter() - for bo in bos: - bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - elapsed = stop - start - return elapsed - - -class AIEOperatorConstraintError(RuntimeError): - pass diff --git a/iron/common/aie_context.py b/iron/common/aie_context.py deleted file mode 100644 index 804499f6..00000000 --- a/iron/common/aie_context.py +++ /dev/null @@ -1,211 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -import numpy as np -import logging -from pathlib import Path -import os - -from .aie_device_manager import AIEDeviceManager, pyxrt -from . 
import compilation as comp -import aie.utils.config - - -class AIEContext: - """Context for managing AIE operator compilation and runtime state""" - - def __init__(self, use_runlist=True): - self.operators = [] - self.static_data_pool = {} - self.device_manager = AIEDeviceManager() - self.base_dir = Path(__file__).parent.parent.parent - self.build_dir = Path(os.getcwd()) / "build" - self.mlir_aie_dir = Path(aie.utils.config.root_path()) - self.peano_dir = Path(aie.utils.config.peano_install_dir()) - # Disable the XRT runlist sacrifices performance by executing kernels individually as separate xclbin invocations for easier debugging (can tell which part of runlist execution failed) - self.use_runlist = use_runlist - self._runtime_prepared = False - - def register_operator(self, operator): - """Register an operator with this context""" - if self._runtime_prepared: - raise RuntimeError("Cannot register operators after runtime is prepared") - operator.context = self - self.operators.append(operator) - - def compile_all(self): - """Compile all registered operators""" - self.build_dir.mkdir(parents=True, exist_ok=True) - for op in self.operators: - op.compile() - - def prepare_runtime(self): - """Setup XRT runtime for all registered operators""" - if self._runtime_prepared: - return - - for op in self.operators: - op.set_up_runtime() - - # Pools of preallocated buffer objects; each buffer object is allocated - # once at program start and then reused across operators where possible. - bo_pools = {} - page_sz = 4096 - get_pool_sz = lambda x: (x + page_sz - 1) // page_sz * page_sz - - # Allocate static buffers first - for buffer_data in self.static_data_pool: - logging.debug( - f"Allocating static buffer with size {len(buffer_data)} bytes." - ) - bo = pyxrt.bo( - self.device_manager.device, - len(buffer_data), - pyxrt.bo.host_only, - 0x10000, - ) - bo.write(np.frombuffer(buffer_data, dtype=np.uint8), 0) - self.static_data_pool[buffer_data] = bo - - for op in self.operators: - if len(op.kernels) == 0: - continue - - logging.info(f"Preparing runtime for AIE operator: {op.__class__.__name__}") - - # Set up kernels - for kernel_name, (xclbin, xclbin_kernel_name, insts) in op.kernels.items(): - handle = self.device_manager.get_kernel_handle( - str(xclbin.path), xclbin_kernel_name, str(insts.path) - ) - op.xrt_kernels[kernel_name] = ( - handle.context, - handle.kernel, - handle.insts_bo, - len(handle.insts), - ) - - # If multiple buffers (of the same binned size) are used in the - # same kernel invocation OR across different invocations with shared - # buffers, they require separate allocations. 
- conflicting_buffers = {} # map buffer -> {set of conflicting buffers} - buffer_to_runlist_entries = {} # map buffer -> set of runlist entry indices - - # First pass: track which buffers appear in which runlist entries - for idx, (kernel, *args) in enumerate(op.runlist): - for arg in args: - buffer_to_runlist_entries.setdefault(arg, set()).add(idx) - - # Second pass: determine conflicts - for idx, (kernel, *args) in enumerate(op.runlist): - for arg in args: - if arg in op.buffer_static_data: - # Static buffers never conflict - continue - pool_sz = get_pool_sz(op.buffers[arg]) - - # Buffers conflict if they're in the same runlist entry - conflicting_args = { - a for a in args if get_pool_sz(op.buffers[a]) == pool_sz - } - {arg} - - # Also conflict with buffers in other runlist entries that share - # a buffer with this entry - for other_arg in args: - if other_arg == arg: - continue - for other_idx in buffer_to_runlist_entries.get( - other_arg, set() - ): - if other_idx != idx: - _, *other_args = op.runlist[other_idx] - conflicting_args.update( - { - a - for a in other_args - if get_pool_sz(op.buffers[a]) == pool_sz - and a != arg - } - ) - - conflicting_buffers[arg] = conflicting_buffers.get( - arg, set() - ).union(conflicting_args) - - # Allocate buffers - buffer_allocations = {} - for buffer_name, buffer_min_size in op.buffers.items(): - if buffer_name in op.buffer_static_data: - static_data = op.buffer_static_data[buffer_name] - op.buffer_bos[buffer_name] = self.static_data_pool[static_data] - continue - - alloc_pool = get_pool_sz(buffer_min_size) - alloc_idx = 0 - for conflict in conflicting_buffers.get(buffer_name, set()): - if conflict not in buffer_allocations: - continue - conflict_pool, conflict_idx = buffer_allocations[conflict] - alloc_idx = max(alloc_idx, conflict_idx + 1) - - assert 0 <= alloc_idx < len(bo_pools.get(alloc_pool, [])) + 1 - if alloc_idx == len(bo_pools.get(alloc_pool, [])): - bo = pyxrt.bo( - self.device_manager.device, - alloc_pool, - pyxrt.bo.host_only, - 0x10000, - ) - bo_pools.setdefault(alloc_pool, []).append(bo) - - buffer_allocations[buffer_name] = (alloc_pool, alloc_idx) - op.buffer_bos[buffer_name] = bo_pools[alloc_pool][alloc_idx] - - # Setup runlist - _, (first_xclbin, first_xclbin_kernel_name, first_insts) = next( - iter(op.kernels.items()) - ) - handle = self.device_manager.get_kernel_handle( - str(first_xclbin.path), first_xclbin_kernel_name, str(first_insts.path) - ) - context = handle.context - if self.use_runlist: - op.xrt_runlist = pyxrt.runlist(context) - for i, (kernel_name, *buffer_args) in enumerate(op.runlist): - this_context, xrt_kernel, insts_bo, insts_len = op.xrt_kernels[ - kernel_name - ] - assert this_context == context - opcode = 3 - run = pyxrt.run(xrt_kernel) - run.set_arg(0, opcode) - run.set_arg(1, insts_bo) - run.set_arg(2, insts_len) - for j, buffer_arg in enumerate(buffer_args): - run.set_arg(j + 3, op.buffer_bos[buffer_arg]) - op.xrt_runlist.add(run) - else: - op.xrt_runlist = None - - # Log allocation info - bo_count = sum(len(pool) for pool in bo_pools.values()) - bo_footprint = sum(len(pool) * pool_sz for pool_sz, pool in bo_pools.items()) - logging.info( - f"Allocated {bo_count} total buffer objects with a total memory footprint of " - + ( - f"{bo_footprint//1024//1024} MiB." - if bo_footprint >= 1024 * 1024 - else f"{bo_footprint//1024} KiB." 
- ) - ) - static_data_footprint = sum(len(data) for data in self.static_data_pool) - logging.info( - f"Allocated {len(self.static_data_pool)} static buffers with a total memory footprint of " - + ( - f"{static_data_footprint//1024//1024} MiB." - if static_data_footprint >= 1024 * 1024 - else f"{static_data_footprint//1024} KiB." - ) - ) - - self._runtime_prepared = True diff --git a/iron/common/aie_device_manager.py b/iron/common/aie_device_manager.py deleted file mode 100644 index fda4d0cb..00000000 --- a/iron/common/aie_device_manager.py +++ /dev/null @@ -1,53 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -""" -Global AIE Device Manager for resource sharing and cleanup -""" - -import logging -import os -import sys -from pathlib import Path -from typing import Dict, Optional, Any -import pyxrt -from aie.utils import DefaultNPURuntime -from aie.utils.npukernel import NPUKernel -from aie.iron.device import NPU1, NPU2 - - -class AIEDeviceManager: - """Singleton manager for AIE XRT resources""" - - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def __init__(self): - self.runtime = DefaultNPURuntime - # Expose device for AIEContext buffer allocation - # Accessing protected member _device as AIEContext needs pyxrt.device - self.device = self.runtime._device - self.device_type = self.runtime.device() - - def get_kernel_handle(self, xclbin_path: str, kernel_name: str, insts_path: str): - """Get kernel handle using HostRuntime""" - npu_kernel = NPUKernel( - xclbin_path=xclbin_path, insts_path=insts_path, kernel_name=kernel_name - ) - return self.runtime.load(npu_kernel) - - def device_str(self) -> str: - return self.device_type.resolve().name - - def cleanup(self): - """Clean up all XRT resources""" - # HostRuntime handles cleanup - pass - - def reset(self): - """Reset the device manager (for debugging)""" - pass diff --git a/iron/common/base.py b/iron/common/base.py new file mode 100644 index 00000000..49d9bc25 --- /dev/null +++ b/iron/common/base.py @@ -0,0 +1,182 @@ +# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import os +from pathlib import Path +from abc import ABC, abstractmethod +import logging +import time +import torch +from ml_dtypes import bfloat16 + +import aie.utils.config +from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor +from aie.utils.hostruntime.tensor_class import Tensor +from aie.utils.npukernel import NPUKernel +from . 
import compilation as comp +from .context import AIEContext +from .device_manager import pyxrt +from .compilation import ( + XclbinArtifact, + InstsBinArtifact, + KernelObjectArtifact, + KernelArchiveArtifact, + SourceArtifact, + PythonGeneratedMLIRArtifact, +) + + +class AIEOperatorBase(ABC): + """Base class for AIE-accelerated operations""" + + def __init__(self, context=None): + self.artifacts = comp.CompilationArtifactGraph( + [] + ) # CompilationArtifact objects are uniqued within the context + if context is None: + context = self.get_default_context() + context.register_operator(self) + self.context = context + + @abstractmethod + def set_up_artifacts(self): + """ + Subclasses should overwrite this method to set up their required artifacts and dependencies with calls to add_artifacts(). + Note: This method should only *describe* the required artifacts, and not yet do any computation or compilation. + Compilation will be handled automatically based on the provided description. + """ + pass + + @abstractmethod + def get_arg_spec(self): + pass + + @abstractmethod + def get_callable(self): + pass + + @classmethod + def get_default_context(cls): + """One global 'default' context if none is specified""" + if not hasattr(AIEOperatorBase, "_default_context"): + AIEOperatorBase._default_context = AIEContext() + return AIEOperatorBase._default_context + + def compile(self, dry_run=False): + """ + Set up the operator and compile any necessary artifacts. + Subclasses are expected to overwrite set_up_artifacts(); they may register any artifacts that they need to be compiled there. + """ + self.set_up_artifacts() + comp.compile( + self.context.compilation_rules, + self.artifacts, + self.context.build_dir, + dry_run=dry_run, + ) + return self + + def add_artifacts(self, artifacts): + for artifact in artifacts: + self.artifacts.add(artifact) + + +class MLIROperator(AIEOperatorBase, ABC): + """Base class for AIE-accelerated operations defined by a single MLIR source""" + + def __init__(self, *args, **kwargs): + self.kernel_archive = f"{self.get_operator_name()}_kernels.a" + AIEOperatorBase.__init__(self, *args, **kwargs) + + @abstractmethod + def get_operator_name(self): + pass + + @abstractmethod + def get_mlir_artifact(self): + pass + + @abstractmethod + def get_kernel_artifacts(self): + pass + + def get_artifacts(self, prefix=""): + operator_name = prefix + self.get_operator_name() + mlir_artifact = self.get_mlir_artifact() + kernel_deps_inputs = self.get_kernel_artifacts() + if len(kernel_deps_inputs) > 0: + # FIXME: currently hard-coding that the design will accept this argument as an input if it uses kernels + # Also not handling name collisions of kernels with the same name + mlir_artifact.callback_kwargs["kernel_archive"] = self.kernel_archive + kernel_deps = ( + [ + KernelArchiveArtifact( + self.kernel_archive, + dependencies=kernel_deps_inputs, + ) + ] + if kernel_deps_inputs + else [] + ) + xclbin_artifact = XclbinArtifact( + f"{operator_name}.xclbin", + mlir_input=mlir_artifact, + dependencies=[mlir_artifact] + kernel_deps, + ) + insts_artifact = InstsBinArtifact( + f"{operator_name}.bin", + mlir_input=mlir_artifact, + dependencies=[mlir_artifact], + ) + return xclbin_artifact, insts_artifact + + def set_up_artifacts(self): + xclbin_artifact, insts_artifact = self.get_artifacts() + self.xclbin_artifact = xclbin_artifact + self.insts_artifact = insts_artifact + self.add_artifacts([xclbin_artifact, 
insts_artifact]) + + def get_callable(self): + return NPUKernel( + xclbin_path=self.xclbin_artifact.filename, + kernel_name=self.xclbin_artifact.kernel_name, + insts_path=self.insts_artifact.filename, + ) + + +class CompositeOperator(AIEOperatorBase, ABC): + """Base class for composite operators that chain multiple sub-operators""" + + def __init__(self, context=None): + super().__init__(context) + + +class AIERuntimeArgSpec: + def __init__(self, direction, shape, dtype=bfloat16): + self.shape = shape + self.dtype = dtype + assert direction in {"in", "out", "inout"} + self.direction = direction + + +class CompositeCallable: + """Callable for executing a sequence of sub-operators""" + + def __init__(self, sequence, intermediate_buffers=None): + """ + Args: + sequence: List of (callable, args_indices) tuples. + args_indices is a list of indices into the combined list of [inputs, outputs, intermediates]. + intermediate_buffers: List of XRTTensor objects for intermediate results. + """ + self.sequence = sequence + self.intermediate_buffers = intermediate_buffers or [] + + def __call__(self, *args): + # args contains inputs and outputs + all_buffers = list(args) + self.intermediate_buffers + + for op_callable, indices in self.sequence: + op_args = [all_buffers[i] for i in indices] + op_callable(*op_args) diff --git a/iron/common/compilation.py b/iron/common/compilation.py deleted file mode 100644 index 2cbaa916..00000000 --- a/iron/common/compilation.py +++ /dev/null @@ -1,630 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -""" -This file implements a simple Python-based build system. You specify what you -want to compile (*artifacts*) through subclasses of `CompilationArtifact`. -Each artifact can have a list of depenencies of other artifacts that it relies -on. Each artifact corresponds to exactly one file. If a file with a matching -name already exists, and all its dependencies are built and older than the file, -then the existing file will be reused. - -For each file name, artifacts are singletons. You create artifacts by calling -the `new` class method of the appropriate class. This ensures that artifact -objects are uniqued, i.e., calling `new` twice with the same file name will -return the same object. - -There is a special artifact for source files that do not need to get generated, -`SourceArtifact`. It is likely that in your compilation dependency graph, -the leaf nodes will be `SourceArtifact`s. - -You specify how to generate (compile) an artifact through *rules*, which are -expressed as subclasses of `CompilationRule`. This class requires you to -implement two methods: `matches` and `compile`. During compilation, we will -call `matches` on the set of remaining artifacts to see if the given rule is -able to produce any of the artifacts not available yet. If this function -returns `True`, we will call `compile` on the rule to generate the artifact. -`compile` returns a new list of artifacts, which may be the same one as -before; however, if `matches()==True`, at least one of the artifacts in the -list must be made available after calling `compile()`. 
-""" - -from abc import ABC, abstractmethod -from pathlib import Path -import os.path -import zlib -import logging -import subprocess -import importlib.util -from contextlib import nullcontext -from aie.extras.context import mlir_mod_ctx - -# Compilation Artifacts -# -------------------------------------------------------------------------- - - -class CompilationArtifact(ABC): - _instances = {} - - @classmethod - def new(cls, path, *args, **kwargs): - """Uniques artifacts based on absolute file path; any two artifacts with the same absolute path will be represented by the same object.""" - path = Path(path) - abs_path = path.absolute() - if abs_path not in cls._instances: - cls._instances[abs_path] = None - instance = cls(path, *args, **kwargs) - cls._instances[abs_path] = instance - else: - assert ( - type(cls._instances[abs_path]) == cls - ), f"Artifact with path {abs_path} is already registered with a different type" - return cls._instances[abs_path] - - def __init__(self, path, depends=None): - abs_path = path.absolute() - assert ( - abs_path in self._instances - ), "do not construct artifact objects directly; call the get() class method instead for uniquing" - self.path: Path = path - self.depends: list[CompilationArtifact] = depends if depends is not None else [] - self.users: list[CompilationArtifact] = ( - [] - ) # List of ancestor artifacts that depend on this artifact - for dependency in self.depends: - dependency.users.append(self) - self.fake_available = False - - def __repr__(self): - return f"{self.__class__.__name__}(path={self.path}, depends={self.depends})" - - def set_path(self, new_path): - old_abs_path = self.path.absolute() - new_path = Path(new_path) - abs_path = new_path.absolute() - self.path = new_path - del CompilationArtifact._instances[old_abs_path] - CompilationArtifact._instances[abs_path] = self - - def is_available(self): - if self.fake_available: - return True - if not self.path.exists(): - return False - for dependency in self.depends: - # If any of our dependencies' dependencies are outdated, this artifact is also outdated - if not dependency.is_available(): - return False - # If any of our direct dependencies are newer than this artifact, this artifact is invalid - if dependency.is_newer_than(os.path.getmtime(str(self.path))): - return False - return True - - def is_newer_than(self, time): - if self.fake_available: - return True - return os.path.getmtime(str(self.path)) > time - - def delete(self): - for user in self.users: - user.depends.remove(self) - del self._instances[self.path.absolute()] - return self.users - - -class SourceArtifact(CompilationArtifact): - pass - - -class XclbinArtifact(CompilationArtifact): - def __init__( - self, path, depends, kernel_name="MLIR_AIE", extra_flags=None, xclbin_input=None - ): - super().__init__(path, depends) - self.kernel_name = kernel_name - self.extra_flags = extra_flags if extra_flags is not None else [] - self.xclbin_input = xclbin_input - - -class InstsBinArtifact(CompilationArtifact): - def __init__(self, path, depends, extra_flags=None): - super().__init__(path, depends) - self.extra_flags = extra_flags if extra_flags is not None else [] - - -class KernelObjectArtifact(CompilationArtifact): - def __init__(self, path, depends, extra_flags=None, rename_symbols=None): - super().__init__(path, depends) - self.extra_flags = extra_flags if extra_flags is not None else [] - self.rename_symbols = rename_symbols if rename_symbols is not None else {} - - -class KernelArchiveArtifact(CompilationArtifact): - pass - - 
-class PythonGeneratedMLIRArtifact(CompilationArtifact): - def __init__( - self, - path, - import_path, - callback_fn, - callback_args=None, - callback_kwargs=None, - requires_context=False, - ): - self.import_path = import_path - self.callback_fn = callback_fn - self.callback_args = callback_args if callback_args is not None else [] - self.callback_kwargs = callback_kwargs if callback_kwargs is not None else {} - self.requires_context = requires_context - super().__init__(path) - - def is_available(self): - if self.fake_available: - return True - is_available = super().is_available() - if is_available: - # Force regeneration if the Python source is changed - return os.path.getmtime(str(self.path)) >= os.path.getmtime( - self.import_path - ) - return is_available - - -# Compilation Rules -# -------------------------------------------------------------------------- - - -class CompilationRule(ABC): - def __init__(self, dry_run=None): - self.dry_run = dry_run - - @abstractmethod - def matches(self, artifact: list[CompilationArtifact]) -> bool: - pass - - @abstractmethod - def compile( - self, artifacts: list[CompilationArtifact] - ) -> list[CompilationArtifact]: - pass - - -class GenerateMLIRFromPythonCompilationRule(CompilationRule): - def matches(self, artifacts): - return any( - isinstance(artifact, PythonGeneratedMLIRArtifact) - and len(artifact.depends) == 0 - for artifact in artifacts - ) - - def compile(self, artifacts): - """Generate MLIR from a Python callback that uses the MLIR bindings""" - for i, artifact in enumerate(artifacts): - if not isinstance(artifact, PythonGeneratedMLIRArtifact): - continue - if not all(dependency.is_available() for dependency in artifact.depends): - continue - - if self.dry_run is None: - # Import the Python source file - spec = importlib.util.spec_from_file_location( - Path(artifact.import_path).name, artifact.import_path - ) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - # We only initiate an MLIR context if requested; otherwise, it is expected that the callback creates the context - ctx_callback = lambda: ( - mlir_mod_ctx() if artifact.requires_context else nullcontext() - ) - with ctx_callback() as ctx: - callback_function = getattr(module, artifact.callback_fn) - mlir_code = callback_function( - *artifact.callback_args, **artifact.callback_kwargs - ) - # Stringify the generated MLIR - if artifact.requires_context: - mlir_code = str(ctx.module) - else: - mlir_code = str(mlir_code) - - with open(artifact.path, "w") as f: - f.write(mlir_code) - - # Now that the artifact is generated, replace this artifact with the MLIR source code file - old_users = artifact.delete() - new_artifact = SourceArtifact.new(artifact.path) - for user in old_users: - user.depends.append(new_artifact) - if self.dry_run is not None: - python_cmd = "" - # Import the Python source file - python_cmd += ( - "import sys; sys.path.append(" - f'"{Path(artifact.import_path).parent}"' - "); " - ) - python_cmd += f"from {Path(artifact.import_path).stem} import {artifact.callback_fn}; " - - # Check if we need to import device classes - # Device classes have __module__ == 'abc' but need to be imported from aie.iron.device - device_classes = set() - for arg in artifact.callback_args: - obj_module = type(arg).__module__ - obj_class = type(arg).__name__ - if obj_module == "abc" and ( - obj_class.startswith("NPU") or obj_class.startswith("XCVC") - ): - device_classes.add(obj_class) - for v in artifact.callback_kwargs.values(): - obj_module = 
type(v).__module__ - obj_class = type(v).__name__ - if obj_module == "abc" and ( - obj_class.startswith("NPU") or obj_class.startswith("XCVC") - ): - device_classes.add(obj_class) - - if device_classes: - python_cmd += f"from aie.iron.device import {', '.join(sorted(device_classes))}; " - - if artifact.requires_context: - python_cmd += "from aie.extras.context import mlir_mod_ctx; " - python_cmd += "with mlir_mod_ctx() as ctx: " - python_cmd += f"mlir_code = {artifact.callback_fn}({', '.join(map(GenerateMLIRFromPythonCompilationRule._repr_for_codegen, artifact.callback_args))}, {', '.join(f'{k}={_repr_for_codegen(v)}' for k, v in artifact.callback_kwargs.items())}); " - if artifact.requires_context: - python_cmd += "print(str(ctx.module))" - else: - python_cmd += "print(str(mlir_code))" - self.dry_run.append(f"python3 -c '{python_cmd}' > {artifact.path}") - new_artifact.fake_available = True - artifacts[i] = new_artifact - logging.debug(f"Created MLIR source string for {artifact.path.name}") - - return artifacts - - @staticmethod - def _repr_for_codegen(obj): - """Convert an object to its string representation for code generation. - - Handles special cases like device classes that need to be instantiated - rather than using their default repr(). - """ - # Check if this is a device class from aie.iron.device - # These classes have __module__ == 'abc' but are imported from aie.iron.device - obj_module = type(obj).__module__ - obj_class = type(obj).__name__ - - # Check for known device class patterns (NPU1, NPU2, XCVC1902, etc.) - # These are imported from aie.iron.device but have __module__ == 'abc' - if obj_module == "abc" and ( - obj_class.startswith("NPU") or obj_class.startswith("XCVC") - ): - # For device classes, generate instantiation code - return f"{obj_class}()" - - # Default to repr() for other types - return repr(obj) - - -class AieccCompilationRule(CompilationRule): - def __init__(self, build_dir, peano_dir, mlir_aie_dir, *args, **kwargs): - self.build_dir = build_dir - self.aiecc_path = Path(mlir_aie_dir) / "bin" / "aiecc.py" - self.peano_dir = peano_dir - super().__init__(*args, **kwargs) - - def matches(self, artifacts): - return any( - isinstance(artifact, (XclbinArtifact, InstsBinArtifact)) - and all(dependency.is_available() for dependency in artifact.depends) - for artifact in artifacts - ) - - def compile(self, artifacts): - # If there are both xclbin and insts.bin targets based on the same source MLIR code, we can combine them into one single `aiecc.py` invocation. 
- mlir_sources = set() - mlir_sources_to_xclbins = {} - mlir_sources_to_insts_bins = {} - for artifact in artifacts: - if not isinstance(artifact, (XclbinArtifact, InstsBinArtifact)): - continue - if not all(dependency.is_available() for dependency in artifact.depends): - continue - mlir_dependencies = [ - d - for d in artifact.depends - if isinstance(d, (SourceArtifact, PythonGeneratedMLIRArtifact)) - ] - if len(mlir_dependencies) != 1: - raise RuntimeError( - f"Expected exactly one dependency of {artifact.path} to be SourceArtifact or PythonGeneratedMLIRArtifact, got: {', '.join(str(dep.path) for dep in artifact.depends)}" - ) - mlir_dependency = mlir_dependencies[0] - mlir_sources.add(mlir_dependency) - if isinstance(artifact, XclbinArtifact): - mlir_sources_to_xclbins.setdefault(mlir_dependency, []).append(artifact) - elif isinstance(artifact, InstsBinArtifact): - mlir_sources_to_insts_bins.setdefault(mlir_dependency, []).append( - artifact - ) - - # Now we know for each mlir source if we need to generate an xclbin, an insts.bin or both for it - for mlir_source in mlir_sources: - # Build aiecc command using Peano - compile_cmd = [ - "python", - str(self.aiecc_path), - "--no-compile-host", - "--no-xchesscc", - "--no-xbridge", - "--peano", - str(self.peano_dir), - "--dynamic-objFifos", - ] - do_compile_xclbin = mlir_source in mlir_sources_to_xclbins - do_compile_insts_bin = mlir_source in mlir_sources_to_insts_bins - if do_compile_xclbin: - first_xclbin = mlir_sources_to_xclbins[mlir_source][ - 0 - ] # FIXME: this does not handle the case of multiple xclbins with different kernel names or flags from the same MLIR - compile_cmd += first_xclbin.extra_flags + [ - "--aie-generate-xclbin", - "--xclbin-name=" + str(first_xclbin.path), - "--xclbin-kernel-name=" + first_xclbin.kernel_name, - ] - if first_xclbin.xclbin_input is not None: - compile_cmd += [ - "--xclbin-input=" + str(first_xclbin.xclbin_input.path) - ] - if do_compile_insts_bin: - first_insts_bin = mlir_sources_to_insts_bins[mlir_source][ - 0 - ] # FIXME: this does not handle the case of multiple insts.bins with different flags from the same MLIR - if not do_compile_xclbin: - compile_cmd += ["--no-compile"] - compile_cmd += first_insts_bin.extra_flags + [ - "--aie-generate-npu", - "--npu-insts-name=" + str(first_insts_bin.path), - ] - compile_cmd += [str(mlir_source.path)] - - env = os.environ.copy() - logging.debug(f"Compiling MLIR with command: {' '.join(compile_cmd)}") - if not self.dry_run: - result = subprocess.run( - compile_cmd, - cwd=str(self.build_dir), - capture_output=True, - text=True, - timeout=300, - env=env, - ) - if result.returncode == 0: - logging.debug( - f"Successfully compiled {mlir_source.path} to {', '.join([str(first_xclbin.path)] if do_compile_xclbin else [] + [str(first_insts_bin.path)] if do_compile_insts_bin else [])}" - ) - else: - raise RuntimeError( - f"MLIR compilation for {mlir_source.path} failed: {result.stderr}" - ) - - # There may be multiple targets that require an xclbin/insts.bin from the same MLIR with different names; copy them - for sources_to in [mlir_sources_to_xclbins, mlir_sources_to_insts_bins]: - if sources_to.get(mlir_source, [])[1:]: - copy_src = sources_to[mlir_source][0] - for copy_dest in sources_to[mlir_source][1:]: - shutil.copy(copy_src.path, copy_dest.path) - - else: - for sources_to in [mlir_sources_to_xclbins, mlir_sources_to_insts_bins]: - for artifact in sources_to.get(mlir_source, []): - self.dry_run.append( - f"pushd {str(self.build_dir)} && {' '.join(compile_cmd)} && 
popd" - ) - artifact.fake_available = True - - # With the newly generated files, is_available() should now return True on the Xclbin and InstsBin targets - return artifacts - - -class PeanoCompilationRule(CompilationRule): - def __init__(self, peano_dir, mlir_aie_dir, *args, **kwargs): - self.peano_dir = peano_dir - self.mlir_aie_dir = mlir_aie_dir - super().__init__(*args, **kwargs) - - def matches(self, artifacts): - return any( - isinstance(artifact, KernelObjectArtifact) - and all( - isinstance(dependency, SourceArtifact) and dependency.is_available() - for dependency in artifact.depends - ) - for artifact in artifacts - ) - - def compile(self, artifacts): - clang_path = Path(self.peano_dir) / "bin" / "clang++" - include_path = Path(self.mlir_aie_dir) / "include" - - for artifact in artifacts: - if not isinstance(artifact, KernelObjectArtifact): - continue - - if len(artifact.depends) != 1: - raise RuntimeError( - "Expected exactly one dependency (the C source code) for KernelObjectArtifact" - ) - source_file = artifact.depends[0] - if not isinstance(source_file, SourceArtifact): - raise RuntimeError( - "Expected KernelObject dependency to be a C source file" - ) - - cmd = ( - [ - str(clang_path), - "-O2", - "-std=c++20", - "--target=aie2p-none-unknown-elf", - "-Wno-parentheses", - "-Wno-attributes", - "-Wno-macro-redefined", - "-Wno-empty-body", - "-Wno-missing-template-arg-list-after-template-kw", - f"-I{str(include_path)}", - ] - + artifact.extra_flags - + ["-c", str(source_file.path), "-o", str(artifact.path)] - ) - logging.debug(f"Running compilation command: {' '.join(cmd)}") - - if self.dry_run is None: - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"Compilation failed: {result.stderr}") - logging.debug(f"Successfully compiled: {artifact.path.name}") - else: - artifact.fake_available = True - self.dry_run.append(" ".join(cmd)) - - if artifact.rename_symbols: - self._rename_symbols(artifact) - - return artifacts - - def _rename_symbols(self, artifact): - objcopy_path = "llvm-objcopy-18" - cmd = [ - objcopy_path, - ] - for old_sym, new_sym in artifact.rename_symbols.items(): - cmd += [ - "--redefine-sym", - f"{old_sym}={new_sym}", - ] - cmd += [str(artifact.path)] - - logging.debug(f"Running renaming command: {' '.join(cmd)}") - if self.dry_run is None: - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode == 0: - logging.info(f"Successfully renamed symbols in: {artifact.path.name}") - else: - raise RuntimeError(f"Symbol renaming failed: {result.stderr}") - else: - artifact.fake_available = True - self.dry_run.append(" ".join(cmd)) - - -class ArchiveCompilationRule(CompilationRule): - def __init__(self, peano_dir, *args, **kwargs): - self.peano_dir = peano_dir - super().__init__(*args, **kwargs) - - def matches(self, artifacts): - return any( - isinstance(artifact, KernelArchiveArtifact) and len(artifact.depends) > 0 - for artifact in artifacts - ) - - def compile(self, artifacts): - """Create an archive (.a) from compiled object files""" - for artifact in artifacts: - if not isinstance(artifact, KernelArchiveArtifact): - continue - - # Get archive filename from method - archive_path = str(artifact.path) - object_files = [ - str(dep.path) - for dep in artifact.depends - if isinstance(dep, KernelObjectArtifact) - ] - - # Try to find ar tool from PEANO, then system - ar_path = None - - if self.peano_dir: - # Peano has llvm-ar for archiving - peano_ar = Path(self.peano_dir) / "bin" / 
"llvm-ar" - if os.path.exists(peano_ar): - ar_path = peano_ar - - if ar_path is None: - raise RuntimeError( - "Could not find 'ar' tool in PEANO installation or system PATH" - ) - - cmd = [str(ar_path), "rcs", archive_path] + object_files - - if self.dry_run is None: - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode == 0: - logging.debug( - f"Successfully created archive: {Path(archive_path).name}" - ) - else: - raise RuntimeError(f"Archive creation failed: {result.stderr}") - else: - artifact.fake_available = True - self.dry_run.append(" ".join(cmd)) - - return artifacts - - -# Global Functions -# -------------------------------------------------------------------------- - - -def apply_rules(rules, artifacts): - for rule in rules: - if rule.matches(artifacts): - logging.debug(f"Applying rule: {rule.__class__.__name__}") - artifacts = rule.compile(artifacts) - break - else: - # None of the rules matched - return False, artifacts - - return True, artifacts - - -def compile(rules, artifacts): - # While some artifacts remain to be compiled (not all are available) - while not all(artifact.is_available() for artifact in artifacts): - remaining = [artifact for artifact in artifacts if not artifact.is_available()] - success, artifacts = apply_rules(rules, remaining) - if not success: - raise RuntimeError( - f"No matching rule to compile target(s): {', '.join(str(artifact.path.name) for artifact in artifacts if not artifact.is_available())}" - ) - return artifacts - - -def get_work_list(artifacts): - """ - Return a flattened artifact creation worklist in reverse topological order from dependencies. - The returned list will start with leaf nodes (artifacts with no dependencies), and any following artifacts will only contain artifacts from earlier in the list. - """ - work_list = [] - todo = list(artifacts) - visited = set() - - def dfs_visit(artifact): - if artifact in visited: - # Thanks to uniquing of artifact objects, this avoids duplicate creation of the same artifacts - return - visited.add(artifact) - # First visit all dependencies, so put leaves first (post-order) ... - for dep in artifact.depends: - dfs_visit(dep) - # ... then put parent - if not artifact.is_available(): - work_list.append(artifact) - - for artifact in todo: - dfs_visit(artifact) - - return work_list diff --git a/iron/common/compilation/__init__.py b/iron/common/compilation/__init__.py new file mode 100644 index 00000000..9cca95c6 --- /dev/null +++ b/iron/common/compilation/__init__.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from .base import * +from .aie import * diff --git a/iron/common/compilation/aie.py b/iron/common/compilation/aie.py new file mode 100644 index 00000000..7e784b1c --- /dev/null +++ b/iron/common/compilation/aie.py @@ -0,0 +1,507 @@ +# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path +import os.path +import importlib.util +from contextlib import nullcontext +from aie.extras.context import mlir_mod_ctx +from .base import ( + CompilationArtifact, + SourceArtifact, + CompilationRule, + ShellCompilationCommand, + PythonCallbackCompilationCommand, +) + +# AIE Artifacts +# ########################################################################## + + +class FullElfArtifact(CompilationArtifact): + def __init__(self, filename, mlir_input, dependencies): + if mlir_input not in dependencies: + dependencies = dependencies + [mlir_input] + super().__init__(filename, dependencies) + self.mlir_input = mlir_input + + +class XclbinArtifact(CompilationArtifact): + def __init__( + self, + filename, + mlir_input, + dependencies, + kernel_name="MLIR_AIE", + extra_flags=None, + xclbin_input=None, + ): + if mlir_input not in dependencies: + dependencies = dependencies + [mlir_input] + super().__init__(filename, dependencies) + self.mlir_input = mlir_input + self.kernel_name = kernel_name + self.extra_flags = extra_flags if extra_flags is not None else [] + self.xclbin_input = xclbin_input + + +class InstsBinArtifact(CompilationArtifact): + def __init__(self, filename, mlir_input, dependencies, extra_flags=None): + self.mlir_input = mlir_input + if mlir_input not in dependencies: + dependencies = dependencies + [mlir_input] + super().__init__(filename, dependencies) + self.extra_flags = extra_flags if extra_flags is not None else [] + + +class KernelObjectArtifact(CompilationArtifact): + def __init__( + self, + filename, + dependencies, + extra_flags=None, + rename_symbols=None, + prefix_symbols=None, + ): + super().__init__(filename, dependencies) + self.extra_flags = extra_flags if extra_flags is not None else [] + self.rename_symbols = rename_symbols if rename_symbols is not None else {} + self.prefix_symbols = prefix_symbols + + +class KernelArchiveArtifact(CompilationArtifact): + pass + + +class PythonGeneratedMLIRArtifact(CompilationArtifact): + def __init__( + self, + filename, + import_path, + callback_fn, + callback_args=None, + callback_kwargs=None, + requires_context=False, + uses_kernel_archive=False, + kernel_archive=None, + ): + self.import_path = import_path + self.callback_fn = callback_fn + self.callback_args = callback_args if callback_args is not None else [] + self.callback_kwargs = callback_kwargs if callback_kwargs is not None else {} + self.requires_context = requires_context + dependencies = [SourceArtifact(import_path)] + super().__init__(filename, dependencies=dependencies) + + +# AIE Rules +# ########################################################################## + + +class GenerateMLIRFromPythonCompilationRule(CompilationRule): + def matches(self, graph): + return any(graph.get_worklist(PythonGeneratedMLIRArtifact)) + + def compile(self, graph): + """Generate MLIR from a Python callback that uses the MLIR bindings""" + commands = [] + worklist = graph.get_worklist(PythonGeneratedMLIRArtifact) + for artifact in worklist: + new_artifact = SourceArtifact(artifact.filename) + # To make Python capture variables in this closure by value, not by reference, use default arguments + callback = lambda new_artifact=new_artifact, import_path=artifact.import_path, callback_fn=artifact.callback_fn, callback_args=artifact.callback_args, callback_kwargs=artifact.callback_kwargs, requires_context=artifact.requires_context: self.generate_mlir( + new_artifact, + import_path, + callback_fn, + callback_args, + 
callback_kwargs, + requires_context, + ) + commands.append(PythonCallbackCompilationCommand(callback)) + new_artifact.available = True + graph.replace(artifact, new_artifact) + return commands + + @staticmethod + def generate_mlir( + output_artifact, + import_path, + callback_fn, + callback_args=None, + callback_kwargs=None, + requires_context=False, + ): + # Import the Python source file + spec = importlib.util.spec_from_file_location( + Path(import_path).name, import_path + ) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + # We only initiate an MLIR context if requested; otherwise, it is expected that the callback creates the context + ctx_callback = lambda: (mlir_mod_ctx() if requires_context else nullcontext()) + with ctx_callback() as ctx: + callback_function = getattr(module, callback_fn) + mlir_code = callback_function(*callback_args, **callback_kwargs) + # Stringify the generated MLIR + if requires_context: + mlir_code = str(ctx.module) + else: + mlir_code = str(mlir_code) + + with open(output_artifact.filename, "w") as f: + f.write(mlir_code) + + +class AieccCompilationRule(CompilationRule): + def __init__(self, build_dir, peano_dir, mlir_aie_dir, *args, **kwargs): + self.build_dir = build_dir + self.aiecc_path = Path(mlir_aie_dir) / "bin" / "aiecc.py" + self.peano_dir = peano_dir + super().__init__(*args, **kwargs) + + +class AieccFullElfCompilationRule(AieccCompilationRule): + def matches(self, graph): + return any(graph.get_worklist(FullElfArtifact)) + + def compile(self, graph): + worklist = graph.get_worklist(FullElfArtifact) + commands = [] + + for artifact in worklist: + compile_cmd = [ + "python", + str(self.aiecc_path), + "--no-compile-host", + "--no-xchesscc", + "--no-xbridge", + "--peano", + str(self.peano_dir), + "--dynamic-objFifos", + "--expand-load-pdis", + "--generate-full-elf", + "--full-elf-name", + os.path.abspath(artifact.filename), + os.path.abspath(artifact.mlir_input.filename), + ] + commands.append( + ShellCompilationCommand(compile_cmd, cwd=str(self.build_dir)) + ) + artifact.available = True + + return commands + + +class AieccXclbinInstsCompilationRule(AieccCompilationRule): + def matches(self, graph): + return any(graph.get_worklist((XclbinArtifact, InstsBinArtifact))) + + def compile(self, graph): + # Group artifacts by their unique compilation configuration + xclbin_configs = {} + insts_configs = {} + worklist = graph.get_worklist((XclbinArtifact, InstsBinArtifact)) + + for artifact in worklist: + mlir_dependency = artifact.mlir_input + if isinstance(artifact, XclbinArtifact): + key = ( + mlir_dependency, + artifact.kernel_name, + tuple(artifact.extra_flags), + artifact.xclbin_input, + ) + xclbin_configs.setdefault(key, []).append(artifact) + elif isinstance(artifact, InstsBinArtifact): + key = (mlir_dependency, tuple(artifact.extra_flags)) + insts_configs.setdefault(key, []).append(artifact) + + commands = [] + handled_insts_configs = set() + + # Iterate through XCLBIN configurations + for xclbin_key, xclbin_artifacts in xclbin_configs.items(): + mlir_source, kernel_name, xclbin_flags, xclbin_input = xclbin_key + + # Try to find a matching InstsBin configuration (same MLIR source) + matching_insts_key = None + for insts_key in insts_configs: + if ( + insts_key not in handled_insts_configs + and insts_key[0] == mlir_source + ): + matching_insts_key = insts_key + break + + compile_cmd = [ + "python", + str(self.aiecc_path), + "--no-compile-host", + "--no-xchesscc", + "--no-xbridge", + "--peano", + 
str(self.peano_dir), + "--dynamic-objFifos", + ] + + # Add XCLBIN flags + first_xclbin = xclbin_artifacts[0] + compile_cmd += list(xclbin_flags) + [ + "--aie-generate-xclbin", + "--xclbin-name=" + os.path.abspath(first_xclbin.filename), + "--xclbin-kernel-name=" + kernel_name, + ] + if xclbin_input is not None: + compile_cmd += [ + "--xclbin-input=" + os.path.abspath(xclbin_input.filename) + ] + + # Add InstsBin flags if matching config found + if matching_insts_key: + handled_insts_configs.add(matching_insts_key) + insts_artifacts = insts_configs[matching_insts_key] + first_insts = insts_artifacts[0] + compile_cmd += list(matching_insts_key[1]) + [ + "--aie-generate-npu", + "--npu-insts-name=" + os.path.abspath(first_insts.filename), + ] + + compile_cmd += [os.path.abspath(mlir_source.filename)] + + # If the MLIR source depends on a kernel archive, pass it to aiecc.py so it can be linked + if ( + isinstance(mlir_source, PythonGeneratedMLIRArtifact) + and "kernel_archive" in mlir_source.callback_kwargs + ): + compile_cmd.append( + os.path.abspath( + os.path.join( + self.build_dir, + mlir_source.callback_kwargs["kernel_archive"], + ) + ) + ) + + commands.append( + ShellCompilationCommand(compile_cmd, cwd=str(self.build_dir)) + ) + + # Copy for other XCLBIN artifacts with same config + if len(xclbin_artifacts) > 1: + for copy_dest in xclbin_artifacts[1:]: + commands.append( + ShellCompilationCommand( + ["cp", first_xclbin.filename, copy_dest.filename] + ) + ) + + # Copy for other InstsBin artifacts with same config (if matched) + if matching_insts_key: + insts_artifacts = insts_configs[matching_insts_key] + if len(insts_artifacts) > 1: + first_insts = insts_artifacts[0] + for copy_dest in insts_artifacts[1:]: + commands.append( + ShellCompilationCommand( + ["cp", first_insts.filename, copy_dest.filename] + ) + ) + + # Handle remaining InstsBin configurations + for insts_key, insts_artifacts in insts_configs.items(): + if insts_key in handled_insts_configs: + continue + + mlir_source, insts_flags = insts_key + first_insts = insts_artifacts[0] + + compile_cmd = [ + "python", + str(self.aiecc_path), + "--no-compile-host", + "--no-xchesscc", + "--no-xbridge", + "--peano", + str(self.peano_dir), + "--dynamic-objFifos", + "--no-compile", + ] + + compile_cmd += list(insts_flags) + [ + "--aie-generate-npu", + "--npu-insts-name=" + os.path.abspath(first_insts.filename), + ] + + compile_cmd += [os.path.abspath(mlir_source.filename)] + + # If the MLIR source depends on a kernel archive, pass it to aiecc.py so it can be linked + if ( + isinstance(mlir_source, PythonGeneratedMLIRArtifact) + and "kernel_archive" in mlir_source.callback_kwargs + ): + compile_cmd.append( + os.path.abspath( + os.path.join( + self.build_dir, + mlir_source.callback_kwargs["kernel_archive"], + ) + ) + ) + + commands.append( + ShellCompilationCommand(compile_cmd, cwd=str(self.build_dir)) + ) + + # Copy for other InstsBin artifacts with same config + if len(insts_artifacts) > 1: + for copy_dest in insts_artifacts[1:]: + commands.append( + ShellCompilationCommand( + ["cp", first_insts.filename, copy_dest.filename] + ) + ) + + # Update graph + for artifact in worklist: + artifact.available = True + + return commands + + +class PeanoCompilationRule(CompilationRule): + def __init__(self, peano_dir, mlir_aie_dir, *args, **kwargs): + self.peano_dir = peano_dir + self.mlir_aie_dir = mlir_aie_dir + super().__init__(*args, **kwargs) + + def matches(self, artifacts): + return any(artifacts.get_worklist(KernelObjectArtifact)) + + def 
compile(self, artifacts):
+        clang_path = Path(self.peano_dir) / "bin" / "clang++"
+        include_path = Path(self.mlir_aie_dir) / "include"
+        worklist = artifacts.get_worklist(KernelObjectArtifact)
+        commands = []
+        for artifact in worklist:
+            if len(artifact.dependencies) != 1:
+                raise RuntimeError(
+                    "Expected exactly one dependency (the C source code) for KernelObjectArtifact"
+                )
+            source_file = artifact.dependencies[0]
+            if not isinstance(source_file, SourceArtifact):
+                raise RuntimeError(
+                    "Expected KernelObject dependency to be a C source file"
+                )
+
+            cmd = (
+                [
+                    str(clang_path),
+                    "-O2",
+                    "-std=c++20",
+                    "--target=aie2p-none-unknown-elf",
+                    "-Wno-parentheses",
+                    "-Wno-attributes",
+                    "-Wno-macro-redefined",
+                    "-Wno-empty-body",
+                    "-Wno-missing-template-arg-list-after-template-kw",
+                    f"-I{str(include_path)}",
+                ]
+                + artifact.extra_flags
+                + ["-c", source_file.filename, "-o", artifact.filename]
+            )
+
+            commands.append(ShellCompilationCommand(cmd))
+            if artifact.rename_symbols:
+                commands.extend(self._rename_symbols(artifact))
+            if artifact.prefix_symbols:
+                commands.extend(self._prefix_symbols(artifact, artifact.prefix_symbols))
+            artifact.available = True
+
+        return commands
+
+    def _rename_symbols(self, artifact):
+        objcopy_path = "llvm-objcopy-18"
+        cmd = [
+            objcopy_path,
+        ]
+        for old_sym, new_sym in artifact.rename_symbols.items():
+            cmd += [
+                "--redefine-sym",
+                f"{old_sym}={new_sym}",
+            ]
+        cmd += [artifact.filename]
+        return [ShellCompilationCommand(cmd)]
+
+    def _prefix_symbols(self, artifact, prefix):
+        objcopy_path = "llvm-objcopy-18"
+        nm_path = "llvm-nm-18"
+        symbol_map_file = artifact.filename + ".symbol_map"
+
+        # Extract defined symbols and create symbol map
+        nm_cmd = [
+            "sh",
+            "-c",
+            f"{nm_path} --defined-only --extern-only {artifact.filename} | "
+            f"awk '{{print $3 \" {prefix}\" $3}}' > {symbol_map_file}",
+        ]
+
+        # Apply the renaming using the symbol map
+        objcopy_cmd = [
+            objcopy_path,
+            "--redefine-syms=" + symbol_map_file,
+            artifact.filename,
+        ]
+
+        return [ShellCompilationCommand(nm_cmd), ShellCompilationCommand(objcopy_cmd)]
+
+
+class ArchiveCompilationRule(CompilationRule):
+    def __init__(self, peano_dir, *args, **kwargs):
+        self.peano_dir = peano_dir
+        super().__init__(*args, **kwargs)
+
+    def matches(self, artifacts):
+        return any(artifacts.get_worklist(KernelArchiveArtifact))
+
+    def compile(self, artifacts):
+        """Create an archive (.a) from compiled object files"""
+        worklist = artifacts.get_worklist(KernelArchiveArtifact)
+        commands = []
+        for artifact in worklist:
+            # Destination archive and the object files to bundle into it
+            archive_path = artifact.filename
+            object_files = [
+                dep.filename
+                for dep in artifact.dependencies
+                if isinstance(dep, KernelObjectArtifact)
+            ]
+
+            # Find the ar tool in the PEANO installation
+            ar_path = None
+
+            if self.peano_dir:
+                # Peano has llvm-ar for archiving
+                peano_ar = Path(self.peano_dir) / "bin" / "llvm-ar"
+                if os.path.exists(peano_ar):
+                    ar_path = peano_ar
+
+            if ar_path is None:
+                raise RuntimeError(
+                    "Could not find 'llvm-ar' in the PEANO installation"
+                )
+
+            cmd = [str(ar_path), "rcs", archive_path] + object_files
+            commands.append(ShellCompilationCommand(cmd))
+
+            # Check for duplicate symbol definitions in the archive
+            check_cmd = [
+                "sh",
+                "-c",
+                f"nm {archive_path} | grep ' [TDR] ' | awk '{{print $3}}' | sort | uniq -d | "
+                f'if read sym; then echo "Error: Duplicate symbol in archive: $sym" >&2; exit 1; fi',
+            ]
+            commands.append(ShellCompilationCommand(check_cmd))
+
+            artifact.available = True
+
+        return commands
diff --git a/iron/common/compilation/base.py b/iron/common/compilation/base.py
new file mode 100644
index 00000000..84db532b
--- /dev/null
+++ b/iron/common/compilation/base.py
@@ -0,0 +1,328 @@
+# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This file implements a simple Python-based build system. You specify what you
+want to compile (*artifacts*) through subclasses of `CompilationArtifact`.
+Multiple `CompilationArtifacts` form a `CompilationArtifactGraph`. Each artifact
+can have a list (subgraph) of dependencies: other artifacts that it relies on.
+Each artifact corresponds to exactly one file.
+
+There is a special artifact for source files that do not need to be generated,
+`SourceArtifact`. It is likely that in your compilation dependency graph,
+the leaf nodes will be `SourceArtifact`s.
+
+You specify how to generate (compile) an artifact through *rules*, which are
+expressed as subclasses of `CompilationRule`. Rules must implement two methods:
+`matches` and `compile`. If a rule `matches` an artifact graph, it can be
+applied. Applying a rule is done by calling `compile`; this transforms the
+artifact graph (in the simplest case, marks one of the artifacts as available)
+and returns a list of compilation commands.
+
+At this point, we can print the compilation commands to the console (dry-run)
+or actually run them to generate the artifacts.
+
+Before starting compilation, you may call
+`populate_availability_from_filesystem()` -- this will check whether any
+artifacts are already available at the given file paths (and ensure that
+dependencies are no newer than the artifacts that depend on them). This way,
+you can avoid recompiling artifacts that are already up-to-date on disk. If
+you wish to regenerate everything, you can skip this step, but you will at a
+minimum want to mark the `SourceArtifact`s as available -- they cannot be generated.
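+
+A minimal usage sketch (illustrative only: `CopyArtifact`, `CopyRule`, and the
+file names are hypothetical, and `in.txt` is assumed to already exist in the
+build directory):
+
+    class CopyArtifact(CompilationArtifact):
+        pass
+
+    class CopyRule(BatchRule):
+        artifact_type = CopyArtifact
+
+        def create_commands(self, artifacts):
+            # One `cp` from the single source dependency to the target
+            return [
+                ShellCompilationCommand(
+                    ["cp", a.dependencies[0].filename, a.filename]
+                )
+                for a in artifacts
+            ]
+
+    graph = CompilationArtifactGraph(
+        [CopyArtifact("out.txt", dependencies=[SourceArtifact("in.txt")])]
+    )
+    compile([CopyRule()], graph, build_dir="build", dry_run=True)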
+"""
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+import os.path
+import logging
+import subprocess
+import sys
+
+# Global Functions
+# ##########################################################################
+
+
+def plan(rules, graph):
+    if all(artifact.is_available() for artifact in graph):
+        return []  # Everything has been compiled
+    for rule in rules:
+        if rule.matches(graph):
+            commands = rule.compile(graph)
+            break
+    else:
+        raise RuntimeError(
+            f"No matching rule to compile target(s): {', '.join(artifact.filename for artifact in graph)}"
+        )
+    return [(rule, commands)] + plan(rules, graph)
+
+
+def execute(plan_steps):
+    for rule, commands in plan_steps:
+        logging.debug(f"Applying rule: {rule.__class__.__name__}")
+        for command in commands:
+            logging.debug(f"  Executing command: {command}")
+            success = command.run()
+            if not success:
+                raise RuntimeError(f"Command failed: {command}")
+
+
+def compile(rules, artifacts, build_dir="build", dry_run=False):
+    if not os.path.exists(build_dir) and not dry_run:
+        os.makedirs(build_dir)
+    artifacts.move_artifacts(build_dir)
+    artifacts.populate_availability_from_filesystem()
+    plan_steps = plan(rules, artifacts)
+    if not dry_run:
+        execute(plan_steps)
+    else:
+        print("\n".join("\n".join(map(str, cmds)) for _, cmds in plan_steps))
+
+
+# Compilation Artifact Graph
+# ##########################################################################
+
+
+class CompilationArtifactGraph:
+    def __init__(self, artifacts=None):
+        self.artifacts = artifacts if artifacts is not None else []
+
+    def __repr__(self):
+        def format_artifact(artifact, indent=0):
+            prefix = "  " * indent
+            avail = "[x] " if artifact.is_available() else "[ ] "
+            result = f"{prefix}{avail}{artifact.__class__.__name__}({Path(artifact.filename).name})\n"
+            for dep in artifact.dependencies:
+                result += format_artifact(dep, indent + 1)
+            return result
+
+        result = "CompilationArtifactGraph(\n"
+        for artifact in self.artifacts:
+            result += format_artifact(artifact, indent=1)
+        result += ")"
+        return result
+
+    def __iter__(self):
+        return iter(self.artifacts)
+
+    def __len__(self):
+        return len(self.artifacts)
+
+    def __getitem__(self, index):
+        return self.artifacts[index]
+
+    def dfs(self):
+        return self._traverse(True)
+
+    def bfs(self):
+        return self._traverse(False)
+
+    def _traverse(self, dfs):
+        visited = set()
+        todo = self.artifacts.copy()
+        while todo:
+            artifact = todo.pop() if dfs else todo.pop(0)
+            if artifact in visited:
+                continue
+            visited.add(artifact)
+            todo.extend(artifact.dependencies)
+            yield artifact
+
+    def replace(self, old_artifact, new_artifact):
+        for i, artifact in enumerate(self.artifacts):
+            if artifact == old_artifact:
+                self.artifacts[i] = new_artifact
+            else:
+                artifact.dependencies.replace(old_artifact, new_artifact)
+        return self
+
+    def populate_availability_from_filesystem(self):
+        for artifact in self.artifacts:
+            artifact.dependencies.populate_availability_from_filesystem()
+            artifact.available = artifact.is_available_in_filesystem()
+
+    def get_worklist(self, kind):
+        """Return a list of artifacts of the given kind that can be built in the next step (dependencies available)."""
+        return [
+            artifact
+            for artifact in self.bfs()
+            if isinstance(artifact, kind)
+            and not artifact.is_available()
+            and artifact.dependencies_available()
+        ]
+
+    def move_artifacts(self, new_root):
+        """Make all artifact paths point into the build directory"""
+        for artifact in self.bfs():
+            if not os.path.isabs(artifact.filename):
+                artifact.filename = str(Path(new_root) / Path(artifact.filename).name)
+
+    def add(self, artifact):
+        self.artifacts.append(artifact)
+
+
+# Compilation Artifacts
+# ##########################################################################
+
+
+class CompilationArtifact(ABC):
+    def __init__(self, filename, dependencies=None, available=False):
+        self.filename = str(filename)
+        self.dependencies: CompilationArtifactGraph = CompilationArtifactGraph(
+            artifacts=dependencies if dependencies is not None else []
+        )
+        self.available = available
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.filename})"
+
+    def is_available(self):
+        """'Conceptual' availability: during a dry-run or in the planning stage, available may be True even if the underlying file does not exist yet."""
+        # If any of our dependencies are unavailable or outdated, this artifact is also outdated
+        return self.available and self.dependencies_available()
+
+    def dependencies_available(self):
+        return all(d.is_available() for d in self.dependencies)
+
+    def is_available_in_filesystem(self):
+        """'Real' availability: checks if the underlying file exists and is up-to-date with respect to dependencies."""
+        if not os.path.exists(self.filename):
+            return False
+        file_mtime = os.path.getmtime(self.filename)
+        for dependency in self.dependencies:
+            if (
+                not dependency.is_available_in_filesystem()
+                or os.path.getmtime(dependency.filename) > file_mtime
+            ):
+                return False
+        return True
+
+
+class SourceArtifact(CompilationArtifact):
+    """Artifact representing a source file that does not need to be generated and is assumed to already exist."""
+
+    pass
+
+
+# Compilation Command
+# ##########################################################################
+
+
+class CompilationCommand(ABC):
+    """An abstraction for anything that can be executed to physically produce artifacts."""
+
+    @abstractmethod
+    def run(self) -> bool:
+        pass
+
+    @abstractmethod
+    def __repr__(self):
+        pass
+
+
+class ShellCompilationCommand(CompilationCommand):
+    def __init__(self, command: list[str], cwd=None, env="copy"):
+        self.command = command
+        self.cwd = cwd
+        if env == "copy":
+            env = os.environ.copy()
+        self.env = env
+
+    def run(self) -> bool:
+        result = subprocess.run(
+            self.command,
+            capture_output=True,
+            text=True,
+            cwd=self.cwd,
+            env=self.env,
+        )
+        if 0 != result.returncode:
+            print(result.stdout)
+            print(result.stderr, file=sys.stderr)
+        return 0 == result.returncode
+
+    def __repr__(self):
+        return f"Shell({' '.join(self.command)})"
+
+
+class PythonCallbackCompilationCommand(CompilationCommand):
+    def __init__(self, callback):
+        self.callback = callback
+
+    def run(self) -> bool:
+        result = self.callback()
+        return bool(result) if result is not None else True
+
+    def __repr__(self):
+        return f"PythonCallback({self.callback})"
+
+
+# Compilation Rules
+# ##########################################################################
+
+
+class CompilationRule(ABC):
+    """A compilation rule is applied to an artifact graph, producing compilation commands and a transformed artifact graph."""
+
+    @abstractmethod
+    def matches(self, artifact: CompilationArtifactGraph) -> bool:
+        """Return true if this rule can be applied to any artifact in the artifact graph."""
+        pass
+
+    @abstractmethod
+    def compile(self, artifacts: CompilationArtifactGraph) -> list[CompilationCommand]:
+        """Apply this rule to the artifact graph, returning compilation commands.
+        This should modify the artifact graph in-place to reflect the newly generated artifacts."""
+        pass
+
+
+class BatchRule(CompilationRule):
+    """
+    A helper class for rules that process all available artifacts of a certain type in one go.
+    Subclasses should define `artifact_type` and implement `create_commands`.
+    """
+
+    artifact_type = None
+
+    def matches(self, graph):
+        if self.artifact_type is None:
+            raise NotImplementedError(
+                "Subclasses of BatchRule must define artifact_type"
+            )
+        return any(graph.get_worklist(self.artifact_type))
+
+    def compile(self, graph):
+        worklist = graph.get_worklist(self.artifact_type)
+        commands = self.create_commands(worklist)
+        for artifact in worklist:
+            artifact.available = True
+        return commands
+
+    def create_commands(self, artifacts):
+        """
+        Create compilation commands for the given list of artifacts.
+        Must be implemented by subclasses.
+        """
+        raise NotImplementedError
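As a quick orientation to the two `CompilationCommand` flavors above -- shell
commands for external tools, Python callbacks for in-process work -- here is a
minimal sketch; the `stamp.txt` target is hypothetical, purely illustrative:

    from pathlib import Path

    # Spawns `touch stamp.txt`; run() returns True iff the exit code is 0.
    ShellCompilationCommand(["touch", "stamp.txt"]).run()

    # Runs arbitrary Python; a callback returning None is treated as success.
    PythonCallbackCompilationCommand(lambda: Path("stamp.txt").touch()).run()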
diff --git a/iron/common/context.py b/iron/common/context.py
new file mode 100644
index 00000000..d33c6888
--- /dev/null
+++ b/iron/common/context.py
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import logging
+from pathlib import Path
+import os
+
+from .device_manager import AIEDeviceManager, pyxrt
+from . import compilation as comp
+import aie.utils.config
+
+
+class AIEContext:
+    """Context for managing AIE operator compilation and runtime state"""
+
+    def __init__(self, use_runlist=True, build_dir=None):
+        self.operators = []
+        self.static_data_pool = {}
+        self.device_manager = AIEDeviceManager()
+        self.base_dir = Path(__file__).parent.parent.parent
+        self.build_dir = build_dir or Path(os.getcwd()) / "build"
+        self.mlir_aie_dir = Path(aie.utils.config.root_path())
+        self.peano_dir = Path(aie.utils.config.peano_install_dir())
+        # Disabling the XRT runlist sacrifices performance (kernels are executed individually, as separate xclbin invocations) but makes debugging easier: you can tell which part of the runlist execution failed
+        self.use_runlist = use_runlist
+        self.compilation_rules = [
+            comp.GenerateMLIRFromPythonCompilationRule(),
+            comp.PeanoCompilationRule(self.peano_dir, self.mlir_aie_dir),
+            comp.ArchiveCompilationRule(self.peano_dir),
+            comp.AieccXclbinInstsCompilationRule(
+                self.build_dir, self.peano_dir, self.mlir_aie_dir
+            ),
+            comp.AieccFullElfCompilationRule(
+                self.build_dir, self.peano_dir, self.mlir_aie_dir
+            ),
+        ]
+
+    def register_operator(self, operator):
+        """Register an operator with this context"""
+        operator.context = self
+        self.operators.append(operator)
+
+    def compile_all(self):
+        """Compile all registered operators"""
+        self.build_dir.mkdir(parents=True, exist_ok=True)
+        for op in self.operators:
+            op.compile()
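For context, a minimal sketch of how an operator flows through `AIEContext`
(illustrative only: the import paths and the explicit `register_operator` call
are assumptions -- operator constructors may already register themselves via
the `context=` argument):

    from iron.common.context import AIEContext
    from iron.operators import AIEElementwiseAdd

    ctx = AIEContext()  # compilation artifacts land in ./build by default
    op = AIEElementwiseAdd(size=2048, tile_size=256, num_aie_columns=8, context=ctx)
    ctx.register_operator(op)  # possibly redundant, see note above
    ctx.compile_all()  # applies the compilation rules to every operator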
diff --git a/iron/common/device_manager.py b/iron/common/device_manager.py
new file mode 100644
index 00000000..2ae18bfe
--- /dev/null
+++ b/iron/common/device_manager.py
@@ -0,0 +1,107 @@
+# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Global AIE Device Manager for resource sharing and cleanup
+"""
+
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Dict, Optional, Any
+import pyxrt
+from aie.utils.hostruntime.xrtruntime.hostruntime import XRTHostRuntime
+from aie.iron.device import NPU1, NPU2
+
+
+class AIEDeviceManager:
+    """Singleton manager for AIE XRT resources"""
+
+    _instance = None
+    _initialized = False
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self):
+        # Only initialize once
+        if AIEDeviceManager._initialized:
+            return
+        AIEDeviceManager._initialized = True
+
+        self.device = pyxrt.device(0)
+        self.device_type = XRTHostRuntime().device()
+        self.contexts = {}  # xclbin_path -> (context, xclbin)
+        self.kernels = {}  # (xclbin_path, kernel_name) -> kernel
+
+    def get_context_and_kernel(
+        self, xclbin_path: str, kernel_name: str | None = None
+    ) -> tuple[pyxrt.hw_context, pyxrt.kernel]:
+        """Get or create hardware context and kernel for xclbin"""
+        # Check if we already have a context for this xclbin
+
+        if xclbin_path not in self.contexts:
+            xclbin = pyxrt.xclbin(xclbin_path)
+            self.device.register_xclbin(xclbin)
+            xclbin_uuid = xclbin.get_uuid()
+            context = pyxrt.hw_context(self.device, xclbin_uuid)
+            self.contexts[xclbin_path] = (context, xclbin)
+            logging.debug(f"Created new context for {Path(xclbin_path).name}")
+        else:
+            context, xclbin = self.contexts[xclbin_path]
+            logging.debug(f"Reusing context for {Path(xclbin_path).name}")
+
+        # Get kernel name if not provided
+        if kernel_name is None:
+            kernels = xclbin.get_kernels()
+            if not kernels:
+                raise RuntimeError("No kernels found in xclbin")
+            kernel_name = kernels[0].get_name()
+
+        # Check if we already have the kernel
+        kernel_key = (xclbin_path, kernel_name)
+        if kernel_key not in self.kernels:
+            self.kernels[kernel_key] = pyxrt.kernel(context, kernel_name)
+            logging.debug(
+                f"Created new kernel {kernel_name} from xclbin {Path(xclbin_path).name}"
+            )
+        else:
+            logging.debug(
+                f"Reusing kernel: {kernel_name} from xclbin {Path(xclbin_path).name}"
+            )
+
+        return context, self.kernels[kernel_key]
+
+    def device_str(self) -> str:
+        return self.device_type.resolve().name
+
+    def cleanup(self):
+        """Clean up all XRT resources"""
+        self.kernels.clear()
+
+        # Clear contexts
+        for xclbin_path, (context, xclbin) in self.contexts.items():
+            try:
+                del context
+            except Exception:
+                pass
+        self.contexts.clear()
+
+        # Clear device
+        if self.device is not None:
+            try:
+                del self.device
+            except Exception:
+                pass
+            self.device = None
+
+        logging.debug("Cleaned up AIE device manager")
+
+    def reset(self):
+        """Reset the device manager (for debugging)"""
+        self.cleanup()
+        AIEDeviceManager._instance = None
+        AIEDeviceManager._initialized = False
diff --git a/iron/common/test_utils.py b/iron/common/test_utils.py
index dc19df5d..0b8c8da2 100644
--- a/iron/common/test_utils.py
+++ b/iron/common/test_utils.py
@@ -4,8 +4,10 @@
 import time
 import numpy as np
 from ml_dtypes import bfloat16
-from .utils import torch_to_numpy
+from .utils import xrt_to_torch
 import logging
+from .base import MLIROperator, CompositeOperator
+from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor
 
 
 def nearly_equal(
@@ -29,24 +31,24 @@
     return diff < max(abs_tol, rel_tol * norm)
 
 
-def verify_buffer(operator, buf_name, reference, rel_tol=0.04, abs_tol=1e-6):
+def verify_buffer(output, buf_name, reference, rel_tol=0.04, abs_tol=1e-6):
     errors
= [] - expected_np = torch_to_numpy(reference).reshape((-1,)) - buf_size = operator.buffers[buf_name] // 2 - output = operator.read_buffer(buf_name, (buf_size,)) - if len(output) < len(expected_np): + expected = reference.reshape((-1,)) + output = output.reshape((-1,)) + + if len(output) < len(expected): # Allow larger buffers - binning may have allocated more space than needed print( - f"Buffer size mismatch for {buf_name}: expected {len(expected_np)}, got {len(output)}" + f"Buffer size mismatch for {buf_name}: expected {len(expected)}, got {len(output)}" ) - errors.extend(i for i in range(abs(len(output) - len(expected_np)))) - compare_len = min(len(output), len(expected_np)) + errors.extend(i for i in range(abs(len(output) - len(expected)))) + compare_len = min(len(output), len(expected)) for i in range(compare_len): - if not nearly_equal(float(output[i]), float(expected_np[i]), rel_tol, abs_tol): + if not nearly_equal(float(output[i]), float(expected[i]), rel_tol, abs_tol): errors.append(i) if len(errors) <= 10: print( - f"Mismatch in {buf_name}[{i}]: expected {float(expected_np[i]):.6f}, got {float(output[i]):.6f}" + f"Mismatch in {buf_name}[{i}]: expected {float(expected[i]):.6f}, got {float(output[i]):.6f}" ) return errors @@ -65,7 +67,7 @@ def run_test( Run operator test with specified input/output/intermediate buffers. Args: - operator: AIE operator instance with registered buffers + operator: AIE operator instance input_buffers: Dict mapping buffer names to input data arrays output_buffers: Dict mapping buffer names to reference output arrays intermediate_buffers: Optional dict mapping buffer names to reference arrays for validation @@ -83,45 +85,79 @@ def run_test( level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s" ) logger = logging.getLogger(__name__) - operator.context.compile_all() - operator.context.prepare_runtime() - # Run warmup iterations before writing to buffers (warmup iters might corrupt the buffers) + if not isinstance(operator, (MLIROperator, CompositeOperator)): + raise ValueError("run_test only supports MLIROperator or CompositeOperator") + + operator.compile() + op_func = operator.get_callable() + + args = [] + arg_spec = operator.get_arg_spec() + + input_iter = iter(input_buffers.items()) + output_iter = iter(output_buffers.items()) + output_map = {} + + total_bytes = 0 + + for spec in arg_spec: + if spec.direction == "in": + try: + name, data = next(input_iter) + except StopIteration: + raise ValueError("Not enough input buffers provided for arg spec") + buf = XRTTensor.from_torch(data) + args.append(buf) + total_bytes += buf.buffer_object().size() + elif spec.direction == "out": + try: + name, expected = next(output_iter) + except StopIteration: + raise ValueError("Not enough output buffers provided for arg spec") + buf = XRTTensor(spec.shape, dtype=spec.dtype) + args.append(buf) + output_map[name] = buf + total_bytes += buf.buffer_object().size() + else: + # Handle other directions if needed, or raise error + raise ValueError(f"Unsupported direction: {spec.direction}") + + # Run warmup iterations for _ in range(warmup_iters): - operator.run_runlist() # warmup run to configure - - # Write input buffers and zero outputs - for buf_name in output_buffers: - buf_size = operator.buffers[buf_name] - operator.write_buffer(buf_name, np.zeros(buf_size, dtype=np.uint8)) - # Operator may share the same buffer object for inputs and outputs; hence, write input after outputs - for buf_name, data in input_buffers.items(): - data_np = 
torch_to_numpy(data) - operator.write_buffer(buf_name, data_np) + op_func(*args) # Run operator - elapsed_total = 0 + start_time = time.time() for _ in range(timed_iters): - elapsed_total += operator.run_runlist() - elapsed = elapsed_total / timed_iters + op_func(*args) + end_time = time.time() + + elapsed = (end_time - start_time) / timed_iters latency_us = elapsed * 1e6 # Verify outputs errors = {} for buf_name, expected in output_buffers.items(): - buf_errors = verify_buffer(operator, buf_name, expected, rel_tol, abs_tol) - if buf_errors: - errors[buf_name] = buf_errors - - for buf_name, expected in intermediate_buffers.items(): - buf_errors = verify_buffer(operator, buf_name, expected, rel_tol, abs_tol) - if buf_errors: - errors[buf_name] = buf_errors + if expected is None: + continue + if buf_name in output_map: + buf = output_map[buf_name] + output_torch = xrt_to_torch(buf) + buf_errors = verify_buffer( + output_torch, buf_name, expected, rel_tol, abs_tol + ) + if buf_errors: + errors[buf_name] = buf_errors + else: + print(f"Warning: Output buffer {buf_name} not found in operator arguments") + + # Intermediate buffers are not supported in this generic run_test + # unless we expose them somehow. For now, ignore or warn. + if intermediate_buffers: + print("Warning: intermediate_buffers verification is not supported in run_test") # Calculate bandwidth - input_bytes = sum(operator.buffers[buf_name] for buf_name in input_buffers) - output_bytes = sum(operator.buffers[buf_name] for buf_name in output_buffers) - total_bytes = input_bytes + output_bytes bandwidth_gbps = total_bytes / (latency_us * 1e-6) / 1e9 return errors, latency_us, bandwidth_gbps diff --git a/iron/common/utils.py b/iron/common/utils.py index 9037fbd8..9966b1dd 100644 --- a/iron/common/utils.py +++ b/iron/common/utils.py @@ -21,32 +21,28 @@ } -def torch_to_numpy(tensor: torch.Tensor) -> np.ndarray: - # Detach (to drop grad) and ensure on CPU - t = tensor.detach() - if t.device.type != "cpu": - t = t.cpu() - # Ensure contiguous for safe view operations - if not t.is_contiguous(): - t = t.contiguous() - - if t.dtype == torch.bfloat16: - # View the same memory as uint16, then as NumPy bfloat16 - # This avoids numeric conversion and extra passes over memory. - u16_np = t.view(torch.uint16).numpy() # shares memory - return u16_np.view(np.dtype("bfloat16")) # reinterpret - - return t.numpy() - - -def numpy_to_torch(array: np.ndarray) -> torch.Tensor: - # Ensure contiguous to let from_numpy create a view - if not array.flags["C_CONTIGUOUS"]: - array = np.ascontiguousarray(array) - - if array.dtype == np.dtype("bfloat16"): - # reinterpret the same memory as uint16, then view as torch.bfloat16 - t_u16 = torch.from_numpy(array.view(np.uint16)) - return t_u16.view(torch.bfloat16) # view - - return torch.from_numpy(array) +def xrt_to_torch(xrttensor) -> torch.Tensor: + """ + Convert an XRTTensor (or compatible object with buffer_object()) to a Torch tensor + without intermediate numpy array creation, supporting bfloat16. 
+ """ + dtype_map = { + np.dtype("float32"): torch.float32, + np.dtype("int32"): torch.int32, + np.dtype("int16"): torch.int16, + np.dtype("int8"): torch.int8, + np.dtype("uint8"): torch.uint8, + np.dtype("float16"): torch.float16, + np.dtype(bfloat16): torch.bfloat16, + bfloat16: torch.bfloat16, + } + + torch_dtype = dtype_map.get(xrttensor.dtype) + if torch_dtype is None: + raise ValueError(f"Unsupported dtype: {xrttensor.dtype}") + + xrttensor.to("cpu") + bo = xrttensor.buffer_object() + mem = bo.map() + t = torch.frombuffer(mem, dtype=torch_dtype) + return t.reshape(xrttensor.shape) diff --git a/iron/operators/__init__.py b/iron/operators/__init__.py index fc203892..1ad3044e 100644 --- a/iron/operators/__init__.py +++ b/iron/operators/__init__.py @@ -1,24 +1,15 @@ # SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from .axpy.op import AIEAXPY -from .dequant.op import AIEDequant from .elementwise_add.op import AIEElementwiseAdd from .elementwise_mul.op import AIEElementwiseMul -from .gelu.op import AIEGELU from .gemm.op import AIEGEMM from .gemv.op import AIEGEMV -from .layer_norm.op import AIELayerNorm -from .leaky_relu.op import AIELeakyReLU -from .mem_copy.op import AIEMemCopy from .mha.op import AIEMHA -from .relu.op import AIEReLU from .rms_norm.op import AIERMSNorm from .rope.op import AIERope -from .sigmoid.op import AIESigmoid from .silu.op import AIESiLU from .softmax.op import AIESoftmax from .swiglu_decode.op import AIESwiGLUDecode from .swiglu_prefill.op import AIESwiGLUPrefill -from .tanh.op import AIETanh from .transpose.op import AIETranspose diff --git a/iron/operators/axpy/design.py b/iron/operators/axpy/design.py index 69468940..bfa676f8 100644 --- a/iron/operators/axpy/design.py +++ b/iron/operators/axpy/design.py @@ -16,7 +16,14 @@ def my_axpy( - dev, num_elements, num_columns, num_channels, tile_size, trace_size, scalar_factor + dev, + num_elements, + num_columns, + num_channels, + tile_size, + trace_size, + scalar_factor, + kernel_archive=None, ): factor = scalar_factor per_tile_elements = 4096 if tile_size > 4096 else tile_size diff --git a/iron/operators/axpy/op.py b/iron/operators/axpy/op.py index ce1702c6..37e66a33 100644 --- a/iron/operators/axpy/op.py +++ b/iron/operators/axpy/op.py @@ -7,8 +7,8 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -17,7 +17,7 @@ ) -class AIEAXPY(AIEOperatorBase): +class AIEAXPY(MLIROperator): """AIE-accelerated aX + Y operator""" def __init__( @@ -30,25 +30,26 @@ def __init__( context=None, ): max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + assert ( + size % max_multiple == 0 + ), "size must be multiple of num_aie_columns * tile_size" + assert size % tile_size == 0, "size must be multiple of tile_size" + + self.size = size self.tile_size = tile_size self.num_aie_columns = num_aie_columns self.num_channels = num_channels self.scalar_factor = scalar_factor - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"axpy_{self.num_aie_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t_{self.scalar_factor}s" - def set_up_artifacts(self): + 
def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"axpy_{self.num_aie_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t_{self.scalar_factor}s" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_axpy", callback_args=[ @@ -62,68 +63,21 @@ def set_up_artifacts(self): ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"axpy.o", - depends=[ - SourceArtifact.new( - self.context.base_dir - / "aie_kernels" - / "generic" - / "axpy.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - self.add_artifacts([xclbin_artifact, insts_artifact]) - - def set_up_runtime(self): - self.add_buffer("x", self.size) - self.add_buffer("y", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "axpy", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("axpy", "x", "y", "output") - - def forward(self, x, y): - if x.numel() > self.size or y.numel() > self.size: - raise AIEOperatorConstraintError( - "AIEAXPY: input too large for configured size" - ) - if x.numel() != y.numel(): - raise AIEOperatorConstraintError("AIEAXPY: sizes of X and Y do not match") - - original_shape = x.shape - x_flat = x.reshape(-1) - y_flat = y.reshape(-1) - - pad_len = self.size - x_flat.numel() - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - y_flat = torch.nn.functional.pad(y_flat, (0, pad_len)) - - self.write_buffer("x", x_flat) - self.write_buffer("y", y_flat) - self.write_buffer("output", np.zeros(self.size, dtype=bfloat16)) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=(self.size,), dtype=bfloat16) - - if pad_len > 0: - result = result[: x_flat.numel() - pad_len] - - return result.reshape(*original_shape) + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"axpy.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "generic" / "axpy.cc" + ) + ], + ), + ] + + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.size,)), # x + AIERuntimeArgSpec("in", (self.size,)), # y + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/axpy/test.py b/iron/operators/axpy/test.py index b91e802f..8fc84ef0 100755 --- a/iron/operators/axpy/test.py +++ b/iron/operators/axpy/test.py @@ -12,40 +12,34 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 num_channels = 2 - input_lengths = [2048] if not extensive else [1024, 2048, 4096, 8192] - scalar_factors = [3.0] if not extensive else [3.0, 10.0] + input_lengths = [1024, 2048, 4096, 8192] + scalar_factors = [3.0, 10.0] params = [] - names = [] for input_length in input_lengths: for num_aie_columns in range(1, max_aie_columns + 1): tile_size = input_length // num_aie_columns if tile_size * num_aie_columns != input_length: continue for scalar in scalar_factors: - names.append( - f"axpy_{num_aie_columns}_cols_{num_channels}_channels_{input_length}_tile_{tile_size}_{scalar}" - ) + # Determine if this is a regular test case + is_regular = input_length == 2048 and scalar == 3.0 + 
marks = [] if is_regular else [pytest.mark.extensive] + params.append( - (input_length, num_aie_columns, num_channels, tile_size, scalar) + pytest.param( + input_length, + num_aie_columns, + num_channels, + tile_size, + scalar, + marks=marks, + ) ) - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -54,7 +48,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size,scalar_factor", - all_params, + get_params(), ) def test_axpy( input_length, num_aie_columns, num_channels, tile_size, scalar_factor, aie_context diff --git a/iron/operators/dequant/design.py b/iron/operators/dequant/design.py index 05cf2ddd..07c3e3bf 100644 --- a/iron/operators/dequant/design.py +++ b/iron/operators/dequant/design.py @@ -16,7 +16,14 @@ def my_dequant_kernel( - dev, num_elements, num_columns, num_channels, trace_size, tile_size, group_size + dev, + num_elements, + num_columns, + num_channels, + trace_size, + tile_size, + group_size, + kernel_archive=None, ): per_tile_elements = ( 16384 if tile_size > 16384 else tile_size diff --git a/iron/operators/dequant/op.py b/iron/operators/dequant/op.py index d4aeab8a..8fd3e933 100644 --- a/iron/operators/dequant/op.py +++ b/iron/operators/dequant/op.py @@ -7,8 +7,8 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -17,7 +17,7 @@ ) -class AIEDequant(AIEOperatorBase): +class AIEDequant(MLIROperator): def __init__( self, @@ -46,17 +46,15 @@ def __init__( assert self.size % total_cores == 0, "Size must be divisible by total cores" assert total_cores <= 16, "Total cores (columns * channels) must be <= 16" - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"dequant_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - def set_up_artifacts(self): + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"dequant_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_dequant_kernel", callback_args=[ @@ -70,68 +68,24 @@ def set_up_artifacts(self): ], ) - # Build the kernel object file with the appropriate tile size and group size - kernel_artifact = KernelObjectArtifact.new( - f"expand_aie2_{self.tile_size}.o", - depends=[ - SourceArtifact.new( - self.context.base_dir / "aie_kernels" / "generic" / "expand.cc" - ) - ], - extra_flags=[ - f"-DTILE_SIZE={self.tile_size}", - f"-DGROUP_SIZE={self.group_size}", - ], - ) - - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[mlir_artifact, kernel_artifact], - ) - - insts_artifact = InstsBinArtifact.new( - 
f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def set_up_runtime(self): - # Input buffer uses uint8 dtype, output uses bfloat16 - self.add_buffer("input", self.input_size, dtype=np.uint8) - self.add_buffer("output", self.output_size, dtype=bfloat16) - self.add_kernel( - "dequant", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("dequant", "input", "output") - - def forward(self, x_packed): - """ - Forward pass for dequantization. - - Args: - x_packed: Packed uint8 numpy array containing int4 data + scale factors - - Returns: - Dequantized bfloat16 torch tensor - """ - if x_packed.size != self.input_size: - raise AIEOperatorConstraintError( - f"AIEDequant: input size {x_packed.size} does not match expected size {self.input_size}" + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"expand_aie2_{self.tile_size}.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "generic" / "expand.cc" + ) + ], + extra_flags=[ + f"-DTILE_SIZE={self.tile_size}", + f"-DGROUP_SIZE={self.group_size}", + ], ) + ] - # Write input and execute - self.write_buffer("input", x_packed.flatten()) - self.write_buffer("output", np.zeros(self.output_size, dtype=bfloat16)) - self.run_runlist() - result = self.read_buffer_as_torch( - "output", shape=(self.output_size,), dtype=bfloat16 - ) - - return result + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.input_size,), dtype=np.uint8), # input + AIERuntimeArgSpec("out", (self.output_size,), dtype=bfloat16), # output + ] diff --git a/iron/operators/dequant/test.py b/iron/operators/dequant/test.py index 03b037f4..a4678199 100644 --- a/iron/operators/dequant/test.py +++ b/iron/operators/dequant/test.py @@ -12,12 +12,11 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): - input_lengths = [2048] if not extensive else [1024, 2048, 4096, 8192] +def get_params(): + input_lengths = [1024, 2048, 4096, 8192] group_size = 32 params = [] - names = [] for input_length in input_lengths: for num_columns in range(1, 9): # 1 to 8 columns for num_channels in range(1, 3): # 1 or 2 channels @@ -30,26 +29,20 @@ def generate_test_params(extensive=False): # Only proceed if tile_size * total_cores == input_length (exact division) if tile_size * total_cores == input_length: - names.append( - f"dequant_{num_columns}_cols_{num_channels}_channels_{input_length}_tile_{tile_size}" - ) + is_regular = input_length == 2048 + marks = [] if is_regular else [pytest.mark.extensive] + params.append( - (input_length, num_columns, num_channels, tile_size, group_size) + pytest.param( + input_length, + num_columns, + num_channels, + tile_size, + group_size, + marks=marks, + ) ) - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -58,7 +51,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( 
"input_length,num_aie_columns,num_channels,tile_size,group_size", - all_params, + get_params(), ) def test_dequant( input_length, num_aie_columns, num_channels, tile_size, group_size, aie_context diff --git a/iron/operators/elementwise_add/design.py b/iron/operators/elementwise_add/design.py index d1eda376..246331b7 100644 --- a/iron/operators/elementwise_add/design.py +++ b/iron/operators/elementwise_add/design.py @@ -15,7 +15,15 @@ from aie.helpers.util import np_ndarray_type_get_shape -def my_eltwise_add(dev, num_elements, num_columns, num_channels, tile_size, trace_size): +def my_eltwise_add( + dev, + num_elements, + num_columns, + tile_size, + trace_size, + kernel_archive, + func_prefix="", +): per_tile_elements = 4096 if tile_size > 4096 else tile_size n = per_tile_elements * num_columns if num_elements % n != 0: @@ -37,7 +45,9 @@ def my_eltwise_add(dev, num_elements, num_columns, num_channels, tile_size, trac # AIE Core Function declaration eltwise_add_bf16_vector = Kernel( - "eltwise_add_bf16_vector", "add.o", [tile_ty, tile_ty, tile_ty, np.int32] + f"{func_prefix}eltwise_add_bf16_vector", + kernel_archive, + [tile_ty, tile_ty, tile_ty, np.int32], ) # Define a task that will run on a compute tile diff --git a/iron/operators/elementwise_add/op.py b/iron/operators/elementwise_add/op.py index d1963723..7d2dd7a7 100644 --- a/iron/operators/elementwise_add/op.py +++ b/iron/operators/elementwise_add/op.py @@ -8,8 +8,8 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -19,152 +19,62 @@ ) -class AIEElementwiseAdd(AIEOperatorBase): +class AIEElementwiseAdd(MLIROperator): """AIE-accelerated element-wise addition""" def __init__( self, size, - num_aie_columns=None, - num_channels=None, - tile_size=None, + tile_size, + num_aie_columns=8, context=None, ): - max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + assert ( + size % (num_aie_columns * tile_size) == 0 + ), "size must be multiple of num_aie_columns * tile_size" + self.size = size self.tile_size = tile_size - self.num_aie_columns = num_aie_columns - self.num_channels = num_channels # Enforce ShimDMA limits for elementwise_add (uses 2 inputs per core) # Maximum safe configuration: 8 columns × 2 channels = 16 ShimDMA channels - total_shimdma_channels = self.num_aie_columns * self.num_channels + total_shimdma_channels = self.num_aie_columns * 2 assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" + MLIROperator.__init__(self, context=context) - # Artifacts created by set_up_artifacts() - self.xclbin_artifact = None - self.insts_artifact = None - - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"add_{self.num_aie_columns}col_{self.size}_{self.tile_size}t" - def set_up_artifacts(self): - # Compilation artifacts + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"add_{self.num_aie_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_eltwise_add", callback_args=[ self.context.device_manager.device_type, self.size, self.num_aie_columns, - self.num_channels, self.tile_size, 
0, ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"add.o", - depends=[ - SourceArtifact.new( - self.context.base_dir / "aie_kernels" / "generic" / "add.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def set_up_runtime(self): + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"add.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "generic" / "add.cc" + ) + ], + ), + ] + + def get_arg_spec(self): # Runtime setup - self.add_buffer("input1", self.size) - self.add_buffer("input2", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "eltwise_add", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("eltwise_add", "input1", "input2", "output") - - def forward(self, x, y): - """Forward pass for element-wise addition""" - applicable = ( - len(x.shape) >= 1 - and len(y.shape) >= 1 - and x.shape[-1] <= self.size - and y.shape[-1] <= self.size - and x.numel() <= self.size - and y.numel() <= self.size - and x.numel() == y.numel() - and x.shape == y.shape - ) - if not applicable: - raise AIEOperatorConstraintError( - "AIEElementwiseAdd: incompatible tensor shape(s)" - ) - - # Always flatten to [batch, orig_size] - original_shape = x.shape - batch = x.shape[0] if x.dim() > 1 else 1 - x_flat = x.reshape(batch, -1) - y_flat = y.reshape(batch, -1) - - pad_len = self.size - x_flat.shape[1] - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - y_flat = torch.nn.functional.pad(y_flat, (0, pad_len)) - - out = self._execute_aie_operation(x_flat, y_flat) - - # Remove padding if added - numel = np.prod(original_shape) - if pad_len > 0: - out = out.reshape(-1)[..., :numel] - # Restore original shape - out = out.reshape(*original_shape) - - return out - - def _execute_aie_operation(self, x, y): - """Execute element-wise addition operation on AIE hardware""" - # x, y are [batch, size] - batch = x.shape[0] if x.dim() > 1 else 1 - - # Flatten inputs for AIE processing - x_flat = x.view(-1) - y_flat = y.view(-1) - - # Verify size matches expected - if len(x_flat) != self.size or len(y_flat) != self.size: - raise AIEOperatorConstraintError( - f"Input size x={len(x_flat)}, y={len(y_flat)} doesn't match configured size {self.size}" - ) - - self.write_buffer("input1", x_flat) - self.write_buffer("input2", y_flat) - test_pattern = np.zeros(len(x_flat), dtype=bfloat16) - self.write_buffer("output", test_pattern) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=x_flat.shape, dtype=bfloat16) - - return result + return [ + AIERuntimeArgSpec("in", (self.size,)), # input1 + AIERuntimeArgSpec("in", (self.size,)), # input2 + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/elementwise_add/test.py b/iron/operators/elementwise_add/test.py index 781265f5..87cb5c1f 100755 --- a/iron/operators/elementwise_add/test.py +++ b/iron/operators/elementwise_add/test.py @@ -12,36 +12,32 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 num_channels = 2 - input_lengths = [2048] if not extensive else [1024, 4096, 8192] + # Combine all 
lengths + input_lengths = [1024, 2048, 4096, 8192] params = [] - names = [] for input_length in input_lengths: for num_aie_columns in range(1, max_aie_columns + 1): tile_size = input_length // num_aie_columns if tile_size * num_aie_columns != input_length: continue - names.append( - f"eltwise_add_{num_aie_columns}_cols_{num_channels}_channels_{input_length}_tile_{tile_size}" - ) - params.append((input_length, num_aie_columns, num_channels, tile_size)) - return params, names - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + is_regular = input_length == 2048 + marks = [] if is_regular else [pytest.mark.extensive] + + params.append( + pytest.param( + input_length, + num_aie_columns, + num_channels, + tile_size, + marks=marks, + ) + ) + return params @pytest.mark.metrics( @@ -50,7 +46,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size", - all_params, + get_params(), ) def test_elementwise_add( input_length, num_aie_columns, num_channels, tile_size, aie_context @@ -60,7 +56,6 @@ def test_elementwise_add( operator = AIEElementwiseAdd( size=input_length, num_aie_columns=num_aie_columns, - num_channels=num_channels, tile_size=tile_size, context=aie_context, ) diff --git a/iron/operators/elementwise_mul/design.py b/iron/operators/elementwise_mul/design.py index 88ae1e31..51319004 100644 --- a/iron/operators/elementwise_mul/design.py +++ b/iron/operators/elementwise_mul/design.py @@ -12,9 +12,18 @@ from aie.iron.device import NPU1, NPU2 from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ - - -def my_eltwise_mul(dev, num_elements, num_columns, num_channels, tile_size, trace_size): +from aie.helpers.util import np_ndarray_type_get_shape + + +def my_eltwise_mul( + dev, + num_elements, + num_columns, + tile_size, + trace_size, + kernel_archive, + func_prefix="", +): per_tile_elements = 4096 if tile_size > 4096 else tile_size n = per_tile_elements * num_columns if num_elements % n != 0: @@ -36,7 +45,9 @@ def my_eltwise_mul(dev, num_elements, num_columns, num_channels, tile_size, trac # AIE Core Function declaration eltwise_mul_bf16_vector = Kernel( - "eltwise_mul_bf16_vector", "mul.o", [tile_ty, tile_ty, tile_ty, np.int32] + f"{func_prefix}eltwise_mul_bf16_vector", + kernel_archive, + [tile_ty, tile_ty, tile_ty, np.int32], ) # Define a task that will run on a compute tile @@ -146,11 +157,6 @@ def str_to_device(device: str): p.add_argument( "-co", "--columns", required=True, dest="cols", help="Number of columns" ) - # Number of channels is required to define the number of channels to be used - # It must be 1 or 2 - p.add_argument( - "-ch", "--channels", required=True, dest="chans", help="Number of channels" - ) # Tile size (elements per tile) - defaults to 1024 for backward compatibility p.add_argument( "-ts", @@ -183,9 +189,6 @@ def str_to_device(device: str): elif isinstance(dev, NPU2) and columns > 8: raise ValueError("[ERROR] NPU2 device cannot allocate more than 8 columns") - channels = int(opts.chans) - if channels < 1 or channels > 2: - raise ValueError("Number of channels 
must be 1 or 2") tile_size = int(opts.tile_size) if length % (tile_size * columns) != 0: print( @@ -198,7 +201,7 @@ def str_to_device(device: str): raise ValueError trace_size = int(opts.trace_size) if opts.trace_size is not None else 0 - module = my_eltwise_mul(dev, length, columns, channels, tile_size, trace_size) + module = my_eltwise_mul(dev, length, columns, tile_size, trace_size, "mul.o") output_file_path = Path(opts.output_file_path) diff --git a/iron/operators/elementwise_mul/op.py b/iron/operators/elementwise_mul/op.py index 60113341..2304ca99 100644 --- a/iron/operators/elementwise_mul/op.py +++ b/iron/operators/elementwise_mul/op.py @@ -1,164 +1,75 @@ # SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import torch -import numpy as np -from ml_dtypes import bfloat16 from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, - KernelArchiveArtifact, SourceArtifact, PythonGeneratedMLIRArtifact, ) -class AIEElementwiseMul(AIEOperatorBase): +class AIEElementwiseMul(MLIROperator): """AIE-accelerated element-wise multiplication""" def __init__( - self, size, num_aie_columns, num_channels, tile_size, trace_size=0, context=None + self, + size, + tile_size, + num_aie_columns=8, + context=None, ): - max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + assert ( + size % (num_aie_columns * tile_size) == 0 + ), "size must be multiple of num_aie_columns * tile_size" + self.size = size self.tile_size = tile_size self.num_aie_columns = num_aie_columns - self.num_channels = num_channels - self.trace_size = trace_size - - total_shimdma_channels = self.num_aie_columns * self.num_channels + # Enforce ShimDMA limits for elementwise_mul (uses 2 inputs per core) + # Maximum safe configuration: 8 columns × 2 channels = 16 ShimDMA channels + total_shimdma_channels = self.num_aie_columns * 2 assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" + MLIROperator.__init__(self, context=context) - self.xclbin_artifact = None - self.insts_artifact = None - - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"mul_{self.num_aie_columns}col_{self.size}_{self.tile_size}t" - def get_artifacts(self, prefix="eltwise_mul_"): + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"{prefix}{self.num_aie_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_eltwise_mul", callback_args=[ self.context.device_manager.device_type, self.size, self.num_aie_columns, - self.num_channels, self.tile_size, - self.trace_size, + 0, ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"mul.o", - depends=[ - SourceArtifact.new( - self.context.base_dir / "aie_kernels" / "generic" / "mul.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - return xclbin_artifact, insts_artifact - - def set_up_artifacts(self): - xclbin_artifact, insts_artifact = 
self.get_artifacts() - - mlir_artifact = xclbin_artifact.depends[0] - mlir_artifact.callback_args[0] = self.context.device_manager.device_type - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def set_up_runtime(self): - self.add_buffer("input1", self.size) - self.add_buffer("input2", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "eltwise_mul", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("eltwise_mul", "input1", "input2", "output") - - def forward(self, x, y): - """Forward pass for element-wise multiplication""" - applicable = ( - len(x.shape) >= 1 - and len(y.shape) >= 1 - and x.shape[-1] <= self.size - and y.shape[-1] <= self.size - and x.numel() <= self.size - and y.numel() <= self.size - and x.numel() == y.numel() - and x.shape == y.shape - ) - - # Always flatten to [batch, orig_size] - original_shape = x.shape - batch = x.shape[0] if x.dim() > 1 else 1 - x_flat = x.reshape(batch, -1) - y_flat = y.reshape(batch, -1) - - pad_len = self.size - x_flat.shape[1] - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - y_flat = torch.nn.functional.pad(y_flat, (0, pad_len)) - - out = self._execute_aie_operation(x_flat, y_flat) - - # Remove padding if added - numel = np.prod(original_shape) - if pad_len > 0: - out = out.reshape(-1)[..., :numel] - # Restore original shape - out = out.reshape(*original_shape) - - return out - - def _execute_aie_operation(self, x, y): - """Execute element-wise multiplication operation on AIE hardware""" - # x, y are [batch, size] - batch = x.shape[0] if x.dim() > 1 else 1 - - # Flatten inputs for AIE processing - x_flat = x.view(-1) - y_flat = y.view(-1) - - # Verify size matches expected - if len(x_flat) != self.size or len(y_flat) != self.size: - raise AIEOperatorConstraintError( - f"Input size x={len(x_flat)}, y={len(y_flat)} doesn't match configured size {self.size}" - ) - - self.write_buffer("input1", x_flat) - self.write_buffer("input2", y_flat) - test_pattern = np.zeros(len(x_flat), dtype=bfloat16) - self.write_buffer("output", test_pattern) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=x_flat.shape, dtype=bfloat16) - - return result + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"mul.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "generic" / "mul.cc" + ) + ], + ), + ] + + def get_arg_spec(self): + # Runtime setup + return [ + AIERuntimeArgSpec("in", (self.size,)), # input1 + AIERuntimeArgSpec("in", (self.size,)), # input2 + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/elementwise_mul/test.py b/iron/operators/elementwise_mul/test.py index 2c92d288..163ff0e4 100755 --- a/iron/operators/elementwise_mul/test.py +++ b/iron/operators/elementwise_mul/test.py @@ -12,13 +12,12 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 num_channels = 2 - input_lengths = [2048] if not extensive else [1024, 4096, 8192] + input_lengths = [1024, 2048, 4096, 8192] params = [] - names = [] for input_length in input_lengths: for num_aie_columns in range(1, max_aie_columns + 1): tile_size = input_length // num_aie_columns @@ -26,24 +25,20 @@ def generate_test_params(extensive=False): tile_size = 4096 if tile_size * num_aie_columns != input_length: continue - 
names.append( - f"eltwise_mul_{num_aie_columns}_cols_{num_channels}_channels_{input_length}_tile_{tile_size}" - ) - params.append((input_length, num_aie_columns, num_channels, tile_size)) - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + is_regular = input_length == 2048 + marks = [] if is_regular else [pytest.mark.extensive] + + params.append( + pytest.param( + input_length, + num_aie_columns, + num_channels, + tile_size, + marks=marks, + ) + ) + return params @pytest.mark.metrics( @@ -52,7 +47,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size", - all_params, + get_params(), ) def test_elementwise_mul( input_length, num_aie_columns, num_channels, tile_size, aie_context @@ -61,9 +56,8 @@ def test_elementwise_mul( operator = AIEElementwiseMul( size=input_length, - num_aie_columns=num_aie_columns, - num_channels=num_channels, tile_size=tile_size, + num_aie_columns=num_aie_columns, context=aie_context, ) diff --git a/iron/operators/gelu/design.py b/iron/operators/gelu/design.py index 7a110286..3ecd85a5 100644 --- a/iron/operators/gelu/design.py +++ b/iron/operators/gelu/design.py @@ -15,7 +15,9 @@ from aie.iron.controlflow import range_ -def my_gelu(dev, size, num_columns, num_channels, tile_size, trace_size): +def my_gelu( + dev, size, num_columns, num_channels, tile_size, trace_size, kernel_archive=None +): xfr_dtype = bfloat16 line_size = 8192 if tile_size > 8192 else tile_size fifodepth = 1 if line_size > 4096 else 2 diff --git a/iron/operators/gelu/op.py b/iron/operators/gelu/op.py index 86fea435..8f8f8157 100644 --- a/iron/operators/gelu/op.py +++ b/iron/operators/gelu/op.py @@ -7,8 +7,8 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -17,14 +17,17 @@ ) -class AIEGELU(AIEOperatorBase): +class AIEGELU(MLIROperator): """AIE-accelerated GELU activation function""" def __init__(self, size, num_aie_columns, num_channels, tile_size, context=None): max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + assert ( + size % max_multiple == 0 + ), "size must be multiple of num_aie_columns * tile_size" + assert size % tile_size == 0, "size must be multiple of tile_size" + + self.size = size self.tile_size = tile_size self.num_aie_columns = num_aie_columns self.num_channels = num_channels @@ -32,17 +35,15 @@ def __init__(self, size, num_aie_columns, num_channels, tile_size, context=None) total_shimdma_channels = self.num_aie_columns * self.num_channels assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"gelu_{self.num_aie_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - def set_up_artifacts(self): + def 
get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"gelu_{self.num_aie_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_gelu", callback_args=[ @@ -55,65 +56,20 @@ def set_up_artifacts(self): ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"gelu.o", - depends=[ - SourceArtifact.new( - self.context.base_dir / "aie_kernels" / "aie2p" / "gelu.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def set_up_runtime(self): - self.add_buffer("input", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "gelu", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("gelu", "input", "output") - - def forward(self, x): - """Forward pass for GELU activation""" - if x.numel() > self.size: - raise AIEOperatorConstraintError( - "AIEGELU: input too large for configured size" - ) - - original_shape = x.shape - x_flat = x.reshape(-1) - - # Pad if necessary - pad_len = self.size - x_flat.numel() - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - - # Execute on AIE - self.write_buffer("input", x_flat) - self.write_buffer("output", np.zeros(self.size, dtype=bfloat16)) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=(self.size,), dtype=bfloat16) - - # Remove padding and restore shape - if pad_len > 0: - result = result[: x_flat.numel() - pad_len] - - return result.reshape(*original_shape) + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"gelu.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "aie2p" / "gelu.cc" + ) + ], + ), + ] + + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.size,)), # input + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/gelu/test.py b/iron/operators/gelu/test.py index d91a9e7a..f74a2e73 100755 --- a/iron/operators/gelu/test.py +++ b/iron/operators/gelu/test.py @@ -12,13 +12,12 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 num_channels_choices = [1, 2] - input_lengths = [2048] if not extensive else [1024, 4096, 8192] + input_lengths = [1024, 2048, 4096, 8192] params = [] - names = [] for input_length in input_lengths: for num_aie_columns in range(1, max_aie_columns + 1): for num_channels in num_channels_choices: @@ -28,26 +27,19 @@ def generate_test_params(extensive=False): tile_size = 8192 check_length = tile_size * total_cores if check_length == input_length: - names.append( - f"gelu_{num_aie_columns}_cols_{num_channels}_channels_{input_length}_tile_{tile_size}" - ) + is_regular = input_length == 2048 + marks = [] if is_regular else [pytest.mark.extensive] + params.append( - (input_length, num_aie_columns, num_channels, tile_size) + pytest.param( + input_length, + num_aie_columns, + num_channels, + tile_size, + marks=marks, + ) ) - return params, names - - -regular_params, regular_names = 
generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -56,7 +48,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size", - all_params, + get_params(), ) def test_gelu(input_length, num_aie_columns, num_channels, tile_size, aie_context): golden_ref = generate_golden_reference(input_length=input_length) diff --git a/iron/operators/gemm/design.py b/iron/operators/gemm/design.py index 6ea439d5..e5b4d748 100644 --- a/iron/operators/gemm/design.py +++ b/iron/operators/gemm/design.py @@ -106,6 +106,7 @@ def main(): args.separate_c_tiles, args.trace_size, args.archive, + "", args.generate_taps, ) @@ -140,7 +141,8 @@ def my_matmul( prio_accuracy, separate_c_tiles, trace_size, - archive=None, + kernel_archive=None, + func_prefix="", generate_taps=False, ): n_aie_rows = 4 @@ -273,7 +275,11 @@ def my_matmul( # AIE Core Function declarations scalar_suffix = "_scalar" if use_scalar else "" - archive_name = f"gemm_{m}x{k}x{n}_archive.a" if archive is None else archive + kernel_archive = ( + f"{func_prefix}gemm_{m}x{k}x{n}_archive.a" + if kernel_archive is None + else kernel_archive + ) if use_larger_internal_buffer: # Fix fifo depth for C objfifo to 1 since 1 buffer will be used for accumulation # and another for transfer to L2 @@ -283,19 +289,19 @@ def my_matmul( # A kernel to convert from the internal f32 accumulation to bf16 for transfer to L2 is needed convert_copy_kernel = Kernel( f"convert_copy_f32_to_bf16", - archive_name, + kernel_archive, [C_l1_ty_internal, C_l1_ty, np.int32], ) # Fix the kernels to use f32 outputs zero_kernel = Kernel( f"zero{scalar_suffix}_f32", - archive_name, + kernel_archive, [C_l1_ty_internal], ) matmul_func_name = f"matmul{scalar_suffix}_{dtype_in_str}_f32" matmul_kernel = Kernel( matmul_func_name, - archive_name, + kernel_archive, [A_l1_ty, B_l1_ty, C_l1_ty_internal], ) else: @@ -304,13 +310,13 @@ def my_matmul( fifo_depth_out = fifo_depth zero_kernel = Kernel( f"zero{scalar_suffix}_{dtype_out_str}", - archive_name, + kernel_archive, [C_l1_ty], ) matmul_func_name = f"matmul{scalar_suffix}_{dtype_in_str}_{dtype_out_str}" matmul_kernel = Kernel( matmul_func_name, - archive_name, + kernel_archive, [A_l1_ty, B_l1_ty, C_l1_ty], ) diff --git a/iron/operators/gemm/op.py b/iron/operators/gemm/op.py index 007e46b3..0c087ad8 100644 --- a/iron/operators/gemm/op.py +++ b/iron/operators/gemm/op.py @@ -8,20 +8,17 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, - KernelArchiveArtifact, SourceArtifact, PythonGeneratedMLIRArtifact, ) -from iron.common.utils import torch_to_numpy, numpy_to_torch - -class AIEGEMM(AIEOperatorBase): +class AIEGEMM(MLIROperator): """AIE-accelerated General Matrix Multiplication (GEMM) layer""" def __init__( @@ -36,64 +33,50 @@ def __init__( # TODO: Add support for partitioning M and/or K # partition_M=1, # partition_K=1, - partition_N=1, num_aie_columns=8, context=None, **gemm_kwargs, ): - + num_aie_rows = 4 + min_M = tile_m * 
num_aie_rows + min_K = tile_k + min_N = tile_n * num_aie_columns + assert M % min_M == 0, f"M ({M}) must be multiple of {min_M}" + assert K % min_K == 0, f"K ({K}) must be multiple of {min_K}" + assert N % min_N == 0, f"N ({N}) must be multiple of {min_N}" + self.M = M + self.K = K + self.N = N self.tile_m = tile_m self.tile_k = tile_k self.tile_n = tile_n + self.num_aie_columns = num_aie_columns self.gemm_args = gemm_kwargs - - # Set frequently accessed gemm_args self.b_col_maj = gemm_kwargs.get("b_col_maj", False) self.c_col_maj = gemm_kwargs.get("c_col_maj", False) - self.weight = ( - None - if not use_static_weight - else torch.zeros((K, N), dtype=torch.bfloat16).T - ) - self.static_weight_shape = (K, N) - - # The operator's M, K, N represent what the NPU operator supports. - # Calls to forward() may supply matrices of different sizes, and the - # Python code will perform necessary padding/repeated application of - # the NPU operator. - assert ( - N % partition_N == 0 - ), f"N ({N}) must be divisible by partition_N ({partition_N})" - M_padded, K_padded, N_padded = self._get_padded_dims( - M, K, N // partition_N, tile_m, tile_k, tile_n + + emulate_bf16_mmul_with_bfp16 = self.gemm_args.get( + "emulate_bf16_mmul_with_bfp16", True ) - self.M = M_padded - self.K = K_padded - self.N = N_padded - self.partition_N = partition_N + if emulate_bf16_mmul_with_bfp16: + min_tile_m, min_tile_k, min_tile_n = 8, 8, 8 + else: + min_tile_m, min_tile_k, min_tile_n = 4, 8, 8 + assert tile_m >= min_tile_m, f"tile_m ({tile_m}) must be >= {min_tile_m}" + assert tile_k >= min_tile_k, f"tile_k ({tile_k}) must be >= {min_tile_k}" + assert tile_n >= min_tile_n, f"tile_n ({tile_n}) must be >= {min_tile_n}" - # Artifacts created by set_up_artifacts() - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"gemm_{self.M}x{self.K}x{self.N}_{self.tile_m}x{self.tile_k}x{self.tile_n}_{int(self.b_col_maj)}_{int(self.c_col_maj)}" - def get_artifacts(self, prefix="gemm_"): - # Extract parameters from self + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - tile_m = self.tile_m - tile_k = self.tile_k - tile_n = self.tile_n - M = self.M - K = self.K - N = self.N - num_aie_columns = self.num_aie_columns + operator_name = self.get_operator_name() base_dir = self.context.base_dir device_str = self.context.device_manager.device_str() - - b_col_maj = self.b_col_maj - c_col_maj = self.c_col_maj dtype_in = self.gemm_args.get("dtype_in", "bf16") dtype_out = self.gemm_args.get("dtype_out", "bf16") emulate_bf16_mmul_with_bfp16 = self.gemm_args.get( @@ -102,245 +85,171 @@ def get_artifacts(self, prefix="gemm_"): prio_accuracy = self.gemm_args.get("prio_accuracy", False) use_scalar = self.gemm_args.get("use_scalar", False) round_conv_even = self.gemm_args.get("round_conv_even", True) - - if emulate_bf16_mmul_with_bfp16: - min_tile_m, min_tile_k, min_tile_n = 8, 8, 8 - else: - min_tile_m, min_tile_k, min_tile_n = 4, 8, 8 - assert tile_m >= min_tile_m, f"tile_m ({tile_m}) must be >= {min_tile_m}" - assert tile_k >= min_tile_k, f"tile_k ({tile_k}) must be >= {min_tile_k}" - assert tile_n >= min_tile_n, f"tile_n ({tile_n}) must be >= {min_tile_n}" - - file_name_tile_base = f"{prefix}{tile_m}x{tile_k}x{tile_n}" - file_name_total_base = f"{prefix}{M}x{K}x{N}_{tile_m}x{tile_k}x{tile_n}_{int(b_col_maj)}_{int(c_col_maj)}" - xclbin_kernel_name = f"gemm_{file_name_tile_base}" - kernel_flags 
= [
-            f"-DDIM_M={tile_m}",
-            f"-DDIM_K={tile_k}",
-            f"-DDIM_N={tile_n}",
-            "-DROUND_CONV_EVEN",
-        ]
-        if prio_accuracy:
-            kernel_flags.append("-Dbf16_f32_ONLY")
-        else:
-            kernel_flags.append("-Dbf16_bf16_ONLY")
-        if round_conv_even:
-            kernel_flags.append("-DROUND_CONV_EVEN")
-        if emulate_bf16_mmul_with_bfp16:
-            kernel_flags.append("-DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16")
-        if b_col_maj:
-            kernel_flags.append("-DB_COL_MAJ")
-        if c_col_maj:
-            kernel_flags.append("-DC_COL_MAJ")
-
-        kernel_archive = (
-            f"gemm_{tile_m}x{tile_k}x{tile_n}_{int(b_col_maj)}_{int(c_col_maj)}.a"
-        )
-
-        mlir_artifact = PythonGeneratedMLIRArtifact.new(
-            f"{file_name_total_base}.mlir",
+        separate_c_tiles = self.gemm_args.get("separate_c_tiles", False)
+        return PythonGeneratedMLIRArtifact(
+            f"{operator_name}.mlir",
             import_path=operator_dir / "design.py",
             callback_fn="my_matmul",
             callback_kwargs={
                 "dev": device_str,
-                "M": M,
-                "K": K,
-                "N": N,
-                "m": tile_m,
-                "k": tile_k,
-                "n": tile_n,
-                "n_aie_cols": num_aie_columns,
+                "M": self.M,
+                "K": self.K,
+                "N": self.N,
+                "m": self.tile_m,
+                "k": self.tile_k,
+                "n": self.tile_n,
+                "n_aie_cols": self.num_aie_columns,
                 "dtype_in_str": dtype_in,
                 "dtype_out_str": dtype_out,
-                "b_col_maj": int(b_col_maj),
-                "c_col_maj": int(c_col_maj),
+                "b_col_maj": int(self.b_col_maj),
+                "c_col_maj": int(self.c_col_maj),
                 "use_scalar": use_scalar,
                 "emulate_bf16_mmul_with_bfp16": emulate_bf16_mmul_with_bfp16,
                 "prio_accuracy": prio_accuracy,
-                "separate_c_tiles": int(self.partition_N > 1),
+                "separate_c_tiles": int(separate_c_tiles),
                 "trace_size": 0,
-                "archive": kernel_archive,
                 "generate_taps": False,
             },
             requires_context=False,
         )
-
-        # FIXME: We should be able to reuse the same xclbin for same tile
-        # sizes, only swapping out the instruction sequence for different
-        # problem sizes. However, there seem to be cases where this does
-        # not work and the GEMM appears to be misconfigured for the wrong
-        # size (resulting in a timeout when trying to run it). Perhaps
-        # XRT is caching something, or something is wrong with the run-
-        # time parameter (synchronization)? For now, create separate
-        # xclbins for each problem size.
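# For reference, a sketch (values assumed, not part of the patch) of the
# rounding that the removed _get_padded_dims further below applied:
# with tile_m=64 and 4 AIE rows, min_M is 256, so M=1000 padded up to 1024:
#     min_M = 64 * 4                                    # tile_m * num_aie_rows
#     M_padded = ((1000 + min_M - 1) // min_M) * min_M  # == 1024
# The divisibility asserts added to __init__ above replace this padding, so
# callers must now supply M, K, N already rounded to these multiples.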
-
-        xclbin_artifact = XclbinArtifact.new(
-            f"{file_name_total_base}.xclbin",
-            depends=[
-                mlir_artifact,
-                KernelArchiveArtifact.new(
-                    kernel_archive,
-                    depends=[
-                        KernelObjectArtifact.new(
-                            f"gemm_{tile_m}x{tile_k}x{tile_n}_{int(b_col_maj)}_{int(c_col_maj)}.o",
-                            extra_flags=kernel_flags,
-                            depends=[
-                                SourceArtifact.new(
-                                    base_dir / "aie_kernels" / "aie2p" / "mm.cc"
-                                )
-                            ],
-                        ),
-                        KernelObjectArtifact.new(
-                            "convert_copy.o",
-                            [
-                                SourceArtifact.new(
-                                    base_dir
-                                    / "aie_kernels"
-                                    / "generic"
-                                    / "convert_copy.cc"
-                                )
-                            ],
-                        ),
-                    ],
-                ),
-            ],
-            extra_flags=["--dynamic-objFifos"],
-        )
-
-        insts_artifact = InstsBinArtifact.new(
-            f"{file_name_total_base}.bin",
-            depends=[mlir_artifact],
-            extra_flags=["--dynamic-objFifos"],
-        )
-
-        return (xclbin_artifact, insts_artifact)
-
-    def set_up_artifacts(self):
-        # Describe required artifacts (xclbin, insts.bin)
-        device_str = self.context.device_manager.device_str()
-        xclbin_artifact, insts_artifact = self.get_artifacts()
-
-        self.xclbin_artifact = xclbin_artifact
-        self.insts_artifact = insts_artifact
-
-        self.add_artifacts([xclbin_artifact, insts_artifact])
-
-    def set_up_runtime(self):
-        static_weights = None
-        if self.weight is not None:
-            static_weights = self.weight.T
-        if isinstance(static_weights, torch.Tensor):
-            static_weights = torch_to_numpy(static_weights)
-        self.add_kernel(
-            "gemm",
-            self.xclbin_artifact,
-            self.xclbin_artifact.kernel_name,
-            self.insts_artifact,
-        )
-        self.add_buffer("A", self.M * self.K)
-        B_parts = self._partition_B(static_weights)
-        for i, B_part in enumerate(B_parts):
-            self.add_buffer(
-                f"B_{i}",
-                self.K * self.N,
-                static_data=B_part,
-            )
-            self.add_buffer(f"C_{i}", self.M * self.N)
-            self.add_to_runlist("gemm", "A", f"B_{i}", f"C_{i}")
-
-    def _get_B_dims(self, B_shape):
-        """Extract K and N dimensions from B matrix shape based on layout.
-
-        Returns:
-            tuple: (K, N) dimensions regardless of B's layout
-        """
-        if self.b_col_maj:
-            return B_shape[-1], B_shape[-2]  # B is (N, K) -> return (K, N)
-        else:
-            return B_shape[-2], B_shape[-1]  # B is (K, N) -> return (K, N)
-
-    def forward(self, A, B=None):
-        """Forward pass through GEMM operation: C = A @ B"""
-        B_shape = B.shape if B is not None else self.static_weight_shape
-
-        # Determine output dimensions based on matrix layout
-        K2, N = self._get_B_dims(B_shape)
-        N_part = N // self.partition_N
-
-        # Build expected output shape based on C layout
-        expected_output_shape = (
-            A.shape[:-2] + (N, A.shape[-1]) if self.c_col_maj else A.shape[:-1] + (N,)
-        )
-
-        # Remove batch dimension, if any
-        if len(A.shape) > 2:
-            A = A.view(-1, A.shape[-1])
-        if B is not None and len(B.shape) > 2:
-            B = B.view(-1, B_shape[-1])
-
-        M, K = A.shape
-
-        applicable = (
-            K == K2
-            and (M <= self.M or not self.c_col_maj)
-            and K <= self.K
-            and N <= self.N
-        )
-        if not applicable:
-            raise AIEOperatorConstraintError("AIEGEMM: incompatible tensor shape(s)")
-
-        A_padded = self._pad_A(torch_to_numpy(A))
-        if B is not None:
-            B_parts = self._partition_B(torch_to_numpy(B))
-        else:
-            B_parts = None
-
-        logging.debug(
-            f"Executing GEMM for dimensions M={M}, K={K}, N={N} using NPU operator with M={self.M}, K={self.N}, N={self.N}"
+    def get_kernel_artifacts(self):
+        base_dir = self.context.base_dir
+        emulate_bf16_mmul_with_bfp16 = self.gemm_args.get(
+            "emulate_bf16_mmul_with_bfp16", True
         )
-
-        if self.c_col_maj:
-            result_padded = np.zeros((N, M), dtype=A_padded.dtype)
+        prio_accuracy = self.gemm_args.get("prio_accuracy", False)
+        round_conv_even = self.gemm_args.get("round_conv_even", True)
+        kernel_flags = [
+            f"-DDIM_M={self.tile_m}",
+            f"-DDIM_K={self.tile_k}",
+            f"-DDIM_N={self.tile_n}",
+            # -DROUND_CONV_EVEN is appended below, gated on round_conv_even
+        ]
+        if prio_accuracy:
+            kernel_flags.append("-Dbf16_f32_ONLY")
         else:
-            result_padded = np.zeros((M, N), dtype=A_padded.dtype)
-        for M_lo in range(0, M, self.M):
-            A_part = A_padded[M_lo : M_lo + self.M, :]
-            result_parts = self._execute_aie_operation(A_part, B_parts)
-            max_M = min(M_lo + self.M, M)
-            for part in range(self.partition_N):
-                if self.c_col_maj:
-                    result_padded[part * N_part : (part + 1) * N_part, M_lo:max_M] = (
-                        result_parts[part][:N_part, :max_M]
-                    )
-                else:
-                    result_padded[M_lo:max_M, part * N_part : (part + 1) * N_part] = (
-                        result_parts[part][:max_M, :N_part]
-                    )
-
-        # GEMM produces 2D result, reshape to expected output shape
+            kernel_flags.append("-Dbf16_bf16_ONLY")
+        if round_conv_even:
+            kernel_flags.append("-DROUND_CONV_EVEN")
+        if emulate_bf16_mmul_with_bfp16:
+            kernel_flags.append("-DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16")
+        if self.b_col_maj:
+            kernel_flags.append("-DB_COL_MAJ")
         if self.c_col_maj:
-            result = numpy_to_torch(result_padded[:N, :M])
-        else:
-            result = numpy_to_torch(result_padded[:M, :N])
-        result = result.view(expected_output_shape)
-
-        return result
-
-    def _get_padded_dims(self, M, K, N, tile_m, tile_k, tile_n):
-        num_aie_columns = self.num_aie_columns
-        num_aie_rows = 4
-
-        min_M = tile_m * num_aie_rows
-        min_K = tile_k
-        min_N = tile_n * num_aie_columns
+            kernel_flags.append("-DC_COL_MAJ")
 
-        # Calculate padded dimensions
-        M_padded = ((M + min_M - 1) // min_M) * min_M
-        K_padded = ((K + min_K - 1) // min_K) * min_K
-        N_padded = ((N + min_N - 1) // min_N) * min_N
+        # Include flags in the filename to avoid stale builds when flags change
+        flags_suffix = f"_{int(prio_accuracy)}_{int(emulate_bf16_mmul_with_bfp16)}_{int(round_conv_even)}"
+
+        return [
KernelObjectArtifact( + f"gemm_{self.tile_m}x{self.tile_k}x{self.tile_n}_{int(self.b_col_maj)}_{int(self.c_col_maj)}{flags_suffix}.o", + extra_flags=kernel_flags, + dependencies=[ + SourceArtifact(base_dir / "aie_kernels" / "aie2p" / "mm.cc") + ], + ), + KernelObjectArtifact( + "convert_copy.o", + [ + SourceArtifact( + base_dir / "aie_kernels" / "generic" / "convert_copy.cc" + ) + ], + ), + ] - return M_padded, K_padded, N_padded + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.M, self.K)), # input A + AIERuntimeArgSpec( + "in", (self.K, self.N) if not self.b_col_maj else (self.N, self.K) + ), # input B (weights) + AIERuntimeArgSpec( + "out", (self.M, self.N) if not self.c_col_maj else (self.N, self.M) + ), # output C + ] - def _pad_A(self, A_np): + # def _get_B_dims(self, B_shape): + # """Extract K and N dimensions from B matrix shape based on layout. + + # Returns: + # tuple: (K, N) dimensions regardless of B's layout + # """ + # if self.b_col_maj: + # return B_shape[-1], B_shape[-2] # B is (N, K) -> return (K, N) + # else: + # return B_shape[-2], B_shape[-1] # B is (K, N) -> return (K, N) + + # def forward(self, A, B=None): + # """Forward pass through GEMM operation: C = A @ B""" + # B_shape = B.shape if B is not None else self.static_weight_shape + + # # Determine output dimensions based on matrix layout + # K2, N = self._get_B_dims(B_shape) + # N_part = N // self.partition_N + + # # Build expected output shape based on C layout + # expected_output_shape = ( + # A.shape[:-2] + (N, A.shape[-1]) if self.c_col_maj else A.shape[:-1] + (N,) + # ) + + # # Remove batch dimension, if any + # if len(A.shape) > 2: + # A = A.view(-1, A.shape[-1]) + # if B is not None and len(B.shape) > 2: + # B = B.view(-1, B_shape[-1]) + + # M, K = A.shape + + # applicable = ( + # K == K2 + # and (M <= self.M or not self.c_col_maj) + # and K <= self.K + # and N <= self.N + # ) + # if not applicable: + # raise AIEOperatorConstraintError("AIEGEMM: incompatible tensor shape(s)") + + # A_padded = self._pad_A(torch_to_numpy(A)) + # if B is not None: + # B_parts = self._partition_B(torch_to_numpy(B)) + # else: + # B_parts = None + + # logging.debug( + # f"Executing GEMM for dimensions M={M}, K={K}, N={N} using NPU operator with M={self.M}, K={self.N}, N={self.N}" + # ) + + # if self.c_col_maj: + # result_padded = np.zeros((N, M), dtype=A_padded.dtype) + # else: + # result_padded = np.zeros((M, N), dtype=A_padded.dtype) + # for M_lo in range(0, M, self.M): + # A_part = A_padded[M_lo : M_lo + self.M, :] + # result_parts = self._execute_aie_operation(A_part, B_parts) + # max_M = min(M_lo + self.M, M) + # for part in range(self.partition_N): + # if self.c_col_maj: + # result_padded[part * N_part : (part + 1) * N_part, M_lo:max_M] = ( + # result_parts[part][:N_part, :max_M] + # ) + # else: + # result_padded[M_lo:max_M, part * N_part : (part + 1) * N_part] = ( + # result_parts[part][:max_M, :N_part] + # ) + + # # GEMM produces 2D result, reshape to expected output shape + # if self.c_col_maj: + # result = numpy_to_torch(result_padded[:N, :M]) + # else: + # result = numpy_to_torch(result_padded[:M, :N]) + # result = result.view(expected_output_shape) + + # return result + + def pad_A(self, A_np): """Pad A matrix to match operator dimensions (M, K)""" M, K = A_np.shape if M % self.M == 0 and K == self.K: @@ -351,7 +260,7 @@ def _pad_A(self, A_np): A_padded[:M, :K] = A_np return A_padded - def _pad_B(self, B_np): + def pad_B(self, B_np): """Pad B matrix to match operator dimensions based on layout""" if 
self.b_col_maj: N, K = B_np.shape @@ -367,56 +276,16 @@ def _pad_B(self, B_np): B_padded[:K, :N] = B_np return B_padded - def _partition_B(self, B): - B_parts = [None] * self.partition_N + def partition_B(self, B, partition_N): + B_parts = [None] * partition_N if B is None: return B_parts - for i in range(self.partition_N): + for i in range(partition_N): col_start = i * self.N col_end = (i + 1) * self.N - # Just in case, pad the weights before adding the buffer if self.b_col_maj: - B_parts[i] = self._pad_B(B[col_start:col_end, :]) + B_parts[i] = self.pad_B(B[col_start:col_end, :]) else: - B_parts[i] = self._pad_B(B[:, col_start:col_end]) - self.static_weight_shape = B_parts[0].shape + B_parts[i] = self.pad_B(B[:, col_start:col_end]) return B_parts - - def _execute_aie_operation(self, A_np, B_nps=None): - """Execute GEMM operation on AIE hardware""" - M, K = A_np.shape - B_shape = B_nps[0].shape if B_nps is not None else self.static_weight_shape - K2, N = self._get_B_dims(B_shape) - C_shape = (N, M) if self.c_col_maj else (M, N) - - # Validate dimensions match operator configuration - assert M == self.M - assert K == K2 and K == self.K - assert N == self.N - - self.write_buffer("A", A_np) - if B_nps is not None: - for i, B_np in enumerate(B_nps): - self.add_buffer( - f"B_{i}", - self.M * self.N, - static_data=B_np, - ) - self.run_runlist() - result_nps = [ - self.read_buffer(f"C_{i}", shape=C_shape, dtype=bfloat16) - for i in range(self.partition_N) - ] - - # Check for NaN and fail hard - # for result_np in result_nps: - # if np.isnan(result_np).any(): - # nan_count = np.isnan(result_np).sum() - # total_count = result_np.size - # raise RuntimeError( - # f"AIE execution returned {nan_count}/{total_count} NaN values. " - # ) - - # Convert back to torch tensor - return result_nps diff --git a/iron/operators/gemm/test.py b/iron/operators/gemm/test.py index 6480aeff..b9da6f10 100755 --- a/iron/operators/gemm/test.py +++ b/iron/operators/gemm/test.py @@ -12,10 +12,10 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): # fmt: off - params = [ - # M, K, N, num_aie_columns, b_col_maj, c_col_maj, m, k, n, trace_size, partition_N + # M, K, N, num_aie_columns, b_col_maj, c_col_maj, m, k, n, trace_size, partition_N + regular_params = [ (2048, 2048, 2048, 1, False, False, 64, 64, 64, 0, 1), (2048, 2048, 2048, 2, True, False, 64, 64, 64, 0, 1), (2048, 2048, 2048, 8, True, True, 64, 64, 64, 0, 1), @@ -44,48 +44,18 @@ def generate_test_params(extensive=False): ] # fmt: on - if extensive: - params = extensive_params - - names = [] - for ( - M, - K, - N, - num_aie_columns, - b_col_maj, - c_col_maj, - m, - k, - n, - trace_size, - partition_N, - ) in params: - name = f"gemm_{M}x{K}x{N}_{m}x{k}x{n}_{num_aie_columns}cols" - if b_col_maj: - name += "_bcolmaj" - if c_col_maj: - name += "_ccolmaj" - if partition_N > 1: - name += f"_{partition_N}npart" - if trace_size > 0: - name += f"_{trace_size}trace" - names.append(name) - - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + params = [] + + # Helper to generate name and append 
param + def add_params(param_list, is_extensive): + for p in param_list: + marks = [pytest.mark.extensive] if is_extensive else [] + params.append(pytest.param(*p, marks=marks)) + + add_params(regular_params, is_extensive=False) + add_params(extensive_params, is_extensive=True) + + return params @pytest.mark.metrics( @@ -95,7 +65,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "M,K,N,num_aie_columns,b_col_maj,c_col_maj,m,k,n,trace_size,partition_N", - all_params, + get_params(), ) def test_gemm( M, diff --git a/iron/operators/gemv/design.py b/iron/operators/gemv/design.py index 0a153364..6d48aa6d 100644 --- a/iron/operators/gemv/design.py +++ b/iron/operators/gemv/design.py @@ -29,10 +29,21 @@ - K: number of columns in the matrix == number of rows in the vector - m_input: number of input rows stored on each AIE core == chunk size for data movement of input A - m_output: number of output rows stored on each AIE core == chunk size for data movement of output C + - num_batches: number of iterations of this mat-vec to perform on contiguous matrices and vectors in memory (results concatenated) """ -def my_matvec(dev, cols, M, K, m_input, m_output=None): +def my_matvec( + dev, + cols, + M, + K, + m_input, + m_output=None, + num_batches=1, + kernel_archive="mv.o", + func_prefix="", +): if m_output is None: m_output = m_input @@ -68,20 +79,17 @@ def my_matvec(dev, cols, M, K, m_input, m_output=None): L1_B_ty = np.ndarray[(K,), dtype_in] L1_C_ty = np.ndarray[(m_output,), dtype_out] L3_A_ty = np.ndarray[ - ( - M, - K, - ), + (num_batches * M * K,), dtype_in, ] - L3_B_ty = np.ndarray[(K,), dtype_in] - L3_C_ty = np.ndarray[(M,), dtype_out] + L3_B_ty = np.ndarray[(num_batches * K,), dtype_in] + L3_C_ty = np.ndarray[(num_batches * M,), dtype_out] func_type = "vectorized" if vectorized else "scalar" matvec = Kernel( - f"matvec_{func_type}_{dtype_in_str}_{dtype_out_str}", - "mv.o", - [np.int32, np.int32, np.int32, L1_A_ty, L1_B_ty, L1_C_ty], + f"{func_prefix}matvec_{func_type}_{dtype_in_str}_{dtype_out_str}", + kernel_archive, + [np.int32, np.int32, L1_A_ty, L1_B_ty, L1_C_ty], ) A_L3L1_fifos = [ @@ -96,7 +104,7 @@ def my_matvec(dev, cols, M, K, m_input, m_output=None): def core_body(A_L3L1_fifo, B_L3L1_fifo, C_L1L3_fifo, matvec): one_idx = index.constant(1) - for _ in range_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): # batch dim handled as part of this loop b = B_L3L1_fifo.acquire(1) # The kernel function computes m output rows; each core is responsible for (M/cols) output rows, so we need to call the kernel (M/cols)/m times. for i_idx in range_(M // m_output // cols): @@ -106,7 +114,7 @@ def core_body(A_L3L1_fifo, B_L3L1_fifo, C_L1L3_fifo, matvec): j_i32 = index.casts(T.i32(), j_idx) output_row_offset = j_i32 * m_input a = A_L3L1_fifo.acquire(1) - matvec(m_input, K, output_row_offset, a, b, c) + matvec(m_input, output_row_offset, a, b, c) A_L3L1_fifo.release(1) C_L1L3_fifo.release(1) B_L3L1_fifo.release(1) @@ -128,66 +136,63 @@ def core_body(A_L3L1_fifo, B_L3L1_fifo, C_L1L3_fifo, matvec): # The input matrix in DDR is MxK-sized (row-major); each core processes (M/cols)xK-sized matrices in chunks of mxK-sized tiles. # The chunking into mxK-sized tiles happens in the ObjectFIFO; the shim puts all data on the stream in sequence. 
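# A worked example, with assumed values (not part of the patch), of the
# offsets the batched A access patterns below produce: each (col, batch)
# pair starts at col * (M // cols) * K + batch * M * K elements into the
# flat buffer. For M=8, K=4, cols=2, num_batches=2:
#     (col=0, batch=0) -> 0     (col=0, batch=1) -> 32
#     (col=1, batch=0) -> 16    (col=1, batch=1) -> 48
# and each pattern then streams (M // cols) * K = 16 contiguous elements.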
    A_taps = [
-        TensorAccessPattern(
-            tensor_dims=(M, K),
-            offset=col * (M // cols) * K,
-            sizes=[1, 1, 1, (M // cols) * K],
-            strides=[0, 0, 0, 1],
-        )
+        [
+            TensorAccessPattern(
+                tensor_dims=L3_A_ty.__args__[0],
+                offset=col * (M // cols) * K + batch * M * K,
+                sizes=[1, 1, 1, (M // cols) * K],
+                strides=[0, 0, 0, 1],
+            )
+            for batch in range(num_batches)
+        ]
         for col in range(cols)
     ]
     # Every column gets the entirety of the vector B, no TAP needed.
     # This design assumes that all of B fits on the cores.
+    B_tap = TensorAccessPattern(
+        tensor_dims=L3_B_ty.__args__[0],
+        offset=0,
+        sizes=[1, 1, 1, num_batches * K],
+        strides=[0, 0, 0, 1],
+    )
 
     # Collection pattern for the output vector C: each AIE core writes back its contiguous chunk of rows.
     C_taps = [
-        TensorAccessPattern(
-            tensor_dims=(1, M),
-            offset=col * (M // cols),
-            sizes=[1, 1, 1, (M // cols)],
-            strides=[0, 0, 0, 1],
-        )
+        [
+            TensorAccessPattern(
+                tensor_dims=L3_C_ty.__args__[0],
+                offset=col * (M // cols) + batch * M,
+                sizes=[1, 1, 1, (M // cols)],
+                strides=[0, 0, 0, 1],
+            )
+            for batch in range(num_batches)
+        ]
         for col in range(cols)
     ]
 
     rt = Runtime()
     with rt.sequence(L3_A_ty, L3_B_ty, L3_C_ty) as (A, B, C):
         rt.start(*workers)
-        tg = rt.task_group()
-        for i in range(cols):
-            rt.fill(A_L3L1_fifos[i].prod(), A, A_taps[i], task_group=tg)
-            rt.fill(B_L3L1_fifos[i].prod(), B, task_group=tg)
-        for i in range(cols):
-            rt.drain(C_L1L3_fifos[i].cons(), C, C_taps[i], task_group=tg, wait=True)
-        rt.finish_task_group(tg)
+        tg_b = rt.task_group()
+        for col in range(cols):
+            # Simple linear transfer of B, includes all batches in sequence
+            rt.fill(B_L3L1_fifos[col].prod(), B, B_tap, task_group=tg_b)
+        for batch in range(num_batches):
+            tg_ac = rt.task_group()
+            for col in range(cols):
+                rt.fill(
+                    A_L3L1_fifos[col].prod(), A, A_taps[col][batch], task_group=tg_ac
+                )
+            for col in range(cols):
+                rt.drain(
+                    C_L1L3_fifos[col].cons(),
+                    C,
+                    C_taps[col][batch],
+                    task_group=tg_ac,
+                    wait=True,
+                )
+            rt.finish_task_group(tg_ac)
+        rt.finish_task_group(tg_b)
 
     return Program(dev_ty, rt).resolve_program(SequentialPlacer())
-
-
-def main():
-    argparser = argparse.ArgumentParser(
-        prog="AIE Matrix Vector Multiplication MLIR Design",
-    )
-    argparser.add_argument("--dev", type=str, choices=["npu", "npu2"], default="npu")
-    argparser.add_argument("-M", type=int)
-    argparser.add_argument("-K", type=int)
-    argparser.add_argument("-m", type=int)
-    argparser.add_argument("--cols", type=int)
-    argparser.add_argument(
-        "--output-file-path",
-        "-o",
-        type=str,
-        help="Output file path for the generated MLIR module",
-    )
-    args = argparser.parse_args()
-    module = my_matvec(args.dev, args.cols, args.M, args.K, args.m)
-
-    output_file_path = Path(args.output_file_path)
-
-    with open(output_file_path, "w") as f:
-        f.write(str(module))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/iron/operators/gemv/op.py b/iron/operators/gemv/op.py
index 6ed5a9fe..a96910ac 100644
--- a/iron/operators/gemv/op.py
+++ b/iron/operators/gemv/op.py
@@ -7,8 +7,8 @@
 from pathlib import Path
 
 from iron.common import (
-    AIEOperatorBase,
-    AIEOperatorConstraintError,
+    MLIROperator,
+    AIERuntimeArgSpec,
     XclbinArtifact,
     InstsBinArtifact,
     KernelObjectArtifact,
@@ -16,10 +16,9 @@
     SourceArtifact,
     PythonGeneratedMLIRArtifact,
 )
-from iron.common.utils import torch_to_numpy
 
 
-class AIEGEMV(AIEOperatorBase):
+class AIEGEMV(MLIROperator):
    """AIE-accelerated General Matrix-Vector/Vector-Matrix Multiplication layer"""
 
     def __init__(
@@ -29,8 +28,9 @@
         num_aie_columns=1,
tile_size_input=2, tile_size_output=None, - is_mv=True, + num_batches=1, use_static_weight=False, + kernel_vector_size=64, context=None, ): if tile_size_output is None: @@ -40,31 +40,30 @@ def __init__( tile_size_output % tile_size_input == 0 and tile_size_output >= tile_size_input ), "tile_size_output must be a multiple of tile_size_input" - self.M = M # matrix rows (if is_mv=False, matrix columns) - self.K = K # matrix columns, vector rows (if is_mv=False, matrix rows, vector columns) + self.M = M # matrix rows + self.K = K # matrix columns, vector rows self.num_aie_columns = num_aie_columns self.tile_size_input = tile_size_input self.tile_size_output = tile_size_output - self.is_mv = is_mv - if use_static_weight: - self.weight = torch.zeros( - (M, K) if is_mv else (K, M), dtype=torch.bfloat16 - ).T # weights are stored col-major/transposed - else: - self.weight = None + self.num_batches = num_batches + self.kernel_vector_size = kernel_vector_size + assert ( + K >= kernel_vector_size and K % kernel_vector_size == 0 + ), "K must be multiple of kernel_vector_size" self.xclbin_artifact = None self.insts_artifact = None - AIEOperatorBase.__init__(self, context=context) + MLIROperator.__init__(self, context=context) + + def get_operator_name(self): + return f"gemv_{self.M}x{self.K}_{self.tile_size_input}tsi_{self.tile_size_output}tso_{self.num_batches}batch_{self.num_aie_columns}col" - def get_artifacts(self, prefix="gemv_"): - # The underlying MLIR design is a matrix-vector multiplication. We support vector-matrix multiplication by transposing the matrix beforehand (AB = C <=> B^T A^T = C^T). + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"{prefix}{self.M}x{self.K}_{self.tile_size_input}tsi_{self.tile_size_output}tso_{self.num_aie_columns}col" - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_matvec", callback_args=[ @@ -74,119 +73,30 @@ def get_artifacts(self, prefix="gemv_"): self.K, self.tile_size_input, self.tile_size_output, + self.num_batches, ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"mv.o", - depends=[ - SourceArtifact.new( - self.context.base_dir / "aie_kernels" / "generic" / "mv.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - return xclbin_artifact, insts_artifact - - def set_up_artifacts(self): - xclbin_artifact, insts_artifact = self.get_artifacts() - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def set_up_runtime(self): - # If this operator is only used as a sub-operator in another operator that sets it up, we should skip the setup here as those artifacts and buffers may not be needed. 
- # Runtime Setup - # --- - static_weights = None - if self.weight is not None: - # Kernel expects row-major weights, so might need to transpose (torch weights are stored in col-major); - # also might need to transpose if is_mv - if self.is_mv: - static_weights = self.weight.T - else: - # Double transpose cancels out - static_weights = self.weight - if isinstance(static_weights, torch.Tensor): - static_weights = torch_to_numpy(static_weights) - self.add_kernel( - "gemv", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_buffer("matrix", self.M * self.K, static_data=static_weights) - self.add_buffer("vector", self.K) - self.add_buffer("output", self.M) - self.add_to_runlist("gemv", "matrix", "vector", "output") - - def forward(self, vector, matrix=None): - """Forward pass through GEMV operation - - Args: - matrix: Input matrix of shape (..., M, K) - vector: Input vector of shape (..., K) for MV or (..., M) for VM - is_mv: True for matrix-vector multiplication, False for vector-matrix - - Returns: - Output vector of shape (..., M) for MV or (..., K) for VM - """ - - # Flatten batch dimensions if needed - if matrix is not None: - matrix = matrix.reshape(*matrix.shape[-2:]) - original_vector_dims = vector.ndim - vector = vector.reshape(*vector.shape[-1:]) - - # For vector-matrix, we'll transpose the matrix internally - if matrix is not None and not self.is_mv: - # Transpose the matrix for vector-matrix multiplication - # (if using static weights, the matrix is already transposed once at setup if needed) - matrix = matrix.transpose(-2, -1) - - if matrix is not None: - matrix_rows = matrix.shape[-2] - matrix_cols = matrix.shape[-1] - else: - matrix_rows = self.M - matrix_cols = self.K - - vector_size = vector.shape[-1] - - applicable = ( - matrix_cols == vector_size - and matrix_rows == self.M - and matrix_cols == self.K - and (matrix is None or matrix.dtype == torch.bfloat16) - and vector.dtype == torch.bfloat16 - ) - if not applicable: - raise AIEOperatorConstraintError( - "AIEElementwiseAdd: incompatible tensor shape(s)" - ) - - if matrix is not None: - # If matrix is none, we are using static weights that have already been written to the buffer - self.write_buffer("matrix", matrix) - self.write_buffer("vector", vector) - self.run_runlist() - result = self.read_buffer_as_torch("output", (self.M,)) - - # Add back batch dimensions if we removed them earlier. 
- if result.ndim < original_vector_dims: - result = result.reshape(*((1,) * (original_vector_dims - 1)), -1) - - return result + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"gemv_{self.K}k.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "generic" / "mv.cc" + ) + ], + extra_flags=[ + f"-DDIM_K={self.K}", + f"-DVEC_SIZE={self.kernel_vector_size}", + ], + ), + ] + + def get_arg_spec(self): + batch_dim = (self.num_batches,) if self.num_batches > 1 else () + return [ + AIERuntimeArgSpec("in", batch_dim + (self.M, self.K)), # matrix + AIERuntimeArgSpec("in", batch_dim + (self.K,)), # vector + AIERuntimeArgSpec("out", batch_dim + (self.M,)), # output + ] diff --git a/iron/operators/gemv/test.py b/iron/operators/gemv/test.py index 2dd4a8e6..493d51c0 100755 --- a/iron/operators/gemv/test.py +++ b/iron/operators/gemv/test.py @@ -12,8 +12,8 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): - params = [ +def get_params(): + params_list = [ (128, 128, 1, 32, 128), (2048, 8192, 1, 1, 2048), (8192, 2048, 1, 4, 1024), @@ -24,24 +24,13 @@ def generate_test_params(extensive=False): (2048, 8192, 8, 1, 256), (8192, 2048, 8, 4, 1024), ] - names = [ - f"matrix_vector_mul_{M}x{K}_{tile_size_input}tsi_{tile_size_output}tso_{num_aie_columns}col" - for M, K, num_aie_columns, tile_size_input, tile_size_output in params - ] - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + params = [] + for p in params_list: + # All tests are considered regular here as per original code structure + # (original code returned same list for both regular and extensive) + params.append(pytest.param(*p)) + return params @pytest.mark.metrics( @@ -50,7 +39,7 @@ def generate_test_params(extensive=False): Throughput=r"Throughput: (?P[\d\.e\+-]+) GFLOP/s", ) @pytest.mark.parametrize( - "M,K,num_aie_columns,tile_size_input,tile_size_output", all_params + "M,K,num_aie_columns,tile_size_input,tile_size_output", get_params() ) def test_gemv(M, K, num_aie_columns, tile_size_input, tile_size_output, aie_context): golden_ref = generate_golden_reference(M=M, K=K) diff --git a/iron/operators/layer_norm/design.py b/iron/operators/layer_norm/design.py index f48bb2d2..c5f088a4 100644 --- a/iron/operators/layer_norm/design.py +++ b/iron/operators/layer_norm/design.py @@ -15,7 +15,15 @@ from aie.helpers.util import np_ndarray_type_get_shape -def my_layer_norm(dev, num_elements, num_columns, num_channels, trace_size, tile_size): +def my_layer_norm( + dev, + num_elements, + num_columns, + num_channels, + trace_size, + tile_size, + kernel_archive=None, +): per_tile_elements = 8192 if tile_size > 8192 else tile_size n = per_tile_elements * num_columns if num_elements % n != 0: diff --git a/iron/operators/layer_norm/op.py b/iron/operators/layer_norm/op.py index cc3c1aa2..b2b7a35e 100644 --- a/iron/operators/layer_norm/op.py +++ b/iron/operators/layer_norm/op.py @@ -7,8 +7,8 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, 
InstsBinArtifact, KernelObjectArtifact, @@ -17,16 +17,19 @@ ) -class AIELayerNorm(AIEOperatorBase): +class AIELayerNorm(MLIROperator): """AIE-accelerated LAYER NORM operator""" def __init__( self, size, num_aie_columns, num_channels, tile_size, trace_size=0, context=None ): max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + assert ( + size % max_multiple == 0 + ), "size must be multiple of num_aie_columns * tile_size" + assert size % tile_size == 0, "size must be multiple of tile_size" + + self.size = size self.tile_size = tile_size self.trace_size = trace_size self.num_aie_columns = num_aie_columns @@ -35,17 +38,15 @@ def __init__( total_shimdma_channels = self.num_aie_columns * self.num_channels assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"layer_norm_{self.num_aie_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - def set_up_artifacts(self): + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"layer_norm_{self.num_aie_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_layer_norm", callback_args=[ @@ -58,62 +59,23 @@ def set_up_artifacts(self): ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"layer_norm.o", - depends=[ - SourceArtifact.new( - self.context.base_dir - / "aie_kernels" - / "aie2p" - / "layer_norm.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - self.add_artifacts([xclbin_artifact, insts_artifact]) - - def set_up_runtime(self): - self.add_buffer("input", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "layer_norm", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("layer_norm", "input", "output") - - def forward(self, x): - if x.numel() > self.size: - raise AIEOperatorConstraintError( - "AIELayerNorm: input too large for configured size" - ) - - original_shape = x.shape - x_flat = x.reshape(-1) - - pad_len = self.size - x_flat.numel() - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - - self.write_buffer("input", x_flat) - self.write_buffer("output", np.zeros(self.size, dtype=bfloat16)) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=(self.size,), dtype=bfloat16) - - if pad_len > 0: - result = result[: x_flat.numel() - pad_len] - - return result.reshape(*original_shape) + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"layer_norm.o", + dependencies=[ + SourceArtifact( + self.context.base_dir + / "aie_kernels" + / "aie2p" + / "layer_norm.cc" + ) + ], + ), + ] + + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.size,)), # input + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/layer_norm/test.py b/iron/operators/layer_norm/test.py 
index 2b14641c..57fffea4 100755 --- a/iron/operators/layer_norm/test.py +++ b/iron/operators/layer_norm/test.py @@ -12,11 +12,10 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 - input_lengths = [2048] if not extensive else [1024, 4096, 8192] + input_lengths = [1024, 2048, 4096, 8192] params = [] - names = [] for input_length in input_lengths: for num_aie_columns in range(1, max_aie_columns + 1): for num_channels_layer in range(1, 3): # 1 or 2 @@ -26,26 +25,19 @@ def generate_test_params(extensive=False): tile_size = 8192 check_length = tile_size * total_cores if check_length == input_length: - names.append( - f"layer_norm_{num_aie_columns}_cols_{num_channels_layer}_channels_{input_length}_tile_{tile_size}" - ) + is_regular = input_length == 2048 + marks = [] if is_regular else [pytest.mark.extensive] + params.append( - (input_length, num_aie_columns, num_channels_layer, tile_size) + pytest.param( + input_length, + num_aie_columns, + num_channels_layer, + tile_size, + marks=marks, + ) ) - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -54,7 +46,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size", - all_params, + get_params(), ) def test_layer_norm( input_length, num_aie_columns, num_channels, tile_size, aie_context diff --git a/iron/operators/leaky_relu/design.py b/iron/operators/leaky_relu/design.py index 25cd580b..a5d5c534 100644 --- a/iron/operators/leaky_relu/design.py +++ b/iron/operators/leaky_relu/design.py @@ -14,7 +14,16 @@ from aie.iron.controlflow import range_ -def my_leaky_relu(dev, size, num_columns, num_channels, tile_size, trace_size, alpha): +def my_leaky_relu( + dev, + size, + num_columns, + num_channels, + tile_size, + trace_size, + alpha, + kernel_archive=None, +): xfr_dtype = bfloat16 line_size = 4096 if tile_size > 4096 else tile_size line_type = np.ndarray[(line_size,), np.dtype[xfr_dtype]] diff --git a/iron/operators/leaky_relu/op.py b/iron/operators/leaky_relu/op.py index e26fc368..72fddeb7 100644 --- a/iron/operators/leaky_relu/op.py +++ b/iron/operators/leaky_relu/op.py @@ -7,8 +7,8 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -17,16 +17,19 @@ ) -class AIELeakyReLU(AIEOperatorBase): +class AIELeakyReLU(MLIROperator): """AIE-accelerated LEAKY RELU operator""" def __init__( self, size, num_aie_columns, num_channels, tile_size, alpha=0.01, context=None ): max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + assert ( + size % max_multiple == 0 + ), "size must be multiple of num_aie_columns * tile_size" + assert size % tile_size == 0, "size must be multiple of tile_size" + + self.size = size self.tile_size = tile_size self.num_columns = num_aie_columns @@ -36,17 +39,15 @@ def 
__init__( total_shimdma_channels = self.num_columns * self.num_channels assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"leaky_relu_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - def set_up_artifacts(self): + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"leaky_relu_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_leaky_relu", callback_args=[ @@ -60,62 +61,23 @@ def set_up_artifacts(self): ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"leaky_relu.o", - depends=[ - SourceArtifact.new( - self.context.base_dir - / "aie_kernels" - / "aie2p" - / "leaky_relu.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - self.add_artifacts([xclbin_artifact, insts_artifact]) - - def set_up_runtime(self): - self.add_buffer("input", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "leaky_relu", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("leaky_relu", "input", "output") - - def forward(self, x): - if x.numel() > self.size: - raise AIEOperatorConstraintError( - "AIELeakyReLU: input too large for configured size" - ) - - original_shape = x.shape - x_flat = x.reshape(-1) - - pad_len = self.size - x_flat.numel() - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - - self.write_buffer("input", x_flat) - self.write_buffer("output", np.zeros(self.size, dtype=bfloat16)) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=(self.size,), dtype=bfloat16) - - if pad_len > 0: - result = result[: x_flat.numel() - pad_len] - - return result.reshape(*original_shape) + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"leaky_relu.o", + dependencies=[ + SourceArtifact( + self.context.base_dir + / "aie_kernels" + / "aie2p" + / "leaky_relu.cc" + ) + ], + ), + ] + + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.size,)), # input + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/leaky_relu/test.py b/iron/operators/leaky_relu/test.py index cac577ad..6adb8d4d 100755 --- a/iron/operators/leaky_relu/test.py +++ b/iron/operators/leaky_relu/test.py @@ -12,24 +12,10 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): # Leaky ReLU is currently broken (#36); leave it untested params = [] - names = [] - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, 
name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -38,7 +24,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size,alpha", - all_params, + get_params(), ) def test_leaky_relu( input_length, num_aie_columns, num_channels, tile_size, alpha, aie_context diff --git a/iron/operators/mem_copy/design.py b/iron/operators/mem_copy/design.py index ce807a48..73a0eca2 100644 --- a/iron/operators/mem_copy/design.py +++ b/iron/operators/mem_copy/design.py @@ -167,7 +167,16 @@ def create_partial_workload_config( # -def my_mem_copy(dev, size, num_cores, num_channels, bypass, tile_size, trace_size): +def my_mem_copy( + dev, + size, + num_cores, + num_channels, + bypass, + tile_size, + trace_size, + kernel_archive=None, +): # -------------------------------------------------------------------------- # Configuration # -------------------------------------------------------------------------- diff --git a/iron/operators/mem_copy/op.py b/iron/operators/mem_copy/op.py index c5c9f14e..08cd95c9 100644 --- a/iron/operators/mem_copy/op.py +++ b/iron/operators/mem_copy/op.py @@ -7,17 +7,18 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, + KernelArchiveArtifact, SourceArtifact, PythonGeneratedMLIRArtifact, ) -class AIEMemCopy(AIEOperatorBase): +class AIEMemCopy(MLIROperator): def __init__(self, size, num_cores, num_channels, bypass, tile_size, context=None): self.size = size @@ -29,22 +30,16 @@ def __init__(self, size, num_cores, num_channels, bypass, tile_size, context=Non # For naming consistency with other operators self.bypass_str = "bypass" if bypass else "no_bypass" - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"mem_copy_{self.num_cores}_cores_{self.num_channels}_chans_tile_{self.tile_size}_{self.bypass_str}" - def set_up_artifacts(self): + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - size = self.tile_size * self.num_cores - - # Xclbin base name (shared) - xclbin_base_name = f"mem_copy_{self.num_cores}_cores_{self.num_channels}_chans_tile_{self.tile_size}_{self.bypass_str}" - - # Generate MLIR for xclbin (using dummy size) - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{xclbin_base_name}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_mem_copy", callback_args=[ @@ -58,67 +53,57 @@ def set_up_artifacts(self): ], ) - # Build kernel only if not bypass mode + def get_kernel_artifacts(self): if not self.bypass: - kernel_artifact = KernelObjectArtifact.new( - "mem_copy.o", - depends=[ - SourceArtifact.new( - self.context.base_dir - / "aie_kernels" - / "generic" - / "passThrough.cc" - ) - ], - ) - xclbin_depends = [mlir_artifact, kernel_artifact] + return [ + KernelObjectArtifact( + "mem_copy.o", + dependencies=[ + SourceArtifact( + self.context.base_dir + / "aie_kernels" + / "generic" + / "passThrough.cc" + ) + ], + ) + ] else: - xclbin_depends = [mlir_artifact] - - xclbin_artifact = XclbinArtifact.new( - f"{xclbin_base_name}.xclbin", - depends=xclbin_depends, - extra_flags=["--dynamic-objFifos"], + return [] + + def get_artifacts(self): + # Override to add --dynamic-objFifos flag + 
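# --- Illustrative sketch (annotation, not part of the patch) ----------------
# The "kernel_archive" kwarg that this get_artifacts() override injects into
# the MLIR callback is consumed inside the generated design, where it names the
# binary that Kernel declarations link against -- the same pattern mha/design.py
# uses in this patch ("bin_name = kernel_archive if kernel_archive else ...").
# A minimal consuming design, with an assumed import path and a hypothetical
# kernel symbol/object-file name, looks roughly like:
import numpy as np
from ml_dtypes import bfloat16
from aie.iron import Kernel  # import path assumed

def my_design(dev, tile_size, kernel_archive=None):
    # Fall back to a standalone object file when no archive is supplied
    bin_name = kernel_archive if kernel_archive else "my_kernel.o"
    line_ty = np.ndarray[(tile_size,), np.dtype[bfloat16]]
    return Kernel("my_kernel_bf16", bin_name, [line_ty, line_ty, np.int32])
# -----------------------------------------------------------------------------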
operator_name = self.get_operator_name() + mlir_artifact = self.get_mlir_artifact() + kernel_deps_inputs = self.get_kernel_artifacts() + if len(kernel_deps_inputs) > 0: + mlir_artifact.callback_kwargs["kernel_archive"] = self.kernel_archive + kernel_deps = ( + [ + KernelArchiveArtifact( + self.kernel_archive, + dependencies=kernel_deps_inputs, + ) + ] + if kernel_deps_inputs + else [] ) - - insts_file_name = f"mem_copy_{self.num_cores}_cores_{self.num_channels}_chans_{self.size}_tile_{self.tile_size}_{self.bypass_str}" - insts_artifact = InstsBinArtifact.new( - f"{insts_file_name}.bin", - depends=[mlir_artifact], + xclbin_artifact = XclbinArtifact( + f"{operator_name}.xclbin", + mlir_input=mlir_artifact, + dependencies=[mlir_artifact] + kernel_deps, extra_flags=["--dynamic-objFifos"], ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def set_up_runtime(self): - self.add_buffer("input", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "mem_copy", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, + insts_artifact = InstsBinArtifact( + f"{operator_name}.bin", + mlir_input=mlir_artifact, + dependencies=[mlir_artifact], + extra_flags=["--dynamic-objFifos"], ) - self.add_to_runlist("mem_copy", "input", "output") - - def forward(self, x): - """Forward pass for memory copy""" - if x.numel() != self.size: - raise AIEOperatorConstraintError( - f"AIEMemCopy: input size {x.numel()} does not match expected size {self.size}" - ) - - original_shape = x.shape - x_flat = x.reshape(-1) - - # Execute on AIE - self.write_buffer("input", x_flat) - self.write_buffer("output", np.zeros(self.size, dtype=bfloat16)) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=(self.size,), dtype=bfloat16) + return xclbin_artifact, insts_artifact - return result.reshape(*original_shape) + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.size,)), # input + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/mem_copy/test.py b/iron/operators/mem_copy/test.py index afd7f540..d65028c2 100644 --- a/iron/operators/mem_copy/test.py +++ b/iron/operators/mem_copy/test.py @@ -12,12 +12,11 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): - input_lengths = [2048] if not extensive else [1024, 2048, 4096, 8192] - bypass_modes = [False] if not extensive else [False, True] +def get_params(): + input_lengths = [1024, 2048, 4096, 8192] + bypass_modes = [False, True] params = [] - names = [] for input_length in input_lengths: for num_cores in range(1, 17): # 1 to 16 cores @@ -35,33 +34,21 @@ def generate_test_params(extensive=False): # Only proceed if tile_size * num_cores == input_length (exact division) if tile_size * num_cores == input_length: - names.append( - f"mem_copy_{num_cores}_cores_{num_channels}_chans_{input_length}_tile_{tile_size}_{str(bypass)}" - ) + is_regular = input_length == 2048 and bypass == False + marks = [] if is_regular else [pytest.mark.extensive] + params.append( - ( + pytest.param( input_length, num_cores, num_channels, bypass, tile_size, + marks=marks, ) ) - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - 
pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -70,7 +57,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_cores,num_channels,bypass,tile_size", - all_params, + get_params(), ) def test_mem_copy( input_length, num_cores, num_channels, bypass, tile_size, aie_context diff --git a/iron/operators/mha/design.py b/iron/operators/mha/design.py index d11e4ed4..9dc33b92 100644 --- a/iron/operators/mha/design.py +++ b/iron/operators/mha/design.py @@ -115,6 +115,7 @@ def fused_mha( emulate_bf16_mmul_with_bfp16: bool, trace_size: int = 0, verbose: bool = False, + kernel_archive=None, ): of_depth = 2 @@ -205,7 +206,7 @@ def fused_mha( # AIE kernel declarations func_type = "" if vectorized else "_scalar" - bin_name = "mha_kernels.a" + bin_name = kernel_archive if kernel_archive else "mha_kernels.a" zero_kernel = Kernel(f"zero_{dtype_str}", bin_name, [qk_ty]) diff --git a/iron/operators/mha/op.py b/iron/operators/mha/op.py index 58864519..9fa8c0ba 100644 --- a/iron/operators/mha/op.py +++ b/iron/operators/mha/op.py @@ -8,8 +8,8 @@ from typing import Dict, List from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -17,10 +17,9 @@ SourceArtifact, PythonGeneratedMLIRArtifact, ) -from iron.common.utils import torch_to_numpy, numpy_to_torch -class AIEMHA(AIEOperatorBase): +class AIEMHA(MLIROperator): def __init__( self, @@ -40,20 +39,34 @@ def __init__( self.num_of_pipelines = num_of_pipelines assert d == 64, "Only d=64 is supported in this version" - # Artifacts created by set_up_artifacts() - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + kv_heads = self.num_KV_heads if self.num_KV_heads > 0 else self.num_heads + return f"mha_{self.num_heads}h_{kv_heads}kv_{self.seq_len}s_{self.d}d" - def set_up_artifacts(self): - # Set up compilation artifacts - # --- + def get_mlir_artifact(self): operator_dir = Path(__file__).parent + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", + import_path=operator_dir / "design.py", + callback_fn="fused_mha", + callback_kwargs={ + "heads": self.num_heads, + "S_q": self.seq_len, + "S_kv": self.seq_len, + "d": self.d, + "B_q": self.B_q, + "B_kv": self.B_kv, + "num_KV_heads": self.num_KV_heads, + "number_of_pipelines": self.num_of_pipelines, + "emulate_bf16_mmul_with_bfp16": True, + "trace_size": 0, + "verbose": False, + }, + ) - kv_heads = self.num_KV_heads if self.num_KV_heads > 0 else self.num_heads - file_name_base = f"mha_{self.num_heads}h_{kv_heads}kv_{self.seq_len}s_{self.d}d" - + def get_kernel_artifacts(self): # Define source files mm_source = str(self.context.base_dir / "aie_kernels" / "aie2p" / "mm.cc") softmax_source = str( @@ -83,105 +96,72 @@ def set_up_artifacts(self): "zero_scalar_bf16": "zero_scalar_bf16_rowmaj", } - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", - import_path=operator_dir / "design.py", - callback_fn="fused_mha", - callback_kwargs={ - "heads": self.num_heads, - "S_q": self.seq_len, - "S_kv": self.seq_len, - "d": self.d, - "B_q": self.B_q, - "B_kv": self.B_kv, - "num_KV_heads": 
self.num_KV_heads, - "number_of_pipelines": self.num_of_pipelines, - "emulate_bf16_mmul_with_bfp16": True, - "trace_size": 0, - "verbose": False, - }, - ) + return [ + KernelObjectArtifact( + f"mha_mm.o", + extra_flags=mm_defines_colmaj, + dependencies=[SourceArtifact(mm_source)], + ), + KernelObjectArtifact( + f"mha_mm_rowmaj.o", + extra_flags=mm_defines_rowmaj, + dependencies=[SourceArtifact(mm_source)], + rename_symbols=mm_rename_symbols, + ), + KernelObjectArtifact( + "mha_softmax.o", + dependencies=[SourceArtifact(softmax_source)], + ), + KernelObjectArtifact( + "mha_mha.o", dependencies=[SourceArtifact(mha_source)] + ), + KernelObjectArtifact( + "mha_passThrough.o", + extra_flags=["-DBIT_WIDTH=16"], + dependencies=[SourceArtifact(passthrough_source)], + ), + ] - xclbin_artifact = XclbinArtifact.new( - f"mha.xclbin", - depends=[ - mlir_artifact, - KernelArchiveArtifact.new( - f"mha_kernels.a", - depends=[ - KernelObjectArtifact.new( - f"mha_mm.o", - extra_flags=mm_defines_colmaj, - depends=[SourceArtifact.new(mm_source)], - ), - KernelObjectArtifact.new( - f"mha_mm_rowmaj.o", - extra_flags=mm_defines_rowmaj, - depends=[SourceArtifact.new(mm_source)], - rename_symbols=mm_rename_symbols, - ), - KernelObjectArtifact.new( - "mha_softmax.o", - depends=[SourceArtifact.new(softmax_source)], - ), - KernelObjectArtifact.new( - "mha_mha.o", depends=[SourceArtifact.new(mha_source)] - ), - KernelObjectArtifact.new( - "mha_passThrough.o", - extra_flags=["-DBIT_WIDTH=16"], - depends=[SourceArtifact.new(passthrough_source)], - ), - ], - ), - ], + def get_artifacts(self): + # Override to add --dynamic-objFifos flag + operator_name = self.get_operator_name() + mlir_artifact = self.get_mlir_artifact() + kernel_deps_inputs = self.get_kernel_artifacts() + if len(kernel_deps_inputs) > 0: + mlir_artifact.callback_kwargs["kernel_archive"] = self.kernel_archive + kernel_deps = ( + [ + KernelArchiveArtifact( + self.kernel_archive, + dependencies=kernel_deps_inputs, + ) + ] + if kernel_deps_inputs + else [] + ) + xclbin_artifact = XclbinArtifact( + f"{operator_name}.xclbin", + mlir_input=mlir_artifact, + dependencies=[mlir_artifact] + kernel_deps, extra_flags=["--dynamic-objFifos"], ) - - insts_artifact = InstsBinArtifact.new( - f"mha.bin", depends=[mlir_artifact], extra_flags=["--dynamic-objFifos"] + insts_artifact = InstsBinArtifact( + f"{operator_name}.bin", + mlir_input=mlir_artifact, + dependencies=[mlir_artifact], + extra_flags=["--dynamic-objFifos"], ) + return xclbin_artifact, insts_artifact - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def set_up_runtime(self): - # Set up runtime - # --- - self.add_kernel( - "mha", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_buffer( - "Q", - self.num_heads - * self.d - * self._calculate_seq_padding(self.seq_len, self.num_of_pipelines), - ) - self.add_buffer( - "K", - self.num_heads - * self.d - * self._calculate_seq_padding(self.seq_len, self.num_of_pipelines), - ) - self.add_buffer( - "V", - self.num_heads - * self.d - * self._calculate_seq_padding(self.seq_len, self.num_of_pipelines), - ) - self.add_buffer( - "O", - self.num_heads - * self.d - * self._calculate_seq_padding(self.seq_len, self.num_of_pipelines), - ) - self.add_to_runlist("mha", "Q", "K", "V", "O") + def get_arg_spec(self): + seq_padding = self._calculate_seq_padding(self.seq_len, self.num_of_pipelines) + buffer_size = 
self.num_heads * self.d * seq_padding + return [ + AIERuntimeArgSpec("in", (buffer_size,)), # Q + AIERuntimeArgSpec("in", (buffer_size,)), # K + AIERuntimeArgSpec("in", (buffer_size,)), # V + AIERuntimeArgSpec("out", (buffer_size,)), # O + ] def _calculate_seq_padding(self, seq_len, num_pipeline=1): return ((seq_len + 63 * num_pipeline) // (64 * num_pipeline)) * ( @@ -190,7 +170,7 @@ def _calculate_seq_padding(self, seq_len, num_pipeline=1): def _pad_to_multiple_of_64(self, tensor, seq_dim, num_pipeline=1): seq_len = tensor.shape[seq_dim] - padded_seq_len = _calculate_seq_padding(seq_len, num_pipeline) + padded_seq_len = self._calculate_seq_padding(seq_len, num_pipeline) if padded_seq_len == seq_len: return tensor @@ -219,63 +199,3 @@ def _unpack_padded_to_compact( dst = np.zeros((H, S, D), dtype=src.dtype) dst = src[:H, :S, :D] return dst - - def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor): - applicable = ( - q.shape[-1] == self.d - and k.shape[-1] == self.d - and v.shape[-1] == self.d - and q.shape[-2] == self.seq_len - and k.shape[-2] == self.seq_len - and v.shape[-2] == self.seq_len - and self.seq_len % 64 == 0, # Sequence length must be multiple of 64 - ) - if not applicable: - raise AIEOperatorConstraintError( - "AIEElementwiseAdd: incompatible tensor shape(s)" - ) - - ret = self._execute_aie_operation(q, k, v) - return ret - - def _execute_aie_operation(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor): - # Convert to numpy - q_np = torch_to_numpy(q) - k_np = torch_to_numpy(k) - v_np = torch_to_numpy(v) - - # Calculate padded sequence length - S_pad = self._calculate_seq_padding(self.seq_len, self.num_of_pipelines) - - # Pack compact inputs to padded format - q_padded = self._pack_compact_to_padded( - q_np, self.num_heads, self.seq_len, S_pad, self.d - ) - k_padded = self._pack_compact_to_padded( - k_np, self.num_heads, self.seq_len, S_pad, self.d - ) - v_padded = self._pack_compact_to_padded( - v_np, self.num_heads, self.seq_len, S_pad, self.d - ) - - # Write padded buffers - self.write_buffer("Q", q_padded) - self.write_buffer("K", k_padded) - self.write_buffer("V", v_padded) - - # Execute - self.run_runlist() - - # Read padded output - o_padded = self.read_buffer( - "O", shape=(self.num_heads, S_pad, self.d), dtype=bfloat16 - ) - - # Unpack padded output to compact format - o_compact = self._unpack_padded_to_compact( - o_padded, self.num_heads, self.seq_len, S_pad, self.d - ) - - # Convert back to torch with correct shape - result = numpy_to_torch(o_compact) - return result diff --git a/iron/operators/mha/test.py b/iron/operators/mha/test.py index 35c5087f..ae0275cf 100755 --- a/iron/operators/mha/test.py +++ b/iron/operators/mha/test.py @@ -12,30 +12,20 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): - params = [(16384, 64, 1, 8)] - names = ["mha"] - return params, names +def get_params(): + params_list = [(16384, 64, 1, 8)] - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + params = [] + for p in params_list: + params.append(pytest.param(*p)) + return params @pytest.mark.metrics( Latency=r"Latency 
\(us\): (?P[\d\.]+)", Bandwidth=r"Effective Bandwidth: (?P[\d\.e\+-]+) GB/s", ) -@pytest.mark.parametrize("seq_len,dim,num_heads,num_pipelines", all_params) +@pytest.mark.parametrize("seq_len,dim,num_heads,num_pipelines", get_params()) def test_mha(seq_len, dim, num_heads, num_pipelines, aie_context): golden_ref = generate_golden_reference( S_q=seq_len, diff --git a/iron/operators/relu/design.py b/iron/operators/relu/design.py index 496bb443..5c46fbb9 100644 --- a/iron/operators/relu/design.py +++ b/iron/operators/relu/design.py @@ -14,7 +14,9 @@ from aie.iron.controlflow import range_ -def my_relu(dev, size, num_columns, num_channels, tile_size, trace_size): +def my_relu( + dev, size, num_columns, num_channels, tile_size, trace_size, kernel_archive=None +): xfr_dtype = bfloat16 line_size = 4096 if tile_size > 4096 else tile_size line_type = np.ndarray[(line_size,), np.dtype[xfr_dtype]] diff --git a/iron/operators/relu/op.py b/iron/operators/relu/op.py index 8b1f54e8..24ad44dc 100644 --- a/iron/operators/relu/op.py +++ b/iron/operators/relu/op.py @@ -7,8 +7,8 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -17,14 +17,17 @@ ) -class AIEReLU(AIEOperatorBase): +class AIEReLU(MLIROperator): """AIE-accelerated ReLU activation function""" def __init__(self, size, num_aie_columns, num_channels, tile_size, context=None): max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + assert ( + size % max_multiple == 0 + ), "size must be multiple of num_aie_columns * tile_size" + assert size % tile_size == 0, "size must be multiple of tile_size" + + self.size = size self.tile_size = tile_size self.num_aie_columns = num_aie_columns self.num_channels = num_channels @@ -32,17 +35,15 @@ def __init__(self, size, num_aie_columns, num_channels, tile_size, context=None) total_shimdma_channels = self.num_aie_columns * self.num_channels assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"relu_{self.num_aie_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - def set_up_artifacts(self): + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"relu_{self.num_aie_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_relu", callback_args=[ @@ -55,59 +56,20 @@ def set_up_artifacts(self): ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"relu.o", - depends=[ - SourceArtifact.new( - self.context.base_dir / "aie_kernels" / "aie2p" / "relu.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - self.add_artifacts([xclbin_artifact, insts_artifact]) - - def set_up_runtime(self): - self.add_buffer("input", self.size) - self.add_buffer("output", self.size) - 
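# --- Illustrative usage sketch (annotation, not part of the patch) ----------
# The imperative add_buffer/add_kernel/add_to_runlist plumbing deleted here is
# subsumed by the declarative get_arg_spec() contract: the MLIROperator base
# class presumably allocates one buffer per spec, binds them positionally, and
# supplies the generic forward(). Caller-side usage then reduces to:
import torch

op = AIEReLU(size=2048, num_aie_columns=8, num_channels=1, tile_size=256)
x = torch.rand(2048, dtype=torch.bfloat16)
y = op.forward(x)  # padding/truncation is now the caller's responsibility
# -----------------------------------------------------------------------------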
self.add_kernel( - "relu", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("relu", "input", "output") - - def forward(self, x): - if x.numel() > self.size: - raise AIEOperatorConstraintError( - "AIEReLU: input too large for configured size" - ) - - original_shape = x.shape - x_flat = x.reshape(-1) - - pad_len = self.size - x_flat.numel() - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - - self.write_buffer("input", x_flat) - self.write_buffer("output", np.zeros(self.size, dtype=bfloat16)) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=(self.size,), dtype=bfloat16) - - if pad_len > 0: - result = result[: x_flat.numel() - pad_len] - - return result.reshape(*original_shape) + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"relu.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "aie2p" / "relu.cc" + ) + ], + ), + ] + + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.size,)), # input + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/relu/test.py b/iron/operators/relu/test.py index 3194c8c0..f3236c26 100755 --- a/iron/operators/relu/test.py +++ b/iron/operators/relu/test.py @@ -12,13 +12,12 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 num_channels = 1 # 1 channel for 1 input - input_lengths = [2048] if not extensive else [1024, 4096, 8192] + input_lengths = [1024, 2048, 4096, 8192] params = [] - names = [] for input_length in input_lengths: for num_aie_columns in range(1, max_aie_columns + 1): tile_size = input_length // num_aie_columns @@ -26,24 +25,19 @@ def generate_test_params(extensive=False): tile_size = 4096 check_length = tile_size * num_aie_columns if check_length == input_length: - names.append( - f"relu_{num_aie_columns}_cols_{num_channels}_channels_{input_length}_tile_{tile_size}" + is_regular = input_length == 2048 + marks = [] if is_regular else [pytest.mark.extensive] + + params.append( + pytest.param( + input_length, + num_aie_columns, + num_channels, + tile_size, + marks=marks, + ) ) - params.append((input_length, num_aie_columns, num_channels, tile_size)) - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -52,7 +46,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size", - all_params, + get_params(), ) def test_relu(input_length, num_aie_columns, num_channels, tile_size, aie_context): golden_ref = generate_golden_reference(input_length=input_length) diff --git a/iron/operators/rms_norm/design.py b/iron/operators/rms_norm/design.py index 2bf09b43..583ca8f6 100644 --- a/iron/operators/rms_norm/design.py +++ b/iron/operators/rms_norm/design.py @@ -15,7 +15,15 @@ from aie.helpers.util import np_ndarray_type_get_shape -def my_rms_norm(dev, num_elements, num_columns, num_channels, trace_size, tile_size): +def my_rms_norm( + dev, + 
num_elements, + num_columns, + num_channels, + trace_size, + tile_size, + kernel_archive="rms_norm.a", +): per_tile_elements = 8192 if tile_size > 8192 else tile_size n = per_tile_elements * num_columns if num_elements % n != 0: @@ -46,7 +54,7 @@ def my_rms_norm(dev, num_elements, num_columns, num_channels, trace_size, tile_s # AIE Core Function declaration rms_norm_kernel = Kernel( - "rms_norm_bf16_vector", "rms_norm.o", [tile_ty, tile_ty, np.int32] + "rms_norm_bf16_vector", kernel_archive, [tile_ty, tile_ty, np.int32] ) # Define a task that will run on a compute tile @@ -120,93 +128,3 @@ def core_body(of_in1, of_out, rms_norm_kernel): # Place program components (assign them resources on the device) and generate an MLIR module return Program(dev, rt).resolve_program(SequentialPlacer()) - - -if __name__ == "__main__": - - def str_to_device(device: str): - if device == "npu": - return NPU1() - elif device == "npu2": - return NPU2() - else: - raise ValueError(f"Device name {device} is unknown.") - - p = argparse.ArgumentParser() - # Parse command line arguments - - # Device name is required to select the AIE device: npu or npu2 - p.add_argument( - "-d", - "--dev", - required=True, - dest="device", - help="AIE Device", - type=str_to_device, - ) - # Transfer size is required to define the size of the data to be transferred - # It must be a multiple of 1024 and divisible by the number of columns and 2 channels per column - p.add_argument("-l", "--length", required=True, dest="length", help="Transfer size") - # Number of columns is required to define the number of columns to be used - # It must be less than or equal to 4 for npu and 8 for npu2 - p.add_argument( - "-co", "--columns", required=True, dest="cols", help="Number of columns" - ) - # Number of channels is required to define the number of channels to be used - # It must be 1 or 2 - p.add_argument( - "-ch", "--channels", required=True, dest="chans", help="Number of channels" - ) - # Tile size (columns per tile) - defaults to 1024 for backward compatibility - p.add_argument( - "-ts", - "--tile-size", - required=False, - dest="tile_size", - default="1024", - help="Tile size (columns per tile)", - ) - # Trace Size - p.add_argument( - "-tr", "--trace-size", required=True, dest="trace_size", help="Trace size" - ) - p.add_argument( - "--output-file-path", - "-o", - type=str, - help="Output file path for the generated MLIR module", - ) - - opts = p.parse_args(sys.argv[1:]) - - length = int(opts.length) - columns = int(opts.cols) - dev = opts.device # Now this is already a device object! 
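# --- Illustrative sketch (annotation, not part of the patch) ----------------
# With this __main__ CLI block removed, the design is produced through
# PythonGeneratedMLIRArtifact, which imports design.py and invokes the callback
# directly. Standalone-equivalent usage, following the new signature
# (dev, num_elements, num_columns, num_channels, trace_size, tile_size);
# the values here are examples only:
from pathlib import Path
from aie.iron.device import NPU2

module = my_rms_norm(NPU2(), 8192, 8, 2, 0, 1024, kernel_archive="rms_norm.a")
Path("rms_norm.mlir").write_text(str(module))
# -----------------------------------------------------------------------------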
- - # Validate columns based on device type - if isinstance(dev, NPU1) and columns > 4: - raise ValueError("[ERROR] NPU device cannot allocate more than 4 columns") - elif isinstance(dev, NPU2) and columns > 8: - raise ValueError("[ERROR] NPU2 device cannot allocate more than 8 columns") - - channels = int(opts.chans) - if channels < 1 or channels > 2: - raise ValueError("Number of channels must be 1 or 2") - tile_size = int(opts.tile_size) - if ((length % tile_size) % columns % channels) != 0: - print( - "transfer size (" - + str(length) - + ") must be a multiple of " - + str(tile_size) - + " and divisible by the number of columns and 2 channels per column" - ) - raise ValueError - trace_size = int(opts.trace_size) if opts.trace_size is not None else 0 - - module = my_rms_norm(dev, length, columns, channels, trace_size, tile_size) - - output_file_path = Path(opts.output_file_path) - - with open(output_file_path, "w") as f: - f.write(str(module)) diff --git a/iron/operators/rms_norm/design_weighted.py b/iron/operators/rms_norm/design_weighted.py index 20c4fbbe..fab3caac 100644 --- a/iron/operators/rms_norm/design_weighted.py +++ b/iron/operators/rms_norm/design_weighted.py @@ -16,7 +16,14 @@ def my_weighted_rms_norm( - dev, num_elements, num_columns, num_channels, weight_length, trace_size + dev, + num_elements, + num_columns, + num_channels, + weight_length, + trace_size, + kernel_archive="rms_norm.a", + func_prefix="", ): per_tile_elements = weight_length total_cores = num_columns # For each core that does rms norm, another core will take its output to do eltwise mul @@ -53,11 +60,13 @@ def my_weighted_rms_norm( # AIE Core Function declaration rms_norm_kernel = Kernel( - "rms_norm_bf16_vector", "rms_norm_archive.a", [tile_ty, tile_ty, np.int32] + f"{func_prefix}rms_norm_bf16_vector", + kernel_archive, + [tile_ty, tile_ty, np.int32], ) eltwise_mul_kernel = Kernel( - "eltwise_mul_bf16_vector", - "rms_norm_archive.a", + f"{func_prefix}eltwise_mul_bf16_vector", + kernel_archive, [tile_ty, weights_ty, tile_ty, np.int32], ) @@ -157,96 +166,3 @@ def core_body_mul(of_in1, of_in2, of_out2, eltwise_mul): # Place program components (assign them resources on the device) and generate an MLIR module return Program(dev, rt).resolve_program(SequentialPlacer()) - - -if __name__ == "__main__": - - def str_to_device(device: str): - if device == "npu": - return NPU1() - elif device == "npu2": - return NPU2() - else: - raise ValueError(f"Device name {device} is unknown.") - - p = argparse.ArgumentParser() - # Parse command line arguments - - # Device name is required to select the AIE device: npu or npu2 - p.add_argument( - "-d", - "--dev", - required=True, - dest="device", - help="AIE Device", - type=str_to_device, - ) - # Transfer size is required to define the size of the data to be transferred - # It must be a multiple of 1024 and divisible by the number of columns and 2 channels per column - p.add_argument("-l", "--length", required=True, dest="length", help="Transfer size") - # Number of columns is required to define the number of columns to be used - # It must be less than or equal to 4 for npu and 8 for npu2 - p.add_argument( - "-co", "--columns", required=True, dest="cols", help="Number of columns" - ) - # Number of channels is required to define the number of channels to be used - # It must be 1 or 2 - p.add_argument( - "-ch", "--channels", required=True, dest="chans", help="Number of channels" - ) - # Weight length - p.add_argument( - "-wl", - "--weight-length", - required=True, - 
dest="weight_length", - help="Weight vector length", - ) - # Trace Size - p.add_argument( - "-ts", "--trace-size", required=True, dest="trace_size", help="Trace size" - ) - p.add_argument( - "--output-file-path", - "-o", - type=str, - help="Output file path for the generated MLIR module", - ) - - opts = p.parse_args(sys.argv[1:]) - - length = int(opts.length) - columns = int(opts.cols) - dev = opts.device # Now this is already a device object! - - # Validate columns based on device type - if isinstance(dev, NPU1) and columns > 4: - raise ValueError("[ERROR] NPU device cannot allocate more than 4 columns") - elif isinstance(dev, NPU2) and columns > 8: - raise ValueError("[ERROR] NPU2 device cannot allocate more than 8 columns") - - channels = int(opts.chans) - if channels < 1 or channels > 2: - raise ValueError("Number of channels must be 1 or 2") - weight_length = int(opts.weight_length) - # For weighted RMS norm: cores = columns (weights are broadcasted) - total_cores = columns - if (length % (weight_length * total_cores)) != 0: - print( - "transfer size (" - + str(length) - + ") must be a multiple of weight_length * total_cores (" - + str(weight_length * total_cores) - + ")" - ) - raise ValueError - trace_size = int(opts.trace_size) if opts.trace_size is not None else 0 - - module = my_weighted_rms_norm( - dev, length, columns, channels, weight_length, trace_size - ) - - output_file_path = Path(opts.output_file_path) - - with open(output_file_path, "w") as f: - f.write(str(module)) diff --git a/iron/operators/rms_norm/op.py b/iron/operators/rms_norm/op.py index 1ba38d92..fe8c8aa9 100644 --- a/iron/operators/rms_norm/op.py +++ b/iron/operators/rms_norm/op.py @@ -8,8 +8,8 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -17,10 +17,9 @@ SourceArtifact, PythonGeneratedMLIRArtifact, ) -from iron.common.utils import torch_to_numpy -class AIERMSNorm(AIEOperatorBase): +class AIERMSNorm(MLIROperator): """AIE-accelerated RMS Normalization layer""" def __init__( @@ -34,9 +33,12 @@ def __init__( context=None, ): max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + assert ( + size % max_multiple == 0 + ), "size must be multiple of num_aie_columns * tile_size" + assert size % tile_size == 0, "size must be multiple of tile_size" + + self.size = size self.tile_size = tile_size self.num_columns = num_aie_columns @@ -44,158 +46,80 @@ def __init__( self.eps = eps self.weighted = weighted - # Initializes weights to 1. 
Weights have size embedding dim, which is assumed to be tile size - self.weight = nn.Parameter(torch.ones(tile_size, dtype=torch.bfloat16)) - # Enforce ShimDMA limits for weighted RMS Norm (uses 2 inputs per core) # Maximum safe configuration: 8 columns × 2 channels = 16 ShimDMA channels total_shimdma_channels = self.num_columns * self.num_channels assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" - # Artifacts created by set_up_artifacts() - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + # Distinguish the weighted and unweighted variants so that configurations with identical dimensions do not collide on artifact names + prefix = "weighted_rms" if self.weighted else "rms" + return f"{prefix}_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - def set_up_artifacts(self): - # Compilation artifacts + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"weighted_rms_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", - import_path=operator_dir / "design_weighted.py", - callback_fn="my_weighted_rms_norm", - callback_args=[ + if self.weighted: + import_path = operator_dir / "design_weighted.py" + callback_fn = "my_weighted_rms_norm" + callback_args = [ self.context.device_manager.device_type, self.size, self.num_columns, self.num_channels, self.tile_size, 0, - ], + ] + else: + import_path = operator_dir / "design.py" + callback_fn = "my_rms_norm" + callback_args = [ + self.context.device_manager.device_type, + self.size, + self.num_columns, + self.num_channels, + 0, # trace_size + self.tile_size, + ] + + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", + import_path=import_path, + callback_fn=callback_fn, + callback_args=callback_args, + callback_kwargs={ + "kernel_archive": self.kernel_archive, + }, ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelArchiveArtifact.new( - f"rms_norm_archive.a", - depends=[ - KernelObjectArtifact.new( - f"rms_norm.o", - depends=[ - SourceArtifact.new( - self.context.base_dir - / "aie_kernels" - / "aie2p" - / "rms_norm.cc" - ) - ], - ), - KernelObjectArtifact.new( - "mul.o", - depends=[ - SourceArtifact.new( - self.context.base_dir - / "aie_kernels" - / "generic" - / "mul.cc" - ) - ], - ), + def get_kernel_artifacts(self): + artifacts = [ + KernelObjectArtifact( + "rms_norm.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "aie2p" / "rms_norm.cc" + ) + ], + ), + ] + if self.weighted: + artifacts.append( + KernelObjectArtifact( + "mul.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "generic" / "mul.cc" + ) ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def set_up_runtime(self): - # Runtime setup - static_weights = None - if self.weight is not None: - static_weights = torch_to_numpy(self.weight) - - self.add_buffer("input1", self.size) - self.add_buffer("input2", self.tile_size, static_data=static_weights) - self.add_buffer("output", self.size) - self.add_kernel( - "eltwise_mul", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("eltwise_mul", "input1", "input2", "output") - - def forward(self, x, 
y=None): - """Forward pass through RMS normalization""" - applicable = ( - len(x.shape) >= 1 and x.shape[-1] <= self.size and x.numel() <= self.size - ) - if not applicable: - raise AIEOperatorConstraintError("AIERMSNorm: incompatible tensor shape(s)") - - # Always flatten to [batch, orig_size] - original_shape = x.shape - batch = x.shape[0] if x.dim() > 1 else 1 - x_flat = x.reshape(batch, -1) - if y is not None: - y_flat = y.reshape(batch, -1) - else: - y_flat = None - - pad_len = self.size - x_flat.shape[1] - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - - out = self._execute_aie_operation(x_flat, y_flat) - - # Remove padding if added - numel = np.prod(original_shape) - if pad_len > 0: - out = out.reshape(-1)[..., :numel] - # Restore original shape - out = out.reshape(*original_shape) - - return out - - def _execute_aie_operation(self, x, y=None): - """Execute RMS normalization on AIE hardware""" - # x, y are [batch, size] - batch = x.shape[0] if x.dim() > 1 else 1 - - # Flatten inputs for AIE processing - x_flat = x.view(-1) - if y is not None: - y_flat = y.view(-1) - - # Verify size matches expected - if len(x_flat) != self.size: - raise AIEOperatorConstraintError( - f"Input size x={len(x_flat)} doesn't match configured size {self.size}" + ) ) - - self.write_buffer("input1", x_flat) - if y is not None: - self.write_buffer("input2", y_flat) - else: - assert ( - self.weight is not None - ), "Weights must be provided either as input or during initialization." - test_pattern = np.zeros(len(x_flat), dtype=bfloat16) - self.write_buffer("output", test_pattern) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=x_flat.shape, dtype=bfloat16) - - return result + return artifacts + + def get_arg_spec(self): + specs = [AIERuntimeArgSpec("in", (self.size // self.tile_size, self.tile_size))] + if self.weighted: + specs.append(AIERuntimeArgSpec("in", (self.tile_size,))) + specs.append( + AIERuntimeArgSpec("out", (self.size // self.tile_size, self.tile_size)) + ) + return specs diff --git a/iron/operators/rms_norm/test.py b/iron/operators/rms_norm/test.py index e6dd012d..7f736021 100755 --- a/iron/operators/rms_norm/test.py +++ b/iron/operators/rms_norm/test.py @@ -12,13 +12,12 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 num_channels = 2 - input_lengths = [2048] if not extensive else [1024, 4096, 8192] + input_lengths = [1024, 2048, 4096, 8192] params = [] - names = [] for weighted in [False, True]: for input_length in input_lengths: for num_aie_columns in range(1, max_aie_columns + 1): @@ -36,38 +35,21 @@ def generate_test_params(extensive=False): tile_size = 4096 check_length = tile_size * num_aie_columns if check_length == input_length: - if not weighted: - names.append( - f"rms_norm_{num_aie_columns}_cols_{num_channels_rms}_channels_{input_length}_tile_{tile_size}" - ) - else: - names.append( - f"weighted_rms_norm_{num_aie_columns}_cols_{num_channels_rms}_channels_{input_length}_weights_{tile_size}" - ) + is_regular = input_length == 2048 + marks = [] if is_regular else [pytest.mark.extensive] + params.append( - ( + pytest.param( input_length, num_aie_columns, num_channels_rms, tile_size, weighted, + marks=marks, ) ) - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive 
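# --- Usage note (annotation; relies only on standard pytest semantics) ------
# Because extensive configurations now carry pytest.mark.extensive inline, the
# regular and extended suites are selected at collection time instead of being
# assembled as two separate parameter lists:
#
#   pytest iron/operators/rms_norm/test.py -m "not extensive"  # quick set
#   pytest iron/operators/rms_norm/test.py -m extensive        # full sweep
# -----------------------------------------------------------------------------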
-all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -76,7 +58,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size,weighted", - all_params, + get_params(), ) def test_rms_norm( input_length, num_aie_columns, num_channels, tile_size, weighted, aie_context @@ -97,6 +79,7 @@ def test_rms_norm( input_buffers = {"input1": golden_ref["input"]} if weighted: operator.weight = golden_ref["weight"] + input_buffers["weight"] = golden_ref["weight"] output_buffers = {"output": golden_ref["output"]} errors, latency_us, bandwidth_gbps = run_test( diff --git a/iron/operators/rope/design.py b/iron/operators/rope/design.py index f1082bdd..f486071d 100644 --- a/iron/operators/rope/design.py +++ b/iron/operators/rope/design.py @@ -37,11 +37,17 @@ def rope( num_aie_columns=1, trace_size=0, method_type=None, + kernel_archive=None, + func_prefix="", ): dtype = bfloat16 if angle_rows is None: angle_rows = rows + if kernel_archive is None: + kernel_archive = ( + "rope" + (f"_{method_type}" if method_type is not None else "") + ".o" + ) assert cols % (16 * 2) == 0 and cols >= ( 16 * 2 @@ -73,8 +79,8 @@ def rope( # AIE Core Function declaration rope_kernel = Kernel( - "rope", - "rope" + (f"_{method_type}" if method_type is not None else "") + ".o", + f"{func_prefix}rope", + kernel_archive, [tensor_tile_ty, angle_tile_ty, tensor_tile_ty, np.int32], ) @@ -127,7 +133,7 @@ def core_body(of_in, of_lut, of_out, rope_kernel): # Runtime operations to move data to/from the AIE-array rt = Runtime() - with rt.sequence(tensor_ty, tensor_ty, tensor_ty) as (A, B, C): + with rt.sequence(tensor_ty, angle_ty, tensor_ty) as (A, B, C): rt.start(*my_workers) # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete. diff --git a/iron/operators/rope/op.py b/iron/operators/rope/op.py index be8e7f95..fa6f1e6a 100644 --- a/iron/operators/rope/op.py +++ b/iron/operators/rope/op.py @@ -1,38 +1,43 @@ # SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -import torch -import numpy as np -from ml_dtypes import bfloat16 from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, - KernelArchiveArtifact, SourceArtifact, PythonGeneratedMLIRArtifact, ) -class AIERope(AIEOperatorBase): +class AIERope(MLIROperator): def __init__( self, rows: int, cols: int, angle_rows=None, - num_aie_columns=None, + num_aie_columns=1, method_type=0, context=None, ): if angle_rows is None: angle_rows = rows - if num_aie_columns is None: - num_aie_columns = 1 + + assert cols % (16 * 2) == 0 and cols >= ( + 16 * 2 + ), "cols must be multiple of 32 and >= 32" + assert rows % num_aie_columns == 0, "rows must be divisible by num_aie_columns" + assert ( + angle_rows <= rows and rows % angle_rows == 0 + ), "angle_rows must divide rows" + assert ( + angle_rows >= num_aie_columns and angle_rows % num_aie_columns == 0 + ), "angle_rows must be divisible by num_aie_columns" self.rows = rows self.cols = cols @@ -41,19 +46,15 @@ def __init__( self.method_type = method_type assert method_type in {0, 1} - # Artifacts created by set_up_artifacts() - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"rope_{self.num_aie_columns}col_{self.rows}rows_{self.cols}cols_{self.angle_rows}arows_{self.method_type}m" - def set_up_artifacts(self): - # Compilation artifacts + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"rope_{self.num_aie_columns}c_{self.rows}rows_{self.cols}cols_{self.angle_rows}arows_{self.method_type}m" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="rope", callback_args=[ @@ -67,68 +68,42 @@ def set_up_artifacts(self): ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"rope_{self.method_type}.o", - depends=[ - SourceArtifact.new( - self.context.base_dir - / "aie_kernels" - / "generic" - / "rope.cc" - ) - ], - extra_flags=[ - "-DTWO_HALVES" if 0 == self.method_type else "-DINTERLEAVED" - ], + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"rope_{self.method_type}.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "generic" / "rope.cc" + ) + ], + extra_flags=[ + "-DTWO_HALVES" if 0 == self.method_type else "-DINTERLEAVED" + ], + ), + ] + + def get_arg_spec(self): + return [ + AIERuntimeArgSpec( + "in", + ( + self.rows, + self.cols, ), - ], - ) - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def set_up_runtime(self): - # Runtime setup - self.add_buffer("in", self.rows * self.cols) - self.add_buffer("angles", self.angle_rows * self.cols) - self.add_buffer("output", self.rows * self.cols) - self.add_kernel( - "rope", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("rope", "in", "angles", "output") - - def forward(self, tensor, angles): - applicable = ( - 
tensor.shape[-2] == self.rows - and tensor.shape[-1] == self.cols - and tensor.shape[-1] % 16 == 0 - and angles.shape[-2] == self.angle_rows - and angles.shape[-1] == self.cols - ) - if not applicable: - raise AIEOperatorConstraintError("AIERope: incompatible tensor shape(s)") - - # Write data to buffers - self.write_buffer("in", tensor) - self.write_buffer("angles", angles) - - # Execute kernel - self.run_runlist() - - # Read output - result = self.read_buffer_as_torch("output", shape=tensor.shape, dtype=bfloat16) - - return result + ), # input tensor + AIERuntimeArgSpec( + "in", + ( + self.angle_rows, + self.cols, + ), + ), # angles + AIERuntimeArgSpec( + "out", + ( + self.rows, + self.cols, + ), + ), # output + ] diff --git a/iron/operators/rope/test.py b/iron/operators/rope/test.py index 095a8cc3..6459e28a 100755 --- a/iron/operators/rope/test.py +++ b/iron/operators/rope/test.py @@ -12,55 +12,46 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): - params = [] - names = [] - +def get_params(): num_aie_columns_options = [1, 2, 8] - if not extensive: - input_rows = [32] - input_cols = [512] - input_angle_rows = [8, 32] - method_types = [0] # 0: Two-halves method - else: - input_rows = [32, 64] - input_cols = [128] - input_angle_rows = [8, 16, 32] - method_types = [0, 1] # 0: Two-halves method, 1: interleaved method + # Combine all options + input_rows = [32, 64] + input_cols = [128, 512] + input_angle_rows = [8, 16, 32] + method_types = [0, 1] # 0: Two-halves method, 1: interleaved method + params = [] for num_aie_columns in num_aie_columns_options: for n_rows in input_rows: for n_angle_rows in input_angle_rows: for n_cols in input_cols: for method_type in method_types: - names.append( - f"rope_{num_aie_columns}c_{n_rows}rows_{n_cols}cols_{n_angle_rows}arows_{method_type}m" + is_regular = ( + n_rows == 32 + and n_cols == 512 + and n_angle_rows in [8, 32] + and method_type == 0 ) + + is_extensive_valid = n_cols == 128 + + if not is_regular and not is_extensive_valid: + continue + + marks = [] if is_regular else [pytest.mark.extensive] + params.append( - ( + pytest.param( n_rows, n_cols, n_angle_rows, num_aie_columns, method_type, + marks=marks, ) ) - - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -69,7 +60,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "rows,cols,angle_rows,aie_columns,method_type", - all_params, + get_params(), ) def test_rope(rows, cols, angle_rows, aie_columns, method_type, aie_context): golden_ref = generate_golden_reference( @@ -97,12 +88,7 @@ def test_rope(rows, cols, angle_rows, aie_columns, method_type, aie_context): operator, input_buffers, output_buffers, rel_tol=0.05, abs_tol=0.5 ) - print(golden_ref["C"]) - print( - operator.read_buffer_as_torch("output", (rows // angle_rows, angle_rows, cols)) - ) - print(f"\nLatency (us): {latency_us:.1f}") print(f"Effective Bandwidth: {bandwidth_gbps:.6e} GB/s\n") - # assert not errors, f"Test failed with errors: {errors}" + assert not errors, f"Test failed with errors: {errors}" diff 
--git a/iron/operators/sigmoid/design.py b/iron/operators/sigmoid/design.py index 49d33502..927f9432 100644 --- a/iron/operators/sigmoid/design.py +++ b/iron/operators/sigmoid/design.py @@ -14,7 +14,9 @@ from aie.iron.controlflow import range_ -def my_sigmoid(dev, size, num_columns, num_channels, tile_size, trace_size): +def my_sigmoid( + dev, size, num_columns, num_channels, tile_size, trace_size, kernel_archive=None +): xfr_dtype = bfloat16 line_size = 4096 if tile_size > 4096 else tile_size line_type = np.ndarray[(line_size,), np.dtype[xfr_dtype]] diff --git a/iron/operators/sigmoid/op.py b/iron/operators/sigmoid/op.py index a24d051d..0135800b 100644 --- a/iron/operators/sigmoid/op.py +++ b/iron/operators/sigmoid/op.py @@ -7,8 +7,8 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -17,14 +17,17 @@ ) -class AIESigmoid(AIEOperatorBase): +class AIESigmoid(MLIROperator): """AIE-accelerated Sigmoid activation function""" def __init__(self, size, num_aie_columns, num_channels, tile_size, context=None): max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + assert ( + size % max_multiple == 0 + ), "size must be multiple of num_aie_columns * tile_size" + assert size % tile_size == 0, "size must be multiple of tile_size" + + self.size = size self.tile_size = tile_size self.num_columns = num_aie_columns @@ -33,17 +36,15 @@ def __init__(self, size, num_aie_columns, num_channels, tile_size, context=None) total_shimdma_channels = self.num_columns * self.num_channels assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"sigmoid_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - def set_up_artifacts(self): + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"sigmoid_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_sigmoid", callback_args=[ @@ -56,62 +57,20 @@ def set_up_artifacts(self): ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"sigmoid.o", - depends=[ - SourceArtifact.new( - self.context.base_dir - / "aie_kernels" - / "aie2p" - / "sigmoid.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - self.add_artifacts([xclbin_artifact, insts_artifact]) - - def set_up_runtime(self): - self.add_buffer("input", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "sigmoid", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("sigmoid", "input", "output") - - def forward(self, x): - if x.numel() > self.size: - raise AIEOperatorConstraintError( - "AIESigmoid: input too large for configured size" - ) - - original_shape = x.shape - x_flat = 
x.reshape(-1) - - pad_len = self.size - x_flat.numel() - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - - self.write_buffer("input", x_flat) - self.write_buffer("output", np.zeros(self.size, dtype=bfloat16)) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=(self.size,), dtype=bfloat16) - - if pad_len > 0: - result = result[: x_flat.numel() - pad_len] - - return result.reshape(*original_shape) + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"sigmoid.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "aie2p" / "sigmoid.cc" + ) + ], + ), + ] + + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.size,)), # input + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/sigmoid/test.py b/iron/operators/sigmoid/test.py index 1dc5b99d..a9b6b596 100755 --- a/iron/operators/sigmoid/test.py +++ b/iron/operators/sigmoid/test.py @@ -12,13 +12,12 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 num_channels = 1 # 1 channel for 1 input - input_lengths = [2048] if not extensive else [1024, 4096, 8192] + input_lengths = [1024, 2048, 4096, 8192] params = [] - names = [] for input_length in input_lengths: for num_aie_columns in range(1, max_aie_columns + 1): tile_size = input_length // num_aie_columns @@ -26,24 +25,19 @@ def generate_test_params(extensive=False): tile_size = 4096 check_length = tile_size * num_aie_columns if check_length == input_length: - names.append( - f"sigmoid_{num_aie_columns}_cols_{num_channels}_channels_{input_length}_tile_{tile_size}" + is_regular = input_length == 2048 + marks = [] if is_regular else [pytest.mark.extensive] + + params.append( + pytest.param( + input_length, + num_aie_columns, + num_channels, + tile_size, + marks=marks, + ) ) - params.append((input_length, num_aie_columns, num_channels, tile_size)) - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -52,7 +46,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size", - all_params, + get_params(), ) def test_sigmoid(input_length, num_aie_columns, num_channels, tile_size, aie_context): golden_ref = generate_golden_reference(input_length=input_length) diff --git a/iron/operators/silu/design.py b/iron/operators/silu/design.py index 5968943b..4c041afb 100644 --- a/iron/operators/silu/design.py +++ b/iron/operators/silu/design.py @@ -12,15 +12,19 @@ from aie.iron.device import Tile, NPU1, NPU2 from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ +from aie.helpers.util import np_ndarray_type_get_shape -def my_silu(dev, size, num_columns, num_channels, tile_size, trace_size): +def my_silu( + dev, size, num_columns, tile_size, trace_size, kernel_archive, func_prefix="" +): xfr_dtype = bfloat16 line_size = 4096 if tile_size > 4096 else tile_size line_type = np.ndarray[(line_size,), np.dtype[xfr_dtype]] transfer_type = 
np.ndarray[(size,), np.dtype[xfr_dtype]] - # Calculate number of iterations per core + # Calculate number of iterations per core (using 1 channel per column) + num_channels = 1 total_cores = num_columns * num_channels per_core_elements = size // total_cores N_div_n = per_core_elements // line_size @@ -42,8 +46,8 @@ def my_silu(dev, size, num_columns, num_channels, tile_size, trace_size): # External, binary kernel definition silu_fcn = Kernel( - "silu_bf16", - "silu.o", + f"{func_prefix}silu_bf16", + kernel_archive, [line_type, line_type, np.int32], ) @@ -152,11 +156,6 @@ def str_to_device(device: str): p.add_argument( "-co", "--columns", required=True, dest="cols", help="Number of columns" ) - # Number of channels is required to define the number of channels to be used - # It must be 1 or 2 - p.add_argument( - "-ch", "--channels", required=True, dest="chans", help="Number of channels" - ) # Tile size (elements per tile) - defaults to 1024 for backward compatibility p.add_argument( "-ts", @@ -189,11 +188,10 @@ def str_to_device(device: str): elif isinstance(dev, NPU2) and columns > 8: raise ValueError("[ERROR] NPU2 device cannot allocate more than 8 columns") - channels = int(opts.chans) - if channels < 1 or channels > 2: - raise ValueError("Number of channels must be 1 or 2") tile_size = int(opts.tile_size) - if ((length % tile_size) % columns % channels) != 0: + # Using 1 channel per column for SiLU; note the chained-modulo form of this + # check is vacuous (x % 1 == 0 always), so validate divisibility directly: + num_channels = 1 + if (length % (tile_size * columns * num_channels)) != 0: print( "transfer size (" + str(length) @@ -204,7 +202,7 @@ def str_to_device(device: str): raise ValueError trace_size = opts.trace_size - module = my_silu(dev, length, columns, channels, tile_size, trace_size) + module = my_silu(dev, length, columns, tile_size, trace_size, "silu.o") output_file_path = Path(opts.output_file_path) diff --git a/iron/operators/silu/op.py b/iron/operators/silu/op.py index 3583868c..1fe853f2 100644 --- a/iron/operators/silu/op.py +++ b/iron/operators/silu/op.py @@ -1,155 +1,68 @@ # SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -import torch -import numpy as np -from ml_dtypes import bfloat16 from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, - KernelArchiveArtifact, SourceArtifact, PythonGeneratedMLIRArtifact, ) -class AIESiLU(AIEOperatorBase): +class AIESiLU(MLIROperator): """AIE-accelerated SiLU activation function""" - def __init__(self, size, num_aie_columns, num_channels, tile_size, context=None): - max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + def __init__(self, size, tile_size, num_aie_columns=8, context=None): + assert ( + size % (num_aie_columns * tile_size) == 0 + ), "size must be multiple of num_aie_columns * tile_size" + self.size = size self.tile_size = tile_size - - self.num_columns = num_aie_columns - self.num_channels = num_channels + self.num_aie_columns = num_aie_columns # Enforce ShimDMA limits for SiLU (uses 1 input per core) - # Maximum safe configuration: 8 columns × 2 channels = 16 ShimDMA channels - total_shimdma_channels = self.num_columns * self.num_channels + # Maximum safe configuration: 8 columns × 1 channel = 8 ShimDMA channels + total_shimdma_channels = self.num_aie_columns * 1 assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" + MLIROperator.__init__(self, context=context) - # Artifacts created by set_up_artifacts() - self.xclbin_artifact = None - self.insts_artifact = None - - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"silu_{self.num_aie_columns}col_{self.size}_{self.tile_size}t" - def get_artifacts(self, prefix="silu_"): + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"{prefix}{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_silu", callback_args=[ self.context.device_manager.device_type, self.size, - self.num_columns, - self.num_channels, + self.num_aie_columns, self.tile_size, 0, ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"silu.o", - depends=[ - SourceArtifact.new( - self.context.base_dir / "aie_kernels" / "aie2p" / "silu.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - return xclbin_artifact, insts_artifact - - def set_up_artifacts(self): - # If this operator is only used as a sub-operator in another operator that sets it up, we should skip the setup here as those artifacts and buffers may not be needed. - # Compilation artifacts - xclbin_artifact, insts_artifact = self.get_artifacts() - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def set_up_runtime(self): - # If this operator is only used as a sub-operator in another operator that sets it up, we should skip the setup here as those artifacts and buffers may not be needed. 
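# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of the patch): the change
# above moves AIESiLU from the imperative AIEOperatorBase flow
# (set_up_artifacts / set_up_runtime / forward) to the declarative MLIROperator
# contract, where a subclass only declares what to build; artifact and runtime
# wiring is assumed to live in the base class. A minimal hypothetical operator
# using only the four hooks that appear in this diff:
from pathlib import Path

from iron.common import (
    MLIROperator,
    AIERuntimeArgSpec,
    KernelObjectArtifact,
    SourceArtifact,
    PythonGeneratedMLIRArtifact,
)


class AIEIdentity(MLIROperator):  # hypothetical example operator
    def __init__(self, size, context=None):
        self.size = size
        MLIROperator.__init__(self, context=context)

    def get_operator_name(self):
        # Unique base name used for the generated .mlir/.xclbin/.bin files.
        return f"identity_{self.size}"

    def get_mlir_artifact(self):
        # Design callback that emits the MLIR module for this configuration.
        return PythonGeneratedMLIRArtifact(
            f"{self.get_operator_name()}.mlir",
            import_path=Path(__file__).parent / "design.py",
            callback_fn="my_identity",  # hypothetical design entry point
            callback_args=[self.context.device_manager.device_type, self.size],
        )

    def get_kernel_artifacts(self):
        # Compute-tile kernel objects compiled from C++ sources.
        return [
            KernelObjectArtifact(
                "identity.o",
                dependencies=[
                    SourceArtifact(
                        self.context.base_dir / "aie_kernels" / "aie2p" / "identity.cc"
                    )
                ],
            )
        ]

    def get_arg_spec(self):
        # Host-visible buffers, in kernel-argument order.
        return [
            AIERuntimeArgSpec("in", (self.size,)),
            AIERuntimeArgSpec("out", (self.size,)),
        ]
# ---------------------------------------------------------------------------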
+ def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"silu.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "aie2p" / "silu.cc" + ) + ], + ), + ] + + def get_arg_spec(self): # Runtime setup - self.add_buffer("input", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "silu", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("silu", "input", "output") - - def forward(self, x): - """Forward pass for SiLU activation""" - applicable = ( - len(x.shape) >= 1 and x.shape[-1] <= self.size and x.numel() <= self.size - ) - if not applicable: - raise AIEOperatorConstraintError("AIESiLU: incompatible tensor shape(s)") - - # Always flatten to [batch, orig_size] - original_shape = x.shape - batch = x.shape[0] if x.dim() > 1 else 1 - x_flat = x.reshape(batch, -1) - - pad_len = self.size - x_flat.shape[1] - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - - out = self._execute_aie_operation(x_flat) - - # Remove padding if added - numel = np.prod(original_shape) - if pad_len > 0: - out = out.reshape(-1)[..., :numel] - # Restore original shape - out = out.reshape(*original_shape) - - return out - - def _execute_aie_operation(self, x, y=None): - """Execute SiLU operation on AIE hardware""" - # x is [batch, size] - batch = x.shape[0] if x.dim() > 1 else 1 - - # Flatten inputs for AIE processing - x_flat = x.view(-1) - - # Verify size matches expected - if len(x_flat) != self.size: - raise AIEOperatorConstraintError( - f"Input size x={len(x_flat)} doesn't match configured size {self.size}" - ) - - self.write_buffer("input", x_flat) - test_pattern = np.zeros(len(x_flat), dtype=bfloat16) - self.write_buffer("output", test_pattern) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=x_flat.shape, dtype=bfloat16) - - return result + return [ + AIERuntimeArgSpec("in", (self.size,)), # input + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/silu/test.py b/iron/operators/silu/test.py index 4dc52ba0..267f7669 100755 --- a/iron/operators/silu/test.py +++ b/iron/operators/silu/test.py @@ -12,13 +12,12 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 num_channels = 1 # 1 channel for 1 input - input_lengths = [2048] if not extensive else [1024, 4096, 8192] + input_lengths = [1024, 2048, 4096, 8192] params = [] - names = [] for input_length in input_lengths: for num_aie_columns in range(1, max_aie_columns + 1): tile_size = input_length // num_aie_columns @@ -26,24 +25,19 @@ def generate_test_params(extensive=False): tile_size = 4096 check_length = tile_size * num_aie_columns if check_length == input_length: - names.append( - f"silu_{num_aie_columns}_cols_{num_channels}_channels_{input_length}_tile_{tile_size}" + is_regular = input_length == 2048 + marks = [] if is_regular else [pytest.mark.extensive] + + params.append( + pytest.param( + input_length, + num_aie_columns, + num_channels, + tile_size, + marks=marks, + ) ) - params.append((input_length, num_aie_columns, num_channels, tile_size)) - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, 
regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -52,7 +46,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size", - all_params, + get_params(), ) def test_silu(input_length, num_aie_columns, num_channels, tile_size, aie_context): golden_ref = generate_golden_reference(input_length=input_length) @@ -60,7 +54,6 @@ def test_silu(input_length, num_aie_columns, num_channels, tile_size, aie_contex operator = AIESiLU( size=input_length, num_aie_columns=num_aie_columns, - num_channels=num_channels, tile_size=tile_size, context=aie_context, ) diff --git a/iron/operators/softmax/design.py b/iron/operators/softmax/design.py index 981312be..567dbbc6 100644 --- a/iron/operators/softmax/design.py +++ b/iron/operators/softmax/design.py @@ -7,7 +7,15 @@ import argparse import sys -from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker +from aie.iron import ( + Kernel, + ObjectFifo, + Program, + Runtime, + Worker, + Buffer, + WorkerRuntimeBarrier, +) from aie.iron.placers import SequentialPlacer from aie.iron.device import NPU1, NPU2 from aie.helpers.taplib.tap import TensorAccessPattern @@ -15,15 +23,28 @@ from ml_dtypes import bfloat16 -def softmax(dev, num_elements, num_columns, num_channels, trace_size, tile_size): +def softmax( + dev, + num_elements, + num_aie_columns, + num_channels, + trace_size, + tile_size, + rtp_vector_size=None, + mask_patch_value=0, + kernel_archive="softmax.a", + func_prefix="", +): per_tile_elements = tile_size - n = per_tile_elements * num_columns + if rtp_vector_size is None: + rtp_vector_size = per_tile_elements + n = per_tile_elements * num_aie_columns if num_elements % n != 0: raise ValueError( f"Number of elements ({num_elements}) must be a multiple of {n}." 
) N_div_n = num_elements // n - chunk = num_elements // num_columns // num_channels # For offset calculation + chunk = num_elements // num_aie_columns // num_channels # For offset calculation dtype = bfloat16 # Define tensor types @@ -33,28 +54,52 @@ def softmax(dev, num_elements, num_columns, num_channels, trace_size, tile_size) # AIE-array data movement with object fifos of_in1s = [ ObjectFifo(tile_ty, name=f"in1_{i}_{j}") - for i in range(num_columns) + for i in range(num_aie_columns) for j in range(num_channels) ] of_outs = [ ObjectFifo(tile_ty, name=f"out_{i}_{j}") - for i in range(num_columns) + for i in range(num_aie_columns) for j in range(num_channels) ] # AIE Core Function declaration - softmax_kernel = Kernel("softmax_bf16", "softmax.o", [tile_ty, tile_ty, np.int32]) + softmax_kernel = Kernel( + f"{func_prefix}softmax_bf16", kernel_archive, [tile_ty, tile_ty, np.int32] + ) + mask_kernel = Kernel( + f"{func_prefix}mask_bf16", kernel_archive, [tile_ty, np.int32, np.int32] + ) # Define a task that will run on a compute tile - def core_body(of_in1, of_out, softmax_kernel): + def core_body(of_in1, of_out, softmax_kernel, mask_kernel, rtp, barrier): # Number of sub-vector "tile" iterations + barrier.wait_for_value(1) + vector_size = rtp[0] for _ in range_(N_div_n): elem_in1 = of_in1.acquire(1) elem_out = of_out.acquire(1) + mask_kernel(elem_in1, vector_size, per_tile_elements) softmax_kernel(elem_in1, elem_out, per_tile_elements) of_in1.release(1) of_out.release(1) + rtps = [ + Buffer( + np.ndarray[(1,), np.dtype[np.int32]], + name=f"rtp_{i}_{j}", + use_write_rtp=True, + ) + for i in range(num_aie_columns) + for j in range(num_channels) + ] + + barriers = [ + WorkerRuntimeBarrier() + for i in range(num_aie_columns) + for j in range(num_channels) + ] + # Create a worker to run the task on a compute tile my_workers = [ Worker( @@ -63,9 +108,12 @@ def core_body(of_in1, of_out, softmax_kernel): of_in1s[i * num_channels + j].cons(), of_outs[i * num_channels + j].prod(), softmax_kernel, + mask_kernel, + rtps[i * num_channels + j], + barriers[i * num_channels + j], ], ) - for i in range(num_columns) + for i in range(num_aie_columns) for j in range(num_channels) ] @@ -81,7 +129,7 @@ def core_body(of_in1, of_out, softmax_kernel): [1, 1, 1, chunk], [0, 0, 0, 1], ) - for i in range(num_columns) + for i in range(num_aie_columns) for j in range(num_channels) ] @@ -90,11 +138,21 @@ def core_body(of_in1, of_out, softmax_kernel): with rt.sequence(tensor_ty, tensor_ty) as (A, C): rt.start(*my_workers) + # Set run-time parameter for actual vector size (remainder is considered padding and ignored by the computation) + def set_rtps(*args): + for rtp in args: + rtp[0] = rtp_vector_size if not mask_patch_value else mask_patch_value + + rt.inline_ops(set_rtps, rtps) + + for i in range(num_aie_columns * num_channels): + rt.set_barrier(barriers[i], 1) + # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete. 
tg = rt.task_group() # Fill the input objectFIFOs with data - for i in range(num_columns): + for i in range(num_aie_columns): for j in range(num_channels): rt.fill( of_in1s[i * num_channels + j].prod(), @@ -103,7 +161,7 @@ def core_body(of_in1, of_out, softmax_kernel): task_group=tg, ) # Drain the output objectFIFOs with data - for i in range(num_columns): + for i in range(num_aie_columns): for j in range(num_channels): rt.drain( of_outs[i * num_channels + j].cons(), diff --git a/iron/operators/softmax/op.py b/iron/operators/softmax/op.py index 106f0415..2beb0627 100644 --- a/iron/operators/softmax/op.py +++ b/iron/operators/softmax/op.py @@ -2,133 +2,86 @@ # SPDX-License-Identifier: Apache-2.0 import torch -import numpy as np -from ml_dtypes import bfloat16 from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, - KernelArchiveArtifact, SourceArtifact, PythonGeneratedMLIRArtifact, ) -class AIESoftmax(AIEOperatorBase): +class AIESoftmax(MLIROperator): + """AIE-accelerated Softmax operation""" def __init__( - self, rows: int, cols: int, num_aie_columns=1, num_channels=1, context=None + self, + rows: int, + cols: int, + num_aie_columns=1, + num_channels=1, + rtp_vector_size=None, + mask_patch_value=0, + context=None, ): - self.size = rows * cols + assert rows % 16 == 0, "rows must be multiple of 16" + assert cols % 16 == 0, "cols must be multiple of 16" + assert (rows * cols) % ( + num_aie_columns * cols + ) == 0, "size must be multiple of num_aie_columns * tile_size" + self.rows = rows self.cols = cols - + self.size = rows * cols + self.num_aie_columns = num_aie_columns self.num_channels = num_channels - self.num_columns = num_aie_columns + self.rtp_vector_size = rtp_vector_size + self.mask_patch_value = mask_patch_value - # Artifacts created by set_up_artifacts() - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + name = f"softmax_{self.num_aie_columns}col_{self.num_channels}ch_{self.size}_{self.cols}t" + if self.rtp_vector_size is not None: + name += f"_{self.rtp_vector_size}rtp" + return name - def set_up_artifacts(self): - # Compilation artifacts + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"softmax_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.cols}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="softmax", callback_args=[ self.context.device_manager.device_type, - self.rows * self.cols, - self.num_columns, + self.size, + self.num_aie_columns, self.num_channels, - 0, + 0, # trace_size self.cols, + self.rtp_vector_size, + self.mask_patch_value, ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"softmax.o", - depends=[ - SourceArtifact.new( - self.context.base_dir - / "aie_kernels" - / "aie2p" - / "softmax.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"gemm_{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - - artifacts = [xclbin_artifact, insts_artifact] - self.add_artifacts(artifacts) - - def 
set_up_runtime(self): - # Runlist setup - self.add_buffer("in", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "softmax", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("softmax", "in", "output") - - def forward(self, x): - applicable = ( - x.shape[-1] * x.shape[-2] == self.size - and x.shape[-1] == self.cols - and x.shape[-1] % 16 == 0 - and x.shape[-2] % 16 == 0 - ) - if not applicable: - raise AIEOperatorConstraintError("AIESoftmax: incompatible tensor shape(s)") - - return self._execute_aie_operation(x) - - def _execute_aie_operation(self, x): - original_shape = x.shape - - # Reshape for processing - # Split x into a list of H tensors of size [S_q, S_kv] - heads = x.shape[1] - x_list = [x[0, h, :, :] for h in range(heads)] - results = [] - for i in range(heads): - x_iter = x_list[i] - input_size = x_iter.nbytes - self.write_buffer("in", x_iter) - test_pattern = np.zeros(len(x_iter), dtype=bfloat16) - self.write_buffer("output", test_pattern) - self.run_runlist() - result = self.read_buffer_as_torch( - "output", shape=x_list[i].shape, dtype=bfloat16 - ) - results.append(result) - - result = torch.stack(results, dim=0).unsqueeze( - 0 - ) # Shape: (1, heads, S_q, S_kv) - - return result + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"softmax.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "aie2p" / "softmax.cc" + ) + ], + ), + ] + + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.size,)), + AIERuntimeArgSpec("out", (self.size,)), + ] diff --git a/iron/operators/softmax/test.py b/iron/operators/softmax/test.py index 1ad613d9..a75c37af 100755 --- a/iron/operators/softmax/test.py +++ b/iron/operators/softmax/test.py @@ -30,37 +30,24 @@ def get_optimal_columns_channels(input_length, tile_size): return 2, 2 # Default fallback -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 num_channels = 2 - input_lengths = [4096] if not extensive else [] + input_lengths = [32768] tile_sizes = [1024, 512, 2048] params = [] - names = [] for input_length in input_lengths: for tile_size in tile_sizes: optimal_columns, optimal_channels = get_optimal_columns_channels( input_length, tile_size ) - names.append( - f"softmax_{optimal_columns}_cols_{optimal_channels}_channels_{input_length}_tile_{tile_size}" - ) - params.append((input_length, optimal_columns, optimal_channels, tile_size)) - return params, names - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + # All tests are regular as extensive list was empty in original code + params.append( + pytest.param(input_length, optimal_columns, optimal_channels, tile_size) + ) + return params @pytest.mark.metrics( @@ -69,7 +56,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size", - all_params, + get_params(), ) def test_softmax(input_length, num_aie_columns, num_channels, tile_size, aie_context): diff --git a/iron/operators/swiglu_decode/op.py b/iron/operators/swiglu_decode/op.py index 
869493c9..0fb0969b 100644 --- a/iron/operators/swiglu_decode/op.py +++ b/iron/operators/swiglu_decode/op.py @@ -6,8 +6,11 @@ import numpy as np from ml_dtypes import bfloat16 +from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor +from aie.utils.npukernel import NPUKernel from iron.common import ( - AIEOperatorBase, + CompositeOperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -18,10 +21,92 @@ from iron.operators.gemv.op import AIEGEMV from iron.operators.silu.op import AIESiLU from iron.operators.elementwise_mul.op import AIEElementwiseMul -from iron.common.utils import torch_to_numpy -class AIESwiGLUDecode(AIEOperatorBase): +class SwiGLUDecodeCallable: + def __init__(self, op): + self.op = op + # Create callables for sub-operators + # We need to manually construct NPUKernel because sub-operators weren't "compiled" in the standard way + + # Helper to create callable from operator and artifacts + def create_callable(sub_op, xclbin_path, kernel_name, insts_artifact): + return NPUKernel( + xclbin_path=xclbin_path, + kernel_name=kernel_name, + insts_path=insts_artifact.filename, + ) + + self.gemv_1_callable = create_callable( + op.gemv_1, + op.combined_xclbin.filename, + op.gemv_1_xclbin.kernel_name, + op.gemv_1_insts, + ) + self.silu_callable = create_callable( + op.silu, + op.combined_xclbin.filename, + op.silu_xclbin.kernel_name, + op.silu_insts, + ) + self.eltwise_mul_callable = create_callable( + op.eltwise_mul, + op.combined_xclbin.filename, + op.eltwise_mul_xclbin.kernel_name, + op.eltwise_mul_insts, + ) + self.gemv_2_callable = create_callable( + op.gemv_2, + op.combined_xclbin.filename, + op.gemv_2_xclbin.kernel_name, + op.gemv_2_insts, + ) + + # Allocate and upload weights + self.weights_1 = XRTTensor.from_torch(op.weights_1) + self.weights_2 = XRTTensor.from_torch(op.weights_2) + self.weights_3 = XRTTensor.from_torch(op.weights_3) + + # Allocate intermediate buffers + # left: output of gemv_1 (hidden_dim_padded) + self.left = XRTTensor((op.hidden_dim_padded,), dtype=bfloat16) + # right: output of gemv_1 (hidden_dim_padded) + self.right = XRTTensor((op.hidden_dim_padded,), dtype=bfloat16) + # left_swished: output of silu (hidden_dim_padded) + self.left_swished = XRTTensor((op.hidden_dim_padded,), dtype=bfloat16) + # intermediate: output of eltwise_mul (hidden_dim_padded) + self.intermediate = XRTTensor((op.hidden_dim_padded,), dtype=bfloat16) + + def __call__(self, input_buf, output_buf): + # Ensure inputs are on device + input_buf.to("npu") + output_buf.to("npu") + self.weights_1.to("npu") + self.weights_2.to("npu") + self.weights_3.to("npu") + self.left.to("npu") + self.right.to("npu") + self.left_swished.to("npu") + self.intermediate.to("npu") + + # Sequence: + # 1. GEMV(weights_1, input, left) + self.gemv_1_callable(self.weights_1, input_buf, self.left) + + # 2. GEMV(weights_2, input, right) + self.gemv_1_callable(self.weights_2, input_buf, self.right) + + # 3. SiLU(left, left_swished) + self.silu_callable(self.left, self.left_swished) + + # 4. EltwiseMul(left_swished, right, intermediate) + self.eltwise_mul_callable(self.left_swished, self.right, self.intermediate) + + # 5. 
GEMV(weights_3, intermediate, output) + self.gemv_2_callable(self.weights_3, self.intermediate, output_buf) + + +class AIESwiGLUDecode(CompositeOperator): def __init__(self, embedding_dim, hidden_dim, prio_accuracy=False, context=None): self.hidden_dim = hidden_dim @@ -57,9 +142,7 @@ def set_up_artifacts(self): tile_size_output=self.hidden_dim // 8, ) self.gemv_1 = gemv_1 - gemv_1_xclbin, gemv_1_insts = gemv_1.get_artifacts( - prefix="swiglu_decode_gemv_1_" - ) + gemv_1_xclbin, gemv_1_insts = gemv_1.get_artifacts(prefix="swiglu_gemv_1_") gemv_1_xclbin.extra_flags += [ "--xclbin-instance-name=swiglu_gemv_1", "--xclbin-kernel-id=0x901", @@ -72,31 +155,29 @@ def set_up_artifacts(self): silu = AIESiLU( size=self.hidden_dim, num_aie_columns=8, - num_channels=2, tile_size=self.hidden_dim // 16, ) self.silu = silu self.hidden_dim_padded = silu.size - silu_xclbin, silu_insts = silu.get_artifacts(prefix="swiglu_decode_silu_") + silu_xclbin, silu_insts = silu.get_artifacts(prefix="swiglu_silu_") silu_xclbin.xclbin_input = gemv_1_xclbin silu_xclbin.extra_flags += [ "--xclbin-instance-name=swiglu_silu", "--xclbin-kernel-id=0x902", ] silu_xclbin.kernel_name = "swiglu_silu" - silu_xclbin.depends += [gemv_1_xclbin] + silu_xclbin.dependencies.add(gemv_1_xclbin) artifacts.append(silu_insts) eltwise_mul = AIEElementwiseMul( size=self.hidden_dim, num_aie_columns=8, - num_channels=2, tile_size=self.hidden_dim // 8, ) self.eltwise_mul = eltwise_mul assert self.hidden_dim <= eltwise_mul.size <= self.hidden_dim_padded eltwise_mul_xclbin, eltwise_mul_insts = eltwise_mul.get_artifacts( - prefix="swiglu_decode_eltwise_mul_" + prefix="swiglu_eltwise_mul_" ) eltwise_mul_xclbin.xclbin_input = silu_xclbin eltwise_mul_xclbin.extra_flags += [ @@ -104,7 +185,7 @@ def set_up_artifacts(self): "--xclbin-kernel-id=0x903", ] eltwise_mul_xclbin.kernel_name = "swiglu_eltwise_mul" - eltwise_mul_xclbin.depends += [silu_xclbin] + eltwise_mul_xclbin.dependencies.add(silu_xclbin) artifacts.append(eltwise_mul_insts) gemv_2 = AIEGEMV( @@ -115,16 +196,14 @@ def set_up_artifacts(self): tile_size_output=self.embedding_dim // 8, ) self.gemv_2 = gemv_2 - gemv_2_xclbin, gemv_2_insts = gemv_2.get_artifacts( - prefix="swiglu_decode_gemv_2_" - ) + gemv_2_xclbin, gemv_2_insts = gemv_2.get_artifacts(prefix="swiglu_gemv_2_") gemv_2_xclbin.xclbin_input = eltwise_mul_xclbin gemv_2_xclbin.extra_flags += [ "--xclbin-instance-name=swiglu_gemv_2", "--xclbin-kernel-id=0x904", ] gemv_2_xclbin.kernel_name = "swiglu_gemv_2" - gemv_2_xclbin.depends += [eltwise_mul_xclbin] + gemv_2_xclbin.dependencies.add(eltwise_mul_xclbin) artifacts.append(gemv_2_xclbin) artifacts.append(gemv_2_insts) @@ -140,69 +219,11 @@ def set_up_artifacts(self): self.add_artifacts(artifacts) - def set_up_runtime(self): - self.add_buffer("input", self.embedding_dim) - self.add_buffer( - "weights_1", - self.embedding_dim * self.hidden_dim_padded, - static_data=torch_to_numpy(self.weights_1), - ) - self.add_buffer( - "weights_2", - self.embedding_dim * self.hidden_dim_padded, - static_data=torch_to_numpy(self.weights_2), - ) - self.add_buffer( - "weights_3", - self.hidden_dim_padded * self.embedding_dim, - static_data=torch_to_numpy(self.weights_3), - ) - self.add_buffer("left", self.hidden_dim_padded) - self.add_buffer("left_swished", self.hidden_dim_padded) - self.add_buffer("right", self.hidden_dim_padded) - self.add_buffer("intermediate", self.hidden_dim_padded) - self.add_buffer("output", self.embedding_dim) - self.add_kernel( - "swiglu_gemv_1", - self.combined_xclbin, - 
self.gemv_1_xclbin.kernel_name, - self.gemv_1_insts, - ) - self.add_kernel( - "swiglu_silu", - self.combined_xclbin, - self.silu_xclbin.kernel_name, - self.silu_insts, - ) - self.add_kernel( - "swiglu_eltwise_mul", - self.combined_xclbin, - self.eltwise_mul_xclbin.kernel_name, - self.eltwise_mul_insts, - ) - self.add_kernel( - "swiglu_gemv_2", - self.combined_xclbin, - self.gemv_2_xclbin.kernel_name, - self.gemv_2_insts, - ) - self.add_to_runlist("swiglu_gemv_1", "weights_1", "input", "left") - self.add_to_runlist("swiglu_gemv_1", "weights_2", "input", "right") - self.add_to_runlist("swiglu_silu", "left", "left_swished") - self.add_to_runlist( - "swiglu_eltwise_mul", "left_swished", "right", "intermediate" - ) - self.add_to_runlist("swiglu_gemv_2", "weights_3", "intermediate", "output") - - def forward(self, x): - x_flat = x.reshape(x.shape[-1]) - assert x_flat.shape[0] == self.embedding_dim - - self.write_buffer("input", x_flat) - self.run_runlist() - result = self.read_buffer_as_torch( - "output", - (self.embedding_dim,), - ).view_as(x) + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.embedding_dim,)), + AIERuntimeArgSpec("out", (self.embedding_dim,)), + ] - return result + def get_callable(self): + return SwiGLUDecodeCallable(self) diff --git a/iron/operators/swiglu_decode/test.py b/iron/operators/swiglu_decode/test.py index 11b35fa2..606c082f 100755 --- a/iron/operators/swiglu_decode/test.py +++ b/iron/operators/swiglu_decode/test.py @@ -7,35 +7,28 @@ from pathlib import Path +from ml_dtypes import bfloat16 +from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor +from iron.common.utils import xrt_to_torch from iron.operators.swiglu_decode.op import AIESwiGLUDecode from iron.operators.swiglu_decode.reference import generate_golden_reference -from iron.common.test_utils import run_test, verify_buffer +from iron.common.test_utils import verify_buffer -def generate_test_params(extensive=False): - params = [(2048, 2048)] - names = [f"swiglu_decode_1x{emb}x{hid}" for emb, hid in params] - return params, names +def get_params(): + params_list = [(2048, 2048)] - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + params = [] + for p in params_list: + params.append(pytest.param(*p)) + return params @pytest.mark.metrics( Latency=r"Latency \(us\): (?P[\d\.]+)", Bandwidth=r"Effective Bandwidth: (?P[\d\.e\+-]+) GB/s", ) -@pytest.mark.parametrize("embedding_dim,hidden_dim", all_params) +@pytest.mark.parametrize("embedding_dim,hidden_dim", get_params()) def test_swiglu_decode(embedding_dim, hidden_dim, aie_context): golden_ref = generate_golden_reference(M=1, K=embedding_dim, N=hidden_dim) @@ -46,39 +39,32 @@ def test_swiglu_decode(embedding_dim, hidden_dim, aie_context): operator.weights_2 = golden_ref["w_up"].T operator.weights_3 = golden_ref["w_down"].T - # In the following, some buffers are commented out. - # Because this operator calls multiple kernels in sequence, rounding errors due to the smaller bf16 data type accumulate, which can cause it to fail verification. 
- # So, instead of verifying the final output buffers against the float32-calculated reference, we calculate another reference for the final output: - # This reference is based on the previous intermediate result read back from the AIE operator, "resetting" the accumulated error to zero. - # Note that the previous intermediate result _is_ still verified up to the given tolerance. + operator.compile() + op_func = operator.get_callable() + + input_buf = XRTTensor.from_torch(golden_ref["input"]) + output_buf = XRTTensor((1, embedding_dim), dtype=bfloat16) - input_buffers = {"input": golden_ref["input"]} - output_buffers = {} - intermediate_buffers = { - "left": golden_ref["left"], - "left_swished": golden_ref["left_swished"], - "right": golden_ref["right"], - "intermediate": golden_ref["intermediate"], - } + op_func(input_buf, output_buf) - errors, latency_us, bandwidth_gbps = run_test( - operator, - input_buffers, - output_buffers, - intermediate_buffers, + errors = {} + # Verify intermediate result + intermediate = xrt_to_torch(op_func.intermediate).reshape((1, hidden_dim)) + errors_intermediate = verify_buffer( + intermediate, + "intermediate", + golden_ref["intermediate"], rel_tol=0.07, abs_tol=0.7, ) - - ref_2 = ( - operator.read_buffer_as_torch("intermediate", (1, hidden_dim)) - @ golden_ref["w_down"] - ) - errors_2 = verify_buffer(operator, "output", ref_2, rel_tol=0.04, abs_tol=0.4) - if errors_2: - errors["output"] = errors_2 - - print(f"\nLatency (us): {latency_us:.1f}") - print(f"Effective Bandwidth: {bandwidth_gbps:.6e} GB/s\n") + if errors_intermediate: + errors["intermediate"] = errors_intermediate + + # Verify output using intermediate result + ref_2 = intermediate @ golden_ref["w_down"] + output = xrt_to_torch(output_buf).reshape((1, embedding_dim)) + errors_output = verify_buffer(output, "output", ref_2, rel_tol=0.04, abs_tol=0.4) + if errors_output: + errors["output"] = errors_output assert not errors, f"Test failed with errors: {errors}" diff --git a/iron/operators/swiglu_prefill/op.py b/iron/operators/swiglu_prefill/op.py index 2b2aa341..9a19bb20 100644 --- a/iron/operators/swiglu_prefill/op.py +++ b/iron/operators/swiglu_prefill/op.py @@ -6,8 +6,11 @@ import numpy as np from ml_dtypes import bfloat16 +from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor +from aie.utils.npukernel import NPUKernel from iron.common import ( - AIEOperatorBase, + CompositeOperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -18,10 +21,88 @@ from iron.operators.gemm.op import AIEGEMM from iron.operators.silu.op import AIESiLU from iron.operators.elementwise_mul.op import AIEElementwiseMul -from iron.common.utils import torch_to_numpy -class AIESwiGLUPrefill(AIEOperatorBase): +class SwiGLUPrefillCallable: + def __init__(self, op): + self.op = op + + def create_callable(sub_op, xclbin_path, kernel_name, insts_artifact): + return NPUKernel( + xclbin_path=xclbin_path, + kernel_name=kernel_name, + insts_path=insts_artifact.filename, + ) + + self.gemm_1_callable = create_callable( + op.gemm_1, + op.combined_xclbin.filename, + op.gemm_1_xclbin.kernel_name, + op.gemm_1_insts, + ) + self.silu_callable = create_callable( + op.silu, + op.combined_xclbin.filename, + op.silu_xclbin.kernel_name, + op.silu_insts, + ) + self.eltwise_mul_callable = create_callable( + op.eltwise_mul, + op.combined_xclbin.filename, + op.eltwise_mul_xclbin.kernel_name, + op.eltwise_mul_insts, + ) + self.gemm_2_callable = create_callable( + op.gemm_2, + op.combined_xclbin.filename, + 
op.gemm_2_xclbin.kernel_name, + op.gemm_2_insts, + ) + + # Allocate and upload weights + self.weights_1 = XRTTensor.from_torch(op.weights_1.T) + self.weights_2 = XRTTensor.from_torch(op.weights_2.T) + self.weights_3 = XRTTensor.from_torch(op.weights_3.T) + + # Allocate intermediate buffers + # Sizes are padded + size_hidden = op.seq_len_padded * op.hidden_dim_padded + self.left = XRTTensor((size_hidden,), dtype=bfloat16) + self.right = XRTTensor((size_hidden,), dtype=bfloat16) + self.left_swished = XRTTensor((size_hidden,), dtype=bfloat16) + self.intermediate = XRTTensor((size_hidden,), dtype=bfloat16) + self.last_output_buf = None + + def __call__(self, input_buf, output_buf): + self.last_output_buf = output_buf + input_buf.to("npu") + output_buf.to("npu") + self.weights_1.to("npu") + self.weights_2.to("npu") + self.weights_3.to("npu") + self.left.to("npu") + self.right.to("npu") + self.left_swished.to("npu") + self.intermediate.to("npu") + + # Sequence: + # 1. GEMM(input, weights_1, left) + self.gemm_1_callable(input_buf, self.weights_1, self.left) + + # 2. GEMM(input, weights_2, right) + self.gemm_1_callable(input_buf, self.weights_2, self.right) + + # 3. SiLU(left, left_swished) + self.silu_callable(self.left, self.left_swished) + + # 4. EltwiseMul(left_swished, right, intermediate) + self.eltwise_mul_callable(self.left_swished, self.right, self.intermediate) + + # 5. GEMM(intermediate, weights_3, output) + self.gemm_2_callable(self.intermediate, self.weights_3, output_buf) + + +class AIESwiGLUPrefill(CompositeOperator): def __init__( self, seq_len, embedding_dim, hidden_dim, prio_accuracy=False, context=None @@ -85,7 +166,6 @@ def set_up_artifacts(self): silu = AIESiLU( size=self.seq_len_padded * self.hidden_dim_padded, num_aie_columns=8, - num_channels=2, tile_size=self.hidden_dim_padded // 8, ) self.silu = silu @@ -98,13 +178,12 @@ def set_up_artifacts(self): "--xclbin-kernel-id=0x902", ] silu_xclbin.kernel_name = "swiglu_silu" - silu_xclbin.depends += [gemm_1_xclbin] + silu_xclbin.dependencies.add(gemm_1_xclbin) artifacts.append(silu_insts) eltwise_mul = AIEElementwiseMul( size=self.seq_len_padded * self.hidden_dim_padded, num_aie_columns=8, - num_channels=2, tile_size=self.hidden_dim_padded // 8, ) self.eltwise_mul = eltwise_mul @@ -119,7 +198,7 @@ def set_up_artifacts(self): "--xclbin-kernel-id=0x903", ] eltwise_mul_xclbin.kernel_name = "swiglu_eltwise_mul" - eltwise_mul_xclbin.depends += [silu_xclbin] + eltwise_mul_xclbin.dependencies.add(silu_xclbin) artifacts.append(eltwise_mul_insts) gemm_2 = AIEGEMM( @@ -137,7 +216,7 @@ def set_up_artifacts(self): "--xclbin-kernel-id=0x904", ] gemm_2_xclbin.kernel_name = "swiglu_gemm_2" - gemm_2_xclbin.depends += [eltwise_mul_xclbin] + gemm_2_xclbin.dependencies.add(eltwise_mul_xclbin) artifacts.append(gemm_2_xclbin) artifacts.append(gemm_2_insts) @@ -153,109 +232,13 @@ def set_up_artifacts(self): self.add_artifacts(artifacts) - def set_up_runtime(self): - # Runtime setup - # --- - self.add_buffer("input", self.seq_len_padded * self.embedding_dim_padded) - self.add_buffer( - "weights_1", - self.embedding_dim_padded * self.hidden_dim_padded, - static_data=torch_to_numpy(self.weights_1.T), - ) - self.add_buffer( - "weights_2", - self.embedding_dim_padded * self.hidden_dim_padded, - static_data=torch_to_numpy(self.weights_2.T), - ) - self.add_buffer( - "weights_3", - self.hidden_dim_padded * self.embedding_dim_padded, - static_data=torch_to_numpy(self.weights_3.T), - ) - self.add_buffer("left", self.seq_len_padded * self.hidden_dim_padded) - 
self.add_buffer("left_swished", self.seq_len_padded * self.hidden_dim_padded) - self.add_buffer("right", self.seq_len_padded * self.hidden_dim_padded) - self.add_buffer("intermediate", self.seq_len_padded * self.hidden_dim_padded) - self.add_buffer("output", self.seq_len_padded * self.embedding_dim_padded) - self.add_kernel( - "swiglu_gemm_1", - self.combined_xclbin, - self.gemm_1_xclbin.kernel_name, - self.gemm_1_insts, - ) - self.add_kernel( - "swiglu_silu", - self.combined_xclbin, - self.silu_xclbin.kernel_name, - self.silu_insts, - ) - self.add_kernel( - "swiglu_eltwise_mul", - self.combined_xclbin, - self.eltwise_mul_xclbin.kernel_name, - self.eltwise_mul_insts, - ) - self.add_kernel( - "swiglu_gemm_2", - self.combined_xclbin, - self.gemm_2_xclbin.kernel_name, - self.gemm_2_insts, - ) - self.add_to_runlist("swiglu_gemm_1", "input", "weights_1", "left") - self.add_to_runlist("swiglu_gemm_1", "input", "weights_2", "right") - self.add_to_runlist("swiglu_silu", "left", "left_swished") - self.add_to_runlist( - "swiglu_eltwise_mul", "left_swished", "right", "intermediate" - ) - self.add_to_runlist("swiglu_gemm_2", "intermediate", "weights_3", "output") - - def forward(self, x): - """Forward pass for SwiGLU operation""" - - # Always flatten to [batch, orig_size] - original_shape = x.shape - batch = x.shape[0] if x.dim() > 1 else 1 - x_flat = x.reshape(batch, -1) - - out = self._execute_aie_operation(x_flat) - - # Restore original shape - out = out.reshape(*original_shape) - - return out - - def _execute_aie_operation(self, x): - # x is [batch, size] - batch = x.shape[0] if x.dim() > 1 else 1 - - # Flatten inputs for AIE processing - x_flat = x.view(-1) - - # Verify input size matches expected dimensions - expected_size = batch * self.seq_len * self.embedding_dim - assert x_flat.shape[0] == expected_size - - # Pad input if necessary to match GEMM requirements - if self.seq_len_padded * self.embedding_dim_padded > x_flat.shape[0]: - x_padded = torch.zeros( - self.seq_len_padded * self.embedding_dim_padded, - dtype=x_flat.dtype, - device=x_flat.device, - ) - x_padded[: x_flat.shape[0]] = x_flat - x_flat = x_padded - - self.write_buffer("input", x_flat) - self.run_runlist() - - # Read padded output buffer - result_padded = self.read_buffer_as_torch( - "output", - shape=(self.seq_len_padded * self.embedding_dim_padded,), - dtype=bfloat16, - ) - - # Extract only the unpadded portion - result = result_padded[:expected_size].view(batch, -1) + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.seq_len_padded * self.embedding_dim_padded,)), + AIERuntimeArgSpec( + "out", (self.seq_len_padded * self.embedding_dim_padded,) + ), + ] - return result + def get_callable(self): + return SwiGLUPrefillCallable(self) diff --git a/iron/operators/swiglu_prefill/test.py b/iron/operators/swiglu_prefill/test.py index 75510d63..583dbdcf 100755 --- a/iron/operators/swiglu_prefill/test.py +++ b/iron/operators/swiglu_prefill/test.py @@ -7,36 +7,29 @@ from pathlib import Path +from ml_dtypes import bfloat16 +from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor +from iron.common.utils import xrt_to_torch from iron.operators.swiglu_prefill.op import AIESwiGLUPrefill from iron.operators.swiglu_decode.reference import generate_golden_reference -from iron.common.test_utils import run_test, verify_buffer +from iron.common.test_utils import verify_buffer -def generate_test_params(extensive=False): +def get_params(): # This operation is currently untested except for the integrated llama application tests. 
- params = [] - names = [] - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) + params_list = [(256, 2048, 2048, False)] -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + params = [] + for p in params_list: + params.append(pytest.param(*p)) + return params @pytest.mark.metrics( Latency=r"Latency \(us\): (?P[\d\.]+)", Bandwidth=r"Effective Bandwidth: (?P[\d\.e\+-]+) GB/s", ) -@pytest.mark.parametrize("seq_len,embedding_dim,hidden_dim,prio_accuracy", all_params) +@pytest.mark.parametrize("seq_len,embedding_dim,hidden_dim,prio_accuracy", get_params()) def test_swiglu_prefill(seq_len, embedding_dim, hidden_dim, prio_accuracy, aie_context): golden_ref = generate_golden_reference(M=seq_len, K=embedding_dim, N=hidden_dim) @@ -51,41 +44,36 @@ def test_swiglu_prefill(seq_len, embedding_dim, hidden_dim, prio_accuracy, aie_c operator.weights_2 = golden_ref["w_up"].T operator.weights_3 = golden_ref["w_down"].T - input_buffers = {"input": golden_ref["input"]} - # output_buffers = {'output': golden_ref['output']} - output_buffers = {} - intermediate_buffers = { - "left": golden_ref["left"], - "left_swished": golden_ref["left_swished"], - "right": golden_ref["right"], - # 'intermediate': golden_ref['intermediate'] - } - - errors, latency_us, bandwidth_gbps = run_test( - operator, - input_buffers, - output_buffers, - intermediate_buffers, - rel_tol=0.07, - abs_tol=0.7, - ) + operator.compile() + op_func = operator.get_callable() + + input_buf = XRTTensor.from_torch(golden_ref["input"]) + output_buf = XRTTensor( + (seq_len * embedding_dim,), dtype=bfloat16 + ) # Output is flattened - ref_2 = operator.read_buffer_as_torch( - "left_swished", (seq_len, hidden_dim) - ) * operator.read_buffer_as_torch("right", (seq_len, hidden_dim)) - errors_2 = verify_buffer(operator, "intermediate", ref_2, rel_tol=0.04, abs_tol=0.4) + op_func(input_buf, output_buf) + + errors = {} + + # Verify intermediate result (left_swished * right) + left_swished = xrt_to_torch(op_func.left_swished).reshape((seq_len, hidden_dim)) + right = xrt_to_torch(op_func.right).reshape((seq_len, hidden_dim)) + ref_2 = left_swished * right + + # Note: intermediate buffer in op_func stores the result of eltwise_mul + intermediate = xrt_to_torch(op_func.intermediate).reshape((seq_len, hidden_dim)) + errors_2 = verify_buffer( + intermediate, "intermediate", ref_2, rel_tol=0.04, abs_tol=0.4 + ) if errors_2: errors["intermediate"] = errors_2 - ref_3 = ( - operator.read_buffer_as_torch("intermediate", (seq_len, hidden_dim)) - @ golden_ref["w_down"] - ) - errors_3 = verify_buffer(operator, "output", ref_3, rel_tol=0.04, abs_tol=0.4) + # Verify output using intermediate result + ref_3 = intermediate @ golden_ref["w_down"] + output = xrt_to_torch(output_buf).reshape((seq_len, embedding_dim)) + errors_3 = verify_buffer(output, "output", ref_3, rel_tol=0.04, abs_tol=0.4) if errors_3: - errors["output"] = errors_2 - - print(f"\nLatency (us): {latency_us:.1f}") - print(f"Effective Bandwidth: {bandwidth_gbps:.6e} GB/s\n") + errors["output"] = errors_3 assert not errors, f"Test failed with errors: {errors}" diff --git a/iron/operators/tanh/design.py b/iron/operators/tanh/design.py index 
0f78fc92..c3e0acad 100644 --- a/iron/operators/tanh/design.py +++ b/iron/operators/tanh/design.py @@ -14,7 +14,9 @@ from aie.iron.controlflow import range_ -def my_tanh(dev, size, num_columns, num_channels, tile_size, trace_size): +def my_tanh( + dev, size, num_columns, num_channels, tile_size, trace_size, kernel_archive=None +): xfr_dtype = bfloat16 line_size = 4096 if tile_size > 4096 else tile_size line_type = np.ndarray[(line_size,), np.dtype[xfr_dtype]] diff --git a/iron/operators/tanh/op.py b/iron/operators/tanh/op.py index 5bccad5e..2a0233aa 100644 --- a/iron/operators/tanh/op.py +++ b/iron/operators/tanh/op.py @@ -7,8 +7,8 @@ from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -17,14 +17,17 @@ ) -class AIETanh(AIEOperatorBase): +class AIETanh(MLIROperator): """AIE-accelerated Tanh activation function""" def __init__(self, size, num_aie_columns, num_channels, tile_size, context=None): max_multiple = num_aie_columns * tile_size - padded_size = ((size + max_multiple - 1) // max_multiple) * max_multiple - self.orig_size = size - self.size = padded_size + assert ( + size % max_multiple == 0 + ), "size must be multiple of num_aie_columns * tile_size" + assert size % tile_size == 0, "size must be multiple of tile_size" + + self.size = size self.tile_size = tile_size self.num_columns = num_aie_columns @@ -33,17 +36,15 @@ def __init__(self, size, num_aie_columns, num_channels, tile_size, context=None) total_shimdma_channels = self.num_columns * self.num_channels assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"tanh_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - def set_up_artifacts(self): + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"tanh_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="my_tanh", callback_args=[ @@ -56,59 +57,20 @@ def set_up_artifacts(self): ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"tanh.o", - depends=[ - SourceArtifact.new( - self.context.base_dir / "aie_kernels" / "aie2p" / "tanh.cc" - ) - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - self.add_artifacts([xclbin_artifact, insts_artifact]) - - def set_up_runtime(self): - self.add_buffer("input", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "tanh", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("tanh", "input", "output") - - def forward(self, x): - if x.numel() > self.size: - raise AIEOperatorConstraintError( - "AIETanh: input too large for configured size" - ) - - original_shape = x.shape - x_flat = x.reshape(-1) - - pad_len = self.size - x_flat.numel() - if pad_len > 0: - x_flat = torch.nn.functional.pad(x_flat, (0, pad_len)) - - 
self.write_buffer("input", x_flat) - self.write_buffer("output", np.zeros(self.size, dtype=bfloat16)) - self.run_runlist() - result = self.read_buffer_as_torch("output", shape=(self.size,), dtype=bfloat16) - - if pad_len > 0: - result = result[: x_flat.numel() - pad_len] - - return result.reshape(*original_shape) + def get_kernel_artifacts(self): + return [ + KernelObjectArtifact( + f"tanh.o", + dependencies=[ + SourceArtifact( + self.context.base_dir / "aie_kernels" / "aie2p" / "tanh.cc" + ) + ], + ), + ] + + def get_arg_spec(self): + return [ + AIERuntimeArgSpec("in", (self.size,)), # input + AIERuntimeArgSpec("out", (self.size,)), # output + ] diff --git a/iron/operators/tanh/test.py b/iron/operators/tanh/test.py index f9986bb3..6888474d 100755 --- a/iron/operators/tanh/test.py +++ b/iron/operators/tanh/test.py @@ -12,13 +12,12 @@ from iron.common.test_utils import run_test -def generate_test_params(extensive=False): +def get_params(): max_aie_columns = 8 num_channels = 1 # 1 channel for 1 input - input_lengths = [2048] if not extensive else [1024, 4096, 8192] + input_lengths = [1024, 2048, 4096, 8192] params = [] - names = [] for input_length in input_lengths: for num_aie_columns in range(1, max_aie_columns + 1): tile_size = input_length // num_aie_columns @@ -26,24 +25,19 @@ def generate_test_params(extensive=False): tile_size = 4096 check_length = tile_size * num_aie_columns if check_length == input_length: - names.append( - f"tanh_{num_aie_columns}_cols_{num_channels}_channels_{input_length}_tile_{tile_size}" + is_regular = input_length == 2048 + marks = [] if is_regular else [pytest.mark.extensive] + + params.append( + pytest.param( + input_length, + num_aie_columns, + num_channels, + tile_size, + marks=marks, + ) ) - params.append((input_length, num_aie_columns, num_channels, tile_size)) - return params, names - - -regular_params, regular_names = generate_test_params(extensive=False) -extensive_params, extensive_names = generate_test_params(extensive=True) - -# Combine params with marks - extensive params get pytest.mark.extensive -all_params = [ - pytest.param(*params, id=name) - for params, name in zip(regular_params, regular_names) -] + [ - pytest.param(*params, marks=pytest.mark.extensive, id=name) - for params, name in zip(extensive_params, extensive_names) -] + return params @pytest.mark.metrics( @@ -52,7 +46,7 @@ def generate_test_params(extensive=False): ) @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size", - all_params, + get_params(), ) def test_tanh(input_length, num_aie_columns, num_channels, tile_size, aie_context): golden_ref = generate_golden_reference(input_length=input_length) diff --git a/iron/operators/transpose/design.py b/iron/operators/transpose/design.py index 7a53365a..03fad5d3 100644 --- a/iron/operators/transpose/design.py +++ b/iron/operators/transpose/design.py @@ -2,20 +2,17 @@ # SPDX-License-Identifier: Apache-2.0 from ml_dtypes import bfloat16 -from pathlib import Path import numpy as np -import argparse -import sys from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker from aie.iron.placers import SequentialPlacer -from aie.iron.device import NPU1, NPU2 from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ -from aie.helpers.util import np_ndarray_type_get_shape -def shuffle_transpose(dev, M, N, num_columns, num_channels, trace_size, m, n, s): +def shuffle_transpose( + dev, M, N, num_columns, num_channels, m, n, s, kernel_archive=None, func_prefix="" +): num_elements = M 
* N per_tile_elements = m * n dtype = bfloat16 @@ -103,8 +100,10 @@ def shuffle_transpose(dev, M, N, num_columns, num_channels, trace_size, m, n, s) ] # AIE Core Function declaration + if kernel_archive is None: + kernel_archive = f"transpose_{s}x{s}.a" transpose_kernel = Kernel( - f"transpose_{s}x{s}", f"transpose_{m}x{n}.o", [tile_ty, tile_ty] + f"{func_prefix}transpose_{s}x{s}", kernel_archive, [tile_ty, tile_ty] ) # Define a task that will run on a compute tile @@ -163,115 +162,3 @@ def core_body(of_in1, of_out, transpose_kernel): # Place program components (assign them resources on the device) and generate an MLIR module return Program(dev, rt).resolve_program(SequentialPlacer()) - - -if __name__ == "__main__": - - def str_to_device(device: str): - if device == "npu": - return NPU1() - elif device == "npu2": - return NPU2() - else: - raise ValueError(f"Device name {device} is unknown.") - - p = argparse.ArgumentParser() - # Parse command line arguments - - # Device name is required to select the AIE device: npu or npu2 - p.add_argument( - "-d", - "--dev", - required=True, - dest="device", - help="AIE Device", - type=str_to_device, - ) - # Transfer size is required to define the size of the data to be transferred - p.add_argument( - "-M", "--workload-rows", required=True, dest="work_rows", help="Number of rows" - ) - p.add_argument( - "-N", - "--workload-columns", - required=True, - dest="work_cols", - help="Number of columns", - ) - # Number of columns is required to define the number of columns to be used - # It must be less than or equal to 4 for npu and 8 for npu2 - p.add_argument( - "-co", "--columns", required=True, dest="cols", help="Number of columns" - ) - # Number of channels is required to define the number of channels to be used - # It must be 1 or 2 - p.add_argument( - "-ch", "--channels", required=True, dest="chans", help="Number of channels" - ) - # Tile size - p.add_argument( - "-m", "--tile-rows", required=True, dest="tile_rows", help="Outer tile rows" - ) - p.add_argument( - "-n", - "--tile-columns", - required=True, - dest="tile_cols", - help="Outer tile columns", - ) - p.add_argument( - "-s", - "--kernel-dim", - required=True, - choices=["4", "8"], - dest="kernel_dim", - help="Inner tile dimension (square)", - ) - # Trace Size - p.add_argument( - "-tr", "--trace-size", required=True, dest="trace_size", help="Trace size" - ) - p.add_argument( - "--output-file-path", - "-o", - type=str, - help="Output file path for the generated MLIR module", - ) - - opts = p.parse_args(sys.argv[1:]) - - M = int(opts.work_rows) - N = int(opts.work_cols) - columns = int(opts.cols) - - dev = opts.device # Already a device object from str_to_device - - # Validate columns based on device type - if isinstance(dev, NPU1) and columns > 4: - raise ValueError("[ERROR] Device NPU cannot allocate more than 4 columns") - elif isinstance(dev, NPU2) and columns > 8: - raise ValueError("[ERROR] Device NPU2 cannot allocate more than 8 columns") - - channels = int(opts.chans) - if channels < 1 or channels > 2: - raise ValueError("Number of channels must be 1 or 2") - m = int(opts.tile_rows) - n = int(opts.tile_cols) - s = int(opts.kernel_dim) - if (((M * N) % (m * n)) % columns % channels) != 0: - print( - "transfer size (" - + str(M * N) - + ") must be a multiple of " - + str(m * n) - + f" and divisible by the number of columns ({columns}) and {channels} channels per column" - ) - raise ValueError - trace_size = int(opts.trace_size) if opts.trace_size is not None else 0 - - module = 
shuffle_transpose(dev, M, N, columns, channels, trace_size, m, n, s) - - output_file_path = Path(opts.output_file_path) - - with open(output_file_path, "w") as f: - f.write(str(module)) diff --git a/iron/operators/transpose/op.py b/iron/operators/transpose/op.py index 7963fd06..83c7891e 100644 --- a/iron/operators/transpose/op.py +++ b/iron/operators/transpose/op.py @@ -1,14 +1,11 @@ # SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import torch -import numpy as np -from ml_dtypes import bfloat16 from pathlib import Path from iron.common import ( - AIEOperatorBase, - AIEOperatorConstraintError, + MLIROperator, + AIERuntimeArgSpec, XclbinArtifact, InstsBinArtifact, KernelObjectArtifact, @@ -17,37 +14,35 @@ ) -class AIETranspose(AIEOperatorBase): +class AIETranspose(MLIROperator): """AIE-accelerated transpose operator""" def __init__(self, M, N, num_aie_columns, num_channels, m, n, s, context=None): + assert M % m == 0, f"Matrix rows ({M}) must be a multiple of {m}" + assert N % n == 0, f"Matrix columns ({N}) must be a multiple of {n}" + assert m % s == 0, f"AIE tile rows ({m}) must be a multiple of {s}" + assert n % s == 0, f"AIE tile columns ({n}) must be a multiple of {s}" + assert ( + M * N % (m * n * num_aie_columns * num_channels) == 0 + ), "Transfer size must be divisible by m*n*num_columns*num_channels" + self.M = M self.N = N self.m = m self.n = n self.s = s - self.size = M * N - self.tile_size = m * n - self.num_columns = num_aie_columns self.num_channels = num_channels - total_shimdma_channels = self.num_columns * self.num_channels - if 1 > 1: - total_shimdma_channels *= 1 - assert total_shimdma_channels <= 16, "Conservative ShimDMA limit" - - self.xclbin_artifact = None - self.insts_artifact = None + MLIROperator.__init__(self, context=context) - AIEOperatorBase.__init__(self, context=context) + def get_operator_name(self): + return f"transpose_{self.num_columns}c_{self.num_channels}ch_{self.M}x{self.N}_{self.m}x{self.n}_{self.s}s" - def set_up_artifacts(self): + def get_mlir_artifact(self): operator_dir = Path(__file__).parent - file_name_base = f"transpose_{self.num_columns}c_{self.num_channels}ch_{self.M}x{self.N}_{self.m}x{self.n}_{self.s}s" - - mlir_artifact = PythonGeneratedMLIRArtifact.new( - f"{file_name_base}.mlir", + return PythonGeneratedMLIRArtifact( + f"{self.get_operator_name()}.mlir", import_path=operator_dir / "design.py", callback_fn="shuffle_transpose", callback_args=[ @@ -56,73 +51,33 @@ def set_up_artifacts(self): self.N, self.num_columns, self.num_channels, - 0, self.m, self.n, self.s, ], ) - xclbin_artifact = XclbinArtifact.new( - f"{file_name_base}.xclbin", - depends=[ - mlir_artifact, - KernelObjectArtifact.new( - f"transpose_{self.m}x{self.n}.o", - depends=[ - SourceArtifact.new( - self.context.base_dir - / "aie_kernels" - / "generic" - / "transpose.cc" - ) - ], - extra_flags=[ - f"-DDIM_m={self.m}", - f"-DDIM_n={self.n}", - ], - ), - ], - ) - - insts_artifact = InstsBinArtifact.new( - f"{file_name_base}.bin", depends=[mlir_artifact] - ) - - self.xclbin_artifact = xclbin_artifact - self.insts_artifact = insts_artifact - self.add_artifacts([xclbin_artifact, insts_artifact]) - - def set_up_runtime(self): - self.add_buffer("input", self.size) - self.add_buffer("output", self.size) - self.add_kernel( - "transpose", - self.xclbin_artifact, - self.xclbin_artifact.kernel_name, - self.insts_artifact, - ) - self.add_to_runlist("transpose", "input", "output") - - def forward(self, 
diff --git a/iron/operators/transpose/test.py b/iron/operators/transpose/test.py
index 8f0d9981..f151f8df 100755
--- a/iron/operators/transpose/test.py
+++ b/iron/operators/transpose/test.py
@@ -12,16 +12,15 @@
 from iron.common.test_utils import run_test


-def generate_test_params(extensive=False):
-    params = []
-    names = []
+def get_params():
     max_aie_columns = 8
-    input_lengths = [2048] if not extensive else [64, 2048]
-    n_list = [64] if not extensive else [64, 128, 256, 512]
+    input_lengths = [64, 2048]
+    n_list = [64, 128, 256, 512]
     s_list = [8]
     m = 64
     n = 64

+    params = []
     for M in input_lengths:
         for N in n_list:
             for s in s_list:
@@ -37,32 +36,31 @@ def generate_test_params(extensive=False):
                     length = M * N
                     if check_length != length:
                         continue
-                    names.append(
-                        f"transpose_{M}_M_{N}_N_{num_aie_columns}_cols_{num_channels}_channels_{m}_m_{n}_n_{s}_s"
-                    )
-                    params.append((M, N, num_aie_columns, num_channels, m, n, s))
-    return params, names
-
-
-regular_params, regular_names = generate_test_params(extensive=False)
-extensive_params, extensive_names = generate_test_params(extensive=True)
+                    is_regular = M == 2048 and N == 64
+                    marks = [] if is_regular else [pytest.mark.extensive]
+
+                    params.append(
+                        pytest.param(
+                            M,
+                            N,
+                            num_aie_columns,
+                            num_channels,
+                            m,
+                            n,
+                            s,
+                            marks=marks,
+                        )
+                    )

-# Combine params with marks - extensive params get pytest.mark.extensive
-all_params = [
-    pytest.param(*params, id=name)
-    for params, name in zip(regular_params, regular_names)
-] + [
-    pytest.param(*params, marks=pytest.mark.extensive, id=name)
-    for params, name in zip(extensive_params, extensive_names)
-]
+    return params


 @pytest.mark.metrics(
     Latency=r"Latency \(us\): (?P<Latency>[\d\.]+)",
     Bandwidth=r"Effective Bandwidth: (?P<Bandwidth>[\d\.e\+-]+) GB/s",
 )
-@pytest.mark.parametrize("M,N,aie_columns,channels,m,n,s", all_params)
+@pytest.mark.parametrize("M,N,aie_columns,channels,m,n,s", get_params())
 def test_transpose(M, N, aie_columns, channels, m, n, s, aie_context):
     golden_ref = generate_golden_reference(rows=M, cols=N)
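The parametrization rework replaces the two parallel param/name lists with a single pass that attaches `pytest.mark.extensive` per parameter set and leaves test IDs to pytest. A self-contained illustration of the per-param marking pattern — the `extensive` marker name matches the suite, everything else here is illustrative:

```python
# Illustration of attaching marks per parameter set (not the suite's params).
import pytest

def get_sizes():
    params = []
    for size in (64, 2048):
        # Only the large size runs by default; the rest is opt-in.
        marks = [] if size == 2048 else [pytest.mark.extensive]
        params.append(pytest.param(size, marks=marks))
    return params

@pytest.mark.parametrize("size", get_sizes())
def test_size_is_tileable(size):
    assert size % 64 == 0
```

Running `pytest -m "not extensive"` then deselects the marked cases, which is the same mechanism the transpose test relies on (assuming the `extensive` marker is registered in the project's pytest configuration).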
diff --git a/requirements.txt b/requirements.txt
index 0072a859..c849253f 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,11 +6,11 @@
 # version of torch (don't need CUDA), so we give this index precedence over the
 # main PyPI. These indices are consulted in order of precedence by pip.
 --index-url https://download.pytorch.org/whl/cpu
---extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.0
+--extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1
 --extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly
 --extra-index-url https://pypi.org/simple

-mlir_aie==v1.2.0
+mlir_aie==v1.2.1
 llvm-aie
 black
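A quick way to confirm an environment picked up the bumped wheel after `pip install -r requirements.txt` — this assumes the installed distribution is named `mlir_aie`, matching the requirements line, and that pip normalizes the `v` prefix out of the version string:

```python
# Sanity check of the installed mlir_aie release (sketch).
from importlib.metadata import version

assert version("mlir_aie") == "1.2.1", version("mlir_aie")
```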