diff --git a/README.md b/README.md index b209fb9db..0d33e61aa 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ it is highly recommended to use KInference TensorFlow.js backend instead for mor KInference Core dependency coordinates: ```kotlin dependencies { - api("io.kinference", "inference-core", "0.2.22") + api("io.kinference", "inference-core", "0.2.23") } ``` @@ -67,7 +67,7 @@ This backend is recommended for JavaScript projects. TensorFlow.js backend dependency coordinates: ```kotlin dependencies { - api("io.kinference", "inference-tfjs", "0.2.22") + api("io.kinference", "inference-tfjs", "0.2.23") } ``` @@ -81,14 +81,14 @@ To check on the system requirements, visit the following [link](https://onnxrunt ONNXRuntime CPU backend dependency coordinates: ```kotlin dependencies { - api("io.kinference", "inference-ort", "0.2.22") + api("io.kinference", "inference-ort", "0.2.23") } ``` ONNXRuntime GPU backend dependency coordinates: ```kotlin dependencies { - api("io.kinference", "inference-ort-gpu", "0.2.22") + api("io.kinference", "inference-ort-gpu", "0.2.23") } ``` @@ -104,7 +104,7 @@ Array adapter for the [kmath](https://github.com/SciProgCentre/kmath) library th Dependency coordinates: ```kotlin dependencies { - api("io.kinference", "adapter-kmath-{backend_name}", "0.2.22") + api("io.kinference", "adapter-kmath-{backend_name}", "0.2.23") } ``` @@ -114,12 +114,12 @@ Array adapter for the [multik](https://github.com/Kotlin/multik) library that wo Dependency coordinates: ```kotlin dependencies { - api("io.kinference", "adapter-multik-{backend_name}", "0.2.22") + api("io.kinference", "adapter-multik-{backend_name}", "0.2.23") } ``` ## Getting started -Let us now walk through how to get started with KInference. The latest version of KInference is *0.2.22* +Let us now walk through how to get started with KInference. 
The latest version of KInference is *0.2.23* ### Setup dependencies repository @@ -142,7 +142,7 @@ To enable the backend, you can add the chosen KInference runtime as a dependency ```kotlin dependencies { - api("io.kinference", "inference-core", "0.2.22") + api("io.kinference", "inference-core", "0.2.23") } ``` @@ -160,20 +160,20 @@ kotlin { sourceSets { val commonMain by getting { dependencies { - api("io.kinference:inference-api:0.2.22") - api("io.kinference:ndarray-api:0.2.22") + api("io.kinference:inference-api:0.2.23") + api("io.kinference:ndarray-api:0.2.23") } } val jvmMain by getting { dependencies { - api("io.kinference:inference-core:0.2.22") + api("io.kinference:inference-core:0.2.23") } } val jsMain by getting { dependencies { - api("io.kinference:inference-tfjs:0.2.22") + api("io.kinference:inference-tfjs:0.2.23") } } } diff --git a/build.gradle.kts b/build.gradle.kts index 371868781..085912ebb 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -8,7 +8,7 @@ import org.jetbrains.kotlin.gradle.targets.js.yarn.YarnRootExtension import org.jetbrains.kotlin.gradle.tasks.KotlinCompilationTask group = "io.kinference" -version = "0.2.22-kotlin18" +version = "0.2.23-kotlin18" plugins { alias(libs.plugins.kotlin.multiplatform) apply false diff --git a/buildSrc/src/main/kotlin/io/kinference/gradle/JVMTestTasks.kt b/buildSrc/src/main/kotlin/io/kinference/gradle/JVMTestTasks.kt index 250843f1c..c9c51d46a 100644 --- a/buildSrc/src/main/kotlin/io/kinference/gradle/JVMTestTasks.kt +++ b/buildSrc/src/main/kotlin/io/kinference/gradle/JVMTestTasks.kt @@ -56,6 +56,7 @@ fun KotlinJvmTarget.configureBenchmarkTests() { group = "verification" maxHeapSize = "4G" + systemProperty("kotlinx.coroutines.debug", "off") useJUnitPlatform() diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 5566bb795..e9edc52ed 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -11,6 +11,7 @@ okio = "3.4.0" onnxruntime = "1.17.0.patched-1" slf4j = 
"2.0.9" wire = "4.8.1" +fastutil = "8.5.14" primitives = "0.1.26-kotlin18" @@ -38,3 +39,4 @@ onnxruntime-gpu = { module = "com.microsoft.onnxruntime:onnxruntime_gpu", versio slf4j-api = { module = "org.slf4j:slf4j-api", version.ref = "slf4j" } slf4j-simple = { module = "org.slf4j:slf4j-simple", version.ref = "slf4j" } wire-runtime = { module = "com.squareup.wire:wire-runtime", version.ref = "wire" } +fastutil-core = { module = "it.unimi.dsi:fastutil-core", version.ref = "fastutil" } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt index 287094629..674bbaed5 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt @@ -10,30 +10,19 @@ import io.kinference.core.optimizer.rules.OptimizerRuleSet import io.kinference.data.ONNXData import io.kinference.data.ONNXDataType import io.kinference.model.IrOptimizableEngine -import io.kinference.ndarray.arrays.memory.MemoryLimiter -import io.kinference.ndarray.arrays.memory.MemoryLimiters import io.kinference.optimizer.GraphOptimizer import io.kinference.optimizer.OptimizerRule import io.kinference.protobuf.* import io.kinference.protobuf.message.* import io.kinference.utils.CommonDataLoader -import io.kinference.utils.PlatformUtils +import io.kinference.utils.PredictionConfig +import io.kinference.utils.PredictionConfigs import okio.Buffer import okio.Path import okio.Path.Companion.toPath typealias KIONNXData = ONNXData -// Define an interface for allocation control marking output -internal interface KIONNXDataArraysReleaser { - fun markOutput() -} - -internal fun KIONNXData.markOutput() { - if (this is KIONNXDataArraysReleaser) - this.markOutput() -} - object CoreBackend : BackendInfo(name = "KInference Core CPU Backend") /** @@ -51,24 +40,24 @@ object KIEngine : IrOptimizableEngine> { fun 
protoReader(bytes: ByteArray) = ProtobufReader(Buffer().write(bytes), KI_READER_CONFIG) - suspend fun loadModel(bytes: ByteArray, optimize: Boolean, memoryLimiter: MemoryLimiter, parallelismLimit: Int): KIModel { + suspend fun loadModel(bytes: ByteArray, optimize: Boolean, predictionConfig: PredictionConfig): KIModel { val rules = if (optimize) OptimizerRuleSet.DEFAULT_OPT_RULES else emptyList() - return loadModel(bytes, rules, memoryLimiter, parallelismLimit) + return loadModel(bytes, rules, predictionConfig) } override suspend fun loadModel(bytes: ByteArray, optimize: Boolean): KIModel { - return loadModel(bytes, optimize, MemoryLimiters.NoAllocator, PlatformUtils.cores) + return loadModel(bytes, optimize, PredictionConfigs.NoAllocator) } - override suspend fun loadModel(bytes: ByteArray, rules: List>>): KIModel = loadModel(bytes, rules, MemoryLimiters.NoAllocator, PlatformUtils.cores) + override suspend fun loadModel(bytes: ByteArray, rules: List>>): KIModel = loadModel(bytes, rules, PredictionConfigs.NoAllocator) - suspend fun loadModel(bytes: ByteArray, rules: List>>, memoryLimiter: MemoryLimiter, parallelismLimit: Int): KIModel { + suspend fun loadModel(bytes: ByteArray, rules: List>>, predictionConfig: PredictionConfig): KIModel { val modelScheme = ModelProto.decode(protoReader(bytes)) - val model = KIModel(modelScheme, memoryLimiter) + val model = KIModel(modelScheme, predictionConfig) return if (rules.isNotEmpty()) { val newGraph = GraphOptimizer(model.graph).run(rules) as KIGraph - KIModel(model.id, model.name, model.opSet, newGraph, memoryLimiter, parallelismLimit) + KIModel(model.id, model.name, model.opSet, newGraph, predictionConfig) } else { model } @@ -76,12 +65,12 @@ object KIEngine : IrOptimizableEngine> { override suspend fun loadModel(bytes: ByteArray): KIModel = loadModel(bytes, optimize = true) - suspend fun loadModel(path: Path, optimize: Boolean, memoryLimiter: MemoryLimiter, parallelismLimit: Int): KIModel { - return 
loadModel(CommonDataLoader.bytes(path), optimize, memoryLimiter, parallelismLimit) + suspend fun loadModel(path: Path, optimize: Boolean, predictionConfig: PredictionConfig): KIModel { + return loadModel(CommonDataLoader.bytes(path), optimize, predictionConfig) } override suspend fun loadModel(path: Path, optimize: Boolean): KIModel { - return loadModel(path, optimize, MemoryLimiters.NoAllocator, PlatformUtils.cores) + return loadModel(path, optimize, PredictionConfigs.NoAllocator) } override suspend fun loadModel(path: Path): KIModel = loadModel(path, optimize = true) @@ -90,12 +79,12 @@ object KIEngine : IrOptimizableEngine> { return loadModel(CommonDataLoader.bytes(path), rules) } - suspend fun loadModel(path: String, optimize: Boolean, memoryLimiter: MemoryLimiter, parallelismLimit: Int): KIModel { - return loadModel(CommonDataLoader.bytes(path.toPath()), optimize, memoryLimiter, parallelismLimit) + suspend fun loadModel(path: String, optimize: Boolean, predictionConfig: PredictionConfig): KIModel { + return loadModel(CommonDataLoader.bytes(path.toPath()), optimize, predictionConfig) } override suspend fun loadModel(path: String, optimize: Boolean): KIModel { - return loadModel(path, optimize, MemoryLimiters.NoAllocator, PlatformUtils.cores) + return loadModel(path, optimize, PredictionConfigs.NoAllocator) } override suspend fun loadModel(path: String): KIModel = loadModel(path, optimize = true) diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/map/KIONNXMap.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/map/KIONNXMap.kt index f541c4c23..a1bbcf7eb 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/map/KIONNXMap.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/map/KIONNXMap.kt @@ -9,7 +9,7 @@ import io.kinference.protobuf.message.TensorProto import io.kinference.types.ValueInfo import io.kinference.types.ValueTypeInfo -class KIONNXMap(name: String?, 
data: Map>, val info: ValueTypeInfo.MapTypeInfo) : ONNXMap>, CoreBackend>(name, data), KIONNXDataArraysReleaser { +class KIONNXMap(name: String?, data: Map>, val info: ValueTypeInfo.MapTypeInfo) : ONNXMap>, CoreBackend>(name, data) { constructor(data: Map>, info: ValueInfo) : this(info.name, data, info.typeInfo as ValueTypeInfo.MapTypeInfo) override val backend = CoreBackend @@ -26,10 +26,6 @@ class KIONNXMap(name: String?, data: Map>, val info: ValueTyp override fun rename(name: String): KIONNXMap = KIONNXMap(name, data, info) - override fun markOutput() { - data.values.forEach { it.markOutput() } - } - override suspend fun clone(newName: String?): KIONNXMap { val newMap = HashMap>(data.size) for ((key, value) in data.entries) { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/seq/KIONNXSequence.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/seq/KIONNXSequence.kt index 24b52085c..49383fca0 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/seq/KIONNXSequence.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/seq/KIONNXSequence.kt @@ -7,7 +7,7 @@ import io.kinference.data.ONNXSequence import io.kinference.protobuf.message.SequenceProto import io.kinference.types.* -class KIONNXSequence(name: String?, data: List>, val info: ValueTypeInfo.SequenceTypeInfo) : ONNXSequence>, CoreBackend>(name, data), KIONNXDataArraysReleaser { +class KIONNXSequence(name: String?, data: List>, val info: ValueTypeInfo.SequenceTypeInfo) : ONNXSequence>, CoreBackend>(name, data) { constructor(name: String?, info: ValueTypeInfo.SequenceTypeInfo, size: Int, init: (Int) -> KIONNXData<*>) : this(name, List(size, init), info) constructor(data: List>, info: ValueInfo) : this(info.name, data, info.typeInfo as ValueTypeInfo.SequenceTypeInfo) @@ -23,10 +23,6 @@ class KIONNXSequence(name: String?, data: List>, val info: ValueTy override fun rename(name: String): KIONNXSequence = 
KIONNXSequence(name, data, info) - override fun markOutput() { - data.forEach { it.markOutput() } - } - val length: Int = data.size companion object { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt index 2c6de1a69..dba23754f 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt @@ -1,9 +1,9 @@ package io.kinference.core.data.tensor -import io.kinference.core.CoreBackend -import io.kinference.core.KIONNXDataArraysReleaser +import io.kinference.core.* import io.kinference.data.ONNXTensor import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.arrays.tiled.* import io.kinference.protobuf.FLOAT_TENSOR_TYPES import io.kinference.protobuf.message.TensorProto @@ -13,10 +13,11 @@ import io.kinference.types.ValueTypeInfo //TODO: support segments //TODO: support external data -class KITensor(name: String?, override val data: NDArrayCore, val info: ValueTypeInfo.TensorTypeInfo) : ONNXTensor(name, data), KIONNXDataArraysReleaser { +class KITensor(name: String?, override val data: NDArrayCore, val info: ValueTypeInfo.TensorTypeInfo, private var context: ManualAllocatorContext? 
= null) : ONNXTensor(name, data) { constructor(data: NDArrayCore, info: ValueInfo) : this(info.name, data, info.typeInfo as ValueTypeInfo.TensorTypeInfo) override suspend fun close() { + context?.returnNDArray(data) data.close() } @@ -24,11 +25,6 @@ class KITensor(name: String?, override val data: NDArrayCore, val info: ValueTyp return KITensor(newName, data.clone(), info) } - override fun markOutput() { - if (this.data is MemoryControlledArray) - data.markOutput() - } - suspend operator fun minus(other: KITensor): KITensor { require(this.data is NumberNDArrayCore && other.data is NumberNDArrayCore) return (this.data - other.data).asTensor() @@ -47,7 +43,7 @@ class KITensor(name: String?, override val data: NDArrayCore, val info: ValueTyp override val backend = CoreBackend override fun rename(name: String): KITensor { - return KITensor(name, data, info) + return KITensor(name, data, info, context) } companion object { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt index 618431c01..f8e2daf19 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt @@ -1,6 +1,7 @@ package io.kinference.core.data.tensor import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.extensions.concat import io.kinference.ndarray.extensions.splitWithAxis import io.kinference.primitives.types.DataType @@ -8,9 +9,9 @@ import io.kinference.protobuf.resolveProtoDataType import io.kinference.types.TensorShape import io.kinference.types.ValueTypeInfo -fun NDArrayCore.asTensor(name: String? 
= null) = KITensor(name, this, ValueTypeInfo.TensorTypeInfo(TensorShape(this.shape), type.resolveProtoDataType())) +fun NDArrayCore.asTensor(name: String? = null, context: ManualAllocatorContext? = null) = KITensor(name, this, ValueTypeInfo.TensorTypeInfo(TensorShape(this.shape), type.resolveProtoDataType()), context) -internal fun T.asTensor(name: String? = null) = (this as NDArrayCore).asTensor(name) +internal fun T.asTensor(name: String? = null, context: ManualAllocatorContext? = null) = (this as NDArrayCore).asTensor(name, context) internal fun Collection.asONNXTensors(names: List): List { return this.zip(names).map { (data, name) -> data.asTensor(name) } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt index 1e25b4ae1..3f78377d5 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt @@ -1,8 +1,7 @@ package io.kinference.core.model -import io.kinference.core.KIONNXData +import io.kinference.core.* import io.kinference.core.graph.KIGraph -import io.kinference.core.markOutput import io.kinference.graph.Contexts import io.kinference.model.Model import io.kinference.ndarray.arrays.memory.* @@ -18,14 +17,10 @@ class KIModel( val name: String, val opSet: OperatorSetRegistry, val graph: KIGraph, - memoryLimiter: MemoryLimiter = MemoryLimiters.NoAllocator, - parallelismLimit: Int = PlatformUtils.cores, + predictionConfig: PredictionConfig = PredictionConfigs.NoAllocator, ) : Model>, Profilable, Cacheable { private val profiles: MutableList = ArrayList() - - @OptIn(ExperimentalCoroutinesApi::class) - private val dispatcher: CoroutineDispatcher = Dispatchers.Default.limitedParallelism(parallelismLimit) - private val modelArrayStorage: ModelArrayStorage = ModelArrayStorage(memoryLimiter) + private val 
predictionContextDispatcher: PredictionContextDispatcher = PredictionContextDispatcher(predictionConfig) override fun addProfilingContext(name: String): ProfilingContext = ProfilingContext(name).apply { profiles.add(this) } override fun analyzeProfilingResults(): ProfileAnalysisEntry = profiles.analyze("Model $name") @@ -37,7 +32,6 @@ class KIModel( if (profile) addProfilingContext("Model $name") else null ) - val limiterContext = ParallelismLimiterContext(dispatcher) var coreReserved = false val results = try { withContext(NonCancellable) { @@ -45,16 +39,15 @@ class KIModel( coreReserved = true } - val allocatorContext = modelArrayStorage.createAllocatorContext() - val mixedContext = allocatorContext + limiterContext - - withContext(mixedContext) { - val coroutineContext = coroutineContext[AllocatorContext.Key]!! - val execResult = graph.execute(input, contexts) - execResult.forEach { it.markOutput() } - coroutineContext.closeAllocated() - execResult + val predictionContext = predictionContextDispatcher.getPredictionContext() + val output = if (predictionContextDispatcher.allocationMode != AllocationMode.Auto) withContext(predictionContext) { + return@withContext graph.execute(input, contexts) + } else withContext(predictionContext) { + return@withContext graph.execute(input, contexts).map { it.clone(it.name) }.toList() } + + predictionContextDispatcher.returnStorage(predictionContext) + output } finally { if (coreReserved) { ResourcesDispatcher.releaseCore() @@ -66,11 +59,11 @@ class KIModel( override suspend fun close() { graph.close() - modelArrayStorage.close() + predictionContextDispatcher.close() } override fun clearCache() { - modelArrayStorage.clearCache() + predictionContextDispatcher.clearCache() } companion object { @@ -80,14 +73,13 @@ class KIModel( suspend operator fun invoke( proto: ModelProto, - memoryLimiter: MemoryLimiter = MemoryLimiters.NoAllocator, - limiterParallelismCounter: Int = PlatformUtils.cores, + predictionConfig: PredictionConfig = 
PredictionConfigs.NoAllocator, ): KIModel { val name = "${proto.domain}:${proto.modelVersion}" val id = "$name:${generateModelId()}" val opSet = OperatorSetRegistry(proto.opSetImport) val graph = KIGraph(proto.graph!!, opSet) - return KIModel(id, name, opSet, graph, memoryLimiter, limiterParallelismCounter) + return KIModel(id, name, opSet, graph, predictionConfig) } } } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt index 6487c46c1..0a60d0278 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt @@ -7,17 +7,21 @@ import io.kinference.core.optimizer.rules.context.AttentionContextRule import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.arrays.pointers.map import io.kinference.ndarray.arrays.tiled.FloatTiledArray import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.ndarray.extensions.dotTransposedWithAlpha +import io.kinference.ndarray.extensions.softmax.softmax import io.kinference.operator.* import io.kinference.optimizer.GraphOptimizer.Companion.isOpt +import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto import io.kinference.utils.launchWithLimitOrDefault import kotlinx.coroutines.coroutineScope +import kotlin.coroutines.coroutineContext import kotlin.math.min import kotlin.math.sqrt @@ -25,11 +29,12 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map { val headSize = 
hiddenSize / numHeads - val output = allocateNDArray(scores.type, Strides(intArrayOf(batchSize, numHeads, seqLen, headSize))) + val outputStrides = Strides(intArrayOf(batchSize, numHeads, seqLen, headSize)) + val output = context?.getNDArray(scores.type, outputStrides, fillZeros = true) ?: allocateNDArray(scores.type, outputStrides) coroutineScope { for (batchNum in 0 until batchSize) { @@ -46,6 +51,8 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map - val resultMarker: Array if (past == null || past.linearSize == 0) { resultBlocks = kBlocks.plus(vBlocks) - resultMarker = kMarker.plus(vMarker) } else { val pastSeqLen = past.shape[3] presentDims[3] += pastSeqLen val pastBlocks = past.array.blocks - val pastMarker = past.array.marker val blocksInRow = headSize / past.array.blockSize @@ -84,60 +86,56 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map(2 * batchSize * numHeads * presentDims[3] * blocksInRow) - val futureResMarker = arrayOfNulls(2 * batchSize * numHeads * presentDims[3] * blocksInRow) var resBlockIdx = 0 var pastBlocIdx = 0 repeat(2) { presentKeyValueIdx -> val kvBlocks = if (presentKeyValueIdx == 0) kBlocks else vBlocks - val kvMarker = if (presentKeyValueIdx == 0) kMarker else vMarker var kvBlockIdx = 0 repeat(rowsSize) { pastBlocks.copyInto(futureRes, resBlockIdx, pastBlocIdx, pastBlocIdx + pastRowBlocksCount) - pastMarker.copyInto(futureResMarker, resBlockIdx, pastBlocIdx, pastBlocIdx + pastRowBlocksCount) resBlockIdx += pastRowBlocksCount pastBlocIdx += pastRowBlocksCount kvBlocks.copyInto(futureRes, resBlockIdx, kvBlockIdx, kvBlockIdx + kvRowBlocksCount) - kvMarker.copyInto(futureResMarker, resBlockIdx, kvBlockIdx, kvBlockIdx + kvRowBlocksCount) resBlockIdx += kvRowBlocksCount kvBlockIdx += kvRowBlocksCount } } resultBlocks = futureRes as Array - resultMarker = futureResMarker as Array } - return FloatNDArray(FloatTiledArray(resultBlocks, resultMarker), Strides(presentDims)) + return 
FloatNDArray(FloatTiledArray(resultBlocks), Strides(presentDims)) } internal suspend fun getScores( unidir: Boolean, q: NDArrayCore, k: NDArrayCore, v: NDArrayCore, mask: IntNDArray?, - past: NDArrayCore?, batchSize: Int, seqLen: Int, numHeads: Int, hiddenSize: Int, maskFilterValue: Float = -10_000f + past: NDArrayCore?, batchSize: Int, seqLen: Int, numHeads: Int, hiddenSize: Int, maskFilterValue: Float = -10_000f, context: ManualAllocatorContext? = null ): Pair { val headSize = hiddenSize / numHeads val pastSeqLen = past?.shape?.get(3) ?: 0 val present = makePresent(past, k, v, batchSize, seqLen, numHeads, hiddenSize) - val scores = normalizedScores(unidir, q, mask, batchSize, seqLen, pastSeqLen, headSize, numHeads, present, maskFilterValue) - return attentionScore(scores, batchSize, seqLen, numHeads, hiddenSize, present) + val scores = normalizedScores(unidir, q, mask, batchSize, seqLen, pastSeqLen, headSize, numHeads, present, maskFilterValue, context) + return attentionScore(scores, batchSize, seqLen, numHeads, hiddenSize, present, context) } private suspend fun normalizedScores( unidir: Boolean, queries: NDArrayCore, maskIndices: IntNDArray?, batchSize: Int, - seqLen: Int, pastSeqLen: Int, headSize: Int, numHeads: Int, present: NDArrayCore, maskFilterValue: Float = -10_000f + seqLen: Int, pastSeqLen: Int, headSize: Int, numHeads: Int, present: NDArrayCore, maskFilterValue: Float = -10_000f, context: ManualAllocatorContext? 
= null ): NumberNDArrayCore { val allSeqLen = present.shape[3] - val scores = allocateNDArray(queries.type, Strides(intArrayOf(batchSize, numHeads, seqLen, allSeqLen))) as MutableNumberNDArrayCore + val scoresStrides = Strides(intArrayOf(batchSize, numHeads, seqLen, allSeqLen)) + val scores = (context?.getNDArray(queries.type, scoresStrides, fillZeros = true) ?: allocateNDArray(queries.type, scoresStrides)) as MutableNumberNDArrayCore - val maskData = maskIndices?.maskFromIndices(unidir, batchSize, seqLen, pastSeqLen, maskFilterValue) + val maskData = maskIndices?.maskFromIndices(unidir, batchSize, seqLen, pastSeqLen, maskFilterValue, context) val alpha = 1.0 / sqrt(headSize.toDouble()) @@ -158,14 +156,23 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map if (this != null) { @@ -245,12 +252,13 @@ class AttentionVer1(name: String, attributes: Map>, input internal suspend fun initQueryKeyValue( input: NDArrayCore, weights: NDArrayCore, bias: NDArrayCore, - batchSize: Int, seqLen: Int, hiddenSize: Int, numHeads: Int + batchSize: Int, seqLen: Int, hiddenSize: Int, numHeads: Int, context: ManualAllocatorContext? = null ): Array { input as NumberNDArrayCore val headSize = hiddenSize / numHeads - val qkv = Array(3) { allocateNDArray(input.type, Strides(intArrayOf(batchSize, numHeads, seqLen, headSize))) } + val qkvStrides = Strides(intArrayOf(batchSize, numHeads, seqLen, headSize)) + val qkv = Array(3) { context?.getNDArray(input.type, qkvStrides, fillZeros = true) ?: allocateNDArray(input.type, qkvStrides) } coroutineScope { for (qkvIdx in 0 until 3) { @@ -279,6 +287,8 @@ class AttentionVer1(name: String, attributes: Map>, input private val maskFilterValue: Float by attribute("mask_filter_value") { it: Number -> it.toFloat() } override suspend fun > apply(contexts: Contexts, inputs: List): List { + val context = coroutineContext[ManualAllocatorContext] + val input = inputs[0]!! val weights = inputs[1]!! 
@@ -296,10 +306,10 @@ class AttentionVer1(name: String, attributes: Map>, input input.data, preparedWeights.data, preparedBias.data, - batchSize, seqLen, hiddenSize, numHeads, + batchSize, seqLen, hiddenSize, numHeads, context ) - val (scores, present) = getScores(unidir, queries, keys, values, maskIndices, past, batchSize, seqLen, numHeads, hiddenSize, maskFilterValue) - return listOf(scores.asTensor(), present.asTensor()) + val (scores, present) = getScores(unidir, queries, keys, values, maskIndices, past, batchSize, seqLen, numHeads, hiddenSize, maskFilterValue, context) + return listOf(scores.asTensor(context = context), present.asTensor(context = context)) } } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt index b1861b281..5fad8cd77 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt @@ -1,15 +1,17 @@ package io.kinference.core.operators.layer.normalization import io.kinference.attribute.Attribute -import io.kinference.core.data.tensor.KITensor -import io.kinference.core.data.tensor.asONNXTensors +import io.kinference.core.data.tensor.* import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.* import io.kinference.operator.* +import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto.AttributeType import io.kinference.protobuf.message.TensorProto +import kotlin.coroutines.coroutineContext import kotlin.math.sqrt sealed class 
EmbedLayerNormalization( @@ -73,9 +75,12 @@ class EmbedLayerNormalizationVer1( private data class NormalizeResult(val output: FloatNDArray, val embeddingSum: FloatNDArray) - internal suspend fun createMaskIndices(mask: IntNDArray?, batchSize: Int, seqLen: Int): NumberNDArrayCore { - val maskIndices = MutableIntNDArray(intArrayOf(batchSize)) - if (mask == null) return maskIndices + internal suspend fun createMaskIndices(mask: IntNDArray?, batchSize: Int, seqLen: Int, context: ManualAllocatorContext? = null): NumberNDArrayCore { + val strides = Strides(intArrayOf(batchSize)) + val maskIndices = (context?.getNDArray(DataType.INT, strides) ?: MutableIntNDArray(strides)) as MutableIntNDArray + + if (mask == null) + return maskIndices.also { it.fill(0) } val pointer = mask.array.pointer() val maskIndicesPointer = maskIndices.array.pointer() @@ -95,12 +100,15 @@ class EmbedLayerNormalizationVer1( private suspend fun normalize( epsilon: Float, inputIds: IntNDArray, segmentIds: IntNDArray?, wordEmbed: FloatNDArray, posEmbed: FloatNDArray, - segmentEmbed: FloatNDArray?, gamma: FloatNDArray, beta: FloatNDArray, positionIds: IntNDArray? + segmentEmbed: FloatNDArray?, gamma: FloatNDArray, beta: FloatNDArray, positionIds: IntNDArray?, context: ManualAllocatorContext? 
= null ): NormalizeResult { val (batchSize, seqLen) = inputIds.shape val (_, hiddenSize) = wordEmbed.shape - val output = MutableFloatNDArray(intArrayOf(batchSize, seqLen, hiddenSize)) - val embeddingSum = MutableFloatNDArray(intArrayOf(batchSize, seqLen, hiddenSize)) + + val outputStrides = Strides(intArrayOf(batchSize, seqLen, hiddenSize)) + + val output = (context?.getNDArray(DataType.FLOAT, outputStrides, fillZeros = false) ?: MutableFloatNDArray(outputStrides)) as MutableFloatNDArray + val embeddingSum = (context?.getNDArray(DataType.FLOAT, outputStrides, fillZeros = false) ?: MutableFloatNDArray(outputStrides)) as MutableFloatNDArray for (batch in 0 until batchSize) { val blockIdx = batch * seqLen @@ -167,6 +175,8 @@ class EmbedLayerNormalizationVer1( } override suspend fun > apply(contexts: Contexts, inputs: List): List { + val manualContext = coroutineContext[ManualAllocatorContext] + val inputIds = inputs[0]!!.data as IntNDArray val segmentIds = inputs[1]?.data as IntNDArray? val wordEmbed = inputs[2]!!.data as FloatNDArray @@ -177,8 +187,12 @@ class EmbedLayerNormalizationVer1( val mask = inputs.getOrNull(7)?.data as IntNDArray? val positionIds = inputs.getOrNull(8)?.data as IntNDArray? 
- val (normalized, embedSum) = normalize(epsilon, inputIds, segmentIds, wordEmbed, posEmbed, segmentEmbed, gamma, beta, positionIds) + val (normalized, embedSum) = normalize(epsilon, inputIds, segmentIds, wordEmbed, posEmbed, segmentEmbed, gamma, beta, positionIds, manualContext) val maskIndices = createMaskIndices(mask, inputIds.shape[0], inputIds.shape[1]) - return listOf(normalized, maskIndices, embedSum).asONNXTensors(outputs) + return listOf( + normalized.asTensor(context = manualContext), + maskIndices.asTensor(context = manualContext), + embedSum.asTensor(context = manualContext) + ) } } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt index 6b6243ba3..598a14c26 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt @@ -7,10 +7,13 @@ import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.FloatNDArray import io.kinference.ndarray.arrays.MutableFloatNDArray +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.* import io.kinference.operator.* +import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto +import kotlin.coroutines.coroutineContext import kotlin.math.sqrt sealed class SkipLayerNormalization(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { @@ -104,8 +107,10 @@ class SkipLayerNormalizationVer1(name: String, attributes: Map> apply(contexts: Contexts, inputs: 
List): List { + val manualContext = coroutineContext[ManualAllocatorContext] + val input = inputs[0]!!.data as FloatNDArray - val output = MutableFloatNDArray(input.strides) + val output = (manualContext?.getNDArray(DataType.FLOAT, input.strides, fillZeros = false) ?: MutableFloatNDArray(input.strides)) as MutableFloatNDArray input.normalize( skip = inputs[1]!!.data as FloatNDArray, gamma = inputs[2]!!.data as FloatNDArray, @@ -114,6 +119,7 @@ class SkipLayerNormalizationVer1(name: String, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { companion object { @@ -52,7 +55,16 @@ class AddVer7(name: String, attributes: Map>, inputs: Lis } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val result = (inputs[0]!!.data as NumberNDArrayCore) + (inputs[1]!!.data as NumberNDArrayCore) - return listOf(result.asTensor("C")) + val manualContext = coroutineContext[ManualAllocatorContext] + + val left = inputs[0]!!.data as NumberNDArrayCore + val right = inputs[1]!!.data as NumberNDArrayCore + + val destShape = broadcastShape(listOf(left.shape, right.shape)) + val destStrides = Strides(destShape) + val dest = (manualContext?.getNDArray(left.type, destStrides) ?: allocateNDArray(left.type, destStrides)) as MutableNumberNDArrayCore + + val result = left.plus(right, dest) //(inputs[0]!!.data as NumberNDArrayCore) + (inputs[1]!!.data as NumberNDArrayCore) + return listOf(result.asTensor("C", manualContext)) } } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt index 0be701b4e..da93d0e8a 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt @@ -5,9 +5,13 @@ import io.kinference.core.data.tensor.KITensor import 
io.kinference.core.data.tensor.asTensor import io.kinference.data.ONNXData import io.kinference.graph.Contexts +import io.kinference.ndarray.arrays.MutableNumberNDArrayCore import io.kinference.ndarray.arrays.NumberNDArrayCore +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext +import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.ndarray.extensions.gelu.biasGelu import io.kinference.operator.* +import kotlin.coroutines.coroutineContext sealed class BiasGelu(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { companion object { @@ -39,16 +43,20 @@ class BiasGeluVer1(name: String, attributes: Map> = empty } override suspend fun > apply(contexts: Contexts, inputs: List): List { + val manualContext = coroutineContext[ManualAllocatorContext] + val input = inputs[0]!!.data as NumberNDArrayCore val bias = inputs[1]!!.data as NumberNDArrayCore require(input.shape.last() == bias.shape.last()) { "Last dimensions of input and bias tensors must be equal" } + val dest = (manualContext?.getNDArray(input.type, input.strides) ?: allocateNDArray(input.type, input.strides)) as MutableNumberNDArrayCore + // Uses ERF formula with fractional error less than x.xx * 10 ^ -4. // Algorithm 26.2.17 in Abromowitz and Stegun, Handbook of Mathematical. 
// Another possible ERF implementation (several ms faster): // https://github.com/apache/commons-numbers/blob/master/commons-numbers-gamma/src/main/java/org/apache/commons/numbers/gamma/BoostErf.java - return listOf(biasGelu(input, bias).asTensor("C")) + return listOf(biasGelu(input, bias, dest).asTensor("C", manualContext)) } } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt index 8c1735ea3..4165c554c 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt @@ -5,9 +5,13 @@ import io.kinference.core.data.tensor.KITensor import io.kinference.core.data.tensor.asTensor import io.kinference.data.ONNXData import io.kinference.graph.Contexts -import io.kinference.ndarray.arrays.NumberNDArrayCore +import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext +import io.kinference.ndarray.broadcasting.Broadcasting +import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.operator.* import io.kinference.protobuf.message.TensorProto +import kotlin.coroutines.coroutineContext sealed class MatMul(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { companion object { @@ -46,8 +50,16 @@ class MatMulVer1(name: String, attributes: Map>, inputs: } override suspend fun > apply(contexts: Contexts, inputs: List): List { + val manualContext = coroutineContext[ManualAllocatorContext] + val first = inputs[0]!!.data as NumberNDArrayCore val second = inputs[1]!!.data as NumberNDArrayCore - return listOf((first.matmul(second)).asTensor("Y")) + + val destShape = Broadcasting.broadcastShapeForMatmul(first.shape, second.shape) + val destStrides = Strides(destShape) 
+ + val dest = (manualContext?.getNDArray(first.type, destStrides, fillZeros = true) ?: allocateNDArray(first.type, destStrides)) as MutableNumberNDArrayCore + + return listOf((first.matmul(second, dest)).asTensor("Y", manualContext)) } } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt index 5ce45c866..1bfb35fee 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt @@ -6,6 +6,7 @@ import io.kinference.core.data.tensor.asTensor import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.mapTo import io.kinference.ndarray.arrays.tiled.* import io.kinference.operator.* @@ -13,6 +14,7 @@ import io.kinference.primitives.types.DataType import io.kinference.protobuf.FLOAT_TENSOR_TYPES import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto +import kotlin.coroutines.coroutineContext sealed class Cast(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { companion object { @@ -41,65 +43,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li internal val VERSION = VersionInfo(sinceVersion = 6) private val INFO = OperatorInfo("Cast", ATTRIBUTES_INFO, INPUTS_INFO, OUTPUTS_INFO, VERSION, OperatorInfo.DEFAULT_DOMAIN) - private suspend fun castByte(array: ByteNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castByte(array: ByteNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> array TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: 
LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toByte() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -108,65 +110,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - private suspend fun castShort(array: ShortNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castShort(array: ShortNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> array TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: 
LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toShort() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -175,66 +177,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castInt(array: IntNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castInt(array: IntNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> array TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: 
LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) - array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != 0 } + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray + array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toInt() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -243,66 +244,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castLong(array: LongNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castLong(array: LongNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as 
IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> array TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != 0L } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -311,66 +311,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castUByte(array: UByteNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castUByte(array: UByteNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> array TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: 
LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toUByte() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -379,66 +378,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castUShort(array: UShortNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castUShort(array: UShortNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> array TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), 
array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toUShort() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -447,66 +445,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castUInt(array: UIntNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castUInt(array: UIntNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as 
IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toUInt() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> array TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -515,65 +512,64 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castULong(array: ULongNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castULong(array: ULongNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as 
IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toULong() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } @@ -583,66 +579,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castFloat(array: FloatNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castFloat(array: FloatNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> array TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong().toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt().toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong().toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt().toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: 
LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != 0f } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -651,66 +646,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castDouble(array: DoubleNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castDouble(array: DoubleNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong().toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt().toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong().toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt().toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: 
IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != 0.0 } output } TensorProto.DataType.DOUBLE -> array TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -719,66 +713,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castBoolean(array: BooleanNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castBoolean(array: BooleanNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) 1f else 0f } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toUByte() else (0).toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toByte() else (0).toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toUShort() else (0).toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toShort() else (0).toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = 
(context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) 1 else 0 } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) 1L else 0L } output } TensorProto.DataType.BOOL -> array TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) 1.0 else 0.0 } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toUInt() else (0).toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toULong() else (0).toULong() } output } @@ -787,19 +780,19 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - internal suspend fun castTo(input: NDArrayCore, to: TensorProto.DataType): NDArrayCore { + internal suspend fun castTo(input: NDArrayCore, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (input.type) { - DataType.BYTE -> castByte(input as ByteNDArray, to) - DataType.SHORT -> castShort(input as ShortNDArray, to) - DataType.INT -> castInt(input as IntNDArray, to) - DataType.LONG -> castLong(input as LongNDArray, to) - DataType.UBYTE -> castUByte(input as UByteNDArray, to) - DataType.USHORT -> castUShort(input as UShortNDArray, to) - DataType.UINT -> castUInt(input as UIntNDArray, to) - DataType.ULONG -> castULong(input as ULongNDArray, to) - DataType.FLOAT -> castFloat(input as FloatNDArray, to) - DataType.DOUBLE -> castDouble(input as DoubleNDArray, to) - DataType.BOOLEAN -> castBoolean(input as BooleanNDArray, to) + DataType.BYTE -> castByte(input as ByteNDArray, to, context) + DataType.SHORT -> castShort(input as ShortNDArray, to, context) + DataType.INT -> castInt(input as IntNDArray, to, context) + DataType.LONG -> castLong(input as LongNDArray, to, context) + DataType.UBYTE -> castUByte(input as UByteNDArray, to, context) + DataType.USHORT -> castUShort(input as UShortNDArray, to, context) + DataType.UINT -> castUInt(input as UIntNDArray, to, context) + DataType.ULONG -> castULong(input as ULongNDArray, to, context) + DataType.FLOAT -> castFloat(input as FloatNDArray, to, context) + DataType.DOUBLE -> castDouble(input as DoubleNDArray, to, context) + DataType.BOOLEAN -> castBoolean(input as BooleanNDArray, to, context) else -> throw IllegalStateException("Unsupported type ${input.type}") } } @@ -808,11 +801,13 @@ class CastVer6(name: String, attributes: Map>, inputs: Li private val toType: Int by attribute("to") { it: Number -> it.toInt() } override suspend fun > apply(contexts: Contexts, inputs: List): List { + val manualContext = coroutineContext[ManualAllocatorContext] + val tensor = inputs.first()!! val to = TensorProto.DataType.fromValue(toType)!! 
- val casted = castTo(tensor.data, to) + val casted = castTo(tensor.data, to, manualContext) - return listOf(casted.asTensor("output")) + return listOf(casted.asTensor("output", manualContext)) } } diff --git a/inference/inference-core/src/jvmTest/kotlin/io/kinference/models/bert/BERTTest.kt b/inference/inference-core/src/jvmTest/kotlin/io/kinference/models/bert/BERTTest.kt index 30ff16b83..bed95862c 100644 --- a/inference/inference-core/src/jvmTest/kotlin/io/kinference/models/bert/BERTTest.kt +++ b/inference/inference-core/src/jvmTest/kotlin/io/kinference/models/bert/BERTTest.kt @@ -15,6 +15,6 @@ class BERTTest { @Test fun benchmark_test_vanilla_bert_performance() = TestRunner.runTest { - KIPerformanceRunner.runFromS3("bert:standard:en:v1", count = 3) + KIPerformanceRunner.runFromS3("bert:standard:en:v1", count = 20) } } diff --git a/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt b/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt deleted file mode 100644 index bea90d149..000000000 --- a/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt +++ /dev/null @@ -1,21 +0,0 @@ -package io.kinference.ndarray.arrays - -typealias StateMarker = () -> Unit - -enum class ArrayTypes(val index: Int, val size: Int) { - ByteArray(0, Byte.SIZE_BYTES), - UByteArray(1, UByte.SIZE_BYTES), - ShortArray(2, Short.SIZE_BYTES), - UShortArray(3, UShort.SIZE_BYTES), - IntArray(4, Int.SIZE_BYTES), - UIntArray(5, UInt.SIZE_BYTES), - LongArray(6, Long.SIZE_BYTES), - ULongArray(7, ULong.SIZE_BYTES), - FloatArray(8, Float.SIZE_BYTES), - DoubleArray(9, Double.SIZE_BYTES), - BooleanArray(10, 1); -} - -interface MemoryControlledArray { - fun markOutput() -} diff --git a/ndarray/ndarray-core/build.gradle.kts b/ndarray/ndarray-core/build.gradle.kts index 4b88ac6ed..8aa7ddf85 100644 --- a/ndarray/ndarray-core/build.gradle.kts +++ b/ndarray/ndarray-core/build.gradle.kts 
@@ -19,6 +19,7 @@ kotlin { implementation(libs.kotlinx.coroutines.core) implementation(libs.kotlinx.atomicfu) api(libs.apache.commons.math4.core) + api(libs.fastutil.core) } } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt index c1af61364..546857006 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt @@ -73,18 +73,25 @@ const val ERF_COEF_3 = 1.421413741 const val ERF_COEF_4 = -1.453152027 const val ERF_COEF_5 = 1.061405429 +const val INIT_STORAGE_SIZE = 64 + internal fun IntArray.swap(leftIdx: Int, rightIdx: Int) { val temp = get(leftIdx) this[leftIdx] = this[rightIdx] this[rightIdx] = temp } + +fun interface ParallelizeBody { + operator fun invoke(start: Int, end: Int, coroutineIndex: Int) +} + /* * Parallelize with batching by minDataPerLaunch */ suspend fun parallelizeByBlocks(blockSize: Int, countBlocks: Int, minDataPerLaunch: Int, - body: (blockStart: Int, blockEnd: Int, coroutineIndex: Int) -> Unit) { + body: ParallelizeBody) { val batchSize = batchSizeByData(blockSize, countBlocks, minDataPerLaunch) @@ -101,7 +108,7 @@ suspend fun parallelizeByBlocks(blockSize: Int, } } -suspend inline fun parallelizeByRows(rowSize: Int, countRows: Int, minDataPerLaunch: Int, noinline body: (rowStart: Int, rowEnd: Int, index: Int) -> Unit) = parallelizeByBlocks(rowSize, countRows, minDataPerLaunch, body) +suspend inline fun parallelizeByRows(rowSize: Int, countRows: Int, minDataPerLaunch: Int, body: ParallelizeBody) = parallelizeByBlocks(rowSize, countRows, minDataPerLaunch, body) internal fun countCoroutinesByData(rowSize: Int, countRows: Int, minDataPerLaunch: Int): Int { val batchSize = batchSizeByData(rowSize, countRows, minDataPerLaunch) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/BooleanNDArray.kt 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/BooleanNDArray.kt index 5a4e758dc..3037028d3 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/BooleanNDArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/BooleanNDArray.kt @@ -21,7 +21,7 @@ fun interface BooleanBinaryOperation { operator fun invoke(first: Boolean, second: Boolean): Boolean } -open class BooleanNDArray(var array: BooleanTiledArray, strides: Strides) : NDArrayCore, MemoryControlledArray { +open class BooleanNDArray(var array: BooleanTiledArray, strides: Strides) : NDArrayCore { override val type: DataType = DataType.BOOLEAN final override var strides: Strides = strides @@ -79,10 +79,6 @@ open class BooleanNDArray(var array: BooleanTiledArray, strides: Strides) : NDAr return array.blocks[0][0] } - override fun markOutput() { - array.marker.forEach { it.invoke() } - } - override suspend fun toMutable(): MutableBooleanNDArray { return MutableBooleanNDArray(array.copyOf(), strides) } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/PrimitiveNDArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/PrimitiveNDArray.kt index 0b391f275..f1bd91b44 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/PrimitiveNDArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/PrimitiveNDArray.kt @@ -30,7 +30,7 @@ import kotlin.math.* @GenerateNameFromPrimitives @MakePublic -internal open class PrimitiveNDArray(array: PrimitiveTiledArray, strides: Strides) : NumberNDArrayCore, MemoryControlledArray { +internal open class PrimitiveNDArray(array: PrimitiveTiledArray, strides: Strides) : NumberNDArrayCore { var array: PrimitiveTiledArray = array protected set @@ -85,10 +85,6 @@ internal open class PrimitiveNDArray(array: PrimitiveTiledArray, strides: Stride return array.blocks[0][0] } - override fun markOutput() { - 
array.marker.forEach { it.invoke() } - } - override suspend fun clone(): PrimitiveNDArray { return PrimitiveNDArray(array.copyOf(), Strides(shape)) } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt deleted file mode 100644 index 2ed73f878..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt +++ /dev/null @@ -1,35 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.ndarray.arrays.* -import kotlin.coroutines.CoroutineContext - -data class AllocatorContext internal constructor( - private val unusedContainers: ArrayStorage, - private val limiter: MemoryLimiter, - private val returnStorageFn: (ArrayStorage) -> Unit -) : CoroutineContext.Element { - private val usedContainers: ArrayDeque = ArrayDeque() - - companion object Key : CoroutineContext.Key - override val key: CoroutineContext.Key<*> get() = Key - - internal fun getArrayContainers(type: ArrayTypes, size: Int, count: Int): Array { - return if (limiter !is NoAllocatorMemoryLimiter) { - val result = Array(count) { unusedContainers.getArrayContainer(type, size) } - usedContainers.addAll(result) - result - } else { - Array(count) { ArrayContainer(type, size) } - } - } - - fun closeAllocated() { - usedContainers.forEach { - if (!it.isOutput && limiter.checkMemoryLimitAndAdd(it.sizeBytes.toLong())) { - unusedContainers[it.arrayTypeIndex, it.arraySizeIndex].addLast(it) - } - } - usedContainers.clear() - returnStorageFn(unusedContainers) - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt deleted file mode 100644 index d39ba62ba..000000000 --- 
a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt +++ /dev/null @@ -1,55 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.ndarray.arrays.* - -sealed class ArrayContainer( - val arrayTypeIndex: Int, - val arraySizeIndex: Int, - val sizeBytes: Int -) { - var isOutput: Boolean = false - private set - - val markAsOutput = { - isOutput = true - } - - companion object { - private const val EMPTY_INDEX = -1 - - operator fun invoke(type: ArrayTypes, size: Int, sizeIndex: Int = EMPTY_INDEX): ArrayContainer { - val sizeBytes: Int = type.size * size - return when (type) { - ArrayTypes.ByteArray -> ByteArrayContainer(type.index, sizeIndex, sizeBytes, ByteArray(size)) // 8-bit signed - ArrayTypes.UByteArray -> UByteArrayContainer(type.index, sizeIndex, sizeBytes, UByteArray(size)) // 8-bit unsigned - ArrayTypes.ShortArray -> ShortArrayContainer(type.index, sizeIndex, sizeBytes, ShortArray(size)) // 16-bit signed - ArrayTypes.UShortArray -> UShortArrayContainer(type.index, sizeIndex, sizeBytes, UShortArray(size)) // 16-bit unsigned - ArrayTypes.IntArray -> IntArrayContainer(type.index, sizeIndex, sizeBytes, IntArray(size)) // 32-bit signed - ArrayTypes.UIntArray -> UIntArrayContainer(type.index, sizeIndex, sizeBytes, UIntArray(size)) // 32-bit unsigned - ArrayTypes.LongArray -> LongArrayContainer(type.index, sizeIndex, sizeBytes, LongArray(size)) // 64-bit signed - ArrayTypes.ULongArray -> ULongArrayContainer(type.index, sizeIndex, sizeBytes, ULongArray(size)) // 64-bit unsigned - ArrayTypes.FloatArray -> FloatArrayContainer(type.index, sizeIndex, sizeBytes, FloatArray(size)) - ArrayTypes.DoubleArray -> DoubleArrayContainer(type.index, sizeIndex, sizeBytes, DoubleArray(size)) - ArrayTypes.BooleanArray -> BooleanArrayContainer(type.index, sizeIndex, sizeBytes, BooleanArray(size)) - else -> throw IllegalArgumentException("Unsupported array type") - } - } - - fun resetArray(arrayContainer: ArrayContainer) { - 
when (arrayContainer) { - is ByteArrayContainer -> arrayContainer.array.fill(0) // 8-bit signed - is UByteArrayContainer -> arrayContainer.array.fill(0u) // 8-bit unsigned - is ShortArrayContainer -> arrayContainer.array.fill(0) // 16-bit signed - is UShortArrayContainer -> arrayContainer.array.fill(0u) // 16-bit unsigned - is IntArrayContainer -> arrayContainer.array.fill(0) // 32-bit signed - is UIntArrayContainer -> arrayContainer.array.fill(0u) // 32-bit unsigned - is LongArrayContainer -> arrayContainer.array.fill(0L) // 64-bit signed - is ULongArrayContainer -> arrayContainer.array.fill(0U) // 64-bit unsigned - is FloatArrayContainer -> arrayContainer.array.fill(0.0f) - is DoubleArrayContainer -> arrayContainer.array.fill(0.0) - is BooleanArrayContainer -> arrayContainer.array.fill(false) - else -> throw IllegalArgumentException("Unsupported array type") - } - } - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt deleted file mode 100644 index 00a98c0cb..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt +++ /dev/null @@ -1,59 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.ndarray.arrays.ArrayTypes - -internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limiter: MemoryLimiter) { - /** - * Structure is as follows: - * 1. Array by predefined types (all types are known compiled time) - * 2. Array by size. Starting with 'INIT_SIZE_VALUE' element and grow it doubling (typically there are no more than 16 different sizes) - * 3. 
Queue of array containers (used as FIFO) - */ - private var storage: Array>> = - Array(typeLength) { Array(sizeLength) { ArrayDeque() } } - - private var sizeIndices: IntArray = IntArray(typeLength) - private var sizes: Array = Array(typeLength) { IntArray(sizeLength) } - - - operator fun get(typeIndex: Int, sizeIndex: Int): ArrayDeque { - return storage[typeIndex][sizeIndex] - } - - fun getArrayContainer(type: ArrayTypes, size: Int): ArrayContainer { - val tIndex = type.index - val sIndex = sizes[tIndex].indexOf(size) - - // Checking that we have this array size in our storage for this type - val idx = if (sIndex != -1) { - val array = storage[tIndex][sIndex].removeFirstOrNull() - array?.let { - ArrayContainer.resetArray(it) - limiter.deductMemory(it.sizeBytes.toLong()) - return it - } - sIndex - } else { - if (sizeIndices[tIndex] >= storage[tIndex].size) - grow(tIndex) - - val idx = sizeIndices[tIndex]++ - sizes[tIndex][idx] = size - idx - } - - return ArrayContainer(type, size, idx) - } - - private fun grow(typeIndex: Int) { - val newSize = sizes[typeIndex].size * 2 - val newStorage: Array> = Array(newSize) { ArrayDeque() } - - for (i in storage[typeIndex].indices) { - newStorage[i] = storage[typeIndex][i] - } - - storage[typeIndex] = newStorage - sizes[typeIndex] = sizes[typeIndex].copyOf(newSize) - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt deleted file mode 100644 index 775c0a895..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt +++ /dev/null @@ -1,57 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.utils.PlatformUtils -import kotlinx.atomicfu.AtomicLong -import kotlinx.atomicfu.atomic - -interface MemoryLimiter { - /** - * Checks if the memory limit allows adding the specified amount of memory and performs the 
addition. - * - * @param added the memory in bytes to add - * @return true if the memory was added successfully and false if adding the memory exceeds the memory limit - */ - fun checkMemoryLimitAndAdd(added: Long): Boolean - - /** - * Deducts the specified amount of memory from the memory limiter. - * - * @param deducted the memory in bytes to deduct from the memory limiter - */ - fun deductMemory(deducted: Long) -} - -class BaseMemoryLimiter(private val memoryLimit: Long) : MemoryLimiter { - private var usedMemory: AtomicLong = atomic(0L) - - override fun checkMemoryLimitAndAdd(added: Long): Boolean { - val currentMemory = usedMemory.addAndGet(added) - return if (currentMemory > memoryLimit) { - usedMemory.addAndGet(-added) - false - } else true - } - - override fun deductMemory(deducted: Long) { - usedMemory.addAndGet(-deducted) - } -} - -object MemoryLimiters { - val Default: MemoryLimiter = BaseMemoryLimiter((PlatformUtils.maxHeap * 0.3).toLong()) - val NoAllocator: MemoryLimiter = NoAllocatorMemoryLimiter - - fun customLimiter(memoryLimit: Long): MemoryLimiter { - return BaseMemoryLimiter(memoryLimit) - } -} - -internal object NoAllocatorMemoryLimiter : MemoryLimiter { - override fun checkMemoryLimitAndAdd(added: Long): Boolean { - return false - } - - override fun deductMemory(deducted: Long) { - - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryManager.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryManager.kt new file mode 100644 index 000000000..cef20b3af --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryManager.kt @@ -0,0 +1,75 @@ +package io.kinference.ndarray.arrays.memory + +import kotlinx.atomicfu.* +import kotlinx.coroutines.* + +internal class MemoryManager internal constructor(private val memoryLimit: Long, private val cacheClearingInterval: Long, private val onCacheClear: () -> Unit) { + private var 
usedMemory: AtomicLong = atomic(0L) + private val lastAccessTime = atomic(System.currentTimeMillis()) + private val monitorJob: AtomicRef = atomic(initial = null) + private val isFinalized = atomic(initial = false) + + /** + * Checks if the memory limit allows adding the specified amount of memory and performs the addition + * + * @param sizeInBytes is the checking size of an array in bytes + * @return true if the memory was added successfully and false if adding the memory exceeds the memory limit + */ + fun checkMemoryLimitAndAdd(sizeInBytes: Long): Boolean { + // Attempt to add memory and check the limit + val successful = usedMemory.getAndUpdate { current -> + if (current + sizeInBytes > memoryLimit) current else current + sizeInBytes + } != usedMemory.value // Check if the update was successful + + return successful + } + + /** + * Resets the used memory into 0L + */ + fun resetLimit() { + usedMemory.value = 0L + } + + /** + * Updates the last access time to the current system time and starts a monitoring coroutine if it isn't already running. + * + * This function sets the `lastAccessTime` to the current system time in milliseconds. + * It also initiates a monitoring coroutine to periodically check + * if the time since the last access exceeds a predefined `cacheClearingInterval`. + * If it does, the `onCacheClear` function is triggered to handle + * any necessary cache clearing. + * The coroutine will run only if it is not already running and `isFinalized` is false. 
+ */ + fun updateLastAccessTime() { + lastAccessTime.value = System.currentTimeMillis() + + // Start monitoring if not already started + if (monitorJob.compareAndSet(expect = null, update = null) && !isFinalized.value) { + val newJob = CoroutineScope(Dispatchers.Default).launch { + while (isActive) { + delay(cacheClearingInterval) + if (System.currentTimeMillis() - lastAccessTime.value > cacheClearingInterval) { + onCacheClear() + } + } + } + if (!monitorJob.compareAndSet(expect = null, newJob)) { + newJob.cancel() // Cancel if another thread set the job + } + } + } + + /** + * Stops the monitoring process by canceling the active monitoring coroutine. + * + * This function sets the `isFinalized` flag to true, indicating that the monitoring process has been + * concluded. + * If a monitoring coroutine is currently active, it will be canceled. + */ + fun stopMonitoring() { + if (isFinalized.compareAndSet(expect = false, update = true)) { + monitorJob.getAndSet(value = null)?.cancel() + } + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt deleted file mode 100644 index c9247d060..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt +++ /dev/null @@ -1,34 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.ndarray.arrays.ArrayTypes -import io.kinference.utils.Closeable -import java.util.concurrent.ConcurrentLinkedQueue - -class ModelArrayStorage(private val limiter: MemoryLimiter = MemoryLimiters.NoAllocator) : Closeable { - private val unusedArrays: ConcurrentLinkedQueue = ConcurrentLinkedQueue() - - companion object { - private const val INIT_SIZE_VALUE: Int = 2 - private val typeSize: Int = ArrayTypes.values().size - } - - fun createAllocatorContext(): AllocatorContext { - return AllocatorContext(getStorage(), limiter, 
::returnStorage) - } - - fun clearCache() { - unusedArrays.clear() - } - - override suspend fun close() { - clearCache() - } - - private fun getStorage(): ArrayStorage { - return unusedArrays.poll() ?: ArrayStorage(typeSize, INIT_SIZE_VALUE, limiter) - } - - private fun returnStorage(storage: ArrayStorage) { - unusedArrays.offer(storage) - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt new file mode 100644 index 000000000..801e5c66b --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt @@ -0,0 +1,63 @@ +package io.kinference.ndarray.arrays.memory + +import io.kinference.ndarray.arrays.memory.contexts.* +import io.kinference.ndarray.arrays.memory.storage.* +import io.kinference.utils.* +import kotlinx.coroutines.* +import java.util.concurrent.ConcurrentLinkedQueue + +class PredictionContextDispatcher(private val predictionConfig: PredictionConfig) : Closeable { + private val limiter: MemoryManager = MemoryManager( + memoryLimit = predictionConfig.memoryThreshold, + cacheClearingInterval = predictionConfig.memoryClearingInterval, + onCacheClear = ::clearCache) + + private val contextQueue: ConcurrentLinkedQueue = ConcurrentLinkedQueue() + val allocationMode + get() = predictionConfig.allocationMode + + fun getPredictionContext(): PredictionContext { + val allocatorContext = when (predictionConfig.allocationMode) { + AllocationMode.NoAllocation -> getNoAllocatorContext() + AllocationMode.Manual -> getManualAllocatorContext() + AllocationMode.Auto -> getAutoAllocatorContext() + } + return allocatorContext + } + + private fun getNoAllocatorContext(): PredictionContext { + return contextQueue.poll() ?: (NoAllocatorContext(getDispatcher())) + } + + private fun getAutoAllocatorContext(): PredictionContext { + 
limiter.updateLastAccessTime() + return contextQueue.poll() ?: (AutoAllocatorContext(getDispatcher(), AutoArrayHandlingStorage(limiter))) + } + + private fun getManualAllocatorContext(): PredictionContext { + limiter.updateLastAccessTime() + return contextQueue.poll() ?: (ManualAllocatorContext(getDispatcher(), ManualArrayHandlingStorage(limiter))) + } + + @OptIn(ExperimentalCoroutinesApi::class) + private fun getDispatcher(): CoroutineDispatcher { + return Dispatchers.Default.limitedParallelism(predictionConfig.parallelismLimit) + } + + fun clearCache() { + limiter.stopMonitoring() + contextQueue.clear() + limiter.resetLimit() + } + + override suspend fun close() { + clearCache() + } + + fun returnStorage(context: PredictionContext) { + if (context is AllocatorContext<*>) { + context.finalizeContext() + } + contextQueue.offer(context) + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayContainer.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayContainer.kt deleted file mode 100644 index 8818345fe..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayContainer.kt +++ /dev/null @@ -1,17 +0,0 @@ -@file:GeneratePrimitives(DataType.ALL) -@file:Suppress("DuplicatedCode") - -package io.kinference.ndarray.arrays.memory - -import io.kinference.primitives.annotations.GenerateNameFromPrimitives -import io.kinference.primitives.annotations.GeneratePrimitives -import io.kinference.primitives.types.DataType -import io.kinference.primitives.types.PrimitiveArray - -@GenerateNameFromPrimitives -internal class PrimitiveArrayContainer( - arrayTypeIndex: Int, - arraySizeIndex: Int, - sizeBytes: Int, - val array: PrimitiveArray -) : ArrayContainer(arrayTypeIndex, arraySizeIndex, sizeBytes) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt new file mode 100644 index 000000000..9af632e6b --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt @@ -0,0 +1,16 @@ +package io.kinference.ndarray.arrays.memory.contexts + +import io.kinference.ndarray.arrays.memory.storage.AutoArrayHandlingStorage +import io.kinference.utils.* +import kotlinx.coroutines.CoroutineDispatcher +import kotlin.coroutines.* + +@OptIn(ExperimentalStdlibApi::class) +internal class AutoAllocatorContext internal constructor( + dispatcher: CoroutineDispatcher, + storage: AutoArrayHandlingStorage, +) : AllocatorContext(dispatcher, storage) { + companion object Key : AbstractCoroutineContextKey, AutoAllocatorContext>( + AllocatorContext.Key, { it as? AutoAllocatorContext } + ) +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt new file mode 100644 index 000000000..5a93917de --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt @@ -0,0 +1,28 @@ +package io.kinference.ndarray.arrays.memory.contexts + +import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.storage.ManualArrayHandlingStorage +import io.kinference.ndarray.arrays.memory.storage.ManualStorage +import io.kinference.primitives.types.DataType +import io.kinference.utils.AllocatorContext +import kotlinx.coroutines.CoroutineDispatcher +import kotlin.coroutines.AbstractCoroutineContextKey + +@OptIn(ExperimentalStdlibApi::class) +class ManualAllocatorContext internal constructor( + dispatcher: CoroutineDispatcher, + storage: ManualArrayHandlingStorage, +) : AllocatorContext(dispatcher, storage) { + companion object Key 
: AbstractCoroutineContextKey, ManualAllocatorContext>( + AllocatorContext.Key, { it as? ManualAllocatorContext } + ) + + + fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore { + return storage.getNDArray(dataType, strides, fillZeros) + } + + fun returnNDArray(ndArray: NDArrayCore) { + storage.returnNDArray(ndArray) + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt new file mode 100644 index 000000000..b0ffdbbb5 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt @@ -0,0 +1,29 @@ +package io.kinference.ndarray.arrays.memory.storage + +import io.kinference.ndarray.arrays.memory.* +import io.kinference.utils.ArrayStorage + +internal interface TypedAutoHandlingStorage { + fun moveBlocksIntoUnused() +} + +internal class AutoArrayHandlingStorage(internal val limiter: MemoryManager) : ArrayStorage { + internal val storage: Array = arrayOf( + ByteAutoHandlingArrayStorage(), + ShortAutoHandlingArrayStorage(), + IntAutoHandlingArrayStorage(), + LongAutoHandlingArrayStorage(), + UByteAutoHandlingArrayStorage(), + UShortAutoHandlingArrayStorage(), + UIntAutoHandlingArrayStorage(), + ULongAutoHandlingArrayStorage(), + FloatAutoHandlingArrayStorage(), + DoubleAutoHandlingArrayStorage(), + BooleanAutoHandlingArrayStorage() + ) + + override fun resetState() { + storage.forEach { it.moveBlocksIntoUnused() } + limiter.resetLimit() + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt new file mode 100644 index 000000000..227d25136 --- /dev/null +++ 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt @@ -0,0 +1,46 @@ +package io.kinference.ndarray.arrays.memory.storage + +import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.* +import io.kinference.primitives.types.DataType +import io.kinference.utils.ArrayStorage + +internal interface TypedManualHandlingStorage { + fun getNDArray(strides: Strides, fillZeros: Boolean = false, limiter: MemoryManager): MutableNDArrayCore + fun returnNDArray(ndarray: NDArrayCore) + fun clear() +} + +interface ManualStorage : ArrayStorage { + fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore + fun returnNDArray(ndArray: NDArrayCore) +} + +internal class ManualArrayHandlingStorage(private val memoryManager: MemoryManager) : ManualStorage { + private val storage: Array = arrayOf( + ByteManualHandlingArrayStorage(), + ShortManualHandlingArrayStorage(), + IntManualHandlingArrayStorage(), + LongManualHandlingArrayStorage(), + UByteManualHandlingArrayStorage(), + UShortManualHandlingArrayStorage(), + UIntManualHandlingArrayStorage(), + ULongManualHandlingArrayStorage(), + FloatManualHandlingArrayStorage(), + DoubleManualHandlingArrayStorage(), + BooleanManualHandlingArrayStorage() + ) + + override fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean): MutableNDArrayCore { + return storage[dataType.ordinal].getNDArray(strides, fillZeros, memoryManager) + } + + override fun returnNDArray(ndArray: NDArrayCore) { + storage[ndArray.type.ordinal].returnNDArray(ndArray) + } + + override fun resetState() { + storage.forEach { it.clear() } + memoryManager.resetLimit() + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt new file 
mode 100644 index 000000000..4cd5bb663 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt @@ -0,0 +1,48 @@ +@file:GeneratePrimitives(DataType.ALL) +package io.kinference.ndarray.arrays.memory.storage + +import io.kinference.ndarray.INIT_STORAGE_SIZE +import io.kinference.ndarray.arrays.memory.MemoryManager +import io.kinference.ndarray.extensions.constants.PrimitiveConstants +import io.kinference.ndarray.extensions.utils.getOrPut +import io.kinference.primitives.annotations.GenerateNameFromPrimitives +import io.kinference.primitives.annotations.GeneratePrimitives +import io.kinference.primitives.types.DataType +import io.kinference.primitives.types.PrimitiveArray +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap + +@GenerateNameFromPrimitives +internal class PrimitiveAutoHandlingArrayStorage : TypedAutoHandlingStorage { + private val used = Int2ObjectOpenHashMap>(INIT_STORAGE_SIZE) + private val unused = Int2ObjectOpenHashMap>(INIT_STORAGE_SIZE) + + companion object { + private val type = DataType.CurrentPrimitive + } + + internal fun getBlock(blocksNum: Int, blockSize: Int, limiter: MemoryManager): Array { + val unusedQueue = unused.getOrPut(blockSize) { ArrayDeque(blocksNum) } + val usedQueue = used.getOrPut(blockSize) { ArrayDeque(blocksNum) } + + val blocks = if (limiter.checkMemoryLimitAndAdd(getPrimitiveArraySizeInBytes(arraySize = blockSize * blocksNum))) { + Array(blocksNum) { + unusedQueue.removeFirstOrNull()?.apply { + fill(PrimitiveConstants.ZERO) + } ?: PrimitiveArray(blockSize) + } + } else { + Array(blocksNum) { PrimitiveArray(blockSize) } + } + + usedQueue.addAll(blocks) + + return blocks + } + + override fun moveBlocksIntoUnused() { + used.forEach { (blockSize, queue) -> + unused[blockSize]!!.addAll(queue) + queue.clear() + } + } +} diff --git 
a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt new file mode 100644 index 000000000..9280823d8 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt @@ -0,0 +1,24 @@ +@file:GeneratePrimitives(DataType.ALL) +@file:Suppress("DuplicatedCode") +package io.kinference.ndarray.arrays.memory.storage + +import io.kinference.ndarray.arrays.memory.contexts.AutoAllocatorContext +import io.kinference.ndarray.extensions.constants.PrimitiveConstants +import io.kinference.primitives.annotations.GenerateNameFromPrimitives +import io.kinference.primitives.annotations.GeneratePrimitives +import io.kinference.primitives.types.* + +@GenerateNameFromPrimitives +internal fun AutoArrayHandlingStorage.getPrimitiveBlock(blocksNum: Int, blockSize: Int): Array { + return (storage[DataType.CurrentPrimitive.ordinal] as PrimitiveAutoHandlingArrayStorage).getBlock(blocksNum = blocksNum, blockSize = blockSize, limiter = limiter) +} + +@GenerateNameFromPrimitives +internal fun AutoAllocatorContext.getPrimitiveBlock(blocksNum: Int, blockSize: Int): Array { + return storage.getPrimitiveBlock(blocksNum = blocksNum, blockSize = blockSize) +} + +@GenerateNameFromPrimitives +internal fun getPrimitiveArraySizeInBytes(arraySize: Int): Long { + return PrimitiveConstants.SIZE_BYTES * arraySize +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt new file mode 100644 index 000000000..1c264be01 --- /dev/null +++ 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt @@ -0,0 +1,57 @@ +@file:GeneratePrimitives(DataType.ALL) +package io.kinference.ndarray.arrays.memory.storage + +import io.kinference.ndarray.INIT_STORAGE_SIZE +import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.PrimitiveNDArray +import io.kinference.ndarray.arrays.memory.MemoryManager +import io.kinference.ndarray.arrays.tiled.PrimitiveTiledArray +import io.kinference.ndarray.blockSizeByStrides +import io.kinference.ndarray.extensions.constants.PrimitiveConstants +import io.kinference.ndarray.extensions.utils.getOrPut +import io.kinference.primitives.annotations.* +import io.kinference.primitives.types.DataType +import io.kinference.primitives.types.PrimitiveArray +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap + +@GenerateNameFromPrimitives +internal class PrimitiveManualHandlingArrayStorage : TypedManualHandlingStorage { + private val storage = Int2ObjectOpenHashMap>(INIT_STORAGE_SIZE) + + companion object { + private val type = DataType.CurrentPrimitive + } + + override fun getNDArray(strides: Strides, fillZeros: Boolean, limiter: MemoryManager): MutableNDArrayCore { + val blockSize = blockSizeByStrides(strides) + val blocksNum = strides.linearSize / blockSize + val blocks = if (limiter.checkMemoryLimitAndAdd(getPrimitiveArraySizeInBytes(arraySize = blockSize * blocksNum))) { + val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } + Array(blocksNum) { + queue.removeFirstOrNull()?.apply { + fill(PrimitiveConstants.ZERO) + } ?: PrimitiveArray(blockSize) + } + } else { + Array(blocksNum) { PrimitiveArray(blockSize) } + } + + val tiled = PrimitiveTiledArray(blocks) + + return MutablePrimitiveNDArray(tiled, strides) + } + + override fun returnNDArray(ndarray: NDArrayCore) { + require(ndarray is PrimitiveNDArray) + val blockSize = ndarray.array.blockSize + val blocksNum = ndarray.array.blocksNum + + 
val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } + + queue.addAll(ndarray.array.blocks) + } + + override fun clear() { + storage.forEach { (_, queue) -> queue.clear() } + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt index eda58c092..eee93692c 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt @@ -4,8 +4,8 @@ package io.kinference.ndarray.arrays.tiled import io.kinference.ndarray.arrays.* -import io.kinference.ndarray.arrays.memory.* -import io.kinference.ndarray.arrays.memory.PrimitiveArrayContainer +import io.kinference.ndarray.arrays.memory.contexts.AutoAllocatorContext +import io.kinference.ndarray.arrays.memory.storage.* import io.kinference.ndarray.arrays.pointers.PrimitivePointer import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.blockSizeByStrides @@ -17,7 +17,7 @@ import kotlin.math.min @GenerateNameFromPrimitives @MakePublic -internal class PrimitiveTiledArray(val blocks: Array, val marker: Array = emptyMarker) { +internal class PrimitiveTiledArray(val blocks: Array) { val size: Int val blockSize: Int = if (blocks.isEmpty()) 0 else blocks.first().size val blocksNum: Int = blocks.size @@ -27,8 +27,7 @@ internal class PrimitiveTiledArray(val blocks: Array, val marker } companion object { - val type: ArrayTypes = ArrayTypes.valueOf(PrimitiveArray::class.simpleName!!) 
- private val emptyMarker: Array = arrayOf() + val type: DataType = DataType.CurrentPrimitive suspend operator fun invoke(strides: Strides): PrimitiveTiledArray { val blockSize = blockSizeByStrides(strides) @@ -60,15 +59,10 @@ internal class PrimitiveTiledArray(val blocks: Array, val marker require(size % blockSize == 0) { "Size must divide blockSize" } val blocksNum = if (blockSize == 0) 0 else size / blockSize + val blocks = coroutineContext[AutoAllocatorContext]?.getPrimitiveBlock(blocksNum, blockSize) + ?: Array(blocksNum) { PrimitiveArray(blockSize) } - val coroutineContext = coroutineContext[AllocatorContext.Key] - - // With array dispatcher - val containerArray = coroutineContext?.getArrayContainers(type, blockSize, blocksNum) ?: Array(blocksNum) { ArrayContainer(type, blockSize) } - val blocks = Array(containerArray.size) { i -> (containerArray[i] as PrimitiveArrayContainer).array } - val marker = Array(containerArray.size) { i -> containerArray[i].markAsOutput } - - return PrimitiveTiledArray(blocks, marker) + return PrimitiveTiledArray(blocks) } suspend operator fun invoke(size: Int, blockSize: Int, init: (InlineInt) -> PrimitiveType) : PrimitiveTiledArray { @@ -133,17 +127,17 @@ internal class PrimitiveTiledArray(val blocks: Array, val marker blocks[blockIdx][blockOff] = value } - suspend fun copyOf(): PrimitiveTiledArray { - val copyArray = PrimitiveTiledArray(size, blockSize) + fun copyOf(): PrimitiveTiledArray { + val copyBlocks = Array(blocksNum) { PrimitiveArray(blockSize) } for (blockNum in 0 until blocksNum) { val thisBlock = this.blocks[blockNum] - val destBlock = copyArray.blocks[blockNum] + val destBlock = copyBlocks[blockNum] thisBlock.copyInto(destBlock) } - return copyArray + return PrimitiveTiledArray(copyBlocks) } fun copyInto(dest: PrimitiveTiledArray, destOffset: Int = 0, srcStart: Int = 0, srcEnd: Int = size) { diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/broadcasting/Broadcasting.kt 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/broadcasting/Broadcasting.kt index bde7580d4..7e2e18551 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/broadcasting/Broadcasting.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/broadcasting/Broadcasting.kt @@ -17,7 +17,7 @@ fun unsqueezeFirst(shape: IntArray, newShapeSize: Int): IntArray { object Broadcasting { fun broadcastShapeForMatmul(leftShape: IntArray, rightShape: IntArray): IntArray { val actualLeftShape = if (leftShape.size == 1) intArrayOf(1, leftShape[0]) else leftShape - val actualRightShape = if (rightShape.size == 1) intArrayOf(1, rightShape[1]) else rightShape + val actualRightShape = if (rightShape.size == 1) intArrayOf(rightShape[0], 1) else rightShape val outputMatrixShape = intArrayOf(actualLeftShape[actualLeftShape.lastIndex - 1], actualRightShape.last()) val broadcastShape = broadcastShape(listOf(actualLeftShape.copyOfRange(0, actualLeftShape.size - 2), diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt index 21442ecda..cbf651bc0 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt @@ -8,6 +8,7 @@ import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.arrays.pointers.acceptWithRecursive import io.kinference.ndarray.stubs.* import io.kinference.ndarray.arrays.tiled.* +import io.kinference.ndarray.extensions.constants.PrimitiveConstants import io.kinference.primitives.annotations.* import io.kinference.primitives.types.* import io.kinference.utils.launchWithLimitOrDefault @@ -127,21 +128,17 @@ internal suspend fun PrimitiveNDArray.dotTransposedWithAlpha(alpha: Double, othe other as PrimitiveNDArray; 
destination as MutablePrimitiveNDArray val alpha = alpha.toPrimitive() - val dBlocksInRow = destination.blocksInRow val lrBlocksInRow = this.blocksInRow val n = this.shape[0] val t = this.shape[1] val m = other.shape[0] - val dBlockSize = destination.array.blockSize val lrBlockSize = this.array.blockSize - val destBlocks = destination.array.blocks val leftBlocks = this.array.blocks val rightBlocks = other.array.blocks val rowFlop = t * m - val zero = (0).toPrimitive() /* TODO: (dmitriyb) this is temporary commented. On GEC performance test we have large inputs that cause out of memory exceptions @@ -161,33 +158,26 @@ internal suspend fun PrimitiveNDArray.dotTransposedWithAlpha(alpha: Double, othe // TODO: (cupertank) Remove constants // TODO: (dmitriyb) Implement concurrent array retrieve with a separate structure from ArraysDispatcher parallelizeByRows(rowFlop, n, 262144) { nStart: Int, nEnd: Int, _ -> - val mSums = Array(m) { PrimitiveArray(lrBlockSize) } + val tempSum = PrimitiveArray(lrBlockSize) + val destPointer = destination.array.pointer() for (i in nStart until nEnd) { val leftBlockOffset = i * lrBlocksInRow val rightBlockIter = rightBlocks.iterator() - val destBlockOffset = i * dBlocksInRow + destPointer.linearIndex = i * m for (k in 0 until m) { - val tempArray = mSums[k] for (lrBlock in 0 until lrBlocksInRow) { val leftBlock = leftBlocks[leftBlockOffset + lrBlock] val rightBlock = rightBlockIter.next() - for (j in tempArray.indices) { - tempArray[j] += leftBlock[j] * rightBlock[j] + for (j in tempSum.indices) { + tempSum[j] += leftBlock[j] * rightBlock[j] } } - } - val mSumsIter = mSums.iterator() - for (destBlockNum in 0 until dBlocksInRow) { - val destBlock = destBlocks[destBlockOffset + destBlockNum] - for (j in destBlock.indices) { - val sumBlock = mSumsIter.next() - destBlock[j] = sumBlock.sum() * alpha - sumBlock.fill(zero) - } + destPointer.setAndIncrement(tempSum.sum() * alpha) + tempSum.fill(PrimitiveConstants.ZERO) } } } diff --git 
a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt index 90056a8bf..8fd770dd1 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt @@ -8,7 +8,6 @@ import io.kinference.primitives.annotations.GenerateNameFromPrimitives import io.kinference.primitives.annotations.GeneratePrimitives import io.kinference.primitives.types.DataType import io.kinference.primitives.types.PrimitiveType -import io.kinference.utils.inlines.InlineInt @GenerateNameFromPrimitives internal fun broadcastTwoTensorsPrimitive( @@ -45,16 +44,20 @@ internal fun broadcastTwoTensorsPrimitive( val rightBlocks = right.array.blocks val destBlocks = dest.array.blocks - val leftIsScalarFun = { leftOffset: InlineInt, rightOffset: InlineInt, destOffset: InlineInt, axisToBroadcastIdx: InlineInt -> - val shapeIdx = axisToBroadcastIdx.value * 2 + val leftIsScalarFun = ScalarBroadcastFun { leftOffset, rightOffset, destOffset, axisToBroadcastIdx -> + val shapeIdx = axisToBroadcastIdx * 2 val batchSize = destBroadcastingShape[shapeIdx] for (batchIdx in 0 until batchSize) { - val leftScalar = leftBlocks[leftOffset.value][0] + val leftBatchOffset = leftOffset + leftOffsets[shapeIdx] * batchIdx + val rightBatchOffset = rightOffset + rightOffsets[shapeIdx] * batchIdx + val destBatchOffset = destOffset + destOffsets[shapeIdx] * batchIdx + + val leftScalar = leftBlocks[leftBatchOffset][0] for (blockIdx in 0 until destBlocksInRow) { - val destBlock = destBlocks[destOffset.value + blockIdx] - val rightBlock = rightBlocks[rightOffset.value + blockIdx] + val destBlock = destBlocks[destBatchOffset + blockIdx] + val rightBlock = 
rightBlocks[rightBatchOffset + blockIdx] for (idx in destBlock.indices) { destBlock[idx] = op(leftScalar, rightBlock[idx]) @@ -63,16 +66,20 @@ internal fun broadcastTwoTensorsPrimitive( } } - val rightIsScalarFun = { leftOffset: InlineInt, rightOffset: InlineInt, destOffset: InlineInt, axisToBroadcastIdx: InlineInt -> - val shapeIdx = axisToBroadcastIdx.value * 2 + val rightIsScalarFun = ScalarBroadcastFun { leftOffset, rightOffset, destOffset, axisToBroadcastIdx -> + val shapeIdx = axisToBroadcastIdx * 2 val batchSize = destBroadcastingShape[shapeIdx] for (batchIdx in 0 until batchSize) { - val rightScalar = rightBlocks[rightOffset.value][0] + val leftBatchOffset = leftOffset + leftOffsets[shapeIdx] * batchIdx + val rightBatchOffset = rightOffset + rightOffsets[shapeIdx] * batchIdx + val destBatchOffset = destOffset + destOffsets[shapeIdx] * batchIdx + + val rightScalar = rightBlocks[rightBatchOffset][0] for (blockIdx in 0 until destBlocksInRow) { - val destBlock = destBlocks[destOffset.value + blockIdx] - val leftBlock = leftBlocks[leftOffset.value + blockIdx] + val destBlock = destBlocks[destBatchOffset + blockIdx] + val leftBlock = leftBlocks[leftBatchOffset + blockIdx] for (idx in destBlock.indices) { destBlock[idx] = op(leftBlock[idx], rightScalar) @@ -81,11 +88,11 @@ internal fun broadcastTwoTensorsPrimitive( } } - val defaultFun = { leftOffset: InlineInt, rightOffset: InlineInt, destOffset: InlineInt, axisToBroadcastIdx: InlineInt -> + val defaultFun = ScalarBroadcastFun { leftOffset, rightOffset, destOffset, _ -> for (blockIdx in 0 until destBlocksInRow) { - val leftBlock = leftBlocks[leftOffset.value + blockIdx] - val rightBlock = rightBlocks[rightOffset.value + blockIdx] - val destBlock = destBlocks[destOffset.value + blockIdx] + val leftBlock = leftBlocks[leftOffset + blockIdx] + val rightBlock = rightBlocks[rightOffset + blockIdx] + val destBlock = destBlocks[destOffset + blockIdx] for (idx in destBlock.indices) { destBlock[idx] = op(leftBlock[idx], 
rightBlock[idx]) @@ -93,7 +100,7 @@ internal fun broadcastTwoTensorsPrimitive( } } - val broadcastingFun = when { + val broadcastingFun: ScalarBroadcastFun = when { leftIsScalar -> leftIsScalarFun rightIsScalar -> rightIsScalarFun else -> defaultFun @@ -101,7 +108,7 @@ internal fun broadcastTwoTensorsPrimitive( fun broadcast(leftOffset: Int, rightOffset: Int, destOffset: Int, axisToBroadcastIdx: Int) { if (axisToBroadcastIdx == totalAxesToBroadcast) { - broadcastingFun(InlineInt(leftOffset), InlineInt(rightOffset), InlineInt(destOffset), InlineInt(axisToBroadcastIdx)) + broadcastingFun(leftOffset, rightOffset, destOffset, axisToBroadcastIdx) } else { val shapeIdx = axisToBroadcastIdx * 2 diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/ReshapeView.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/ReshapeView.kt index 43c6f672a..b980f530e 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/ReshapeView.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/ReshapeView.kt @@ -3,6 +3,10 @@ package io.kinference.ndarray.extensions.broadcasting import io.kinference.ndarray.arrays.NDArrayCore import io.kinference.ndarray.extensions.utils.calculateBlock +internal fun interface ScalarBroadcastFun { + operator fun invoke(leftOffset: Int, rightOffset: Int, destOffset: Int, axisToBroadcastIdx: Int) +} + internal data class BroadcastingInfo( val broadcastingShapes: Array, val broadcastingDestShape: IntArray, @@ -89,8 +93,6 @@ internal data class BroadcastingInfo( } } - - internal fun makeOffsets(shape: IntArray, blocksInRow: Int): IntArray { val offsets = IntArray(shape.size) offsets[offsets.lastIndex - 1] = blocksInRow diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt new file mode 100644 index 000000000..0bac99911 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt @@ -0,0 +1,7 @@ +package io.kinference.ndarray.extensions.constants + +object BooleanConstants { + const val ZERO = false + const val ONE = true + const val SIZE_BYTES = 1L +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/PrimitiveConstants.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/PrimitiveConstants.kt index e1edbef10..09aec0c9e 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/PrimitiveConstants.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/PrimitiveConstants.kt @@ -2,10 +2,9 @@ package io.kinference.ndarray.extensions.constants import io.kinference.primitives.annotations.* -import io.kinference.primitives.types.DataType -import io.kinference.primitives.types.toPrimitive import io.kinference.ndarray.toUShort import io.kinference.ndarray.toUByte +import io.kinference.primitives.types.* @GenerateNameFromPrimitives @@ -29,5 +28,7 @@ internal object PrimitiveConstants { val INV_ERF_COEF_1 = (4.330746750799873).toPrimitive() val INV_ERF_COEF_2 = (6.802721088435375).toPrimitive() + + val SIZE_BYTES = PrimitiveType.SIZE_BYTES.toLong() } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gather/PrimitiveGatherByBlocks.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gather/PrimitiveGatherByBlocks.kt index b40873787..9adb86a46 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gather/PrimitiveGatherByBlocks.kt +++ 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gather/PrimitiveGatherByBlocks.kt @@ -26,14 +26,11 @@ internal suspend fun gatherByBlocksPrimitive(array: PrimitiveNDArray, indices: N val dataToCopyBlocks = dataToCopySize / array.array.blockSize val dataBlocks = array.array.blocks - val dataMarkers = array.array.marker val destBatchBlocksOffset = indicesSize * dataToCopyBlocks val inputBatchBlockOffset = array.shape[actualAxis] * dataToCopyBlocks val destArray = arrayOfNulls(destBatchBlocksOffset * dataBatchSize) - val destMarkersArray = arrayOfNulls(destBatchBlocksOffset * dataBatchSize) - for (dataBatchNum in 0 until dataBatchSize) { val dataBlocksOffset = inputBatchBlockOffset * dataBatchNum @@ -46,12 +43,11 @@ internal suspend fun gatherByBlocksPrimitive(array: PrimitiveNDArray, indices: N for (blockIdx in 0 until dataToCopyBlocks) { destArray[destBlocksOffset + blockIdx] = dataBlocks[dataOffset + blockIdx] - destMarkersArray[destBlocksOffset + blockIdx] = dataMarkers[dataOffset + blockIdx] } destBlocksOffset += dataToCopyBlocks } } - return PrimitiveNDArray(PrimitiveTiledArray(destArray as Array, destMarkersArray as Array), Strides(destShape)) + return PrimitiveNDArray(PrimitiveTiledArray(destArray as Array), Strides(destShape)) } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGelu.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGelu.kt index 0636cb824..8dc2a6705 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGelu.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGelu.kt @@ -16,3 +16,17 @@ suspend fun biasGelu(input: NumberNDArrayCore, bias: NumberNDArrayCore): Mutable else -> error("BiasGelu operation supported only for FLOAT and DOUBLE tensors, actual types is ${input.type}") } } + +suspend fun biasGelu(input: NumberNDArrayCore, bias: NumberNDArrayCore, dest: 
MutableNumberNDArrayCore): MutableNumberNDArrayCore { + require(input.type == bias.type) + { "Input and Bias types should be equal, actual input type is ${input.type}, actual bias type is ${bias.type}" } + + require(input.type == DataType.FLOAT || input.type == DataType.DOUBLE) + { "BiasGelu operation supported only for FLOAT and DOUBLE tensors, actual types is ${input.type}" } + + return when(input.type) { + DataType.FLOAT -> computeGeluFloat(input as FloatNDArray, bias as FloatNDArray, dest as MutableFloatNDArray) + DataType.DOUBLE -> computeGeluDouble(input as DoubleNDArray, bias as DoubleNDArray, dest as MutableDoubleNDArray) + else -> error("BiasGelu operation supported only for FLOAT and DOUBLE tensors, actual types is ${input.type}") + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt index 577c3ea02..9ba08ddb0 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt @@ -3,8 +3,6 @@ package io.kinference.ndarray.extensions.gelu import io.kinference.ndarray.* import io.kinference.ndarray.arrays.* - -import io.kinference.ndarray.arrays.memory.PrimitiveArrayContainer import io.kinference.ndarray.arrays.tiled.PrimitiveTiledArray import io.kinference.ndarray.extensions.constants.PrimitiveConstants import io.kinference.ndarray.stubs.absoluteValue @@ -16,8 +14,7 @@ import io.kinference.primitives.types.* import kotlin.math.* @GenerateNameFromPrimitives -internal suspend fun computeGeluPrimitive(input: PrimitiveNDArray, bias: PrimitiveNDArray): MutablePrimitiveNDArray { - val output = MutablePrimitiveNDArray(input.strides) +internal suspend fun computeGeluPrimitive(input: PrimitiveNDArray, bias: PrimitiveNDArray, output: 
MutablePrimitiveNDArray): MutablePrimitiveNDArray { val inputBlocks = input.array.blocks val biasBlocks = bias.array.blocks @@ -81,3 +78,8 @@ internal suspend fun computeGeluPrimitive(input: PrimitiveNDArray, bias: Primiti return output } + +@GenerateNameFromPrimitives +internal suspend fun computeGeluPrimitive(input: PrimitiveNDArray, bias: PrimitiveNDArray): MutablePrimitiveNDArray { + return computeGeluPrimitive(input, bias, MutablePrimitiveNDArray(input.strides)) +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/FastGeluPrimitive.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/FastGeluPrimitive.kt index ffb626267..32a1e6e02 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/FastGeluPrimitive.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/FastGeluPrimitive.kt @@ -6,7 +6,6 @@ package io.kinference.ndarray.extensions.gelu import io.kinference.ndarray.arrays.* import io.kinference.ndarray.arrays.MutablePrimitiveNDArray import io.kinference.ndarray.arrays.PrimitiveNDArray -import io.kinference.ndarray.arrays.memory.PrimitiveArrayContainer import io.kinference.ndarray.arrays.tiled.PrimitiveTiledArray import io.kinference.ndarray.countCoroutinesByData import io.kinference.ndarray.parallelizeByBlocks diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/utils/Utils.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/utils/Utils.kt index 8c4e18063..fec73c0f9 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/utils/Utils.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/utils/Utils.kt @@ -1,5 +1,7 @@ package io.kinference.ndarray.extensions.utils +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap + /*** * Calculates the total size of the tensor with such shape. 
*/ @@ -50,3 +52,14 @@ internal fun computeColumnMajorIndex( internal fun isInPadding(actual: Int, bound: Int) : Boolean { return actual < 0 || actual >= bound } + +inline fun <V> Int2ObjectOpenHashMap<V>.getOrPut(key: Int, defaultValue: () -> V): V { + val existingValue = this[key] + return if (existingValue != null) { + existingValue + } else { + val value = defaultValue() + put(key, value) + value + } +} diff --git a/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/PredictionConfig.kt b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/PredictionConfig.kt new file mode 100644 index 000000000..1828b36bd --- /dev/null +++ b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/PredictionConfig.kt @@ -0,0 +1,67 @@ +package io.kinference.utils + +enum class AllocationMode { + NoAllocation, + Manual, + Auto; +} + +class PredictionConfig private constructor( + val parallelismLimit: Int, + val allocationMode: AllocationMode, + val memoryThreshold: Long, + val memoryClearingInterval: Long ) { + companion object { + operator fun invoke( + parallelismLimit: Int, + allocationMode: AllocationMode, + memoryThreshold: Long, + memoryClearingInterval: Long + ): PredictionConfig { + require(parallelismLimit in 1..PlatformUtils.cores) { + "Parallelism limit must be within the range of 1 to available CPU cores: ${PlatformUtils.cores}." + } + return if (allocationMode == AllocationMode.NoAllocation) { + PredictionConfig(parallelismLimit, allocationMode, 0L, Long.MAX_VALUE) + } else { + require(memoryThreshold > 0) { + "Memory threshold must be positive." + } + require(memoryClearingInterval > 0) { + "Memory clearing interval must be positive."
+ } + + PredictionConfig(parallelismLimit, allocationMode, memoryThreshold, memoryClearingInterval) + } + } +} + +object PredictionConfigs { + val DefaultAutoAllocator: PredictionConfig = PredictionConfig( + parallelismLimit = PlatformUtils.cores, + allocationMode = AllocationMode.Auto, + memoryThreshold = (PlatformUtils.maxHeap * 0.3).toLong(), + memoryClearingInterval = 500 + ) + val DefaultManualAllocator: PredictionConfig = PredictionConfig( + parallelismLimit = PlatformUtils.cores, + allocationMode = AllocationMode.Manual, + memoryThreshold = 50 * 1024 * 1024, + memoryClearingInterval = 500 + ) + val NoAllocator: PredictionConfig = PredictionConfig( + parallelismLimit = PlatformUtils.cores, + allocationMode = AllocationMode.NoAllocation, + memoryThreshold = 0L, + memoryClearingInterval = Long.MAX_VALUE + ) + + fun customPredictionConfig(parallelismLimit: Int, + allocationMode: AllocationMode, + memoryThreshold: Long, + memoryClearingInterval: Long): PredictionConfig { + return PredictionConfig(parallelismLimit, allocationMode, memoryThreshold, memoryClearingInterval) + } +} diff --git a/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt index 66b5cea95..b2d5b40a9 100644 --- a/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt +++ b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt @@ -1,8 +1,9 @@ +@file:OptIn(ExperimentalStdlibApi::class) package io.kinference.utils import kotlinx.coroutines.* import kotlinx.coroutines.channels.Channel -import kotlin.coroutines.CoroutineContext +import kotlin.coroutines.* object ResourcesDispatcher { private val tokenChannel = Channel<Unit>(capacity = PlatformUtils.cores) @@ -16,11 +17,46 @@ object ResourcesDispatcher { } } -class ParallelismLimiterContext(val dispatcher: CoroutineDispatcher) : CoroutineContext.Element { - companion object Key : 
CoroutineContext.Key<ParallelismLimiterContext> - override val key: CoroutineContext.Key<*> get() = Key +sealed class PredictionContext( + val dispatcher: CoroutineDispatcher +) : AbstractCoroutineContextElement(PredictionContext) { + companion object Key : CoroutineContext.Key<PredictionContext> + + override val key + get() = Key + + @OptIn(ExperimentalStdlibApi::class) + override fun <E : CoroutineContext.Element> get(key: CoroutineContext.Key<E>): E? = getPolymorphicElement(key) + + @OptIn(ExperimentalStdlibApi::class) + override fun minusKey(key: CoroutineContext.Key<*>): CoroutineContext = minusPolymorphicKey(key) +} + +interface ArrayStorage { + fun resetState() +} + +abstract class AllocatorContext<T : ArrayStorage>( + dispatcher: CoroutineDispatcher, + val storage: T +) : PredictionContext(dispatcher) { + companion object Key : AbstractCoroutineContextKey<PredictionContext, AllocatorContext<*>>( + PredictionContext.Key, + { it as? AllocatorContext<*> } + ) + + fun finalizeContext() { + storage.resetState() + } +} + +class NoAllocatorContext(dispatcher: CoroutineDispatcher) : PredictionContext(dispatcher) { + companion object Key : AbstractCoroutineContextKey<PredictionContext, NoAllocatorContext>( + PredictionContext.Key, + { it as? 
NoAllocatorContext } + ) } fun CoroutineScope.launchWithLimitOrDefault(block: suspend CoroutineScope.() -> Unit) { - this.launch(coroutineContext[ParallelismLimiterContext.Key]?.dispatcher ?: Dispatchers.Default, block = block) + this.launch(coroutineContext[PredictionContext]?.dispatcher ?: Dispatchers.Default, block = block) } diff --git a/utils/utils-testing/src/commonMain/kotlin/io.kinference/runners/PerformanceRunner.kt b/utils/utils-testing/src/commonMain/kotlin/io.kinference/runners/PerformanceRunner.kt index 6d8054dc2..5b288eac5 100644 --- a/utils/utils-testing/src/commonMain/kotlin/io.kinference/runners/PerformanceRunner.kt +++ b/utils/utils-testing/src/commonMain/kotlin/io.kinference/runners/PerformanceRunner.kt @@ -120,6 +120,7 @@ class PerformanceRunner<T : ONNXData<*, *>>(private val engine: TestEngine<T>) { for (result in results.sortedBy { it.name }) { logger.info { "Test ${result.name}: avg ${result.avg}, min ${result.min}, max ${result.max}" } } + logger.info { "Average between inputs: avg ${results.map { it.avg }.average()}, min ${results.minOfOrNull { it.min }}, max ${results.maxOfOrNull { it.max }}" } } companion object {