From 54ec016bf8794bf5145e8bcd522dcb3b96f71ac0 Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Mon, 5 Aug 2024 14:16:55 +0200 Subject: [PATCH 01/19] JBAI-4393 [core, ndarray] Edited an output allocation marking mechanism from special function referencing to simple output copying. --- .../jvmMain/kotlin/io/kinference.core/KIEngine.kt | 10 ---------- .../kotlin/io/kinference.core/data/map/KIONNXMap.kt | 6 +----- .../io/kinference.core/data/seq/KIONNXSequence.kt | 6 +----- .../io/kinference.core/data/tensor/KITensor.kt | 10 ++-------- .../kotlin/io/kinference.core/model/KIModel.kt | 7 +++---- .../operators/layer/attention/Attention.kt | 12 +----------- .../ndarray/arrays/ArrayDispatcherUtils.kt | 4 ---- .../io/kinference/ndarray/arrays/BooleanNDArray.kt | 6 +----- .../io/kinference/ndarray/arrays/PrimitiveNDArray.kt | 6 +----- .../ndarray/arrays/memory/AllocatorContext.kt | 2 +- .../ndarray/arrays/memory/ArrayContainer.kt | 7 ------- .../ndarray/arrays/tiled/PrimitiveTiledArray.kt | 6 ++---- .../extensions/gather/PrimitiveGatherByBlocks.kt | 6 +----- 13 files changed, 14 insertions(+), 74 deletions(-) diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt index 287094629..dba47b43d 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt @@ -24,16 +24,6 @@ import okio.Path.Companion.toPath typealias KIONNXData = ONNXData -// Define an interface for allocation control marking output -internal interface KIONNXDataArraysReleaser { - fun markOutput() -} - -internal fun KIONNXData.markOutput() { - if (this is KIONNXDataArraysReleaser) - this.markOutput() -} - object CoreBackend : BackendInfo(name = "KInference Core CPU Backend") /** diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/map/KIONNXMap.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/map/KIONNXMap.kt index f541c4c23..a1bbcf7eb 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/map/KIONNXMap.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/map/KIONNXMap.kt @@ -9,7 +9,7 @@ import io.kinference.protobuf.message.TensorProto import io.kinference.types.ValueInfo import io.kinference.types.ValueTypeInfo -class KIONNXMap(name: String?, data: Map>, val info: ValueTypeInfo.MapTypeInfo) : ONNXMap>, CoreBackend>(name, data), KIONNXDataArraysReleaser { +class KIONNXMap(name: String?, data: Map>, val info: ValueTypeInfo.MapTypeInfo) : ONNXMap>, CoreBackend>(name, data) { constructor(data: Map>, info: ValueInfo) : this(info.name, data, info.typeInfo as ValueTypeInfo.MapTypeInfo) override val backend = CoreBackend @@ -26,10 +26,6 @@ class KIONNXMap(name: String?, data: Map>, val info: ValueTyp override fun rename(name: String): KIONNXMap = KIONNXMap(name, data, info) - override fun markOutput() { - data.values.forEach { it.markOutput() } - } - override suspend fun clone(newName: String?): KIONNXMap { val newMap = HashMap>(data.size) for ((key, value) in data.entries) { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/seq/KIONNXSequence.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/seq/KIONNXSequence.kt index 24b52085c..49383fca0 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/seq/KIONNXSequence.kt +++ 
b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/seq/KIONNXSequence.kt @@ -7,7 +7,7 @@ import io.kinference.data.ONNXSequence import io.kinference.protobuf.message.SequenceProto import io.kinference.types.* -class KIONNXSequence(name: String?, data: List>, val info: ValueTypeInfo.SequenceTypeInfo) : ONNXSequence>, CoreBackend>(name, data), KIONNXDataArraysReleaser { +class KIONNXSequence(name: String?, data: List>, val info: ValueTypeInfo.SequenceTypeInfo) : ONNXSequence>, CoreBackend>(name, data) { constructor(name: String?, info: ValueTypeInfo.SequenceTypeInfo, size: Int, init: (Int) -> KIONNXData<*>) : this(name, List(size, init), info) constructor(data: List>, info: ValueInfo) : this(info.name, data, info.typeInfo as ValueTypeInfo.SequenceTypeInfo) @@ -23,10 +23,6 @@ class KIONNXSequence(name: String?, data: List>, val info: ValueTy override fun rename(name: String): KIONNXSequence = KIONNXSequence(name, data, info) - override fun markOutput() { - data.forEach { it.markOutput() } - } - val length: Int = data.size companion object { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt index 2c6de1a69..d1ca7c5f6 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt @@ -1,7 +1,6 @@ package io.kinference.core.data.tensor -import io.kinference.core.CoreBackend -import io.kinference.core.KIONNXDataArraysReleaser +import io.kinference.core.* import io.kinference.data.ONNXTensor import io.kinference.ndarray.arrays.* import io.kinference.ndarray.arrays.tiled.* @@ -13,7 +12,7 @@ import io.kinference.types.ValueTypeInfo //TODO: support segments //TODO: support external data -class KITensor(name: String?, override val data: NDArrayCore, val info: ValueTypeInfo.TensorTypeInfo) : ONNXTensor(name, data), KIONNXDataArraysReleaser { +class KITensor(name: String?, override val data: NDArrayCore, val info: ValueTypeInfo.TensorTypeInfo) : ONNXTensor(name, data) { constructor(data: NDArrayCore, info: ValueInfo) : this(info.name, data, info.typeInfo as ValueTypeInfo.TensorTypeInfo) override suspend fun close() { @@ -24,11 +23,6 @@ class KITensor(name: String?, override val data: NDArrayCore, val info: ValueTyp return KITensor(newName, data.clone(), info) } - override fun markOutput() { - if (this.data is MemoryControlledArray) - data.markOutput() - } - suspend operator fun minus(other: KITensor): KITensor { require(this.data is NumberNDArrayCore && other.data is NumberNDArrayCore) return (this.data - other.data).asTensor() diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt index 1e25b4ae1..2a1e50db7 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt @@ -1,8 +1,7 @@ package io.kinference.core.model -import io.kinference.core.KIONNXData +import io.kinference.core.* import io.kinference.core.graph.KIGraph -import io.kinference.core.markOutput import io.kinference.graph.Contexts import io.kinference.model.Model import io.kinference.ndarray.arrays.memory.* @@ -51,9 +50,9 @@ class KIModel( withContext(mixedContext) { val coroutineContext = 
coroutineContext[AllocatorContext.Key]!! val execResult = graph.execute(input, contexts) - execResult.forEach { it.markOutput() } + val copies = execResult.map { it.clone(it.name) }.toList() coroutineContext.closeAllocated() - execResult + copies } } finally { if (coreReserved) { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt index 6487c46c1..05b76119b 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt @@ -61,21 +61,16 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map - val resultMarker: Array if (past == null || past.linearSize == 0) { resultBlocks = kBlocks.plus(vBlocks) - resultMarker = kMarker.plus(vMarker) } else { val pastSeqLen = past.shape[3] presentDims[3] += pastSeqLen val pastBlocks = past.array.blocks - val pastMarker = past.array.marker val blocksInRow = headSize / past.array.blockSize @@ -84,35 +79,30 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map(2 * batchSize * numHeads * presentDims[3] * blocksInRow) - val futureResMarker = arrayOfNulls(2 * batchSize * numHeads * presentDims[3] * blocksInRow) var resBlockIdx = 0 var pastBlocIdx = 0 repeat(2) { presentKeyValueIdx -> val kvBlocks = if (presentKeyValueIdx == 0) kBlocks else vBlocks - val kvMarker = if (presentKeyValueIdx == 0) kMarker else vMarker var kvBlockIdx = 0 repeat(rowsSize) { pastBlocks.copyInto(futureRes, resBlockIdx, pastBlocIdx, pastBlocIdx + pastRowBlocksCount) - pastMarker.copyInto(futureResMarker, resBlockIdx, pastBlocIdx, pastBlocIdx + pastRowBlocksCount) resBlockIdx += pastRowBlocksCount pastBlocIdx += pastRowBlocksCount kvBlocks.copyInto(futureRes, resBlockIdx, kvBlockIdx, kvBlockIdx + kvRowBlocksCount) - kvMarker.copyInto(futureResMarker, resBlockIdx, kvBlockIdx, kvBlockIdx + kvRowBlocksCount) resBlockIdx += kvRowBlocksCount kvBlockIdx += kvRowBlocksCount } } resultBlocks = futureRes as Array - resultMarker = futureResMarker as Array } - return FloatNDArray(FloatTiledArray(resultBlocks, resultMarker), Strides(presentDims)) + return FloatNDArray(FloatTiledArray(resultBlocks), Strides(presentDims)) } diff --git a/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt b/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt index bea90d149..2da712ca3 100644 --- a/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt +++ b/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt @@ -15,7 +15,3 @@ enum class ArrayTypes(val index: Int, val size: Int) { DoubleArray(9, Double.SIZE_BYTES), BooleanArray(10, 1); } - -interface MemoryControlledArray { - fun markOutput() -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/BooleanNDArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/BooleanNDArray.kt index 5a4e758dc..3037028d3 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/BooleanNDArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/BooleanNDArray.kt @@ -21,7 +21,7 @@ fun interface BooleanBinaryOperation { operator fun invoke(first: Boolean, 
second: Boolean): Boolean } -open class BooleanNDArray(var array: BooleanTiledArray, strides: Strides) : NDArrayCore, MemoryControlledArray { +open class BooleanNDArray(var array: BooleanTiledArray, strides: Strides) : NDArrayCore { override val type: DataType = DataType.BOOLEAN final override var strides: Strides = strides @@ -79,10 +79,6 @@ open class BooleanNDArray(var array: BooleanTiledArray, strides: Strides) : NDAr return array.blocks[0][0] } - override fun markOutput() { - array.marker.forEach { it.invoke() } - } - override suspend fun toMutable(): MutableBooleanNDArray { return MutableBooleanNDArray(array.copyOf(), strides) } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/PrimitiveNDArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/PrimitiveNDArray.kt index 0b391f275..f1bd91b44 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/PrimitiveNDArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/PrimitiveNDArray.kt @@ -30,7 +30,7 @@ import kotlin.math.* @GenerateNameFromPrimitives @MakePublic -internal open class PrimitiveNDArray(array: PrimitiveTiledArray, strides: Strides) : NumberNDArrayCore, MemoryControlledArray { +internal open class PrimitiveNDArray(array: PrimitiveTiledArray, strides: Strides) : NumberNDArrayCore { var array: PrimitiveTiledArray = array protected set @@ -85,10 +85,6 @@ internal open class PrimitiveNDArray(array: PrimitiveTiledArray, strides: Stride return array.blocks[0][0] } - override fun markOutput() { - array.marker.forEach { it.invoke() } - } - override suspend fun clone(): PrimitiveNDArray { return PrimitiveNDArray(array.copyOf(), Strides(shape)) } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt index 2ed73f878..f6fd4f008 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt @@ -25,7 +25,7 @@ data class AllocatorContext internal constructor( fun closeAllocated() { usedContainers.forEach { - if (!it.isOutput && limiter.checkMemoryLimitAndAdd(it.sizeBytes.toLong())) { + if (limiter.checkMemoryLimitAndAdd(it.sizeBytes.toLong())) { unusedContainers[it.arrayTypeIndex, it.arraySizeIndex].addLast(it) } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt index d39ba62ba..8884fcfa1 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt @@ -7,13 +7,6 @@ sealed class ArrayContainer( val arraySizeIndex: Int, val sizeBytes: Int ) { - var isOutput: Boolean = false - private set - - val markAsOutput = { - isOutput = true - } - companion object { private const val EMPTY_INDEX = -1 diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt index eda58c092..a9863aadb 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt +++ 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt @@ -17,7 +17,7 @@ import kotlin.math.min @GenerateNameFromPrimitives @MakePublic -internal class PrimitiveTiledArray(val blocks: Array, val marker: Array = emptyMarker) { +internal class PrimitiveTiledArray(val blocks: Array) { val size: Int val blockSize: Int = if (blocks.isEmpty()) 0 else blocks.first().size val blocksNum: Int = blocks.size @@ -28,7 +28,6 @@ internal class PrimitiveTiledArray(val blocks: Array, val marker companion object { val type: ArrayTypes = ArrayTypes.valueOf(PrimitiveArray::class.simpleName!!) - private val emptyMarker: Array = arrayOf() suspend operator fun invoke(strides: Strides): PrimitiveTiledArray { val blockSize = blockSizeByStrides(strides) @@ -66,9 +65,8 @@ internal class PrimitiveTiledArray(val blocks: Array, val marker // With array dispatcher val containerArray = coroutineContext?.getArrayContainers(type, blockSize, blocksNum) ?: Array(blocksNum) { ArrayContainer(type, blockSize) } val blocks = Array(containerArray.size) { i -> (containerArray[i] as PrimitiveArrayContainer).array } - val marker = Array(containerArray.size) { i -> containerArray[i].markAsOutput } - return PrimitiveTiledArray(blocks, marker) + return PrimitiveTiledArray(blocks) } suspend operator fun invoke(size: Int, blockSize: Int, init: (InlineInt) -> PrimitiveType) : PrimitiveTiledArray { diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gather/PrimitiveGatherByBlocks.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gather/PrimitiveGatherByBlocks.kt index b40873787..9adb86a46 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gather/PrimitiveGatherByBlocks.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gather/PrimitiveGatherByBlocks.kt @@ -26,14 +26,11 @@ internal suspend fun gatherByBlocksPrimitive(array: PrimitiveNDArray, indices: N val dataToCopyBlocks = dataToCopySize / array.array.blockSize val dataBlocks = array.array.blocks - val dataMarkers = array.array.marker val destBatchBlocksOffset = indicesSize * dataToCopyBlocks val inputBatchBlockOffset = array.shape[actualAxis] * dataToCopyBlocks val destArray = arrayOfNulls(destBatchBlocksOffset * dataBatchSize) - val destMarkersArray = arrayOfNulls(destBatchBlocksOffset * dataBatchSize) - for (dataBatchNum in 0 until dataBatchSize) { val dataBlocksOffset = inputBatchBlockOffset * dataBatchNum @@ -46,12 +43,11 @@ internal suspend fun gatherByBlocksPrimitive(array: PrimitiveNDArray, indices: N for (blockIdx in 0 until dataToCopyBlocks) { destArray[destBlocksOffset + blockIdx] = dataBlocks[dataOffset + blockIdx] - destMarkersArray[destBlocksOffset + blockIdx] = dataMarkers[dataOffset + blockIdx] } destBlocksOffset += dataToCopyBlocks } } - return PrimitiveNDArray(PrimitiveTiledArray(destArray as Array, destMarkersArray as Array), Strides(destShape)) + return PrimitiveNDArray(PrimitiveTiledArray(destArray as Array), Strides(destShape)) } From 2d7c3108668abb208e09ce5d52291d3b3a65a6a6 Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Mon, 12 Aug 2024 11:45:01 +0200 Subject: [PATCH 02/19] JBAI-4393 [core, ndarray] Removed ArrayContainer and refactored memory management: use of primitive arrays directly. 
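Note for reviewers: the change below drops the ArrayContainer wrapper hierarchy and lets ArrayStorage track raw primitive arrays in two pools, one for arrays currently handed out and one for arrays ready for reuse, with moveUsedArrays() recycling everything at the end of a run. The following condensed sketch illustrates that two-pool idea for a single array type and block size; it is not the actual ArrayStorage code, and the class and method names here are illustrative only.

// Illustrative two-pool recycler (single type, single block size).
// The real ArrayStorage generalizes this over ArrayTypes and block sizes.
class SimpleFloatPool(private val blockSize: Int) {
    private val unused = ArrayDeque<FloatArray>()
    private val used = ArrayDeque<FloatArray>()

    // Borrow a zeroed block, reusing a previously returned one when possible.
    fun borrow(): FloatArray {
        val block = unused.removeFirstOrNull()?.also { it.fill(0f) } ?: FloatArray(blockSize)
        used.addLast(block)
        return block
    }

    // Counterpart of ArrayStorage.moveUsedArrays(): after an inference run,
    // every borrowed block becomes available again for the next run.
    fun recycleAll() {
        unused.addAll(used)
        used.clear()
    }
}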
--- .../ndarray/arrays/ArrayDispatcherUtils.kt | 2 - .../ndarray/arrays/memory/AllocatorContext.kt | 20 ++--- .../ndarray/arrays/memory/ArrayContainer.kt | 48 ----------- .../ndarray/arrays/memory/ArrayStorage.kt | 81 ++++++++++++++++--- .../arrays/memory/PrimitiveArrayContainer.kt | 17 ---- .../arrays/tiled/PrimitiveTiledArray.kt | 17 ++-- .../extensions/gelu/BiasGeluPrimitive.kt | 2 - .../extensions/gelu/FastGeluPrimitive.kt | 1 - .../runners/PerformanceRunner.kt | 1 + 9 files changed, 85 insertions(+), 104 deletions(-) delete mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt delete mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayContainer.kt diff --git a/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt b/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt index 2da712ca3..53f217c09 100644 --- a/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt +++ b/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt @@ -1,7 +1,5 @@ package io.kinference.ndarray.arrays -typealias StateMarker = () -> Unit - enum class ArrayTypes(val index: Int, val size: Int) { ByteArray(0, Byte.SIZE_BYTES), UByteArray(1, UByte.SIZE_BYTES), diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt index f6fd4f008..84d02017e 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt @@ -8,28 +8,24 @@ data class AllocatorContext internal constructor( private val limiter: MemoryLimiter, private val returnStorageFn: (ArrayStorage) -> Unit ) : CoroutineContext.Element { - private val usedContainers: ArrayDeque = ArrayDeque() companion object Key : CoroutineContext.Key override val key: CoroutineContext.Key<*> get() = Key - internal fun getArrayContainers(type: ArrayTypes, size: Int, count: Int): Array { + internal fun getArrayContainers(type: ArrayTypes, size: Int, count: Int): Array { return if (limiter !is NoAllocatorMemoryLimiter) { - val result = Array(count) { unusedContainers.getArrayContainer(type, size) } - usedContainers.addAll(result) - result + Array(count) { unusedContainers.getArrayContainer(type, size) } } else { - Array(count) { ArrayContainer(type, size) } + Array(count) { unusedContainers.create(type, size) } } } + fun closeOperator() { + unusedContainers.moveUsedArrays() + } + fun closeAllocated() { - usedContainers.forEach { - if (limiter.checkMemoryLimitAndAdd(it.sizeBytes.toLong())) { - unusedContainers[it.arrayTypeIndex, it.arraySizeIndex].addLast(it) - } - } - usedContainers.clear() + unusedContainers.moveUsedArrays() returnStorageFn(unusedContainers) } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt deleted file mode 100644 index 8884fcfa1..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayContainer.kt +++ /dev/null @@ -1,48 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.ndarray.arrays.* - 
-sealed class ArrayContainer( - val arrayTypeIndex: Int, - val arraySizeIndex: Int, - val sizeBytes: Int -) { - companion object { - private const val EMPTY_INDEX = -1 - - operator fun invoke(type: ArrayTypes, size: Int, sizeIndex: Int = EMPTY_INDEX): ArrayContainer { - val sizeBytes: Int = type.size * size - return when (type) { - ArrayTypes.ByteArray -> ByteArrayContainer(type.index, sizeIndex, sizeBytes, ByteArray(size)) // 8-bit signed - ArrayTypes.UByteArray -> UByteArrayContainer(type.index, sizeIndex, sizeBytes, UByteArray(size)) // 8-bit unsigned - ArrayTypes.ShortArray -> ShortArrayContainer(type.index, sizeIndex, sizeBytes, ShortArray(size)) // 16-bit signed - ArrayTypes.UShortArray -> UShortArrayContainer(type.index, sizeIndex, sizeBytes, UShortArray(size)) // 16-bit unsigned - ArrayTypes.IntArray -> IntArrayContainer(type.index, sizeIndex, sizeBytes, IntArray(size)) // 32-bit signed - ArrayTypes.UIntArray -> UIntArrayContainer(type.index, sizeIndex, sizeBytes, UIntArray(size)) // 32-bit unsigned - ArrayTypes.LongArray -> LongArrayContainer(type.index, sizeIndex, sizeBytes, LongArray(size)) // 64-bit signed - ArrayTypes.ULongArray -> ULongArrayContainer(type.index, sizeIndex, sizeBytes, ULongArray(size)) // 64-bit unsigned - ArrayTypes.FloatArray -> FloatArrayContainer(type.index, sizeIndex, sizeBytes, FloatArray(size)) - ArrayTypes.DoubleArray -> DoubleArrayContainer(type.index, sizeIndex, sizeBytes, DoubleArray(size)) - ArrayTypes.BooleanArray -> BooleanArrayContainer(type.index, sizeIndex, sizeBytes, BooleanArray(size)) - else -> throw IllegalArgumentException("Unsupported array type") - } - } - - fun resetArray(arrayContainer: ArrayContainer) { - when (arrayContainer) { - is ByteArrayContainer -> arrayContainer.array.fill(0) // 8-bit signed - is UByteArrayContainer -> arrayContainer.array.fill(0u) // 8-bit unsigned - is ShortArrayContainer -> arrayContainer.array.fill(0) // 16-bit signed - is UShortArrayContainer -> arrayContainer.array.fill(0u) // 16-bit unsigned - is IntArrayContainer -> arrayContainer.array.fill(0) // 32-bit signed - is UIntArrayContainer -> arrayContainer.array.fill(0u) // 32-bit unsigned - is LongArrayContainer -> arrayContainer.array.fill(0L) // 64-bit signed - is ULongArrayContainer -> arrayContainer.array.fill(0U) // 64-bit unsigned - is FloatArrayContainer -> arrayContainer.array.fill(0.0f) - is DoubleArrayContainer -> arrayContainer.array.fill(0.0) - is BooleanArrayContainer -> arrayContainer.array.fill(false) - else -> throw IllegalArgumentException("Unsupported array type") - } - } - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt index 00a98c0cb..859ad2654 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt @@ -9,32 +9,36 @@ internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limite * 2. Array by size. Starting with 'INIT_SIZE_VALUE' element and grow it doubling (typically there are no more than 16 different sizes) * 3. 
Queue of array containers (used as FIFO) */ - private var storage: Array>> = + private var storageUnused: Array>> = + Array(typeLength) { Array(sizeLength) { ArrayDeque() } } + + private var storageUsed: Array>> = Array(typeLength) { Array(sizeLength) { ArrayDeque() } } private var sizeIndices: IntArray = IntArray(typeLength) private var sizes: Array = Array(typeLength) { IntArray(sizeLength) } - operator fun get(typeIndex: Int, sizeIndex: Int): ArrayDeque { - return storage[typeIndex][sizeIndex] + operator fun get(typeIndex: Int, sizeIndex: Int): ArrayDeque { + return storageUnused[typeIndex][sizeIndex] } - fun getArrayContainer(type: ArrayTypes, size: Int): ArrayContainer { + fun getArrayContainer(type: ArrayTypes, size: Int): Any { val tIndex = type.index val sIndex = sizes[tIndex].indexOf(size) // Checking that we have this array size in our storage for this type val idx = if (sIndex != -1) { - val array = storage[tIndex][sIndex].removeFirstOrNull() + val array = storageUnused[tIndex][sIndex].removeFirstOrNull() array?.let { - ArrayContainer.resetArray(it) - limiter.deductMemory(it.sizeBytes.toLong()) + resetArray(it) + limiter.deductMemory((type.size * size).toLong()) + storageUsed[tIndex][sIndex].addLast(it) return it } sIndex } else { - if (sizeIndices[tIndex] >= storage[tIndex].size) + if (sizeIndices[tIndex] >= storageUnused[tIndex].size) grow(tIndex) val idx = sizeIndices[tIndex]++ @@ -42,18 +46,69 @@ internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limite idx } - return ArrayContainer(type, size, idx) + val array = create(type, size) + storageUsed[tIndex][idx].addLast(array) + + return array + } + + fun moveUsedArrays() { + storageUsed.forEachIndexed { typeIndex, arraysByType -> + arraysByType.forEachIndexed { sizeIndex, arrayDeque -> + arrayDeque.forEach { + storageUnused[typeIndex][sizeIndex].addLast(it) + } + arrayDeque.clear() + } + } } private fun grow(typeIndex: Int) { val newSize = sizes[typeIndex].size * 2 - val newStorage: Array> = Array(newSize) { ArrayDeque() } + val newStorageUnused: Array> = Array(newSize) { ArrayDeque() } + val newStorageUsed: Array> = Array(newSize) { ArrayDeque() } - for (i in storage[typeIndex].indices) { - newStorage[i] = storage[typeIndex][i] + for (i in storageUnused[typeIndex].indices) { + newStorageUnused[i] = storageUnused[typeIndex][i] + newStorageUsed[i] = storageUsed[typeIndex][i] } - storage[typeIndex] = newStorage + storageUnused[typeIndex] = newStorageUnused + storageUsed[typeIndex] = newStorageUsed sizes[typeIndex] = sizes[typeIndex].copyOf(newSize) } + + fun create(type: ArrayTypes, size: Int): Any { + return when (type) { + ArrayTypes.ByteArray -> ByteArray(size) // 8-bit signed + ArrayTypes.UByteArray -> UByteArray(size) // 8-bit unsigned + ArrayTypes.ShortArray -> ShortArray(size) // 16-bit signed + ArrayTypes.UShortArray -> UShortArray(size) // 16-bit unsigned + ArrayTypes.IntArray -> IntArray(size) // 32-bit signed + ArrayTypes.UIntArray -> UIntArray(size) // 32-bit unsigned + ArrayTypes.LongArray -> LongArray(size) // 64-bit signed + ArrayTypes.ULongArray -> ULongArray(size) // 64-bit unsigned + ArrayTypes.FloatArray -> FloatArray(size) + ArrayTypes.DoubleArray -> DoubleArray(size) + ArrayTypes.BooleanArray -> BooleanArray(size) + else -> throw IllegalArgumentException("Unsupported array type") + } + } + + private fun resetArray(array: Any) { + when (array) { + is ByteArray -> array.fill(0) // 8-bit signed + is UByteArray -> array.fill(0u) // 8-bit unsigned + is ShortArray -> array.fill(0) // 16-bit 
signed + is UShortArray -> array.fill(0u) // 16-bit unsigned + is IntArray -> array.fill(0) // 32-bit signed + is UIntArray -> array.fill(0u) // 32-bit unsigned + is LongArray -> array.fill(0L) // 64-bit signed + is ULongArray -> array.fill(0U) // 64-bit unsigned + is FloatArray -> array.fill(0.0f) + is DoubleArray -> array.fill(0.0) + is BooleanArray -> array.fill(false) + else -> throw IllegalArgumentException("Unsupported array type") + } + } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayContainer.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayContainer.kt deleted file mode 100644 index 8818345fe..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayContainer.kt +++ /dev/null @@ -1,17 +0,0 @@ -@file:GeneratePrimitives(DataType.ALL) -@file:Suppress("DuplicatedCode") - -package io.kinference.ndarray.arrays.memory - -import io.kinference.primitives.annotations.GenerateNameFromPrimitives -import io.kinference.primitives.annotations.GeneratePrimitives -import io.kinference.primitives.types.DataType -import io.kinference.primitives.types.PrimitiveArray - -@GenerateNameFromPrimitives -internal class PrimitiveArrayContainer( - arrayTypeIndex: Int, - arraySizeIndex: Int, - sizeBytes: Int, - val array: PrimitiveArray -) : ArrayContainer(arrayTypeIndex, arraySizeIndex, sizeBytes) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt index a9863aadb..e6ad36001 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt @@ -5,7 +5,6 @@ package io.kinference.ndarray.arrays.tiled import io.kinference.ndarray.arrays.* import io.kinference.ndarray.arrays.memory.* -import io.kinference.ndarray.arrays.memory.PrimitiveArrayContainer import io.kinference.ndarray.arrays.pointers.PrimitivePointer import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.blockSizeByStrides @@ -61,12 +60,9 @@ internal class PrimitiveTiledArray(val blocks: Array) { val blocksNum = if (blockSize == 0) 0 else size / blockSize val coroutineContext = coroutineContext[AllocatorContext.Key] + val blocks = coroutineContext?.getArrayContainers(type, blockSize, blocksNum) ?: Array(blocksNum) { PrimitiveArray(blockSize) } - // With array dispatcher - val containerArray = coroutineContext?.getArrayContainers(type, blockSize, blocksNum) ?: Array(blocksNum) { ArrayContainer(type, blockSize) } - val blocks = Array(containerArray.size) { i -> (containerArray[i] as PrimitiveArrayContainer).array } - - return PrimitiveTiledArray(blocks) + return PrimitiveTiledArray(blocks.map { it as PrimitiveArray }.toTypedArray()) } suspend operator fun invoke(size: Int, blockSize: Int, init: (InlineInt) -> PrimitiveType) : PrimitiveTiledArray { @@ -132,16 +128,19 @@ internal class PrimitiveTiledArray(val blocks: Array) { } suspend fun copyOf(): PrimitiveTiledArray { - val copyArray = PrimitiveTiledArray(size, blockSize) +// val copyArray = PrimitiveTiledArray(size, blockSize) + val copyBlocks = Array(blocksNum) { PrimitiveArray(blockSize) } for (blockNum in 0 until blocksNum) { val thisBlock = this.blocks[blockNum] - val destBlock = copyArray.blocks[blockNum] +// val 
destBlock = copyArray.blocks[blockNum] + val destBlock = copyBlocks[blockNum] thisBlock.copyInto(destBlock) } - return copyArray +// return copyArray + return PrimitiveTiledArray(copyBlocks) } fun copyInto(dest: PrimitiveTiledArray, destOffset: Int = 0, srcStart: Int = 0, srcEnd: Int = size) { diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt index 577c3ea02..bc14a927a 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt @@ -3,8 +3,6 @@ package io.kinference.ndarray.extensions.gelu import io.kinference.ndarray.* import io.kinference.ndarray.arrays.* - -import io.kinference.ndarray.arrays.memory.PrimitiveArrayContainer import io.kinference.ndarray.arrays.tiled.PrimitiveTiledArray import io.kinference.ndarray.extensions.constants.PrimitiveConstants import io.kinference.ndarray.stubs.absoluteValue diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/FastGeluPrimitive.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/FastGeluPrimitive.kt index ffb626267..32a1e6e02 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/FastGeluPrimitive.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/FastGeluPrimitive.kt @@ -6,7 +6,6 @@ package io.kinference.ndarray.extensions.gelu import io.kinference.ndarray.arrays.* import io.kinference.ndarray.arrays.MutablePrimitiveNDArray import io.kinference.ndarray.arrays.PrimitiveNDArray -import io.kinference.ndarray.arrays.memory.PrimitiveArrayContainer import io.kinference.ndarray.arrays.tiled.PrimitiveTiledArray import io.kinference.ndarray.countCoroutinesByData import io.kinference.ndarray.parallelizeByBlocks diff --git a/utils/utils-testing/src/commonMain/kotlin/io.kinference/runners/PerformanceRunner.kt b/utils/utils-testing/src/commonMain/kotlin/io.kinference/runners/PerformanceRunner.kt index 6d8054dc2..5b288eac5 100644 --- a/utils/utils-testing/src/commonMain/kotlin/io.kinference/runners/PerformanceRunner.kt +++ b/utils/utils-testing/src/commonMain/kotlin/io.kinference/runners/PerformanceRunner.kt @@ -120,6 +120,7 @@ class PerformanceRunner>(private val engine: TestEngine) { for (result in results.sortedBy { it.name }) { logger.info { "Test ${result.name}: avg ${result.avg}, min ${result.min}, max ${result.max}" } } + logger.info { "Average between inputs: avg ${results.map { it.avg }.average()}, min ${results.minOfOrNull { it.min }}, max ${results.maxOfOrNull { it.max }}" } } companion object { From 9caf75c01947c03b2afd58fbf9ebdebad148602a Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Thu, 15 Aug 2024 22:07:08 +0200 Subject: [PATCH 03/19] JBAI-4393 [core, ndarray] Refactored memory management and array handling: streamlined array type handling and improved memory limit checks within create and reset methods; KIModel predict improved for NoAllocator case. 
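Note for reviewers: the central behavioural change below is that the memory budget check now happens when an array is requested. If checkMemoryLimitAndAdd accepts the size, the array comes from (and goes back into) the pool; if the budget is exhausted, a plain array is allocated and left to the garbage collector, and resetLimit() clears the accounting after each run. The sketch below is a hedged, self-contained illustration of that gating logic only; the class and method names here are invented, and the real code lives in ArrayStorage and BaseMemoryLimiter and uses kotlinx-atomicfu rather than java.util.concurrent.

import java.util.concurrent.atomic.AtomicLong

// Illustration of limiter-gated pooling: arrays are only tracked and reused
// while the per-run byte budget allows it. Return-to-pool bookkeeping
// (the "used" list) is omitted here for brevity.
class BudgetedFloatPool(private val memoryLimit: Long, private val blockSize: Int) {
    private val usedMemory = AtomicLong(0L)
    private val pool = ArrayDeque<FloatArray>()

    private fun tryReserve(bytes: Long): Boolean {
        while (true) {
            val current = usedMemory.get()
            if (current + bytes > memoryLimit) return false
            if (usedMemory.compareAndSet(current, current + bytes)) return true
        }
    }

    fun request(): FloatArray {
        val bytes = blockSize.toLong() * Float.SIZE_BYTES
        return if (tryReserve(bytes)) {
            pool.removeFirstOrNull()?.also { it.fill(0f) } ?: FloatArray(blockSize)
        } else {
            FloatArray(blockSize) // over budget: not pooled, reclaimed by the GC
        }
    }

    // Counterpart of MemoryLimiter.resetLimit(): called once per inference run.
    fun resetRun() = usedMemory.set(0L)
}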
--- .../io/kinference.core/model/KIModel.kt | 26 +++-- .../ndarray/arrays/ArrayDispatcherUtils.kt | 26 +++-- .../ndarray/arrays/memory/AllocatorContext.kt | 10 +- .../ndarray/arrays/memory/ArrayStorage.kt | 109 ++++++++++-------- .../ndarray/arrays/memory/MemoryLimiter.kt | 40 +++---- .../arrays/tiled/PrimitiveTiledArray.kt | 7 +- 6 files changed, 106 insertions(+), 112 deletions(-) diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt index 2a1e50db7..5aecb3ce6 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt @@ -17,14 +17,14 @@ class KIModel( val name: String, val opSet: OperatorSetRegistry, val graph: KIGraph, - memoryLimiter: MemoryLimiter = MemoryLimiters.NoAllocator, + private val memoryLimiter: MemoryLimiter = MemoryLimiters.NoAllocator, parallelismLimit: Int = PlatformUtils.cores, ) : Model>, Profilable, Cacheable { private val profiles: MutableList = ArrayList() @OptIn(ExperimentalCoroutinesApi::class) private val dispatcher: CoroutineDispatcher = Dispatchers.Default.limitedParallelism(parallelismLimit) - private val modelArrayStorage: ModelArrayStorage = ModelArrayStorage(memoryLimiter) + private val modelArrayStorage: ModelArrayStorage = ModelArrayStorage(MemoryLimiters.Default) override fun addProfilingContext(name: String): ProfilingContext = ProfilingContext(name).apply { profiles.add(this) } override fun analyzeProfilingResults(): ProfileAnalysisEntry = profiles.analyze("Model $name") @@ -44,15 +44,21 @@ class KIModel( coreReserved = true } - val allocatorContext = modelArrayStorage.createAllocatorContext() - val mixedContext = allocatorContext + limiterContext + if (memoryLimiter == MemoryLimiters.NoAllocator) { + withContext(limiterContext) { + return@withContext graph.execute(input, contexts) + } + } else { + val allocatorContext = modelArrayStorage.createAllocatorContext() + val mixedContext = allocatorContext + limiterContext - withContext(mixedContext) { - val coroutineContext = coroutineContext[AllocatorContext.Key]!! - val execResult = graph.execute(input, contexts) - val copies = execResult.map { it.clone(it.name) }.toList() - coroutineContext.closeAllocated() - copies + withContext(mixedContext) { + val coroutineContext = coroutineContext[AllocatorContext.Key]!! 
+ val execResult = graph.execute(input, contexts) + val copies = execResult.map { it.clone(it.name) }.toList() + coroutineContext.closeAllocated() + return@withContext copies + } } } finally { if (coreReserved) { diff --git a/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt b/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt index 53f217c09..c52d6ffd8 100644 --- a/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt +++ b/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt @@ -1,15 +1,19 @@ package io.kinference.ndarray.arrays enum class ArrayTypes(val index: Int, val size: Int) { - ByteArray(0, Byte.SIZE_BYTES), - UByteArray(1, UByte.SIZE_BYTES), - ShortArray(2, Short.SIZE_BYTES), - UShortArray(3, UShort.SIZE_BYTES), - IntArray(4, Int.SIZE_BYTES), - UIntArray(5, UInt.SIZE_BYTES), - LongArray(6, Long.SIZE_BYTES), - ULongArray(7, ULong.SIZE_BYTES), - FloatArray(8, Float.SIZE_BYTES), - DoubleArray(9, Double.SIZE_BYTES), - BooleanArray(10, 1); + ByteArrayType(0, Byte.SIZE_BYTES), + UByteArrayType(1, UByte.SIZE_BYTES), + ShortArrayType(2, Short.SIZE_BYTES), + UShortArrayType(3, UShort.SIZE_BYTES), + IntArrayType(4, Int.SIZE_BYTES), + UIntArrayType(5, UInt.SIZE_BYTES), + LongArrayType(6, Long.SIZE_BYTES), + ULongArrayType(7, ULong.SIZE_BYTES), + FloatArrayType(8, Float.SIZE_BYTES), + DoubleArrayType(9, Double.SIZE_BYTES), + BooleanArrayType(10, 1); + + companion object { + fun sizeInBytes(index: Int, arraySize: Int): Long = entries[index].size * arraySize.toLong() + } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt index 84d02017e..7c5286a41 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt @@ -13,15 +13,7 @@ data class AllocatorContext internal constructor( override val key: CoroutineContext.Key<*> get() = Key internal fun getArrayContainers(type: ArrayTypes, size: Int, count: Int): Array { - return if (limiter !is NoAllocatorMemoryLimiter) { - Array(count) { unusedContainers.getArrayContainer(type, size) } - } else { - Array(count) { unusedContainers.create(type, size) } - } - } - - fun closeOperator() { - unusedContainers.moveUsedArrays() + return Array(count) { unusedContainers.getArrayContainer(type, size) } } fun closeAllocated() { diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt index 859ad2654..782c121c2 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt @@ -4,6 +4,8 @@ import io.kinference.ndarray.arrays.ArrayTypes internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limiter: MemoryLimiter) { /** + * This is a storage for arrays which are available for retrieving + * * Structure is as follows: * 1. Array by predefined types (all types are known compiled time) * 2. Array by size. 
Starting with 'INIT_SIZE_VALUE' element and grow it doubling (typically there are no more than 16 different sizes) @@ -12,30 +14,52 @@ internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limite private var storageUnused: Array>> = Array(typeLength) { Array(sizeLength) { ArrayDeque() } } + /** + * This is a storage for arrays which are currently in use. + * They should be moved back into unused storage when there is no need for them. + * + * Structure is as follows: + * 1. Array by predefined types (all types are known compiled time) + * 2. Array by size. + * Starting with 'INIT_SIZE_VALUE' element and grow it doubling (typically there are no more than 16 different sizes) + * 3. Queue of array containers (used as FIFO) + */ private var storageUsed: Array>> = Array(typeLength) { Array(sizeLength) { ArrayDeque() } } private var sizeIndices: IntArray = IntArray(typeLength) private var sizes: Array = Array(typeLength) { IntArray(sizeLength) } + internal fun getArrayContainer(type: ArrayTypes, size: Int): Any { + return if (limiter.checkMemoryLimitAndAdd(ArrayTypes.sizeInBytes(type.index, size))) { + val tIndex = type.index + val sIndex = getSizeIndex(tIndex, size) + val array = storageUnused[tIndex][sIndex].removeFirstOrNull()?.also(::resetArray) + ?: create(type, size) - operator fun get(typeIndex: Int, sizeIndex: Int): ArrayDeque { - return storageUnused[typeIndex][sizeIndex] + storageUsed[tIndex][sIndex].addLast(array) + array + } else { + create(type, size) + } } - fun getArrayContainer(type: ArrayTypes, size: Int): Any { - val tIndex = type.index + internal fun moveUsedArrays() { + storageUsed.forEachIndexed { typeIndex, arraysByType -> + arraysByType.forEachIndexed { sizeIndex, arrayDeque -> + arrayDeque.forEach { + storageUnused[typeIndex][sizeIndex].addLast(it) + } + arrayDeque.clear() + } + } + limiter.resetLimit() + } + + private fun getSizeIndex(tIndex: Int, size: Int): Int { val sIndex = sizes[tIndex].indexOf(size) - // Checking that we have this array size in our storage for this type - val idx = if (sIndex != -1) { - val array = storageUnused[tIndex][sIndex].removeFirstOrNull() - array?.let { - resetArray(it) - limiter.deductMemory((type.size * size).toLong()) - storageUsed[tIndex][sIndex].addLast(it) - return it - } + return if (sIndex != -1) { sIndex } else { if (sizeIndices[tIndex] >= storageUnused[tIndex].size) @@ -45,22 +69,6 @@ internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limite sizes[tIndex][idx] = size idx } - - val array = create(type, size) - storageUsed[tIndex][idx].addLast(array) - - return array - } - - fun moveUsedArrays() { - storageUsed.forEachIndexed { typeIndex, arraysByType -> - arraysByType.forEachIndexed { sizeIndex, arrayDeque -> - arrayDeque.forEach { - storageUnused[typeIndex][sizeIndex].addLast(it) - } - arrayDeque.clear() - } - } } private fun grow(typeIndex: Int) { @@ -78,37 +86,36 @@ internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limite sizes[typeIndex] = sizes[typeIndex].copyOf(newSize) } - fun create(type: ArrayTypes, size: Int): Any { + private fun create(type: ArrayTypes, size: Int): Any { return when (type) { - ArrayTypes.ByteArray -> ByteArray(size) // 8-bit signed - ArrayTypes.UByteArray -> UByteArray(size) // 8-bit unsigned - ArrayTypes.ShortArray -> ShortArray(size) // 16-bit signed - ArrayTypes.UShortArray -> UShortArray(size) // 16-bit unsigned - ArrayTypes.IntArray -> IntArray(size) // 32-bit signed - ArrayTypes.UIntArray -> UIntArray(size) // 32-bit unsigned - 
ArrayTypes.LongArray -> LongArray(size) // 64-bit signed - ArrayTypes.ULongArray -> ULongArray(size) // 64-bit unsigned - ArrayTypes.FloatArray -> FloatArray(size) - ArrayTypes.DoubleArray -> DoubleArray(size) - ArrayTypes.BooleanArray -> BooleanArray(size) + ArrayTypes.ByteArrayType -> ByteArray(size) // 8-bit signed + ArrayTypes.UByteArrayType -> UByteArray(size) // 8-bit unsigned + ArrayTypes.ShortArrayType -> ShortArray(size) // 16-bit signed + ArrayTypes.UShortArrayType -> UShortArray(size) // 16-bit unsigned + ArrayTypes.IntArrayType -> IntArray(size) // 32-bit signed + ArrayTypes.UIntArrayType -> UIntArray(size) // 32-bit unsigned + ArrayTypes.LongArrayType -> LongArray(size) // 64-bit signed + ArrayTypes.ULongArrayType -> ULongArray(size) // 64-bit unsigned + ArrayTypes.FloatArrayType -> FloatArray(size) + ArrayTypes.DoubleArrayType -> DoubleArray(size) + ArrayTypes.BooleanArrayType -> BooleanArray(size) else -> throw IllegalArgumentException("Unsupported array type") } } - private fun resetArray(array: Any) { + private fun resetArray(array: Any): Unit = when (array) { - is ByteArray -> array.fill(0) // 8-bit signed - is UByteArray -> array.fill(0u) // 8-bit unsigned - is ShortArray -> array.fill(0) // 16-bit signed - is UShortArray -> array.fill(0u) // 16-bit unsigned - is IntArray -> array.fill(0) // 32-bit signed - is UIntArray -> array.fill(0u) // 32-bit unsigned - is LongArray -> array.fill(0L) // 64-bit signed - is ULongArray -> array.fill(0U) // 64-bit unsigned + is ByteArray -> array.fill(0) // 8-bit signed + is UByteArray -> array.fill(0u) // 8-bit unsigned + is ShortArray -> array.fill(0) // 16-bit signed + is UShortArray -> array.fill(0u) // 16-bit unsigned + is IntArray -> array.fill(0) // 32-bit signed + is UIntArray -> array.fill(0u) // 32-bit unsigned + is LongArray -> array.fill(0L) // 64-bit signed + is ULongArray -> array.fill(0U) // 64-bit unsigned is FloatArray -> array.fill(0.0f) is DoubleArray -> array.fill(0.0) is BooleanArray -> array.fill(false) else -> throw IllegalArgumentException("Unsupported array type") } - } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt index 775c0a895..ebb86fd8c 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt @@ -1,12 +1,11 @@ package io.kinference.ndarray.arrays.memory import io.kinference.utils.PlatformUtils -import kotlinx.atomicfu.AtomicLong -import kotlinx.atomicfu.atomic +import kotlinx.atomicfu.* interface MemoryLimiter { /** - * Checks if the memory limit allows adding the specified amount of memory and performs the addition. + * Checks if the memory limit allows adding the specified amount of memory and performs the addition * * @param added the memory in bytes to add * @return true if the memory was added successfully and false if adding the memory exceeds the memory limit @@ -14,44 +13,33 @@ interface MemoryLimiter { fun checkMemoryLimitAndAdd(added: Long): Boolean /** - * Deducts the specified amount of memory from the memory limiter. 
- * - * @param deducted the memory in bytes to deduct from the memory limiter + * Resets the used memory into 0L */ - fun deductMemory(deducted: Long) + fun resetLimit() } -class BaseMemoryLimiter(private val memoryLimit: Long) : MemoryLimiter { +class BaseMemoryLimiter internal constructor(private val memoryLimit: Long) : MemoryLimiter { private var usedMemory: AtomicLong = atomic(0L) override fun checkMemoryLimitAndAdd(added: Long): Boolean { - val currentMemory = usedMemory.addAndGet(added) - return if (currentMemory > memoryLimit) { - usedMemory.addAndGet(-added) - false - } else true + // Attempt to add memory and check the limit + val successful = usedMemory.getAndUpdate { current -> + if (current + added > memoryLimit) current else current + added + } != usedMemory.value // Check if the update was successful + + return successful } - override fun deductMemory(deducted: Long) { - usedMemory.addAndGet(-deducted) + override fun resetLimit() { + usedMemory.value = 0L } } object MemoryLimiters { val Default: MemoryLimiter = BaseMemoryLimiter((PlatformUtils.maxHeap * 0.3).toLong()) - val NoAllocator: MemoryLimiter = NoAllocatorMemoryLimiter + val NoAllocator: MemoryLimiter = BaseMemoryLimiter(0L) fun customLimiter(memoryLimit: Long): MemoryLimiter { return BaseMemoryLimiter(memoryLimit) } } - -internal object NoAllocatorMemoryLimiter : MemoryLimiter { - override fun checkMemoryLimitAndAdd(added: Long): Boolean { - return false - } - - override fun deductMemory(deducted: Long) { - - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt index e6ad36001..07cbf57f1 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt @@ -26,7 +26,7 @@ internal class PrimitiveTiledArray(val blocks: Array) { } companion object { - val type: ArrayTypes = ArrayTypes.valueOf(PrimitiveArray::class.simpleName!!) + val type: ArrayTypes = ArrayTypes.valueOf(PrimitiveArray::class.simpleName!! + "Type") suspend operator fun invoke(strides: Strides): PrimitiveTiledArray { val blockSize = blockSizeByStrides(strides) @@ -127,19 +127,16 @@ internal class PrimitiveTiledArray(val blocks: Array) { blocks[blockIdx][blockOff] = value } - suspend fun copyOf(): PrimitiveTiledArray { -// val copyArray = PrimitiveTiledArray(size, blockSize) + fun copyOf(): PrimitiveTiledArray { val copyBlocks = Array(blocksNum) { PrimitiveArray(blockSize) } for (blockNum in 0 until blocksNum) { val thisBlock = this.blocks[blockNum] -// val destBlock = copyArray.blocks[blockNum] val destBlock = copyBlocks[blockNum] thisBlock.copyInto(destBlock) } -// return copyArray return PrimitiveTiledArray(copyBlocks) } From f1a929665a1af27ab7c465e6d072755fc2fab6d4 Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Mon, 19 Aug 2024 18:52:03 +0200 Subject: [PATCH 04/19] JBAI-4393 [core, ndarray] Refactored memory management and array handling: added new type for limiter which works with manually managed ndarrays, added manual ndarray handling in Attention and TensorExtensions, moved to use standard DataType enum instead of ArrayTypes. 
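Note for reviewers: the manual-allocation pattern added below works as follows. Operators ask an optional ManualAllocatorContext for a pre-sized NDArray via getNDArray(type, strides, fillZeros = true) and fall back to allocateNDArray when no context is installed; the resulting array is wrapped into a KITensor together with the context via asTensor(name, context), and KITensor.close() hands the array back through returnNDArray. A minimal usage sketch assuming exactly those calls from the diff follows; the helper name makePooledOutput and the inferred intermediate types are hypothetical.

import io.kinference.core.data.tensor.*
import io.kinference.ndarray.arrays.*
import io.kinference.ndarray.arrays.memory.ManualAllocatorContext
import io.kinference.ndarray.extensions.allocateNDArray
import io.kinference.primitives.types.DataType

// Borrow an output buffer from the manual allocator context when present,
// otherwise allocate it normally, and tie its lifetime to the produced tensor
// so that KITensor.close() can return the buffer via returnNDArray().
suspend fun makePooledOutput(
    context: ManualAllocatorContext?,   // null when manual allocation is disabled
    type: DataType,
    strides: Strides,
    name: String? = null
): KITensor {
    val output = context?.getNDArray(type, strides, fillZeros = true)
        ?: allocateNDArray(type, strides)
    return output.asTensor(name, context)
}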
--- .../kinference.core/data/tensor/KITensor.kt | 6 +- .../data/tensor/TensorExtensions.kt | 5 +- .../io/kinference.core/model/KIModel.kt | 37 +++-- .../operators/layer/attention/Attention.kt | 62 +++++--- .../ndarray/arrays/ArrayDispatcherUtils.kt | 19 --- .../ndarray/arrays/memory/AllocatorContext.kt | 23 --- .../ndarray/arrays/memory/ArrayStorage.kt | 150 ++++++++++-------- .../arrays/memory/AutoAllocatorContext.kt | 23 +++ .../arrays/memory/ManualAllocatorContext.kt | 53 +++++++ .../ndarray/arrays/memory/MemoryLimiter.kt | 35 +++- .../arrays/memory/ModelArrayStorage.kt | 23 +-- .../memory/PrimitiveArrayStorageWrapper.kt | 30 ++++ .../arrays/tiled/PrimitiveTiledArray.kt | 6 +- .../extensions/constants/BooleanConstants.kt | 6 + 14 files changed, 317 insertions(+), 161 deletions(-) delete mode 100644 ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt delete mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AutoAllocatorContext.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt index d1ca7c5f6..cdf96e0e1 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt @@ -3,6 +3,7 @@ package io.kinference.core.data.tensor import io.kinference.core.* import io.kinference.data.ONNXTensor import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.ManualAllocatorContext import io.kinference.ndarray.arrays.tiled.* import io.kinference.protobuf.FLOAT_TENSOR_TYPES import io.kinference.protobuf.message.TensorProto @@ -12,10 +13,11 @@ import io.kinference.types.ValueTypeInfo //TODO: support segments //TODO: support external data -class KITensor(name: String?, override val data: NDArrayCore, val info: ValueTypeInfo.TensorTypeInfo) : ONNXTensor(name, data) { +class KITensor(name: String?, override val data: NDArrayCore, val info: ValueTypeInfo.TensorTypeInfo, private var context: ManualAllocatorContext? 
= null) : ONNXTensor(name, data) { constructor(data: NDArrayCore, info: ValueInfo) : this(info.name, data, info.typeInfo as ValueTypeInfo.TensorTypeInfo) override suspend fun close() { + context?.returnNDArray(data) data.close() } @@ -41,7 +43,7 @@ class KITensor(name: String?, override val data: NDArrayCore, val info: ValueTyp override val backend = CoreBackend override fun rename(name: String): KITensor { - return KITensor(name, data, info) + return KITensor(name, data, info, context) } companion object { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt index 618431c01..b83e75c2e 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt @@ -1,6 +1,7 @@ package io.kinference.core.data.tensor import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.ManualAllocatorContext import io.kinference.ndarray.extensions.concat import io.kinference.ndarray.extensions.splitWithAxis import io.kinference.primitives.types.DataType @@ -8,9 +9,9 @@ import io.kinference.protobuf.resolveProtoDataType import io.kinference.types.TensorShape import io.kinference.types.ValueTypeInfo -fun NDArrayCore.asTensor(name: String? = null) = KITensor(name, this, ValueTypeInfo.TensorTypeInfo(TensorShape(this.shape), type.resolveProtoDataType())) +fun NDArrayCore.asTensor(name: String? = null, context: ManualAllocatorContext? = null) = KITensor(name, this, ValueTypeInfo.TensorTypeInfo(TensorShape(this.shape), type.resolveProtoDataType()), context) -internal fun T.asTensor(name: String? = null) = (this as NDArrayCore).asTensor(name) +internal fun T.asTensor(name: String? = null, context: ManualAllocatorContext? 
= null) = (this as NDArrayCore).asTensor(name, context) internal fun Collection.asONNXTensors(names: List): List { return this.zip(names).map { (data, name) -> data.asTensor(name) } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt index 5aecb3ce6..35d554631 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt @@ -24,7 +24,7 @@ class KIModel( @OptIn(ExperimentalCoroutinesApi::class) private val dispatcher: CoroutineDispatcher = Dispatchers.Default.limitedParallelism(parallelismLimit) - private val modelArrayStorage: ModelArrayStorage = ModelArrayStorage(MemoryLimiters.Default) + private val modelArrayStorage: ModelArrayStorage = ModelArrayStorage(memoryLimiter) override fun addProfilingContext(name: String): ProfilingContext = ProfilingContext(name).apply { profiles.add(this) } override fun analyzeProfilingResults(): ProfileAnalysisEntry = profiles.analyze("Model $name") @@ -44,20 +44,31 @@ class KIModel( coreReserved = true } - if (memoryLimiter == MemoryLimiters.NoAllocator) { - withContext(limiterContext) { - return@withContext graph.execute(input, contexts) + when (memoryLimiter) { + MemoryLimiters.NoAllocator -> { + withContext(limiterContext) { + return@withContext graph.execute(input, contexts) + } } - } else { - val allocatorContext = modelArrayStorage.createAllocatorContext() - val mixedContext = allocatorContext + limiterContext + MemoryLimiters.DefaultManualAllocator -> { + val allocatorContext = modelArrayStorage.createManualAllocatorContext() + val mixedContext = allocatorContext + limiterContext - withContext(mixedContext) { - val coroutineContext = coroutineContext[AllocatorContext.Key]!! - val execResult = graph.execute(input, contexts) - val copies = execResult.map { it.clone(it.name) }.toList() - coroutineContext.closeAllocated() - return@withContext copies + withContext(mixedContext) { + return@withContext graph.execute(input, contexts) + } + } + else -> { + val allocatorContext = modelArrayStorage.createAutoAllocatorContext() + val mixedContext = allocatorContext + limiterContext + + withContext(mixedContext) { + val coroutineContext = coroutineContext[AutoAllocatorContext.Key]!! 
+ val execResult = graph.execute(input, contexts) + val copies = execResult.map { it.clone(it.name) }.toList() + coroutineContext.returnUsedArrays() + return@withContext copies + } } } } finally { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt index 05b76119b..234639c96 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt @@ -7,17 +7,21 @@ import io.kinference.core.optimizer.rules.context.AttentionContextRule import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.arrays.pointers.map import io.kinference.ndarray.arrays.tiled.FloatTiledArray import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.ndarray.extensions.dotTransposedWithAlpha +import io.kinference.ndarray.extensions.softmax.softmax import io.kinference.operator.* import io.kinference.optimizer.GraphOptimizer.Companion.isOpt +import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto import io.kinference.utils.launchWithLimitOrDefault import kotlinx.coroutines.coroutineScope +import kotlin.coroutines.coroutineContext import kotlin.math.min import kotlin.math.sqrt @@ -25,11 +29,12 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map { val headSize = hiddenSize / numHeads - val output = allocateNDArray(scores.type, Strides(intArrayOf(batchSize, numHeads, seqLen, headSize))) + val outputStrides = Strides(intArrayOf(batchSize, numHeads, seqLen, headSize)) + val output = context?.getNDArray(scores.type, outputStrides, fillZeros = true) ?: allocateNDArray(scores.type, outputStrides) coroutineScope { for (batchNum in 0 until batchSize) { @@ -46,6 +51,8 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map { val headSize = hiddenSize / numHeads val pastSeqLen = past?.shape?.get(3) ?: 0 val present = makePresent(past, k, v, batchSize, seqLen, numHeads, hiddenSize) - val scores = normalizedScores(unidir, q, mask, batchSize, seqLen, pastSeqLen, headSize, numHeads, present, maskFilterValue) - return attentionScore(scores, batchSize, seqLen, numHeads, hiddenSize, present) + val scores = normalizedScores(unidir, q, mask, batchSize, seqLen, pastSeqLen, headSize, numHeads, present, maskFilterValue, context) + return attentionScore(scores, batchSize, seqLen, numHeads, hiddenSize, present, context) } private suspend fun normalizedScores( unidir: Boolean, queries: NDArrayCore, maskIndices: IntNDArray?, batchSize: Int, - seqLen: Int, pastSeqLen: Int, headSize: Int, numHeads: Int, present: NDArrayCore, maskFilterValue: Float = -10_000f + seqLen: Int, pastSeqLen: Int, headSize: Int, numHeads: Int, present: NDArrayCore, maskFilterValue: Float = -10_000f, context: ManualAllocatorContext? 
= null ): NumberNDArrayCore { val allSeqLen = present.shape[3] - val scores = allocateNDArray(queries.type, Strides(intArrayOf(batchSize, numHeads, seqLen, allSeqLen))) as MutableNumberNDArrayCore + val scoresStrides = Strides(intArrayOf(batchSize, numHeads, seqLen, allSeqLen)) + val scores = (context?.getNDArray(queries.type, scoresStrides, fillZeros = true) ?: allocateNDArray(queries.type, scoresStrides)) as MutableNumberNDArrayCore - val maskData = maskIndices?.maskFromIndices(unidir, batchSize, seqLen, pastSeqLen, maskFilterValue) + val maskData = maskIndices?.maskFromIndices(unidir, batchSize, seqLen, pastSeqLen, maskFilterValue, context) val alpha = 1.0 / sqrt(headSize.toDouble()) @@ -148,27 +156,38 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map if (this != null) { //raw attention (no padding). only raw attention mask is 2-dimensional if (this.rank == 2) { - val maskPointer = mask.array.pointer(maskOffset * i) + val maskPointer = (mask as MutableFloatNDArray).array.pointer(maskOffset * i) val maskIndicesPointer = this.array.pointer(i * fullSeqLen) maskPointer.accept(maskIndicesPointer, fullSeqLen) { _, src -> if (src > 0) 0f else maskFilterValue } } else { //for left/right-side padding val maskIndicesPointer = this.array.pointer(i) - val maskPointer = mask.array.pointer(maskOffset * i + maskIndicesPointer.get()) + val maskPointer = (mask as MutableFloatNDArray).array.pointer(maskOffset * i + maskIndicesPointer.get()) maskPointer.map(fullSeqLen - maskIndicesPointer.get()) { maskFilterValue } if (this.rank == 1 && this.shape[0] == 2 * batchSize) { @@ -186,7 +205,7 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map>, input internal suspend fun initQueryKeyValue( input: NDArrayCore, weights: NDArrayCore, bias: NDArrayCore, - batchSize: Int, seqLen: Int, hiddenSize: Int, numHeads: Int + batchSize: Int, seqLen: Int, hiddenSize: Int, numHeads: Int, context: ManualAllocatorContext? = null ): Array { input as NumberNDArrayCore val headSize = hiddenSize / numHeads - val qkv = Array(3) { allocateNDArray(input.type, Strides(intArrayOf(batchSize, numHeads, seqLen, headSize))) } + val qkvStrides = Strides(intArrayOf(batchSize, numHeads, seqLen, headSize)) + val qkv = Array(3) { context?.getNDArray(input.type, qkvStrides, fillZeros = true) ?: allocateNDArray(input.type, qkvStrides) } coroutineScope { for (qkvIdx in 0 until 3) { @@ -269,6 +289,8 @@ class AttentionVer1(name: String, attributes: Map>, input private val maskFilterValue: Float by attribute("mask_filter_value") { it: Number -> it.toFloat() } override suspend fun > apply(contexts: Contexts, inputs: List): List { + val context = coroutineContext[ManualAllocatorContext.Key] + val input = inputs[0]!! val weights = inputs[1]!! 
@@ -286,10 +308,10 @@ class AttentionVer1(name: String, attributes: Map>, input input.data, preparedWeights.data, preparedBias.data, - batchSize, seqLen, hiddenSize, numHeads, + batchSize, seqLen, hiddenSize, numHeads, context ) - val (scores, present) = getScores(unidir, queries, keys, values, maskIndices, past, batchSize, seqLen, numHeads, hiddenSize, maskFilterValue) - return listOf(scores.asTensor(), present.asTensor()) + val (scores, present) = getScores(unidir, queries, keys, values, maskIndices, past, batchSize, seqLen, numHeads, hiddenSize, maskFilterValue, context) + return listOf(scores.asTensor(context = context), present.asTensor(context = context)) } } diff --git a/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt b/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt deleted file mode 100644 index c52d6ffd8..000000000 --- a/ndarray/ndarray-api/src/commonMain/kotlin/io/kinference/ndarray/arrays/ArrayDispatcherUtils.kt +++ /dev/null @@ -1,19 +0,0 @@ -package io.kinference.ndarray.arrays - -enum class ArrayTypes(val index: Int, val size: Int) { - ByteArrayType(0, Byte.SIZE_BYTES), - UByteArrayType(1, UByte.SIZE_BYTES), - ShortArrayType(2, Short.SIZE_BYTES), - UShortArrayType(3, UShort.SIZE_BYTES), - IntArrayType(4, Int.SIZE_BYTES), - UIntArrayType(5, UInt.SIZE_BYTES), - LongArrayType(6, Long.SIZE_BYTES), - ULongArrayType(7, ULong.SIZE_BYTES), - FloatArrayType(8, Float.SIZE_BYTES), - DoubleArrayType(9, Double.SIZE_BYTES), - BooleanArrayType(10, 1); - - companion object { - fun sizeInBytes(index: Int, arraySize: Int): Long = entries[index].size * arraySize.toLong() - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt deleted file mode 100644 index 7c5286a41..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AllocatorContext.kt +++ /dev/null @@ -1,23 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.ndarray.arrays.* -import kotlin.coroutines.CoroutineContext - -data class AllocatorContext internal constructor( - private val unusedContainers: ArrayStorage, - private val limiter: MemoryLimiter, - private val returnStorageFn: (ArrayStorage) -> Unit -) : CoroutineContext.Element { - - companion object Key : CoroutineContext.Key - override val key: CoroutineContext.Key<*> get() = Key - - internal fun getArrayContainers(type: ArrayTypes, size: Int, count: Int): Array { - return Array(count) { unusedContainers.getArrayContainer(type, size) } - } - - fun closeAllocated() { - unusedContainers.moveUsedArrays() - returnStorageFn(unusedContainers) - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt index 782c121c2..dcf704673 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt @@ -1,22 +1,10 @@ package io.kinference.ndarray.arrays.memory -import io.kinference.ndarray.arrays.ArrayTypes +import io.kinference.primitives.types.DataType -internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limiter: MemoryLimiter) { +internal abstract class BaseArrayStorage(typeLength: 
Int, sizeLength: Int, storageCount: Int) { /** - * This is a storage for arrays which are available for retrieving - * - * Structure is as follows: - * 1. Array by predefined types (all types are known compiled time) - * 2. Array by size. Starting with 'INIT_SIZE_VALUE' element and grow it doubling (typically there are no more than 16 different sizes) - * 3. Queue of array containers (used as FIFO) - */ - private var storageUnused: Array>> = - Array(typeLength) { Array(sizeLength) { ArrayDeque() } } - - /** - * This is a storage for arrays which are currently in use. - * They should be moved back into unused storage when there is no need for them. + * This is a storage for arrays. * * Structure is as follows: * 1. Array by predefined types (all types are known compiled time) @@ -24,45 +12,19 @@ internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limite * Starting with 'INIT_SIZE_VALUE' element and grow it doubling (typically there are no more than 16 different sizes) * 3. Queue of array containers (used as FIFO) */ - private var storageUsed: Array>> = - Array(typeLength) { Array(sizeLength) { ArrayDeque() } } + protected var storage: Array>>> = + Array(storageCount) { Array(typeLength) { Array(sizeLength) { ArrayDeque() } } } private var sizeIndices: IntArray = IntArray(typeLength) private var sizes: Array = Array(typeLength) { IntArray(sizeLength) } - internal fun getArrayContainer(type: ArrayTypes, size: Int): Any { - return if (limiter.checkMemoryLimitAndAdd(ArrayTypes.sizeInBytes(type.index, size))) { - val tIndex = type.index - val sIndex = getSizeIndex(tIndex, size) - val array = storageUnused[tIndex][sIndex].removeFirstOrNull()?.also(::resetArray) - ?: create(type, size) - - storageUsed[tIndex][sIndex].addLast(array) - array - } else { - create(type, size) - } - } - - internal fun moveUsedArrays() { - storageUsed.forEachIndexed { typeIndex, arraysByType -> - arraysByType.forEachIndexed { sizeIndex, arrayDeque -> - arrayDeque.forEach { - storageUnused[typeIndex][sizeIndex].addLast(it) - } - arrayDeque.clear() - } - } - limiter.resetLimit() - } - - private fun getSizeIndex(tIndex: Int, size: Int): Int { + protected fun getSizeIndex(tIndex: Int, size: Int): Int { val sIndex = sizes[tIndex].indexOf(size) return if (sIndex != -1) { sIndex } else { - if (sizeIndices[tIndex] >= storageUnused[tIndex].size) + if (sizeIndices[tIndex] >= storage[0][tIndex].size) grow(tIndex) val idx = sizeIndices[tIndex]++ @@ -73,37 +35,40 @@ internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limite private fun grow(typeIndex: Int) { val newSize = sizes[typeIndex].size * 2 - val newStorageUnused: Array> = Array(newSize) { ArrayDeque() } - val newStorageUsed: Array> = Array(newSize) { ArrayDeque() } + for (i in storage.indices) { + val newStorage: Array> = Array(newSize) { ArrayDeque() } - for (i in storageUnused[typeIndex].indices) { - newStorageUnused[i] = storageUnused[typeIndex][i] - newStorageUsed[i] = storageUsed[typeIndex][i] + for (j in storage[i][typeIndex].indices) { + newStorage[j] = storage[i][typeIndex][j] + } + + storage[i][typeIndex] = newStorage } - storageUnused[typeIndex] = newStorageUnused - storageUsed[typeIndex] = newStorageUsed sizes[typeIndex] = sizes[typeIndex].copyOf(newSize) } - private fun create(type: ArrayTypes, size: Int): Any { + protected fun create(type: DataType, size: Int): Any { return when (type) { - ArrayTypes.ByteArrayType -> ByteArray(size) // 8-bit signed - ArrayTypes.UByteArrayType -> UByteArray(size) // 8-bit unsigned - 
ArrayTypes.ShortArrayType -> ShortArray(size) // 16-bit signed - ArrayTypes.UShortArrayType -> UShortArray(size) // 16-bit unsigned - ArrayTypes.IntArrayType -> IntArray(size) // 32-bit signed - ArrayTypes.UIntArrayType -> UIntArray(size) // 32-bit unsigned - ArrayTypes.LongArrayType -> LongArray(size) // 64-bit signed - ArrayTypes.ULongArrayType -> ULongArray(size) // 64-bit unsigned - ArrayTypes.FloatArrayType -> FloatArray(size) - ArrayTypes.DoubleArrayType -> DoubleArray(size) - ArrayTypes.BooleanArrayType -> BooleanArray(size) + DataType.BYTE -> ByteArray(size) // 8-bit signed + DataType.SHORT -> ShortArray(size) // 16-bit signed + DataType.INT -> IntArray(size) // 32-bit signed + DataType.LONG -> LongArray(size) // 64-bit signed + + DataType.UBYTE -> UByteArray(size) // 8-bit unsigned + DataType.USHORT -> UShortArray(size) // 16-bit unsigned + DataType.UINT -> UIntArray(size) // 32-bit unsigned + DataType.ULONG -> ULongArray(size) // 64-bit unsigned + + DataType.FLOAT -> FloatArray(size) + DataType.DOUBLE -> DoubleArray(size) + + DataType.BOOLEAN -> BooleanArray(size) else -> throw IllegalArgumentException("Unsupported array type") } } - private fun resetArray(array: Any): Unit = + protected fun resetArray(array: Any): Unit = when (array) { is ByteArray -> array.fill(0) // 8-bit signed is UByteArray -> array.fill(0u) // 8-bit unsigned @@ -116,6 +81,59 @@ internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limite is FloatArray -> array.fill(0.0f) is DoubleArray -> array.fill(0.0) is BooleanArray -> array.fill(false) - else -> throw IllegalArgumentException("Unsupported array type") + else -> error("Unsupported array type") } } + +internal class SingleArrayStorage(typeLength: Int, sizeLength: Int, private val limiter: MemoryLimiter) : BaseArrayStorage(typeLength, sizeLength, 1) { + internal fun getArray(type: DataType, size: Int, fillZeros: Boolean = true): Any { + return if (limiter.checkMemoryLimitAndAdd(type, size)) { + val tIndex = type.ordinal + val sIndex = getSizeIndex(tIndex, size) + storage[0][tIndex][sIndex].removeFirstOrNull()?.takeIf { fillZeros }?.apply(::resetArray) ?: create(type, size) + } else { + create(type, size) + } + } + + internal fun returnArrays(type: DataType, size: Int, arrays: Array) { + val tIndex = type.ordinal + val sIndex = getSizeIndex(tIndex, size) + val queue = storage[0][tIndex][sIndex] + + queue.addAll(arrays) + } + + internal fun clear() { + storage[0].forEach { arraysBySize -> + arraysBySize.forEach { arrayDeque -> + arrayDeque.clear() + } + } + limiter.resetLimit() + } +} + +internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limiter: MemoryLimiter) : BaseArrayStorage(typeLength, sizeLength, 2) { + internal fun getArray(type: DataType, size: Int, fillZeros: Boolean = true): Any { + return if (limiter.checkMemoryLimitAndAdd(type, size)) { + val tIndex = type.ordinal + val sIndex = getSizeIndex(tIndex, size) + val array = storage[0][tIndex][sIndex].removeFirstOrNull()?.takeIf { fillZeros }?.apply(::resetArray) ?: create(type, size) + storage[1][tIndex][sIndex].add(array) + array + } else { + create(type, size) + } + } + + internal fun moveArrays() { + storage[1].forEachIndexed { typeIndex, arraysByType -> + arraysByType.forEachIndexed { sizeIndex, arrayDeque -> + storage[0][typeIndex][sizeIndex].addAll(arrayDeque) + arrayDeque.clear() + } + } + limiter.resetLimit() + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AutoAllocatorContext.kt 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AutoAllocatorContext.kt new file mode 100644 index 000000000..a9255dd93 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AutoAllocatorContext.kt @@ -0,0 +1,23 @@ +package io.kinference.ndarray.arrays.memory + +import io.kinference.ndarray.arrays.* +import io.kinference.primitives.types.DataType +import kotlin.coroutines.CoroutineContext + +data class AutoAllocatorContext internal constructor( + private val storage: ArrayStorage, + private val returnStorageFn: (ArrayStorage) -> Unit +) : CoroutineContext.Element { + + companion object Key : CoroutineContext.Key + override val key: CoroutineContext.Key<*> get() = Key + + internal fun getArrays(type: DataType, size: Int, count: Int): Array { + return Array(count) { storage.getArray(type, size) } + } + + fun returnUsedArrays() { + storage.moveArrays() + returnStorageFn(storage) + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt new file mode 100644 index 000000000..788541e6f --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt @@ -0,0 +1,53 @@ +package io.kinference.ndarray.arrays.memory + +import io.kinference.ndarray.arrays.* +import io.kinference.primitives.types.DataType +import kotlin.coroutines.CoroutineContext + +data class ManualAllocatorContext internal constructor(private val storage: SingleArrayStorage) : CoroutineContext.Element { + + companion object Key : CoroutineContext.Key + override val key: CoroutineContext.Key<*> get() = Key + + fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore { + return when(dataType) { + DataType.BYTE -> ByteArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.SHORT -> ShortArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.INT -> IntArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.LONG -> LongArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + + DataType.UBYTE -> UByteArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.USHORT -> UShortArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.UINT -> UIntArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.ULONG -> ULongArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + + DataType.FLOAT -> FloatArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.DOUBLE -> DoubleArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + + DataType.BOOLEAN -> BooleanArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + + else -> error("Unsupported array type") + } + } + + fun returnNDArray(ndArray: NDArrayCore) { + when(ndArray.type) { + DataType.BYTE -> ByteArrayStorageWrapper.returnNDArray(storage, ndArray as ByteNDArray) + DataType.SHORT -> ShortArrayStorageWrapper.returnNDArray(storage, ndArray as ShortNDArray) + DataType.INT -> IntArrayStorageWrapper.returnNDArray(storage, ndArray as IntNDArray) + DataType.LONG -> LongArrayStorageWrapper.returnNDArray(storage, ndArray as LongNDArray) + + DataType.UBYTE -> UByteArrayStorageWrapper.returnNDArray(storage, ndArray as UByteNDArray) + DataType.USHORT -> UShortArrayStorageWrapper.returnNDArray(storage, ndArray as UShortNDArray) + DataType.UINT -> 
UIntArrayStorageWrapper.returnNDArray(storage, ndArray as UIntNDArray) + DataType.ULONG -> ULongArrayStorageWrapper.returnNDArray(storage, ndArray as ULongNDArray) + + DataType.FLOAT -> FloatArrayStorageWrapper.returnNDArray(storage, ndArray as FloatNDArray) + DataType.DOUBLE -> DoubleArrayStorageWrapper.returnNDArray(storage, ndArray as DoubleNDArray) + + DataType.BOOLEAN -> BooleanArrayStorageWrapper.returnNDArray(storage, ndArray as BooleanNDArray) + + else -> error("Unsupported array type") + } + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt index ebb86fd8c..85ed03eb1 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt @@ -1,5 +1,6 @@ package io.kinference.ndarray.arrays.memory +import io.kinference.primitives.types.DataType import io.kinference.utils.PlatformUtils import kotlinx.atomicfu.* @@ -7,10 +8,11 @@ interface MemoryLimiter { /** * Checks if the memory limit allows adding the specified amount of memory and performs the addition * - * @param added the memory in bytes to add + * @param type is the DataType of underlying primitives in a checking array + * @param size is the checking array size * @return true if the memory was added successfully and false if adding the memory exceeds the memory limit */ - fun checkMemoryLimitAndAdd(added: Long): Boolean + fun checkMemoryLimitAndAdd(type: DataType, size: Int): Boolean /** * Resets the used memory into 0L @@ -21,8 +23,9 @@ interface MemoryLimiter { class BaseMemoryLimiter internal constructor(private val memoryLimit: Long) : MemoryLimiter { private var usedMemory: AtomicLong = atomic(0L) - override fun checkMemoryLimitAndAdd(added: Long): Boolean { + override fun checkMemoryLimitAndAdd(type: DataType, size: Int): Boolean { // Attempt to add memory and check the limit + val added = sizeInBytes(type.ordinal, size) val successful = usedMemory.getAndUpdate { current -> if (current + added > memoryLimit) current else current + added } != usedMemory.value // Check if the update was successful @@ -33,10 +36,34 @@ class BaseMemoryLimiter internal constructor(private val memoryLimit: Long) : Me override fun resetLimit() { usedMemory.value = 0L } + + companion object { + private val typeSizes: LongArray = LongArray(DataType.entries.size).apply { + this[DataType.BYTE.ordinal] = Byte.SIZE_BYTES.toLong() + this[DataType.SHORT.ordinal] = Short.SIZE_BYTES.toLong() + this[DataType.INT.ordinal] = Int.SIZE_BYTES.toLong() + this[DataType.LONG.ordinal] = Long.SIZE_BYTES.toLong() + + this[DataType.UBYTE.ordinal] = UByte.SIZE_BYTES.toLong() + this[DataType.USHORT.ordinal] = UShort.SIZE_BYTES.toLong() + this[DataType.UINT.ordinal] = UInt.SIZE_BYTES.toLong() + this[DataType.ULONG.ordinal] = ULong.SIZE_BYTES.toLong() + + this[DataType.FLOAT.ordinal] = Float.SIZE_BYTES.toLong() + this[DataType.DOUBLE.ordinal] = Double.SIZE_BYTES.toLong() + + this[DataType.BOOLEAN.ordinal] = 1.toLong() + } + + private fun sizeInBytes(typeIndex: Int, size: Int): Long { + return typeSizes[typeIndex] * size + } + } } object MemoryLimiters { - val Default: MemoryLimiter = BaseMemoryLimiter((PlatformUtils.maxHeap * 0.3).toLong()) + val DefaultAutoAllocator: MemoryLimiter = BaseMemoryLimiter((PlatformUtils.maxHeap * 0.3).toLong()) + val DefaultManualAllocator: MemoryLimiter = 
BaseMemoryLimiter(50 * 1024 * 1024) val NoAllocator: MemoryLimiter = BaseMemoryLimiter(0L) fun customLimiter(memoryLimit: Long): MemoryLimiter { diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt index 9c7f02aa5..0135921cb 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt @@ -1,34 +1,39 @@ package io.kinference.ndarray.arrays.memory -import io.kinference.ndarray.arrays.ArrayTypes +import io.kinference.primitives.types.DataType import io.kinference.utils.Closeable import java.util.concurrent.ConcurrentLinkedQueue class ModelArrayStorage(private val limiter: MemoryLimiter = MemoryLimiters.NoAllocator) : Closeable { - private val unusedArrays: ConcurrentLinkedQueue = ConcurrentLinkedQueue() + private val autoStorageQueue: ConcurrentLinkedQueue = ConcurrentLinkedQueue() companion object { private const val INIT_SIZE_VALUE: Int = 2 - private val typeSize: Int = ArrayTypes.entries.size + private val typeSize: Int = DataType.entries.size } - fun createAllocatorContext(): AllocatorContext { - return AllocatorContext(getStorage(), limiter, ::returnStorage) + fun createAutoAllocatorContext(): AutoAllocatorContext { + return AutoAllocatorContext(getStorage(autoStorageQueue), ::returnStorage) + } + + fun createManualAllocatorContext(): ManualAllocatorContext { + limiter.resetLimit() + return ManualAllocatorContext(SingleArrayStorage(typeSize, INIT_SIZE_VALUE, limiter)) } fun clearCache() { - unusedArrays.clear() + autoStorageQueue.clear() } override suspend fun close() { clearCache() } - private fun getStorage(): ArrayStorage { - return unusedArrays.poll() ?: ArrayStorage(typeSize, INIT_SIZE_VALUE, limiter) + private fun getStorage(queue: ConcurrentLinkedQueue): ArrayStorage { + return queue.poll() ?: ArrayStorage(typeSize, INIT_SIZE_VALUE, limiter) } private fun returnStorage(storage: ArrayStorage) { - unusedArrays.offer(storage) + autoStorageQueue.offer(storage) } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt new file mode 100644 index 000000000..52921ced8 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt @@ -0,0 +1,30 @@ +@file:GeneratePrimitives(DataType.ALL) +package io.kinference.ndarray.arrays.memory + +import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.PrimitiveNDArray +import io.kinference.ndarray.arrays.tiled.PrimitiveTiledArray +import io.kinference.ndarray.blockSizeByStrides +import io.kinference.primitives.annotations.* +import io.kinference.primitives.types.DataType +import io.kinference.primitives.types.PrimitiveArray + +@GenerateNameFromPrimitives +internal object PrimitiveArrayStorageWrapper { + private val type = DataType.CurrentPrimitive + + fun getNDArray(strides: Strides, storage: SingleArrayStorage, fillZeros: Boolean = false): MutablePrimitiveNDArray { + val blockSize = blockSizeByStrides(strides) + val blocksNum = strides.linearSize / blockSize + val blocks = Array(blocksNum) { storage.getArray(type, blockSize, fillZeros) } + val typedBlocks = blocks.map { it as PrimitiveArray 
}.toTypedArray() + val tiled = PrimitiveTiledArray(typedBlocks) + + return MutablePrimitiveNDArray(tiled, strides) + } + + fun returnNDArray(storage: SingleArrayStorage, ndarray: PrimitiveNDArray) { + val blockSize = ndarray.array.blockSize + storage.returnArrays(type, blockSize, ndarray.array.blocks as Array) + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt index 07cbf57f1..4469e9d4e 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt @@ -26,7 +26,7 @@ internal class PrimitiveTiledArray(val blocks: Array) { } companion object { - val type: ArrayTypes = ArrayTypes.valueOf(PrimitiveArray::class.simpleName!! + "Type") + val type: DataType = DataType.CurrentPrimitive suspend operator fun invoke(strides: Strides): PrimitiveTiledArray { val blockSize = blockSizeByStrides(strides) @@ -59,8 +59,8 @@ internal class PrimitiveTiledArray(val blocks: Array) { val blocksNum = if (blockSize == 0) 0 else size / blockSize - val coroutineContext = coroutineContext[AllocatorContext.Key] - val blocks = coroutineContext?.getArrayContainers(type, blockSize, blocksNum) ?: Array(blocksNum) { PrimitiveArray(blockSize) } + val coroutineContext = coroutineContext[AutoAllocatorContext.Key] + val blocks = coroutineContext?.getArrays(type, blockSize, blocksNum) ?: Array(blocksNum) { PrimitiveArray(blockSize) } return PrimitiveTiledArray(blocks.map { it as PrimitiveArray }.toTypedArray()) } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt new file mode 100644 index 000000000..00f4767fa --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt @@ -0,0 +1,6 @@ +package io.kinference.ndarray.extensions.constants + +object BooleanConstants { + const val ZERO = false + const val ONE = true +} From f3346323dce2960e1ca722472a0f5bb50cf4a025 Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Wed, 21 Aug 2024 13:02:53 +0200 Subject: [PATCH 05/19] JBAI-4393 [core, ndarray] Refactored memory management and array handling Added manual NDArray handling, refactored existing operations to use standard DataType enum instead of ArrayTypes, and optimized memory allocations across multiple modules. 
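For orientation, the operator-side pattern this commit introduces can be condensed into the sketch below. The elementwiseAdd helper is illustrative only (it is not part of the patch) and assumes both inputs already share a shape; the context, allocation, and asTensor calls mirror the hunks that follow.

import io.kinference.core.data.tensor.KITensor
import io.kinference.core.data.tensor.asTensor
import io.kinference.ndarray.arrays.*
import io.kinference.ndarray.arrays.memory.ManualAllocatorContext
import io.kinference.ndarray.extensions.allocateNDArray
import kotlin.coroutines.coroutineContext

suspend fun elementwiseAdd(left: NumberNDArrayCore, right: NumberNDArrayCore): KITensor {
    // Pick up the manual allocator if the model installed one into the coroutine context;
    // without it, fall back to a plain allocation.
    val manualContext = coroutineContext[ManualAllocatorContext.Key]

    // Illustrative simplification: assumes left and right share a shape;
    // the real Add operator broadcasts the shapes first.
    val destStrides = left.strides
    val dest = (manualContext?.getNDArray(left.type, destStrides)
        ?: allocateNDArray(left.type, destStrides)) as MutableNumberNDArrayCore

    val result = left.plus(right, dest)

    // Tag the output with the same context so KITensor.close() can hand the buffer back to the pool.
    return result.asTensor("C", manualContext)
}

Because each borrowed buffer is tagged onto the tensor it produces, the array is returned in KITensor.close() rather than at a fixed point after graph execution, which is what lets the manual-allocator path skip the output copying that the automatic path still performs.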
--- .../io/kinference.core/model/KIModel.kt | 4 +- .../normalization/EmbedLayerNormalization.kt | 34 ++- .../normalization/SkipLayerNormalization.kt | 10 +- .../io/kinference.core/operators/math/Add.kt | 18 +- .../operators/math/BiasGelu.kt | 10 +- .../kinference.core/operators/math/MatMul.kt | 16 +- .../kinference.core/operators/tensor/Cast.kt | 285 +++++++++--------- .../io/kinference/models/bert/BERTTest.kt | 2 +- .../ndarray/extensions/PrimitiveExtensions.kt | 43 +-- .../ndarray/extensions/gelu/BiasGelu.kt | 14 + .../extensions/gelu/BiasGeluPrimitive.kt | 9 +- 11 files changed, 259 insertions(+), 186 deletions(-) diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt index 35d554631..837d222da 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt @@ -24,7 +24,7 @@ class KIModel( @OptIn(ExperimentalCoroutinesApi::class) private val dispatcher: CoroutineDispatcher = Dispatchers.Default.limitedParallelism(parallelismLimit) - private val modelArrayStorage: ModelArrayStorage = ModelArrayStorage(memoryLimiter) + private val modelArrayStorage: ModelArrayStorage = ModelArrayStorage(MemoryLimiters.DefaultManualAllocator) override fun addProfilingContext(name: String): ProfilingContext = ProfilingContext(name).apply { profiles.add(this) } override fun analyzeProfilingResults(): ProfileAnalysisEntry = profiles.analyze("Model $name") @@ -44,7 +44,7 @@ class KIModel( coreReserved = true } - when (memoryLimiter) { + when (MemoryLimiters.DefaultManualAllocator) { MemoryLimiters.NoAllocator -> { withContext(limiterContext) { return@withContext graph.execute(input, contexts) diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt index b1861b281..f2be9a212 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt @@ -1,15 +1,17 @@ package io.kinference.core.operators.layer.normalization import io.kinference.attribute.Attribute -import io.kinference.core.data.tensor.KITensor -import io.kinference.core.data.tensor.asONNXTensors +import io.kinference.core.data.tensor.* import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.* import io.kinference.operator.* +import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto.AttributeType import io.kinference.protobuf.message.TensorProto +import kotlin.coroutines.coroutineContext import kotlin.math.sqrt sealed class EmbedLayerNormalization( @@ -73,9 +75,12 @@ class EmbedLayerNormalizationVer1( private data class NormalizeResult(val output: FloatNDArray, val embeddingSum: FloatNDArray) - internal suspend fun createMaskIndices(mask: IntNDArray?, batchSize: Int, seqLen: Int): NumberNDArrayCore { - val maskIndices = MutableIntNDArray(intArrayOf(batchSize)) - if (mask == null) return maskIndices + internal suspend fun 
createMaskIndices(mask: IntNDArray?, batchSize: Int, seqLen: Int, context: ManualAllocatorContext? = null): NumberNDArrayCore { + val strides = Strides(intArrayOf(batchSize)) + val maskIndices = (context?.getNDArray(DataType.INT, strides) ?: MutableIntNDArray(strides)) as MutableIntNDArray + + if (mask == null) + return maskIndices.also { it.fill(0) } val pointer = mask.array.pointer() val maskIndicesPointer = maskIndices.array.pointer() @@ -95,12 +100,15 @@ class EmbedLayerNormalizationVer1( private suspend fun normalize( epsilon: Float, inputIds: IntNDArray, segmentIds: IntNDArray?, wordEmbed: FloatNDArray, posEmbed: FloatNDArray, - segmentEmbed: FloatNDArray?, gamma: FloatNDArray, beta: FloatNDArray, positionIds: IntNDArray? + segmentEmbed: FloatNDArray?, gamma: FloatNDArray, beta: FloatNDArray, positionIds: IntNDArray?, context: ManualAllocatorContext? = null ): NormalizeResult { val (batchSize, seqLen) = inputIds.shape val (_, hiddenSize) = wordEmbed.shape - val output = MutableFloatNDArray(intArrayOf(batchSize, seqLen, hiddenSize)) - val embeddingSum = MutableFloatNDArray(intArrayOf(batchSize, seqLen, hiddenSize)) + + val outputStrides = Strides(intArrayOf(batchSize, seqLen, hiddenSize)) + + val output = (context?.getNDArray(DataType.FLOAT, outputStrides, fillZeros = false) ?: MutableFloatNDArray(outputStrides)) as MutableFloatNDArray + val embeddingSum = (context?.getNDArray(DataType.FLOAT, outputStrides, fillZeros = false) ?: MutableFloatNDArray(outputStrides)) as MutableFloatNDArray for (batch in 0 until batchSize) { val blockIdx = batch * seqLen @@ -167,6 +175,8 @@ class EmbedLayerNormalizationVer1( } override suspend fun > apply(contexts: Contexts, inputs: List): List { + val manualContext = coroutineContext[ManualAllocatorContext.Key] + val inputIds = inputs[0]!!.data as IntNDArray val segmentIds = inputs[1]?.data as IntNDArray? val wordEmbed = inputs[2]!!.data as FloatNDArray @@ -177,8 +187,12 @@ class EmbedLayerNormalizationVer1( val mask = inputs.getOrNull(7)?.data as IntNDArray? val positionIds = inputs.getOrNull(8)?.data as IntNDArray? 
- val (normalized, embedSum) = normalize(epsilon, inputIds, segmentIds, wordEmbed, posEmbed, segmentEmbed, gamma, beta, positionIds) + val (normalized, embedSum) = normalize(epsilon, inputIds, segmentIds, wordEmbed, posEmbed, segmentEmbed, gamma, beta, positionIds, manualContext) val maskIndices = createMaskIndices(mask, inputIds.shape[0], inputIds.shape[1]) - return listOf(normalized, maskIndices, embedSum).asONNXTensors(outputs) + return listOf( + normalized.asTensor(context = manualContext), + maskIndices.asTensor(context = manualContext), + embedSum.asTensor(context = manualContext) + ) } } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt index 6b6243ba3..75320199f 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt @@ -7,10 +7,13 @@ import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.FloatNDArray import io.kinference.ndarray.arrays.MutableFloatNDArray +import io.kinference.ndarray.arrays.memory.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.* import io.kinference.operator.* +import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto +import kotlin.coroutines.coroutineContext import kotlin.math.sqrt sealed class SkipLayerNormalization(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { @@ -104,8 +107,10 @@ class SkipLayerNormalizationVer1(name: String, attributes: Map> apply(contexts: Contexts, inputs: List): List { + val manualContext = coroutineContext[ManualAllocatorContext.Key] + val input = inputs[0]!!.data as FloatNDArray - val output = MutableFloatNDArray(input.strides) + val output = (manualContext?.getNDArray(DataType.FLOAT, input.strides, fillZeros = false) ?: MutableFloatNDArray(input.strides)) as MutableFloatNDArray input.normalize( skip = inputs[1]!!.data as FloatNDArray, gamma = inputs[2]!!.data as FloatNDArray, @@ -114,6 +119,7 @@ class SkipLayerNormalizationVer1(name: String, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { companion object { @@ -52,7 +55,16 @@ class AddVer7(name: String, attributes: Map>, inputs: Lis } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val result = (inputs[0]!!.data as NumberNDArrayCore) + (inputs[1]!!.data as NumberNDArrayCore) - return listOf(result.asTensor("C")) + val manualContext = coroutineContext[ManualAllocatorContext.Key] + + val left = inputs[0]!!.data as NumberNDArrayCore + val right = inputs[1]!!.data as NumberNDArrayCore + + val destShape = broadcastShape(listOf(left.shape, right.shape)) + val destStrides = Strides(destShape) + val dest = (manualContext?.getNDArray(left.type, destStrides) ?: allocateNDArray(left.type, destStrides)) as MutableNumberNDArrayCore + + val result = left.plus(right, dest) //(inputs[0]!!.data as NumberNDArrayCore) + (inputs[1]!!.data as NumberNDArrayCore) + return listOf(result.asTensor("C", manualContext)) } } diff --git 
a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt index 0be701b4e..f2d8d01b3 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt @@ -5,9 +5,13 @@ import io.kinference.core.data.tensor.KITensor import io.kinference.core.data.tensor.asTensor import io.kinference.data.ONNXData import io.kinference.graph.Contexts +import io.kinference.ndarray.arrays.MutableNumberNDArrayCore import io.kinference.ndarray.arrays.NumberNDArrayCore +import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.ndarray.extensions.gelu.biasGelu import io.kinference.operator.* +import kotlin.coroutines.coroutineContext sealed class BiasGelu(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { companion object { @@ -39,16 +43,20 @@ class BiasGeluVer1(name: String, attributes: Map> = empty } override suspend fun > apply(contexts: Contexts, inputs: List): List { + val manualContext = coroutineContext[ManualAllocatorContext.Key] + val input = inputs[0]!!.data as NumberNDArrayCore val bias = inputs[1]!!.data as NumberNDArrayCore require(input.shape.last() == bias.shape.last()) { "Last dimensions of input and bias tensors must be equal" } + val dest = (manualContext?.getNDArray(input.type, input.strides) ?: allocateNDArray(input.type, input.strides)) as MutableNumberNDArrayCore + // Uses ERF formula with fractional error less than x.xx * 10 ^ -4. // Algorithm 26.2.17 in Abromowitz and Stegun, Handbook of Mathematical. 
// Another possible ERF implementation (several ms faster): // https://github.com/apache/commons-numbers/blob/master/commons-numbers-gamma/src/main/java/org/apache/commons/numbers/gamma/BoostErf.java - return listOf(biasGelu(input, bias).asTensor("C")) + return listOf(biasGelu(input, bias, dest).asTensor("C", manualContext)) } } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt index 8c1735ea3..e3baa2e4e 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt @@ -5,9 +5,13 @@ import io.kinference.core.data.tensor.KITensor import io.kinference.core.data.tensor.asTensor import io.kinference.data.ONNXData import io.kinference.graph.Contexts -import io.kinference.ndarray.arrays.NumberNDArrayCore +import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.broadcasting.Broadcasting +import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.operator.* import io.kinference.protobuf.message.TensorProto +import kotlin.coroutines.coroutineContext sealed class MatMul(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { companion object { @@ -46,8 +50,16 @@ class MatMulVer1(name: String, attributes: Map>, inputs: } override suspend fun > apply(contexts: Contexts, inputs: List): List { + val manualContext = coroutineContext[ManualAllocatorContext.Key] + val first = inputs[0]!!.data as NumberNDArrayCore val second = inputs[1]!!.data as NumberNDArrayCore - return listOf((first.matmul(second)).asTensor("Y")) + + val destShape = Broadcasting.broadcastShapeForMatmul(first.shape, second.shape) + val destStrides = Strides(destShape) + + val dest = (manualContext?.getNDArray(first.type, destStrides, fillZeros = true) ?: allocateNDArray(first.type, destStrides)) as MutableNumberNDArrayCore + + return listOf((first.matmul(second, dest)).asTensor("Y", manualContext)) } } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt index 5ce45c866..742fd7c2d 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt @@ -6,6 +6,7 @@ import io.kinference.core.data.tensor.asTensor import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.mapTo import io.kinference.ndarray.arrays.tiled.* import io.kinference.operator.* @@ -13,6 +14,7 @@ import io.kinference.primitives.types.DataType import io.kinference.protobuf.FLOAT_TENSOR_TYPES import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto +import kotlin.coroutines.coroutineContext sealed class Cast(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { companion object { @@ -41,65 +43,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li internal val VERSION = 
VersionInfo(sinceVersion = 6) private val INFO = OperatorInfo("Cast", ATTRIBUTES_INFO, INPUTS_INFO, OUTPUTS_INFO, VERSION, OperatorInfo.DEFAULT_DOMAIN) - private suspend fun castByte(array: ByteNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castByte(array: ByteNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? = null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> array TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toByte() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } 
TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -108,65 +110,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - private suspend fun castShort(array: ShortNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castShort(array: ShortNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? = null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> array TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toShort() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = 
UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -175,66 +177,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castInt(array: IntNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castInt(array: IntNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? = null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> array TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) - array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != 0 } + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray + array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toInt() } output } TensorProto.DataType.DOUBLE -> { - val 
output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -243,66 +244,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castLong(array: LongNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castLong(array: LongNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? = null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> array TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = 
(context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != 0L } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -311,66 +311,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castUByte(array: UByteNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castUByte(array: UByteNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? = null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> array TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: 
LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toUByte() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -379,66 +378,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castUShort(array: UShortNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castUShort(array: UShortNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UBYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> array TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toUShort() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -447,66 +445,65 @@ class 
CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castUInt(array: UIntNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castUInt(array: UIntNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? = null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toUInt() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> array TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), 
array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -515,65 +512,64 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castULong(array: ULongNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castULong(array: ULongNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? = null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != (0).toULong() } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: 
DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } @@ -583,66 +579,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castFloat(array: FloatNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castFloat(array: FloatNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? = null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> array TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong().toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt().toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong().toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt().toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != 0f } output } TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), 
array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toDouble() } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -651,66 +646,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castDouble(array: DoubleNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castDouble(array: DoubleNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? = null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toFloat() } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong().toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt().toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toLong().toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt().toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toInt() } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), 
array.linearSize) { it.toLong() } output } TensorProto.DataType.BOOL -> { - val output = BooleanNDArray(BooleanTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BOOLEAN, array.strides) ?: BooleanNDArray(BooleanTiledArray(array.shape), array.strides)) as BooleanNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it != 0.0 } output } TensorProto.DataType.DOUBLE -> array TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { it.toULong() } output } @@ -719,66 +713,65 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - - private suspend fun castBoolean(array: BooleanNDArray, to: TensorProto.DataType): NDArrayCore { + private suspend fun castBoolean(array: BooleanNDArray, to: TensorProto.DataType, context: ManualAllocatorContext? = null): NDArrayCore { return when (to) { in FLOAT_TENSOR_TYPES -> { - val output = FloatNDArray(FloatTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.FLOAT, array.strides) ?: FloatNDArray(FloatTiledArray(array.shape), array.strides)) as FloatNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) 1f else 0f } output } TensorProto.DataType.UINT8 -> { - val output = UByteNDArray(UByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: UByteNDArray(UByteTiledArray(array.shape), array.strides)) as UByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toUByte() else (0).toUByte() } output } TensorProto.DataType.INT8 -> { - val output = ByteNDArray(ByteTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.BYTE, array.strides) ?: ByteNDArray(ByteTiledArray(array.shape), array.strides)) as ByteNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toByte() else (0).toByte() } output } TensorProto.DataType.UINT16 -> { - val output = UShortNDArray(UShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.USHORT, array.strides) ?: UShortNDArray(UShortTiledArray(array.shape), array.strides)) as UShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toUShort() else (0).toUShort() } output } TensorProto.DataType.INT16 -> { - val output = ShortNDArray(ShortTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.SHORT, array.strides) ?: ShortNDArray(ShortTiledArray(array.shape), array.strides)) as ShortNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toShort() else (0).toShort() } output } TensorProto.DataType.INT32 -> { - val output = IntNDArray(IntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.INT, array.strides) ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray 
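                // Allocation pattern repeated in every cast branch of this patch: request a pooled
                // MutableNDArrayCore from the optional ManualAllocatorContext for the target DataType
                // and strides, and fall back to constructing a fresh array when no context is present.
                // A minimal sketch of the pattern, using the names from this hunk:
                //     val output = (context?.getNDArray(DataType.INT, array.strides)
                //         ?: IntNDArray(IntTiledArray(array.shape), array.strides)) as IntNDArray
                // The cast to the concrete array type is needed because getNDArray() is declared to
                // return MutableNDArrayCore for all element types.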
array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) 1 else 0 } output } TensorProto.DataType.INT64 -> { - val output = LongNDArray(LongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.LONG, array.strides) ?: LongNDArray(LongTiledArray(array.shape), array.strides)) as LongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) 1L else 0L } output } TensorProto.DataType.BOOL -> array TensorProto.DataType.DOUBLE -> { - val output = DoubleNDArray(DoubleTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.DOUBLE, array.strides) ?: DoubleNDArray(DoubleTiledArray(array.shape), array.strides)) as DoubleNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) 1.0 else 0.0 } output } TensorProto.DataType.UINT32 -> { - val output = UIntNDArray(UIntTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.UINT, array.strides) ?: UIntNDArray(UIntTiledArray(array.shape), array.strides)) as UIntNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toUInt() else (0).toUInt() } output } TensorProto.DataType.UINT64 -> { - val output = ULongNDArray(ULongTiledArray(array.shape), array.strides) + val output = (context?.getNDArray(DataType.ULONG, array.strides) ?: ULongNDArray(ULongTiledArray(array.shape), array.strides)) as ULongNDArray array.array.pointer().mapTo(output.array.pointer(), array.linearSize) { if (it) (1).toULong() else (0).toULong() } output } @@ -787,19 +780,19 @@ class CastVer6(name: String, attributes: Map>, inputs: Li } } - internal suspend fun castTo(input: NDArrayCore, to: TensorProto.DataType): NDArrayCore { + internal suspend fun castTo(input: NDArrayCore, to: TensorProto.DataType, context: ManualAllocatorContext? 
= null): NDArrayCore { return when (input.type) { - DataType.BYTE -> castByte(input as ByteNDArray, to) - DataType.SHORT -> castShort(input as ShortNDArray, to) - DataType.INT -> castInt(input as IntNDArray, to) - DataType.LONG -> castLong(input as LongNDArray, to) - DataType.UBYTE -> castUByte(input as UByteNDArray, to) - DataType.USHORT -> castUShort(input as UShortNDArray, to) - DataType.UINT -> castUInt(input as UIntNDArray, to) - DataType.ULONG -> castULong(input as ULongNDArray, to) - DataType.FLOAT -> castFloat(input as FloatNDArray, to) - DataType.DOUBLE -> castDouble(input as DoubleNDArray, to) - DataType.BOOLEAN -> castBoolean(input as BooleanNDArray, to) + DataType.BYTE -> castByte(input as ByteNDArray, to, context) + DataType.SHORT -> castShort(input as ShortNDArray, to, context) + DataType.INT -> castInt(input as IntNDArray, to, context) + DataType.LONG -> castLong(input as LongNDArray, to, context) + DataType.UBYTE -> castUByte(input as UByteNDArray, to, context) + DataType.USHORT -> castUShort(input as UShortNDArray, to, context) + DataType.UINT -> castUInt(input as UIntNDArray, to, context) + DataType.ULONG -> castULong(input as ULongNDArray, to, context) + DataType.FLOAT -> castFloat(input as FloatNDArray, to, context) + DataType.DOUBLE -> castDouble(input as DoubleNDArray, to, context) + DataType.BOOLEAN -> castBoolean(input as BooleanNDArray, to, context) else -> throw IllegalStateException("Unsupported type ${input.type}") } } @@ -808,11 +801,13 @@ class CastVer6(name: String, attributes: Map>, inputs: Li private val toType: Int by attribute("to") { it: Number -> it.toInt() } override suspend fun > apply(contexts: Contexts, inputs: List): List { + val manualContext = coroutineContext[ManualAllocatorContext.Key] + val tensor = inputs.first()!! val to = TensorProto.DataType.fromValue(toType)!! 
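        // `manualContext` is looked up in the coroutine context via ManualAllocatorContext.Key and is
        // null whenever no manual allocator context is installed for this run. It is passed both to
        // castTo(), so the cast branches can draw their output blocks from the pooled storage, and to
        // asTensor() below, presumably so the output's storage can later be returned to that pool.
        // With a null context, castTo() falls back to plain allocation, matching the previous behaviour.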
- val casted = castTo(tensor.data, to) + val casted = castTo(tensor.data, to, manualContext) - return listOf(casted.asTensor("output")) + return listOf(casted.asTensor("output", manualContext)) } } diff --git a/inference/inference-core/src/jvmTest/kotlin/io/kinference/models/bert/BERTTest.kt b/inference/inference-core/src/jvmTest/kotlin/io/kinference/models/bert/BERTTest.kt index 30ff16b83..bed95862c 100644 --- a/inference/inference-core/src/jvmTest/kotlin/io/kinference/models/bert/BERTTest.kt +++ b/inference/inference-core/src/jvmTest/kotlin/io/kinference/models/bert/BERTTest.kt @@ -15,6 +15,6 @@ class BERTTest { @Test fun benchmark_test_vanilla_bert_performance() = TestRunner.runTest { - KIPerformanceRunner.runFromS3("bert:standard:en:v1", count = 3) + KIPerformanceRunner.runFromS3("bert:standard:en:v1", count = 20) } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt index 21442ecda..570ca8520 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt @@ -8,6 +8,7 @@ import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.arrays.pointers.acceptWithRecursive import io.kinference.ndarray.stubs.* import io.kinference.ndarray.arrays.tiled.* +import io.kinference.ndarray.extensions.constants.PrimitiveConstants import io.kinference.primitives.annotations.* import io.kinference.primitives.types.* import io.kinference.utils.launchWithLimitOrDefault @@ -127,21 +128,21 @@ internal suspend fun PrimitiveNDArray.dotTransposedWithAlpha(alpha: Double, othe other as PrimitiveNDArray; destination as MutablePrimitiveNDArray val alpha = alpha.toPrimitive() - val dBlocksInRow = destination.blocksInRow +// val dBlocksInRow = destination.blocksInRow val lrBlocksInRow = this.blocksInRow val n = this.shape[0] val t = this.shape[1] val m = other.shape[0] - val dBlockSize = destination.array.blockSize +// val dBlockSize = destination.array.blockSize val lrBlockSize = this.array.blockSize - val destBlocks = destination.array.blocks +// val destBlocks = destination.array.blocks val leftBlocks = this.array.blocks val rightBlocks = other.array.blocks val rowFlop = t * m - val zero = (0).toPrimitive() +// val zero = (0).toPrimitive() /* TODO: (dmitriyb) this is temporary commented. 
On GEC performance test we have large inputs that cause out of memory exceptions @@ -161,34 +162,40 @@ internal suspend fun PrimitiveNDArray.dotTransposedWithAlpha(alpha: Double, othe // TODO: (cupertank) Remove constants // TODO: (dmitriyb) Implement concurrent array retrieve with a separate structure from ArraysDispatcher parallelizeByRows(rowFlop, n, 262144) { nStart: Int, nEnd: Int, _ -> - val mSums = Array(m) { PrimitiveArray(lrBlockSize) } +// val mSums = Array(m) { PrimitiveArray(lrBlockSize) } + val tempSum = PrimitiveArray(lrBlockSize) + val destPointer = destination.array.pointer() for (i in nStart until nEnd) { val leftBlockOffset = i * lrBlocksInRow val rightBlockIter = rightBlocks.iterator() - val destBlockOffset = i * dBlocksInRow + destPointer.linearIndex = i * m +// val destBlockOffset = i * dBlocksInRow for (k in 0 until m) { - val tempArray = mSums[k] +// val tempArray = mSums[k] for (lrBlock in 0 until lrBlocksInRow) { val leftBlock = leftBlocks[leftBlockOffset + lrBlock] val rightBlock = rightBlockIter.next() - for (j in tempArray.indices) { - tempArray[j] += leftBlock[j] * rightBlock[j] + for (j in tempSum.indices) { + tempSum[j] += leftBlock[j] * rightBlock[j] } } - } - val mSumsIter = mSums.iterator() - for (destBlockNum in 0 until dBlocksInRow) { - val destBlock = destBlocks[destBlockOffset + destBlockNum] - for (j in destBlock.indices) { - val sumBlock = mSumsIter.next() - destBlock[j] = sumBlock.sum() * alpha - sumBlock.fill(zero) - } + destPointer.setAndIncrement(tempSum.sum() * alpha) + tempSum.fill(PrimitiveConstants.ZERO) } + +// val mSumsIter = mSums.iterator() +// for (destBlockNum in 0 until dBlocksInRow) { +// val destBlock = destBlocks[destBlockOffset + destBlockNum] +// for (j in destBlock.indices) { +// val sumBlock = mSumsIter.next() +// destBlock[j] = sumBlock.sum() * alpha +// sumBlock.fill(zero) +// } +// } } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGelu.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGelu.kt index 0636cb824..8dc2a6705 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGelu.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGelu.kt @@ -16,3 +16,17 @@ suspend fun biasGelu(input: NumberNDArrayCore, bias: NumberNDArrayCore): Mutable else -> error("BiasGelu operation supported only for FLOAT and DOUBLE tensors, actual types is ${input.type}") } } + +suspend fun biasGelu(input: NumberNDArrayCore, bias: NumberNDArrayCore, dest: MutableNumberNDArrayCore): MutableNumberNDArrayCore { + require(input.type == bias.type) + { "Input and Bias types should be equal, actual input type is ${input.type}, actual bias type is ${bias.type}" } + + require(input.type == DataType.FLOAT || input.type == DataType.DOUBLE) + { "BiasGelu operation supported only for FLOAT and DOUBLE tensors, actual types is ${input.type}" } + + return when(input.type) { + DataType.FLOAT -> computeGeluFloat(input as FloatNDArray, bias as FloatNDArray, dest as MutableFloatNDArray) + DataType.DOUBLE -> computeGeluDouble(input as DoubleNDArray, bias as DoubleNDArray, dest as MutableDoubleNDArray) + else -> error("BiasGelu operation supported only for FLOAT and DOUBLE tensors, actual types is ${input.type}") + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt index bc14a927a..4c5682899 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt @@ -14,8 +14,8 @@ import io.kinference.primitives.types.* import kotlin.math.* @GenerateNameFromPrimitives -internal suspend fun computeGeluPrimitive(input: PrimitiveNDArray, bias: PrimitiveNDArray): MutablePrimitiveNDArray { - val output = MutablePrimitiveNDArray(input.strides) +internal suspend fun computeGeluPrimitive(input: PrimitiveNDArray, bias: PrimitiveNDArray, output: MutablePrimitiveNDArray): MutablePrimitiveNDArray { +// val output = MutablePrimitiveNDArray(input.strides) val inputBlocks = input.array.blocks val biasBlocks = bias.array.blocks @@ -79,3 +79,8 @@ internal suspend fun computeGeluPrimitive(input: PrimitiveNDArray, bias: Primiti return output } + +@GenerateNameFromPrimitives +internal suspend fun computeGeluPrimitive(input: PrimitiveNDArray, bias: PrimitiveNDArray): MutablePrimitiveNDArray { + return computeGeluPrimitive(input, bias, MutablePrimitiveNDArray(input.strides)) +} From e900ca89d3519dd9066d2d927abdf84bd228f693 Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Thu, 22 Aug 2024 12:24:46 +0200 Subject: [PATCH 06/19] Refactor NDArray storage and retrieval methods. Replaced static method calls with instance method calls for NDArray storage wrappers. Introduced internal storage fields in `ManualAllocatorContext` to streamline access and management of different data types. --- .../arrays/memory/ManualAllocatorContext.kt | 101 ++++++++++++++---- .../memory/PrimitiveArrayStorageWrapper.kt | 34 +++++- 2 files changed, 112 insertions(+), 23 deletions(-) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt index 788541e6f..27f261fef 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt @@ -6,25 +6,40 @@ import kotlin.coroutines.CoroutineContext data class ManualAllocatorContext internal constructor(private val storage: SingleArrayStorage) : CoroutineContext.Element { + internal val byteStorage = ByteArrayStorageWrapper() + internal val shortStorage = ShortArrayStorageWrapper() + internal val intStorage = IntArrayStorageWrapper() + internal val longStorage = LongArrayStorageWrapper() + + internal val ubyteStorage = UByteArrayStorageWrapper() + internal val ushortStorage = UShortArrayStorageWrapper() + internal val uintStorage = UIntArrayStorageWrapper() + internal val ulongStorage = ULongArrayStorageWrapper() + + internal val floatStorage = FloatArrayStorageWrapper() + internal val doubleStorage = DoubleArrayStorageWrapper() + + internal val booleanStorage = BooleanArrayStorageWrapper() + companion object Key : CoroutineContext.Key override val key: CoroutineContext.Key<*> get() = Key fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore { return when(dataType) { - DataType.BYTE -> ByteArrayStorageWrapper.getNDArray(strides, storage, fillZeros) - DataType.SHORT -> ShortArrayStorageWrapper.getNDArray(strides, storage, fillZeros) - DataType.INT -> 
IntArrayStorageWrapper.getNDArray(strides, storage, fillZeros) - DataType.LONG -> LongArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.BYTE -> byteStorage.getNDArray(strides, fillZeros) + DataType.SHORT -> shortStorage.getNDArray(strides, fillZeros) + DataType.INT -> intStorage.getNDArray(strides, fillZeros) + DataType.LONG -> longStorage.getNDArray(strides, fillZeros) - DataType.UBYTE -> UByteArrayStorageWrapper.getNDArray(strides, storage, fillZeros) - DataType.USHORT -> UShortArrayStorageWrapper.getNDArray(strides, storage, fillZeros) - DataType.UINT -> UIntArrayStorageWrapper.getNDArray(strides, storage, fillZeros) - DataType.ULONG -> ULongArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.UBYTE -> ubyteStorage.getNDArray(strides, fillZeros) + DataType.USHORT -> ushortStorage.getNDArray(strides, fillZeros) + DataType.UINT -> uintStorage.getNDArray(strides, fillZeros) + DataType.ULONG -> ulongStorage.getNDArray(strides, fillZeros) - DataType.FLOAT -> FloatArrayStorageWrapper.getNDArray(strides, storage, fillZeros) - DataType.DOUBLE -> DoubleArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.FLOAT -> floatStorage.getNDArray(strides, fillZeros) + DataType.DOUBLE -> doubleStorage.getNDArray(strides, fillZeros) - DataType.BOOLEAN -> BooleanArrayStorageWrapper.getNDArray(strides, storage, fillZeros) + DataType.BOOLEAN -> booleanStorage.getNDArray(strides, fillZeros) else -> error("Unsupported array type") } @@ -32,22 +47,64 @@ data class ManualAllocatorContext internal constructor(private val storage: Sing fun returnNDArray(ndArray: NDArrayCore) { when(ndArray.type) { - DataType.BYTE -> ByteArrayStorageWrapper.returnNDArray(storage, ndArray as ByteNDArray) - DataType.SHORT -> ShortArrayStorageWrapper.returnNDArray(storage, ndArray as ShortNDArray) - DataType.INT -> IntArrayStorageWrapper.returnNDArray(storage, ndArray as IntNDArray) - DataType.LONG -> LongArrayStorageWrapper.returnNDArray(storage, ndArray as LongNDArray) + DataType.BYTE -> byteStorage.returnNDArray(ndArray as ByteNDArray) + DataType.SHORT -> shortStorage.returnNDArray(ndArray as ShortNDArray) + DataType.INT -> intStorage.returnNDArray(ndArray as IntNDArray) + DataType.LONG -> longStorage.returnNDArray(ndArray as LongNDArray) - DataType.UBYTE -> UByteArrayStorageWrapper.returnNDArray(storage, ndArray as UByteNDArray) - DataType.USHORT -> UShortArrayStorageWrapper.returnNDArray(storage, ndArray as UShortNDArray) - DataType.UINT -> UIntArrayStorageWrapper.returnNDArray(storage, ndArray as UIntNDArray) - DataType.ULONG -> ULongArrayStorageWrapper.returnNDArray(storage, ndArray as ULongNDArray) + DataType.UBYTE -> ubyteStorage.returnNDArray(ndArray as UByteNDArray) + DataType.USHORT -> ushortStorage.returnNDArray(ndArray as UShortNDArray) + DataType.UINT -> uintStorage.returnNDArray(ndArray as UIntNDArray) + DataType.ULONG -> ulongStorage.returnNDArray(ndArray as ULongNDArray) - DataType.FLOAT -> FloatArrayStorageWrapper.returnNDArray(storage, ndArray as FloatNDArray) - DataType.DOUBLE -> DoubleArrayStorageWrapper.returnNDArray(storage, ndArray as DoubleNDArray) + DataType.FLOAT -> floatStorage.returnNDArray(ndArray as FloatNDArray) + DataType.DOUBLE -> doubleStorage.returnNDArray(ndArray as DoubleNDArray) - DataType.BOOLEAN -> BooleanArrayStorageWrapper.returnNDArray(storage, ndArray as BooleanNDArray) + DataType.BOOLEAN -> booleanStorage.returnNDArray(ndArray as BooleanNDArray) else -> error("Unsupported array type") } } + +// fun getNDArray(dataType: DataType, 
strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore { +// return when(dataType) { +// DataType.BYTE -> ByteArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// DataType.SHORT -> ShortArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// DataType.INT -> IntArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// DataType.LONG -> LongArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// +// DataType.UBYTE -> UByteArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// DataType.USHORT -> UShortArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// DataType.UINT -> UIntArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// DataType.ULONG -> ULongArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// +// DataType.FLOAT -> FloatArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// DataType.DOUBLE -> DoubleArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// +// DataType.BOOLEAN -> BooleanArrayStorageWrapper.getNDArray(strides, storage, fillZeros) +// +// else -> error("Unsupported array type") +// } +// } +// +// fun returnNDArray(ndArray: NDArrayCore) { +// when(ndArray.type) { +// DataType.BYTE -> ByteArrayStorageWrapper.returnNDArray(storage, ndArray as ByteNDArray) +// DataType.SHORT -> ShortArrayStorageWrapper.returnNDArray(storage, ndArray as ShortNDArray) +// DataType.INT -> IntArrayStorageWrapper.returnNDArray(storage, ndArray as IntNDArray) +// DataType.LONG -> LongArrayStorageWrapper.returnNDArray(storage, ndArray as LongNDArray) +// +// DataType.UBYTE -> UByteArrayStorageWrapper.returnNDArray(storage, ndArray as UByteNDArray) +// DataType.USHORT -> UShortArrayStorageWrapper.returnNDArray(storage, ndArray as UShortNDArray) +// DataType.UINT -> UIntArrayStorageWrapper.returnNDArray(storage, ndArray as UIntNDArray) +// DataType.ULONG -> ULongArrayStorageWrapper.returnNDArray(storage, ndArray as ULongNDArray) +// +// DataType.FLOAT -> FloatArrayStorageWrapper.returnNDArray(storage, ndArray as FloatNDArray) +// DataType.DOUBLE -> DoubleArrayStorageWrapper.returnNDArray(storage, ndArray as DoubleNDArray) +// +// DataType.BOOLEAN -> BooleanArrayStorageWrapper.returnNDArray(storage, ndArray as BooleanNDArray) +// +// else -> error("Unsupported array type") +// } +// } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt index 52921ced8..e6c566938 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt @@ -5,14 +5,46 @@ import io.kinference.ndarray.arrays.* import io.kinference.ndarray.arrays.PrimitiveNDArray import io.kinference.ndarray.arrays.tiled.PrimitiveTiledArray import io.kinference.ndarray.blockSizeByStrides +import io.kinference.ndarray.extensions.constants.PrimitiveConstants import io.kinference.primitives.annotations.* import io.kinference.primitives.types.DataType import io.kinference.primitives.types.PrimitiveArray +import io.kinference.utils.inlines.InlineInt @GenerateNameFromPrimitives -internal object PrimitiveArrayStorageWrapper { +internal class PrimitiveArrayStorageWrapper { private val type = DataType.CurrentPrimitive + private val storage = HashMap>(2) + + fun getNDArray(strides: Strides, fillZeros: Boolean = 
false): MutablePrimitiveNDArray { + val blockSize = InlineInt(blockSizeByStrides(strides)) + val blocksNum = strides.linearSize / blockSize.value + + val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } + + val blocks = Array(blocksNum) { + val block = queue.removeFirstOrNull() + if (fillZeros) { + block?.fill(PrimitiveConstants.ZERO) + } + block ?: PrimitiveArray(blockSize.value) + } + + val tiled = PrimitiveTiledArray(blocks) + + return MutablePrimitiveNDArray(tiled, strides) + } + + fun returnNDArray(ndarray: PrimitiveNDArray) { + val blockSize = InlineInt(ndarray.array.blockSize) + val blocksNum = ndarray.array.blocksNum + + val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } + + queue.addAll(ndarray.array.blocks) + } + fun getNDArray(strides: Strides, storage: SingleArrayStorage, fillZeros: Boolean = false): MutablePrimitiveNDArray { val blockSize = blockSizeByStrides(strides) val blocksNum = strides.linearSize / blockSize From 954f6cc8b484217041577c8876c753993f6bdf57 Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Tue, 27 Aug 2024 17:09:15 +0200 Subject: [PATCH 07/19] JBAI-4393 [core, ndarray, utils] Major allocator refactoring Deleted obsolete memory storage and allocator context classes. Added new classes with improved manual and auto handling storage for arrays, enhancing memory management and performance. --- .../kotlin/io/kinference.core/KIEngine.kt | 31 ++-- .../kinference.core/data/tensor/KITensor.kt | 2 +- .../data/tensor/TensorExtensions.kt | 2 +- .../io/kinference.core/model/KIModel.kt | 54 ++----- .../operators/layer/attention/Attention.kt | 2 +- .../normalization/EmbedLayerNormalization.kt | 2 +- .../normalization/SkipLayerNormalization.kt | 2 +- .../io/kinference.core/operators/math/Add.kt | 2 +- .../operators/math/BiasGelu.kt | 2 +- .../kinference.core/operators/math/MatMul.kt | 2 +- .../kinference.core/operators/tensor/Cast.kt | 2 +- .../ndarray/arrays/memory/ArrayStorage.kt | 139 ------------------ .../arrays/memory/AutoAllocatorContext.kt | 23 --- .../arrays/memory/ManualAllocatorContext.kt | 110 -------------- .../{MemoryLimiter.kt => MemoryManager.kt} | 62 ++++---- .../arrays/memory/ModelArrayStorage.kt | 39 ----- .../memory/PredictionContextDispatcher.kt | 64 ++++++++ .../memory/PrimitiveArrayStorageWrapper.kt | 62 -------- .../memory/contexts/AutoAllocatorContext.kt | 17 +++ .../BaseAllocatorContextWithStorage.kt | 24 +++ .../memory/contexts/ManualAllocatorContext.kt | 23 +++ .../storage/AutoArrayHandlingStorage.kt | 34 +++++ .../storage/ManualArrayHandlingStorage.kt | 45 ++++++ .../PrimitiveAutoHandlingArrayStorage.kt | 47 ++++++ .../PrimitiveManualHandlingArrayStorage.kt | 56 +++++++ .../arrays/tiled/PrimitiveTiledArray.kt | 1 + .../io/kinference/utils/PredictionConfig.kt | 67 +++++++++ 27 files changed, 455 insertions(+), 461 deletions(-) delete mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt delete mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AutoAllocatorContext.kt delete mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt rename ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/{MemoryLimiter.kt => MemoryManager.kt} (60%) delete mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt create mode 100644 
ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt delete mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt create mode 100644 utils/utils-common/src/commonMain/kotlin/io/kinference/utils/PredictionConfig.kt diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt index dba47b43d..674bbaed5 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/KIEngine.kt @@ -10,14 +10,13 @@ import io.kinference.core.optimizer.rules.OptimizerRuleSet import io.kinference.data.ONNXData import io.kinference.data.ONNXDataType import io.kinference.model.IrOptimizableEngine -import io.kinference.ndarray.arrays.memory.MemoryLimiter -import io.kinference.ndarray.arrays.memory.MemoryLimiters import io.kinference.optimizer.GraphOptimizer import io.kinference.optimizer.OptimizerRule import io.kinference.protobuf.* import io.kinference.protobuf.message.* import io.kinference.utils.CommonDataLoader -import io.kinference.utils.PlatformUtils +import io.kinference.utils.PredictionConfig +import io.kinference.utils.PredictionConfigs import okio.Buffer import okio.Path import okio.Path.Companion.toPath @@ -41,24 +40,24 @@ object KIEngine : IrOptimizableEngine> { fun protoReader(bytes: ByteArray) = ProtobufReader(Buffer().write(bytes), KI_READER_CONFIG) - suspend fun loadModel(bytes: ByteArray, optimize: Boolean, memoryLimiter: MemoryLimiter, parallelismLimit: Int): KIModel { + suspend fun loadModel(bytes: ByteArray, optimize: Boolean, predictionConfig: PredictionConfig): KIModel { val rules = if (optimize) OptimizerRuleSet.DEFAULT_OPT_RULES else emptyList() - return loadModel(bytes, rules, memoryLimiter, parallelismLimit) + return loadModel(bytes, rules, predictionConfig) } override suspend fun loadModel(bytes: ByteArray, optimize: Boolean): KIModel { - return loadModel(bytes, optimize, MemoryLimiters.NoAllocator, PlatformUtils.cores) + return loadModel(bytes, optimize, PredictionConfigs.NoAllocator) } - override suspend fun loadModel(bytes: ByteArray, rules: List>>): KIModel = loadModel(bytes, rules, MemoryLimiters.NoAllocator, PlatformUtils.cores) + override suspend fun loadModel(bytes: ByteArray, rules: List>>): KIModel = loadModel(bytes, rules, PredictionConfigs.NoAllocator) - suspend fun loadModel(bytes: ByteArray, rules: List>>, memoryLimiter: MemoryLimiter, 
parallelismLimit: Int): KIModel { + suspend fun loadModel(bytes: ByteArray, rules: List>>, predictionConfig: PredictionConfig): KIModel { val modelScheme = ModelProto.decode(protoReader(bytes)) - val model = KIModel(modelScheme, memoryLimiter) + val model = KIModel(modelScheme, predictionConfig) return if (rules.isNotEmpty()) { val newGraph = GraphOptimizer(model.graph).run(rules) as KIGraph - KIModel(model.id, model.name, model.opSet, newGraph, memoryLimiter, parallelismLimit) + KIModel(model.id, model.name, model.opSet, newGraph, predictionConfig) } else { model } @@ -66,12 +65,12 @@ object KIEngine : IrOptimizableEngine> { override suspend fun loadModel(bytes: ByteArray): KIModel = loadModel(bytes, optimize = true) - suspend fun loadModel(path: Path, optimize: Boolean, memoryLimiter: MemoryLimiter, parallelismLimit: Int): KIModel { - return loadModel(CommonDataLoader.bytes(path), optimize, memoryLimiter, parallelismLimit) + suspend fun loadModel(path: Path, optimize: Boolean, predictionConfig: PredictionConfig): KIModel { + return loadModel(CommonDataLoader.bytes(path), optimize, predictionConfig) } override suspend fun loadModel(path: Path, optimize: Boolean): KIModel { - return loadModel(path, optimize, MemoryLimiters.NoAllocator, PlatformUtils.cores) + return loadModel(path, optimize, PredictionConfigs.NoAllocator) } override suspend fun loadModel(path: Path): KIModel = loadModel(path, optimize = true) @@ -80,12 +79,12 @@ object KIEngine : IrOptimizableEngine> { return loadModel(CommonDataLoader.bytes(path), rules) } - suspend fun loadModel(path: String, optimize: Boolean, memoryLimiter: MemoryLimiter, parallelismLimit: Int): KIModel { - return loadModel(CommonDataLoader.bytes(path.toPath()), optimize, memoryLimiter, parallelismLimit) + suspend fun loadModel(path: String, optimize: Boolean, predictionConfig: PredictionConfig): KIModel { + return loadModel(CommonDataLoader.bytes(path.toPath()), optimize, predictionConfig) } override suspend fun loadModel(path: String, optimize: Boolean): KIModel { - return loadModel(path, optimize, MemoryLimiters.NoAllocator, PlatformUtils.cores) + return loadModel(path, optimize, PredictionConfigs.NoAllocator) } override suspend fun loadModel(path: String): KIModel = loadModel(path, optimize = true) diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt index cdf96e0e1..dba23754f 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/KITensor.kt @@ -3,7 +3,7 @@ package io.kinference.core.data.tensor import io.kinference.core.* import io.kinference.data.ONNXTensor import io.kinference.ndarray.arrays.* -import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.arrays.tiled.* import io.kinference.protobuf.FLOAT_TENSOR_TYPES import io.kinference.protobuf.message.TensorProto diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt index b83e75c2e..f8e2daf19 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt +++ 
b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/data/tensor/TensorExtensions.kt @@ -1,7 +1,7 @@ package io.kinference.core.data.tensor import io.kinference.ndarray.arrays.* -import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.extensions.concat import io.kinference.ndarray.extensions.splitWithAxis import io.kinference.primitives.types.DataType diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt index 837d222da..6611fc1ce 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt @@ -5,6 +5,7 @@ import io.kinference.core.graph.KIGraph import io.kinference.graph.Contexts import io.kinference.model.Model import io.kinference.ndarray.arrays.memory.* +import io.kinference.ndarray.arrays.memory.contexts.finalizeAllocatorContext import io.kinference.operator.OperatorSetRegistry import io.kinference.profiler.* import io.kinference.protobuf.message.ModelProto @@ -17,14 +18,10 @@ class KIModel( val name: String, val opSet: OperatorSetRegistry, val graph: KIGraph, - private val memoryLimiter: MemoryLimiter = MemoryLimiters.NoAllocator, - parallelismLimit: Int = PlatformUtils.cores, + predictionConfig: PredictionConfig = PredictionConfigs.NoAllocator, ) : Model>, Profilable, Cacheable { private val profiles: MutableList = ArrayList() - - @OptIn(ExperimentalCoroutinesApi::class) - private val dispatcher: CoroutineDispatcher = Dispatchers.Default.limitedParallelism(parallelismLimit) - private val modelArrayStorage: ModelArrayStorage = ModelArrayStorage(MemoryLimiters.DefaultManualAllocator) + private val predictionContextDispatcher: PredictionContextDispatcher = PredictionContextDispatcher(predictionConfig) override fun addProfilingContext(name: String): ProfilingContext = ProfilingContext(name).apply { profiles.add(this) } override fun analyzeProfilingResults(): ProfileAnalysisEntry = profiles.analyze("Model $name") @@ -36,7 +33,6 @@ class KIModel( if (profile) addProfilingContext("Model $name") else null ) - val limiterContext = ParallelismLimiterContext(dispatcher) var coreReserved = false val results = try { withContext(NonCancellable) { @@ -44,33 +40,16 @@ class KIModel( coreReserved = true } - when (MemoryLimiters.DefaultManualAllocator) { - MemoryLimiters.NoAllocator -> { - withContext(limiterContext) { - return@withContext graph.execute(input, contexts) - } - } - MemoryLimiters.DefaultManualAllocator -> { - val allocatorContext = modelArrayStorage.createManualAllocatorContext() - val mixedContext = allocatorContext + limiterContext - - withContext(mixedContext) { - return@withContext graph.execute(input, contexts) - } - } - else -> { - val allocatorContext = modelArrayStorage.createAutoAllocatorContext() - val mixedContext = allocatorContext + limiterContext - - withContext(mixedContext) { - val coroutineContext = coroutineContext[AutoAllocatorContext.Key]!! 
- val execResult = graph.execute(input, contexts) - val copies = execResult.map { it.clone(it.name) }.toList() - coroutineContext.returnUsedArrays() - return@withContext copies - } - } + val predictionContext = predictionContextDispatcher.getPredictionContext() + val output = if (predictionContextDispatcher.allocationMode != AllocationMode.Auto) withContext(predictionContext) { + return@withContext graph.execute(input, contexts) + } else withContext(predictionContext) { + return@withContext graph.execute(input, contexts).map { it.clone(it.name) }.toList() } + + predictionContext.finalizeAllocatorContext() + predictionContextDispatcher.returnStorage(predictionContext) + output } finally { if (coreReserved) { ResourcesDispatcher.releaseCore() @@ -82,11 +61,11 @@ class KIModel( override suspend fun close() { graph.close() - modelArrayStorage.close() + predictionContextDispatcher.close() } override fun clearCache() { - modelArrayStorage.clearCache() + predictionContextDispatcher.clearCache() } companion object { @@ -96,14 +75,13 @@ class KIModel( suspend operator fun invoke( proto: ModelProto, - memoryLimiter: MemoryLimiter = MemoryLimiters.NoAllocator, - limiterParallelismCounter: Int = PlatformUtils.cores, + predictionConfig: PredictionConfig = PredictionConfigs.NoAllocator, ): KIModel { val name = "${proto.domain}:${proto.modelVersion}" val id = "$name:${generateModelId()}" val opSet = OperatorSetRegistry(proto.opSetImport) val graph = KIGraph(proto.graph!!, opSet) - return KIModel(id, name, opSet, graph, memoryLimiter, limiterParallelismCounter) + return KIModel(id, name, opSet, graph, predictionConfig) } } } diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt index 234639c96..a06b99080 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt @@ -7,7 +7,7 @@ import io.kinference.core.optimizer.rules.context.AttentionContextRule import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* -import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.arrays.pointers.map import io.kinference.ndarray.arrays.tiled.FloatTiledArray diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt index f2be9a212..33a01c6d3 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt @@ -5,7 +5,7 @@ import io.kinference.core.data.tensor.* import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* -import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.* import io.kinference.operator.* import 
io.kinference.primitives.types.DataType diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt index 75320199f..08b8e7f1a 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt @@ -7,7 +7,7 @@ import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.FloatNDArray import io.kinference.ndarray.arrays.MutableFloatNDArray -import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.* import io.kinference.operator.* import io.kinference.primitives.types.DataType diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt index b7a64397d..46596f4e1 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt @@ -6,7 +6,7 @@ import io.kinference.core.data.tensor.asTensor import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* -import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.operator.* import io.kinference.protobuf.message.TensorProto diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt index f2d8d01b3..c6b21a778 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt @@ -7,7 +7,7 @@ import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.MutableNumberNDArrayCore import io.kinference.ndarray.arrays.NumberNDArrayCore -import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.ndarray.extensions.gelu.biasGelu import io.kinference.operator.* diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt index e3baa2e4e..1d5608450 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt @@ -6,7 +6,7 @@ import io.kinference.core.data.tensor.asTensor import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* -import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import 
io.kinference.ndarray.broadcasting.Broadcasting import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.operator.* diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt index 742fd7c2d..d0bc9a56a 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt @@ -6,7 +6,7 @@ import io.kinference.core.data.tensor.asTensor import io.kinference.data.ONNXData import io.kinference.graph.Contexts import io.kinference.ndarray.arrays.* -import io.kinference.ndarray.arrays.memory.ManualAllocatorContext +import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.arrays.pointers.mapTo import io.kinference.ndarray.arrays.tiled.* import io.kinference.operator.* diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt deleted file mode 100644 index dcf704673..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ArrayStorage.kt +++ /dev/null @@ -1,139 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.primitives.types.DataType - -internal abstract class BaseArrayStorage(typeLength: Int, sizeLength: Int, storageCount: Int) { - /** - * This is a storage for arrays. - * - * Structure is as follows: - * 1. Array by predefined types (all types are known compiled time) - * 2. Array by size. - * Starting with 'INIT_SIZE_VALUE' element and grow it doubling (typically there are no more than 16 different sizes) - * 3. 
Queue of array containers (used as FIFO) - */ - protected var storage: Array>>> = - Array(storageCount) { Array(typeLength) { Array(sizeLength) { ArrayDeque() } } } - - private var sizeIndices: IntArray = IntArray(typeLength) - private var sizes: Array = Array(typeLength) { IntArray(sizeLength) } - - protected fun getSizeIndex(tIndex: Int, size: Int): Int { - val sIndex = sizes[tIndex].indexOf(size) - - return if (sIndex != -1) { - sIndex - } else { - if (sizeIndices[tIndex] >= storage[0][tIndex].size) - grow(tIndex) - - val idx = sizeIndices[tIndex]++ - sizes[tIndex][idx] = size - idx - } - } - - private fun grow(typeIndex: Int) { - val newSize = sizes[typeIndex].size * 2 - for (i in storage.indices) { - val newStorage: Array> = Array(newSize) { ArrayDeque() } - - for (j in storage[i][typeIndex].indices) { - newStorage[j] = storage[i][typeIndex][j] - } - - storage[i][typeIndex] = newStorage - } - - sizes[typeIndex] = sizes[typeIndex].copyOf(newSize) - } - - protected fun create(type: DataType, size: Int): Any { - return when (type) { - DataType.BYTE -> ByteArray(size) // 8-bit signed - DataType.SHORT -> ShortArray(size) // 16-bit signed - DataType.INT -> IntArray(size) // 32-bit signed - DataType.LONG -> LongArray(size) // 64-bit signed - - DataType.UBYTE -> UByteArray(size) // 8-bit unsigned - DataType.USHORT -> UShortArray(size) // 16-bit unsigned - DataType.UINT -> UIntArray(size) // 32-bit unsigned - DataType.ULONG -> ULongArray(size) // 64-bit unsigned - - DataType.FLOAT -> FloatArray(size) - DataType.DOUBLE -> DoubleArray(size) - - DataType.BOOLEAN -> BooleanArray(size) - else -> throw IllegalArgumentException("Unsupported array type") - } - } - - protected fun resetArray(array: Any): Unit = - when (array) { - is ByteArray -> array.fill(0) // 8-bit signed - is UByteArray -> array.fill(0u) // 8-bit unsigned - is ShortArray -> array.fill(0) // 16-bit signed - is UShortArray -> array.fill(0u) // 16-bit unsigned - is IntArray -> array.fill(0) // 32-bit signed - is UIntArray -> array.fill(0u) // 32-bit unsigned - is LongArray -> array.fill(0L) // 64-bit signed - is ULongArray -> array.fill(0U) // 64-bit unsigned - is FloatArray -> array.fill(0.0f) - is DoubleArray -> array.fill(0.0) - is BooleanArray -> array.fill(false) - else -> error("Unsupported array type") - } -} - -internal class SingleArrayStorage(typeLength: Int, sizeLength: Int, private val limiter: MemoryLimiter) : BaseArrayStorage(typeLength, sizeLength, 1) { - internal fun getArray(type: DataType, size: Int, fillZeros: Boolean = true): Any { - return if (limiter.checkMemoryLimitAndAdd(type, size)) { - val tIndex = type.ordinal - val sIndex = getSizeIndex(tIndex, size) - storage[0][tIndex][sIndex].removeFirstOrNull()?.takeIf { fillZeros }?.apply(::resetArray) ?: create(type, size) - } else { - create(type, size) - } - } - - internal fun returnArrays(type: DataType, size: Int, arrays: Array) { - val tIndex = type.ordinal - val sIndex = getSizeIndex(tIndex, size) - val queue = storage[0][tIndex][sIndex] - - queue.addAll(arrays) - } - - internal fun clear() { - storage[0].forEach { arraysBySize -> - arraysBySize.forEach { arrayDeque -> - arrayDeque.clear() - } - } - limiter.resetLimit() - } -} - -internal class ArrayStorage(typeLength: Int, sizeLength: Int, private val limiter: MemoryLimiter) : BaseArrayStorage(typeLength, sizeLength, 2) { - internal fun getArray(type: DataType, size: Int, fillZeros: Boolean = true): Any { - return if (limiter.checkMemoryLimitAndAdd(type, size)) { - val tIndex = type.ordinal - val sIndex = 
getSizeIndex(tIndex, size) - val array = storage[0][tIndex][sIndex].removeFirstOrNull()?.takeIf { fillZeros }?.apply(::resetArray) ?: create(type, size) - storage[1][tIndex][sIndex].add(array) - array - } else { - create(type, size) - } - } - - internal fun moveArrays() { - storage[1].forEachIndexed { typeIndex, arraysByType -> - arraysByType.forEachIndexed { sizeIndex, arrayDeque -> - storage[0][typeIndex][sizeIndex].addAll(arrayDeque) - arrayDeque.clear() - } - } - limiter.resetLimit() - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AutoAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AutoAllocatorContext.kt deleted file mode 100644 index a9255dd93..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/AutoAllocatorContext.kt +++ /dev/null @@ -1,23 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.ndarray.arrays.* -import io.kinference.primitives.types.DataType -import kotlin.coroutines.CoroutineContext - -data class AutoAllocatorContext internal constructor( - private val storage: ArrayStorage, - private val returnStorageFn: (ArrayStorage) -> Unit -) : CoroutineContext.Element { - - companion object Key : CoroutineContext.Key - override val key: CoroutineContext.Key<*> get() = Key - - internal fun getArrays(type: DataType, size: Int, count: Int): Array { - return Array(count) { storage.getArray(type, size) } - } - - fun returnUsedArrays() { - storage.moveArrays() - returnStorageFn(storage) - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt deleted file mode 100644 index 27f261fef..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ManualAllocatorContext.kt +++ /dev/null @@ -1,110 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.ndarray.arrays.* -import io.kinference.primitives.types.DataType -import kotlin.coroutines.CoroutineContext - -data class ManualAllocatorContext internal constructor(private val storage: SingleArrayStorage) : CoroutineContext.Element { - - internal val byteStorage = ByteArrayStorageWrapper() - internal val shortStorage = ShortArrayStorageWrapper() - internal val intStorage = IntArrayStorageWrapper() - internal val longStorage = LongArrayStorageWrapper() - - internal val ubyteStorage = UByteArrayStorageWrapper() - internal val ushortStorage = UShortArrayStorageWrapper() - internal val uintStorage = UIntArrayStorageWrapper() - internal val ulongStorage = ULongArrayStorageWrapper() - - internal val floatStorage = FloatArrayStorageWrapper() - internal val doubleStorage = DoubleArrayStorageWrapper() - - internal val booleanStorage = BooleanArrayStorageWrapper() - - companion object Key : CoroutineContext.Key - override val key: CoroutineContext.Key<*> get() = Key - - fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore { - return when(dataType) { - DataType.BYTE -> byteStorage.getNDArray(strides, fillZeros) - DataType.SHORT -> shortStorage.getNDArray(strides, fillZeros) - DataType.INT -> intStorage.getNDArray(strides, fillZeros) - DataType.LONG -> longStorage.getNDArray(strides, fillZeros) - - DataType.UBYTE -> ubyteStorage.getNDArray(strides, fillZeros) - DataType.USHORT -> ushortStorage.getNDArray(strides, fillZeros) - 
DataType.UINT -> uintStorage.getNDArray(strides, fillZeros) - DataType.ULONG -> ulongStorage.getNDArray(strides, fillZeros) - - DataType.FLOAT -> floatStorage.getNDArray(strides, fillZeros) - DataType.DOUBLE -> doubleStorage.getNDArray(strides, fillZeros) - - DataType.BOOLEAN -> booleanStorage.getNDArray(strides, fillZeros) - - else -> error("Unsupported array type") - } - } - - fun returnNDArray(ndArray: NDArrayCore) { - when(ndArray.type) { - DataType.BYTE -> byteStorage.returnNDArray(ndArray as ByteNDArray) - DataType.SHORT -> shortStorage.returnNDArray(ndArray as ShortNDArray) - DataType.INT -> intStorage.returnNDArray(ndArray as IntNDArray) - DataType.LONG -> longStorage.returnNDArray(ndArray as LongNDArray) - - DataType.UBYTE -> ubyteStorage.returnNDArray(ndArray as UByteNDArray) - DataType.USHORT -> ushortStorage.returnNDArray(ndArray as UShortNDArray) - DataType.UINT -> uintStorage.returnNDArray(ndArray as UIntNDArray) - DataType.ULONG -> ulongStorage.returnNDArray(ndArray as ULongNDArray) - - DataType.FLOAT -> floatStorage.returnNDArray(ndArray as FloatNDArray) - DataType.DOUBLE -> doubleStorage.returnNDArray(ndArray as DoubleNDArray) - - DataType.BOOLEAN -> booleanStorage.returnNDArray(ndArray as BooleanNDArray) - - else -> error("Unsupported array type") - } - } - -// fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore { -// return when(dataType) { -// DataType.BYTE -> ByteArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// DataType.SHORT -> ShortArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// DataType.INT -> IntArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// DataType.LONG -> LongArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// -// DataType.UBYTE -> UByteArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// DataType.USHORT -> UShortArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// DataType.UINT -> UIntArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// DataType.ULONG -> ULongArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// -// DataType.FLOAT -> FloatArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// DataType.DOUBLE -> DoubleArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// -// DataType.BOOLEAN -> BooleanArrayStorageWrapper.getNDArray(strides, storage, fillZeros) -// -// else -> error("Unsupported array type") -// } -// } -// -// fun returnNDArray(ndArray: NDArrayCore) { -// when(ndArray.type) { -// DataType.BYTE -> ByteArrayStorageWrapper.returnNDArray(storage, ndArray as ByteNDArray) -// DataType.SHORT -> ShortArrayStorageWrapper.returnNDArray(storage, ndArray as ShortNDArray) -// DataType.INT -> IntArrayStorageWrapper.returnNDArray(storage, ndArray as IntNDArray) -// DataType.LONG -> LongArrayStorageWrapper.returnNDArray(storage, ndArray as LongNDArray) -// -// DataType.UBYTE -> UByteArrayStorageWrapper.returnNDArray(storage, ndArray as UByteNDArray) -// DataType.USHORT -> UShortArrayStorageWrapper.returnNDArray(storage, ndArray as UShortNDArray) -// DataType.UINT -> UIntArrayStorageWrapper.returnNDArray(storage, ndArray as UIntNDArray) -// DataType.ULONG -> ULongArrayStorageWrapper.returnNDArray(storage, ndArray as ULongNDArray) -// -// DataType.FLOAT -> FloatArrayStorageWrapper.returnNDArray(storage, ndArray as FloatNDArray) -// DataType.DOUBLE -> DoubleArrayStorageWrapper.returnNDArray(storage, ndArray as DoubleNDArray) -// -// DataType.BOOLEAN -> 
BooleanArrayStorageWrapper.returnNDArray(storage, ndArray as BooleanNDArray) -// -// else -> error("Unsupported array type") -// } -// } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryManager.kt similarity index 60% rename from ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt rename to ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryManager.kt index 85ed03eb1..3dc575bf2 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryLimiter.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/MemoryManager.kt @@ -1,10 +1,15 @@ package io.kinference.ndarray.arrays.memory import io.kinference.primitives.types.DataType -import io.kinference.utils.PlatformUtils import kotlinx.atomicfu.* +import kotlinx.coroutines.* + +internal class MemoryManager internal constructor(private val memoryLimit: Long, private val cacheClearingInterval: Long, private val onCacheClear: () -> Unit) { + private var usedMemory: AtomicLong = atomic(0L) + private val lastAccessTime = atomic(System.currentTimeMillis()) + private val monitorJob: AtomicRef = atomic(initial = null) + private val isFinalized = atomic(initial = false) -interface MemoryLimiter { /** * Checks if the memory limit allows adding the specified amount of memory and performs the addition * @@ -12,18 +17,7 @@ interface MemoryLimiter { * @param size is the checking array size * @return true if the memory was added successfully and false if adding the memory exceeds the memory limit */ - fun checkMemoryLimitAndAdd(type: DataType, size: Int): Boolean - - /** - * Resets the used memory into 0L - */ - fun resetLimit() -} - -class BaseMemoryLimiter internal constructor(private val memoryLimit: Long) : MemoryLimiter { - private var usedMemory: AtomicLong = atomic(0L) - - override fun checkMemoryLimitAndAdd(type: DataType, size: Int): Boolean { + fun checkMemoryLimitAndAdd(type: DataType, size: Int): Boolean { // Attempt to add memory and check the limit val added = sizeInBytes(type.ordinal, size) val successful = usedMemory.getAndUpdate { current -> @@ -33,10 +27,38 @@ class BaseMemoryLimiter internal constructor(private val memoryLimit: Long) : Me return successful } - override fun resetLimit() { + /** + * Resets the used memory into 0L + */ + fun resetLimit() { usedMemory.value = 0L } + fun updateLastAccessTime() { + lastAccessTime.value = System.currentTimeMillis() + + // Start monitoring if not already started + if (monitorJob.compareAndSet(expect = null, update = null) && !isFinalized.value) { + val newJob = CoroutineScope(Dispatchers.Default).launch { + while (isActive) { + delay(cacheClearingInterval) + if (System.currentTimeMillis() - lastAccessTime.value > cacheClearingInterval) { + onCacheClear() + } + } + } + if (!monitorJob.compareAndSet(expect = null, newJob)) { + newJob.cancel() // Cancel if another thread set the job + } + } + } + + fun stopMonitoring() { + if (isFinalized.compareAndSet(expect = false, update = true)) { + monitorJob.getAndSet(value = null)?.cancel() + } + } + companion object { private val typeSizes: LongArray = LongArray(DataType.entries.size).apply { this[DataType.BYTE.ordinal] = Byte.SIZE_BYTES.toLong() @@ -60,13 +82,3 @@ class BaseMemoryLimiter internal constructor(private val memoryLimit: Long) : Me } } } - -object MemoryLimiters { - val 
DefaultAutoAllocator: MemoryLimiter = BaseMemoryLimiter((PlatformUtils.maxHeap * 0.3).toLong()) - val DefaultManualAllocator: MemoryLimiter = BaseMemoryLimiter(50 * 1024 * 1024) - val NoAllocator: MemoryLimiter = BaseMemoryLimiter(0L) - - fun customLimiter(memoryLimit: Long): MemoryLimiter { - return BaseMemoryLimiter(memoryLimit) - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt deleted file mode 100644 index 0135921cb..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/ModelArrayStorage.kt +++ /dev/null @@ -1,39 +0,0 @@ -package io.kinference.ndarray.arrays.memory - -import io.kinference.primitives.types.DataType -import io.kinference.utils.Closeable -import java.util.concurrent.ConcurrentLinkedQueue - -class ModelArrayStorage(private val limiter: MemoryLimiter = MemoryLimiters.NoAllocator) : Closeable { - private val autoStorageQueue: ConcurrentLinkedQueue = ConcurrentLinkedQueue() - - companion object { - private const val INIT_SIZE_VALUE: Int = 2 - private val typeSize: Int = DataType.entries.size - } - - fun createAutoAllocatorContext(): AutoAllocatorContext { - return AutoAllocatorContext(getStorage(autoStorageQueue), ::returnStorage) - } - - fun createManualAllocatorContext(): ManualAllocatorContext { - limiter.resetLimit() - return ManualAllocatorContext(SingleArrayStorage(typeSize, INIT_SIZE_VALUE, limiter)) - } - - fun clearCache() { - autoStorageQueue.clear() - } - - override suspend fun close() { - clearCache() - } - - private fun getStorage(queue: ConcurrentLinkedQueue): ArrayStorage { - return queue.poll() ?: ArrayStorage(typeSize, INIT_SIZE_VALUE, limiter) - } - - private fun returnStorage(storage: ArrayStorage) { - autoStorageQueue.offer(storage) - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt new file mode 100644 index 000000000..10a2c4bc4 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt @@ -0,0 +1,64 @@ +package io.kinference.ndarray.arrays.memory + +import io.kinference.ndarray.arrays.memory.contexts.* +import io.kinference.ndarray.arrays.memory.storage.* +import io.kinference.utils.* +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.ExperimentalCoroutinesApi +import java.util.concurrent.ConcurrentLinkedQueue +import kotlin.coroutines.CoroutineContext + +interface ArrayStorage { + fun resetState() +} + +class PredictionContextDispatcher(private val predictionConfig: PredictionConfig) : Closeable { + private val limiter: MemoryManager = MemoryManager( + memoryLimit = predictionConfig.memoryThreshold, + cacheClearingInterval = predictionConfig.memoryClearingInterval, + onCacheClear = ::clearCache) + + private val contextQueue: ConcurrentLinkedQueue = ConcurrentLinkedQueue() + val allocationMode + get() = predictionConfig.allocationMode + + fun getPredictionContext(): CoroutineContext { + val allocatorContext = when (predictionConfig.allocationMode) { + AllocationMode.NoAllocation -> getNoAllocatorContext() + AllocationMode.Manual -> getManualAllocatorContext() + AllocationMode.Auto -> getAutoAllocatorContext() + } + return allocatorContext + } + + @OptIn(ExperimentalCoroutinesApi::class) + 
private fun getNoAllocatorContext(): CoroutineContext { + return contextQueue.poll() ?: (NoAllocatorContext() + ParallelismLimiterContext(Dispatchers.Default.limitedParallelism(predictionConfig.parallelismLimit))) + } + + @OptIn(ExperimentalCoroutinesApi::class) + private fun getAutoAllocatorContext(): CoroutineContext { + limiter.updateLastAccessTime() + return contextQueue.poll() ?: (AutoAllocatorContext(AutoArrayHandlingStorage(limiter)) + ParallelismLimiterContext(Dispatchers.Default.limitedParallelism(predictionConfig.parallelismLimit))) + } + + @OptIn(ExperimentalCoroutinesApi::class) + private fun getManualAllocatorContext(): CoroutineContext { + limiter.updateLastAccessTime() + return contextQueue.poll() ?: (ManualAllocatorContext(ManualArrayHandlingStorage(limiter)) + ParallelismLimiterContext(Dispatchers.Default.limitedParallelism(predictionConfig.parallelismLimit))) + } + + fun clearCache() { + limiter.stopMonitoring() + contextQueue.clear() + limiter.resetLimit() + } + + override suspend fun close() { + clearCache() + } + + fun returnStorage(context: CoroutineContext) { + contextQueue.offer(context) + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt deleted file mode 100644 index e6c566938..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PrimitiveArrayStorageWrapper.kt +++ /dev/null @@ -1,62 +0,0 @@ -@file:GeneratePrimitives(DataType.ALL) -package io.kinference.ndarray.arrays.memory - -import io.kinference.ndarray.arrays.* -import io.kinference.ndarray.arrays.PrimitiveNDArray -import io.kinference.ndarray.arrays.tiled.PrimitiveTiledArray -import io.kinference.ndarray.blockSizeByStrides -import io.kinference.ndarray.extensions.constants.PrimitiveConstants -import io.kinference.primitives.annotations.* -import io.kinference.primitives.types.DataType -import io.kinference.primitives.types.PrimitiveArray -import io.kinference.utils.inlines.InlineInt - -@GenerateNameFromPrimitives -internal class PrimitiveArrayStorageWrapper { - private val type = DataType.CurrentPrimitive - - private val storage = HashMap>(2) - - fun getNDArray(strides: Strides, fillZeros: Boolean = false): MutablePrimitiveNDArray { - val blockSize = InlineInt(blockSizeByStrides(strides)) - val blocksNum = strides.linearSize / blockSize.value - - val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } - - val blocks = Array(blocksNum) { - val block = queue.removeFirstOrNull() - if (fillZeros) { - block?.fill(PrimitiveConstants.ZERO) - } - block ?: PrimitiveArray(blockSize.value) - } - - val tiled = PrimitiveTiledArray(blocks) - - return MutablePrimitiveNDArray(tiled, strides) - } - - fun returnNDArray(ndarray: PrimitiveNDArray) { - val blockSize = InlineInt(ndarray.array.blockSize) - val blocksNum = ndarray.array.blocksNum - - val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } - - queue.addAll(ndarray.array.blocks) - } - - fun getNDArray(strides: Strides, storage: SingleArrayStorage, fillZeros: Boolean = false): MutablePrimitiveNDArray { - val blockSize = blockSizeByStrides(strides) - val blocksNum = strides.linearSize / blockSize - val blocks = Array(blocksNum) { storage.getArray(type, blockSize, fillZeros) } - val typedBlocks = blocks.map { it as PrimitiveArray }.toTypedArray() - val tiled = PrimitiveTiledArray(typedBlocks) - - return 
MutablePrimitiveNDArray(tiled, strides) - } - - fun returnNDArray(storage: SingleArrayStorage, ndarray: PrimitiveNDArray) { - val blockSize = ndarray.array.blockSize - storage.returnArrays(type, blockSize, ndarray.array.blocks as Array) - } -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt new file mode 100644 index 000000000..05f7063c0 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt @@ -0,0 +1,17 @@ +package io.kinference.ndarray.arrays.memory.contexts + +import io.kinference.ndarray.arrays.memory.storage.AutoArrayHandlingStorage +import io.kinference.primitives.types.DataType +import kotlin.coroutines.* + +internal class AutoAllocatorContext internal constructor( + storage: AutoArrayHandlingStorage, +) : BaseAllocatorContextWithStorage(storage) { + + companion object Key : CoroutineContext.Key + override val key: CoroutineContext.Key<*> get() = Key + + internal fun getArrays(type: DataType, size: Int, count: Int): Array { + return storage.getArrays(type, size, count) + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt new file mode 100644 index 000000000..f98e96649 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt @@ -0,0 +1,24 @@ +package io.kinference.ndarray.arrays.memory.contexts + +import io.kinference.ndarray.arrays.memory.ArrayStorage +import kotlin.coroutines.CoroutineContext + +interface BaseAllocatorContext: CoroutineContext.Element + +abstract class BaseAllocatorContextWithStorage(protected val storage: T) : BaseAllocatorContext { + fun finalizeContext() { + storage.resetState() + } +} + +fun CoroutineContext.finalizeAllocatorContext() { + this.fold(Unit) { _, context -> + if (context is BaseAllocatorContextWithStorage<*>) + context.finalizeContext() + } +} + +class NoAllocatorContext : BaseAllocatorContext { + companion object Key : CoroutineContext.Key + override val key: CoroutineContext.Key<*> get() = Key +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt new file mode 100644 index 000000000..a713f31fe --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt @@ -0,0 +1,23 @@ +package io.kinference.ndarray.arrays.memory.contexts + +import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.storage.ManualArrayHandlingStorage +import io.kinference.ndarray.arrays.memory.storage.ManualStorage +import io.kinference.primitives.types.DataType +import kotlin.coroutines.CoroutineContext + +class ManualAllocatorContext internal constructor( + storage: ManualArrayHandlingStorage, +) : BaseAllocatorContextWithStorage(storage) { + + companion object Key : CoroutineContext.Key + override val key: CoroutineContext.Key<*> get() = Key + + fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore { + return 
storage.getNDArray(dataType, strides, fillZeros) + } + + fun returnNDArray(ndArray: NDArrayCore) { + storage.returnNDArray(ndArray) + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt new file mode 100644 index 000000000..030beac56 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt @@ -0,0 +1,34 @@ +package io.kinference.ndarray.arrays.memory.storage + +import io.kinference.ndarray.arrays.memory.* +import io.kinference.primitives.types.DataType + +internal interface TypedAutoHandlingStorage { + fun getBlock(blocksNum: Int, blockSize: Int, limiter: MemoryManager): Array + fun moveBlocksIntoUnused() +} + +internal class AutoArrayHandlingStorage(private val limiter: MemoryManager) : ArrayStorage { + private val storage: List = listOf( + ByteAutoHandlingArrayStorage(), + ShortAutoHandlingArrayStorage(), + IntAutoHandlingArrayStorage(), + LongAutoHandlingArrayStorage(), + UByteAutoHandlingArrayStorage(), + UShortAutoHandlingArrayStorage(), + UIntAutoHandlingArrayStorage(), + ULongAutoHandlingArrayStorage(), + FloatAutoHandlingArrayStorage(), + DoubleAutoHandlingArrayStorage(), + BooleanAutoHandlingArrayStorage() + ) + + internal fun getArrays(type: DataType, size: Int, count: Int): Array { + return storage[type.ordinal].getBlock(blocksNum = count, blockSize = size, limiter = limiter) + } + + override fun resetState() { + storage.forEach { it.moveBlocksIntoUnused() } + limiter.resetLimit() + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt new file mode 100644 index 000000000..0631056b3 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt @@ -0,0 +1,45 @@ +package io.kinference.ndarray.arrays.memory.storage + +import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.memory.* +import io.kinference.primitives.types.DataType + +internal interface TypedManualHandlingStorage { + fun getNDArray(strides: Strides, fillZeros: Boolean = false, limiter: MemoryManager): MutableNDArrayCore + fun returnNDArray(ndarray: NDArrayCore) + fun clear() +} + +interface ManualStorage : ArrayStorage { + fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore + fun returnNDArray(ndArray: NDArrayCore) +} + +internal class ManualArrayHandlingStorage(private val memoryManager: MemoryManager) : ManualStorage { + private val storage: List = listOf( + ByteManualHandlingArrayStorage(), + ShortManualHandlingArrayStorage(), + IntManualHandlingArrayStorage(), + LongManualHandlingArrayStorage(), + UByteManualHandlingArrayStorage(), + UShortManualHandlingArrayStorage(), + UIntManualHandlingArrayStorage(), + ULongManualHandlingArrayStorage(), + FloatManualHandlingArrayStorage(), + DoubleManualHandlingArrayStorage(), + BooleanManualHandlingArrayStorage() + ) + + override fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean): MutableNDArrayCore { + return storage[dataType.ordinal].getNDArray(strides, fillZeros, memoryManager) + } + + override fun returnNDArray(ndArray: NDArrayCore) { + 
storage[ndArray.type.ordinal].returnNDArray(ndArray) + } + + override fun resetState() { + storage.forEach { it.clear() } + memoryManager.resetLimit() + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt new file mode 100644 index 000000000..0d9d4e7da --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt @@ -0,0 +1,47 @@ +@file:GeneratePrimitives(DataType.ALL) +package io.kinference.ndarray.arrays.memory.storage + +import io.kinference.ndarray.arrays.memory.MemoryManager +import io.kinference.ndarray.extensions.constants.PrimitiveConstants +import io.kinference.primitives.annotations.GenerateNameFromPrimitives +import io.kinference.primitives.annotations.GeneratePrimitives +import io.kinference.primitives.types.DataType +import io.kinference.primitives.types.PrimitiveArray + +@GenerateNameFromPrimitives +internal class PrimitiveAutoHandlingArrayStorage : TypedAutoHandlingStorage { + private val used = HashMap>(8) + private val unused = HashMap>(8) + + companion object { + private val type = DataType.CurrentPrimitive + } + + override fun getBlock(blocksNum: Int, blockSize: Int, limiter: MemoryManager): Array { + val unusedQueue = unused.getOrPut(blockSize) { ArrayDeque(blocksNum) } + val usedQueue = used.getOrPut(blockSize) { ArrayDeque(blocksNum) } + + val blocks = if (limiter.checkMemoryLimitAndAdd(type, blockSize * blocksNum)) { + Array(blocksNum) { + val block = unusedQueue.removeFirstOrNull() + block?.fill(PrimitiveConstants.ZERO) + block ?: PrimitiveArray(blockSize) + } + } else { + Array(blocksNum) { + PrimitiveArray(blockSize) + } + } + + usedQueue.addAll(blocks) + + return blocks as Array + } + + override fun moveBlocksIntoUnused() { + used.forEach { (blockSize, queue) -> + unused[blockSize]!!.addAll(queue) + queue.clear() + } + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt new file mode 100644 index 000000000..821438a97 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt @@ -0,0 +1,56 @@ +@file:GeneratePrimitives(DataType.ALL) +package io.kinference.ndarray.arrays.memory.storage + +import io.kinference.ndarray.arrays.* +import io.kinference.ndarray.arrays.PrimitiveNDArray +import io.kinference.ndarray.arrays.memory.MemoryManager +import io.kinference.ndarray.arrays.tiled.PrimitiveTiledArray +import io.kinference.ndarray.blockSizeByStrides +import io.kinference.ndarray.extensions.constants.PrimitiveConstants +import io.kinference.primitives.annotations.* +import io.kinference.primitives.types.DataType +import io.kinference.primitives.types.PrimitiveArray + +@GenerateNameFromPrimitives +internal class PrimitiveManualHandlingArrayStorage : TypedManualHandlingStorage { + private val storage = HashMap>(8) + + companion object { + private val type = DataType.CurrentPrimitive + } + + override fun getNDArray(strides: Strides, fillZeros: Boolean, limiter: MemoryManager): MutableNDArrayCore { + val blockSize = blockSizeByStrides(strides) + val blocksNum = strides.linearSize / blockSize + val 
blocks = if (limiter.checkMemoryLimitAndAdd(type, blockSize * blocksNum)) { + val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } + Array(blocksNum) { + val block = queue.removeFirstOrNull() + if (fillZeros) { + block?.fill(PrimitiveConstants.ZERO) + } + block ?: PrimitiveArray(blockSize) + } + } else { + Array(blocksNum) { PrimitiveArray(blockSize) } + } + + val tiled = PrimitiveTiledArray(blocks) + + return MutablePrimitiveNDArray(tiled, strides) + } + + override fun returnNDArray(ndarray: NDArrayCore) { + require(ndarray is PrimitiveNDArray) + val blockSize = ndarray.array.blockSize + val blocksNum = ndarray.array.blocksNum + + val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } + + queue.addAll(ndarray.array.blocks) + } + + override fun clear() { + storage.forEach { (_, queue) -> queue.clear() } + } +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt index 4469e9d4e..db442f977 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt @@ -5,6 +5,7 @@ package io.kinference.ndarray.arrays.tiled import io.kinference.ndarray.arrays.* import io.kinference.ndarray.arrays.memory.* +import io.kinference.ndarray.arrays.memory.contexts.AutoAllocatorContext import io.kinference.ndarray.arrays.pointers.PrimitivePointer import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.blockSizeByStrides diff --git a/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/PredictionConfig.kt b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/PredictionConfig.kt new file mode 100644 index 000000000..1828b36bd --- /dev/null +++ b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/PredictionConfig.kt @@ -0,0 +1,67 @@ +package io.kinference.utils + +enum class AllocationMode { + NoAllocation, + Manual, + Auto; +} + +class PredictionConfig private constructor( + val parallelismLimit: Int, + val allocationMode: AllocationMode, + val memoryThreshold: Long, + val memoryClearingInterval: Long +) { + companion object { + operator fun invoke( + parallelismLimit: Int, + allocationMode: AllocationMode, + memoryThreshold: Long, + memoryClearingInterval: Long + ): PredictionConfig { + require(parallelismLimit in 1..PlatformUtils.cores) { + "Parallelism limit must be within the range of 1 to available CPU cores: ${PlatformUtils.cores}." + } + return if (allocationMode == AllocationMode.NoAllocation) { + PredictionConfig(parallelismLimit, allocationMode, 0L, Long.MAX_VALUE) + } else { + require(memoryThreshold > 0) { + "Memory threshold must be positive." + } + require(memoryClearingInterval > 0) { + "Memory clearing interval must be positive." 
+ } + + PredictionConfig(parallelismLimit, allocationMode, memoryThreshold, memoryClearingInterval) + } + } + } +} + +object PredictionConfigs { + val DefaultAutoAllocator: PredictionConfig = PredictionConfig( + parallelismLimit = PlatformUtils.cores, + allocationMode = AllocationMode.Auto, + memoryThreshold = (PlatformUtils.maxHeap * 0.3).toLong(), + memoryClearingInterval = 500 + ) + val DefaultManualAllocator: PredictionConfig = PredictionConfig( + parallelismLimit = PlatformUtils.cores, + allocationMode = AllocationMode.Manual, + memoryThreshold = 50 * 1024 * 1024, + memoryClearingInterval = 500 + ) + val NoAllocator: PredictionConfig = PredictionConfig( + parallelismLimit = PlatformUtils.cores, + allocationMode = AllocationMode.NoAllocation, + memoryThreshold = 0L, + memoryClearingInterval = Long.MAX_VALUE + ) + + fun customPredictionConfig(parallelismLimit: Int, + allocationMode: AllocationMode, + memoryThreshold: Long, + memoryClearingInterval: Long): PredictionConfig { + return PredictionConfig(parallelismLimit, allocationMode, memoryThreshold, memoryClearingInterval) + } +} From b83f7f8ad20df6f909caed4789b92c91c64e1d0b Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Thu, 29 Aug 2024 13:44:48 +0200 Subject: [PATCH 08/19] JBAI-4393 [core, ndarray] Added getPrimitiveBlock extension functions for better primitive types handling: this solution gives less double primitive array allocations when Array changes to actual type. --- .../memory/contexts/AutoAllocatorContext.kt | 5 +---- .../BaseAllocatorContextWithStorage.kt | 2 +- .../memory/storage/AutoArrayHandlingStorage.kt | 10 ++-------- .../PrimitiveAutoHandlingArrayStorage.kt | 10 +++++----- .../PrimitiveGetBlockFunctionsExtension.kt | 18 ++++++++++++++++++ .../arrays/tiled/PrimitiveTiledArray.kt | 8 +++----- 6 files changed, 30 insertions(+), 23 deletions(-) create mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt index 05f7063c0..a4d36b555 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt @@ -2,6 +2,7 @@ package io.kinference.ndarray.arrays.memory.contexts import io.kinference.ndarray.arrays.memory.storage.AutoArrayHandlingStorage import io.kinference.primitives.types.DataType +import io.kinference.primitives.types.PrimitiveArray import kotlin.coroutines.* internal class AutoAllocatorContext internal constructor( @@ -10,8 +11,4 @@ internal class AutoAllocatorContext internal constructor( companion object Key : CoroutineContext.Key override val key: CoroutineContext.Key<*> get() = Key - - internal fun getArrays(type: DataType, size: Int, count: Int): Array { - return storage.getArrays(type, size, count) - } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt index f98e96649..e617c78de 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt +++ 
b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt @@ -5,7 +5,7 @@ import kotlin.coroutines.CoroutineContext interface BaseAllocatorContext: CoroutineContext.Element -abstract class BaseAllocatorContextWithStorage(protected val storage: T) : BaseAllocatorContext { +abstract class BaseAllocatorContextWithStorage(internal val storage: T) : BaseAllocatorContext { fun finalizeContext() { storage.resetState() } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt index 030beac56..803fe4416 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt @@ -1,15 +1,13 @@ package io.kinference.ndarray.arrays.memory.storage import io.kinference.ndarray.arrays.memory.* -import io.kinference.primitives.types.DataType internal interface TypedAutoHandlingStorage { - fun getBlock(blocksNum: Int, blockSize: Int, limiter: MemoryManager): Array fun moveBlocksIntoUnused() } -internal class AutoArrayHandlingStorage(private val limiter: MemoryManager) : ArrayStorage { - private val storage: List = listOf( +internal class AutoArrayHandlingStorage(internal val limiter: MemoryManager) : ArrayStorage { + internal val storage: List = listOf( ByteAutoHandlingArrayStorage(), ShortAutoHandlingArrayStorage(), IntAutoHandlingArrayStorage(), @@ -23,10 +21,6 @@ internal class AutoArrayHandlingStorage(private val limiter: MemoryManager) : Ar BooleanAutoHandlingArrayStorage() ) - internal fun getArrays(type: DataType, size: Int, count: Int): Array { - return storage[type.ordinal].getBlock(blocksNum = count, blockSize = size, limiter = limiter) - } - override fun resetState() { storage.forEach { it.moveBlocksIntoUnused() } limiter.resetLimit() diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt index 0d9d4e7da..71ad40341 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt @@ -17,15 +17,15 @@ internal class PrimitiveAutoHandlingArrayStorage : TypedAutoHandlingStorage { private val type = DataType.CurrentPrimitive } - override fun getBlock(blocksNum: Int, blockSize: Int, limiter: MemoryManager): Array { + fun getBlock(blocksNum: Int, blockSize: Int, limiter: MemoryManager): Array { val unusedQueue = unused.getOrPut(blockSize) { ArrayDeque(blocksNum) } val usedQueue = used.getOrPut(blockSize) { ArrayDeque(blocksNum) } val blocks = if (limiter.checkMemoryLimitAndAdd(type, blockSize * blocksNum)) { Array(blocksNum) { - val block = unusedQueue.removeFirstOrNull() - block?.fill(PrimitiveConstants.ZERO) - block ?: PrimitiveArray(blockSize) + unusedQueue.removeFirstOrNull()?.apply { + fill(PrimitiveConstants.ZERO) + } ?: PrimitiveArray(blockSize) } } else { Array(blocksNum) { @@ -35,7 +35,7 @@ internal class PrimitiveAutoHandlingArrayStorage : TypedAutoHandlingStorage { usedQueue.addAll(blocks) - 
return blocks as Array + return blocks } override fun moveBlocksIntoUnused() { diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt new file mode 100644 index 000000000..6bb61d9c8 --- /dev/null +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt @@ -0,0 +1,18 @@ +@file:GeneratePrimitives(DataType.ALL) +@file:Suppress("DuplicatedCode") +package io.kinference.ndarray.arrays.memory.storage + +import io.kinference.ndarray.arrays.memory.contexts.AutoAllocatorContext +import io.kinference.primitives.annotations.GenerateNameFromPrimitives +import io.kinference.primitives.annotations.GeneratePrimitives +import io.kinference.primitives.types.* + +@GenerateNameFromPrimitives +internal fun AutoArrayHandlingStorage.getPrimitiveBlock(blocksNum: Int, blockSize: Int): Array { + return (storage[DataType.CurrentPrimitive.ordinal] as PrimitiveAutoHandlingArrayStorage).getBlock(blocksNum = blocksNum, blockSize = blockSize, limiter = limiter) +} + +@GenerateNameFromPrimitives +internal fun AutoAllocatorContext.getPrimitiveBlock(blocksNum: Int, blockSize: Int): Array { + return storage.getPrimitiveBlock(blocksNum = blocksNum, blockSize = blockSize) +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt index db442f977..600211e3b 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt @@ -4,8 +4,8 @@ package io.kinference.ndarray.arrays.tiled import io.kinference.ndarray.arrays.* -import io.kinference.ndarray.arrays.memory.* import io.kinference.ndarray.arrays.memory.contexts.AutoAllocatorContext +import io.kinference.ndarray.arrays.memory.storage.* import io.kinference.ndarray.arrays.pointers.PrimitivePointer import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.blockSizeByStrides @@ -59,11 +59,9 @@ internal class PrimitiveTiledArray(val blocks: Array) { require(size % blockSize == 0) { "Size must divide blockSize" } val blocksNum = if (blockSize == 0) 0 else size / blockSize + val blocks = coroutineContext[AutoAllocatorContext.Key]?.getPrimitiveBlock(blocksNum, blockSize) ?: Array(blocksNum) { PrimitiveArray(blockSize) } - val coroutineContext = coroutineContext[AutoAllocatorContext.Key] - val blocks = coroutineContext?.getArrays(type, blockSize, blocksNum) ?: Array(blocksNum) { PrimitiveArray(blockSize) } - - return PrimitiveTiledArray(blocks.map { it as PrimitiveArray }.toTypedArray()) + return PrimitiveTiledArray(blocks) } suspend operator fun invoke(size: Int, blockSize: Int, init: (InlineInt) -> PrimitiveType) : PrimitiveTiledArray { From 450a39e712c2fb343e08ff6d8ac2888d04f0c18f Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Thu, 29 Aug 2024 16:44:28 +0200 Subject: [PATCH 09/19] JBAI-4393 [ndarray] Added Fastutil support for more efficient primitive handling in primitive array storage classes. 
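For readers unfamiliar with fastutil: this patch swaps the HashMap-based pools keyed by block size for it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap, whose primitive int keys avoid Integer boxing on every lookup of a block-size bucket. The sketch below only illustrates that idea under assumed names (FloatBlockPool, borrow, recycle are not part of the patch; the real storages are the generated Primitive*HandlingArrayStorage classes, which additionally route lookups through a project-local getOrPut helper):

    import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap

    // Illustrative block pool keyed by block size; int keys stay unboxed.
    class FloatBlockPool(expectedSizes: Int = 64) {
        private val pools = Int2ObjectOpenHashMap<ArrayDeque<FloatArray>>(expectedSizes)

        // Reuse a cached block of the requested size, or allocate a fresh one.
        fun borrow(blockSize: Int): FloatArray {
            val queue = pools.get(blockSize)
                ?: ArrayDeque<FloatArray>().also { pools.put(blockSize, it) }
            return queue.removeFirstOrNull() ?: FloatArray(blockSize)
        }

        // Return a block so later borrows of the same size can reuse it.
        fun recycle(block: FloatArray) {
            val queue = pools.get(block.size)
                ?: ArrayDeque<FloatArray>().also { pools.put(block.size, it) }
            queue.addLast(block)
        }
    }

The design point is that the hot path of the allocator touches these maps once per block batch, so removing per-lookup key boxing (and the resulting garbage) is what motivates the fastutil-core dependency added below.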
--- gradle/libs.versions.toml | 2 ++ ndarray/ndarray-core/build.gradle.kts | 1 + .../jvmMain/kotlin/io/kinference/ndarray/Utils.kt | 2 ++ .../storage/PrimitiveAutoHandlingArrayStorage.kt | 15 ++++++++------- .../PrimitiveManualHandlingArrayStorage.kt | 15 ++++++++------- .../kinference/ndarray/extensions/utils/Utils.kt | 13 +++++++++++++ 6 files changed, 34 insertions(+), 14 deletions(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 71df4deb4..bf86867b5 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -11,6 +11,7 @@ okio = "3.6.0" onnxruntime = "1.17.0.patched-1" slf4j = "2.0.9" wire = "4.9.3" +fastutil = "8.5.14" # JS Dependencies loglevel = "1.8.1" @@ -36,3 +37,4 @@ onnxruntime-gpu = { module = "com.microsoft.onnxruntime:onnxruntime_gpu", versio slf4j-api = { module = "org.slf4j:slf4j-api", version.ref = "slf4j" } slf4j-simple = { module = "org.slf4j:slf4j-simple", version.ref = "slf4j" } wire-runtime = { module = "com.squareup.wire:wire-runtime", version.ref = "wire" } +fastutil-core = { module = "it.unimi.dsi:fastutil-core", version.ref = "fastutil" } diff --git a/ndarray/ndarray-core/build.gradle.kts b/ndarray/ndarray-core/build.gradle.kts index f68fa0d08..96d59fb66 100644 --- a/ndarray/ndarray-core/build.gradle.kts +++ b/ndarray/ndarray-core/build.gradle.kts @@ -17,6 +17,7 @@ kotlin { implementation(libs.kotlinx.coroutines.core) implementation(libs.kotlinx.atomicfu) api(libs.apache.commons.math4.core) + api(libs.fastutil.core) } } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt index c1af61364..3869f6162 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt @@ -73,6 +73,8 @@ const val ERF_COEF_3 = 1.421413741 const val ERF_COEF_4 = -1.453152027 const val ERF_COEF_5 = 1.061405429 +const val INIT_STORAGE_SIZE = 64 + internal fun IntArray.swap(leftIdx: Int, rightIdx: Int) { val temp = get(leftIdx) this[leftIdx] = this[rightIdx] diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt index 71ad40341..aca7fd13f 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt @@ -1,36 +1,37 @@ @file:GeneratePrimitives(DataType.ALL) package io.kinference.ndarray.arrays.memory.storage +import io.kinference.ndarray.INIT_STORAGE_SIZE import io.kinference.ndarray.arrays.memory.MemoryManager import io.kinference.ndarray.extensions.constants.PrimitiveConstants +import io.kinference.ndarray.extensions.utils.getOrPut import io.kinference.primitives.annotations.GenerateNameFromPrimitives import io.kinference.primitives.annotations.GeneratePrimitives import io.kinference.primitives.types.DataType import io.kinference.primitives.types.PrimitiveArray +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap @GenerateNameFromPrimitives internal class PrimitiveAutoHandlingArrayStorage : TypedAutoHandlingStorage { - private val used = HashMap>(8) - private val unused = HashMap>(8) + private val used = Int2ObjectOpenHashMap>(INIT_STORAGE_SIZE) + private 
val unused = Int2ObjectOpenHashMap>(INIT_STORAGE_SIZE) companion object { private val type = DataType.CurrentPrimitive } - fun getBlock(blocksNum: Int, blockSize: Int, limiter: MemoryManager): Array { + internal fun getBlock(blocksNum: Int, blockSize: Int, limiter: MemoryManager): Array { val unusedQueue = unused.getOrPut(blockSize) { ArrayDeque(blocksNum) } val usedQueue = used.getOrPut(blockSize) { ArrayDeque(blocksNum) } - val blocks = if (limiter.checkMemoryLimitAndAdd(type, blockSize * blocksNum)) { + val blocks = if (limiter.checkMemoryLimitAndAdd(type, size = blockSize * blocksNum)) { Array(blocksNum) { unusedQueue.removeFirstOrNull()?.apply { fill(PrimitiveConstants.ZERO) } ?: PrimitiveArray(blockSize) } } else { - Array(blocksNum) { - PrimitiveArray(blockSize) - } + Array(blocksNum) { PrimitiveArray(blockSize) } } usedQueue.addAll(blocks) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt index 821438a97..7a71e16c9 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt @@ -1,19 +1,22 @@ @file:GeneratePrimitives(DataType.ALL) package io.kinference.ndarray.arrays.memory.storage +import io.kinference.ndarray.INIT_STORAGE_SIZE import io.kinference.ndarray.arrays.* import io.kinference.ndarray.arrays.PrimitiveNDArray import io.kinference.ndarray.arrays.memory.MemoryManager import io.kinference.ndarray.arrays.tiled.PrimitiveTiledArray import io.kinference.ndarray.blockSizeByStrides import io.kinference.ndarray.extensions.constants.PrimitiveConstants +import io.kinference.ndarray.extensions.utils.getOrPut import io.kinference.primitives.annotations.* import io.kinference.primitives.types.DataType import io.kinference.primitives.types.PrimitiveArray +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap @GenerateNameFromPrimitives internal class PrimitiveManualHandlingArrayStorage : TypedManualHandlingStorage { - private val storage = HashMap>(8) + private val storage = Int2ObjectOpenHashMap>(INIT_STORAGE_SIZE) companion object { private val type = DataType.CurrentPrimitive @@ -22,14 +25,12 @@ internal class PrimitiveManualHandlingArrayStorage : TypedManualHandlingStorage override fun getNDArray(strides: Strides, fillZeros: Boolean, limiter: MemoryManager): MutableNDArrayCore { val blockSize = blockSizeByStrides(strides) val blocksNum = strides.linearSize / blockSize - val blocks = if (limiter.checkMemoryLimitAndAdd(type, blockSize * blocksNum)) { + val blocks = if (limiter.checkMemoryLimitAndAdd(type, size = blockSize * blocksNum)) { val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } Array(blocksNum) { - val block = queue.removeFirstOrNull() - if (fillZeros) { - block?.fill(PrimitiveConstants.ZERO) - } - block ?: PrimitiveArray(blockSize) + queue.removeFirstOrNull()?.apply { + fill(PrimitiveConstants.ZERO) + } ?: PrimitiveArray(blockSize) } } else { Array(blocksNum) { PrimitiveArray(blockSize) } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/utils/Utils.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/utils/Utils.kt index 8c4e18063..fec73c0f9 100644 --- 
a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/utils/Utils.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/utils/Utils.kt @@ -1,5 +1,7 @@ package io.kinference.ndarray.extensions.utils +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap + /*** * Calculates the total size of the tensor with such shape. */ @@ -50,3 +52,14 @@ internal fun computeColumnMajorIndex( internal fun isInPadding(actual: Int, bound: Int) : Boolean { return actual < 0 || actual >= bound } + +inline fun Int2ObjectOpenHashMap.getOrPut(key: Int, defaultValue: () -> V): V { + val existingValue = this[key] + return if (existingValue != null) { + existingValue + } else { + val value = defaultValue() + put(key, value) + value + } +} From d25ecfff8b64b03e9ce835548a4f66b733c92f18 Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Thu, 29 Aug 2024 18:11:02 +0200 Subject: [PATCH 10/19] JBAI-4393 [buildSrc] Configured JVM benchmark tests to disable coroutines debug mode. --- buildSrc/src/main/kotlin/io/kinference/gradle/JVMTestTasks.kt | 1 + 1 file changed, 1 insertion(+) diff --git a/buildSrc/src/main/kotlin/io/kinference/gradle/JVMTestTasks.kt b/buildSrc/src/main/kotlin/io/kinference/gradle/JVMTestTasks.kt index 250843f1c..c9c51d46a 100644 --- a/buildSrc/src/main/kotlin/io/kinference/gradle/JVMTestTasks.kt +++ b/buildSrc/src/main/kotlin/io/kinference/gradle/JVMTestTasks.kt @@ -56,6 +56,7 @@ fun KotlinJvmTarget.configureBenchmarkTests() { group = "verification" maxHeapSize = "4G" + systemProperty("kotlinx.coroutines.debug", "off") useJUnitPlatform() From 8dfd6eb5750605513a3b4376f6bd4ea5ab6f3c6e Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Fri, 30 Aug 2024 13:47:25 +0200 Subject: [PATCH 11/19] Fixed broadcasting shape logic in matrix multiplication for 1D. --- .../kotlin/io/kinference/ndarray/broadcasting/Broadcasting.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/broadcasting/Broadcasting.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/broadcasting/Broadcasting.kt index bde7580d4..7e2e18551 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/broadcasting/Broadcasting.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/broadcasting/Broadcasting.kt @@ -17,7 +17,7 @@ fun unsqueezeFirst(shape: IntArray, newShapeSize: Int): IntArray { object Broadcasting { fun broadcastShapeForMatmul(leftShape: IntArray, rightShape: IntArray): IntArray { val actualLeftShape = if (leftShape.size == 1) intArrayOf(1, leftShape[0]) else leftShape - val actualRightShape = if (rightShape.size == 1) intArrayOf(1, rightShape[1]) else rightShape + val actualRightShape = if (rightShape.size == 1) intArrayOf(rightShape[0], 1) else rightShape val outputMatrixShape = intArrayOf(actualLeftShape[actualLeftShape.lastIndex - 1], actualRightShape.last()) val broadcastShape = broadcastShape(listOf(actualLeftShape.copyOfRange(0, actualLeftShape.size - 2), From a19fc9ce083c7831a6bf6ed5ad9134edb2d36ca6 Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Mon, 2 Sep 2024 10:26:05 +0200 Subject: [PATCH 12/19] JBAI-4393 [core, ndarray] Streamlined memory size calculations using constants, removed commented-out and redundant code. 
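Reviewer note: the memory limit is now tracked purely in bytes, with each primitive storage multiplying its element count by a compile-time SIZE_BYTES constant instead of consulting a per-DataType lookup table. Below is a minimal sketch of that byte-budget idea using a plain AtomicLong; `ByteBudget`, `tryReserve` and the numbers are illustrative and are not this patch's MemoryManager API (which is atomicfu-based and uses the generated PrimitiveConstants.SIZE_BYTES).

    import java.util.concurrent.atomic.AtomicLong

    // Illustrative limiter: every reservation is expressed directly in bytes,
    // so callers multiply element count by a per-type SIZE_BYTES constant
    // (e.g. Float.SIZE_BYTES) instead of going through a DataType lookup table.
    class ByteBudget(private val limitBytes: Long) {
        private val used = AtomicLong(0L)

        // Reserves the bytes and returns true if the budget allows it;
        // otherwise leaves the counter untouched and returns false.
        fun tryReserve(sizeInBytes: Long): Boolean {
            while (true) {
                val current = used.get()
                val next = current + sizeInBytes
                if (next > limitBytes) return false
                if (used.compareAndSet(current, next)) return true
            }
        }

        fun reset() = used.set(0L)
    }

    fun main() {
        val budget = ByteBudget(limitBytes = 1L shl 20) // 1 MiB
        val blockSize = 1024
        val blocksNum = 128
        // Float blocks: element count times Float.SIZE_BYTES, mirroring getPrimitiveArraySizeInBytes.
        val bytes = Float.SIZE_BYTES.toLong() * blockSize * blocksNum
        check(budget.tryReserve(bytes))      // 512 KiB fits into the 1 MiB budget
        check(!budget.tryReserve(bytes * 2)) // another 1 MiB does not
    }

The compare-and-set loop keeps the counter consistent when several operators reserve blocks concurrently, which is the same guarantee the atomic update in MemoryManager.checkMemoryLimitAndAdd provides.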
--- .../operators/layer/attention/Attention.kt | 14 +++--- .../ndarray/arrays/memory/MemoryManager.kt | 49 ++++++++----------- .../storage/AutoArrayHandlingStorage.kt | 2 +- .../storage/ManualArrayHandlingStorage.kt | 2 +- .../PrimitiveAutoHandlingArrayStorage.kt | 2 +- .../PrimitiveGetBlockFunctionsExtension.kt | 6 +++ .../PrimitiveManualHandlingArrayStorage.kt | 2 +- .../ndarray/extensions/PrimitiveExtensions.kt | 17 ------- .../extensions/constants/BooleanConstants.kt | 1 + .../constants/PrimitiveConstants.kt | 5 +- .../extensions/gelu/BiasGeluPrimitive.kt | 1 - 11 files changed, 40 insertions(+), 61 deletions(-) diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt index a06b99080..1add2b1b9 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt @@ -163,10 +163,8 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map if (this != null) { //raw attention (no padding). only raw attention mask is 2-dimensional if (this.rank == 2) { - val maskPointer = (mask as MutableFloatNDArray).array.pointer(maskOffset * i) + val maskPointer = mask.array.pointer(maskOffset * i) val maskIndicesPointer = this.array.pointer(i * fullSeqLen) maskPointer.accept(maskIndicesPointer, fullSeqLen) { _, src -> if (src > 0) 0f else maskFilterValue } } else { //for left/right-side padding val maskIndicesPointer = this.array.pointer(i) - val maskPointer = (mask as MutableFloatNDArray).array.pointer(maskOffset * i + maskIndicesPointer.get()) + val maskPointer = mask.array.pointer(maskOffset * i + maskIndicesPointer.get()) maskPointer.map(fullSeqLen - maskIndicesPointer.get()) { maskFilterValue } if (this.rank == 1 && this.shape[0] == 2 * batchSize) { @@ -205,7 +203,7 @@ sealed class Attention(name: String, info: OperatorInfo, attributes: Map - if (current + added > memoryLimit) current else current + added + if (current + sizeInBytes > memoryLimit) current else current + sizeInBytes } != usedMemory.value // Check if the update was successful return successful @@ -34,6 +31,16 @@ internal class MemoryManager internal constructor(private val memoryLimit: Long, usedMemory.value = 0L } + /** + * Updates the last access time to the current system time and starts a monitoring coroutine if it isn't already running. + * + * This function sets the `lastAccessTime` to the current system time in milliseconds. + * It also initiates a monitoring coroutine to periodically check + * if the time since the last access exceeds a predefined `cacheClearingInterval`. + * If it does, the `onCacheClear` function is triggered to handle + * any necessary cache clearing. + * The coroutine will run only if it is not already running and `isFinalized` is false. + */ fun updateLastAccessTime() { lastAccessTime.value = System.currentTimeMillis() @@ -53,32 +60,16 @@ internal class MemoryManager internal constructor(private val memoryLimit: Long, } } + /** + * Stops the monitoring process by canceling the active monitoring coroutine. + * + * This function sets the `isFinalized` flag to true, indicating that the monitoring process has been + * concluded. + * If a monitoring coroutine is currently active, it will be canceled. 
+ */ fun stopMonitoring() { if (isFinalized.compareAndSet(expect = false, update = true)) { monitorJob.getAndSet(value = null)?.cancel() } } - - companion object { - private val typeSizes: LongArray = LongArray(DataType.entries.size).apply { - this[DataType.BYTE.ordinal] = Byte.SIZE_BYTES.toLong() - this[DataType.SHORT.ordinal] = Short.SIZE_BYTES.toLong() - this[DataType.INT.ordinal] = Int.SIZE_BYTES.toLong() - this[DataType.LONG.ordinal] = Long.SIZE_BYTES.toLong() - - this[DataType.UBYTE.ordinal] = UByte.SIZE_BYTES.toLong() - this[DataType.USHORT.ordinal] = UShort.SIZE_BYTES.toLong() - this[DataType.UINT.ordinal] = UInt.SIZE_BYTES.toLong() - this[DataType.ULONG.ordinal] = ULong.SIZE_BYTES.toLong() - - this[DataType.FLOAT.ordinal] = Float.SIZE_BYTES.toLong() - this[DataType.DOUBLE.ordinal] = Double.SIZE_BYTES.toLong() - - this[DataType.BOOLEAN.ordinal] = 1.toLong() - } - - private fun sizeInBytes(typeIndex: Int, size: Int): Long { - return typeSizes[typeIndex] * size - } - } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt index 803fe4416..62364570b 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt @@ -7,7 +7,7 @@ internal interface TypedAutoHandlingStorage { } internal class AutoArrayHandlingStorage(internal val limiter: MemoryManager) : ArrayStorage { - internal val storage: List = listOf( + internal val storage: Array = arrayOf( ByteAutoHandlingArrayStorage(), ShortAutoHandlingArrayStorage(), IntAutoHandlingArrayStorage(), diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt index 0631056b3..559334f8d 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt @@ -16,7 +16,7 @@ interface ManualStorage : ArrayStorage { } internal class ManualArrayHandlingStorage(private val memoryManager: MemoryManager) : ManualStorage { - private val storage: List = listOf( + private val storage: Array = arrayOf( ByteManualHandlingArrayStorage(), ShortManualHandlingArrayStorage(), IntManualHandlingArrayStorage(), diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt index aca7fd13f..c0b7d9866 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt @@ -24,7 +24,7 @@ internal class PrimitiveAutoHandlingArrayStorage : TypedAutoHandlingStorage { val unusedQueue = unused.getOrPut(blockSize) { ArrayDeque(blocksNum) } val usedQueue = used.getOrPut(blockSize) { ArrayDeque(blocksNum) } - val blocks = if (limiter.checkMemoryLimitAndAdd(type, size = blockSize * 
blocksNum)) { + val blocks = if (limiter.checkMemoryLimitAndAdd(type.getPrimitiveArraySizeInBytes(arraySize = blockSize * blocksNum))) { Array(blocksNum) { unusedQueue.removeFirstOrNull()?.apply { fill(PrimitiveConstants.ZERO) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt index 6bb61d9c8..5da084dc3 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt @@ -3,6 +3,7 @@ package io.kinference.ndarray.arrays.memory.storage import io.kinference.ndarray.arrays.memory.contexts.AutoAllocatorContext +import io.kinference.ndarray.extensions.constants.PrimitiveConstants import io.kinference.primitives.annotations.GenerateNameFromPrimitives import io.kinference.primitives.annotations.GeneratePrimitives import io.kinference.primitives.types.* @@ -16,3 +17,8 @@ internal fun AutoArrayHandlingStorage.getPrimitiveBlock(blocksNum: Int, blockSiz internal fun AutoAllocatorContext.getPrimitiveBlock(blocksNum: Int, blockSize: Int): Array { return storage.getPrimitiveBlock(blocksNum = blocksNum, blockSize = blockSize) } + +@GenerateNameFromPrimitives +internal fun DataType.getPrimitiveArraySizeInBytes(arraySize: Int): Long { + return PrimitiveConstants.SIZE_BYTES * arraySize +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt index 7a71e16c9..29060279b 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt @@ -25,7 +25,7 @@ internal class PrimitiveManualHandlingArrayStorage : TypedManualHandlingStorage override fun getNDArray(strides: Strides, fillZeros: Boolean, limiter: MemoryManager): MutableNDArrayCore { val blockSize = blockSizeByStrides(strides) val blocksNum = strides.linearSize / blockSize - val blocks = if (limiter.checkMemoryLimitAndAdd(type, size = blockSize * blocksNum)) { + val blocks = if (limiter.checkMemoryLimitAndAdd(type.getPrimitiveArraySizeInBytes(arraySize = blockSize * blocksNum))) { val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } Array(blocksNum) { queue.removeFirstOrNull()?.apply { diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt index 570ca8520..cbf651bc0 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/PrimitiveExtensions.kt @@ -128,21 +128,17 @@ internal suspend fun PrimitiveNDArray.dotTransposedWithAlpha(alpha: Double, othe other as PrimitiveNDArray; destination as MutablePrimitiveNDArray val alpha = alpha.toPrimitive() -// val dBlocksInRow = destination.blocksInRow val lrBlocksInRow = this.blocksInRow val n = this.shape[0] val t = this.shape[1] 
val m = other.shape[0] -// val dBlockSize = destination.array.blockSize val lrBlockSize = this.array.blockSize -// val destBlocks = destination.array.blocks val leftBlocks = this.array.blocks val rightBlocks = other.array.blocks val rowFlop = t * m -// val zero = (0).toPrimitive() /* TODO: (dmitriyb) this is temporary commented. On GEC performance test we have large inputs that cause out of memory exceptions @@ -162,7 +158,6 @@ internal suspend fun PrimitiveNDArray.dotTransposedWithAlpha(alpha: Double, othe // TODO: (cupertank) Remove constants // TODO: (dmitriyb) Implement concurrent array retrieve with a separate structure from ArraysDispatcher parallelizeByRows(rowFlop, n, 262144) { nStart: Int, nEnd: Int, _ -> -// val mSums = Array(m) { PrimitiveArray(lrBlockSize) } val tempSum = PrimitiveArray(lrBlockSize) val destPointer = destination.array.pointer() for (i in nStart until nEnd) { @@ -170,10 +165,8 @@ internal suspend fun PrimitiveNDArray.dotTransposedWithAlpha(alpha: Double, othe val rightBlockIter = rightBlocks.iterator() destPointer.linearIndex = i * m -// val destBlockOffset = i * dBlocksInRow for (k in 0 until m) { -// val tempArray = mSums[k] for (lrBlock in 0 until lrBlocksInRow) { val leftBlock = leftBlocks[leftBlockOffset + lrBlock] val rightBlock = rightBlockIter.next() @@ -186,16 +179,6 @@ internal suspend fun PrimitiveNDArray.dotTransposedWithAlpha(alpha: Double, othe destPointer.setAndIncrement(tempSum.sum() * alpha) tempSum.fill(PrimitiveConstants.ZERO) } - -// val mSumsIter = mSums.iterator() -// for (destBlockNum in 0 until dBlocksInRow) { -// val destBlock = destBlocks[destBlockOffset + destBlockNum] -// for (j in destBlock.indices) { -// val sumBlock = mSumsIter.next() -// destBlock[j] = sumBlock.sum() * alpha -// sumBlock.fill(zero) -// } -// } } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt index 00f4767fa..e3e369c6b 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt @@ -3,4 +3,5 @@ package io.kinference.ndarray.extensions.constants object BooleanConstants { const val ZERO = false const val ONE = true + const val SIZE_BYTES = 1.toLong() } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/PrimitiveConstants.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/PrimitiveConstants.kt index e1edbef10..09aec0c9e 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/PrimitiveConstants.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/PrimitiveConstants.kt @@ -2,10 +2,9 @@ package io.kinference.ndarray.extensions.constants import io.kinference.primitives.annotations.* -import io.kinference.primitives.types.DataType -import io.kinference.primitives.types.toPrimitive import io.kinference.ndarray.toUShort import io.kinference.ndarray.toUByte +import io.kinference.primitives.types.* @GenerateNameFromPrimitives @@ -29,5 +28,7 @@ internal object PrimitiveConstants { val INV_ERF_COEF_1 = (4.330746750799873).toPrimitive() val INV_ERF_COEF_2 = (6.802721088435375).toPrimitive() + + val SIZE_BYTES = PrimitiveType.SIZE_BYTES.toLong() } diff --git 
a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt index 4c5682899..9ba08ddb0 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/gelu/BiasGeluPrimitive.kt @@ -15,7 +15,6 @@ import kotlin.math.* @GenerateNameFromPrimitives internal suspend fun computeGeluPrimitive(input: PrimitiveNDArray, bias: PrimitiveNDArray, output: MutablePrimitiveNDArray): MutablePrimitiveNDArray { -// val output = MutablePrimitiveNDArray(input.strides) val inputBlocks = input.array.blocks val biasBlocks = bias.array.blocks From c942273ab31b07d037c02cbfab4e2d17dd84f825 Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Mon, 2 Sep 2024 16:08:20 +0200 Subject: [PATCH 13/19] JBAI-4393 [core, ndarray] Refactored coroutine contexts to be polymorphic, merge ParallelismLimiterContext and its thread limiter behavior into PredictionContext. --- .../io/kinference.core/model/KIModel.kt | 2 - .../operators/layer/attention/Attention.kt | 3 +- .../normalization/EmbedLayerNormalization.kt | 3 +- .../normalization/SkipLayerNormalization.kt | 5 ++- .../io/kinference.core/operators/math/Add.kt | 3 +- .../operators/math/BiasGelu.kt | 3 +- .../kinference.core/operators/math/MatMul.kt | 3 +- .../kinference.core/operators/tensor/Cast.kt | 3 +- .../memory/PredictionContextDispatcher.kt | 37 +++++++++---------- .../memory/contexts/AutoAllocatorContext.kt | 9 ++--- .../BaseAllocatorContextWithStorage.kt | 24 ------------ .../memory/contexts/ManualAllocatorContext.kt | 9 ++--- .../storage/AutoArrayHandlingStorage.kt | 1 + .../storage/ManualArrayHandlingStorage.kt | 1 + .../PrimitiveAutoHandlingArrayStorage.kt | 2 +- .../PrimitiveGetBlockFunctionsExtension.kt | 2 +- .../PrimitiveManualHandlingArrayStorage.kt | 2 +- .../arrays/tiled/PrimitiveTiledArray.kt | 4 +- .../kinference/utils/ResourcesDispatcher.kt | 28 ++++++++++++-- 19 files changed, 73 insertions(+), 71 deletions(-) delete mode 100644 ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt index 6611fc1ce..3f78377d5 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/model/KIModel.kt @@ -5,7 +5,6 @@ import io.kinference.core.graph.KIGraph import io.kinference.graph.Contexts import io.kinference.model.Model import io.kinference.ndarray.arrays.memory.* -import io.kinference.ndarray.arrays.memory.contexts.finalizeAllocatorContext import io.kinference.operator.OperatorSetRegistry import io.kinference.profiler.* import io.kinference.protobuf.message.ModelProto @@ -47,7 +46,6 @@ class KIModel( return@withContext graph.execute(input, contexts).map { it.clone(it.name) }.toList() } - predictionContext.finalizeAllocatorContext() predictionContextDispatcher.returnStorage(predictionContext) output } finally { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt index 1add2b1b9..737328779 100644 --- 
a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt @@ -19,6 +19,7 @@ import io.kinference.optimizer.GraphOptimizer.Companion.isOpt import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto +import io.kinference.utils.PredictionContext import io.kinference.utils.launchWithLimitOrDefault import kotlinx.coroutines.coroutineScope import kotlin.coroutines.coroutineContext @@ -287,7 +288,7 @@ class AttentionVer1(name: String, attributes: Map>, input private val maskFilterValue: Float by attribute("mask_filter_value") { it: Number -> it.toFloat() } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val context = coroutineContext[ManualAllocatorContext.Key] + val context = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext val input = inputs[0]!! val weights = inputs[1]!! diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt index 33a01c6d3..098667725 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt @@ -11,6 +11,7 @@ import io.kinference.operator.* import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto.AttributeType import io.kinference.protobuf.message.TensorProto +import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext import kotlin.math.sqrt @@ -175,7 +176,7 @@ class EmbedLayerNormalizationVer1( } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[ManualAllocatorContext.Key] + val manualContext = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext val inputIds = inputs[0]!!.data as IntNDArray val segmentIds = inputs[1]?.data as IntNDArray? diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt index 08b8e7f1a..aa246044f 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt @@ -13,6 +13,7 @@ import io.kinference.operator.* import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto +import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext import kotlin.math.sqrt @@ -107,7 +108,7 @@ class SkipLayerNormalizationVer1(name: String, attributes: Map> apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[ManualAllocatorContext.Key] + val manualContext = coroutineContext[PredictionContext.Key] as? 
ManualAllocatorContext val input = inputs[0]!!.data as FloatNDArray val output = (manualContext?.getNDArray(DataType.FLOAT, input.strides, fillZeros = false) ?: MutableFloatNDArray(input.strides)) as MutableFloatNDArray @@ -119,7 +120,7 @@ class SkipLayerNormalizationVer1(name: String, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { @@ -55,7 +56,7 @@ class AddVer7(name: String, attributes: Map>, inputs: Lis } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[ManualAllocatorContext.Key] + val manualContext = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext val left = inputs[0]!!.data as NumberNDArrayCore val right = inputs[1]!!.data as NumberNDArrayCore diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt index c6b21a778..65b5089ec 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt @@ -11,6 +11,7 @@ import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.ndarray.extensions.gelu.biasGelu import io.kinference.operator.* +import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext sealed class BiasGelu(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { @@ -43,7 +44,7 @@ class BiasGeluVer1(name: String, attributes: Map> = empty } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[ManualAllocatorContext.Key] + val manualContext = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext val input = inputs[0]!!.data as NumberNDArrayCore val bias = inputs[1]!!.data as NumberNDArrayCore diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt index 1d5608450..aabce734e 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt @@ -11,6 +11,7 @@ import io.kinference.ndarray.broadcasting.Broadcasting import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.operator.* import io.kinference.protobuf.message.TensorProto +import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext sealed class MatMul(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { @@ -50,7 +51,7 @@ class MatMulVer1(name: String, attributes: Map>, inputs: } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[ManualAllocatorContext.Key] + val manualContext = coroutineContext[PredictionContext.Key] as? 
ManualAllocatorContext val first = inputs[0]!!.data as NumberNDArrayCore val second = inputs[1]!!.data as NumberNDArrayCore diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt index d0bc9a56a..acc9dfb94 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt @@ -14,6 +14,7 @@ import io.kinference.primitives.types.DataType import io.kinference.protobuf.FLOAT_TENSOR_TYPES import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto +import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext sealed class Cast(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { @@ -801,7 +802,7 @@ class CastVer6(name: String, attributes: Map>, inputs: Li private val toType: Int by attribute("to") { it: Number -> it.toInt() } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[ManualAllocatorContext.Key] + val manualContext = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext val tensor = inputs.first()!! val to = TensorProto.DataType.fromValue(toType)!! diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt index 10a2c4bc4..801e5c66b 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/PredictionContextDispatcher.kt @@ -3,14 +3,8 @@ package io.kinference.ndarray.arrays.memory import io.kinference.ndarray.arrays.memory.contexts.* import io.kinference.ndarray.arrays.memory.storage.* import io.kinference.utils.* -import kotlinx.coroutines.Dispatchers -import kotlinx.coroutines.ExperimentalCoroutinesApi +import kotlinx.coroutines.* import java.util.concurrent.ConcurrentLinkedQueue -import kotlin.coroutines.CoroutineContext - -interface ArrayStorage { - fun resetState() -} class PredictionContextDispatcher(private val predictionConfig: PredictionConfig) : Closeable { private val limiter: MemoryManager = MemoryManager( @@ -18,11 +12,11 @@ class PredictionContextDispatcher(private val predictionConfig: PredictionConfig cacheClearingInterval = predictionConfig.memoryClearingInterval, onCacheClear = ::clearCache) - private val contextQueue: ConcurrentLinkedQueue = ConcurrentLinkedQueue() + private val contextQueue: ConcurrentLinkedQueue = ConcurrentLinkedQueue() val allocationMode get() = predictionConfig.allocationMode - fun getPredictionContext(): CoroutineContext { + fun getPredictionContext(): PredictionContext { val allocatorContext = when (predictionConfig.allocationMode) { AllocationMode.NoAllocation -> getNoAllocatorContext() AllocationMode.Manual -> getManualAllocatorContext() @@ -31,21 +25,23 @@ class PredictionContextDispatcher(private val predictionConfig: PredictionConfig return allocatorContext } - @OptIn(ExperimentalCoroutinesApi::class) - private fun getNoAllocatorContext(): CoroutineContext { - return contextQueue.poll() ?: (NoAllocatorContext() + 
ParallelismLimiterContext(Dispatchers.Default.limitedParallelism(predictionConfig.parallelismLimit))) + private fun getNoAllocatorContext(): PredictionContext { + return contextQueue.poll() ?: (NoAllocatorContext(getDispatcher())) } - @OptIn(ExperimentalCoroutinesApi::class) - private fun getAutoAllocatorContext(): CoroutineContext { + private fun getAutoAllocatorContext(): PredictionContext { limiter.updateLastAccessTime() - return contextQueue.poll() ?: (AutoAllocatorContext(AutoArrayHandlingStorage(limiter)) + ParallelismLimiterContext(Dispatchers.Default.limitedParallelism(predictionConfig.parallelismLimit))) + return contextQueue.poll() ?: (AutoAllocatorContext(getDispatcher(), AutoArrayHandlingStorage(limiter))) } - @OptIn(ExperimentalCoroutinesApi::class) - private fun getManualAllocatorContext(): CoroutineContext { + private fun getManualAllocatorContext(): PredictionContext { limiter.updateLastAccessTime() - return contextQueue.poll() ?: (ManualAllocatorContext(ManualArrayHandlingStorage(limiter)) + ParallelismLimiterContext(Dispatchers.Default.limitedParallelism(predictionConfig.parallelismLimit))) + return contextQueue.poll() ?: (ManualAllocatorContext(getDispatcher(), ManualArrayHandlingStorage(limiter))) + } + + @OptIn(ExperimentalCoroutinesApi::class) + private fun getDispatcher(): CoroutineDispatcher { + return Dispatchers.Default.limitedParallelism(predictionConfig.parallelismLimit) } fun clearCache() { @@ -58,7 +54,10 @@ class PredictionContextDispatcher(private val predictionConfig: PredictionConfig clearCache() } - fun returnStorage(context: CoroutineContext) { + fun returnStorage(context: PredictionContext) { + if (context is AllocatorContext<*>) { + context.finalizeContext() + } contextQueue.offer(context) } } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt index a4d36b555..e69367f55 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt @@ -3,12 +3,11 @@ package io.kinference.ndarray.arrays.memory.contexts import io.kinference.ndarray.arrays.memory.storage.AutoArrayHandlingStorage import io.kinference.primitives.types.DataType import io.kinference.primitives.types.PrimitiveArray +import io.kinference.utils.* +import kotlinx.coroutines.CoroutineDispatcher import kotlin.coroutines.* internal class AutoAllocatorContext internal constructor( + dispatcher: CoroutineDispatcher, storage: AutoArrayHandlingStorage, -) : BaseAllocatorContextWithStorage(storage) { - - companion object Key : CoroutineContext.Key - override val key: CoroutineContext.Key<*> get() = Key -} +) : AllocatorContext(dispatcher, storage) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt deleted file mode 100644 index e617c78de..000000000 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/BaseAllocatorContextWithStorage.kt +++ /dev/null @@ -1,24 +0,0 @@ -package io.kinference.ndarray.arrays.memory.contexts - -import io.kinference.ndarray.arrays.memory.ArrayStorage -import kotlin.coroutines.CoroutineContext 
- -interface BaseAllocatorContext: CoroutineContext.Element - -abstract class BaseAllocatorContextWithStorage(internal val storage: T) : BaseAllocatorContext { - fun finalizeContext() { - storage.resetState() - } -} - -fun CoroutineContext.finalizeAllocatorContext() { - this.fold(Unit) { _, context -> - if (context is BaseAllocatorContextWithStorage<*>) - context.finalizeContext() - } -} - -class NoAllocatorContext : BaseAllocatorContext { - companion object Key : CoroutineContext.Key - override val key: CoroutineContext.Key<*> get() = Key -} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt index a713f31fe..9a6663c7f 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt @@ -4,14 +4,13 @@ import io.kinference.ndarray.arrays.* import io.kinference.ndarray.arrays.memory.storage.ManualArrayHandlingStorage import io.kinference.ndarray.arrays.memory.storage.ManualStorage import io.kinference.primitives.types.DataType -import kotlin.coroutines.CoroutineContext +import io.kinference.utils.AllocatorContext +import kotlinx.coroutines.CoroutineDispatcher class ManualAllocatorContext internal constructor( + dispatcher: CoroutineDispatcher, storage: ManualArrayHandlingStorage, -) : BaseAllocatorContextWithStorage(storage) { - - companion object Key : CoroutineContext.Key - override val key: CoroutineContext.Key<*> get() = Key +) : AllocatorContext(dispatcher, storage) { fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore { return storage.getNDArray(dataType, strides, fillZeros) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt index 62364570b..b0ffdbbb5 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/AutoArrayHandlingStorage.kt @@ -1,6 +1,7 @@ package io.kinference.ndarray.arrays.memory.storage import io.kinference.ndarray.arrays.memory.* +import io.kinference.utils.ArrayStorage internal interface TypedAutoHandlingStorage { fun moveBlocksIntoUnused() diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt index 559334f8d..227d25136 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/ManualArrayHandlingStorage.kt @@ -3,6 +3,7 @@ package io.kinference.ndarray.arrays.memory.storage import io.kinference.ndarray.arrays.* import io.kinference.ndarray.arrays.memory.* import io.kinference.primitives.types.DataType +import io.kinference.utils.ArrayStorage internal interface TypedManualHandlingStorage { fun getNDArray(strides: Strides, fillZeros: Boolean = false, limiter: MemoryManager): 
MutableNDArrayCore diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt index c0b7d9866..4cd5bb663 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveAutoHandlingArrayStorage.kt @@ -24,7 +24,7 @@ internal class PrimitiveAutoHandlingArrayStorage : TypedAutoHandlingStorage { val unusedQueue = unused.getOrPut(blockSize) { ArrayDeque(blocksNum) } val usedQueue = used.getOrPut(blockSize) { ArrayDeque(blocksNum) } - val blocks = if (limiter.checkMemoryLimitAndAdd(type.getPrimitiveArraySizeInBytes(arraySize = blockSize * blocksNum))) { + val blocks = if (limiter.checkMemoryLimitAndAdd(getPrimitiveArraySizeInBytes(arraySize = blockSize * blocksNum))) { Array(blocksNum) { unusedQueue.removeFirstOrNull()?.apply { fill(PrimitiveConstants.ZERO) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt index 5da084dc3..9280823d8 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveGetBlockFunctionsExtension.kt @@ -19,6 +19,6 @@ internal fun AutoAllocatorContext.getPrimitiveBlock(blocksNum: Int, blockSize: I } @GenerateNameFromPrimitives -internal fun DataType.getPrimitiveArraySizeInBytes(arraySize: Int): Long { +internal fun getPrimitiveArraySizeInBytes(arraySize: Int): Long { return PrimitiveConstants.SIZE_BYTES * arraySize } diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt index 29060279b..1c264be01 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/storage/PrimitiveManualHandlingArrayStorage.kt @@ -25,7 +25,7 @@ internal class PrimitiveManualHandlingArrayStorage : TypedManualHandlingStorage override fun getNDArray(strides: Strides, fillZeros: Boolean, limiter: MemoryManager): MutableNDArrayCore { val blockSize = blockSizeByStrides(strides) val blocksNum = strides.linearSize / blockSize - val blocks = if (limiter.checkMemoryLimitAndAdd(type.getPrimitiveArraySizeInBytes(arraySize = blockSize * blocksNum))) { + val blocks = if (limiter.checkMemoryLimitAndAdd(getPrimitiveArraySizeInBytes(arraySize = blockSize * blocksNum))) { val queue = storage.getOrPut(blockSize) { ArrayDeque(blocksNum) } Array(blocksNum) { queue.removeFirstOrNull()?.apply { diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt index 600211e3b..339f2fb8d 100644 --- 
a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt @@ -11,6 +11,7 @@ import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.blockSizeByStrides import io.kinference.primitives.annotations.* import io.kinference.primitives.types.* +import io.kinference.utils.PredictionContext import io.kinference.utils.inlines.InlineInt import kotlin.coroutines.coroutineContext import kotlin.math.min @@ -59,7 +60,8 @@ internal class PrimitiveTiledArray(val blocks: Array) { require(size % blockSize == 0) { "Size must divide blockSize" } val blocksNum = if (blockSize == 0) 0 else size / blockSize - val blocks = coroutineContext[AutoAllocatorContext.Key]?.getPrimitiveBlock(blocksNum, blockSize) ?: Array(blocksNum) { PrimitiveArray(blockSize) } + val blocks = (coroutineContext[PredictionContext.Key] as? AutoAllocatorContext)?.getPrimitiveBlock(blocksNum, blockSize) + ?: Array(blocksNum) { PrimitiveArray(blockSize) } return PrimitiveTiledArray(blocks) } diff --git a/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt index 66b5cea95..b17df1f79 100644 --- a/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt +++ b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt @@ -2,6 +2,7 @@ package io.kinference.utils import kotlinx.coroutines.* import kotlinx.coroutines.channels.Channel +import kotlin.coroutines.AbstractCoroutineContextElement import kotlin.coroutines.CoroutineContext object ResourcesDispatcher { @@ -16,11 +17,30 @@ object ResourcesDispatcher { } } -class ParallelismLimiterContext(val dispatcher: CoroutineDispatcher) : CoroutineContext.Element { - companion object Key : CoroutineContext.Key - override val key: CoroutineContext.Key<*> get() = Key +interface PredictionKey : CoroutineContext.Key + +sealed class PredictionContext( + val dispatcher: CoroutineDispatcher +) : AbstractCoroutineContextElement(PredictionContext) { + companion object Key : PredictionKey +} + +interface ArrayStorage { + fun resetState() +} + +abstract class AllocatorContext( + dispatcher: CoroutineDispatcher, + val storage: T +) : PredictionContext(dispatcher) { + + fun finalizeContext() { + storage.resetState() + } } +class NoAllocatorContext(dispatcher: CoroutineDispatcher) : PredictionContext(dispatcher) + fun CoroutineScope.launchWithLimitOrDefault(block: suspend CoroutineScope.() -> Unit) { - this.launch(coroutineContext[ParallelismLimiterContext.Key]?.dispatcher ?: Dispatchers.Default, block = block) + this.launch(coroutineContext[PredictionContext]?.dispatcher ?: Dispatchers.Default, block = block) } From 3caa0bcf62b1514fe5a22e224769da54262be46f Mon Sep 17 00:00:00 2001 From: Ilia Vologin Date: Mon, 2 Sep 2024 17:20:12 +0200 Subject: [PATCH 14/19] JBAI-4393 [core] Rework context keys --- .../operators/layer/attention/Attention.kt | 2 +- .../normalization/EmbedLayerNormalization.kt | 2 +- .../normalization/SkipLayerNormalization.kt | 2 +- .../io/kinference.core/operators/math/Add.kt | 2 +- .../operators/math/BiasGelu.kt | 2 +- .../kinference.core/operators/math/MatMul.kt | 2 +- .../kinference.core/operators/tensor/Cast.kt | 2 +- .../memory/contexts/AutoAllocatorContext.kt | 7 ++++- .../memory/contexts/ManualAllocatorContext.kt | 6 +++++ 
.../arrays/tiled/PrimitiveTiledArray.kt | 2 +- .../kinference/utils/ResourcesDispatcher.kt | 26 ++++++++++++++----- 11 files changed, 40 insertions(+), 15 deletions(-) diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt index 737328779..ee7af6f18 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt @@ -288,7 +288,7 @@ class AttentionVer1(name: String, attributes: Map>, input private val maskFilterValue: Float by attribute("mask_filter_value") { it: Number -> it.toFloat() } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val context = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext + val context = coroutineContext[ManualAllocatorContext] val input = inputs[0]!! val weights = inputs[1]!! diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt index 098667725..21b64def4 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt @@ -176,7 +176,7 @@ class EmbedLayerNormalizationVer1( } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext + val manualContext = coroutineContext[ManualAllocatorContext] val inputIds = inputs[0]!!.data as IntNDArray val segmentIds = inputs[1]?.data as IntNDArray? diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt index aa246044f..842eee3b1 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt @@ -108,7 +108,7 @@ class SkipLayerNormalizationVer1(name: String, attributes: Map> apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[PredictionContext.Key] as? 
ManualAllocatorContext + val manualContext = coroutineContext[ManualAllocatorContext] val input = inputs[0]!!.data as FloatNDArray val output = (manualContext?.getNDArray(DataType.FLOAT, input.strides, fillZeros = false) ?: MutableFloatNDArray(input.strides)) as MutableFloatNDArray diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt index afb811664..131f91786 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt @@ -56,7 +56,7 @@ class AddVer7(name: String, attributes: Map>, inputs: Lis } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext + val manualContext = coroutineContext[ManualAllocatorContext] val left = inputs[0]!!.data as NumberNDArrayCore val right = inputs[1]!!.data as NumberNDArrayCore diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt index 65b5089ec..02bbc6349 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt @@ -44,7 +44,7 @@ class BiasGeluVer1(name: String, attributes: Map> = empty } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext + val manualContext = coroutineContext[ManualAllocatorContext] val input = inputs[0]!!.data as NumberNDArrayCore val bias = inputs[1]!!.data as NumberNDArrayCore diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt index aabce734e..deb54fa35 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt @@ -51,7 +51,7 @@ class MatMulVer1(name: String, attributes: Map>, inputs: } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext + val manualContext = coroutineContext[ManualAllocatorContext] val first = inputs[0]!!.data as NumberNDArrayCore val second = inputs[1]!!.data as NumberNDArrayCore diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt index acc9dfb94..e858dd91f 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt @@ -802,7 +802,7 @@ class CastVer6(name: String, attributes: Map>, inputs: Li private val toType: Int by attribute("to") { it: Number -> it.toInt() } override suspend fun > apply(contexts: Contexts, inputs: List): List { - val manualContext = coroutineContext[PredictionContext.Key] as? ManualAllocatorContext + val manualContext = coroutineContext[ManualAllocatorContext] val tensor = inputs.first()!! 
val to = TensorProto.DataType.fromValue(toType)!! diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt index e69367f55..486738d91 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt @@ -7,7 +7,12 @@ import io.kinference.utils.* import kotlinx.coroutines.CoroutineDispatcher import kotlin.coroutines.* +@OptIn(ExperimentalStdlibApi::class) internal class AutoAllocatorContext internal constructor( dispatcher: CoroutineDispatcher, storage: AutoArrayHandlingStorage, -) : AllocatorContext(dispatcher, storage) +) : AllocatorContext(dispatcher, storage) { + companion object Key : AbstractCoroutineContextKey, AutoAllocatorContext>( + AllocatorContext.Key, { it as? AutoAllocatorContext } + ) +} diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt index 9a6663c7f..5a93917de 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/ManualAllocatorContext.kt @@ -6,11 +6,17 @@ import io.kinference.ndarray.arrays.memory.storage.ManualStorage import io.kinference.primitives.types.DataType import io.kinference.utils.AllocatorContext import kotlinx.coroutines.CoroutineDispatcher +import kotlin.coroutines.AbstractCoroutineContextKey +@OptIn(ExperimentalStdlibApi::class) class ManualAllocatorContext internal constructor( dispatcher: CoroutineDispatcher, storage: ManualArrayHandlingStorage, ) : AllocatorContext(dispatcher, storage) { + companion object Key : AbstractCoroutineContextKey, ManualAllocatorContext>( + AllocatorContext.Key, { it as? ManualAllocatorContext } + ) + fun getNDArray(dataType: DataType, strides: Strides, fillZeros: Boolean = false): MutableNDArrayCore { return storage.getNDArray(dataType, strides, fillZeros) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt index 339f2fb8d..2f791feb3 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt @@ -60,7 +60,7 @@ internal class PrimitiveTiledArray(val blocks: Array) { require(size % blockSize == 0) { "Size must divide blockSize" } val blocksNum = if (blockSize == 0) 0 else size / blockSize - val blocks = (coroutineContext[PredictionContext.Key] as? 
AutoAllocatorContext)?.getPrimitiveBlock(blocksNum, blockSize) + val blocks = coroutineContext[AutoAllocatorContext]?.getPrimitiveBlock(blocksNum, blockSize) ?: Array(blocksNum) { PrimitiveArray(blockSize) } return PrimitiveTiledArray(blocks) diff --git a/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt index b17df1f79..45727274a 100644 --- a/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt +++ b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt @@ -1,9 +1,9 @@ +@file:OptIn(ExperimentalStdlibApi::class) package io.kinference.utils import kotlinx.coroutines.* import kotlinx.coroutines.channels.Channel -import kotlin.coroutines.AbstractCoroutineContextElement -import kotlin.coroutines.CoroutineContext +import kotlin.coroutines.* object ResourcesDispatcher { private val tokenChannel = Channel(capacity = PlatformUtils.cores) @@ -17,12 +17,17 @@ object ResourcesDispatcher { } } -interface PredictionKey : CoroutineContext.Key - sealed class PredictionContext( val dispatcher: CoroutineDispatcher ) : AbstractCoroutineContextElement(PredictionContext) { - companion object Key : PredictionKey + companion object Key : CoroutineContext.Key + + override val key + get() = Key + + override fun get(key: CoroutineContext.Key): E? = getPolymorphicElement(key) + + override fun minusKey(key: CoroutineContext.Key<*>): CoroutineContext = minusPolymorphicKey(key) } interface ArrayStorage { @@ -33,13 +38,22 @@ abstract class AllocatorContext( dispatcher: CoroutineDispatcher, val storage: T ) : PredictionContext(dispatcher) { + companion object Key : AbstractCoroutineContextKey>( + PredictionContext.Key, + { it as? AllocatorContext<*> } + ) fun finalizeContext() { storage.resetState() } } -class NoAllocatorContext(dispatcher: CoroutineDispatcher) : PredictionContext(dispatcher) +class NoAllocatorContext(dispatcher: CoroutineDispatcher) : PredictionContext(dispatcher) { + companion object Key : AbstractCoroutineContextKey( + PredictionContext.Key, + { it as? 
NoAllocatorContext } + ) +} fun CoroutineScope.launchWithLimitOrDefault(block: suspend CoroutineScope.() -> Unit) { this.launch(coroutineContext[PredictionContext]?.dispatcher ?: Dispatchers.Default, block = block) From 9d67670bb7a807c81907640ac35fbdb929c48aa5 Mon Sep 17 00:00:00 2001 From: Ilia Vologin Date: Mon, 2 Sep 2024 17:21:41 +0200 Subject: [PATCH 15/19] JBAI-4393 [core] Optimize imports --- .../io/kinference.core/operators/layer/attention/Attention.kt | 1 - .../operators/layer/normalization/EmbedLayerNormalization.kt | 1 - .../operators/layer/normalization/SkipLayerNormalization.kt | 1 - .../src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt | 1 - .../kotlin/io/kinference.core/operators/math/BiasGelu.kt | 1 - .../jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt | 1 - .../jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt | 1 - .../ndarray/arrays/memory/contexts/AutoAllocatorContext.kt | 2 -- .../io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt | 1 - .../kotlin/io/kinference/utils/ResourcesDispatcher.kt | 2 ++ 10 files changed, 2 insertions(+), 10 deletions(-) diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt index ee7af6f18..0a60d0278 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/attention/Attention.kt @@ -19,7 +19,6 @@ import io.kinference.optimizer.GraphOptimizer.Companion.isOpt import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto -import io.kinference.utils.PredictionContext import io.kinference.utils.launchWithLimitOrDefault import kotlinx.coroutines.coroutineScope import kotlin.coroutines.coroutineContext diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt index 21b64def4..5fad8cd77 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/EmbedLayerNormalization.kt @@ -11,7 +11,6 @@ import io.kinference.operator.* import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto.AttributeType import io.kinference.protobuf.message.TensorProto -import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext import kotlin.math.sqrt diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt index 842eee3b1..598a14c26 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/layer/normalization/SkipLayerNormalization.kt @@ -13,7 +13,6 @@ import io.kinference.operator.* import io.kinference.primitives.types.DataType import io.kinference.protobuf.message.AttributeProto 
import io.kinference.protobuf.message.TensorProto -import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext import kotlin.math.sqrt diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt index 131f91786..55d325668 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/Add.kt @@ -10,7 +10,6 @@ import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.operator.* import io.kinference.protobuf.message.TensorProto -import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext sealed class Add(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt index 02bbc6349..da93d0e8a 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/BiasGelu.kt @@ -11,7 +11,6 @@ import io.kinference.ndarray.arrays.memory.contexts.ManualAllocatorContext import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.ndarray.extensions.gelu.biasGelu import io.kinference.operator.* -import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext sealed class BiasGelu(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt index deb54fa35..4165c554c 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/math/MatMul.kt @@ -11,7 +11,6 @@ import io.kinference.ndarray.broadcasting.Broadcasting import io.kinference.ndarray.extensions.allocateNDArray import io.kinference.operator.* import io.kinference.protobuf.message.TensorProto -import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext sealed class MatMul(name: String, info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { diff --git a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt index e858dd91f..1bfb35fee 100644 --- a/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt +++ b/inference/inference-core/src/jvmMain/kotlin/io/kinference.core/operators/tensor/Cast.kt @@ -14,7 +14,6 @@ import io.kinference.primitives.types.DataType import io.kinference.protobuf.FLOAT_TENSOR_TYPES import io.kinference.protobuf.message.AttributeProto import io.kinference.protobuf.message.TensorProto -import io.kinference.utils.PredictionContext import kotlin.coroutines.coroutineContext sealed class Cast(name: String, 
info: OperatorInfo, attributes: Map>, inputs: List, outputs: List) : Operator(name, info, attributes, inputs, outputs) { diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt index 486738d91..9af632e6b 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/memory/contexts/AutoAllocatorContext.kt @@ -1,8 +1,6 @@ package io.kinference.ndarray.arrays.memory.contexts import io.kinference.ndarray.arrays.memory.storage.AutoArrayHandlingStorage -import io.kinference.primitives.types.DataType -import io.kinference.primitives.types.PrimitiveArray import io.kinference.utils.* import kotlinx.coroutines.CoroutineDispatcher import kotlin.coroutines.* diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt index 2f791feb3..eee93692c 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/arrays/tiled/PrimitiveTiledArray.kt @@ -11,7 +11,6 @@ import io.kinference.ndarray.arrays.pointers.accept import io.kinference.ndarray.blockSizeByStrides import io.kinference.primitives.annotations.* import io.kinference.primitives.types.* -import io.kinference.utils.PredictionContext import io.kinference.utils.inlines.InlineInt import kotlin.coroutines.coroutineContext import kotlin.math.min diff --git a/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt index 45727274a..b2d5b40a9 100644 --- a/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt +++ b/utils/utils-common/src/commonMain/kotlin/io/kinference/utils/ResourcesDispatcher.kt @@ -25,8 +25,10 @@ sealed class PredictionContext( override val key get() = Key + @OptIn(ExperimentalStdlibApi::class) override fun get(key: CoroutineContext.Key): E? = getPolymorphicElement(key) + @OptIn(ExperimentalStdlibApi::class) override fun minusKey(key: CoroutineContext.Key<*>): CoroutineContext = minusPolymorphicKey(key) } From 61011f8b1665052d74bc698e1d57d302d0d646de Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Mon, 2 Sep 2024 17:59:37 +0200 Subject: [PATCH 16/19] JBAI-4393 [ndarray] Functional interface to streamline parallelization block handling (avoid Integer boxing operations). 
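Why the fun interface helps: a Kotlin function type such as (Int, Int, Int) -> Unit is generic (Function3), so every non-inlined call boxes its Int arguments into java.lang.Integer; a fun interface like ParallelizeBody keeps the parameters primitive while call sites still use SAM-converted lambda syntax. Below is a minimal, self-contained sketch of the same pattern — BlockBody and forEachBlock are hypothetical names for illustration only, not part of this patch.

```kotlin
// Sketch (hypothetical names): a fun interface whose SAM method takes primitive ints,
// so invoking it from a hot loop does not allocate boxed Integers the way a
// Function3<Integer, Integer, Integer, Unit> lambda would.
fun interface BlockBody {
    operator fun invoke(start: Int, end: Int, workerIndex: Int)
}

fun forEachBlock(totalSize: Int, blockSize: Int, body: BlockBody) {
    var start = 0
    var worker = 0
    while (start < totalSize) {
        val end = minOf(start + blockSize, totalSize)
        body(start, end, worker++)   // dispatched as invoke(III)V — no Integer allocation
        start = end
    }
}

fun main() {
    // SAM conversion keeps the call site as concise as with a plain lambda type.
    forEachBlock(totalSize = 10, blockSize = 4) { start, end, worker ->
        println("worker=$worker handles [$start, $end)")
    }
}
```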
--- .../src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt | 9 +++++++-- .../ndarray/extensions/constants/BooleanConstants.kt | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt index 3869f6162..546857006 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/Utils.kt @@ -80,13 +80,18 @@ internal fun IntArray.swap(leftIdx: Int, rightIdx: Int) { this[leftIdx] = this[rightIdx] this[rightIdx] = temp } + +fun interface ParallelizeBody { + operator fun invoke(start: Int, end: Int, coroutineIndex: Int) +} + /* * Parallelize with batching by minDataPerLaunch */ suspend fun parallelizeByBlocks(blockSize: Int, countBlocks: Int, minDataPerLaunch: Int, - body: (blockStart: Int, blockEnd: Int, coroutineIndex: Int) -> Unit) { + body: ParallelizeBody) { val batchSize = batchSizeByData(blockSize, countBlocks, minDataPerLaunch) @@ -103,7 +108,7 @@ suspend fun parallelizeByBlocks(blockSize: Int, } } -suspend inline fun parallelizeByRows(rowSize: Int, countRows: Int, minDataPerLaunch: Int, noinline body: (rowStart: Int, rowEnd: Int, index: Int) -> Unit) = parallelizeByBlocks(rowSize, countRows, minDataPerLaunch, body) +suspend inline fun parallelizeByRows(rowSize: Int, countRows: Int, minDataPerLaunch: Int, body: ParallelizeBody) = parallelizeByBlocks(rowSize, countRows, minDataPerLaunch, body) internal fun countCoroutinesByData(rowSize: Int, countRows: Int, minDataPerLaunch: Int): Int { val batchSize = batchSizeByData(rowSize, countRows, minDataPerLaunch) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt index e3e369c6b..0bac99911 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/constants/BooleanConstants.kt @@ -3,5 +3,5 @@ package io.kinference.ndarray.extensions.constants object BooleanConstants { const val ZERO = false const val ONE = true - const val SIZE_BYTES = 1.toLong() + const val SIZE_BYTES = 1L } From 460f929f2f78d056c00332021efc23f236955c31 Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Mon, 23 Sep 2024 12:23:31 +0200 Subject: [PATCH 17/19] JBAI-6945 [ndarray] Fixed broadcasting logic for batch processing. 
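The underlying issue: in the scalar-broadcast fast paths the per-batch loop kept reading from and writing to the offsets of the first batch, so every batch iteration overwrote the same blocks. The fix recomputes batch-local offsets from the per-axis strides on each iteration. A simplified, hypothetical sketch of the corrected indexing follows (names and the `+` operation are illustrative; this is not the actual kernel):

```kotlin
// Simplified sketch: broadcasting a per-batch scalar (left side) over blocked rows (right side).
// The essential point of the fix is the batch-local offsets recomputed from the strides
// on every iteration, instead of reusing the offsets of batch 0.
fun broadcastScalarPerBatch(
    leftBlocks: Array<FloatArray>,    // scalar side: block [batchOffset][0] holds the scalar
    rightBlocks: Array<FloatArray>,   // dense side: blocksInRow blocks per batch
    destBlocks: Array<FloatArray>,
    batchSize: Int,
    blocksInRow: Int,
    leftStride: Int,                  // blocks to advance per batch on each side
    rightStride: Int,
    destStride: Int
) {
    for (batchIdx in 0 until batchSize) {
        val leftBatchOffset = leftStride * batchIdx
        val rightBatchOffset = rightStride * batchIdx
        val destBatchOffset = destStride * batchIdx

        val scalar = leftBlocks[leftBatchOffset][0]
        for (blockIdx in 0 until blocksInRow) {
            val dest = destBlocks[destBatchOffset + blockIdx]
            val right = rightBlocks[rightBatchOffset + blockIdx]
            for (i in dest.indices) dest[i] = scalar + right[i]
        }
    }
}
```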
--- .../BroadcastTwoArgumentsPrimitive.kt | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt index 90056a8bf..61fc1c076 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt @@ -50,11 +50,15 @@ internal fun broadcastTwoTensorsPrimitive( val batchSize = destBroadcastingShape[shapeIdx] for (batchIdx in 0 until batchSize) { - val leftScalar = leftBlocks[leftOffset.value][0] + val leftBatchOffset = leftOffset.value + leftOffsets[shapeIdx] * batchIdx + val rightBatchOffset = rightOffset.value + rightOffsets[shapeIdx] * batchIdx + val destBatchOffset = destOffset.value + destOffsets[shapeIdx] * batchIdx + + val leftScalar = leftBlocks[leftBatchOffset][0] for (blockIdx in 0 until destBlocksInRow) { - val destBlock = destBlocks[destOffset.value + blockIdx] - val rightBlock = rightBlocks[rightOffset.value + blockIdx] + val destBlock = destBlocks[destBatchOffset + blockIdx] + val rightBlock = rightBlocks[rightBatchOffset + blockIdx] for (idx in destBlock.indices) { destBlock[idx] = op(leftScalar, rightBlock[idx]) @@ -68,11 +72,15 @@ internal fun broadcastTwoTensorsPrimitive( val batchSize = destBroadcastingShape[shapeIdx] for (batchIdx in 0 until batchSize) { - val rightScalar = rightBlocks[rightOffset.value][0] + val leftBatchOffset = leftOffset.value + leftOffsets[shapeIdx] * batchIdx + val rightBatchOffset = rightOffset.value + rightOffsets[shapeIdx] * batchIdx + val destBatchOffset = destOffset.value + destOffsets[shapeIdx] * batchIdx + + val rightScalar = rightBlocks[rightBatchOffset][0] for (blockIdx in 0 until destBlocksInRow) { - val destBlock = destBlocks[destOffset.value + blockIdx] - val leftBlock = leftBlocks[leftOffset.value + blockIdx] + val destBlock = destBlocks[destBatchOffset + blockIdx] + val leftBlock = leftBlocks[leftBatchOffset + blockIdx] for (idx in destBlock.indices) { destBlock[idx] = op(leftBlock[idx], rightScalar) From 4bdb061d906bdfe6e5a63a81ffe9b4a7c407123b Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Tue, 24 Sep 2024 14:34:17 +0200 Subject: [PATCH 18/19] JBAI-6945 [ndarray] Introduced functional interface ScalarBroadcastFun instead of lambda, so InlineInt inside changed to regular Int without additional boxing operations. 
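Same motivation as the ParallelizeBody change above: the broadcasting body used to be a plain lambda taking InlineInt parameters, and both the generic function type and the value-class arguments force boxing on every call. Declaring the body as a fun interface with plain Int parameters keeps the hot-loop dispatch primitive. A tiny illustration of the boxing difference is sketched below; InlineIndex and IndexBody are hypothetical names, and InlineInt is presumed here to be a value class.

```kotlin
// A value class (as the library's InlineInt presumably is) gets boxed whenever it is
// used as a generic type argument — including inside function types such as (InlineIndex) -> Unit.
@JvmInline
value class InlineIndex(val value: Int)

// Boxed path: compiles to Function1<InlineIndex, Unit>; each call wraps the argument.
val boxedBody: (InlineIndex) -> Unit = { idx -> check(idx.value >= 0) }

// Primitive path: the fun interface's SAM method takes a plain int — no wrapper objects.
fun interface IndexBody {
    operator fun invoke(index: Int)
}

val primitiveBody = IndexBody { index -> check(index >= 0) }

fun main() {
    boxedBody(InlineIndex(1))  // boxes the value class and goes through Function1.invoke(Object)
    primitiveBody(2)           // direct invoke(I)V call
}
```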
--- .../BroadcastTwoArgumentsPrimitive.kt | 33 +++++++++---------- .../extensions/broadcasting/ReshapeView.kt | 6 ++-- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt index 61fc1c076..8fd770dd1 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/BroadcastTwoArgumentsPrimitive.kt @@ -8,7 +8,6 @@ import io.kinference.primitives.annotations.GenerateNameFromPrimitives import io.kinference.primitives.annotations.GeneratePrimitives import io.kinference.primitives.types.DataType import io.kinference.primitives.types.PrimitiveType -import io.kinference.utils.inlines.InlineInt @GenerateNameFromPrimitives internal fun broadcastTwoTensorsPrimitive( @@ -45,14 +44,14 @@ internal fun broadcastTwoTensorsPrimitive( val rightBlocks = right.array.blocks val destBlocks = dest.array.blocks - val leftIsScalarFun = { leftOffset: InlineInt, rightOffset: InlineInt, destOffset: InlineInt, axisToBroadcastIdx: InlineInt -> - val shapeIdx = axisToBroadcastIdx.value * 2 + val leftIsScalarFun = ScalarBroadcastFun { leftOffset, rightOffset, destOffset, axisToBroadcastIdx -> + val shapeIdx = axisToBroadcastIdx * 2 val batchSize = destBroadcastingShape[shapeIdx] for (batchIdx in 0 until batchSize) { - val leftBatchOffset = leftOffset.value + leftOffsets[shapeIdx] * batchIdx - val rightBatchOffset = rightOffset.value + rightOffsets[shapeIdx] * batchIdx - val destBatchOffset = destOffset.value + destOffsets[shapeIdx] * batchIdx + val leftBatchOffset = leftOffset + leftOffsets[shapeIdx] * batchIdx + val rightBatchOffset = rightOffset + rightOffsets[shapeIdx] * batchIdx + val destBatchOffset = destOffset + destOffsets[shapeIdx] * batchIdx val leftScalar = leftBlocks[leftBatchOffset][0] @@ -67,14 +66,14 @@ internal fun broadcastTwoTensorsPrimitive( } } - val rightIsScalarFun = { leftOffset: InlineInt, rightOffset: InlineInt, destOffset: InlineInt, axisToBroadcastIdx: InlineInt -> - val shapeIdx = axisToBroadcastIdx.value * 2 + val rightIsScalarFun = ScalarBroadcastFun { leftOffset, rightOffset, destOffset, axisToBroadcastIdx -> + val shapeIdx = axisToBroadcastIdx * 2 val batchSize = destBroadcastingShape[shapeIdx] for (batchIdx in 0 until batchSize) { - val leftBatchOffset = leftOffset.value + leftOffsets[shapeIdx] * batchIdx - val rightBatchOffset = rightOffset.value + rightOffsets[shapeIdx] * batchIdx - val destBatchOffset = destOffset.value + destOffsets[shapeIdx] * batchIdx + val leftBatchOffset = leftOffset + leftOffsets[shapeIdx] * batchIdx + val rightBatchOffset = rightOffset + rightOffsets[shapeIdx] * batchIdx + val destBatchOffset = destOffset + destOffsets[shapeIdx] * batchIdx val rightScalar = rightBlocks[rightBatchOffset][0] @@ -89,11 +88,11 @@ internal fun broadcastTwoTensorsPrimitive( } } - val defaultFun = { leftOffset: InlineInt, rightOffset: InlineInt, destOffset: InlineInt, axisToBroadcastIdx: InlineInt -> + val defaultFun = ScalarBroadcastFun { leftOffset, rightOffset, destOffset, _ -> for (blockIdx in 0 until destBlocksInRow) { - val leftBlock = leftBlocks[leftOffset.value + blockIdx] - val rightBlock = rightBlocks[rightOffset.value + blockIdx] - val destBlock = 
destBlocks[destOffset.value + blockIdx] + val leftBlock = leftBlocks[leftOffset + blockIdx] + val rightBlock = rightBlocks[rightOffset + blockIdx] + val destBlock = destBlocks[destOffset + blockIdx] for (idx in destBlock.indices) { destBlock[idx] = op(leftBlock[idx], rightBlock[idx]) @@ -101,7 +100,7 @@ internal fun broadcastTwoTensorsPrimitive( } } - val broadcastingFun = when { + val broadcastingFun: ScalarBroadcastFun = when { leftIsScalar -> leftIsScalarFun rightIsScalar -> rightIsScalarFun else -> defaultFun @@ -109,7 +108,7 @@ internal fun broadcastTwoTensorsPrimitive( fun broadcast(leftOffset: Int, rightOffset: Int, destOffset: Int, axisToBroadcastIdx: Int) { if (axisToBroadcastIdx == totalAxesToBroadcast) { - broadcastingFun(InlineInt(leftOffset), InlineInt(rightOffset), InlineInt(destOffset), InlineInt(axisToBroadcastIdx)) + broadcastingFun(leftOffset, rightOffset, destOffset, axisToBroadcastIdx) } else { val shapeIdx = axisToBroadcastIdx * 2 diff --git a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/ReshapeView.kt b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/ReshapeView.kt index 43c6f672a..b980f530e 100644 --- a/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/ReshapeView.kt +++ b/ndarray/ndarray-core/src/jvmMain/kotlin/io/kinference/ndarray/extensions/broadcasting/ReshapeView.kt @@ -3,6 +3,10 @@ package io.kinference.ndarray.extensions.broadcasting import io.kinference.ndarray.arrays.NDArrayCore import io.kinference.ndarray.extensions.utils.calculateBlock +internal fun interface ScalarBroadcastFun { + operator fun invoke(leftOffset: Int, rightOffset: Int, destOffset: Int, axisToBroadcastIdx: Int) +} + internal data class BroadcastingInfo( val broadcastingShapes: Array, val broadcastingDestShape: IntArray, @@ -89,8 +93,6 @@ internal data class BroadcastingInfo( } } - - internal fun makeOffsets(shape: IntArray, blocksInRow: Int): IntArray { val offsets = IntArray(shape.size) offsets[offsets.lastIndex - 1] = blocksInRow From 52c7687d9fd5b3119b5d6a4a71b67c65b9a0d04a Mon Sep 17 00:00:00 2001 From: dmitriyb Date: Thu, 26 Sep 2024 11:39:50 +0200 Subject: [PATCH 19/19] [RELEASE] Update version to 0.2.23 --- README.md | 24 ++++++++++++------------ build.gradle.kts | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index b209fb9db..0d33e61aa 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ it is highly recommended to use KInference TensorFlow.js backend instead for mor KInference Core dependency coordinates: ```kotlin dependencies { - api("io.kinference", "inference-core", "0.2.22") + api("io.kinference", "inference-core", "0.2.23") } ``` @@ -67,7 +67,7 @@ This backend is recommended for JavaScript projects. 
TensorFlow.js backend dependency coordinates: ```kotlin dependencies { - api("io.kinference", "inference-tfjs", "0.2.22") + api("io.kinference", "inference-tfjs", "0.2.23") } ``` @@ -81,14 +81,14 @@ To check on the system requirements, visit the following [link](https://onnxrunt ONNXRuntime CPU backend dependency coordinates: ```kotlin dependencies { - api("io.kinference", "inference-ort", "0.2.22") + api("io.kinference", "inference-ort", "0.2.23") } ``` ONNXRuntime GPU backend dependency coordinates: ```kotlin dependencies { - api("io.kinference", "inference-ort-gpu", "0.2.22") + api("io.kinference", "inference-ort-gpu", "0.2.23") } ``` @@ -104,7 +104,7 @@ Array adapter for the [kmath](https://github.com/SciProgCentre/kmath) library th Dependency coordinates: ```kotlin dependencies { - api("io.kinference", "adapter-kmath-{backend_name}", "0.2.22") + api("io.kinference", "adapter-kmath-{backend_name}", "0.2.23") } ``` @@ -114,12 +114,12 @@ Array adapter for the [multik](https://github.com/Kotlin/multik) library that wo Dependency coordinates: ```kotlin dependencies { - api("io.kinference", "adapter-multik-{backend_name}", "0.2.22") + api("io.kinference", "adapter-multik-{backend_name}", "0.2.23") } ``` ## Getting started -Let us now walk through how to get started with KInference. The latest version of KInference is *0.2.22* +Let us now walk through how to get started with KInference. The latest version of KInference is *0.2.23* ### Setup dependencies repository @@ -142,7 +142,7 @@ To enable the backend, you can add the chosen KInference runtime as a dependency ```kotlin dependencies { - api("io.kinference", "inference-core", "0.2.22") + api("io.kinference", "inference-core", "0.2.23") } ``` @@ -160,20 +160,20 @@ kotlin { sourceSets { val commonMain by getting { dependencies { - api("io.kinference:inference-api:0.2.22") - api("io.kinference:ndarray-api:0.2.22") + api("io.kinference:inference-api:0.2.23") + api("io.kinference:ndarray-api:0.2.23") } } val jvmMain by getting { dependencies { - api("io.kinference:inference-core:0.2.22") + api("io.kinference:inference-core:0.2.23") } } val jsMain by getting { dependencies { - api("io.kinference:inference-tfjs:0.2.22") + api("io.kinference:inference-tfjs:0.2.23") } } } diff --git a/build.gradle.kts b/build.gradle.kts index 7c543737d..7e08e87a7 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -8,7 +8,7 @@ import org.jetbrains.kotlin.gradle.targets.js.yarn.YarnRootExtension import org.jetbrains.kotlin.gradle.tasks.KotlinCompilationTask group = "io.kinference" -version = "0.2.22" +version = "0.2.23" plugins { alias(libs.plugins.kotlin.multiplatform) apply false