From a38c5559403fed97ca8e2991c3f8bda286ac1a06 Mon Sep 17 00:00:00 2001
From: dmitriyb
Date: Fri, 27 Sep 2024 13:50:47 +0200
Subject: [PATCH] JBAI-5829 [examples] Added GPT-2 text-generation examples using KIEngine and ORTEngine.

---
 .../kotlin/io/kinference/examples/Utils.kt    |  23 +++
 .../io/kinference/examples/lm/KIMain.kt       |  62 ++++++++
 .../kotlin/io/kinference/examples/lm/Main.kt  | 169 ------------------
 .../io/kinference/examples/lm/ORTMain.kt      |  80 ++++++++
 4 files changed, 165 insertions(+), 169 deletions(-)
 create mode 100644 examples/src/jvmMain/kotlin/io/kinference/examples/lm/KIMain.kt
 delete mode 100644 examples/src/jvmMain/kotlin/io/kinference/examples/lm/Main.kt
 create mode 100644 examples/src/jvmMain/kotlin/io/kinference/examples/lm/ORTMain.kt

diff --git a/examples/src/jvmMain/kotlin/io/kinference/examples/Utils.kt b/examples/src/jvmMain/kotlin/io/kinference/examples/Utils.kt
index 4aede5dc..3b3acdc3 100644
--- a/examples/src/jvmMain/kotlin/io/kinference/examples/Utils.kt
+++ b/examples/src/jvmMain/kotlin/io/kinference/examples/Utils.kt
@@ -1,5 +1,9 @@
 package io.kinference.examples
 
+import io.kinference.core.KIONNXData
+import io.kinference.core.data.tensor.KITensor
+import io.kinference.ndarray.arrays.LongNDArray
+import io.kinference.ndarray.arrays.NumberNDArrayCore
 import io.ktor.client.HttpClient
 import io.ktor.client.plugins.HttpTimeout
 import io.ktor.client.request.prepareRequest
@@ -39,3 +43,22 @@ suspend fun downloadFile(url: String, outputPath: String) {
 
     client.close()
 }
+
+suspend fun extractTopToken(output: Map<String, KIONNXData<*>>, tokensSize: Int, outputName: String): Long {
+    val logits = output[outputName]!! as KITensor
+    val sliced = logits.data.slice(
+        starts = intArrayOf(0, 0, tokensSize - 1, 0), // First batch, first element in the second dimension, last token, first vocab entry
+        ends = intArrayOf(1, 1, tokensSize, 50257),   // Same batch, same second dimension, one token step, whole vocab (50257)
+        steps = intArrayOf(1, 1, 1, 1)                // Step of 1 for each dimension
+    ) as NumberNDArrayCore
+    val softmax = sliced.softmax(axis = -1)
+    val topK = softmax.topK(
+        axis = -1,      // Apply top-k along the last dimension (vocabulary size)
+        k = 1,          // Retrieve the top 1 element
+        largest = true, // We want the largest probabilities (most probable tokens)
+        sorted = false  // Sorting is unnecessary since we are only retrieving the top 1
+    )
+    val tokenId = (topK.second as LongNDArray)[intArrayOf(0, 0, 0, 0)]
+
+    return tokenId
+}
diff --git a/examples/src/jvmMain/kotlin/io/kinference/examples/lm/KIMain.kt b/examples/src/jvmMain/kotlin/io/kinference/examples/lm/KIMain.kt
new file mode 100644
index 00000000..6a1bc484
--- /dev/null
+++ b/examples/src/jvmMain/kotlin/io/kinference/examples/lm/KIMain.kt
@@ -0,0 +1,62 @@
+package io.kinference.examples.lm
+
+import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer
+import io.kinference.core.KIEngine
+import io.kinference.core.data.tensor.asTensor
+import io.kinference.examples.downloadFile
+import io.kinference.examples.extractTopToken
+import io.kinference.examples.resourcesPath
+import io.kinference.ndarray.arrays.LongNDArray
+import io.kinference.ndarray.arrays.NDArrayCore
+import io.kinference.utils.CommonDataLoader
+import io.kinference.utils.PredictionConfigs
+import io.kinference.utils.inlines.InlineInt
+import okio.Path.Companion.toPath
+
+// Constants for input and output tensor names used in the GPT-2 model
+private const val INPUT_TENSOR_NAME = "input1"
+private const val OUTPUT_TENSOR_NAME = "output1" // We use only the logits tensor
+
+suspend fun main() {
+    val modelUrl = "https://github.com/onnx/models/raw/main/validated/text/machine_comprehension/gpt-2/model/gpt2-lm-head-10.onnx"
+    val modelName = "gpt2-lm-head-10"
+
+    println("Downloading model from: $modelUrl")
+    downloadFile(modelUrl, "$resourcesPath/$modelName.onnx")
+
+    val modelBytes = CommonDataLoader.bytes("${resourcesPath}/$modelName.onnx".toPath())
+
+    println("Loading model...")
+    val model = KIEngine.loadModel(modelBytes, optimize = true, predictionConfig = PredictionConfigs.DefaultAutoAllocator)
+
+    val tokenizer = HuggingFaceTokenizer.newInstance("gpt2", mapOf("modelMaxLength" to "1024"))
+    val testString = "Neurogenesis is most active during embryonic development and is responsible for producing " +
+        "all the various types of neurons of the organism, but it continues throughout adult life " +
+        "in a variety of organisms. Once born, neurons do not divide (see mitosis), and many will " +
+        "live the lifespan of the animal, except under extraordinary and usually pathogenic circumstances."
+    val encoded = tokenizer.encode(testString)
+    val tokens = encoded.ids
+    val tokensSize = tokens.size
+
+    val predictionLength = 34
+    val outputTokens = LongArray(predictionLength)
+
+    val input = LongNDArray(1, tokensSize) { idx: InlineInt -> tokens[idx.value] }.unsqueeze(0)
+    var currentContext = input.clone()
+
+    print("Generating from the following prompt:\n$testString")
+
+    // Greedy decoding: each iteration feeds the growing context back through the model and
+    // appends the single most probable next token; tokensSize + idx is the current context length.
+    for (idx in 0 until predictionLength) {
+        val inputTensor = listOf((currentContext as NDArrayCore).asTensor(INPUT_TENSOR_NAME))
+        val output = model.predict(inputTensor)
+
+        outputTokens[idx] = extractTopToken(output, tokensSize + idx, OUTPUT_TENSOR_NAME)
+
+        val newTokenArray = LongNDArray(1, 1) { _: InlineInt -> outputTokens[idx] }
+        currentContext = currentContext.concat(listOf(newTokenArray.unsqueeze(0)), axis = -1)
+        print(tokenizer.decode(longArrayOf(outputTokens[idx])))
+    }
+    println("\n\nDone")
+}
diff --git a/examples/src/jvmMain/kotlin/io/kinference/examples/lm/Main.kt b/examples/src/jvmMain/kotlin/io/kinference/examples/lm/Main.kt
deleted file mode 100644
index 01de3393..00000000
--- a/examples/src/jvmMain/kotlin/io/kinference/examples/lm/Main.kt
+++ /dev/null
@@ -1,169 +0,0 @@
-package io.kinference.examples.lm
-
-import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer
-import ai.onnxruntime.OnnxTensor
-import io.kinference.core.KIEngine
-import io.kinference.core.KIONNXData
-import io.kinference.core.data.tensor.KITensor
-import io.kinference.core.data.tensor.asTensor
-import io.kinference.data.ONNXDataType
-import io.kinference.examples.downloadFile
-import io.kinference.examples.resourcesPath
-import io.kinference.ndarray.arrays.FloatNDArray
-import io.kinference.ndarray.arrays.LongNDArray
-import io.kinference.ndarray.arrays.NDArrayCore
-import io.kinference.ndarray.arrays.NumberNDArrayCore
-import io.kinference.ort.ORTEngine
-import io.kinference.utils.CommonDataLoader
-import io.kinference.utils.PredictionConfigs
-import io.kinference.utils.inlines.InlineInt
-import io.kinference.utils.toIntArray
-import okio.Path.Companion.toPath
-
-// Softmax function
-fun softmax(logits: FloatArray): FloatArray {
-    val maxLogit = logits.maxOrNull() ?: 0.0f
-    val expLogits = logits.map { Math.exp((it - maxLogit).toDouble()).toFloat() }.toFloatArray()
-    val sumExp = expLogits.sum()
-    return expLogits.map { it / sumExp }.toFloatArray() // Normalize
-}
-
-// Top-K function to get the top K probabilities and their indices
-fun topK(probs: FloatArray, k: Int): Pair<FloatArray, IntArray> {
-    val indexedProbs = probs.mapIndexed { index, prob -> index to prob }
-    val sortedProbs = indexedProbs.sortedByDescending { it.second }.take(k)
-    val topProbs = sortedProbs.map { it.second }.toFloatArray()
-    val topIndices = sortedProbs.map { it.first }.toIntArray()
-    return Pair(topProbs, topIndices)
-}
-
-fun transformToFloatArray2D(original: FloatArray, n: Int): Array<FloatArray> {
-    // Calculate how many sub-arrays (rows) we will have
-    val rowCount = original.size / n
-
-    // Create a new 2D array to store the result
-    val result = Array(rowCount) { FloatArray(n) }
-
-    // Fill the new 2D array with sub-arrays from the original array
-    for (i in 0 until rowCount) {
-        // Copy the next n elements into the current row
-        result[i] = original.sliceArray(i * n until (i + 1) * n)
-    }
-
-    return result
-}
-
-suspend fun mainONNXRuntimeValidation() {
-    val modelBytes = CommonDataLoader.bytes("$resourcesPath/gpt2-lm-head-10.onnx".toPath())
-    val model = ORTEngine.loadModel(modelBytes)
-
-    val inputTestTensor = ORTEngine.loadData("$resourcesPath/test_data_set_0/input_0.pb".toPath(), ONNXDataType.ONNX_TENSOR)
-    val realOutput = model.predict(listOf(inputTestTensor))
-    println(realOutput)
-    val output = realOutput["output1"]!!.data as OnnxTensor
-    val logits = output.value as Array<Array<FloatArray>>
-    val lastTokenLogits = logits[0][0][7] // shape: [50257]
-    val lastTokenProbs = softmax(lastTokenLogits)
-    val topK = topK(lastTokenProbs, 5)
-    val topKIndices = topK.second
-    println(topKIndices.joinToString(", "))
-}
-
-suspend fun mainKIValidation() {
-    val modelBytes = CommonDataLoader.bytes("$resourcesPath/gpt2-lm-head-10.onnx".toPath())
-    val model = KIEngine.loadModel(modelBytes, optimize = true, predictionConfig = PredictionConfigs.NoAllocator)
-
-    val tokenizer = HuggingFaceTokenizer.newInstance("gpt2")
-
-    val inputTestTensor = KIEngine.loadData("$resourcesPath/test_data_set_0/input_0.pb".toPath(), ONNXDataType.ONNX_TENSOR)
-    val realOutput = model.predict(listOf(inputTestTensor))
-    println(realOutput)
-
-    val farray = ((realOutput["output1"]!! as KITensor).data as FloatNDArray).array.toArray()
-    val farray2d = transformToFloatArray2D(farray, 50257)
-    println(farray2d)
-
-    val slicedReal = (realOutput["output1"]!! as KITensor).data.slice(
-        starts = intArrayOf(0, 0, 8 - 1, 0),
-        ends = intArrayOf(1, 1, 8, 50257),
-        steps = intArrayOf(1, 1, 1, 1)
-    ) as NumberNDArrayCore
-    val fslice = (slicedReal as FloatNDArray).array.toArray()
-    println(fslice)
-    val softmaxReal = slicedReal.softmax(axis = -1)
-    val topKReal = softmaxReal.topK(
-        axis = -1,
-        k = 5,
-        largest = true,
-        sorted = true
-    )
-
-    val tokenIdReal = (topKReal.second as LongNDArray)[intArrayOf(0,0,0,0)].toInt()
-    val decodeReal = tokenizer.decode(longArrayOf(tokenIdReal.toLong()))
-    println(decodeReal)
-}
-
-// Constants for input and output tensor names used in the GPT-2 model
-private const val INPUT_TENSOR_NAME = "input1"
-private const val OUTPUT_TENSOR_NAME = "output1" // We use only logits tensor
-
-suspend fun extractTopToken(output: Map<String, KIONNXData<*>>, tokensSize: Int): Long {
-    val logits = output[OUTPUT_TENSOR_NAME]!! as KITensor
-    val sliced = logits.data.slice(
-        starts = intArrayOf(0, 0, tokensSize - 1, 0), // First batch, first element in the second dimension, last token, first vocab entry
-        ends = intArrayOf(1, 1, tokensSize, 50257), // Same batch, same second dimension, one token step, whole vocab (50257)
-        steps = intArrayOf(1, 1, 1, 1) // Step of 1 for each dimension
-    ) as NumberNDArrayCore
-    val softmax = sliced.softmax(axis = -1)
-    val topK = softmax.topK(
-        axis = -1, // Apply top-k along the last dimension (vocabulary size)
-        k = 1, // Retrieve the top 1 element
-        largest = true, // We want the largest probabilities (most probable tokens)
-        sorted = false // Sorting is unnecessary since we are only retrieving the top 1
-    )
-    val tokenId = (topK.second as LongNDArray)[intArrayOf(0, 0, 0, 0)]
-
-    return tokenId
-}
-
-suspend fun main() {
-    val modelUrl = "https://github.com/onnx/models/raw/main/validated/text/machine_comprehension/gpt-2/model/gpt2-lm-head-10.onnx"
-    val modelName = "gpt2-lm-head-10"
-
-    println("Downloading model from: $modelUrl")
-    downloadFile(modelUrl, "$resourcesPath/$modelName.onnx")
-
-    val modelBytes = CommonDataLoader.bytes("${resourcesPath}/$modelName.onnx".toPath())
-
-    println("Loading model...")
-    val model = KIEngine.loadModel(modelBytes, optimize = true, predictionConfig = PredictionConfigs.DefaultAutoAllocator)
-
-    val tokenizer = HuggingFaceTokenizer.newInstance("gpt2", mapOf("modelMaxLength" to "1024"))
-    val testString = "Neurogenesis is most active during embryonic development and is responsible for producing " +
-        "all the various types of neurons of the organism, but it continues throughout adult life " +
-        "in a variety of organisms. Once born, neurons do not divide (see mitosis), and many will " +
-        "live the lifespan of the animal, except under extraordinary and usually pathogenic circumstances."
-    val encoded = tokenizer.encode(testString)
-    val tokens = encoded.ids
-    val tokensSize = tokens.size
-
-    val predictionLength = 34
-    val outputTokens = LongArray(predictionLength) { 0 }
-
-    val input = LongNDArray(1, tokensSize) { idx: InlineInt -> tokens[idx.value] }.unsqueeze(0)
-    var currentContext = input.clone()
-
-    print("Here goes the test text for generation:\n$testString")
-
-    for (idx in 0 until predictionLength) {
-        val inputTensor = listOf((currentContext as NDArrayCore).asTensor(INPUT_TENSOR_NAME))
-        val output = model.predict(inputTensor)
-
-        outputTokens[idx] = extractTopToken(output, tokensSize + idx)
-
-        val newTokenArray = LongNDArray(1, 1) { _: InlineInt -> outputTokens[idx] }
-        currentContext = currentContext.concat(listOf(newTokenArray.unsqueeze(0)), axis = -1)
-        print(tokenizer.decode(longArrayOf(outputTokens[idx])))
-    }
-    println("\n\nDone")
-}
diff --git a/examples/src/jvmMain/kotlin/io/kinference/examples/lm/ORTMain.kt b/examples/src/jvmMain/kotlin/io/kinference/examples/lm/ORTMain.kt
new file mode 100644
index 00000000..f1956fb9
--- /dev/null
+++ b/examples/src/jvmMain/kotlin/io/kinference/examples/lm/ORTMain.kt
@@ -0,0 +1,80 @@
+package io.kinference.examples.lm
+
+import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer
+import io.kinference.core.data.tensor.KITensor
+import io.kinference.core.data.tensor.asTensor
+import io.kinference.examples.downloadFile
+import io.kinference.examples.extractTopToken
+import io.kinference.examples.resourcesPath
+import io.kinference.ndarray.arrays.FloatNDArray
+import io.kinference.ndarray.arrays.FloatNDArray.Companion.invoke
+import io.kinference.ort.ORTData
+import io.kinference.ort.ORTEngine
+import io.kinference.ort.data.tensor.ORTTensor
+import io.kinference.utils.CommonDataLoader
+import io.kinference.utils.inlines.InlineInt
+import io.kinference.utils.toIntArray
+import okio.Path.Companion.toPath
+
+// Constants for input and output tensor names used in the GPT-2 model
+private const val INPUT_TENSOR_NAME = "input1"
+private const val OUTPUT_TENSOR_NAME = "output1" // We use only the logits tensor
+
+suspend fun main() {
+    val modelUrl = "https://github.com/onnx/models/raw/main/validated/text/machine_comprehension/gpt-2/model/gpt2-lm-head-10.onnx"
+    val modelName = "gpt2-lm-head-10"
+
+    println("Downloading model from: $modelUrl")
+    downloadFile(modelUrl, "$resourcesPath/$modelName.onnx")
+
+    val modelBytes = CommonDataLoader.bytes("${resourcesPath}/$modelName.onnx".toPath())
+
+    println("Loading model...")
+    val model = ORTEngine.loadModel(modelBytes)
+
+    val tokenizer = HuggingFaceTokenizer.newInstance("gpt2", mapOf("modelMaxLength" to "1024"))
+    val testString = "Neurogenesis is most active during embryonic development and is responsible for producing " +
+        "all the various types of neurons of the organism, but it continues throughout adult life " +
+        "in a variety of organisms. Once born, neurons do not divide (see mitosis), and many will " +
+        "live the lifespan of the animal, except under extraordinary and usually pathogenic circumstances."
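+
+    // The prompt is tokenized below, then decoded greedily: each iteration feeds the growing
+    // context back through the model and appends the single most probable next token.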
+    val encoded = tokenizer.encode(testString)
+    val tokens = encoded.ids
+    val tokensSize = tokens.size
+
+    val predictionLength = 34
+    val outputTokens = LongArray(predictionLength)
+
+    val input = ORTTensor(tokens, longArrayOf(1, 1, tokensSize.toLong()))
+    var currentContext = input.clone(INPUT_TENSOR_NAME)
+
+    print("Generating from the following prompt:\n$testString")
+
+    for (idx in 0 until predictionLength) {
+        val inputTensor = listOf(currentContext)
+        val output = model.predict(inputTensor)
+
+        outputTokens[idx] = extractTopToken(convertToKITensorMap(output), tokensSize + idx, OUTPUT_TENSOR_NAME)
+
+        val newContext = tokens + outputTokens.slice(0..idx)
+        currentContext = ORTTensor(newContext, longArrayOf(1, 1, tokensSize + idx + 1L), INPUT_TENSOR_NAME)
+        print(tokenizer.decode(longArrayOf(outputTokens[idx])))
+    }
+    println("\n\nDone")
+}
+
+private suspend fun convertToKITensorMap(outputs: Map<String, ORTData<*>>): Map<String, KITensor> {
+    return outputs.map { (key, value) ->
+        val ortTensor = value as ORTTensor
+        val data = ortTensor.toFloatArray()
+        val shape = ortTensor.shape.toIntArray()
+        val ndArray = FloatNDArray(shape) { idx: InlineInt -> data[idx.value] }
+        val tensor = ndArray.asTensor(key)
+        return@map key to tensor
+    }.toMap()
+}
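+
+// Note: convertToKITensorMap copies every ORT output into KI tensors on each step;
+// this extra copy is the price of reusing the shared extractTopToken helper here.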