From a38c5559403fed97ca8e2991c3f8bda286ac1a06 Mon Sep 17 00:00:00 2001
From: dmitriyb
Date: Fri, 27 Sep 2024 13:50:47 +0200
Subject: [PATCH] JBAI-5829 [examples] Added GPT-2 text-generation examples using KIEngine and ORTEngine.

---
 .../kotlin/io/kinference/examples/Utils.kt    |  23 +++
 .../io/kinference/examples/lm/KIMain.kt       |  62 ++++++++
 .../kotlin/io/kinference/examples/lm/Main.kt  | 169 ------------------
 .../io/kinference/examples/lm/ORTMain.kt      |  80 ++++++++
 4 files changed, 165 insertions(+), 169 deletions(-)
 create mode 100644 examples/src/jvmMain/kotlin/io/kinference/examples/lm/KIMain.kt
 delete mode 100644 examples/src/jvmMain/kotlin/io/kinference/examples/lm/Main.kt
 create mode 100644 examples/src/jvmMain/kotlin/io/kinference/examples/lm/ORTMain.kt

diff --git a/examples/src/jvmMain/kotlin/io/kinference/examples/Utils.kt b/examples/src/jvmMain/kotlin/io/kinference/examples/Utils.kt
index 4aede5dc..3b3acdc3 100644
--- a/examples/src/jvmMain/kotlin/io/kinference/examples/Utils.kt
+++ b/examples/src/jvmMain/kotlin/io/kinference/examples/Utils.kt
@@ -1,5 +1,9 @@
 package io.kinference.examples
 
+import io.kinference.core.KIONNXData
+import io.kinference.core.data.tensor.KITensor
+import io.kinference.ndarray.arrays.LongNDArray
+import io.kinference.ndarray.arrays.NumberNDArrayCore
 import io.ktor.client.HttpClient
 import io.ktor.client.plugins.HttpTimeout
 import io.ktor.client.request.prepareRequest
@@ -39,3 +43,22 @@ suspend fun downloadFile(url: String, outputPath: String) {
 
     client.close()
 }
+
+suspend fun extractTopToken(output: Map<String, KIONNXData<*>>, tokensSize: Int, outputName: String): Long {
+    val logits = output[outputName]!! as KITensor
+    val sliced = logits.data.slice(
+        starts = intArrayOf(0, 0, tokensSize - 1, 0), // First batch, first element in the second dimension, last token, first vocab entry
+        ends = intArrayOf(1, 1, tokensSize, 50257),   // Same batch, same second dimension, one token step, whole vocab (50257)
+        steps = intArrayOf(1, 1, 1, 1)                // Step of 1 for each dimension
+    ) as NumberNDArrayCore
+    val softmax = sliced.softmax(axis = -1)
+    val topK = softmax.topK(
+        axis = -1,      // Apply top-k along the last dimension (vocabulary size)
+        k = 1,          // Retrieve the top 1 element
+        largest = true, // We want the largest probabilities (most probable tokens)
+        sorted = false  // Sorting is unnecessary since we are only retrieving the top 1
+    )
+    val tokenId = (topK.second as LongNDArray)[intArrayOf(0, 0, 0, 0)]
+
+    return tokenId
+}
diff --git a/examples/src/jvmMain/kotlin/io/kinference/examples/lm/KIMain.kt b/examples/src/jvmMain/kotlin/io/kinference/examples/lm/KIMain.kt
new file mode 100644
index 00000000..6a1bc484
--- /dev/null
+++ b/examples/src/jvmMain/kotlin/io/kinference/examples/lm/KIMain.kt
@@ -0,0 +1,62 @@
+package io.kinference.examples.lm
+
+import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer
+import io.kinference.core.KIEngine
+import io.kinference.core.data.tensor.asTensor
+import io.kinference.examples.downloadFile
+import io.kinference.examples.extractTopToken
+import io.kinference.examples.resourcesPath
+import io.kinference.ndarray.arrays.LongNDArray
+import io.kinference.ndarray.arrays.NDArrayCore
+import io.kinference.utils.CommonDataLoader
+import io.kinference.utils.PredictionConfigs
+import io.kinference.utils.inlines.InlineInt
+import okio.Path.Companion.toPath
+
+// Constants for input and output tensor names used in the GPT-2 model
+private const val INPUT_TENSOR_NAME = "input1"
+private const val OUTPUT_TENSOR_NAME = "output1" // We use only the logits tensor
+
+suspend fun main() {
+    val modelUrl = "https://github.com/onnx/models/raw/main/validated/text/machine_comprehension/gpt-2/model/gpt2-lm-head-10.onnx"
+    val modelName = "gpt2-lm-head-10"
+
+    println("Downloading model from: $modelUrl")
+    downloadFile(modelUrl, "$resourcesPath/$modelName.onnx")
+
+    val modelBytes = CommonDataLoader.bytes("${resourcesPath}/$modelName.onnx".toPath())
+
+    println("Loading model...")
+    val model = KIEngine.loadModel(modelBytes, optimize = true, predictionConfig = PredictionConfigs.DefaultAutoAllocator)
+
+    val tokenizer = HuggingFaceTokenizer.newInstance("gpt2", mapOf("modelMaxLength" to "1024"))
+    val testString = "Neurogenesis is most active during embryonic development and is responsible for producing " +
+        "all the various types of neurons of the organism, but it continues throughout adult life " +
+        "in a variety of organisms. Once born, neurons do not divide (see mitosis), and many will " +
+        "live the lifespan of the animal, except under extraordinary and usually pathogenic circumstances."
+    val encoded = tokenizer.encode(testString)
+    val tokens = encoded.ids
+    val tokensSize = tokens.size
+
+    val predictionLength = 34
+    val outputTokens = LongArray(predictionLength)
+
+    val input = LongNDArray(1, tokensSize) { idx: InlineInt -> tokens[idx.value] }.unsqueeze(0)
+    var currentContext = input.clone()
+
+    print("Generating from the following prompt:\n$testString")
+
+    // Greedy decoding: each iteration feeds the growing context back through the model and
+    // appends the single most probable next token; tokensSize + idx is the current context length.
+    for (idx in 0 until predictionLength) {
+        val inputTensor = listOf((currentContext as NDArrayCore).asTensor(INPUT_TENSOR_NAME))
+        val output = model.predict(inputTensor)
+
+        outputTokens[idx] = extractTopToken(output, tokensSize + idx, OUTPUT_TENSOR_NAME)
+
+        val newTokenArray = LongNDArray(1, 1) { _: InlineInt -> outputTokens[idx] }
+        currentContext = currentContext.concat(listOf(newTokenArray.unsqueeze(0)), axis = -1)
+        print(tokenizer.decode(longArrayOf(outputTokens[idx])))
+    }
+    println("\n\nDone")
+}
diff --git a/examples/src/jvmMain/kotlin/io/kinference/examples/lm/Main.kt b/examples/src/jvmMain/kotlin/io/kinference/examples/lm/Main.kt
deleted file mode 100644
index 01de3393..00000000
--- a/examples/src/jvmMain/kotlin/io/kinference/examples/lm/Main.kt
+++ /dev/null
@@ -1,169 +0,0 @@
-package io.kinference.examples.lm
-
-import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer
-import ai.onnxruntime.OnnxTensor
-import io.kinference.core.KIEngine
-import io.kinference.core.KIONNXData
-import io.kinference.core.data.tensor.KITensor
-import io.kinference.core.data.tensor.asTensor
-import io.kinference.data.ONNXDataType
-import io.kinference.examples.downloadFile
-import io.kinference.examples.resourcesPath
-import io.kinference.ndarray.arrays.FloatNDArray
-import io.kinference.ndarray.arrays.LongNDArray
-import io.kinference.ndarray.arrays.NDArrayCore
-import io.kinference.ndarray.arrays.NumberNDArrayCore
-import io.kinference.ort.ORTEngine
-import io.kinference.utils.CommonDataLoader
-import io.kinference.utils.PredictionConfigs
-import io.kinference.utils.inlines.InlineInt
-import io.kinference.utils.toIntArray
-import okio.Path.Companion.toPath
-
-// Softmax function
-fun softmax(logits: FloatArray): FloatArray {
-    val maxLogit = logits.maxOrNull() ?: 0.0f
-    val expLogits = logits.map { Math.exp((it - maxLogit).toDouble()).toFloat() }.toFloatArray()
-    val sumExp = expLogits.sum()
-    return expLogits.map { it / sumExp }.toFloatArray() // Normalize
-}
-
-// Top-K function to get the top K probabilities and their indices
-fun topK(probs: FloatArray, k: Int): Pair<FloatArray, IntArray> {
-    val indexedProbs = probs.mapIndexed { index, prob -> index to prob }
-    val sortedProbs = indexedProbs.sortedByDescending { it.second }.take(k)
-    val topProbs = sortedProbs.map { it.second }.toFloatArray()
-    val topIndices = sortedProbs.map { it.first }.toIntArray()
-    return Pair(topProbs, topIndices)
-}
-
-fun transformToFloatArray2D(original: FloatArray, n: Int): Array<FloatArray> {
-    // Calculate how many sub-arrays (rows) we will have
-    val rowCount = original.size / n
-
-    // Create a new 2D array to store the result
-    val result = Array(rowCount) { FloatArray(n) }
-
-    // Fill the new 2D array with sub-arrays from the original array
-    for (i in 0 until rowCount) {
-        // Copy the next n elements into the current row
-        result[i] = original.sliceArray(i * n until (i + 1) * n)
-    }
-
-    return result
-}
-
-suspend fun mainONNXRuntimeValidation() {
-    val modelBytes = CommonDataLoader.bytes("$resourcesPath/gpt2-lm-head-10.onnx".toPath())
-    val model = ORTEngine.loadModel(modelBytes)
-
-    val inputTestTensor = ORTEngine.loadData("$resourcesPath/test_data_set_0/input_0.pb".toPath(), ONNXDataType.ONNX_TENSOR)
-    val realOutput = model.predict(listOf(inputTestTensor))
-    println(realOutput)
-    val output = realOutput["output1"]!!.data as OnnxTensor
-    val logits = output.value as Array<Array<FloatArray>>
-    val lastTokenLogits = logits[0][0][7] // shape: [50257]
-    val lastTokenProbs = softmax(lastTokenLogits)
-    val topK = topK(lastTokenProbs, 5)
-    val topKIndices = topK.second
-    println(topKIndices.joinToString(", "))
-}
-
-suspend fun mainKIValidation() {
-    val modelBytes = CommonDataLoader.bytes("$resourcesPath/gpt2-lm-head-10.onnx".toPath())
-    val model = KIEngine.loadModel(modelBytes, optimize = true, predictionConfig = PredictionConfigs.NoAllocator)
-
-    val tokenizer = HuggingFaceTokenizer.newInstance("gpt2")
-
-    val inputTestTensor = KIEngine.loadData("$resourcesPath/test_data_set_0/input_0.pb".toPath(), ONNXDataType.ONNX_TENSOR)
-    val realOutput = model.predict(listOf(inputTestTensor))
-    println(realOutput)
-
-    val farray = ((realOutput["output1"]!! as KITensor).data as FloatNDArray).array.toArray()
-    val farray2d = transformToFloatArray2D(farray, 50257)
-    println(farray2d)
-
-    val slicedReal = (realOutput["output1"]!! as KITensor).data.slice(
-        starts = intArrayOf(0, 0, 8 - 1, 0),
-        ends = intArrayOf(1, 1, 8, 50257),
-        steps = intArrayOf(1, 1, 1, 1)
-    ) as NumberNDArrayCore
-    val fslice = (slicedReal as FloatNDArray).array.toArray()
-    println(fslice)
-    val softmaxReal = slicedReal.softmax(axis = -1)
-    val topKReal = softmaxReal.topK(
-        axis = -1,
-        k = 5,
-        largest = true,
-        sorted = true
-    )
-
-    val tokenIdReal = (topKReal.second as LongNDArray)[intArrayOf(0,0,0,0)].toInt()
-    val decodeReal = tokenizer.decode(longArrayOf(tokenIdReal.toLong()))
-    println(decodeReal)
-}
-
-// Constants for input and output tensor names used in the GPT-2 model
-private const val INPUT_TENSOR_NAME = "input1"
-private const val OUTPUT_TENSOR_NAME = "output1" // We use only logits tensor
-
-suspend fun extractTopToken(output: Map<String, KIONNXData<*>>, tokensSize: Int): Long {
-    val logits = output[OUTPUT_TENSOR_NAME]!! as KITensor
-    val sliced = logits.data.slice(
-        starts = intArrayOf(0, 0, tokensSize - 1, 0), // First batch, first element in the second dimension, last token, first vocab entry
-        ends = intArrayOf(1, 1, tokensSize, 50257), // Same batch, same second dimension, one token step, whole vocab (50257)
-        steps = intArrayOf(1, 1, 1, 1) // Step of 1 for each dimension
-    ) as NumberNDArrayCore
-    val softmax = sliced.softmax(axis = -1)
-    val topK = softmax.topK(
-        axis = -1, // Apply top-k along the last dimension (vocabulary size)
-        k = 1, // Retrieve the top 1 element
-        largest = true, // We want the largest probabilities (most probable tokens)
-        sorted = false // Sorting is unnecessary since we are only retrieving the top 1
-    )
-    val tokenId = (topK.second as LongNDArray)[intArrayOf(0, 0, 0, 0)]
-
-    return tokenId
-}
-
-suspend fun main() {
-    val modelUrl = "https://github.com/onnx/models/raw/main/validated/text/machine_comprehension/gpt-2/model/gpt2-lm-head-10.onnx"
-    val modelName = "gpt2-lm-head-10"
-
-    println("Downloading model from: $modelUrl")
-    downloadFile(modelUrl, "$resourcesPath/$modelName.onnx")
-
-    val modelBytes = CommonDataLoader.bytes("${resourcesPath}/$modelName.onnx".toPath())
-
-    println("Loading model...")
-    val model = KIEngine.loadModel(modelBytes, optimize = true, predictionConfig = PredictionConfigs.DefaultAutoAllocator)
-
-    val tokenizer = HuggingFaceTokenizer.newInstance("gpt2", mapOf("modelMaxLength" to "1024"))
-    val testString = "Neurogenesis is most active during embryonic development and is responsible for producing " +
-        "all the various types of neurons of the organism, but it continues throughout adult life " +
-        "in a variety of organisms. Once born, neurons do not divide (see mitosis), and many will " +
-        "live the lifespan of the animal, except under extraordinary and usually pathogenic circumstances."
-    val encoded = tokenizer.encode(testString)
-    val tokens = encoded.ids
-    val tokensSize = tokens.size
-
-    val predictionLength = 34
-    val outputTokens = LongArray(predictionLength) { 0 }
-
-    val input = LongNDArray(1, tokensSize) { idx: InlineInt -> tokens[idx.value] }.unsqueeze(0)
-    var currentContext = input.clone()
-
-    print("Here goes the test text for generation:\n$testString")
-
-    for (idx in 0 until predictionLength) {
-        val inputTensor = listOf((currentContext as NDArrayCore).asTensor(INPUT_TENSOR_NAME))
-        val output = model.predict(inputTensor)
-
-        outputTokens[idx] = extractTopToken(output, tokensSize + idx)
-
-        val newTokenArray = LongNDArray(1, 1) { _: InlineInt -> outputTokens[idx] }
-        currentContext = currentContext.concat(listOf(newTokenArray.unsqueeze(0)), axis = -1)
-        print(tokenizer.decode(longArrayOf(outputTokens[idx])))
-    }
-    println("\n\nDone")
-}
diff --git a/examples/src/jvmMain/kotlin/io/kinference/examples/lm/ORTMain.kt b/examples/src/jvmMain/kotlin/io/kinference/examples/lm/ORTMain.kt
new file mode 100644
index 00000000..f1956fb9
--- /dev/null
+++ b/examples/src/jvmMain/kotlin/io/kinference/examples/lm/ORTMain.kt
@@ -0,0 +1,80 @@
+package io.kinference.examples.lm
+
+import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer
+import io.kinference.core.data.tensor.KITensor
+import io.kinference.core.data.tensor.asTensor
+import io.kinference.examples.downloadFile
+import io.kinference.examples.extractTopToken
+import io.kinference.examples.resourcesPath
+import io.kinference.ndarray.arrays.FloatNDArray
+import io.kinference.ndarray.arrays.FloatNDArray.Companion.invoke
+import io.kinference.ort.ORTData
+import io.kinference.ort.ORTEngine
+import io.kinference.ort.data.tensor.ORTTensor
+import io.kinference.utils.CommonDataLoader
+import io.kinference.utils.inlines.InlineInt
+import io.kinference.utils.toIntArray
+import okio.Path.Companion.toPath
+
+// Constants for input and output tensor names used in the GPT-2 model
+private const val INPUT_TENSOR_NAME = "input1"
+private const val OUTPUT_TENSOR_NAME = "output1" // We use only the logits tensor
+
+suspend fun main() {
+    val modelUrl = "https://github.com/onnx/models/raw/main/validated/text/machine_comprehension/gpt-2/model/gpt2-lm-head-10.onnx"
+    val modelName = "gpt2-lm-head-10"
+
+    println("Downloading model from: $modelUrl")
+    downloadFile(modelUrl, "$resourcesPath/$modelName.onnx")
+
+    val modelBytes = CommonDataLoader.bytes("${resourcesPath}/$modelName.onnx".toPath())
+
+    println("Loading model...")
+    val model = ORTEngine.loadModel(modelBytes)
+
+    val tokenizer = HuggingFaceTokenizer.newInstance("gpt2", mapOf("modelMaxLength" to "1024"))
+    val testString = "Neurogenesis is most active during embryonic development and is responsible for producing " +
+        "all the various types of neurons of the organism, but it continues throughout adult life " +
+        "in a variety of organisms. Once born, neurons do not divide (see mitosis), and many will " +
+        "live the lifespan of the animal, except under extraordinary and usually pathogenic circumstances."
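+
+    // The prompt is tokenized below, then decoded greedily: each iteration feeds the growing
+    // context back through the model and appends the single most probable next token.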
+    val encoded = tokenizer.encode(testString)
+    val tokens = encoded.ids
+    val tokensSize = tokens.size
+
+    val predictionLength = 34
+    val outputTokens = LongArray(predictionLength)
+
+    val input = ORTTensor(tokens, longArrayOf(1, 1, tokensSize.toLong()))
+    var currentContext = input.clone(INPUT_TENSOR_NAME)
+
+    print("Generating from the following prompt:\n$testString")
+
+    for (idx in 0 until predictionLength) {
+        val inputTensor = listOf(currentContext)
+        val output = model.predict(inputTensor)
+
+        outputTokens[idx] = extractTopToken(convertToKITensorMap(output), tokensSize + idx, OUTPUT_TENSOR_NAME)
+
+        val newContext = tokens + outputTokens.slice(0..idx)
+        currentContext = ORTTensor(newContext, longArrayOf(1, 1, tokensSize + idx + 1L), INPUT_TENSOR_NAME)
+        print(tokenizer.decode(longArrayOf(outputTokens[idx])))
+    }
+    println("\n\nDone")
+}
+
+private suspend fun convertToKITensorMap(outputs: Map<String, ORTData<*>>): Map<String, KITensor> {
+    return outputs.map { (key, value) ->
+        val ortTensor = value as ORTTensor
+        val data = ortTensor.toFloatArray()
+        val shape = ortTensor.shape.toIntArray()
+        val ndArray = FloatNDArray(shape) { idx: InlineInt -> data[idx.value] }
+        val tensor = ndArray.asTensor(key)
+        return@map key to tensor
+    }.toMap()
+}
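+
+// Note: convertToKITensorMap copies every ORT output into KI tensors on each step;
+// this extra copy is the price of reusing the shared extractTopToken helper here.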