diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 619eebf0ef..8cfc99847b 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
-add_subdirectory(cpp/benchmark_genai)
 add_subdirectory(cpp/text_generation)
 add_subdirectory(cpp/image_generation)
 add_subdirectory(cpp/visual_language_chat)
@@ -16,7 +15,6 @@ install(FILES
     COMPONENT cpp_samples_genai)
 
 install(DIRECTORY
-    cpp/benchmark_genai
     cpp/text_generation
     cpp/image_generation
     cpp/visual_language_chat
diff --git a/samples/cpp/benchmark_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt
deleted file mode 100644
index 902a05eee6..0000000000
--- a/samples/cpp/benchmark_genai/CMakeLists.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (C) 2023-2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-find_package(OpenVINOGenAI REQUIRED
-    PATHS
-        "${CMAKE_BINARY_DIR}"  # Reuse the package from the build.
-        ${OpenVINO_DIR}  # GenAI may be installed alogside OpenVINO.
-    NO_CMAKE_FIND_ROOT_PATH
-)
-
-include(FetchContent)
-
-if(POLICY CMP0135)
-    cmake_policy(SET CMP0135 NEW)
-endif()
-
-FetchContent_Declare(cxxopts
-    URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz
-    URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
-FetchContent_MakeAvailable(cxxopts)
-
-add_executable(benchmark_genai benchmark_genai.cpp)
-target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts)
-set_target_properties(benchmark_genai PROPERTIES
-    COMPILE_PDB_NAME benchmark_genai
-    # Ensure out of box LC_RPATH on macOS with SIP
-    INSTALL_RPATH_USE_LINK_PATH ON)
-
-install(TARGETS benchmark_genai
-    RUNTIME DESTINATION samples_bin/
-    COMPONENT samples_bin
-    EXCLUDE_FROM_ALL)
diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md
deleted file mode 100644
index d7b3f6ac21..0000000000
--- a/samples/cpp/benchmark_genai/README.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# LLMs benchmarking sample
-
-This sample script demonstrates how to benchmark an LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics.
-
-## Download and convert the model and tokenizers
-
-The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
-
-It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported.
-
-```sh
-pip install --upgrade-strategy eager -r ../../requirements.txt
-optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
-```
-
-## Usage
-
-Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample.
-
-```sh
-benchmark_genai [OPTIONS]
-```
-
-### Options
-
-- `-m, --model`: Path to the model and tokenizers base directory.
-- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
-- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
-- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations.
-- `-n, --num_iter` (default: `3`): Number of iterations.
-- `-d, --device` (default: `"CPU"`): Device to run the model on.
-
-### Output:
-
-```
-benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10
-```
-
-```
-Load time: 3405.69 ms
-Generate time: 1430.77 ± 3.04 ms
-Tokenization time: 0.51 ± 0.02 ms
-Detokenization time: 0.37 ± 0.01 ms
-TTFT: 81.60 ± 0.54 ms
-TPOT: 71.52 ± 2.72 ms
-Throughput tokens/s: 13.98 ± 0.53
-```
-
-For more information how performance metrics are calculated please follow [performance-metrics tutorial](../../../src/README.md#performance-metrics).
diff --git a/samples/cpp/text_generation/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt
index d4d58c8383..77ab30112f 100644
--- a/samples/cpp/text_generation/CMakeLists.txt
+++ b/samples/cpp/text_generation/CMakeLists.txt
@@ -35,3 +35,28 @@ set (SAMPLE_LIST
 foreach(sample ${SAMPLE_LIST})
     add_sample_executable(${sample})
 endforeach()
+
+
+# benchmark_genai
+include(FetchContent)
+
+if(POLICY CMP0135)
+    cmake_policy(SET CMP0135 NEW)
+endif()
+
+FetchContent_Declare(cxxopts
+    URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz
+    URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
+FetchContent_MakeAvailable(cxxopts)
+
+add_executable(benchmark_genai benchmark_genai.cpp)
+target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts)
+set_target_properties(benchmark_genai PROPERTIES
+    COMPILE_PDB_NAME benchmark_genai
+    # Ensure out of box LC_RPATH on macOS with SIP
+    INSTALL_RPATH_USE_LINK_PATH ON)
+
+install(TARGETS benchmark_genai
+    RUNTIME DESTINATION samples_bin/
+    COMPONENT samples_bin
+    EXCLUDE_FROM_ALL)
\ No newline at end of file
diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md
index 0ec44c1b4f..806ebab280 100644
--- a/samples/cpp/text_generation/README.md
+++ b/samples/cpp/text_generation/README.md
@@ -1,17 +1,15 @@
-# OpenVINO AI Text Generation Samples
+# OpenVINO GenAI Text Generation Samples
 
 These samples showcase the use of OpenVINO's inference capabilities for text generation tasks, including different decoding strategies such as beam search, multinomial sampling, and speculative decoding. Each sample has a specific focus and demonstrates a unique aspect of text generation.
 
 The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU.
 
-There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of LLM-powered text generation in Python.
+There are also Jupyter notebooks for some samples. You can find links to them in the corresponding sample descriptions.
 
 ## Table of Contents
 1. [Download and Convert the Model and Tokenizers](#download-and-convert-the-model-and-tokenizers)
-2. [Running the Samples](#running-the-samples)
-3. [Using encrypted models](#using-encrypted-models)
-4. [Sample Descriptions](#sample-descriptions)
-5. [Troubleshooting](#troubleshooting)
-6. [Support and Contribution](#support-and-contribution)
+2. [Sample Descriptions](#sample-descriptions)
+3. [Troubleshooting](#troubleshooting)
+4. [Support and Contribution](#support-and-contribution)
 
 ## Download and convert the model and tokenizers
 
@@ -24,41 +22,47 @@ pip install --upgrade-strategy eager -r ../../requirements.txt
 optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
 ```
 
-## Running the Samples
+Model examples to use for different samples:
+- `chat_sample`: meta-llama/Llama-2-7b-chat-hf
+- `speculative_decoding_lm`: meta-llama/Llama-2-13b-hf as the main model and TinyLlama/TinyLlama-1.1B-Chat-v1.0 as the draft model
+- other samples: meta-llama/Llama-2-7b-hf
 
-Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run a specific sample.
-
-`greedy_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"`
+## Sample Descriptions
+### Common information
+Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) for general information about OpenVINO samples.
 
 Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU.
 
 See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
-
-## Sample Descriptions
-
 ### 1. Greedy Causal LM (`greedy_causal_lm`)
-- **Description:** Basic text generation using a causal language model.
+- **Description:**
+Basic text generation using a causal language model.
+Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python.
 - **Main Feature:** Demonstrates simple text continuation.
 - **Run Command:**
   ```bash
-  ./greedy_causal_lm <MODEL_DIR>
+  ./greedy_causal_lm <MODEL_DIR> "<PROMPT>"
   ```
 
 ### 2. Beam Search Causal LM (`beam_search_causal_lm`)
-- **Description:** Uses beam search for more coherent text generation.
+- **Description:**
+Uses beam search for more coherent text generation.
+Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python.
 - **Main Feature:** Improves text quality with beam search.
 - **Run Command:**
   ```bash
-  ./beam_search_causal_lm <MODEL_DIR>
+  ./beam_search_causal_lm <MODEL_DIR> "<PROMPT 1>" ["<PROMPT 2>" ...]
   ```
 
 ### 3. Chat Sample (`chat_sample`)
-- **Description:** Interactive chat interface powered by OpenVINO.
+- **Description:**
+Interactive chat interface powered by OpenVINO.
+Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of LLM-powered text generation in Python.
 - **Main Feature:** Real-time chat-like text generation.
 - **Run Command:**
   ```bash
-  ./chat_sample <MODEL_DIR>
+  ./chat_sample <MODEL_DIR>
   ```
 
 #### Missing chat template
 If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model.
@@ -73,7 +77,7 @@ The following template can be used as a default, but it may not work properly wi
 - **Main Feature:** Introduces randomness for creative outputs.
 - **Run Command:**
   ```bash
-  ./multinomial_causal_lm <MODEL_DIR>
+  ./multinomial_causal_lm <MODEL_DIR> "<PROMPT>"
   ```
 
 ### 5. Prompt Lookup Decoding LM (`prompt_lookup_decoding_lm`)
@@ -82,7 +86,7 @@ The following template can be used as a default, but it may not work properly wi
 - **Main Feature:** Specialized prompt-based inference.
 - **Run Command:**
   ```bash
-  ./prompt_lookup_decoding_lm <MODEL_DIR>
+  ./prompt_lookup_decoding_lm <MODEL_DIR> "<PROMPT>"
   ```
 
 ### 6. Speculative Decoding LM (`speculative_decoding_lm`)
@@ -92,10 +96,12 @@ Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assis
 Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests.
 
 More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf
+
+Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/speculative-sampling) that provides an example of LLM-powered text generation in Python.
 - **Main Feature:** Reduces latency while generating high-quality text.
 - **Run Command:**
   ```bash
-  ./speculative_decoding_lm <MODEL_DIR> <DRAFT_MODEL_DIR>
+  ./speculative_decoding_lm <MODEL_DIR> <DRAFT_MODEL_DIR> "<PROMPT>"
   ```
 
 ### 7. Encrypted Model Causal LM (`encrypted_model_causal_lm`)
@@ -111,9 +117,28 @@ For the sake of brevity the code above does not include Tokenizer decryption. Fo
 - **Main Feature:** Read model directly from memory buffer
 - **Run Command:**
   ```bash
-  ./encrypted_model_causal_lm <MODEL_DIR>
+  ./encrypted_model_causal_lm <MODEL_DIR> "<PROMPT>"
   ```
 
+### 8. LLM benchmarking sample (`benchmark_genai`)
+- **Description:**
+This sample demonstrates how to benchmark an LLM in OpenVINO GenAI. It includes functionality for warm-up iterations, generating text, and calculating various performance metrics.
+
+For more information on how performance metrics are calculated, please refer to the [performance metrics tutorial](../../../src/README.md#performance-metrics).
+- **Main Feature:** Benchmark model performance via OpenVINO GenAI.
+- **Run Command:**
+  ```bash
+  ./benchmark_genai [OPTIONS]
+  ```
+  #### Options
+- `-m, --model`: Path to the model and tokenizers base directory.
+- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text from.
+- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
+- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
+- `-n, --num_iter` (default: `3`): Number of iterations.
+- `-d, --device` (default: `"CPU"`): Device to run the model on.
+
+
 ## Troubleshooting
 
 ### Unicode characters encoding error on Windows
diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/text_generation/benchmark_genai.cpp
similarity index 100%
rename from samples/cpp/benchmark_genai/benchmark_genai.cpp
rename to samples/cpp/text_generation/benchmark_genai.cpp
diff --git a/src/README.md b/src/README.md
index 6466b431d0..d8c15b1c77 100644
--- a/src/README.md
+++ b/src/README.md
@@ -394,7 +394,7 @@ durations = np.array(raw_metrics.m_new_token_times[1:]) - np.array(raw_metrics.m
 print(f'Median from token to token duration: {np.median(durations):.2f} ms')
 ```
 
-For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/benchmark_genai/README.md) samples.
+For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/text_generation/README.md) samples.
 
 ## How It Works
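
As background for the relocated `benchmark_genai` sample, the following is a minimal, illustrative sketch of the kind of warm-up/measure loop such a benchmark performs. It is not the sample's actual source: it assumes the `ov::genai` C++ performance-metrics API described in the performance-metrics tutorial (`LLMPipeline::generate` returning `DecodedResults` with a `perf_metrics` member of type `ov::genai::PerfMetrics`, accumulated with `operator+` and summarized via `get_ttft()`, `get_tpot()`, `get_throughput()`, and `get_generate_duration()`), and the hard-coded prompt and iteration counts stand in for the sample's command-line options.

```cpp
// Illustrative sketch only; mirrors the warm-up/measure structure of a benchmark
// such as benchmark_genai, assuming the ov::genai PerfMetrics API.
#include "openvino/genai/llm_pipeline.hpp"

#include <cstdlib>
#include <iostream>
#include <string>

int main(int argc, char* argv[]) try {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <MODEL_DIR>\n";
        return EXIT_FAILURE;
    }
    const std::string models_path = argv[1];                // e.g. TinyLlama-1.1B-Chat-v1.0
    const std::string prompt = "The Sky is blue because";   // stands in for -p/--prompt
    const size_t num_warmup = 1;                             // stands in for -nw/--num_warmup
    const size_t num_iter = 3;                               // stands in for -n/--num_iter

    ov::genai::LLMPipeline pipe(models_path, "CPU");
    ov::genai::GenerationConfig config;
    config.max_new_tokens = 20;                              // stands in for -mt/--max_new_tokens

    // Warm-up iterations are excluded from the reported statistics.
    for (size_t i = 0; i < num_warmup; ++i)
        pipe.generate(prompt, config);

    // Accumulate metrics across measured iterations; PerfMetrics::operator+ merges the
    // raw timings so means and standard deviations cover all iterations.
    ov::genai::DecodedResults res = pipe.generate(prompt, config);
    ov::genai::PerfMetrics metrics = res.perf_metrics;
    for (size_t i = 1; i < num_iter; ++i) {
        res = pipe.generate(prompt, config);
        metrics = metrics + res.perf_metrics;
    }

    std::cout << "Load time: " << metrics.get_load_time() << " ms\n";
    std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ms\n";
    std::cout << "TTFT: " << metrics.get_ttft().mean << " ms\n";
    std::cout << "TPOT: " << metrics.get_tpot().mean << " ms/token\n";
    std::cout << "Throughput: " << metrics.get_throughput().mean << " tokens/s\n";
    return EXIT_SUCCESS;
} catch (const std::exception& e) {
    std::cerr << e.what() << '\n';
    return EXIT_FAILURE;
}
```

Keeping warm-up runs out of the accumulated metrics avoids skewing TTFT and throughput with one-time model compilation and cache effects, which is why the sample exposes `--num_warmup` separately from `--num_iter`.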