From b92d180dfacc13247e6ecc0626d6d919b3d3e375 Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Wed, 18 Dec 2024 22:08:21 +0000 Subject: [PATCH 1/9] Move cpp text generation samples to one folder --- .github/workflows/causal_lm_cpp.yml | 16 ++-- samples/CMakeLists.txt | 12 --- .../cpp/beam_search_causal_lm/CMakeLists.txt | 22 ----- samples/cpp/chat_sample/CMakeLists.txt | 22 ----- .../cpp/lora_greedy_causal_lm/CMakeLists.txt | 19 ---- .../cpp/multinomial_causal_lm/CMakeLists.txt | 22 ----- .../prompt_lookup_decoding_lm/CMakeLists.txt | 23 ----- .../speculative_decoding_lm/CMakeLists.txt | 23 ----- samples/cpp/text_generation/CMakeLists.txt | 93 ++++++++++++++++++- .../beam_search_causal_lm.cpp | 0 .../chat_sample.cpp | 0 .../lora_greedy_causal_lm.cpp | 0 .../multinomial_causal_lm.cpp | 0 .../prompt_lookup_decoding_lm.cpp | 0 .../speculative_decoding_lm.cpp | 0 15 files changed, 99 insertions(+), 153 deletions(-) delete mode 100644 samples/cpp/beam_search_causal_lm/CMakeLists.txt delete mode 100644 samples/cpp/chat_sample/CMakeLists.txt delete mode 100644 samples/cpp/lora_greedy_causal_lm/CMakeLists.txt delete mode 100644 samples/cpp/multinomial_causal_lm/CMakeLists.txt delete mode 100644 samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt delete mode 100644 samples/cpp/speculative_decoding_lm/CMakeLists.txt rename samples/cpp/{beam_search_causal_lm => text_generation}/beam_search_causal_lm.cpp (100%) rename samples/cpp/{chat_sample => text_generation}/chat_sample.cpp (100%) rename samples/cpp/{lora_greedy_causal_lm => text_generation}/lora_greedy_causal_lm.cpp (100%) rename samples/cpp/{multinomial_causal_lm => text_generation}/multinomial_causal_lm.cpp (100%) rename samples/cpp/{prompt_lookup_decoding_lm => text_generation}/prompt_lookup_decoding_lm.cpp (100%) rename samples/cpp/{speculative_decoding_lm => text_generation}/speculative_decoding_lm.cpp (100%) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 
b6abbefac0..6576b75ff0 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -53,7 +53,7 @@ jobs: wget https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true -O adapter_model.safetensors - run: > . ./ov/setupvars.sh - && timeout 35s ./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./open_llama_3b_v2/ a + && timeout 35s ./build/samples/cpp/text_generation/multinomial_causal_lm ./open_llama_3b_v2/ a env: PYTHONPATH: "./build" - run: > @@ -78,7 +78,7 @@ jobs: matrix: executable: [ - ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm, + ./build/samples/cpp/text_generation/beam_search_causal_lm, python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py, ] runs-on: ubuntu-20.04 @@ -338,7 +338,7 @@ jobs: optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat - run: > . ./ov/setupvars.sh - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" + && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - env: PYTHONPATH: "./build" @@ -373,7 +373,7 @@ jobs: optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 - run: > . ./ov/setupvars.sh - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69 + && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./phi-2/ 69 | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) - env: PYTHONPATH: "./build" @@ -408,7 +408,7 @@ jobs: optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 - run: > . 
./ov/setupvars.sh - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69 + && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./notus-7b-v1/ 69 | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) - env: PYTHONPATH: "./build" @@ -445,7 +445,7 @@ jobs: - name: run and compare run: | source ./ov/setupvars.sh - ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_speculative.txt + ./build/samples/cpp/text_generation/speculative_decoding_lm ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_speculative.txt ./build/samples/cpp/text_generation/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python ./samples/python/speculative_decoding_lm/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt python -c " @@ -502,7 +502,7 @@ jobs: Question: Can you please add 2 and 3 A:' > ./prompt.txt - ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt + ./build/samples/cpp/text_generation/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt ./build/samples/cpp/text_generation/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_greedy.txt python ./samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_py.txt python -c " @@ -664,7 +664,7 @@ jobs: run: | source ./ov/setupvars.sh printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt - timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt + timeout 30s ./build/samples/cpp/text_generation/chat_sample ./TinyLlama-1.1B-Chat-v1.0/
< input.txt > ./pred.txt python -c " from transformers import AutoTokenizer, AutoModelForCausalLM model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 02539df6e7..619eebf0ef 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -2,14 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 # -add_subdirectory(cpp/beam_search_causal_lm) add_subdirectory(cpp/benchmark_genai) -add_subdirectory(cpp/chat_sample) add_subdirectory(cpp/text_generation) -add_subdirectory(cpp/lora_greedy_causal_lm) -add_subdirectory(cpp/multinomial_causal_lm) -add_subdirectory(cpp/prompt_lookup_decoding_lm) -add_subdirectory(cpp/speculative_decoding_lm) add_subdirectory(cpp/image_generation) add_subdirectory(cpp/visual_language_chat) add_subdirectory(cpp/whisper_speech_recognition) @@ -22,15 +16,9 @@ install(FILES COMPONENT cpp_samples_genai) install(DIRECTORY - cpp/beam_search_causal_lm cpp/benchmark_genai - cpp/chat_sample cpp/text_generation cpp/image_generation - cpp/lora_greedy_causal_lm - cpp/multinomial_causal_lm - # Don't install prompt_lookup_decoding_lm because it doesn't use openvino_genai library and is not verified yet. - cpp/speculative_decoding_lm cpp/visual_language_chat cpp/whisper_speech_recognition DESTINATION samples/cpp COMPONENT cpp_samples_genai) diff --git a/samples/cpp/beam_search_causal_lm/CMakeLists.txt b/samples/cpp/beam_search_causal_lm/CMakeLists.txt deleted file mode 100644 index 9bf1a8aac8..0000000000 --- a/samples/cpp/beam_search_causal_lm/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - HINTS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
- NO_CMAKE_FIND_ROOT_PATH -) - -add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -target_link_libraries(beam_search_causal_lm PRIVATE openvino::genai) -set_target_properties(beam_search_causal_lm PROPERTIES - COMPILE_PDB_NAME beam_search_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(beam_search_causal_lm PRIVATE cxx_std_11) - -install(TARGETS beam_search_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/chat_sample/CMakeLists.txt b/samples/cpp/chat_sample/CMakeLists.txt deleted file mode 100644 index 69578dc86c..0000000000 --- a/samples/cpp/chat_sample/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. - NO_CMAKE_FIND_ROOT_PATH -) - -add_executable(chat_sample chat_sample.cpp) -target_link_libraries(chat_sample PRIVATE openvino::genai) -set_target_properties(chat_sample PROPERTIES - COMPILE_PDB_NAME chat_sample - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(chat_sample PRIVATE cxx_std_11) - -install(TARGETS chat_sample - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/lora_greedy_causal_lm/CMakeLists.txt b/samples/cpp/lora_greedy_causal_lm/CMakeLists.txt deleted file mode 100644 index 1d3f6307c0..0000000000 --- a/samples/cpp/lora_greedy_causal_lm/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
- NO_CMAKE_FIND_ROOT_PATH -) -add_executable(lora_greedy_causal_lm lora_greedy_causal_lm.cpp) -target_link_libraries(lora_greedy_causal_lm PRIVATE openvino::genai) -set_target_properties(lora_greedy_causal_lm PROPERTIES - COMPILE_PDB_NAME lora_greedy_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(lora_greedy_causal_lm PRIVATE cxx_std_11) -install(TARGETS lora_greedy_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/multinomial_causal_lm/CMakeLists.txt b/samples/cpp/multinomial_causal_lm/CMakeLists.txt deleted file mode 100644 index 83b2335431..0000000000 --- a/samples/cpp/multinomial_causal_lm/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
- NO_CMAKE_FIND_ROOT_PATH -) - -add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) -target_link_libraries(multinomial_causal_lm PRIVATE openvino::genai) -set_target_properties(multinomial_causal_lm PROPERTIES - COMPILE_PDB_NAME multinomial_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(multinomial_causal_lm PRIVATE cxx_std_11) - -install(TARGETS multinomial_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt deleted file mode 100644 index b0ce8b1b60..0000000000 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. - NO_CMAKE_FIND_ROOT_PATH -) - -set(TARGET_NAME prompt_lookup_decoding_lm) -add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai) - -set_target_properties(${TARGET_NAME} PROPERTIES - COMPILE_PDB_NAME ${TARGET_NAME} - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) - -install(TARGETS ${TARGET_NAME} - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt deleted file mode 100644 index 7c48b6cc0b..0000000000 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. 
- ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. - NO_CMAKE_FIND_ROOT_PATH -) - -set(TARGET_NAME speculative_decoding_lm) -add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai) - -set_target_properties(${TARGET_NAME} PROPERTIES - COMPILE_PDB_NAME ${TARGET_NAME} - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) - -install(TARGETS ${TARGET_NAME} - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/text_generation/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt index 377682974e..d262e57e2f 100644 --- a/samples/cpp/text_generation/CMakeLists.txt +++ b/samples/cpp/text_generation/CMakeLists.txt @@ -8,6 +8,8 @@ find_package(OpenVINOGenAI REQUIRED NO_CMAKE_FIND_ROOT_PATH ) + +# greedy_causal_lm add_executable(greedy_causal_lm greedy_causal_lm.cpp) target_link_libraries(greedy_causal_lm PRIVATE openvino::genai) set_target_properties(greedy_causal_lm PROPERTIES @@ -15,12 +17,13 @@ set_target_properties(greedy_causal_lm PROPERTIES # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) target_compile_features(greedy_causal_lm PRIVATE cxx_std_11) - install(TARGETS greedy_causal_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) + +# encrypted_model_causal_lm add_executable(encrypted_model_causal_lm encrypted_model_causal_lm.cpp) target_link_libraries(encrypted_model_causal_lm PRIVATE openvino::genai) set_target_properties(encrypted_model_causal_lm PROPERTIES @@ -28,8 +31,94 @@ set_target_properties(encrypted_model_causal_lm PROPERTIES # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) target_compile_features(encrypted_model_causal_lm PRIVATE cxx_std_11) - install(TARGETS encrypted_model_causal_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) + + +# beam_search_causal_lm +add_executable(beam_search_causal_lm 
beam_search_causal_lm.cpp) +target_link_libraries(beam_search_causal_lm PRIVATE openvino::genai) +set_target_properties(beam_search_causal_lm PROPERTIES + COMPILE_PDB_NAME beam_search_causal_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(beam_search_causal_lm PRIVATE cxx_std_11) +install(TARGETS beam_search_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) + + +# chat_sample +add_executable(chat_sample chat_sample.cpp) +target_link_libraries(chat_sample PRIVATE openvino::genai) +set_target_properties(chat_sample PROPERTIES + COMPILE_PDB_NAME chat_sample + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(chat_sample PRIVATE cxx_std_11) +install(TARGETS chat_sample + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) + + +# lora_greedy_causal_lm +add_executable(lora_greedy_causal_lm lora_greedy_causal_lm.cpp) +target_link_libraries(lora_greedy_causal_lm PRIVATE openvino::genai) +set_target_properties(lora_greedy_causal_lm PROPERTIES + COMPILE_PDB_NAME lora_greedy_causal_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(lora_greedy_causal_lm PRIVATE cxx_std_11) +install(TARGETS lora_greedy_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) + + +# multinomial_causal_lm +add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) +target_link_libraries(multinomial_causal_lm PRIVATE openvino::genai) +set_target_properties(multinomial_causal_lm PROPERTIES + COMPILE_PDB_NAME multinomial_causal_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(multinomial_causal_lm PRIVATE cxx_std_11) + +install(TARGETS multinomial_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) + + +# 
prompt_lookup_decoding_lm +add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) +target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::genai) + +set_target_properties(prompt_lookup_decoding_lm PROPERTIES + COMPILE_PDB_NAME prompt_lookup_decoding_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +# Don't install prompt_lookup_decoding_lm because it doesn't use openvino_genai library and is not verified yet. +# install(TARGETS prompt_lookup_decoding_lm +# RUNTIME DESTINATION samples_bin/ +# COMPONENT samples_bin +# EXCLUDE_FROM_ALL) + + +# speculative_decoding_lm +add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) +target_link_libraries(speculative_decoding_lm PRIVATE openvino::genai) + +set_target_properties(speculative_decoding_lm PROPERTIES + COMPILE_PDB_NAME speculative_decoding_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS speculative_decoding_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/text_generation/beam_search_causal_lm.cpp similarity index 100% rename from samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp rename to samples/cpp/text_generation/beam_search_causal_lm.cpp diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/text_generation/chat_sample.cpp similarity index 100% rename from samples/cpp/chat_sample/chat_sample.cpp rename to samples/cpp/text_generation/chat_sample.cpp diff --git a/samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp b/samples/cpp/text_generation/lora_greedy_causal_lm.cpp similarity index 100% rename from samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp rename to samples/cpp/text_generation/lora_greedy_causal_lm.cpp diff --git a/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp 
b/samples/cpp/text_generation/multinomial_causal_lm.cpp similarity index 100% rename from samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp rename to samples/cpp/text_generation/multinomial_causal_lm.cpp diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/text_generation/prompt_lookup_decoding_lm.cpp similarity index 100% rename from samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp rename to samples/cpp/text_generation/prompt_lookup_decoding_lm.cpp diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/text_generation/speculative_decoding_lm.cpp similarity index 100% rename from samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp rename to samples/cpp/text_generation/speculative_decoding_lm.cpp From 39259f87eeb4a90b52f8ad027913b0886f935418 Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Thu, 19 Dec 2024 11:17:31 +0000 Subject: [PATCH 2/9] Update readme.md --- samples/cpp/beam_search_causal_lm/README.md | 38 ------- samples/cpp/chat_sample/README.md | 46 -------- samples/cpp/multinomial_causal_lm/README.md | 38 ------- .../cpp/prompt_lookup_decoding_lm/README.md | 41 ------- samples/cpp/speculative_decoding_lm/README.md | 45 -------- samples/cpp/text_generation/README.md | 107 ++++++++++++++++-- 6 files changed, 99 insertions(+), 216 deletions(-) delete mode 100644 samples/cpp/beam_search_causal_lm/README.md delete mode 100644 samples/cpp/chat_sample/README.md delete mode 100644 samples/cpp/multinomial_causal_lm/README.md delete mode 100644 samples/cpp/prompt_lookup_decoding_lm/README.md delete mode 100644 samples/cpp/speculative_decoding_lm/README.md diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md deleted file mode 100644 index 947160e092..0000000000 --- a/samples/cpp/beam_search_causal_lm/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Text generation C++ sample that supports most popular 
models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample fearures `ov::genai::LLMPipeline` and configures it to use multiple beam grops. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -`beam_search_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. 
- -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md deleted file mode 100644 index bdc1d294ee..0000000000 --- a/samples/cpp/chat_sample/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# C++ chat_sample that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported.
- -```sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run: - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -`chat_sample TinyLlama-1.1B-Chat-v1.0` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. - -#### Missing chat template - -If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model.
-The following template can be used as a default, but it may not work properly with every model: -``` -"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", -``` diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md deleted file mode 100644 index 35ca054fdd..0000000000 --- a/samples/cpp/multinomial_causal_lm/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. 
- -`multinomial_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md deleted file mode 100644 index 2057ff2c6f..0000000000 --- a/samples/cpp/prompt_lookup_decoding_lm/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 3 - -[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences.
This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -source <INSTALL_DIR>/setupvars.sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -`prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM.
For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md deleted file mode 100644 index 7ca26164a6..0000000000 --- a/samples/cpp/speculative_decoding_lm/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 3 - -Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique, that allows to speed up token generation when an additional smaller draft model is used alongside with the main model. - -Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. 
We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. - -This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. 
- -```sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -optimum-cli export openvino --trust-remote-code --model meta-llama/Llama-2-7b-chat-hf Llama-2-7b-chat-hf -``` - -## Run - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -`speculative_decoding_lm TinyLlama-1.1B-Chat-v1.0 Llama-2-7b-chat-hf "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. 
diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md index 6928d03927..d67c90aef9 100644 --- a/samples/cpp/text_generation/README.md +++ b/samples/cpp/text_generation/README.md @@ -1,6 +1,31 @@ -# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 3 -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. +# OpenVINO Generative AI Text Generation Samples + +These samples showcase the use of OpenVINO's inference capabilities for text generation tasks, including different decoding strategies such as beam search, multinomial sampling, and speculative decoding. Each sample has a specific focus and demonstrates a unique aspect of text generation. +There is also a Jupyter notebook that provides an example of LLM-powered text generation in Python. + +## Table of Contents +1. [Setup Instructions](#setup-instructions) +2. [Download and Convert the Model and Tokenizers](#download-and-convert-the-model-and-tokenizers) +3. [Running the Samples](#running-the-samples) +4. [Using encrypted models](#using-encrypted-models) +5. [Sample Descriptions](#sample-descriptions) +6. [Troubleshooting](#troubleshooting) +7. [Support and Contribution](#support-and-contribution) + +## Setup Instructions +1. **Install OpenVINO Toolkit:** Follow the [OpenVINO installation guide](https://docs.openvino.ai/latest/openvino_docs_install_guides.html). +2. 
**Clone the Repository:** + ```bash + git clone https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai/samples/cpp + ``` +3. **Build the Samples:** + ```bash + mkdir build && cd build + cmake .. + make + ``` ## Download and convert the model and tokenizers @@ -13,19 +38,18 @@ pip install --upgrade-strategy eager -r ../../requirements.txt optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` -## Run +## Running the Samples -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. +Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run a specific sample. `greedy_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. -## Using encrypted models +## Using encrypted models LLMPipeline and Tokenizer objects can be initialized directly from the memory buffer, e.g. when user stores only encrypted files and decrypts them on-the-fly. The following code snippet demonstrates how to load the model from the memory buffer: @@ -36,9 +60,72 @@ ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, device); ``` For the sake of brevity the code above does not include Tokenizer decryption. For more details look to encrypted_model_causal_lm sample. -### Troubleshooting +## Sample Descriptions + +### 1. Text Generation (`text_generation`) +- **Description:** Basic text generation using a causal language model. 
+- **Main Feature:** Demonstrates simple text continuation. +- **Run Command:** + ```bash + ./text_generation -m -i "Hello, how are you?" -d CPU + ``` + +### 2. Beam Search Causal LM (`beam_search_causal_lm`) +- **Description:** Uses beam search for more coherent text generation. +- **Main Feature:** Improves text quality with beam search. +- **Run Command:** + ```bash + ./beam_search_causal_lm -m -i "Once upon a time" -d CPU + ``` + +### 3. Chat Sample (`chat_sample`) +- **Description:** Interactive chat interface powered by OpenVINO. +- **Main Feature:** Real-time chat-like text generation. +- **Run Command:** + ```bash + ./chat_sample -m -d CPU + ``` +#### Missing chat template +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. +The following template can be used as a default, but it may not work properly with every model: +``` +"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", +``` + + +### 4. Multinomial Causal LM (`multinomial_causal_lm`) +- **Description:** Text generation with multinomial sampling for diversity. +- **Main Feature:** Introduces randomness for creative outputs. +- **Run Command:** + ```bash + ./multinomial_causal_lm -m -i "Imagine a world" -d CPU + ``` -#### Unicode characters encoding error on Windows +### 5. 
Prompt Lookup Decoding LM (`prompt_lookup_decoding_lm`) +- **Description:** +[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method is highly effective for input-grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. +- **Main Feature:** Specialized prompt-based inference. +- **Run Command:** + ```bash + ./prompt_lookup_decoding_lm -m -i "The future of AI" -d CPU + ``` + +### 6. Speculative Decoding LM (`speculative_decoding_lm`) +- **Description:** +Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that speeds up token generation when an additional smaller draft model is used alongside the main model. + +Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. + +This approach reduces the need for multiple infer requests to the main model, enhancing performance.
For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf +- **Main Feature:** Reduces latency while generating high-quality text. +- **Run Command:** + ```bash + ./speculative_decoding_lm -m -i "Breaking news:" -d CPU + ``` + +## Troubleshooting + +### Unicode characters encoding error on Windows Example error: ``` @@ -48,3 +135,7 @@ UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: 1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. 2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. + +## Support and Contribution +- For troubleshooting, consult the [OpenVINO documentation](https://docs.openvino.ai). +- To report issues or contribute, visit the [GitHub repository](https://github.com/openvinotoolkit/openvino.genai). 
From c1b313a9d70fe14ba828856f6955e7c0bb92fa85 Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Thu, 19 Dec 2024 15:39:12 +0000 Subject: [PATCH 3/9] Apply comments --- samples/cpp/text_generation/CMakeLists.txt | 5 -- samples/cpp/text_generation/README.md | 73 ++++++++++------------ 2 files changed, 32 insertions(+), 46 deletions(-) diff --git a/samples/cpp/text_generation/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt index d262e57e2f..7ccd496804 100644 --- a/samples/cpp/text_generation/CMakeLists.txt +++ b/samples/cpp/text_generation/CMakeLists.txt @@ -8,7 +8,6 @@ find_package(OpenVINOGenAI REQUIRED NO_CMAKE_FIND_ROOT_PATH ) - # greedy_causal_lm add_executable(greedy_causal_lm greedy_causal_lm.cpp) target_link_libraries(greedy_causal_lm PRIVATE openvino::genai) @@ -87,7 +86,6 @@ set_target_properties(multinomial_causal_lm PROPERTIES # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) target_compile_features(multinomial_causal_lm PRIVATE cxx_std_11) - install(TARGETS multinomial_causal_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin @@ -97,7 +95,6 @@ install(TARGETS multinomial_causal_lm # prompt_lookup_decoding_lm add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::genai) - set_target_properties(prompt_lookup_decoding_lm PROPERTIES COMPILE_PDB_NAME prompt_lookup_decoding_lm # Ensure out of box LC_RPATH on macOS with SIP @@ -112,12 +109,10 @@ set_target_properties(prompt_lookup_decoding_lm PROPERTIES # speculative_decoding_lm add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) target_link_libraries(speculative_decoding_lm PRIVATE openvino::genai) - set_target_properties(speculative_decoding_lm PROPERTIES COMPILE_PDB_NAME speculative_decoding_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) - install(TARGETS speculative_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT 
samples_bin diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md index d67c90aef9..0ec44c1b4f 100644 --- a/samples/cpp/text_generation/README.md +++ b/samples/cpp/text_generation/README.md @@ -1,31 +1,17 @@ -# OpenVINO Generative AI Text Generation Samples +# OpenVINO AI Text Generation Samples These samples showcase the use of OpenVINO's inference capabilities for text generation tasks, including different decoding strategies such as beam search, multinomial sampling, and speculative decoding. Each sample has a specific focus and demonstrates a unique aspect of text generation. -There is also a Jupyter notebook that provides an example of LLM-powered text generation in Python. +The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. +There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of LLM-powered text generation in Python. ## Table of Contents -1. [Setup Instructions](#setup-instructions) -2. [Download and Convert the Model and Tokenizers](#download-and-convert-the-model-and-tokenizers) -3. [Running the Samples](#running-the-samples) -4. [Using encrypted models](#using-encrypted-models) -5. [Sample Descriptions](#sample-descriptions) -6. [Troubleshooting](#troubleshooting) -7. [Support and Contribution](#support-and-contribution) - -## Setup Instructions -1. **Install OpenVINO Toolkit:** Follow the [OpenVINO installation guide](https://docs.openvino.ai/latest/openvino_docs_install_guides.html). -2. **Clone the Repository:** - ```bash - git clone https://github.com/openvinotoolkit/openvino.genai.git - cd openvino.genai/samples/cpp - ``` -3. **Build the Samples:** - ```bash - mkdir build && cd build - cmake .. - make - ``` +1. 
[Download and Convert the Model and Tokenizers](#download-and-convert-the-model-and-tokenizers) +2. [Running the Samples](#running-the-samples) +3. [Using encrypted models](#using-encrypted-models) +4. [Sample Descriptions](#sample-descriptions) +5. [Troubleshooting](#troubleshooting) +6. [Support and Contribution](#support-and-contribution) ## Download and convert the model and tokenizers @@ -49,25 +35,14 @@ Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. -## Using encrypted models -LLMPipeline and Tokenizer objects can be initialized directly from the memory buffer, e.g. when user stores only encrypted files and decrypts them on-the-fly. -The following code snippet demonstrates how to load the model from the memory buffer: - -```cpp -auto [model_str, weights_tensor] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin"); -ov::genai::Tokenizer tokenizer(models_path); -ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, device); -``` -For the sake of brevity the code above does not include Tokenizer decryption. For more details look to encrypted_model_causal_lm sample. - ## Sample Descriptions -### 1. Text Generation (`text_generation`) +### 1. Greedy Causal LM (`greedy_causal_lm`) - **Description:** Basic text generation using a causal language model. - **Main Feature:** Demonstrates simple text continuation. - **Run Command:** ```bash - ./text_generation -m -i "Hello, how are you?" -d CPU + ./greedy_causal_lm ``` ### 2. Beam Search Causal LM (`beam_search_causal_lm`) @@ -75,7 +50,7 @@ For the sake of brevity the code above does not include Tokenizer decryption. Fo - **Main Feature:** Improves text quality with beam search. - **Run Command:** ```bash - ./beam_search_causal_lm -m -i "Once upon a time" -d CPU + ./beam_search_causal_lm ``` ### 3. 
Chat Sample (`chat_sample`) @@ -83,7 +58,7 @@ For the sake of brevity the code above does not include Tokenizer decryption. Fo - **Main Feature:** Real-time chat-like text generation. - **Run Command:** ```bash - ./chat_sample -m -d CPU + ./chat_sample ``` #### Missing chat template If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. @@ -98,7 +73,7 @@ The following template can be used as a default, but it may not work properly wi - **Main Feature:** Introduces randomness for creative outputs. - **Run Command:** ```bash - ./multinomial_causal_lm -m -i "Imagine a world" -d CPU + ./multinomial_causal_lm ``` ### 5. Prompt Lookup Decoding LM (`prompt_lookup_decoding_lm`) @@ -107,7 +82,7 @@ The following template can be used as a default, but it may not work properly wi - **Main Feature:** Specialized prompt-based inference. - **Run Command:** ```bash - ./prompt_lookup_decoding_lm -m -i "The future of AI" -d CPU + ./prompt_lookup_decoding_lm ``` ### 6. Speculative Decoding LM (`speculative_decoding_lm`) @@ -120,7 +95,23 @@ This approach reduces the need for multiple infer requests to the main model, en - **Main Feature:** Reduces latency while generating high-quality text. - **Run Command:** ```bash - ./speculative_decoding_lm -m -i "Breaking news:" -d CPU + ./speculative_decoding_lm + ``` + +### 7. Encrypted Model Causal LM (`encrypted_model_causal_lm`) +- **Description:** +LLMPipeline and Tokenizer objects can be initialized directly from the memory buffer, e.g. when user stores only encrypted files and decrypts them on-the-fly. 
+The following code snippet demonstrates how to load the model from the memory buffer: +```cpp +auto [model_str, weights_tensor] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin"); +ov::genai::Tokenizer tokenizer(models_path); +ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, device); +``` +For the sake of brevity the code above does not include Tokenizer decryption. For more details look to encrypted_model_causal_lm sample. +- **Main Feature:** Read model directly from memory buffer +- **Run Command:** + ```bash + ./encrypted_model_causal_lm ``` ## Troubleshooting From e3eda76eab7a56bd87c3b7ef397d557ca78d2555 Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Fri, 3 Jan 2025 14:32:20 +0000 Subject: [PATCH 4/9] Refactor cmake --- samples/cpp/text_generation/CMakeLists.txt | 136 ++++----------------- 1 file changed, 27 insertions(+), 109 deletions(-) diff --git a/samples/cpp/text_generation/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt index 7ccd496804..d4d58c8383 100644 --- a/samples/cpp/text_generation/CMakeLists.txt +++ b/samples/cpp/text_generation/CMakeLists.txt @@ -8,112 +8,30 @@ find_package(OpenVINOGenAI REQUIRED NO_CMAKE_FIND_ROOT_PATH ) -# greedy_causal_lm -add_executable(greedy_causal_lm greedy_causal_lm.cpp) -target_link_libraries(greedy_causal_lm PRIVATE openvino::genai) -set_target_properties(greedy_causal_lm PROPERTIES - COMPILE_PDB_NAME greedy_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(greedy_causal_lm PRIVATE cxx_std_11) -install(TARGETS greedy_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) - - -# encrypted_model_causal_lm -add_executable(encrypted_model_causal_lm encrypted_model_causal_lm.cpp) -target_link_libraries(encrypted_model_causal_lm PRIVATE openvino::genai) -set_target_properties(encrypted_model_causal_lm PROPERTIES - COMPILE_PDB_NAME 
encrypted_model_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(encrypted_model_causal_lm PRIVATE cxx_std_11) -install(TARGETS encrypted_model_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) - - -# beam_search_causal_lm -add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -target_link_libraries(beam_search_causal_lm PRIVATE openvino::genai) -set_target_properties(beam_search_causal_lm PROPERTIES - COMPILE_PDB_NAME beam_search_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(beam_search_causal_lm PRIVATE cxx_std_11) -install(TARGETS beam_search_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) - - -# chat_sample -add_executable(chat_sample chat_sample.cpp) -target_link_libraries(chat_sample PRIVATE openvino::genai) -set_target_properties(chat_sample PROPERTIES - COMPILE_PDB_NAME chat_sample - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(chat_sample PRIVATE cxx_std_11) -install(TARGETS chat_sample - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) - - -# lora_greedy_causal_lm -add_executable(lora_greedy_causal_lm lora_greedy_causal_lm.cpp) -target_link_libraries(lora_greedy_causal_lm PRIVATE openvino::genai) -set_target_properties(lora_greedy_causal_lm PROPERTIES - COMPILE_PDB_NAME lora_greedy_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(lora_greedy_causal_lm PRIVATE cxx_std_11) -install(TARGETS lora_greedy_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) - - -# multinomial_causal_lm -add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) -target_link_libraries(multinomial_causal_lm PRIVATE openvino::genai) 
-set_target_properties(multinomial_causal_lm PROPERTIES - COMPILE_PDB_NAME multinomial_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(multinomial_causal_lm PRIVATE cxx_std_11) -install(TARGETS multinomial_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) - - -# prompt_lookup_decoding_lm -add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) -target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::genai) -set_target_properties(prompt_lookup_decoding_lm PROPERTIES - COMPILE_PDB_NAME prompt_lookup_decoding_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -# Don't install prompt_lookup_decoding_lm because it doesn't use openvino_genai library and is not verified yet. -# install(TARGETS prompt_lookup_decoding_lm -# RUNTIME DESTINATION samples_bin/ -# COMPONENT samples_bin -# EXCLUDE_FROM_ALL) - - -# speculative_decoding_lm -add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) -target_link_libraries(speculative_decoding_lm PRIVATE openvino::genai) -set_target_properties(speculative_decoding_lm PROPERTIES - COMPILE_PDB_NAME speculative_decoding_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -install(TARGETS speculative_decoding_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) +function(add_sample_executable target_name) + add_executable(${target_name} ${target_name}.cpp) + target_link_libraries(${target_name} PRIVATE openvino::genai) + set_target_properties(${target_name} PROPERTIES + COMPILE_PDB_NAME ${target_name} + # Ensure out-of-box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + target_compile_features(${target_name} PRIVATE cxx_std_11) + install(TARGETS ${target_name} + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) +endfunction() + +set (SAMPLE_LIST + 
greedy_causal_lm + encrypted_model_causal_lm + beam_search_causal_lm + chat_sample + lora_greedy_causal_lm + multinomial_causal_lm + prompt_lookup_decoding_lm + speculative_decoding_lm) + +foreach(sample ${SAMPLE_LIST}) + add_sample_executable(${sample}) +endforeach() From 50c4f8f9cd70741480930080e061c3c77c18238e Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Fri, 3 Jan 2025 16:38:50 +0000 Subject: [PATCH 5/9] Apply comments --- samples/CMakeLists.txt | 2 - samples/cpp/benchmark_genai/CMakeLists.txt | 32 -------- samples/cpp/benchmark_genai/README.md | 49 ------------- samples/cpp/text_generation/CMakeLists.txt | 25 +++++++ samples/cpp/text_generation/README.md | 73 +++++++++++++------ .../benchmark_genai.cpp | 0 src/README.md | 2 +- 7 files changed, 75 insertions(+), 108 deletions(-) delete mode 100644 samples/cpp/benchmark_genai/CMakeLists.txt delete mode 100644 samples/cpp/benchmark_genai/README.md rename samples/cpp/{benchmark_genai => text_generation}/benchmark_genai.cpp (100%) diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 619eebf0ef..8cfc99847b 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 # -add_subdirectory(cpp/benchmark_genai) add_subdirectory(cpp/text_generation) add_subdirectory(cpp/image_generation) add_subdirectory(cpp/visual_language_chat) @@ -16,7 +15,6 @@ install(FILES COMPONENT cpp_samples_genai) install(DIRECTORY - cpp/benchmark_genai cpp/text_generation cpp/image_generation cpp/visual_language_chat diff --git a/samples/cpp/benchmark_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt deleted file mode 100644 index 902a05eee6..0000000000 --- a/samples/cpp/benchmark_genai/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. 
- ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. - NO_CMAKE_FIND_ROOT_PATH -) - -include(FetchContent) - -if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) -endif() - -FetchContent_Declare(cxxopts - URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz - URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) -FetchContent_MakeAvailable(cxxopts) - -add_executable(benchmark_genai benchmark_genai.cpp) -target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts) -set_target_properties(benchmark_genai PROPERTIES - COMPILE_PDB_NAME benchmark_genai - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) - -install(TARGETS benchmark_genai - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md deleted file mode 100644 index d7b3f6ac21..0000000000 --- a/samples/cpp/benchmark_genai/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# LLMs benchmarking sample - -This sample script demonstrates how to benchmark an LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Usage - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. 
- -```sh -benchmark_genai [OPTIONS] -``` - -### Options - -- `-m, --model`: Path to the model and tokenizers base directory. -- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. -- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. -- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations. -- `-n, --num_iter` (default: `3`): Number of iterations. -- `-d, --device` (default: `"CPU"`): Device to run the model on. - -### Output: - -``` -benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10 -``` - -``` -Load time: 3405.69 ms -Generate time: 1430.77 ± 3.04 ms -Tokenization time: 0.51 ± 0.02 ms -Detokenization time: 0.37 ± 0.01 ms -TTFT: 81.60 ± 0.54 ms -TPOT: 71.52 ± 2.72 ms -Throughput tokens/s: 13.98 ± 0.53 -``` - -For more information how performance metrics are calculated please follow [performance-metrics tutorial](../../../src/README.md#performance-metrics). diff --git a/samples/cpp/text_generation/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt index d4d58c8383..77ab30112f 100644 --- a/samples/cpp/text_generation/CMakeLists.txt +++ b/samples/cpp/text_generation/CMakeLists.txt @@ -35,3 +35,28 @@ set (SAMPLE_LIST foreach(sample ${SAMPLE_LIST}) add_sample_executable(${sample}) endforeach() + + +# benchmark_genai +include(FetchContent) + +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +FetchContent_Declare(cxxopts + URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz + URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) +FetchContent_MakeAvailable(cxxopts) + +add_executable(benchmark_genai benchmark_genai.cpp) +target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts) +set_target_properties(benchmark_genai PROPERTIES + COMPILE_PDB_NAME benchmark_genai + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS benchmark_genai + RUNTIME DESTINATION samples_bin/ + 
COMPONENT samples_bin + EXCLUDE_FROM_ALL) \ No newline at end of file diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md index 0ec44c1b4f..806ebab280 100644 --- a/samples/cpp/text_generation/README.md +++ b/samples/cpp/text_generation/README.md @@ -1,17 +1,15 @@ -# OpenVINO AI Text Generation Samples +# OpenVINO GenAI Text Generation Samples These samples showcase the use of OpenVINO's inference capabilities for text generation tasks, including different decoding strategies such as beam search, multinomial sampling, and speculative decoding. Each sample has a specific focus and demonstrates a unique aspect of text generation. The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. -There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of LLM-powered text generation in Python. +There are also Jupyter notebooks for some samples. You can find links to them in the appropriate sample descriptions. ## Table of Contents 1. [Download and Convert the Model and Tokenizers](#download-and-convert-the-model-and-tokenizers) -2. [Running the Samples](#running-the-samples) -3. [Using encrypted models](#using-encrypted-models) -4. [Sample Descriptions](#sample-descriptions) -5. [Troubleshooting](#troubleshooting) -6. [Support and Contribution](#support-and-contribution) +2. [Sample Descriptions](#sample-descriptions) +3. [Troubleshooting](#troubleshooting) +4.
[Support and Contribution](#support-and-contribution) ## Download and convert the model and tokenizers @@ -24,41 +22,47 @@ pip install --upgrade-strategy eager -r ../../requirements.txt optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` -## Running the Samples +Model examples to use for different samples: +chat_sample - meta-llama/Llama-2-7b-chat-hf +speculative_decoding_lm - meta-llama/Llama-2-13b-hf as main model and TinyLlama/TinyLlama-1.1B-Chat-v1.0 as draft model +other samples - meta-llama/Llama-2-7b-hf -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run a specific sample. - -`greedy_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` +## Sample Descriptions +### Common information +Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to get common information about OpenVINO samples. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -## Sample Descriptions - ### 1. Greedy Causal LM (`greedy_causal_lm`) -- **Description:** Basic text generation using a causal language model. +- **Description:** +Basic text generation using a causal language model. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. - **Main Feature:** Demonstrates simple text continuation. 
- **Run Command:** ```bash - ./greedy_causal_lm + ./greedy_causal_lm "" ``` ### 2. Beam Search Causal LM (`beam_search_causal_lm`) -- **Description:** Uses beam search for more coherent text generation. +- **Description:** +Uses beam search for more coherent text generation. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. - **Main Feature:** Improves text quality with beam search. - **Run Command:** ```bash - ./beam_search_causal_lm + ./beam_search_causal_lm "" ["" ...] ``` ### 3. Chat Sample (`chat_sample`) -- **Description:** Interactive chat interface powered by OpenVINO. +- **Description:** +Interactive chat interface powered by OpenVINO. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of LLM-powered text generation in Python. - **Main Feature:** Real-time chat-like text generation. - **Run Command:** ```bash - ./chat_sample + ./chat_sample ``` #### Missing chat template If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. @@ -73,7 +77,7 @@ The following template can be used as a default, but it may not work properly wi - **Main Feature:** Introduces randomness for creative outputs. - **Run Command:** ```bash - ./multinomial_causal_lm + ./multinomial_causal_lm "" ``` ### 5. Prompt Lookup Decoding LM (`prompt_lookup_decoding_lm`) @@ -82,7 +86,7 @@ The following template can be used as a default, but it may not work properly wi - **Main Feature:** Specialized prompt-based inference. - **Run Command:** ```bash - ./prompt_lookup_decoding_lm + ./prompt_lookup_decoding_lm "" ``` ### 6. 
Speculative Decoding LM (`speculative_decoding_lm`) @@ -92,10 +96,12 @@ Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assis Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf + +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/speculative-sampling) that provides an example of LLM-powered text generation in Python. - **Main Feature:** Reduces latency while generating high-quality text. - **Run Command:** ```bash - ./speculative_decoding_lm + ./speculative_decoding_lm "" ``` ### 7. Encrypted Model Causal LM (`encrypted_model_causal_lm`) @@ -111,9 +117,28 @@ For the sake of brevity the code above does not include Tokenizer decryption. Fo - **Main Feature:** Read model directly from memory buffer - **Run Command:** ```bash - ./encrypted_model_causal_lm + ./encrypted_model_causal_lm "" ``` +### 8. 
LLMs benchmarking sample (`benchmark_genai`) +- **Description:** +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +For more information on how performance metrics are calculated, please follow the [performance-metrics tutorial](../../../src/README.md#performance-metrics). +- **Main Feature:** Benchmark model via GenAI +- **Run Command:** + ```bash + ./benchmark_genai [OPTIONS] + ``` + #### Options +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximum number of new tokens to generate. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + + ## Troubleshooting ### Unicode characters encoding error on Windows diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/text_generation/benchmark_genai.cpp similarity index 100% rename from samples/cpp/benchmark_genai/benchmark_genai.cpp rename to samples/cpp/text_generation/benchmark_genai.cpp diff --git a/src/README.md b/src/README.md index 6466b431d0..d8c15b1c77 100644 --- a/src/README.md +++ b/src/README.md @@ -394,7 +394,7 @@ durations = np.array(raw_metrics.m_new_token_times[1:]) - np.array(raw_metrics.m print(f'Median from token to token duration: {np.median(durations):.2f} ms') ``` -For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/benchmark_genai/README.md) samples.
+For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/text_generation/README.md) samples. ## How It Works From da1605ad03946740ad687072ec38b45d5996601b Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Wed, 8 Jan 2025 12:46:14 +0000 Subject: [PATCH 6/9] fix --- samples/cpp/text_generation/CMakeLists.txt | 1 - samples/cpp/text_generation/README.md | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/samples/cpp/text_generation/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt index 77ab30112f..f798b4f5fc 100644 --- a/samples/cpp/text_generation/CMakeLists.txt +++ b/samples/cpp/text_generation/CMakeLists.txt @@ -15,7 +15,6 @@ function(add_sample_executable target_name) COMPILE_PDB_NAME ${target_name} # Ensure out-of-box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) - target_compile_features(${target_name} PRIVATE cxx_std_11) install(TARGETS ${target_name} RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md index 806ebab280..ab1d45964e 100644 --- a/samples/cpp/text_generation/README.md +++ b/samples/cpp/text_generation/README.md @@ -1,4 +1,3 @@ - # OpenVINO GenAI Text Generation Samples These samples showcase the use of OpenVINO's inference capabilities for text generation tasks, including different decoding strategies such as beam search, multinomial sampling, and speculative decoding. Each sample has a specific focus and demonstrates a unique aspect of text generation. @@ -15,7 +14,7 @@ There are also Jupyter notebooks for some samples. You can find links to them in The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. 
-It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported. ```sh pip install --upgrade-strategy eager -r ../../requirements.txt From 08f939bad4b4617621af8badd9f948ce9c3a330a Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Wed, 8 Jan 2025 15:33:05 +0000 Subject: [PATCH 7/9] Consolidate python samples --- .github/workflows/causal_lm_cpp.yml | 16 +- samples/CMakeLists.txt | 5 - .../python/beam_search_causal_lm/README.md | 38 ----- samples/python/benchmark_genai/README.md | 50 ------- samples/python/chat_sample/README.md | 46 ------ .../python/multinomial_causal_lm/README.md | 48 ------ .../prompt_lookup_decoding_lm/README.md | 41 ------ .../python/speculative_decoding_lm/README.md | 50 ------- samples/python/text_generation/README.md | 137 ++++++++++++++---- .../beam_search_causal_lm.py | 0 .../benchmark_genai.py | 0 .../chat_sample.py | 0 .../multinomial_causal_lm.py | 0 .../prompt_lookup_decoding_lm.py | 0 .../speculative_decoding_lm.py | 0 src/README.md | 2 +- 16 files changed, 120 insertions(+), 313 deletions(-) delete mode 100644 samples/python/beam_search_causal_lm/README.md delete mode 100644 samples/python/benchmark_genai/README.md delete mode 100644 samples/python/chat_sample/README.md delete mode 100644 samples/python/multinomial_causal_lm/README.md delete mode 100644 samples/python/prompt_lookup_decoding_lm/README.md delete mode 100644 samples/python/speculative_decoding_lm/README.md rename samples/python/{beam_search_causal_lm => text_generation}/beam_search_causal_lm.py (100%) rename samples/python/{benchmark_genai => text_generation}/benchmark_genai.py (100%) rename samples/python/{chat_sample => text_generation}/chat_sample.py (100%) rename samples/python/{multinomial_causal_lm =>
text_generation}/multinomial_causal_lm.py (100%) rename samples/python/{prompt_lookup_decoding_lm => text_generation}/prompt_lookup_decoding_lm.py (100%) rename samples/python/{speculative_decoding_lm => text_generation}/speculative_decoding_lm.py (100%) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 6576b75ff0..d56f51f8ae 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -58,7 +58,7 @@ jobs: PYTHONPATH: "./build" - run: > . ./ov/setupvars.sh - && timeout 35s ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b + && timeout 35s ./samples/python/text_generation/multinomial_causal_lm.py ./open_llama_3b_v2/ b env: PYTHONPATH: "./build" - run: > @@ -79,7 +79,7 @@ jobs: executable: [ ./build/samples/cpp/text_generation/beam_search_causal_lm, - python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py, + python ./samples/python/text_generation/beam_search_causal_lm.py, ] runs-on: ubuntu-20.04 defaults: @@ -339,7 +339,7 @@ jobs: - run: > . ./ov/setupvars.sh && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - + | diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - env: PYTHONPATH: "./build" @@ -374,7 +374,7 @@ jobs: - run: > . ./ov/setupvars.sh && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./phi-2/ 69 - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) - + | diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./phi-2/ 69) - env: PYTHONPATH: "./build" @@ -409,7 +409,7 @@ jobs: - run: > . 
./ov/setupvars.sh && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./notus-7b-v1/ 69 - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) - + | diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./notus-7b-v1/ 69) - env: PYTHONPATH: "./build" @@ -447,7 +447,7 @@ jobs: source ./ov/setupvars.sh ./build/samples/cpp/text_generation/speculative_decoding_lm ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_speculative.txt ./build/samples/cpp/text_generation/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt - python ./samples/python/speculative_decoding_lm/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt + python ./samples/python/text_generation/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -504,7 +504,7 @@ jobs: ./build/samples/cpp/text_generation/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt ./build/samples/cpp/text_generation/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt - python ./samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_py.txt + python ./samples/python/text_generation/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_py.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -693,7 +693,7 @@ jobs: " diff pred.txt ref.txt echo "Chat sample cpp" passed - timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt + timeout 30s ./samples/python/text_generation/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt diff pred2.txt ref.txt echo "Chat sample python" passed 
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 8cfc99847b..d32eb832a6 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -22,13 +22,8 @@ install(DIRECTORY DESTINATION samples/cpp COMPONENT cpp_samples_genai) install(DIRECTORY - python/beam_search_causal_lm - python/benchmark_genai - python/chat_sample python/text_generation python/image_generation - python/multinomial_causal_lm - python/speculative_decoding_lm python/visual_language_chat python/whisper_speech_recognition DESTINATION samples/python COMPONENT cpp_samples_genai diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md deleted file mode 100644 index fac6a26e8e..0000000000 --- a/samples/python/beam_search_causal_lm/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Text generation Python sample that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample fearures `openvino_genai.LLMPipeline` and configures it to use multiple beam grops. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. 
- -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python beam_search_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/benchmark_genai/README.md b/samples/python/benchmark_genai/README.md deleted file mode 100644 index 95f24b6eca..0000000000 --- a/samples/python/benchmark_genai/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# LLMs benchmarking sample - -This sample script demonstrates how to benchmark an LLMs in OpenVINO GenAI. 
The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - - -## Usage - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -```sh -python benchmark_genai.py [OPTIONS] -``` - -### Options - -- `-m, --model`: Path to the model and tokenizers base directory. -- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. -- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. -- `-n, --num_iter` (default: `3`): Number of iterations. -- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations. -- `-d, --device` (default: `"CPU"`): Device to run the model on. - -### Output: - -``` -python benchmark_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 -``` - -``` -Load time: 3405.69 ms -Generate time: 1430.77 ± 3.04 ms -Tokenization time: 0.51 ± 0.02 ms -Detokenization time: 0.37 ± 0.01 ms -TTFT: 81.60 ± 0.54 ms -TPOT: 71.52 ± 2.72 ms -Throughput tokens/s: 13.98 ± 0.53 -``` - -For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics). 
diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md deleted file mode 100644 index 7e3c206431..0000000000 --- a/samples/python/chat_sample/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# Python chat_sample that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `openvino_genai.LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run: - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python chat_sample.py TinyLlama-1.1B-Chat-v1.0` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. 
- -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. - -#### Missing chat template - -If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. 
-The following template can be used as a default, but it may not work properly with every model: -``` -"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", -``` diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md deleted file mode 100644 index c1afc08a8d..0000000000 --- a/samples/python/multinomial_causal_lm/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -This sample also contains example implementation of an iterable streamer with bufferisation. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. 
- -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -## Streaming - -This Python example demonstrates custom detokenization with bufferization. The streamer receives integer tokens corresponding to each word or subword, one by one. If tokens are decoded individually, the resulting text misses necessary spaces because of detokenize(tokenize(" a")) == "a". - -To address this, the detokenizer needs a larger context. We accumulate tokens in a tokens_cache buffer and decode multiple tokens together, adding the text to the streaming queue only when a complete decoded chunk is ready. We run a separate thread to print all new elements arriving in this queue from the generation pipeline. Each generated chunk of text is put into a synchronized queue, ensuring that all put and get operations are thread-safe and blocked until they can proceed. - -At the same time, in order to optimize the performance in streaming mode, we provide the Chuck Streaming. Chunk streaming has significant benefits to very small LLM for streaming generate token rate improvement. It does sampling once after several token generation. 
We can use the tokens_len parameter to control the number of tokens in the token_cache before sampling. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/prompt_lookup_decoding_lm/README.md b/samples/python/prompt_lookup_decoding_lm/README.md deleted file mode 100644 index 1e5f4003d4..0000000000 --- a/samples/python/prompt_lookup_decoding_lm/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# prompt_lookup_decoding_lm Python sample that supports most popular models like LLaMA 3 - -[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. 
- -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -source /setupvars.sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. 
- -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/speculative_decoding_lm/README.md b/samples/python/speculative_decoding_lm/README.md deleted file mode 100644 index 7d2656c0a3..0000000000 --- a/samples/python/speculative_decoding_lm/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# speculative_decoding_lm Python sample that supports most popular models like LLaMA 3 and other - -Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique, that allows to speed up token generation when an additional smaller draft model is used alongside with the main model. - -Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. - -This approach reduces the need for multiple infer requests to the main model, enhancing performance. 
For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Run `optimum-cli` to generate IRs for the samples. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -Download assisting and main model to run speculative decoding sample. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b -optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python speculative_decoding_lm.py ./dolly-v2-7b ./dolly-v2-3b "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. 
- -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - - -> *_NOTE:_* User can run speculative decoding on different devices. Please, specify `device` in `LLMPipeline` constructor to run main model and `device` for `draft_model` in the constructor. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/text_generation/README.md b/samples/python/text_generation/README.md index a634e21cb0..132dfc27f4 100644 --- a/samples/python/text_generation/README.md +++ b/samples/python/text_generation/README.md @@ -1,48 +1,129 @@ -# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 3 +# OpenVINO GenAI Text Generation Python Samples -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `openvino_genai.LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. 
There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. +These samples showcase the use of OpenVINO's inference capabilities for text generation tasks, including different decoding strategies such as beam search, multinomial sampling, and speculative decoding. Each sample has a specific focus and demonstrates a unique aspect of text generation. +The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. +There are also Jupyter notebooks for some samples. You can find links to them in the appropriate sample descriptions. -There are two sample files: - - [`greedy_causal_lm.py`](./greedy_causal_lm.py) demonstrates basic usage of the LLM pipeline - - [`lora.py`](./lora.py) shows how to apply LoRA adapters to the pipeline +## Table of Contents +1. [Download and Convert the Model and Tokenizers](#download-and-convert-the-model-and-tokenizers) +2. [Sample Descriptions](#sample-descriptions) +3. [Troubleshooting](#troubleshooting) +4. [Support and Contribution](#support-and-contribution) ## Download and convert the model and tokenizers The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported. 
```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt +pip install --upgrade-strategy eager -r ../../requirements.txt optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python greedy_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` +Model examples to use for different samples: +chat_sample - meta-llama/Llama-2-7b-chat-hf +speculative_decoding_lm - meta-llama/Llama-2-13b-hf as main model and TinyLlama/TinyLlama-1.1B-Chat-v1.0 as draft model +other samples - meta-llama/Llama-2-7b-hf +## Sample Descriptions +### Common information +Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to get common information about OpenVINO samples. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. -## Run with optional LoRA adapters - -LoRA adapters can be connected to the pipeline and modify generated text. Adapters are supported in Safetensors format and can be downloaded from public sources like [Civitai](https://civitai.com) or [HuggingFace](https://huggingface.co/models) or trained by the user. Adapters compatible with a base model should be used only. A weighted blend of multiple adapters can be applied by specifying multiple adapter files with corresponding alpha parameters in command line. 
Check `lora.py` source code to learn how to enable adapters and specify them in each `generate` call. - -Here is an example how to run the sample with a single adapter. First download adapter file from TODO page manually and save it as TODO. Or download it from command line: - -#TODO command to download adapter - -Then run `lora.py`: - -#TODO command to run lora.py with adapter - -### Troubleshooting +### 1. Greedy Causal LM (`greedy_causal_lm`) +- **Description:** +Basic text generation using a causal language model. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +- **Main Feature:** Demonstrates simple text continuation. +- **Run Command:** + ```bash + python greedy_causal_lm.py [-h] model_dir prompt + ``` + +### 2. Beam Search Causal LM (`beam_search_causal_lm`) +- **Description:** +Uses beam search for more coherent text generation. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +- **Main Feature:** Improves text quality with beam search. +- **Run Command:** + ```bash + python beam_search_causal_lm.py model_dir prompts [prompts ...] + ``` + +### 3. Chat Sample (`chat_sample`) +- **Description:** +Interactive chat interface powered by OpenVINO. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of LLM-powered text generation in Python. +- **Main Feature:** Real-time chat-like text generation. +- **Run Command:** + ```bash + python chat_sample.py model_dir + ``` +#### Missing chat template +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. 
To work around this, manually add the chat template to tokenizer_config.json of your model. +The following template can be used as a default, but it may not work properly with every model: +``` +"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", +``` -#### Unicode characters encoding error on Windows +### 4. Multinomial Causal LM (`multinomial_causal_lm`) +- **Description:** Text generation with multinomial sampling for diversity. +- **Main Feature:** Introduces randomness for creative outputs. +- **Run Command:** + ```bash + python multinomial_causal_lm.py model_dir prompt + ``` + +### 5. Prompt Lookup Decoding LM (`prompt_lookup_decoding_lm`) +- **Description:** +[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. +- **Main Feature:** Specialized prompt-based inference. +- **Run Command:** + ```bash + python prompt_lookup_decoding_lm.py model_dir prompt + ``` + +### 6. 
Speculative Decoding LM (`speculative_decoding_lm`) +- **Description:** +Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that speeds up token generation by using an additional, smaller draft model alongside the main model. + +Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. + +This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf + +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/speculative-sampling) that provides an example of LLM-powered text generation in Python. +- **Main Feature:** Reduces latency while generating high-quality text. +- **Run Command:** + ```bash + python speculative_decoding_lm.py model_dir draft_model_dir prompt + ``` + +### 7. LLMs benchmarking sample (`benchmark_genai`) +- **Description:** +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. 
The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +For more information on how performance metrics are calculated, please follow the [performance-metrics tutorial](../../../src/README.md#performance-metrics). +- **Main Feature:** Benchmark model via GenAI +- **Run Command:** + ```bash + python benchmark_genai.py [-m MODEL] [-p PROMPT] [-nw NUM_WARMUP] [-n NUM_ITER] [-mt MAX_NEW_TOKENS] [-d DEVICE] + ``` + #### Options +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximum number of new tokens to generate. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + + +## Troubleshooting + +### Unicode characters encoding error on Windows Example error: ``` @@ -52,3 +133,7 @@ UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: 1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. 2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. + +## Support and Contribution +- For troubleshooting, consult the [OpenVINO documentation](https://docs.openvino.ai). +- To report issues or contribute, visit the [GitHub repository](https://github.com/openvinotoolkit/openvino.genai). 
diff --git a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py b/samples/python/text_generation/beam_search_causal_lm.py similarity index 100% rename from samples/python/beam_search_causal_lm/beam_search_causal_lm.py rename to samples/python/text_generation/beam_search_causal_lm.py diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/text_generation/benchmark_genai.py similarity index 100% rename from samples/python/benchmark_genai/benchmark_genai.py rename to samples/python/text_generation/benchmark_genai.py diff --git a/samples/python/chat_sample/chat_sample.py b/samples/python/text_generation/chat_sample.py similarity index 100% rename from samples/python/chat_sample/chat_sample.py rename to samples/python/text_generation/chat_sample.py diff --git a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py b/samples/python/text_generation/multinomial_causal_lm.py similarity index 100% rename from samples/python/multinomial_causal_lm/multinomial_causal_lm.py rename to samples/python/text_generation/multinomial_causal_lm.py diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/text_generation/prompt_lookup_decoding_lm.py similarity index 100% rename from samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py rename to samples/python/text_generation/prompt_lookup_decoding_lm.py diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/text_generation/speculative_decoding_lm.py similarity index 100% rename from samples/python/speculative_decoding_lm/speculative_decoding_lm.py rename to samples/python/text_generation/speculative_decoding_lm.py diff --git a/src/README.md b/src/README.md index d8c15b1c77..028277d4db 100644 --- a/src/README.md +++ b/src/README.md @@ -394,7 +394,7 @@ durations = np.array(raw_metrics.m_new_token_times[1:]) - np.array(raw_metrics.m print(f'Median from token to token duration: 
{np.median(durations):.2f} ms') ``` -For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/text_generation/README.md) samples. +For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/text_generation/README.md) and C++ [benchmark_genai](../samples/cpp/text_generation/README.md) samples. ## How It Works From 1457292cdf658256dbf127d40f4e251164b79d5a Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Wed, 8 Jan 2025 16:49:25 +0000 Subject: [PATCH 8/9] fix --- .github/workflows/linux.yml | 2 +- .github/workflows/mac.yml | 2 +- .github/workflows/windows.yml | 2 +- src/README.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 0a991e2a54..5fc5568853 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -374,7 +374,7 @@ jobs: - name: Test multinomial_causal_lm.py if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only timeout-minutes: 1 - run: ${{ env.INSTALL_DIR }}/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 + run: ${{ env.INSTALL_DIR }}/samples/python/text_generation/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 working-directory: ${{ env.MODELS_DIR }} - name: Test whisper_speech_recognition.py diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 7cb0ff98d3..f377d3e6a5 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -395,7 +395,7 @@ jobs: if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only run: | source ${OV_INSTALL_DIR}/setupvars.sh - ${OV_INSTALL_DIR}/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 + ${OV_INSTALL_DIR}/samples/python/text_generation/multinomial_causal_lm.py 
./TinyLlama-1.1B-Chat-v1.0/ 0 timeout-minutes: 1 - name: Test python samples (whisper_speech_recognition) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e65972110b..ea07316942 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -470,7 +470,7 @@ jobs: if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - python ${{ env.OV_INSTALL_DIR }}\samples\python\multinomial_causal_lm\multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 0 + python ${{ env.OV_INSTALL_DIR }}\samples\python\text_generation\multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 0 - name: Test python samples (whisper_speech_recognition) if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only diff --git a/src/README.md b/src/README.md index 028277d4db..5d18d0b67b 100644 --- a/src/README.md +++ b/src/README.md @@ -231,7 +231,7 @@ custom_streamer = CustomStreamer() pipe.generate("The Sun is yellow because", max_new_tokens=15, streamer=custom_streamer) ``` -For fully implemented iterable CustomStreamer please refer to [multinomial_causal_lm](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/python/multinomial_causal_lm/README.md) sample. +For fully implemented iterable CustomStreamer please refer to [multinomial_causal_lm](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/python/text_generation/README.md) sample. 
Continuous batching with LLMPipeline: From d8fe11b73882088c3faea8527638ea217c9fe113 Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Fri, 10 Jan 2025 14:40:20 +0000 Subject: [PATCH 9/9] Apply comments --- samples/cpp/text_generation/README.md | 15 ++++++++------- samples/python/text_generation/README.md | 16 +++++++++------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md index ab1d45964e..d9e5bd8d22 100644 --- a/samples/cpp/text_generation/README.md +++ b/samples/cpp/text_generation/README.md @@ -18,14 +18,9 @@ It's not required to install [../../export-requirements.txt](../../export-requir ```sh pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +optimum-cli export openvino --model <model> <output_folder> ``` -Model examples to use for different samples: -chat_sample - meta-llama/Llama-2-7b-chat-hf -speculative_decoding_lm - meta-llama/Llama-2-13b-hf as main model and TinyLlama/TinyLlama-1.1B-Chat-v1.0 as draft model -other samples - meta-llama/Llama-2-7b-hf - ## Sample Descriptions ### Common information Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to get common information about OpenVINO samples. @@ -38,6 +33,7 @@ See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md# - **Description:** Basic text generation using a causal language model. Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-hf, etc - **Main Feature:** Demonstrates simple text continuation. 
- **Run Command:** ```bash @@ -48,6 +44,7 @@ Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_noteboo - **Description:** Uses beam search for more coherent text generation. Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-hf, etc - **Main Feature:** Improves text quality with beam search. - **Run Command:** ```bash @@ -58,6 +55,7 @@ Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_noteboo - **Description:** Interactive chat interface powered by OpenVINO. Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat-v1.0, etc - **Main Feature:** Real-time chat-like text generation. - **Run Command:** ```bash @@ -70,9 +68,9 @@ The following template can be used as a default, but it may not work properly wi "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", ``` - ### 4. Multinomial Causal LM (`multinomial_causal_lm`) - **Description:** Text generation with multinomial sampling for diversity. +Recommended models: meta-llama/Llama-2-7b-hf, etc - **Main Feature:** Introduces randomness for creative outputs. - **Run Command:** ```bash @@ -82,6 +80,7 @@ The following template can be used as a default, but it may not work properly wi ### 5. 
Prompt Lookup Decoding LM (`prompt_lookup_decoding_lm`) - **Description:** [Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. +Recommended models: meta-llama/Llama-2-7b-hf, etc - **Main Feature:** Specialized prompt-based inference. - **Run Command:** ```bash @@ -97,6 +96,8 @@ Speculative decoding works the following way. The draft model predicts the next This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/speculative-sampling) that provides an example of LLM-powered text generation in Python. 
+ +Recommended models: meta-llama/Llama-2-13b-hf as main model and TinyLlama/TinyLlama-1.1B-Chat-v1.0 as draft model, etc - **Main Feature:** Reduces latency while generating high-quality text. - **Run Command:** ```bash diff --git a/samples/python/text_generation/README.md b/samples/python/text_generation/README.md index 132dfc27f4..9940904cfb 100644 --- a/samples/python/text_generation/README.md +++ b/samples/python/text_generation/README.md @@ -18,14 +18,9 @@ It's not required to install [../../export-requirements.txt](../../export-requir ```sh pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +optimum-cli export openvino --model <model> <output_folder> ``` -Model examples to use for different samples: -chat_sample - meta-llama/Llama-2-7b-chat-hf -speculative_decoding_lm - meta-llama/Llama-2-13b-hf as main model and TinyLlama/TinyLlama-1.1B-Chat-v1.0 as draft model -other samples - meta-llama/Llama-2-7b-hf - ## Sample Descriptions ### Common information Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to get common information about OpenVINO samples. @@ -38,6 +33,7 @@ See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md# - **Description:** Basic text generation using a causal language model. Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-hf, etc - **Main Feature:** Demonstrates simple text continuation. - **Run Command:** ```bash @@ -48,16 +44,18 @@ Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_noteboo - **Description:** Uses beam search for more coherent text generation. 
Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-hf, etc - **Main Feature:** Improves text quality with beam search. - **Run Command:** ```bash - python beam_search_causal_lm.py model_dir prompts [prompts ...] + python beam_search_causal_lm.py model_dir prompt [prompts ...] ``` ### 3. Chat Sample (`chat_sample`) - **Description:** Interactive chat interface powered by OpenVINO. Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat-v1.0, etc - **Main Feature:** Real-time chat-like text generation. - **Run Command:** ```bash @@ -72,6 +70,7 @@ The following template can be used as a default, but it may not work properly wi ### 4. Multinomial Causal LM (`multinomial_causal_lm`) - **Description:** Text generation with multinomial sampling for diversity. +Recommended models: meta-llama/Llama-2-7b-hf, etc - **Main Feature:** Introduces randomness for creative outputs. - **Run Command:** ```bash @@ -81,6 +80,7 @@ The following template can be used as a default, but it may not work properly wi ### 5. Prompt Lookup Decoding LM (`prompt_lookup_decoding_lm`) - **Description:** [Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. 
This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. +Recommended models: meta-llama/Llama-2-7b-hf, etc - **Main Feature:** Specialized prompt-based inference. - **Run Command:** ```bash @@ -96,6 +96,8 @@ Speculative decoding works the following way. The draft model predicts the next This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/speculative-sampling) that provides an example of LLM-powered text generation in Python. + +Recommended models: meta-llama/Llama-2-13b-hf as main model and TinyLlama/TinyLlama-1.1B-Chat-v1.0 as draft model, etc - **Main Feature:** Reduces latency while generating high-quality text. - **Run Command:** ```bash