From 9f6257676d85042ae20e1bc6a153308d7417ba0e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 12 Dec 2024 12:44:21 +0400
Subject: [PATCH 001/110] Update OpenVINO tokenizers (#1369)

To catch up
https://github.com/openvinotoolkit/openvino_tokenizers/pull/350
---
 thirdparty/openvino_tokenizers | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers
index 78946fa8c3..1da0d2c705 160000
--- a/thirdparty/openvino_tokenizers
+++ b/thirdparty/openvino_tokenizers
@@ -1 +1 @@
-Subproject commit 78946fa8c385fdc26d978019ecbcb1a55b39eb18
+Subproject commit 1da0d2c705016ad3f04c160ac9338f06505a07c1

From 63e26211486b4e97ebca7d8d7939945cef560811 Mon Sep 17 00:00:00 2001
From: Irina Efode <irina.efode@intel.com>
Date: Thu, 12 Dec 2024 13:15:41 +0400
Subject: [PATCH 002/110] LLM Bench fix (#1359)

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .../speculative_decoding_lm.cpp                    |  2 +-
 src/cpp/src/sampler.cpp                            | 14 +++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
index b34157a07f..dc6761879c 100644
--- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
+++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
@@ -16,7 +16,7 @@ int main(int argc, char* argv[]) try {
     // add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration
     config.num_assistant_tokens = 5;
     // add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold`
-    // config.assistant_confidence_threshold = 0.4
+    // config.assistant_confidence_threshold = 0.4;
 
     std::string main_model_path = argv[1];
     std::string draft_model_path = argv[2];
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
index 3febadf112..f77463d767 100644
--- a/src/cpp/src/sampler.cpp
+++ b/src/cpp/src/sampler.cpp
@@ -599,7 +599,8 @@ void register_new_token(const Token& sampled_token,
         running_sequence->append_token(sampled_token.m_index, sampled_token.m_log_prob);
     }
     if (!is_validation_mode_enabled &&
-        std::fabs(std::exp(sampled_token.m_log_prob)) < logit_processor.get_assistant_confidence_threshold()) {
+        logit_processor.get_assistant_confidence_threshold() > 0 &&
+        (std::fabs(std::exp(sampled_token.m_log_prob)) < logit_processor.get_assistant_confidence_threshold() || sampled_token.m_log_prob == 0)) {
         auto sequence_group = running_sequence->get_sequence_group_ptr();
         sequence_group->pause_generation(true);
     }
@@ -764,13 +765,17 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
             m_logit_processors.insert({request_id, LogitProcessor(sampling_params, sequence_group->get_prompt_ids())});
         }
         auto& logit_processor = m_logit_processors.at(request_id);
-
         const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens;
         ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data);
-        size_t max_removed_tokens_per_request = 0, min_generated_len = std::numeric_limits<size_t>::max();
+        size_t max_removed_tokens_per_request = 0, min_generated_len = std::numeric_limits<size_t>::max(), updated_validation_len = 0;
         if (sequence_group->requires_sampling()) {
             // get number of token to be validated
             auto num_tokens_to_process = sequence_group->get_num_tokens_to_validate();
+            if (num_tokens_to_process > actual_seq_len - 1) {
+                auto delta = num_tokens_to_process - (actual_seq_len - 1);
+                updated_validation_len = std::max(updated_validation_len, delta);
+                num_tokens_to_process -= delta;
+            }
             if (sampling_params.is_greedy_decoding() || sampling_params.is_multinomial()) {
                 std::vector<Sequence::Ptr> running_sequences = sequence_group->get_running_sequences();
                 if (sampling_params.is_greedy_decoding()) {
@@ -897,6 +902,9 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
             sequence_group->update_processed_tokens_num(min_processed_tokens);
             logit_processor.update_generated_len(min_processed_tokens);
         }
+        if (updated_validation_len) {
+            sequence_group->set_num_validated_tokens(updated_validation_len);
+        }
 
         // accumulate a number of processed tokens
         currently_processed_tokens += padded_amount_of_processed_tokens * num_running_sequences;

From b955ea6e778c51394c18e3917c52725554d71c04 Mon Sep 17 00:00:00 2001
From: Mikhail Ryzhov <mikhail.ryzhov@intel.com>
Date: Thu, 12 Dec 2024 13:46:16 +0100
Subject: [PATCH 003/110] [GHA] Minimize memory consumption during tests
 (#1371)

This is a workaround to minimize memory consumption during tests and
allow the use of less powerful CI runners

---------

Co-authored-by: Alexander Suvorov <alexander.suvorov@intel.com>
---
 .github/workflows/linux.yml                     | 4 +---
 tests/python_tests/test_whisper_generate_api.py | 9 +++++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 0d1dc9f948..96848e947c 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -271,14 +271,12 @@ jobs:
         test:
           - name: 'Whisper'
             cmd: 'tests/python_tests/test_whisper_generate_api.py'
-            runner: aks-linux-8-cores-32gb
           - name: 'LLM & VLM'
             cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py -k "not Qwen2-0.5B-Instruct"' # Skip failed tests Qwen2-0.5B-Instruct
-            runner: aks-linux-4-cores-16gb
     defaults:
       run:
         shell: bash
-    runs-on: ${{ matrix.test.runner }}
+    runs-on: aks-linux-4-cores-16gb
     container:
       image: openvinogithubactions.azurecr.io/ov_test/ubuntu_22_04_x64:${{ needs.openvino_download.outputs.docker_tag }}
       volumes: 
diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py
index bcbe2890bd..5a68dd98b6 100644
--- a/tests/python_tests/test_whisper_generate_api.py
+++ b/tests/python_tests/test_whisper_generate_api.py
@@ -10,11 +10,20 @@
 import datasets
 from transformers import WhisperProcessor, pipeline, AutoTokenizer
 from optimum.intel.openvino import OVModelForSpeechSeq2Seq
+import gc
 import json
 import time
 import typing
 import numpy as np
 
+@pytest.fixture(scope="class", autouse=True)
+def run_gc_after_test():
+    """
+    Fixture to run garbage collection after each test class.
+    This is a workaround to minimize memory consumption during tests and allow the use of less powerful CI runners.
+    """
+    yield
+    gc.collect()
 
 @functools.lru_cache(1)
 def read_whisper_model(params, **tokenizer_kwargs):

From d17f7168f278ef98acfdc7ba1ac93e4c759a6402 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Fri, 13 Dec 2024 08:03:11 +0400
Subject: [PATCH 004/110] [Image generation] Added num_steps to callback
 (#1372)

With image to image and inpainting, an user passed `num_inference_steps`
is scaled based on `strength` parameter.
So, we need to report actual number of steps within `callback`

CC @RyanMetcalfeInt8
---
 samples/cpp/image_generation/README.md        |  6 ++---
 samples/python/image_generation/README.md     |  6 ++---
 .../image_generation/generation_config.hpp    |  4 ++--
 .../src/image_generation/flux_pipeline.hpp    | 22 ++++++-------------
 .../stable_diffusion_3_pipeline.hpp           | 21 ++++++++----------
 .../stable_diffusion_pipeline.hpp             | 13 +++++------
 src/python/py_utils.cpp                       |  2 +-
 tools/llm_bench/llm_bench_utils/ov_utils.py   |  2 +-
 8 files changed, 31 insertions(+), 45 deletions(-)

diff --git a/samples/cpp/image_generation/README.md b/samples/cpp/image_generation/README.md
index 795bea8999..8a5cc5aa19 100644
--- a/samples/cpp/image_generation/README.md
+++ b/samples/cpp/image_generation/README.md
@@ -52,9 +52,9 @@ Please find the template of the callback usage below.
 ```cpp
 ov::genai::Text2ImagePipeline pipe(models_path, device);
 
-auto callback = [&](size_t step, ov::Tensor& intermediate_res) -> bool {
-   std::cout << "Image generation step: " << step << std::endl;
-   ov::Tensor img = pipe.decode(intermediate_res); // get intermediate image tensor
+auto callback = [&](size_t step, size_t num_steps, ov::Tensor& latent) -> bool {
+   std::cout << "Image generation step: " << step << " / " << num_steps << std::endl;
+   ov::Tensor img = pipe.decode(latent); // get intermediate image tensor
    if (your_condition) // return true if you want to interrupt image generation
       return true;
    return false;
diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md
index 4abe45b2b4..321f3f6d05 100644
--- a/samples/python/image_generation/README.md
+++ b/samples/python/image_generation/README.md
@@ -52,9 +52,9 @@ Please find the template of the callback usage below.
 ```python
 pipe = openvino_genai.Text2ImagePipeline(model_dir, device)
 
-def callback(step, intermediate_res):
-   print("Image generation step: ", step)
-   image_tensor = pipe.decode(intermediate_res) # get intermediate image tensor
+def callback(step, num_steps, latent):
+   print(f"Image generation step: {step} / {num_steps}")
+   image_tensor = pipe.decode(latent) # get intermediate image tensor
    if your_condition: # return True if you want to interrupt image generation
       return True
    return False
diff --git a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp
index 0b749ecd83..50e576466d 100644
--- a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp
@@ -216,11 +216,11 @@ static constexpr ov::Property<int> max_sequence_length{"max_sequence_length"};
 
 /**
  * User callback for image generation pipelines, which is called within a pipeline with the following arguments:
- * - Total number of inference steps. Note, that in case of 'strength' parameter, the number of inference steps is reduced linearly
  * - Current inference step
+ * - Total number of inference steps. Note, that in case of 'strength' parameter, the number of inference steps is reduced linearly
  * - Tensor representing current latent. Such latent can be converted to human-readable representation via image generation pipeline 'decode()' method
  */
-static constexpr ov::Property<std::function<bool(size_t, ov::Tensor&)>> callback{"callback"};
+static constexpr ov::Property<std::function<bool(size_t, size_t, ov::Tensor&)>> callback{"callback"};
 
 /**
  * Function to pass 'ImageGenerationConfig' as property to 'generate()' call.
diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp
index 4cdac5bb1a..ac82bd0cab 100644
--- a/src/cpp/src/image_generation/flux_pipeline.hpp
+++ b/src/cpp/src/image_generation/flux_pipeline.hpp
@@ -326,9 +326,11 @@ class FluxPipeline : public DiffusionPipeline {
             m_custom_generation_config.strength = 1.0f;
         }
 
-        if (!initial_image) {
-            // in case of typical text to image generation, we need to ignore 'strength'
-            m_custom_generation_config.strength = 1.0f;
+        // Use callback if defined
+        std::function<bool(size_t, size_t, ov::Tensor&)> callback = nullptr;
+        auto callback_iter = properties.find(ov::genai::callback.name());
+        if (callback_iter != properties.end()) {
+            callback = callback_iter->second.as<std::function<bool(size_t, size_t, ov::Tensor&)>>();
         }
 
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
@@ -355,14 +357,6 @@ class FluxPipeline : public DiffusionPipeline {
         m_scheduler->set_timesteps_with_sigma(sigmas, mu);
         std::vector<float> timesteps = m_scheduler->get_float_timesteps();
 
-        // Use callback if defined
-        std::function<bool(size_t, ov::Tensor&)> callback;
-        auto callback_iter = properties.find(ov::genai::callback.name());
-        bool do_callback = callback_iter != properties.end();
-        if (do_callback) {
-            callback = callback_iter->second.as<std::function<bool(size_t, ov::Tensor&)>>();
-        }
-
         // 6. Denoising loop
         ov::Tensor timestep(ov::element::f32, {1});
         float* timestep_data = timestep.data<float>();
@@ -375,10 +369,8 @@ class FluxPipeline : public DiffusionPipeline {
             auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator);
             latents = scheduler_step_result["latent"];
 
-            if (do_callback) {
-                if (callback(inference_step, latents)) {
-                    return ov::Tensor(ov::element::u8, {});
-                }
+            if (callback && callback(inference_step, timesteps.size(), latents)) {
+                return ov::Tensor(ov::element::u8, {});
             }
         }
 
diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
index 4e9a70ec2d..3cdaa409d1 100644
--- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
@@ -431,6 +431,13 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
             generation_config.strength = 1.0f;
         }
 
+        // Use callback if defined
+        std::function<bool(size_t, size_t, ov::Tensor&)> callback = nullptr;
+        auto callback_iter = properties.find(ov::genai::callback.name());
+        if (callback_iter != properties.end()) {
+            callback = callback_iter->second.as<std::function<bool(size_t, size_t, ov::Tensor&)>>();
+        }
+
         const auto& transformer_config = m_transformer->get_config();
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
         const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale)
@@ -467,14 +474,6 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         // 6. Denoising loop
         ov::Tensor noisy_residual_tensor(ov::element::f32, {});
 
-        // Use callback if defined
-        std::function<bool(size_t, ov::Tensor&)> callback;
-        auto callback_iter = properties.find(ov::genai::callback.name());
-        bool do_callback = callback_iter != properties.end();
-        if (do_callback) {
-            callback = callback_iter->second.as<std::function<bool(size_t, ov::Tensor&)>>();
-        }
-
         for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) {
             // concat the same latent twice along a batch dimension in case of CFG
             if (batch_size_multiplier > 1) {
@@ -510,10 +509,8 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
             auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step, generation_config.generator);
             latent = scheduler_step_result["latent"];
 
-            if (do_callback) {
-                if (callback(inference_step, latent)) {
-                    return ov::Tensor(ov::element::u8, {});
-                }
+            if (callback && callback(inference_step, timesteps.size(), latent)) {
+                return ov::Tensor(ov::element::u8, {});
             }
         }
 
diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
index 9dbdbac088..c53c9b7d25 100644
--- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
@@ -306,11 +306,10 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         }
 
         // use callback if defined
-        std::function<bool(size_t, ov::Tensor&)> callback;
+        std::function<bool(size_t, size_t, ov::Tensor&)> callback = nullptr;
         auto callback_iter = properties.find(ov::genai::callback.name());
-        bool do_callback = callback_iter != properties.end();
-        if (do_callback) {
-            callback = callback_iter->second.as<std::function<bool(size_t, ov::Tensor&)>>();
+        if (callback_iter != properties.end()) {
+            callback = callback_iter->second.as<std::function<bool(size_t, size_t, ov::Tensor&)>>();
         }
 
         // Stable Diffusion pipeline
@@ -400,10 +399,8 @@ class StableDiffusionPipeline : public DiffusionPipeline {
             const auto it = scheduler_step_result.find("denoised");
             denoised = it != scheduler_step_result.end() ? it->second : latent;
 
-            if (do_callback) {
-                if (callback(inference_step, denoised)) {
-                    return ov::Tensor(ov::element::u8, {});
-                }
+            if (callback && callback(inference_step, timesteps.size(), denoised)) {
+                return ov::Tensor(ov::element::u8, {});
             }
         }
 
diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp
index 9d33318f0a..45a0c46174 100644
--- a/src/python/py_utils.cpp
+++ b/src/python/py_utils.cpp
@@ -280,7 +280,7 @@ ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) {
     } else if (py::isinstance<ov::genai::Generator>(py_obj)) {
         return py::cast<std::shared_ptr<ov::genai::Generator>>(py_obj);
     } else if (py::isinstance<py::function>(py_obj) && property_name == "callback") {
-        return py::cast<std::function<bool(size_t, ov::Tensor&)>>(py_obj);
+        return py::cast<std::function<bool(size_t, size_t, ov::Tensor&)>>(py_obj);
     } else if ((py::isinstance<py::function>(py_obj) || py::isinstance<ov::genai::StreamerBase>(py_obj) || py::isinstance<std::monostate>(py_obj)) && property_name == "streamer") {
         auto streamer = py::cast<ov::genai::pybind::utils::PyBindStreamerVariant>(py_obj);
         return ov::genai::streamer(pystreamer_to_streamer(streamer)).second;
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index c5fa422824..8a28fbe355 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -366,7 +366,7 @@ def __init__(self) -> types.NoneType:
             self.start_time = time.perf_counter()
             self.duration = -1
 
-        def __call__(self, step, latents):
+        def __call__(self, step, num_steps, latents):
             self.iteration_time.append(time.perf_counter() - self.start_time)
             self.start_time = time.perf_counter()
             return False

From d189eb7541a61a41581dd21361db3aa3884d211b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Fri, 13 Dec 2024 15:46:38 +0400
Subject: [PATCH 005/110] GHA: use preconverted LCM model (#1380)

---
 .github/workflows/lcm_dreamshaper_cpp.yml | 24 +++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml
index b3a36761e1..258184e9e4 100644
--- a/.github/workflows/lcm_dreamshaper_cpp.yml
+++ b/.github/workflows/lcm_dreamshaper_cpp.yml
@@ -62,35 +62,35 @@ jobs:
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install -r ./samples/requirements.txt
 
-      - name: Download and convert models and tokenizer
+      - name: Download models
         run: |
           source openvino_lcm_cpp/bin/activate
-          optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --task stable-diffusion --weight-format fp16 models/lcm_dreamshaper_v7/FP16
+          huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir models/lcm_dreamshaper_v7
           wget -O ./image.png https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png
           wget -O ./mask_image.png https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png
 
       - name: Run heterogeneous_stable_diffusion
         run: |
           source ${{ env.OV_INSTALL_DIR }}/setupvars.sh
-          ${{ env.build_dir }}/samples/cpp/image_generation/heterogeneous_stable_diffusion ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York  with tall buildings at dusk golden hour cinematic lighting"
+          ${{ env.build_dir }}/samples/cpp/image_generation/heterogeneous_stable_diffusion ./models/lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York  with tall buildings at dusk golden hour cinematic lighting"
 
       - name: Run heterogeneous_stable_diffusion.py
         run: |
           source openvino_lcm_cpp/bin/activate
           source ./ov/setupvars.sh
-          python ./samples/python/image_generation/heterogeneous_stable_diffusion.py ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"
+          python ./samples/python/image_generation/heterogeneous_stable_diffusion.py ./models/lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"
         env:
           PYTHONPATH: ${{ env.build_dir }}
 
       - name: Run image2image
         run: |
           source ./ov/setupvars.sh
-          ${{ env.build_dir }}/samples/cpp/image_generation/image2image ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" ./image.png
+          ${{ env.build_dir }}/samples/cpp/image_generation/image2image ./models/lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" ./image.png
 
       - name: Run inpainting
         run: |
           source ./ov/setupvars.sh
-          ${{ env.build_dir }}/samples/cpp/image_generation/inpainting ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" ./image.png ./mask_image.png
+          ${{ env.build_dir }}/samples/cpp/image_generation/inpainting ./models/lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" ./image.png ./mask_image.png
 
   lcm_dreamshaper_v7_cpp-windows:
     runs-on: windows-2019
@@ -134,24 +134,24 @@ jobs:
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install -r ./samples/requirements.txt
 
-      - name: Download and convert models and tokenizer
+      - name: Download models
         run: |
           . "./openvino_lcm_cpp/Scripts/Activate.ps1"
-          optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --task stable-diffusion --weight-format fp16 models/lcm_dreamshaper_v7/FP16
+          huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir models/lcm_dreamshaper_v7
           Invoke-WebRequest -Uri 'https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png' -OutFile 'image.png'
           Invoke-WebRequest -Uri 'https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png' -OutFile 'mask_image.png'
 
       - name: Run heterogeneous_stable_diffusion
         run: >
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
-          & "${{ env.build_dir }}/samples/cpp/image_generation/Release/heterogeneous_stable_diffusion.exe ./models/lcm_dreamshaper_v7/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'"
+          & "${{ env.build_dir }}/samples/cpp/image_generation/Release/heterogeneous_stable_diffusion.exe ./models/lcm_dreamshaper_v7 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'"
 
       - name: Run heterogeneous_stable_diffusion.py
         run: |
           . "./openvino_lcm_cpp/Scripts/Activate.ps1"
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           $env:Path += "${{ env.build_dir }}\openvino_genai"
-          python .\samples\python\image_generation\heterogeneous_stable_diffusion.py .\models\lcm_dreamshaper_v7\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"
+          python .\samples\python\image_generation\heterogeneous_stable_diffusion.py .\models\lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"
         env:
           PYTHONPATH: ${{ env.build_dir }}
 
@@ -160,7 +160,7 @@ jobs:
           . "./openvino_lcm_cpp/Scripts/Activate.ps1"
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           $env:Path += "${{ env.build_dir }}\openvino_genai"
-          python .\samples\python\image_generation\image2image.py .\models\lcm_dreamshaper_v7\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" .\image.png
+          python .\samples\python\image_generation\image2image.py .\models\lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" .\image.png
         env:
           PYTHONPATH: ${{ env.build_dir }}
 
@@ -169,7 +169,7 @@ jobs:
           . "./openvino_lcm_cpp/Scripts/Activate.ps1"
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           $env:Path += "${{ env.build_dir }}\openvino_genai"
-          python .\samples\python\image_generation\inpainting.py .\models\lcm_dreamshaper_v7\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" .\image.png .\mask_image.png
+          python .\samples\python\image_generation\inpainting.py .\models\lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" .\image.png .\mask_image.png
         env:
           PYTHONPATH: ${{ env.build_dir }}
 

From 1b7a9e58433c8e78467db55549df8a1ab65b11b5 Mon Sep 17 00:00:00 2001
From: Mikhail Ryzhov <mikhail.ryzhov@intel.com>
Date: Fri, 13 Dec 2024 19:26:27 +0100
Subject: [PATCH 006/110] [GHA] Use the latest stable ov commit (#1385)

Temporary freeze OV commit until regression caused by
https://github.com/openvinotoolkit/openvino/commit/f1cba31319c3a2b150a801ea969bfe463041d5fc
is fixed
---
 .github/workflows/linux.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 96848e947c..18107aa203 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -53,7 +53,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: latest_available_commit
+        revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868
 
     - name: Clone docker tag from OpenVINO repo
       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

From 1e9e2c09a7698fb84fea6984e7588ea8a718c842 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Sat, 14 Dec 2024 00:47:09 +0400
Subject: [PATCH 007/110] [Image generation] Added conversion between pipelines
 (#1375)

Created constructors for image generation pipelines to share models
between pipelines, but generation configs are initialized as default
(because, text2image and image2image have different default values for
some parameters and cannot be shared as is)
---
 .../image_generation/image2image_pipeline.hpp | 25 +++------
 .../image_generation/inpainting_pipeline.hpp  | 11 ++++
 .../image_generation/text2image_pipeline.hpp  | 47 ++++++++--------
 .../src/image_generation/flux_pipeline.hpp    | 56 +++++++++++++------
 .../image_generation/image2image_pipeline.cpp | 10 ++++
 .../image_generation/inpainting_pipeline.cpp  | 17 +++++-
 .../image_generation/schedulers/scheduler.cpp |  1 -
 .../stable_diffusion_3_pipeline.hpp           | 54 ++++++++++++++----
 .../stable_diffusion_pipeline.hpp             | 39 ++++++++-----
 .../stable_diffusion_xl_pipeline.hpp          | 18 +++++-
 .../image_generation/text2image_pipeline.cpp  | 28 ++++++++++
 .../openvino_genai/py_openvino_genai.pyi      | 12 ++++
 src/python/py_image_generation_pipelines.cpp  | 40 ++++++++-----
 13 files changed, 256 insertions(+), 102 deletions(-)

diff --git a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
index a859b29c2e..ea02969c5e 100644
--- a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
@@ -3,28 +3,11 @@
 
 #pragma once
 
-#include <memory>
-#include <string>
-#include <random>
-#include <filesystem>
-
-#include "openvino/core/any.hpp"
-#include "openvino/runtime/tensor.hpp"
-
-#include "openvino/genai/image_generation/scheduler.hpp"
-#include "openvino/genai/image_generation/generation_config.hpp"
-
-#include "openvino/genai/image_generation/clip_text_model.hpp"
-#include "openvino/genai/image_generation/clip_text_model_with_projection.hpp"
-#include "openvino/genai/image_generation/unet2d_condition_model.hpp"
-#include "openvino/genai/image_generation/autoencoder_kl.hpp"
+#include "openvino/genai/image_generation/inpainting_pipeline.hpp"
 
 namespace ov {
 namespace genai {
 
-// forward declaration
-class DiffusionPipeline;
-
 //
 // Image to image pipeline
 //
@@ -42,6 +25,8 @@ class OPENVINO_GENAI_EXPORTS Image2ImagePipeline {
                         Properties&&... properties)
         : Image2ImagePipeline(models_path, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
 
+    Image2ImagePipeline(const InpaintingPipeline& pipe);
+
     // creates either LCM or SD pipeline from building blocks
     static Image2ImagePipeline stable_diffusion(
         const std::shared_ptr<Scheduler>& scheduler_type,
@@ -99,6 +84,10 @@ class OPENVINO_GENAI_EXPORTS Image2ImagePipeline {
     std::shared_ptr<DiffusionPipeline> m_impl;
 
     explicit Image2ImagePipeline(const std::shared_ptr<DiffusionPipeline>& impl);
+
+    // to create other pipelines from image to image
+    friend class Text2ImagePipeline;
+    friend class InpaintingPipeline;
 };
 
 } // namespace genai
diff --git a/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp
index c970fa0e23..6eead673e4 100644
--- a/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp
@@ -18,12 +18,17 @@
 #include "openvino/genai/image_generation/clip_text_model_with_projection.hpp"
 #include "openvino/genai/image_generation/unet2d_condition_model.hpp"
 #include "openvino/genai/image_generation/autoencoder_kl.hpp"
+#include "openvino/genai/image_generation/t5_encoder_model.hpp"
+#include "openvino/genai/image_generation/sd3_transformer_2d_model.hpp"
+#include "openvino/genai/image_generation/flux_transformer_2d_model.hpp"
 
 namespace ov {
 namespace genai {
 
 // forward declaration
 class DiffusionPipeline;
+class Text2ImagePipeline;
+class Image2ImagePipeline;
 
 //
 // Inpainting pipeline
@@ -42,6 +47,8 @@ class OPENVINO_GENAI_EXPORTS InpaintingPipeline {
                        Properties&&... properties)
         : InpaintingPipeline(models_path, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
 
+    InpaintingPipeline(const Image2ImagePipeline& pipe);
+
     // creates either LCM or SD pipeline from building blocks
     static InpaintingPipeline stable_diffusion(
         const std::shared_ptr<Scheduler>& scheduler_type,
@@ -100,6 +107,10 @@ class OPENVINO_GENAI_EXPORTS InpaintingPipeline {
     std::shared_ptr<DiffusionPipeline> m_impl;
 
     explicit InpaintingPipeline(const std::shared_ptr<DiffusionPipeline>& impl);
+
+    // to create other pipelines from inpainting
+    friend class Text2ImagePipeline;
+    friend class Image2ImagePipeline;
 };
 
 } // namespace genai
diff --git a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp
index b66ced748b..34b9d6e341 100644
--- a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp
@@ -3,31 +3,11 @@
 
 #pragma once
 
-#include <memory>
-#include <string>
-#include <random>
-#include <filesystem>
-
-#include "openvino/core/any.hpp"
-#include "openvino/runtime/tensor.hpp"
-
-#include "openvino/genai/image_generation/scheduler.hpp"
-#include "openvino/genai/image_generation/generation_config.hpp"
-
-#include "openvino/genai/image_generation/clip_text_model.hpp"
-#include "openvino/genai/image_generation/clip_text_model_with_projection.hpp"
-#include "openvino/genai/image_generation/unet2d_condition_model.hpp"
-#include "openvino/genai/image_generation/sd3_transformer_2d_model.hpp"
-#include "openvino/genai/image_generation/autoencoder_kl.hpp"
-#include "openvino/genai/image_generation/t5_encoder_model.hpp"
-#include "openvino/genai/image_generation/flux_transformer_2d_model.hpp"
+#include "openvino/genai/image_generation/image2image_pipeline.hpp"
 
 namespace ov {
 namespace genai {
 
-// forward declaration
-class DiffusionPipeline;
-
 /**
  * Text to image pipelines which provides unified API to all supported models types.
  * Models specific aspects are hidden in image generation config, which includes multiple prompts support or
@@ -63,6 +43,20 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
                        Properties&&... properties)
         : Text2ImagePipeline(models_path, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
 
+    /**
+     * Creates text to image pipeline based on image to image pipeline and shares models
+     * @param pipe Image to image pipeline to share models with
+     * @note Generation config is not shared with image to image pipeline and default one is created
+     */
+    Text2ImagePipeline(const Image2ImagePipeline& pipe);
+
+    /**
+     * Creates text to image pipeline based on inpainting pipeline and shares models
+     * @param pipe Inpainting pipeline to share models with
+     * @note Generation config is not shared with image to image pipeline and default one is created
+     */
+    Text2ImagePipeline(const InpaintingPipeline& pipe);
+
     /**
      * Creates Stable Diffusion pipeline from individual models
      * @param scheduler A scheduler used to denoise final image
@@ -178,6 +172,15 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
      * @param guidance_scale A guidance scale. Note, that it's important whether guidance_scale > 1, which affects whether negative prompts
      * are used or not. For example, all values > 1 are the same for reshape perspective and may vary in subsequent 'generate()' calls.
      * @note If pipeline has been already compiled, it cannot be reshaped and an exception is thrown.
+     * 
+     * Example how to reshape SD3 or Flux models for specific max sequence length:
+     * @code
+     *  ov::genai::Text2ImagePipeline pipe("/path");
+     *  ov::genai::ImageGenerationConfig default_config = pipe.get_generation_config();
+     *  default_config.max_sequence_length = 30;
+     *  pipe.set_generation_config(default_config);
+     *  pipe.reshape(1, 512, 512, default_config.guidance_scale); // reshape will bypass `max_sequence_length` to T5 encoder model
+     * @endcode
      */
     void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale);
 
@@ -200,7 +203,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
      * Generates image(s) based on prompt and other image generarion parameters
      * @param positive_prompt Prompt to generate image(s) from
      * @param properties Image generation parameters specified as properties. Values in 'properties' override default value for generation parameters.
-     * @return A tensor which has dimensions [num_images_per_prompt, height, width, 3]
+     * @returns A tensor which has dimensions [num_images_per_prompt, height, width, 3]
      */
     ov::Tensor generate(const std::string& positive_prompt, const ov::AnyMap& properties = {});
 
diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp
index ac82bd0cab..716ba6b61b 100644
--- a/src/cpp/src/image_generation/flux_pipeline.hpp
+++ b/src/cpp/src/image_generation/flux_pipeline.hpp
@@ -225,6 +225,15 @@ class FluxPipeline : public DiffusionPipeline {
         initialize_generation_config("FluxPipeline");
     }
 
+    FluxPipeline(PipelineType pipeline_type, const FluxPipeline& pipe) :
+        FluxPipeline(pipe) {
+        OPENVINO_ASSERT(!pipe.is_inpainting_model(), "Cannot create ",
+            pipeline_type == PipelineType::TEXT_2_IMAGE ? "'Text2ImagePipeline'" : "'Image2ImagePipeline'", " from InpaintingPipeline with inpainting model");
+
+        m_pipeline_type = pipeline_type;
+        initialize_generation_config("FluxPipeline");
+    }
+
     void reshape(const int num_images_per_prompt,
                  const int height,
                  const int width,
@@ -232,13 +241,6 @@ class FluxPipeline : public DiffusionPipeline {
         check_image_size(height, width);
 
         m_clip_text_encoder->reshape(1);
-
-        // TODO: max_sequence_length cannot be specified easily outside, only via:
-        //   Text2ImagePipeline pipe("/path");
-        //   ImageGenerationConfig default_config = pipe.get_generation_config();
-        //   default_config.max_sequence_length = 30;
-        //   pipe.set_generation_config(default_config);
-        //   pipe.reshape(1, 512, 512, default_config.guidance_scale);
         m_t5_text_encoder->reshape(1, m_generation_config.max_sequence_length);
         m_transformer->reshape(num_images_per_prompt, height, width, m_generation_config.max_sequence_length);
 
@@ -321,11 +323,6 @@ class FluxPipeline : public DiffusionPipeline {
         m_custom_generation_config = m_generation_config;
         m_custom_generation_config.update_generation_config(properties);
 
-        if (!initial_image) {
-            // in case of typical text to image generation, we need to ignore 'strength'
-            m_custom_generation_config.strength = 1.0f;
-        }
-
         // Use callback if defined
         std::function<bool(size_t, size_t, ov::Tensor&)> callback = nullptr;
         auto callback_iter = properties.find(ov::genai::callback.name());
@@ -337,9 +334,9 @@ class FluxPipeline : public DiffusionPipeline {
         const auto& transformer_config = m_transformer->get_config();
 
         if (m_custom_generation_config.height < 0)
-            m_custom_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
+            compute_dim(m_custom_generation_config.height, initial_image, 1 /* assume NHWC */);
         if (m_custom_generation_config.width < 0)
-            m_custom_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
+            compute_dim(m_custom_generation_config.width, initial_image, 2 /* assume NHWC */);
 
         check_inputs(m_custom_generation_config, initial_image);
 
@@ -387,6 +384,29 @@ class FluxPipeline : public DiffusionPipeline {
     }
 
 private:
+    bool is_inpainting_model() const {
+        assert(m_transformer != nullptr);
+        assert(m_vae != nullptr);
+        return m_transformer->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1);
+    }
+
+    void compute_dim(int64_t & generation_config_value, ov::Tensor initial_image, int dim_idx) {
+        const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
+        const auto& transformer_config = m_transformer->get_config();
+
+        // in case of image to image generation_config_value is just ignored and computed based on initial image
+        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) {
+            OPENVINO_ASSERT(initial_image, "Initial image is empty for image to image pipeline");
+            ov::Shape shape = initial_image.get_shape();
+            int64_t dim_val = shape[dim_idx];
+
+            generation_config_value = dim_val - (dim_val % vae_scale_factor);
+        }
+
+        if (generation_config_value < 0)
+            generation_config_value = transformer_config.m_default_sample_size * vae_scale_factor;
+    }
+
     void initialize_generation_config(const std::string& class_name) override {
         assert(m_transformer != nullptr);
         assert(m_vae != nullptr);
@@ -394,8 +414,12 @@ class FluxPipeline : public DiffusionPipeline {
         const auto& transformer_config = m_transformer->get_config();
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
 
-        m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
-        m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
+        m_generation_config = ImageGenerationConfig();
+
+        if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) {
+            m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
+            m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
+        }
 
         if (class_name == "FluxPipeline" || class_name == "FluxImg2ImgPipeline" || class_name == "FluxInpaintPipeline" ) {
             if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) {
diff --git a/src/cpp/src/image_generation/image2image_pipeline.cpp b/src/cpp/src/image_generation/image2image_pipeline.cpp
index 527b532b71..38ff5a0a4c 100644
--- a/src/cpp/src/image_generation/image2image_pipeline.cpp
+++ b/src/cpp/src/image_generation/image2image_pipeline.cpp
@@ -39,6 +39,16 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir,
     }
 }
 
+Image2ImagePipeline::Image2ImagePipeline(const InpaintingPipeline& pipe) {
+    if (auto stable_diffusion_xl = std::dynamic_pointer_cast<StableDiffusionXLPipeline>(pipe.m_impl); stable_diffusion_xl != nullptr) {
+        m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::IMAGE_2_IMAGE, *stable_diffusion_xl);
+    } else if (auto stable_diffusion = std::dynamic_pointer_cast<StableDiffusionPipeline>(pipe.m_impl); stable_diffusion != nullptr) {
+        m_impl = std::make_shared<StableDiffusionPipeline>(PipelineType::IMAGE_2_IMAGE, *stable_diffusion);
+    } else {
+        OPENVINO_ASSERT("Cannot convert specified InpaintingPipeline to Image2ImagePipeline");
+    }
+}
+
 Image2ImagePipeline::Image2ImagePipeline(const std::shared_ptr<DiffusionPipeline>& impl)
     : m_impl(impl) {
     assert(m_impl != nullptr);
diff --git a/src/cpp/src/image_generation/inpainting_pipeline.cpp b/src/cpp/src/image_generation/inpainting_pipeline.cpp
index d3612c4964..a510be0a57 100644
--- a/src/cpp/src/image_generation/inpainting_pipeline.cpp
+++ b/src/cpp/src/image_generation/inpainting_pipeline.cpp
@@ -6,6 +6,7 @@
 #include <filesystem>
 
 #include "openvino/genai/image_generation/inpainting_pipeline.hpp"
+#include "openvino/genai/image_generation/image2image_pipeline.hpp"
 
 #include "image_generation/stable_diffusion_pipeline.hpp"
 #include "image_generation/stable_diffusion_xl_pipeline.hpp"
@@ -43,6 +44,16 @@ InpaintingPipeline::InpaintingPipeline(const std::filesystem::path& root_dir, co
     }
 }
 
+InpaintingPipeline::InpaintingPipeline(const Image2ImagePipeline& pipe) {
+    if (auto stable_diffusion_xl = std::dynamic_pointer_cast<StableDiffusionXLPipeline>(pipe.m_impl); stable_diffusion_xl != nullptr) {
+        m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::INPAINTING, *stable_diffusion_xl);
+    } else if (auto stable_diffusion = std::dynamic_pointer_cast<StableDiffusionPipeline>(pipe.m_impl); stable_diffusion != nullptr) {
+        m_impl = std::make_shared<StableDiffusionPipeline>(PipelineType::INPAINTING, *stable_diffusion);
+    } else {
+        OPENVINO_ASSERT("Cannot convert specified Image2ImagePipeline to InpaintingPipeline");
+    }
+}
+
 InpaintingPipeline::InpaintingPipeline(const std::shared_ptr<DiffusionPipeline>& impl)
     : m_impl(impl) {
     assert(m_impl != nullptr);
@@ -53,7 +64,7 @@ InpaintingPipeline InpaintingPipeline::stable_diffusion(
     const CLIPTextModel& clip_text_model,
     const UNet2DConditionModel& unet,
     const AutoencoderKL& vae) {
-    auto impl = std::make_shared<StableDiffusionPipeline>(PipelineType::IMAGE_2_IMAGE, clip_text_model, unet, vae);
+    auto impl = std::make_shared<StableDiffusionPipeline>(PipelineType::INPAINTING, clip_text_model, unet, vae);
 
     assert(scheduler != nullptr);
     impl->set_scheduler(scheduler);
@@ -66,7 +77,7 @@ InpaintingPipeline InpaintingPipeline::latent_consistency_model(
     const CLIPTextModel& clip_text_model,
     const UNet2DConditionModel& unet,
     const AutoencoderKL& vae) {
-    auto impl = std::make_shared<StableDiffusionPipeline>(PipelineType::IMAGE_2_IMAGE, clip_text_model, unet, vae);
+    auto impl = std::make_shared<StableDiffusionPipeline>(PipelineType::INPAINTING, clip_text_model, unet, vae);
 
     assert(scheduler != nullptr);
     impl->set_scheduler(scheduler);
@@ -80,7 +91,7 @@ InpaintingPipeline InpaintingPipeline::stable_diffusion_xl(
     const CLIPTextModelWithProjection& clip_text_model_with_projection,
     const UNet2DConditionModel& unet,
     const AutoencoderKL& vae) {
-    auto impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::IMAGE_2_IMAGE, clip_text_model, clip_text_model_with_projection, unet, vae);
+    auto impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::INPAINTING, clip_text_model, clip_text_model_with_projection, unet, vae);
 
     assert(scheduler != nullptr);
     impl->set_scheduler(scheduler);
diff --git a/src/cpp/src/image_generation/schedulers/scheduler.cpp b/src/cpp/src/image_generation/schedulers/scheduler.cpp
index 6ec31bbf6c..3a7556b6d9 100644
--- a/src/cpp/src/image_generation/schedulers/scheduler.cpp
+++ b/src/cpp/src/image_generation/schedulers/scheduler.cpp
@@ -29,7 +29,6 @@ std::shared_ptr<Scheduler> Scheduler::from_config(const std::filesystem::path& s
 
     std::shared_ptr<Scheduler> scheduler = nullptr;
     if (scheduler_type == Scheduler::Type::LCM) {
-        // TODO: do we need to pass RNG generator somehow to LCM?
         scheduler = std::make_shared<LCMScheduler>(scheduler_config_path);
     } else if (scheduler_type == Scheduler::Type::LMS_DISCRETE) {
         scheduler = std::make_shared<LMSDiscreteScheduler>(scheduler_config_path);
diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
index 3cdaa409d1..18a3e0346f 100644
--- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
@@ -218,6 +218,15 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         initialize_generation_config("StableDiffusion3Pipeline");
     }
 
+    StableDiffusion3Pipeline(PipelineType pipeline_type, const StableDiffusion3Pipeline& pipe) :
+        StableDiffusion3Pipeline(pipe) {
+        OPENVINO_ASSERT(!pipe.is_inpainting_model(), "Cannot create ",
+            pipeline_type == PipelineType::TEXT_2_IMAGE ? "'Text2ImagePipeline'" : "'Image2ImagePipeline'", " from InpaintingPipeline with inpainting model");
+
+        m_pipeline_type = pipeline_type;
+        initialize_generation_config("StableDiffusion3Pipeline");
+    }
+
     void reshape(const int num_images_per_prompt,
                  const int height,
                  const int width,
@@ -426,11 +435,6 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         ImageGenerationConfig generation_config = m_generation_config;
         generation_config.update_generation_config(properties);
 
-        if (!initial_image) {
-            // in case of typical text to image generation, we need to ignore 'strength'
-            generation_config.strength = 1.0f;
-        }
-
         // Use callback if defined
         std::function<bool(size_t, size_t, ov::Tensor&)> callback = nullptr;
         auto callback_iter = properties.find(ov::genai::callback.name());
@@ -440,14 +444,12 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
 
         const auto& transformer_config = m_transformer->get_config();
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
-        const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale)
-                                                 ? 2
-                                                 : 1;  // Transformer accepts 2x batch in case of CFG
+        const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1;  // Transformer accepts 2x batch in case of CFG
 
         if (generation_config.height < 0)
-            generation_config.height = transformer_config.sample_size * vae_scale_factor;
+            compute_dim(generation_config.height, initial_image, 1 /* assume NHWC */);
         if (generation_config.width < 0)
-            generation_config.width = transformer_config.sample_size * vae_scale_factor;
+            compute_dim(generation_config.width, initial_image, 2 /* assume NHWC */);
 
         check_inputs(generation_config, initial_image);
 
@@ -522,6 +524,29 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
     }
 
 private:
+    bool is_inpainting_model() const {
+        assert(m_transformer != nullptr);
+        assert(m_vae != nullptr);
+        return m_transformer->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1);
+    }
+
+    void compute_dim(int64_t & generation_config_value, ov::Tensor initial_image, int dim_idx) {
+        const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
+        const auto& transformer_config = m_transformer->get_config();
+
+        // in case of image to image generation_config_value is just ignored and computed based on initial image
+        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) {
+            OPENVINO_ASSERT(initial_image, "Initial image is empty for image to image pipeline");
+            ov::Shape shape = initial_image.get_shape();
+            int64_t dim_val = shape[dim_idx];
+
+            generation_config_value = dim_val - (dim_val % vae_scale_factor);
+        }
+
+        if (generation_config_value < 0)
+            generation_config_value = transformer_config.sample_size * vae_scale_factor;
+    }
+
     bool do_classifier_free_guidance(float guidance_scale) const {
         return guidance_scale > 1.0;
     }
@@ -533,8 +558,13 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         const auto& transformer_config = m_transformer->get_config();
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
 
-        m_generation_config.height = transformer_config.sample_size * vae_scale_factor;
-        m_generation_config.width = transformer_config.sample_size * vae_scale_factor;
+        m_generation_config = ImageGenerationConfig();
+
+        // in case of image to image, the shape is computed based on initial image
+        if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) {
+            m_generation_config.height = transformer_config.sample_size * vae_scale_factor;
+            m_generation_config.width = transformer_config.sample_size * vae_scale_factor;
+        }
 
         if (class_name == "StableDiffusion3Pipeline" || class_name == "StableDiffusion3Img2ImgPipeline" || class_name == "StableDiffusion3InpaintPipeline") {
             m_generation_config.guidance_scale = 7.0f;
diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
index c53c9b7d25..4afbd3ac78 100644
--- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
@@ -147,6 +147,18 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         initialize_generation_config(pipeline_name);
     }
 
+    StableDiffusionPipeline(PipelineType pipeline_type, const StableDiffusionPipeline& pipe) :
+        StableDiffusionPipeline(pipe) {
+        OPENVINO_ASSERT(!pipe.is_inpainting_model(), "Cannot create ",
+            pipeline_type == PipelineType::TEXT_2_IMAGE ? "'Text2ImagePipeline'" : "'Image2ImagePipeline'", " from InpaintingPipeline with inpainting model");
+
+        m_pipeline_type = pipeline_type;
+
+        const bool is_lcm = m_unet->get_config().time_cond_proj_dim > 0;
+        const char * const pipeline_name = is_lcm ? "LatentConsistencyModelPipeline" : "StableDiffusionPipeline";
+        initialize_generation_config(pipeline_name);
+    }
+
     void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) override {
         check_image_size(height, width);
 
@@ -206,8 +218,7 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
         const bool is_inpainting = m_pipeline_type == PipelineType::INPAINTING,
             is_strength_max = is_inpainting && generation_config.strength == 1.0f,
-            is_inpainting_model = is_inpainting && m_unet->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1),
-            return_image_latent = is_inpainting && !is_inpainting_model;
+            return_image_latent = is_inpainting && !is_inpainting_model();
 
         ov::Shape latent_shape{generation_config.num_images_per_prompt, m_vae->get_config().latent_channels,
                                generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor};
@@ -254,7 +265,6 @@ class StableDiffusionPipeline : public DiffusionPipeline {
 
         const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1;  // Unet accepts 2x batch in case of CFG
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
-        const bool is_inpainting_model = m_unet->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1);
         ov::Shape target_shape = processed_image.get_shape();
 
         ov::Tensor mask_condition = m_image_resizer->execute(mask_image, target_shape[2], target_shape[3]);
@@ -266,7 +276,7 @@ class StableDiffusionPipeline : public DiffusionPipeline {
 
         ov::Tensor masked_image_latent;
 
-        if (is_inpainting_model) {
+        if (is_inpainting_model()) {
             // create masked image
             ov::Tensor masked_image(ov::element::f32, processed_image.get_shape());
             const float * mask_condition_data = mask_condition.data<const float>();
@@ -300,11 +310,6 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         ImageGenerationConfig generation_config = m_generation_config;
         generation_config.update_generation_config(properties);
 
-        if (!initial_image) {
-            // in case of typical text to image generation, we need to ignore 'strength'
-            generation_config.strength = 1.0f;
-        }
-
         // use callback if defined
         std::function<bool(size_t, size_t, ov::Tensor&)> callback = nullptr;
         auto callback_iter = properties.find(ov::genai::callback.name());
@@ -318,12 +323,12 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         const auto& unet_config = m_unet->get_config();
         const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1;  // Unet accepts 2x batch in case of CFG
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
-        const bool is_inpainting_model = unet_config.in_channels == (m_vae->get_config().latent_channels * 2 + 1);
 
         if (generation_config.height < 0)
             compute_dim(generation_config.height, initial_image, 1 /* assume NHWC */);
         if (generation_config.width < 0)
             compute_dim(generation_config.width, initial_image, 2 /* assume NHWC */);
+
         check_inputs(generation_config, initial_image);
 
         set_lora_adapters(generation_config.adapters);
@@ -364,7 +369,7 @@ class StableDiffusionPipeline : public DiffusionPipeline {
 
             m_scheduler->scale_model_input(latent_cfg, inference_step);
 
-            ov::Tensor latent_model_input = is_inpainting_model ? numpy_utils::concat(numpy_utils::concat(latent_cfg, mask, 1), masked_image_latent, 1) : latent_cfg;
+            ov::Tensor latent_model_input = is_inpainting_model() ? numpy_utils::concat(numpy_utils::concat(latent_cfg, mask, 1), masked_image_latent, 1) : latent_cfg;
             ov::Tensor timestep(ov::element::i64, {1}, &timesteps[inference_step]);
             ov::Tensor noise_pred_tensor = m_unet->infer(latent_model_input, timestep);
 
@@ -391,7 +396,7 @@ class StableDiffusionPipeline : public DiffusionPipeline {
             latent = scheduler_step_result["latent"];
 
             // in case of non-specialized inpainting model, we need manually mask current denoised latent and initial image latent
-            if (m_pipeline_type == PipelineType::INPAINTING && !is_inpainting_model) {
+            if (m_pipeline_type == PipelineType::INPAINTING && !is_inpainting_model()) {
                 blend_latents(image_latent, noise, mask, latent, inference_step);
             }
 
@@ -412,6 +417,12 @@ class StableDiffusionPipeline : public DiffusionPipeline {
     }
 
 protected:
+    bool is_inpainting_model() const {
+        assert(m_unet != nullptr);
+        assert(m_vae != nullptr);
+        return m_unet->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1);
+    }
+
     void compute_dim(int64_t & generation_config_value, ov::Tensor initial_image, int dim_idx) {
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
         const auto& unet_config = m_unet->get_config();
@@ -435,13 +446,15 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         const auto& unet_config = m_unet->get_config();
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
 
+        m_generation_config = ImageGenerationConfig();
+
         // in case of image to image, the shape is computed based on initial image
         if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) {
             m_generation_config.height = unet_config.sample_size * vae_scale_factor;
             m_generation_config.width = unet_config.sample_size * vae_scale_factor;
         }
 
-        if (class_name == "StableDiffusionPipeline" || class_name == "StableDiffusionInpaintPipeline" || class_name == "StableDiffusionInpaintPipeline") {
+        if (class_name == "StableDiffusionPipeline" || class_name == "StableDiffusionImg2ImgPipeline" || class_name == "StableDiffusionInpaintPipeline") {
             m_generation_config.guidance_scale = 7.5f;
             m_generation_config.num_inference_steps = 50;
             m_generation_config.strength = m_pipeline_type == PipelineType::IMAGE_2_IMAGE ? 0.8f : 1.0f;
diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
index 6913d901df..15f15219c2 100644
--- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
@@ -116,6 +116,15 @@ class StableDiffusionXLPipeline : public StableDiffusionPipeline {
         m_force_zeros_for_empty_prompt = true;
     }
 
+    StableDiffusionXLPipeline(PipelineType pipeline_type, const StableDiffusionXLPipeline& pipe) :
+        StableDiffusionXLPipeline(pipe) {
+        OPENVINO_ASSERT(!pipe.is_inpainting_model(), "Cannot create ",
+            pipeline_type == PipelineType::TEXT_2_IMAGE ? "'Text2ImagePipeline'" : "'Image2ImagePipeline'", " from InpaintingPipeline with inpainting model");
+
+        m_pipeline_type = pipeline_type;
+        initialize_generation_config("StableDiffusionXLPipeline");
+    }
+
     void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) override {
         check_image_size(height, width);
 
@@ -291,8 +300,13 @@ class StableDiffusionXLPipeline : public StableDiffusionPipeline {
         const auto& unet_config = m_unet->get_config();
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
 
-        m_generation_config.height = unet_config.sample_size * vae_scale_factor;
-        m_generation_config.width = unet_config.sample_size * vae_scale_factor;
+        m_generation_config = ImageGenerationConfig();
+
+        // in case of image to image, the shape is computed based on initial image
+        if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) {
+            m_generation_config.height = unet_config.sample_size * vae_scale_factor;
+            m_generation_config.width = unet_config.sample_size * vae_scale_factor;
+        }
 
         if (class_name == "StableDiffusionXLPipeline" || class_name == "StableDiffusionXLImg2ImgPipeline" || class_name == "StableDiffusionXLInpaintPipeline") {
             if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) {
diff --git a/src/cpp/src/image_generation/text2image_pipeline.cpp b/src/cpp/src/image_generation/text2image_pipeline.cpp
index 6ceb076f85..56b02a2e10 100644
--- a/src/cpp/src/image_generation/text2image_pipeline.cpp
+++ b/src/cpp/src/image_generation/text2image_pipeline.cpp
@@ -51,6 +51,34 @@ Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir, co
     }
 }
 
+Text2ImagePipeline::Text2ImagePipeline(const Image2ImagePipeline& pipe) {
+    if (auto stable_diffusion_xl = std::dynamic_pointer_cast<StableDiffusionXLPipeline>(pipe.m_impl); stable_diffusion_xl != nullptr) {
+        m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::TEXT_2_IMAGE, *stable_diffusion_xl);
+    } else if (auto stable_diffusion = std::dynamic_pointer_cast<StableDiffusionPipeline>(pipe.m_impl); stable_diffusion != nullptr) {
+        m_impl = std::make_shared<StableDiffusionPipeline>(PipelineType::TEXT_2_IMAGE, *stable_diffusion);
+    } else if (auto stable_diffusion_3 = std::dynamic_pointer_cast<StableDiffusion3Pipeline>(pipe.m_impl); stable_diffusion_3 != nullptr) {
+        m_impl = std::make_shared<StableDiffusion3Pipeline>(PipelineType::TEXT_2_IMAGE, *stable_diffusion_3);
+    } else if (auto flux = std::dynamic_pointer_cast<FluxPipeline>(pipe.m_impl); flux != nullptr) {
+        m_impl = std::make_shared<FluxPipeline>(PipelineType::TEXT_2_IMAGE, *flux);
+    } else {
+        OPENVINO_ASSERT("Cannot convert specified Image2ImagePipeline to Text2ImagePipeline");
+    }
+}
+
+Text2ImagePipeline::Text2ImagePipeline(const InpaintingPipeline& pipe) {
+    if (auto stable_diffusion_xl = std::dynamic_pointer_cast<StableDiffusionXLPipeline>(pipe.m_impl); stable_diffusion_xl != nullptr) {
+        m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::TEXT_2_IMAGE, *stable_diffusion_xl);
+    } else if (auto stable_diffusion = std::dynamic_pointer_cast<StableDiffusionPipeline>(pipe.m_impl); stable_diffusion != nullptr) {
+        m_impl = std::make_shared<StableDiffusionPipeline>(PipelineType::TEXT_2_IMAGE, *stable_diffusion);
+    } else if (auto stable_diffusion_3 = std::dynamic_pointer_cast<StableDiffusion3Pipeline>(pipe.m_impl); stable_diffusion_3 != nullptr) {
+        m_impl = std::make_shared<StableDiffusion3Pipeline>(PipelineType::TEXT_2_IMAGE, *stable_diffusion_3);
+    } else if (auto flux = std::dynamic_pointer_cast<FluxPipeline>(pipe.m_impl); flux != nullptr) {
+        m_impl = std::make_shared<FluxPipeline>(PipelineType::TEXT_2_IMAGE, *flux);
+    } else {
+        OPENVINO_ASSERT("Cannot convert specified InpaintingPipeline to Text2ImagePipeline");
+    }
+}
+
 Text2ImagePipeline::Text2ImagePipeline(const std::shared_ptr<DiffusionPipeline>& impl)
     : m_impl(impl) {
     assert(m_impl != nullptr);
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 8ab0407ea7..829d4844e8 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -772,6 +772,9 @@ class Image2ImagePipeline:
                     device (str): Device to run the model on (e.g., CPU, GPU).
                     kwargs: Image2ImagePipeline properties
         """
+    @typing.overload
+    def __init__(self, pipe: InpaintingPipeline) -> None:
+        ...
     def compile(self, device: str, **kwargs) -> None:
         """
                         Compiles the model.
@@ -868,6 +871,9 @@ class InpaintingPipeline:
                     device (str): Device to run the model on (e.g., CPU, GPU).
                     kwargs: InpaintingPipeline properties
         """
+    @typing.overload
+    def __init__(self, pipe: Image2ImagePipeline) -> None:
+        ...
     def compile(self, device: str, **kwargs) -> None:
         """
                         Compiles the model.
@@ -1535,6 +1541,12 @@ class Text2ImagePipeline:
                     device (str): Device to run the model on (e.g., CPU, GPU).
                     kwargs: Text2ImagePipeline properties
         """
+    @typing.overload
+    def __init__(self, pipe: Image2ImagePipeline) -> None:
+        ...
+    @typing.overload
+    def __init__(self, pipe: InpaintingPipeline) -> None:
+        ...
     def compile(self, device: str, **kwargs) -> None:
         """
                         Compiles the model.
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index 7739b88ff9..55be1708c1 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -85,9 +85,7 @@ void init_image_generation_pipelines(py::module_& m) {
         .def(py::init<>());
 
     py::class_<ov::genai::CppStdGenerator, ov::genai::Generator, std::shared_ptr<ov::genai::CppStdGenerator>>(m, "CppStdGenerator", "This class wraps std::mt19937 pseudo-random generator.")
-        .def(py::init([](
-            uint32_t seed
-        ) {
+        .def(py::init([](uint32_t seed) {
             return std::make_unique<ov::genai::CppStdGenerator>(seed);
         }), 
         py::arg("seed"))
@@ -140,9 +138,7 @@ void init_image_generation_pipelines(py::module_& m) {
         });
 
     auto text2image_pipeline = py::class_<ov::genai::Text2ImagePipeline>(m, "Text2ImagePipeline", "This class is used for generation with text-to-image models.")
-        .def(py::init([](
-            const std::filesystem::path& models_path
-        ) {
+        .def(py::init([](const std::filesystem::path& models_path) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
             return std::make_unique<ov::genai::Text2ImagePipeline>(models_path);
         }),
@@ -151,7 +147,6 @@ void init_image_generation_pipelines(py::module_& m) {
             Text2ImagePipeline class constructor.
             models_path (os.PathLike): Path to the folder with exported model files.
         )")
-
         .def(py::init([](
             const std::filesystem::path& models_path,
             const std::string& device,
@@ -211,9 +206,7 @@ void init_image_generation_pipelines(py::module_& m) {
 
 
     auto image2image_pipeline = py::class_<ov::genai::Image2ImagePipeline>(m, "Image2ImagePipeline", "This class is used for generation with image-to-image models.")
-        .def(py::init([](
-            const std::filesystem::path& models_path
-        ) {
+        .def(py::init([](const std::filesystem::path& models_path) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
             return std::make_unique<ov::genai::Image2ImagePipeline>(models_path);
         }),
@@ -222,7 +215,6 @@ void init_image_generation_pipelines(py::module_& m) {
             Image2ImagePipeline class constructor.
             models_path (os.PathLike): Path to the folder with exported model files.
         )")
-
         .def(py::init([](
             const std::filesystem::path& models_path,
             const std::string& device,
@@ -277,9 +269,7 @@ void init_image_generation_pipelines(py::module_& m) {
 
 
     auto inpainting_pipeline = py::class_<ov::genai::InpaintingPipeline>(m, "InpaintingPipeline", "This class is used for generation with inpainting models.")
-        .def(py::init([](
-            const std::filesystem::path& models_path
-        ) {
+        .def(py::init([](const std::filesystem::path& models_path) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
             return std::make_unique<ov::genai::InpaintingPipeline>(models_path);
         }),
@@ -288,7 +278,6 @@ void init_image_generation_pipelines(py::module_& m) {
             InpaintingPipeline class constructor.
             models_path (os.PathLike): Path to the folder with exported model files.
         )")
-
         .def(py::init([](
             const std::filesystem::path& models_path,
             const std::string& device,
@@ -342,4 +331,25 @@ void init_image_generation_pipelines(py::module_& m) {
             py::arg("mask_image"), "Mask image",
             (text2image_generate_docstring + std::string(" \n ")).c_str())
         .def("decode", &ov::genai::InpaintingPipeline::decode, py::arg("latent"));
+
+    // define constructors to create one pipeline from another
+    // NOTE: needs to be defined once all pipelines are created
+
+    text2image_pipeline
+        .def(py::init([](const ov::genai::Image2ImagePipeline& pipe) {
+            return std::make_unique<ov::genai::Text2ImagePipeline>(pipe);
+        }), py::arg("pipe"))
+        .def(py::init([](const ov::genai::InpaintingPipeline& pipe) {
+            return std::make_unique<ov::genai::Text2ImagePipeline>(pipe);
+        }), py::arg("pipe"));
+
+    image2image_pipeline
+        .def(py::init([](const ov::genai::InpaintingPipeline& pipe) {
+            return std::make_unique<ov::genai::Image2ImagePipeline>(pipe);
+        }), py::arg("pipe"));
+
+    inpainting_pipeline
+        .def(py::init([](const ov::genai::Image2ImagePipeline& pipe) {
+            return std::make_unique<ov::genai::InpaintingPipeline>(pipe);
+        }), py::arg("pipe"));
 }

From 095840028c8f412ac7ed6e7c51c2f5b1fbf65853 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Sat, 14 Dec 2024 00:48:08 +0400
Subject: [PATCH 008/110] Fixed typo in image generation readme (#1384)

Sync with C++ readme

https://github.com/openvinotoolkit/openvino.genai/blob/d189eb7541a61a41581dd21361db3aa3884d211b/samples/cpp/image_generation/README.md?plain=1#L121-L125
---
 samples/python/image_generation/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md
index 321f3f6d05..33da6bd43a 100644
--- a/samples/python/image_generation/README.md
+++ b/samples/python/image_generation/README.md
@@ -112,7 +112,7 @@ To run the sample, download initial image first:
 
 And then run the sample:
 
-`python image2mage.py ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting' small_city.bmp`
+`python image2mage.py ./dreamlike_anime_1_0_ov/FP16 'cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k' cat.png`
 
 The resuling image is:
 

From 8045cf0d6340312aebaeb73cd927f9375079a8ee Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Sat, 14 Dec 2024 00:50:55 +0400
Subject: [PATCH 009/110] GHA: use OpenVINO GenAI from PR in LLM bench / WWB
 tests (#1376)

- Install OpenVINO GenAI from source code in LLM bench pipeline
- Use OpenVINO provider to ensure OpenVINO GenAI is built on the same
platform as OpenVINO
- Changed `dreamlike-anime` to `OpenVINO/LCM_Dreamshaper_v7-int8-ov` to
save conversion time
---
 .github/workflows/linux.yml            |   2 -
 .github/workflows/llm_bench-python.yml | 265 +++++++++++++++++--------
 2 files changed, 180 insertions(+), 87 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 18107aa203..8d596aed56 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -34,7 +34,6 @@ jobs:
       status: ${{ steps.openvino_download.outcome }}
       ov_artifact_name: ${{ steps.openvino_download.outputs.ov_artifact_name }}
       ov_wheel_source: ${{ steps.openvino_download.outputs.ov_wheel_source }}
-      ov_version: ${{ steps.openvino_download.outputs.ov_version }}
       docker_tag: ${{ steps.get_docker_tag.outputs.docker_tag }}
     timeout-minutes: 10
     defaults:
@@ -147,7 +146,6 @@ jobs:
       OV_INSTALL_DIR: ${{ github.workspace }}/ov
       INSTALL_DIR: ${{ github.workspace }}/install
       WHEELS_DIR: ${{ github.workspace }}/install/wheels
-      BUILD_DIR: ${{ github.workspace }}/build
       SRC_DIR: ${{ github.workspace }}/src
 
     steps:
diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index 2c8f6a358a..f0df824efa 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -19,105 +19,200 @@ concurrency:
   group: ${{ github.event_name == 'push' && github.run_id || github.ref }}-llm-bench-python
   cancel-in-progress: true
 
-env:
-  LLM_BENCH_PYPATH: tools/llm_bench
-  WWB_PATH: tools/who_what_benchmark
-
 jobs:
+  openvino_download:
+    name: Download OpenVINO
+    outputs:
+      status: ${{ steps.openvino_download.outcome }}
+      ov_artifact_name: ${{ steps.openvino_download.outputs.ov_artifact_name }}
+      ov_wheel_source: ${{ steps.openvino_download.outputs.ov_wheel_source }}
+      ov_version: ${{ steps.openvino_download.outputs.ov_version }}
+      docker_tag: ${{ steps.get_docker_tag.outputs.docker_tag }}
+    timeout-minutes: 10
+    defaults:
+      run:
+        shell: bash
+    runs-on: aks-linux-2-cores-8gb
+    container:
+      image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0'
+      volumes: 
+        - /mount:/mount
+        - ${{ github.workspace }}:${{ github.workspace }}
+
+    steps:
+    - uses: openvinotoolkit/openvino/.github/actions/openvino_provider@master
+      id: openvino_download
+      with:
+        platform: ubuntu22
+        commit_packages_to_provide: wheels
+        revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868
+
+    - name: Clone docker tag from OpenVINO repo
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      with:
+        repository: 'openvinotoolkit/openvino'
+        path: 'openvino'
+        ref: ${{ env.OV_BRANCH }}
+        sparse-checkout: |
+          .github/dockerfiles/docker_tag
+
+    - name: Save docker tag to output
+      id: get_docker_tag
+      run: |
+        docker_tag=$(cat openvino/.github/dockerfiles/docker_tag)
+        echo "docker_tag=$docker_tag" >> $GITHUB_OUTPUT
+
   build:
-    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+    runs-on: ubuntu-22.04
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.10"]
+        python-version: ["3.11"]
+    needs: [ openvino_download ]
+    env:
+      OV_INSTALL_DIR: ${{ github.workspace }}/ov
+      SRC_DIR: ${{ github.workspace }}
+      LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
+      WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
+
     steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        python -m pip install flake8 pytest black
-        GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt
-        python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg
-        python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg
-    - name: Create code style diff for samples
-      if: failure()
-      run: |
-        python -m black -l 160 -S ${{ env.LLM_BENCH_PYPATH }}/
-        git diff > llm.bench_diff.diff
-    - uses: actions/upload-artifact@v3
-      if: failure()
-      with:
-        name: llm.bench_diff
-        path: llm.bench_diff.diff
-    - name: Test native pytorch model on Linux
-      run: |
-        git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen
-        python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt
-      env:
-        GIT_LFS_SKIP_SMUDGE: 0
-    - name: Test tiny-random-baichuan2 on Linux Optimum Intel
-      run: |
-        optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum
-    - name: Test tiny-stable-diffusion on Linux Optimum Intel
-      run: |
-        optimum-cli export openvino --model segmind/tiny-sd --trust-remote-code --weight-format fp16 ./ov_models/tiny-sd/pytorch/dldt/FP16/
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum
-    - name: Test dreamlike-anime on Linux with GenAI
-      run: |
-        optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 ov_models/dreamlike-art-dreamlike-anime-1.0/FP16
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1
-    - name: Test dreamlike-anime on Linux with GenAI and LoRA
-      run: |
-        wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7
-    - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
-      run: |
-        optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
-        optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1  --assistant_confidence_threshold 0.4
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1  --num_assistant_tokens 5
-    - name: Test whisper-tiny on Linux
-      run: |
-        GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
-        cd multilingual_librispeech
-        git lfs pull -I /data/mls_polish/train/audio/3283_1447_000.tar.gz
-        mkdir data/mls_polish/train/audio/3283_1447_000
-        tar zxvf data/mls_polish/train/audio/3283_1447_000.tar.gz -C data/mls_polish/train/audio/3283_1447_000/
-        cd ..
-        optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny ./ov_models/whisper-tiny
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --optimum
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1
-    - name: WWB Tests
-      run: |
-        pip install git+https://github.com/huggingface/optimum-intel.git
-        GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
-        python -m pytest -v ${{ env.WWB_PATH }}/tests
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          submodules: recursive
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Download OpenVINO package
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        with:
+          name: ${{ needs.openvino_download.outputs.ov_artifact_name }}
+          path: ${{ env.OV_INSTALL_DIR }}
+          merge-multiple: true
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install flake8 pytest black
+          python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
+          python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
+          GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
+        working-directory: ${{ env.OV_INSTALL_DIR }}
+        env:
+          CMAKE_BUILD_PARALLEL_LEVEL: 4
+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg
+          python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg
+      - name: Create code style diff for samples
+        if: failure()
+        run: |
+          python -m black -l 160 -S ${{ env.LLM_BENCH_PYPATH }}/
+          git diff > llm.bench_diff.diff
+      - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
+        if: failure()
+        with:
+          name: llm.bench_diff
+          path: llm.bench_diff.diff
+      - name: Test native pytorch model on Linux
+        run: |
+          git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen
+          python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt
+        env:
+          GIT_LFS_SKIP_SMUDGE: 0
+      - name: Test tiny-random-baichuan2 on Linux Optimum Intel
+        run: |
+          optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum
+      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel
+        run: |
+          huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum
+      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI
+        run: |
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1
+      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA
+        run: |
+          wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7
+      - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
+        run: |
+          optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
+          optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5
+      - name: Test whisper-tiny on Linux
+        run: |
+          GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
+          cd multilingual_librispeech
+          git lfs pull -I /data/mls_polish/train/audio/3283_1447_000.tar.gz
+          mkdir data/mls_polish/train/audio/3283_1447_000
+          tar zxvf data/mls_polish/train/audio/3283_1447_000.tar.gz -C data/mls_polish/train/audio/3283_1447_000/
+          cd ..
+          optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny ./ov_models/whisper-tiny
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --optimum
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1
+      - name: WWB Tests
+        run: |
+          pip install git+https://github.com/huggingface/optimum-intel.git
+          GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
+          python -m pytest -v ${{ env.WWB_PATH }}/tests
+
   stateful:
-    runs-on: ubuntu-20.04
+    defaults:
+      run:
+        shell: bash
+    runs-on: ubuntu-22.04
+    needs: [ openvino_download ]
+    env:
+      OV_INSTALL_DIR: ${{ github.workspace }}/ov
+      SRC_DIR: ${{ github.workspace }}
+      LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
+      WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
+
     steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          submodules: recursive
+      - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: "3.11"
+      - name: Download OpenVINO package
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
         with:
-          python-version: "3.10"
+          name: ${{ needs.openvino_download.outputs.ov_artifact_name }}
+          path: ${{ env.OV_INSTALL_DIR }}
+          merge-multiple: true
       - name: Test stateful
         run: |
-          GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r tools/llm_bench/requirements.txt
-          python -m pip uninstall --yes openvino
-          python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-          python tools/llm_bench/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir . --stateful
-          grep beam_idx pytorch/dldt/FP32/openvino_model.xml
+          python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
+          python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
+          GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
+          python ${{ env.LLM_BENCH_PYPATH }}/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ${{ env.SRC_DIR }} --stateful
+          grep beam_idx ${{ env.SRC_DIR }}/pytorch/dldt/FP32/openvino_model.xml
+        working-directory: ${{ env.OV_INSTALL_DIR }}
+        env:
+          CMAKE_BUILD_PARALLEL_LEVEL: 4
       - name: WWB Tests
         run: |
           pip install pytest
           pip install git+https://github.com/huggingface/optimum-intel.git
           GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
           python -m pytest -v ${{ env.WWB_PATH }}/tests
+
+  Overall_Status:
+    name: ci/gha_overall_status_llm_bench
+    needs: [openvino_download, build, stateful]
+    if: ${{ always() }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check status of all jobs
+        if: >-
+          ${{
+            contains(needs.*.result, 'failure') ||
+            contains(needs.*.result, 'cancelled')
+          }}
+        run: exit 1

From 669588df2534a3ba96f9589f3645269b4d1f88c9 Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin <vladimir.zlobin@intel.com>
Date: Sat, 14 Dec 2024 00:51:28 +0400
Subject: [PATCH 010/110] LLM Inference Guide -> Generative AI workflow (#1383)

---
 README.md                 | 10 +++++-----
 tools/llm_bench/README.md |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index c00971a4e3..4892c86f10 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ Continuous batching functionality is used within OpenVINO Model Server (OVMS) to
 ## Performing text generation 
 <details>
 
-For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
+For more examples check out our [Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
 
 ### Converting and compressing text generation model from Hugging Face library
 
@@ -103,7 +103,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Create+a
 ## Performing visual language text generation
 <details>
 
-For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
+For more examples check out our [Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
 
 ### Converting and compressing the model from Hugging Face library
 
@@ -173,7 +173,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Visual-l
 
 <details>
 
-For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
+For more examples check out our [Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
 
 ### Converting and compressing image generation model from Hugging Face library
 
@@ -335,7 +335,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Text+to+
 ## Speech-to-text processing using Whisper Pipeline
 <details>
 
-For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
+For more examples check out our [Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
 
 NOTE: Whisper Pipeline requires preprocessing of audio input (to adjust sampling rate and normalize)
  
@@ -397,7 +397,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Automati
 ## Additional materials
 
 - [List of supported models](https://github.com/openvinotoolkit/openvino.genai/blob/master/src/docs/SUPPORTED_MODELS.md) (NOTE: models can work, but were not tried yet)
-- [OpenVINO LLM inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
+- [OpenVINO Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
 - [Optimum-intel and OpenVINO](https://huggingface.co/docs/optimum/intel/openvino/export)
 
 ## License
diff --git a/tools/llm_bench/README.md b/tools/llm_bench/README.md
index bcb7436189..d0ce53145d 100755
--- a/tools/llm_bench/README.md
+++ b/tools/llm_bench/README.md
@@ -32,7 +32,7 @@ huggingface-cli login
 The `optimum-cli` tool simplifies converting Hugging Face models to OpenVINO IR format. 
 - Detailed documentation can be found in the [Optimum-Intel documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/export). 
 - To learn more about weight compression, see the [NNCF Weight Compression Guide](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html).
-- For additional guidance on running inference with OpenVINO for LLMs, see the [OpenVINO LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html).
+- For additional guidance on running inference with OpenVINO for LLMs, see the [OpenVINO Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html).
 
 **Usage:**
 

From c77f7c93f5f82cde4988a8ef1b3ca204d3d6873d Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Sat, 14 Dec 2024 02:38:32 +0400
Subject: [PATCH 011/110] GHA: use OpenVINO provider for SD (#1386)

---
 .github/workflows/llm_bench-python.yml        |  20 ----
 .../workflows/stable_diffusion_1_5_cpp.yml    | 103 ++++++++++++++----
 2 files changed, 80 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index f0df824efa..8b022f27e0 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -27,7 +27,6 @@ jobs:
       ov_artifact_name: ${{ steps.openvino_download.outputs.ov_artifact_name }}
       ov_wheel_source: ${{ steps.openvino_download.outputs.ov_wheel_source }}
       ov_version: ${{ steps.openvino_download.outputs.ov_version }}
-      docker_tag: ${{ steps.get_docker_tag.outputs.docker_tag }}
     timeout-minutes: 10
     defaults:
       run:
@@ -47,21 +46,6 @@ jobs:
         commit_packages_to_provide: wheels
         revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868
 
-    - name: Clone docker tag from OpenVINO repo
-      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      with:
-        repository: 'openvinotoolkit/openvino'
-        path: 'openvino'
-        ref: ${{ env.OV_BRANCH }}
-        sparse-checkout: |
-          .github/dockerfiles/docker_tag
-
-    - name: Save docker tag to output
-      id: get_docker_tag
-      run: |
-        docker_tag=$(cat openvino/.github/dockerfiles/docker_tag)
-        echo "docker_tag=$docker_tag" >> $GITHUB_OUTPUT
-
   build:
     defaults:
       run:
@@ -100,8 +84,6 @@ jobs:
           python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
           GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
         working-directory: ${{ env.OV_INSTALL_DIR }}
-        env:
-          CMAKE_BUILD_PARALLEL_LEVEL: 4
       - name: Lint with flake8
         run: |
           # stop the build if there are Python syntax errors or undefined names
@@ -194,8 +176,6 @@ jobs:
           python ${{ env.LLM_BENCH_PYPATH }}/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ${{ env.SRC_DIR }} --stateful
           grep beam_idx ${{ env.SRC_DIR }}/pytorch/dldt/FP32/openvino_model.xml
         working-directory: ${{ env.OV_INSTALL_DIR }}
-        env:
-          CMAKE_BUILD_PARALLEL_LEVEL: 4
       - name: WWB Tests
         run: |
           pip install pytest
diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml
index b355cd4f09..497bfbff3e 100644
--- a/.github/workflows/stable_diffusion_1_5_cpp.yml
+++ b/.github/workflows/stable_diffusion_1_5_cpp.yml
@@ -17,29 +17,83 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  PYTHON_VERSION: '3.10'
-  LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241205_x86_64.tgz
-  WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/w_openvino_toolkit_windows_2025.0.0.dev20241205_x86_64.zip
+  PYTHON_VERSION: '3.11'
   OV_INSTALL_DIR: ${{ github.workspace }}/ov
 
 jobs:
+  openvino_download_linux:
+    name: Download OpenVINO for Linux
+    outputs:
+      status: ${{ steps.openvino_download.outcome }}
+      ov_artifact_name: ${{ steps.openvino_download.outputs.ov_artifact_name }}
+      ov_wheel_source: ${{ steps.openvino_download.outputs.ov_wheel_source }}
+      ov_version: ${{ steps.openvino_download.outputs.ov_version }}
+    timeout-minutes: 10
+    defaults:
+      run:
+        shell: bash
+    runs-on: aks-linux-2-cores-8gb
+    container:
+      image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0'
+      volumes: 
+        - /mount:/mount
+        - ${{ github.workspace }}:${{ github.workspace }}
+
+    steps:
+    - uses: openvinotoolkit/openvino/.github/actions/openvino_provider@master
+      id: openvino_download
+      with:
+        platform: ubuntu22
+        commit_packages_to_provide: wheels
+        revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868
+
+  openvino_download_windows:
+    name: Download OpenVINO for Windows
+    outputs:
+      status: ${{ steps.openvino_download.outcome }}
+      ov_artifact_name: ${{ steps.openvino_download.outputs.ov_artifact_name }}
+      ov_wheel_source: ${{ steps.openvino_download.outputs.ov_wheel_source }}
+      ov_version: ${{ steps.openvino_download.outputs.ov_version }}
+    timeout-minutes: 10
+    defaults:
+      run:
+        shell: bash
+    runs-on: aks-linux-2-cores-8gb
+    container:
+      image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0'
+      volumes: 
+        - /mount:/mount
+        - ${{ github.workspace }}:${{ github.workspace }}
+
+    steps:
+    - uses: openvinotoolkit/openvino/.github/actions/openvino_provider@master
+      id: openvino_download
+      with:
+        platform: windows
+        commit_packages_to_provide: wheels
+        revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868
+
   stable_diffusion_1_5_cpp-linux:
-    runs-on: ubuntu-20.04-8-cores
+    runs-on: ubuntu-22.04-8-cores
+    needs: [ openvino_download_linux ]
     defaults:
       run:
         shell: bash -l {0}
     env:
       build_dir: ${{ github.workspace }}//build
+      SRC_DIR: ${{ github.workspace }}
+
     steps:
       - uses: actions/checkout@v4
         with:
           submodules: recursive
 
-      - name: Download OpenVINO archive
-        run: |
-          wget ${{ env.LINUX_OV_ARCHIVE_URL}} --progress=bar:force:noscroll -O openvino_package.tar.gz
-          mkdir ${{ env.OV_INSTALL_DIR }}
-          tar -xzf openvino_package.tar.gz -C ${{ env.OV_INSTALL_DIR }} --strip-components=1
+      - name: Download OpenVINO package
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        with:
+          name: ${{ needs.openvino_download_linux.outputs.ov_artifact_name }}
+          path: ${{ env.OV_INSTALL_DIR }}
+          merge-multiple: true
 
       - name: Setup Python ${{ env.PYTHON_VERSION }}
         uses: actions/setup-python@v5
@@ -58,9 +112,10 @@ jobs:
 
       - name: Install python dependencies
         run: |
-          source openvino_sd_cpp/bin/activate
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-          python -m pip install -r ./samples/requirements.txt
+          source ${{ github.workspace }}/openvino_sd_cpp/bin/activate
+          python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers/[transformers] ${{ needs.openvino_download_linux.outputs.ov_wheel_source }}
+          python -m pip install -r ${{ env.SRC_DIR }}/samples/requirements.txt
+        working-directory: ${{ env.OV_INSTALL_DIR }}
 
       - name: Download and convert models and tokenizer
         run: |
@@ -95,25 +150,26 @@ jobs:
           PYTHONPATH: ${{ env.build_dir }}
 
   stable_diffusion_1_5_cpp-windows:
+    needs: [ openvino_download_windows ]
     runs-on: windows-2019
     defaults:
       run:
         shell: pwsh
     env:
       build_dir: ${{ github.workspace }}\build
+      SRC_DIR: ${{ github.workspace }}
+
     steps:
       - uses: actions/checkout@v4
         with:
           submodules: recursive
 
-      - name: Download OpenVINO archive
-        run: |
-          mkdir ${{ env.OV_INSTALL_DIR }}
-          pushd ${{ env.OV_INSTALL_DIR }}
-            Invoke-WebRequest "${{ env.WINDOWS_OV_ARCHIVE_URL}}" -OutFile "openvino_package.zip"
-            Expand-Archive openvino_package.zip -DestinationPath ./tmp
-            mv ./tmp/*/* .
-          popd
+      - name: Download OpenVINO package
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        with:
+          name: ${{ needs.openvino_download_windows.outputs.ov_artifact_name }}
+          path: ${{ env.OV_INSTALL_DIR }}
+          merge-multiple: true
 
       - name: Setup Python ${{ env.PYTHON_VERSION }}
         uses: actions/setup-python@v5
@@ -132,9 +188,10 @@ jobs:
 
       - name: Install python dependencies
         run: |
-          . "./openvino_sd_cpp/Scripts/Activate.ps1"
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-          python -m pip install -r ./samples/requirements.txt
+          . "${{ github.workspace }}/openvino_sd_cpp/Scripts/Activate.ps1"
+          python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers/[transformers] ${{ needs.openvino_download_windows.outputs.ov_wheel_source }}
+          python -m pip install -r ${{ env.SRC_DIR }}/samples/requirements.txt
+        working-directory: ${{ env.OV_INSTALL_DIR }}
 
       - name: Download and convert models and tokenizer
         run: |

From 4a7374bc1533466a159477760bf1cee1c1b10443 Mon Sep 17 00:00:00 2001
From: Dmitriy Pastushenkov <dmitriy.pastushenkov@intel.com>
Date: Mon, 16 Dec 2024 11:16:19 +0100
Subject: [PATCH 012/110] fix some typos in image2image sample readme (#1388)

fix some typos in image2image sample readme
---
 samples/python/image_generation/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md
index 33da6bd43a..0ddf57d882 100644
--- a/samples/python/image_generation/README.md
+++ b/samples/python/image_generation/README.md
@@ -108,11 +108,11 @@ Also, `strength` parameter linearly affects a number of inferenece steps, becaus
 
 To run the sample, download initial image first:
 
-`wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png`
+`wget -O cat.png https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png`
 
 And then run the sample:
 
-`python image2mage.py ./dreamlike_anime_1_0_ov/FP16 'cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k' cat.png`
+`python image2image.py ./dreamlike_anime_1_0_ov/FP16 'cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k' cat.png`
 
 The resuling image is:
 

From 8ce5eb389179ba82da6523f849944ea3dc8c93e0 Mon Sep 17 00:00:00 2001
From: Irina Efode <irina.efode@intel.com>
Date: Mon, 16 Dec 2024 15:49:38 +0400
Subject: [PATCH 013/110] Update streaming in LM Encoding & CB (#1377)

---
 src/cpp/src/continuous_batching_impl.cpp      |  8 +++---
 src/cpp/src/lm_encoding.cpp                   | 25 +++++++++++--------
 .../speculative_decoding_impl.cpp             |  2 --
 3 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index d27e8934dc..1e42f5b2d9 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -285,9 +285,11 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
         }
         if (streamer_ptr && generations.at(0)->can_read()) {
             std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back();
-            OPENVINO_ASSERT(1 == token.size());
-            OPENVINO_ASSERT(1 == token.begin()->second.generated_ids.size());
-            continue_generation = !streamer_ptr->put(token.begin()->second.generated_ids.at(0));
+            for (const auto& gen_token : token.begin()->second.generated_ids) {
+                if (!streamer_ptr->put(gen_token)) {
+                    break;
+                }
+            }
         }
     }
 
diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index c76d9f7edf..3ab041fa58 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -125,6 +125,17 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
                                                 active_sequence_groups.end(),
                                                 get_active_sequence_groups),
                                  active_sequence_groups.end());
+    
+    auto stream_generated_tokens = [&streamer_ptr, &generations]() {
+        if (streamer_ptr && generations.at(0).get()->can_read()) {
+            std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back();
+            for (const auto& gen_token : token.begin()->second.generated_ids) {
+                if (!streamer_ptr->put(gen_token)) {
+                    break;
+                }
+            }
+        }
+    };
 
     while (active_sequence_groups.size() > 0) {
         size_t total_num_tokens = 0;
@@ -202,13 +213,7 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
         raw_perf_counters.m_new_token_times.emplace_back(infer_end);
         raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
 
-        if (streamer_ptr) {
-            // stream data from first sequence
-            int64_t out_token = sequence_groups.at(0).get()->operator[](0)->get_generated_ids().back();
-            if (streamer_ptr->put(out_token)) {
-                break;
-            }
-        }
+        stream_generated_tokens();
 
         sampler_output = sampler.sample(active_sequence_groups, m_llm.get_tensor("logits"));
 
@@ -218,9 +223,9 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
                                     active_sequence_groups.end());
     }
 
+    // to stream last token
+    stream_generated_tokens();
     if (streamer_ptr) {
-        int64_t out_token = sequence_groups.at(0).get()->operator[](0)->get_generated_ids().back();
-        streamer_ptr->put(out_token);
         streamer_ptr->end();
     }
     
@@ -246,4 +251,4 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
 }
 
 }  // namespace genai
-}  // namespace ov
+}  // namespace ov
\ No newline at end of file
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
index 2be67320a9..e4f3b1ad1f 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -232,8 +232,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
                 continue;
             }
             std::unordered_map<uint64_t, GenerationOutput> token = main_generations.at(0).get()->back();
-            OPENVINO_ASSERT(1 <= token.size());
-            OPENVINO_ASSERT(1 <= token.begin()->second.generated_ids.size());
             for (const auto& gen_token : token.begin()->second.generated_ids) {
                 continue_generation = !streamer_ptr->put(gen_token);
                 if (!continue_generation) {

From 9e9b409a871ac75fe9d515d6ae6d68d882350b35 Mon Sep 17 00:00:00 2001
From: Sofya Balandina <sofya.balandina@intel.com>
Date: Mon, 16 Dec 2024 13:02:45 +0000
Subject: [PATCH 014/110] Use whole history in case of undetermined
 tokenization of sequence (#1254)

Task: [CVS-157295](https://jira.devtools.intel.com/browse/CVS-157295)

- fist commit is cherry-pick from
https://github.com/openvinotoolkit/openvino.genai/pull/1268 and
https://github.com/openvinotoolkit/openvino.genai/pull/1361
- next commit includes applying comments from
https://github.com/openvinotoolkit/openvino.genai/pull/1268 and adding
usage of kv cache for LLM
---
 src/cpp/src/llm_pipeline.cpp                  | 108 +++++++++++++++---
 src/cpp/src/utils.cpp                         |  75 ++++++++++++
 src/cpp/src/utils.hpp                         |  11 ++
 .../src/visual_language/inputs_embedder.cpp   |  73 +++++++++---
 .../src/visual_language/inputs_embedder.hpp   |   7 ++
 src/cpp/src/visual_language/pipeline.cpp      |  27 ++++-
 6 files changed, 264 insertions(+), 37 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 84f76730eb..f663b27dd9 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -36,13 +36,15 @@ std::pair<EncodedResults, int32_t> beam_search(
 class StatefulLLMPipeline final : public LLMPipelineImplBase {
 public:
     ov::InferRequest m_model_runner;
-
     bool is_chat_conversation = false;
-    bool m_is_cache_empty = true;
+    bool m_trust_encoded_history = true;
     std::optional<int32_t> m_selected_beam = std::nullopt;
     ChatHistory m_history;
     std::string m_templated_chat_history = {};
-    TokenizedInputs m_tokenized_chat_history;
+    std::vector<int64_t> m_tokenized_chat_history;
+    ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+    size_t m_to_remove_from_hist = 0;
+    size_t m_kv_cache_seq_length_axis = 2;
 
     StatefulLLMPipeline(
         const ov::InferRequest& request,
@@ -77,6 +79,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         ov::Core core;
         auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
         utils::slice_matmul_statefull_model(model);
+        m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
 
         if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
             m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
@@ -102,8 +105,20 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         OptionalGenerationConfig generation_config,
         StreamerVariant streamer
     ) override {
+        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
+            m_chat_input_type = ov::genai::utils::GenerationChatInputsType::STRING;
+
+        if (is_chat_conversation)
+            OPENVINO_ASSERT(m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS,
+                            "Chat doesn't support switching between input types. Please, continue using EncodedInputs or restart the chat.");
+
         auto start_time = std::chrono::steady_clock::now();
         GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
+        // If eos_token_id was not provided, take value from default m_generation_config
+        if (config.eos_token_id == -1)
+            config.set_eos_token_id(m_generation_config.eos_token_id);
+        config.validate();
+
         TokenizedInputs encoded_input;
 
         if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
@@ -127,19 +142,51 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                 auto new_templated_chat_history  = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
                 // Do not add special tokens in chat scenario to be aligned with HF.
                 auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false));
-                if (m_is_cache_empty) {
+                auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
+
+                // some symbols combinations can be encoded by the tokenizer in different ways
+                // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
+                // so let's check it out, find the trusted part and use it in on the next step
+                size_t last_same_hist_token = 0;
+                if (!m_tokenized_chat_history.empty()) {
+                    std::set<int64_t> stop_tokens = config.stop_token_ids;
+                    last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
+                    m_trust_encoded_history = last_same_hist_token == SIZE_MAX;
+                }
+
+                if (m_tokenized_chat_history.empty()) {
                     encoded_input = new_chat_tokens;
+                } else if (last_same_hist_token != SIZE_MAX) {
+                    m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;
+
+                    ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
+                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token},
+                                                       new_chat_tokens.input_ids.data<int64_t>() + last_same_hist_token);
+
+                    ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape());
+                    std::fill_n(new_attention_mask.data<int64_t>(), new_tensor.get_shape()[1], 1);
+
+                    encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
+                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token});
+                    new_tensor.copy_to(encoded_input.input_ids);
+                    encoded_input.attention_mask = new_attention_mask;
+
+                    m_selected_beam = std::nullopt;
                 } else {
-                    auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
                     encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
                 }
                 m_templated_chat_history = new_templated_chat_history;
-                m_tokenized_chat_history = new_chat_tokens;
+                m_tokenized_chat_history.clear();
+                m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
+                std::copy_n(new_chat_tokens.input_ids.data<int64_t>(), new_chat_tokens.input_ids.get_size(),
+                            std::back_inserter(m_tokenized_chat_history));
+
                 // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
             } else {
                 encoded_input = m_tokenizer.encode(prompt);
             }
         }
+
         auto encode_stop_time =  std::chrono::steady_clock::now();
         auto encoded_results = generate(encoded_input, config, streamer);
 
@@ -188,6 +235,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         OptionalGenerationConfig generation_config,
         StreamerVariant streamer
     ) override {
+        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
+            m_chat_input_type = ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS;
+
+        if (is_chat_conversation)
+            // if chat was run in StringInputs mode, but it was called EncodedInputs generate, last m_history entry will be with assistant role
+            OPENVINO_ASSERT(m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user",
+                            "Chat doesn't support switching between input types. Please, continue using StringInputs or restart the chat.");
+
         auto start_time = std::chrono::steady_clock::now();
         ov::Tensor input_ids;
         ov::Tensor attention_mask;
@@ -199,6 +254,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             attention_mask = data->attention_mask;
         }
 
+        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+            std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
+
         GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
 
         // If eos_token_id was not provided, take value from default m_generation_config
@@ -230,16 +288,17 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                         "(input_ids, attention_mask, position_ids, beam_idx) "
                         "but you have '" + std::to_string(num_inputs) + "' inputs");
 
+        ov::genai::utils::trim_kv_cache(m_model_runner, m_to_remove_from_hist, m_kv_cache_seq_length_axis, m_adapter_controller);
 
         size_t kv_cache_len = 0;
         ov::Tensor concatenated_attention_mask;
-        if (is_chat_conversation && !m_is_cache_empty) {
+        if (is_chat_conversation && !m_tokenized_chat_history.empty()) {
             OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
             // If history is saved in KV cache, concatenate new attention_mask with the already existing.
             // Between subsequent runs attention_mask should not be modified.
             auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
             auto prompt_len = attention_mask.get_shape()[1];
-            kv_cache_len = atten_mask_history.get_shape()[1];
+            kv_cache_len = atten_mask_history.get_shape()[1] - m_to_remove_from_hist;
 
             ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
             auto start_atten_hst = atten_mask_history.data<int64_t>() + kv_cache_len * (*m_selected_beam);
@@ -263,6 +322,11 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             m_adapter_controller->apply(m_model_runner, config.adapters);
         }
 
+        if (is_chat_conversation && !m_trust_encoded_history) {
+            m_trust_encoded_history = true;
+            m_to_remove_from_hist = 0;
+        }
+
         ov::genai::EncodedResults result;
         if (config.is_beam_search() && is_chat_conversation) {
             std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_ids, concatenated_attention_mask,
@@ -274,8 +338,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
             for (size_t request_id = 0; request_id < batch_size; request_id++) {
                 SequenceGroup::Ptr sequence_group;
-                if (is_chat_conversation && !m_is_cache_empty) {
-                    sequence_group = std::make_shared<SequenceGroup>(request_id, m_tokenized_chat_history.input_ids, config, block_size, enable_prefix_caching);
+                if (is_chat_conversation) {
+                    ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
+                    sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
                 } else {
                     size_t seq_len = input_ids.get_shape().at(1);
                     size_t batch_offset = request_id * seq_len;
@@ -294,12 +359,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                                                                                   sampler, requests, position_ids, std::nullopt, m_selected_beam);
         }
 
-        if (!is_chat_conversation) {
+        if (is_chat_conversation) {
+            std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+        } else {
             reset_kv_state();
             m_selected_beam = std::nullopt;
-        } else {
-            m_is_cache_empty = false;
         }
+
         auto stop_time = std::chrono::steady_clock::now();
 
         // If is called without tokenization then that stat will not be reported.
@@ -313,12 +379,15 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
     void start_chat(const std::string& system_message) override {
         is_chat_conversation = true;
-        m_selected_beam  = std::nullopt;
-        if (!m_is_cache_empty) {
+        m_selected_beam = std::nullopt;
+        m_trust_encoded_history = true;
+        m_to_remove_from_hist = 0;
+        m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+        if (!m_tokenized_chat_history.empty()) {
             reset_kv_state();
-            m_is_cache_empty = true;
             m_history = {};
             m_templated_chat_history = "";
+            m_tokenized_chat_history.clear();
         }
         if (system_message.empty())
             return;
@@ -332,11 +401,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
     void finish_chat() override {
         is_chat_conversation = false;
         m_selected_beam = std::nullopt;
-        if (!m_is_cache_empty) {
+        m_trust_encoded_history = true;
+        m_to_remove_from_hist = 0;
+        m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+        if (!m_tokenized_chat_history.empty()) {
             reset_kv_state();
-            m_is_cache_empty = true;
             m_history.clear();
             m_templated_chat_history.clear();
+            m_tokenized_chat_history.clear();
         }
     }
 };
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index 337b0ab47e..3690920295 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -13,6 +13,8 @@
 #include "openvino/op/tanh.hpp"
 #include "openvino/op/transpose.hpp"
 
+#include "sampler.hpp"
+
 namespace ov {
 namespace genai {
 namespace utils {
@@ -306,6 +308,79 @@ ov::Core singleton_core() {
     return core;
 }
 
+size_t get_first_history_difference(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history, std::set<int64_t> stop_tokens) {
+    size_t idx = 0;
+    auto encoded_history_data = encoded_history.data<int64_t>();
+    while(idx < encoded_history.get_size() && idx < tokenized_history.size()) {
+        if (encoded_history_data[idx] != tokenized_history[idx])
+            break;
+        idx++;
+    }
+
+    // encoded_history after decode of tokenizer could lose one last token (eos/stop token)
+    if ((idx == tokenized_history.size() && idx == encoded_history.get_size()) ||
+        (encoded_history.get_size() < tokenized_history.size() && idx == tokenized_history.size() - 1 && stop_tokens.find(tokenized_history.back()) != stop_tokens.end()))
+        return SIZE_MAX;
+    else
+        return idx;
+}
+
+size_t get_seq_len_axis(std::shared_ptr<const ov::Model> model) {
+    // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size],
+    // therefore usually seq_length_axis = 2
+    size_t seq_length_axis = 2;
+
+    // "ReadValue" node is KV cache representation in stateful model
+    std::string kv_node_type_name = std::string(ov::op::v6::ReadValue::get_type_info_static().name);
+
+    for (const auto op : model->get_ops()) {
+        // check input size, as in LoRA adapters case it could be 0
+        if (op->get_type_name() != kv_node_type_name || op->get_input_size() < 1) {
+            continue;
+        }
+
+        // Shape example: [-1,4,0,64]
+        auto shape = op->get_input_partial_shape(0);
+
+        for (size_t i = 0; i < shape.rank().get_length(); i++) {
+            // Find axis = 0. This would be sequence length axis.
+            if (shape[i] == 0) {
+                seq_length_axis = i;
+            }
+        }
+        break;
+    }
+
+    return seq_length_axis;
+}
+
+void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional<AdapterController> adapter_controller) {
+    // nothing to trim in this case
+    if (remove_from_end == 0)
+        return;
+
+    auto states = request.query_state();
+    for (auto& state : states) {
+        if(adapter_controller && adapter_controller->has_state_name(state.get_name()))
+            continue;
+
+        ov::Tensor old_tensor = state.get_state();
+        // [BATCH_SIZE, num_kv_heads, seq_len, head_size]
+        auto shape = old_tensor.get_shape();
+        shape[seq_length_axis] -= remove_from_end;
+
+        ov::Coordinate new_shape_begin{0, 0, 0, 0};
+        ov::Coordinate new_shape_end{shape};
+
+        auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end);
+
+        ov::Tensor new_tensor(old_tensor.get_element_type(), shape);
+        trimmed_tensor.copy_to(new_tensor);
+
+        state.set_state(new_tensor);
+    }
+}
+
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 792987d383..57728cd0dc 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -22,6 +22,11 @@ constexpr bool is_container<T,
     std::void_t<decltype(std::declval<T>().begin()),
                 decltype(std::declval<T>().end())>> = true;
 
+enum class GenerationChatInputsType {
+    UNDEF = 0, // Default value, type of inputs is not defined
+    STRING = 1, // Type of inputs is StringInputs
+    ENCODED_INPUTS = 2, // Type of inputs is EncodedInputs
+};
 
 Tensor init_attention_mask(const Tensor& position_ids);
 
@@ -93,6 +98,12 @@ ov::Core singleton_core();
 template <typename T>
 void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value);
 
+size_t get_first_history_difference(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history, std::set<int64_t> stop_tokens);
+
+size_t get_seq_len_axis(std::shared_ptr<const ov::Model> model);
+
+void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional<AdapterController> adapter_controller);
+
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index ced17a2ebd..dfdb1521ef 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -39,8 +39,11 @@ class InputsEmbedder::IInputsEmbedder {
     ChatHistory m_history;
     // Templated chat history
     std::string m_templated_chat_history;
-    // Whether we have computed some inputs already
-    bool m_is_cache_empty = true;
+    // Tokenized chat history
+    std::vector<int64_t> m_tokenized_chat_history;
+    // The number of elements, which need to remove from the end of KV cache
+    // removed elements will be added to inputs_ids
+    size_t m_to_remove_from_hist = 0;
 
 public:
     virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
@@ -53,12 +56,26 @@ class InputsEmbedder::IInputsEmbedder {
         return m_tokenizer;
     }
 
+    std::vector<int64_t> get_tokenized_chat_history() const {
+        return m_tokenized_chat_history;
+    }
+
+    size_t get_amount_to_remove_from_hist() const {
+        return m_to_remove_from_hist;
+    }
+
+    void update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
+        std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_chat_history));
+        m_to_remove_from_hist = 0;
+    }
+
     virtual void start_chat(const std::string& system_message) {
         m_is_chat_conversation = true;
-        if (!m_is_cache_empty) {
+        m_to_remove_from_hist = 0;
+        if (!m_tokenized_chat_history.empty()) {
             m_history.clear();
             m_templated_chat_history.clear();
-            m_is_cache_empty = true;
+            m_tokenized_chat_history.clear();
         }
         if (system_message.empty()) {
             return;
@@ -77,10 +94,11 @@ class InputsEmbedder::IInputsEmbedder {
 
     virtual void finish_chat() {
         m_is_chat_conversation = false;
-        m_is_cache_empty = true;
+        m_to_remove_from_hist = 0;
 
         m_history.clear();
         m_templated_chat_history.clear();
+        m_tokenized_chat_history.clear();
     }
 
 protected:
@@ -92,7 +110,7 @@ class InputsEmbedder::IInputsEmbedder {
         m_vlm_config{vlm_config},
         m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config),
         m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config),
-        m_tokenizer{model_dir.string(), device_config} { }
+        m_tokenizer{model_dir, device_config} { }
     
     IInputsEmbedder(
         const VLMConfig& vlm_config,
@@ -140,15 +158,28 @@ class InputsEmbedder::IInputsEmbedder {
                 new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback);
             }
             auto start_tokenizer_time = std::chrono::steady_clock::now();
-            ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids;
-            if (m_is_cache_empty) {
+            ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids;
+            TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
+
+            // some symbols combinations can be encoded by the tokenizer in different ways
+            // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
+            // so let's check it out, find the trusted part and use it in on the next step
+            size_t last_same_hist_token = 0;
+            if (!m_tokenized_chat_history.empty()) {
+                std::set<int64_t> stop_tokens = {m_tokenizer.get_eos_token_id()};
+                last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
+            }
+
+            if (m_tokenized_chat_history.empty()) {
                 encoded_input_ids = new_chat_tokens;
-                // after first `get_inputs_embeds` is called, we supposed LLM is inferred and cache is not empty
-                m_is_cache_empty = false;
+            } else if (last_same_hist_token != SIZE_MAX) {
+                m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;
+
+                ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(),
+                                                   {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token},
+                                                   new_chat_tokens.data<int64_t>() + last_same_hist_token);
+                encoded_input_ids = new_tensor;
             } else {
-                TokenizedInputs prev_chat_tokens = m_tokenizer.encode(
-                    m_templated_chat_history
-                );
                 encoded_input_ids = utils::subtract_chat_tokenized_inputs(
                     {new_chat_tokens}, prev_chat_tokens
                 ).input_ids;
@@ -156,6 +187,9 @@ class InputsEmbedder::IInputsEmbedder {
             auto end_tokenizer_time = std::chrono::steady_clock::now();
             metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
             m_templated_chat_history = std::move(new_templated_chat_history);
+            m_tokenized_chat_history.clear();
+            std::copy(new_chat_tokens.data<int64_t>(), new_chat_tokens.data<int64_t>() + new_chat_tokens.get_size(),
+                        std::back_inserter(m_tokenized_chat_history));
         } else {
             auto start_tokenizer_time = std::chrono::steady_clock::now();
             encoded_input_ids = m_tokenizer.encode(prompt).input_ids;
@@ -639,7 +673,6 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder {
                 merged_idx++;
             }
         }
-
         return merged_embeds;
     }
 };
@@ -1138,6 +1171,18 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const {
     return m_impl->get_embedding_model();
 }
 
+std::vector<int64_t> InputsEmbedder::get_tokenized_chat_history() const {
+    return m_impl->get_tokenized_chat_history();
+}
+
+void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
+    return m_impl->update_tokenized_chat_history(encoded_result);
+}
+
+size_t InputsEmbedder::get_amount_to_remove_from_hist() const {
+    return m_impl->get_amount_to_remove_from_hist();
+}
+
 Tokenizer InputsEmbedder::get_tokenizer() const {
     return m_impl->get_tokenizer();
 }
diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp
index 0e3a3533e2..5c5b9d2b81 100644
--- a/src/cpp/src/visual_language/inputs_embedder.hpp
+++ b/src/cpp/src/visual_language/inputs_embedder.hpp
@@ -40,6 +40,13 @@ class InputsEmbedder {
     // returns tokenizer
     Tokenizer get_tokenizer() const;
 
+    // returns tokenized chat history
+    std::vector<int64_t> get_tokenized_chat_history() const;
+    // add new results to tokenized chat history
+    void update_tokenized_chat_history(std::vector<int64_t> encoded_result);
+    // returns amount of elements, which need to remove from the end of the KV cache
+    size_t get_amount_to_remove_from_hist() const;
+
     // starts chat and adds optional system_message to chat history
     void start_chat(const std::string& system_message);
     // adds currently generated text to chat history
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index f7508acb35..b8e89a8e04 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -64,6 +64,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
     std::shared_ptr<InputsEmbedder> m_inputs_embedder;
     // Load pipeline time
     float m_load_time_ms = 0;
+    // Axis num in kv cache from m_language model, which contains information about history len
+    size_t m_kv_cache_seq_length_axis = 2;
 
     VLMPipelineImpl(
         const std::filesystem::path& models_dir,
@@ -87,9 +89,14 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         m_tokenizer = m_inputs_embedder->get_tokenizer();
         m_embedding = m_inputs_embedder->get_embedding_model();
 
-        m_language = utils::singleton_core().compile_model(
+        auto compiled_language_model = utils::singleton_core().compile_model(
             models_dir / "openvino_language_model.xml", device, properties
-        ).create_infer_request();
+        );
+
+        auto language_model = compiled_language_model.get_runtime_model();
+        m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(language_model);
+
+        m_language = compiled_language_model.create_infer_request();
 
         m_language.get_tensor("attention_mask").set_shape({1, 0});
 
@@ -153,14 +160,20 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics);
         auto end_get_inputs_embeds = std::chrono::steady_clock::now();
 
+        auto to_remove_from_hist = m_inputs_embedder->get_amount_to_remove_from_hist();
+        ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, m_kv_cache_seq_length_axis, std::nullopt);
+
         Sampler sampler = Sampler(m_tokenizer);
 
         std::vector<SequenceGroup::Ptr> requests;
         size_t request_id = 0;
         size_t block_size = 1; // not used
         bool enable_prefix_caching = false;
-        size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1);
+
+        auto tokenized_chat_history = m_inputs_embedder->get_tokenized_chat_history();
+        size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - to_remove_from_hist;
         size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);
+
         ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size });
         std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), 0);
 
@@ -185,10 +198,10 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         OPENVINO_ASSERT((generation_config.is_greedy_decoding() || generation_config.is_multinomial() || !streamer_ptr),
                         "Currently streaming is possible only for greedy or multinomial decoding");
 
-        ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, { 1, history_size + inputs_embeds.get_shape()[1] }};
+        ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, { 1, history_size + inputs_embeds_size }};
         std::fill_n(new_atten_mask.data<int64_t>(), new_atten_mask.get_size(), 1);
 
-        ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds.get_shape()[1] }};
+        ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
         std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), history_size);
 
         ov::genai::EncodedResults encoded_result;
@@ -211,6 +224,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             m_language.reset_state();
             m_language.get_tensor("attention_mask").set_shape({1, 0});
         }
+
         auto generate_end_time = std::chrono::steady_clock::now();
         decoded.perf_metrics = encoded_result.perf_metrics;
 
@@ -228,6 +242,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         // Evaluate statistics
         decoded.perf_metrics.m_evaluated = false;
         decoded.perf_metrics.evaluate_statistics(generate_start_time);
+
+        m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0]);
+
         return decoded;
     }
 

From 7548c4c49c0a91da12c11faf71658bab8a27a3f9 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 16 Dec 2024 20:11:57 +0400
Subject: [PATCH 015/110] Image generation: added TorchGenerator and rng_seed
 (#1379)

- Added `TorchGenerator` which wraps `torch.Generator`. It throws an
exception is `torch` is not available.
- Added `rng_seed` parameter to `ImageGenerationConfig` which has lower
priority compared with `generator` when they both are specified to
`generate()` or `ImageGenerationConfig::update_generation_config`
---
 README.md                                     |  12 +-
 samples/cpp/image_generation/README.md        |  11 +-
 .../cpp/image_generation/lora_text2image.cpp  |   8 +-
 samples/python/image_generation/README.md     |   8 +-
 samples/python/image_generation/baseline.bmp  |   3 -
 .../heterogeneous_stable_diffusion.py         |   3 +-
 samples/python/image_generation/lora.bmp      |   3 -
 .../image_generation/lora_text2image.py       |  24 +---
 .../python/image_generation/text2image.bmp    |   3 -
 samples/python/image_generation/text2image.py |  15 +-
 .../openvino/genai/generation_config.hpp      |   8 +-
 .../image_generation/generation_config.hpp    |  32 ++++-
 src/cpp/src/generation_config.cpp             |   7 +-
 .../src/image_generation/flux_pipeline.hpp    |   1 -
 .../image_generation/generation_config.cpp    |  28 +++-
 .../stable_diffusion_3_pipeline.hpp           |   6 -
 .../stable_diffusion_pipeline.hpp             |   6 -
 src/python/openvino_genai/__init__.py         |   2 +-
 src/python/openvino_genai/__init__.pyi        |   3 +-
 .../openvino_genai/py_openvino_genai.pyi      |  26 +++-
 src/python/py_image_generation_pipelines.cpp  | 134 +++++++++++++++---
 .../whowhatbench/text2image_evaluator.py      |  13 +-
 22 files changed, 223 insertions(+), 133 deletions(-)
 delete mode 100644 samples/python/image_generation/baseline.bmp
 delete mode 100644 samples/python/image_generation/lora.bmp
 delete mode 100644 samples/python/image_generation/text2image.bmp

diff --git a/README.md b/README.md
index 4892c86f10..680bc9bc65 100644
--- a/README.md
+++ b/README.md
@@ -194,12 +194,7 @@ import openvino_genai
 
 device = 'CPU'  # GPU can be used as well
 pipe = openvino_genai.Text2ImagePipeline("./dreamlike_anime_1_0_ov/INT8", device)
-image_tensor = pipe.generate(
-    "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting",
-    width=512,
-    height=512,
-    num_inference_steps=20
-)
+image_tensor = pipe.generate("cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting")
 
 image = Image.fromarray(image_tensor.data[0])
 image.save("image.bmp")
@@ -218,10 +213,7 @@ int main(int argc, char* argv[]) {
    const std::string device = "CPU";  // GPU can be used as well
 
    ov::genai::Text2ImagePipeline pipe(models_path, device);
-   ov::Tensor image = pipe.generate(prompt,
-        ov::genai::width(512),
-        ov::genai::height(512),
-        ov::genai::num_inference_steps(20));
+   ov::Tensor image = pipe.generate(prompt);
 
    imwrite("image.bmp", image, true);
 }
diff --git a/samples/cpp/image_generation/README.md b/samples/cpp/image_generation/README.md
index 8a5cc5aa19..f8dc21cc39 100644
--- a/samples/cpp/image_generation/README.md
+++ b/samples/cpp/image_generation/README.md
@@ -20,6 +20,10 @@ Users can change the sample code and play with the following generation paramete
 - Apply multiple different LoRA adapters and mix them with different blending coefficients
 - (Image to image and inpainting) Play with `strength` parameter to control how initial image is noised and reduce number of inference steps
 
+
+> [!NOTE]
+> Image generated with HuggingFace / Optimum Intel is not the same generated by this C++ sample: C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor` (uses `torch.Generator` inside). So, it's expected that image generated by Diffusers and C++ versions provide different images, because latent images are initialize differently.
+
 ## Download and convert the models and tokenizers
 
 The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
@@ -88,13 +92,6 @@ With adapter | Without adapter
 :---:|:---:
 ![](./lora.bmp) | ![](./baseline.bmp)
 
-
-## Note
-
-- Image generated with HuggingFace / Optimum Intel is not the same generated by this C++ sample:
-
-C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor`. So, it's expected that image generated by Python and C++ versions provide different images, because latent images are initialize differently. Users can implement their own random generator derived from `ov::genai::Generator` and pass it to `Text2ImagePipeline::generate` method.
-
 ## Run text to image with multiple devices
 
 The `heterogeneous_stable_diffusion` sample demonstrates how a Text2ImagePipeline object can be created from individual subcomponents - scheduler, text encoder, unet, & vae decoder. This approach gives fine-grained control over the devices used to execute each stage of the stable diffusion pipeline.
diff --git a/samples/cpp/image_generation/lora_text2image.cpp b/samples/cpp/image_generation/lora_text2image.cpp
index 3fe4b74ff6..c1e6461db9 100644
--- a/samples/cpp/image_generation/lora_text2image.cpp
+++ b/samples/cpp/image_generation/lora_text2image.cpp
@@ -24,19 +24,19 @@ int32_t main(int32_t argc, char* argv[]) try {
 
     std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n";
     ov::Tensor image = pipe.generate(prompt,
-        ov::genai::generator(std::make_shared<ov::genai::CppStdGenerator>(42)),
         ov::genai::width(512),
         ov::genai::height(896),
-        ov::genai::num_inference_steps(20));
+        ov::genai::num_inference_steps(20),
+        ov::genai::rng_seed(42));
     imwrite("lora.bmp", image, true);
 
     std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n";
     image = pipe.generate(prompt,
         ov::genai::adapters(),  // passing adapters in generate overrides adapters set in the constructor; adapters() means no adapters
-        ov::genai::generator(std::make_shared<ov::genai::CppStdGenerator>(42)),
         ov::genai::width(512),
         ov::genai::height(896),
-        ov::genai::num_inference_steps(20));
+        ov::genai::num_inference_steps(20),
+        ov::genai::rng_seed(42));
     imwrite("baseline.bmp", image, true);
 
     return EXIT_SUCCESS;
diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md
index 0ddf57d882..3e53f40fc4 100644
--- a/samples/python/image_generation/README.md
+++ b/samples/python/image_generation/README.md
@@ -20,6 +20,10 @@ Users can change the sample code and play with the following generation paramete
 - Apply multiple different LoRA adapters and mix them with different blending coefficients
 - (Image to image and inpainting) Play with `strength` parameter to control how initial image is noised and reduce number of inference steps
 
+> [!NOTE]  
+> OpenVINO GenAI is written in C++ and uses `CppStdGenerator` random generator in Image Generation pipelines, while Diffusers library uses `torch.Generator` underhood.
+> To have the same results with HuggingFace, pass manually created `torch.Generator(device='cpu').manual_seed(seed)` to Diffusers generation pipelines and `openvino_genai.TorchGenerator(seed)` to OpenVINO GenAI pipelines as value for `generator` kwarg.
+
 ## Download and convert the models and tokenizers
 
 The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
@@ -41,7 +45,7 @@ Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pi
 
 Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting`
 
-   ![](./text2image.bmp)
+   ![](./../../cpp/image_generation/512x512.bmp)
 
 ### Run with callback
 
@@ -85,7 +89,7 @@ Check the difference:
 
 With adapter | Without adapter
 :---:|:---:
-![](./lora.bmp) | ![](./baseline.bmp)
+![](./../../cpp/image_generation/lora.bmp) | ![](./../../cpp/image_generation/baseline.bmp)
 
 ## Run text to image with multiple devices
 
diff --git a/samples/python/image_generation/baseline.bmp b/samples/python/image_generation/baseline.bmp
deleted file mode 100644
index 1501f5960e..0000000000
--- a/samples/python/image_generation/baseline.bmp
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ea0b60b64c4448448140a3cfb5e8609248ad35abd484ace1467d832e6966c941
-size 1376310
diff --git a/samples/python/image_generation/heterogeneous_stable_diffusion.py b/samples/python/image_generation/heterogeneous_stable_diffusion.py
index b1a2f9d5de..18f150816e 100644
--- a/samples/python/image_generation/heterogeneous_stable_diffusion.py
+++ b/samples/python/image_generation/heterogeneous_stable_diffusion.py
@@ -101,8 +101,7 @@ def main():
             height=height,
             guidance_scale=guidance_scale,
             num_inference_steps=number_of_inference_steps_per_image,
-            num_images_per_prompt=1,
-            generator=openvino_genai.CppStdGenerator(42)
+            num_images_per_prompt=1
         )
 
         image = Image.fromarray(image_tensor.data[0])
diff --git a/samples/python/image_generation/lora.bmp b/samples/python/image_generation/lora.bmp
deleted file mode 100644
index a0aaedb930..0000000000
--- a/samples/python/image_generation/lora.bmp
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:804bb8d49f1702422abf57c300af75fe75acbef60a9cf8ad5cfc9262b7532c95
-size 1376310
diff --git a/samples/python/image_generation/lora_text2image.py b/samples/python/image_generation/lora_text2image.py
index 95e31ca0ea..6a46099dc2 100644
--- a/samples/python/image_generation/lora_text2image.py
+++ b/samples/python/image_generation/lora_text2image.py
@@ -6,20 +6,6 @@
 
 import openvino as ov
 import openvino_genai
-import numpy as np
-import sys
-
-
-class Generator(openvino_genai.Generator):
-    def __init__(self, seed, mu=0.0, sigma=1.0):
-        openvino_genai.Generator.__init__(self)
-        np.random.seed(seed)
-        self.mu = mu
-        self.sigma = sigma
-
-    def next(self):
-        return np.random.normal(self.mu, self.sigma)
-
 
 def image_write(path: str, image_tensor: ov.Tensor):
     from PIL import Image
@@ -46,23 +32,23 @@ def main():
 
     # LoRA adapters passed to the constructor will be activated by default in next generates
     pipe = openvino_genai.Text2ImagePipeline(args.models_path, device, adapters=adapter_config)
+
     print("Generating image with LoRA adapters applied, resulting image will be in lora.bmp")
     image = pipe.generate(prompt,
-                          generator=Generator(42),
                           width=512,
                           height=896,
-                          num_inference_steps=20)
+                          num_inference_steps=20,
+                          rng_seed=42)
 
     image_write("lora.bmp", image)
     print("Generating image without LoRA adapters applied, resulting image will be in baseline.bmp")
     image = pipe.generate(prompt,
                           # passing adapters in generate overrides adapters set in the constructor; openvino_genai.AdapterConfig() means no adapters
                           adapters=openvino_genai.AdapterConfig(),
-                          generator=Generator(42),
                           width=512,
                           height=896,
-                          num_inference_steps=20
-                          )
+                          num_inference_steps=20,
+                          rng_seed=42)
     image_write("baseline.bmp", image)
 
 
diff --git a/samples/python/image_generation/text2image.bmp b/samples/python/image_generation/text2image.bmp
deleted file mode 100644
index 54974556a4..0000000000
--- a/samples/python/image_generation/text2image.bmp
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7c150896ec84f64d4f0cacd67f8f277e08d3ebb1c9a756d43fc80944db7a2ed4
-size 786486
diff --git a/samples/python/image_generation/text2image.py b/samples/python/image_generation/text2image.py
index 95d8c68e82..cba1eefd1d 100644
--- a/samples/python/image_generation/text2image.py
+++ b/samples/python/image_generation/text2image.py
@@ -6,17 +6,6 @@
 
 import openvino_genai
 from PIL import Image
-import numpy as np
-
-class Generator(openvino_genai.Generator):
-    def __init__(self, seed, mu=0.0, sigma=1.0):
-        openvino_genai.Generator.__init__(self)
-        np.random.seed(seed)
-        self.mu = mu
-        self.sigma = sigma
-
-    def next(self):
-        return np.random.normal(self.mu, self.sigma)
 
 
 def main():
@@ -33,9 +22,7 @@ def main():
         width=512,
         height=512,
         num_inference_steps=20,
-        num_images_per_prompt=1,
-        generator=Generator(42)  # openvino_genai.CppStdGenerator can be used to have same images as C++ sample
-    )
+        num_images_per_prompt=1)
 
     image = Image.fromarray(image_tensor.data[0])
     image.save("image.bmp")
diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index 2402f57fba..9d79240aa8 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -67,9 +67,9 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
  * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
  * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
  * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
- * @param presence_penalty reduces absolute log prob if the token was generated at least once. Ignored for non continuous batching.
- * @param frequency_penalty reduces absolute log prob as many times as the token was generated. Ignored for non continuous batching.
- * @param rng_seed initializes random generator. Ignored for non continuous batching.
+ * @param presence_penalty reduces absolute log prob if the token was generated at least once.
+ * @param frequency_penalty reduces absolute log prob as many times as the token was generated.
+ * @param rng_seed initializes random generator.
  *
  * Speculative decoding parameters:
  * @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of static strategy candidates number update.
@@ -174,7 +174,7 @@ static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};
 static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};
 static constexpr ov::Property<float> presence_penalty{"presence_penalty"};
 static constexpr ov::Property<float> frequency_penalty{"frequency_penalty"};
-static constexpr ov::Property<size_t> rng_seed{"rng_seed"};
+extern OPENVINO_GENAI_EXPORTS ov::Property<size_t> rng_seed;
 
 static constexpr ov::Property<float> assistant_confidence_threshold{"assistant_confidence_threshold"};
 static constexpr ov::Property<size_t> num_assistant_tokens{"num_assistant_tokens"};
diff --git a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp
index 50e576466d..bd7073520a 100644
--- a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp
@@ -39,6 +39,12 @@ class OPENVINO_GENAI_EXPORTS Generator {
      */
     virtual ov::Tensor randn_tensor(const ov::Shape& shape);
 
+    /**
+     * Sets a new initial seed value to random generator
+     * @param new_seed A new seed value
+     */
+    virtual void seed(size_t new_seed) = 0;
+
     /**
      * Default dtor defined to ensure working RTTI.
      */
@@ -58,9 +64,11 @@ class OPENVINO_GENAI_EXPORTS CppStdGenerator : public Generator {
 
     virtual float next() override;
 
+    virtual void seed(size_t new_seed) override;
+
 private:
-    std::mt19937 gen;
-    std::normal_distribution<float> normal;
+    std::mt19937 m_gen;
+    std::normal_distribution<float> m_normal;
 };
 
 /**
@@ -81,9 +89,17 @@ struct OPENVINO_GENAI_EXPORTS ImageGenerationConfig {
     size_t num_images_per_prompt = 1;
 
     /**
-     * Random generator to initial latents, add noise to initial images in case of image to image / inpainting pipelines
+     * Random generator to initialize latents, add noise to initial images in case of image to image / inpainting pipelines
+     * By default, random generator is initialized as `CppStdGenerator(generation_config.rng_seed)`
+     * @note If `generator` is specified, it has higher priority than `rng_seed` parameter.
+     */
+    std::shared_ptr<Generator> generator = nullptr;
+
+    /**
+     * Seed for random generator
+     * @note If `generator` is specified, it has higher priority than `rng_seed` parameter.
      */
-    std::shared_ptr<Generator> generator = std::make_shared<CppStdGenerator>(42);
+    size_t rng_seed = 42;
 
     float guidance_scale = 7.5f;
     int64_t height = -1;
@@ -91,7 +107,7 @@ struct OPENVINO_GENAI_EXPORTS ImageGenerationConfig {
     size_t num_inference_steps = 50;
 
     /**
-     * Max sequence lenght for T4 encoder / tokenizer used in SD3 / FLUX models
+     * Max sequence length for T5 encoder / tokenizer used in SD3 / FLUX models
      */
     int max_sequence_length = -1;
 
@@ -203,6 +219,12 @@ static constexpr ov::Property<float> strength{"strength"};
  */
 static constexpr ov::Property<std::shared_ptr<Generator>> generator{"generator"};
 
+/**
+ * Seed for random generator
+ * @note If `generator` is specified, it has higher priority than `rng_seed` parameter.
+ */
+extern OPENVINO_GENAI_EXPORTS ov::Property<size_t> rng_seed;
+
 /**
  * This parameters limits max sequence length for T5 encoder for SD3 and FLUX models.
  * T5 tokenizer output is padded with pad tokens to 'max_sequence_length' within a pipeline.
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 0829e8376a..189cfeded7 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -14,6 +14,8 @@
 namespace ov {
 namespace genai {
 
+ov::Property<size_t> rng_seed{"rng_seed"};
+
 GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) {
     using utils::read_json_param;
 
@@ -21,7 +23,7 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) {
     OPENVINO_ASSERT(f.is_open(), "Failed to open '", json_path, "' with generation config");
 
     nlohmann::json data = nlohmann::json::parse(f);
-    
+
     read_json_param(data, "max_new_tokens", max_new_tokens);
     read_json_param(data, "max_length", max_length);
     // note that ignore_eos is not present in HF GenerationConfig
@@ -103,6 +105,9 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) {
     read_anymap_param(config_map, "echo", echo);
     read_anymap_param(config_map, "logprobs", logprobs);
     read_anymap_param(config_map, "adapters", adapters);
+
+    // TODO: add support of 'generator' property similar to Image generation
+    read_anymap_param(config_map, "rng_seed", rng_seed);
 }
 
 size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const {
diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp
index 716ba6b61b..e74cd441ce 100644
--- a/src/cpp/src/image_generation/flux_pipeline.hpp
+++ b/src/cpp/src/image_generation/flux_pipeline.hpp
@@ -4,7 +4,6 @@
 #pragma once
 
 #include <cassert>
-#include <ctime>
 
 #include "image_generation/diffusion_pipeline.hpp"
 #include "image_generation/numpy_utils.hpp"
diff --git a/src/cpp/src/image_generation/generation_config.cpp b/src/cpp/src/image_generation/generation_config.cpp
index 938034f628..ab098fabe5 100644
--- a/src/cpp/src/image_generation/generation_config.cpp
+++ b/src/cpp/src/image_generation/generation_config.cpp
@@ -27,11 +27,15 @@ ov::Tensor Generator::randn_tensor(const ov::Shape& shape) {
 }
 
 CppStdGenerator::CppStdGenerator(uint32_t seed)
-    : gen(seed), normal(0.0f, 1.0f) {
+    : m_gen(seed), m_normal(0.0f, 1.0f) {
 }
 
 float CppStdGenerator::next() {
-    return normal(gen);
+    return m_normal(m_gen);
+}
+
+void CppStdGenerator::seed(size_t new_seed) {
+    m_gen.seed(new_seed);
 }
 
 //
@@ -55,7 +59,6 @@ void ImageGenerationConfig::update_generation_config(const ov::AnyMap& propertie
     read_anymap_param(properties, "negative_prompt_2", negative_prompt_2);
     read_anymap_param(properties, "negative_prompt_3", negative_prompt_3);
     read_anymap_param(properties, "num_images_per_prompt", num_images_per_prompt);
-    read_anymap_param(properties, "generator", generator);
     read_anymap_param(properties, "guidance_scale", guidance_scale);
     read_anymap_param(properties, "height", height);
     read_anymap_param(properties, "width", width);
@@ -64,6 +67,25 @@ void ImageGenerationConfig::update_generation_config(const ov::AnyMap& propertie
     read_anymap_param(properties, "adapters", adapters);
     read_anymap_param(properties, "max_sequence_length", max_sequence_length);
 
+    // 'generator' has higher priority than 'seed' parameter
+    const bool have_generator_param = properties.find(ov::genai::generator.name()) != properties.end();
+    if (have_generator_param) {
+        read_anymap_param(properties, "generator", generator);
+    } else {
+        read_anymap_param(properties, "rng_seed", rng_seed);
+
+        // initialize random generator with a given seed value
+        if (!generator) {
+            generator = std::make_shared<CppStdGenerator>(rng_seed);
+        }
+
+        const bool have_rng_seed = properties.find(ov::genai::rng_seed.name()) != properties.end();
+        if (have_rng_seed) {
+            // we need to change seed as an user have specified it manually
+            generator->seed(rng_seed);
+        }
+    }
+
     validate();
 }
 
diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
index 18a3e0346f..e3e720109d 100644
--- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
@@ -4,7 +4,6 @@
 #pragma once
 
 #include <cassert>
-#include <ctime>
 
 #include "image_generation/diffusion_pipeline.hpp"
 #include "image_generation/numpy_utils.hpp"
@@ -453,11 +452,6 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
 
         check_inputs(generation_config, initial_image);
 
-        if (generation_config.generator == nullptr) {
-            uint32_t seed = time(NULL);
-            generation_config.generator = std::make_shared<CppStdGenerator>(seed);
-        }
-
         // 3. Prepare timesteps
         m_scheduler->set_timesteps(generation_config.num_inference_steps, generation_config.strength);
         std::vector<float> timesteps = m_scheduler->get_float_timesteps();
diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
index 4afbd3ac78..7549b67919 100644
--- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include <ctime>
 #include <cassert>
 #include <filesystem>
 
@@ -333,11 +332,6 @@ class StableDiffusionPipeline : public DiffusionPipeline {
 
         set_lora_adapters(generation_config.adapters);
 
-        if (generation_config.generator == nullptr) {
-            uint32_t seed = time(NULL);
-            generation_config.generator = std::make_shared<CppStdGenerator>(seed);
-        }
-
         m_scheduler->set_timesteps(generation_config.num_inference_steps, generation_config.strength);
         std::vector<std::int64_t> timesteps = m_scheduler->get_timesteps();
 
diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py
index ca7c2c0b32..470ddd0cd8 100644
--- a/src/python/openvino_genai/__init__.py
+++ b/src/python/openvino_genai/__init__.py
@@ -11,7 +11,6 @@
 if hasattr(os, "add_dll_directory"):
     os.add_dll_directory(os.path.dirname(__file__))
 
-
 from .py_openvino_genai import (
     DecodedResults,
     EncodedResults,
@@ -75,6 +74,7 @@
     ImageGenerationConfig,
     Generator,
     CppStdGenerator,
+    TorchGenerator,
 )
 
 # Continuous batching
diff --git a/src/python/openvino_genai/__init__.pyi b/src/python/openvino_genai/__init__.pyi
index 4d74e17588..187e0a0a06 100644
--- a/src/python/openvino_genai/__init__.pyi
+++ b/src/python/openvino_genai/__init__.pyi
@@ -34,6 +34,7 @@ from openvino_genai.py_openvino_genai import T5EncoderModel
 from openvino_genai.py_openvino_genai import Text2ImagePipeline
 from openvino_genai.py_openvino_genai import TokenizedInputs
 from openvino_genai.py_openvino_genai import Tokenizer
+from openvino_genai.py_openvino_genai import TorchGenerator
 from openvino_genai.py_openvino_genai import UNet2DConditionModel
 from openvino_genai.py_openvino_genai import VLMPipeline
 from openvino_genai.py_openvino_genai import WhisperGenerationConfig
@@ -43,5 +44,5 @@ from openvino_genai.py_openvino_genai import WhisperRawPerfMetrics
 from openvino_genai.py_openvino_genai import draft_model
 import os as os
 from . import py_openvino_genai
-__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'openvino', 'os', 'py_openvino_genai']
+__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'openvino', 'os', 'py_openvino_genai']
 __version__: str = '2025.0.0.0'
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 829d4844e8..8b8eb76b12 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -5,7 +5,7 @@ from __future__ import annotations
 import openvino._pyopenvino
 import os
 import typing
-__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model']
+__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model']
 class Adapter:
     """
     Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier.
@@ -398,6 +398,8 @@ class CppStdGenerator(Generator):
         ...
     def randn_tensor(self, shape: openvino._pyopenvino.Shape) -> openvino._pyopenvino.Tensor:
         ...
+    def seed(self, new_seed: int) -> None:
+        ...
 class DecodedResults:
     """
     
@@ -804,7 +806,8 @@ class Image2ImagePipeline:
             height: int - height of resulting images,
             width: int - width of resulting images,
             num_inference_steps: int - number of inference steps,
-            generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator,
+            rng_seed: int - a seed for random numbers generator,
+            generator: openvino_genai.TorchGenerator, openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator,
             adapters: LoRA adapters,
             strength: strength for image to image generation. 1.0f means initial image is fully noised,
             max_sequence_length: int - length of t5_encoder_model input
@@ -836,6 +839,7 @@ class ImageGenerationConfig:
     num_inference_steps: int
     prompt_2: str | None
     prompt_3: str | None
+    rng_seed: int
     strength: float
     width: int
     def __init__(self) -> None:
@@ -903,7 +907,8 @@ class InpaintingPipeline:
             height: int - height of resulting images,
             width: int - width of resulting images,
             num_inference_steps: int - number of inference steps,
-            generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator,
+            rng_seed: int - a seed for random numbers generator,
+            generator: openvino_genai.TorchGenerator, openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator,
             adapters: LoRA adapters,
             strength: strength for image to image generation. 1.0f means initial image is fully noised,
             max_sequence_length: int - length of t5_encoder_model input
@@ -1576,7 +1581,8 @@ class Text2ImagePipeline:
             height: int - height of resulting images,
             width: int - width of resulting images,
             num_inference_steps: int - number of inference steps,
-            generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator,
+            rng_seed: int - a seed for random numbers generator,
+            generator: openvino_genai.TorchGenerator, openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator,
             adapters: LoRA adapters,
             strength: strength for image to image generation. 1.0f means initial image is fully noised,
             max_sequence_length: int - length of t5_encoder_model input
@@ -1649,6 +1655,18 @@ class Tokenizer:
         """
         Override a chat_template read from tokenizer_config.json.
         """
+class TorchGenerator(CppStdGenerator):
+    """
+    This class provides OpenVINO GenAI Generator wrapper for torch.Generator
+    """
+    def __init__(self, seed: int) -> None:
+        ...
+    def next(self) -> float:
+        ...
+    def randn_tensor(self, shape: openvino._pyopenvino.Shape) -> openvino._pyopenvino.Tensor:
+        ...
+    def seed(self, new_seed: int) -> None:
+        ...
 class UNet2DConditionModel:
     """
     UNet2DConditionModel class.
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index 55be1708c1..da6ce6d21b 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -8,6 +8,7 @@
 #include <pybind11/stl_bind.h>
 #include <pybind11/stl/filesystem.h>
 #include <pybind11/functional.h>
+#include <pybind11/numpy.h>
 
 #include "openvino/genai/image_generation/text2image_pipeline.hpp"
 #include "openvino/genai/image_generation/image2image_pipeline.hpp"
@@ -19,23 +20,7 @@
 namespace py = pybind11;
 namespace pyutils = ov::genai::pybind::utils;
 
-namespace ov {
-namespace genai {
-
-/// Trampoline class to support inheritance from Generator in Python
-class PyGenerator : public ov::genai::Generator {
-public:
-    float next() override {
-        PYBIND11_OVERRIDE_PURE(float, Generator, next);
-    }
-
-    ov::Tensor randn_tensor(const ov::Shape& shape) override {
-        PYBIND11_OVERRIDE(ov::Tensor, Generator, randn_tensor, shape);
-    }
-};
-
-} // namespace genai
-} // namespace ov
+using namespace pybind11::literals;
 
 namespace {
 
@@ -59,7 +44,8 @@ auto text2image_generate_docstring = R"(
     height: int - height of resulting images,
     width: int - width of resulting images,
     num_inference_steps: int - number of inference steps,
-    generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator,
+    rng_seed: int - a seed for random numbers generator,
+    generator: openvino_genai.TorchGenerator, openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator,
     adapters: LoRA adapters,
     strength: strength for image to image generation. 1.0f means initial image is fully noised,
     max_sequence_length: int - length of t5_encoder_model input
@@ -68,7 +54,102 @@ auto text2image_generate_docstring = R"(
     :rtype: ov.Tensor
 )";
 
+// Trampoline class to support inheritance from Generator in Python
+class PyGenerator : public ov::genai::Generator {
+public:
+    float next() override {
+        PYBIND11_OVERRIDE_PURE(float, Generator, next);
+    }
+
+    ov::Tensor randn_tensor(const ov::Shape& shape) override {
+        PYBIND11_OVERRIDE(ov::Tensor, Generator, randn_tensor, shape);
+    }
+
+    void seed(size_t new_seed) override {
+        PYBIND11_OVERRIDE_PURE(void, Generator, seed, new_seed);
+    }
+};
+
+py::list to_py_list(const ov::Shape shape) {
+    py::list py_shape;
+    for (auto d : shape)
+        py_shape.append(d);
+
+    return py_shape;
+}
 
+class TorchGenerator : public ov::genai::CppStdGenerator {
+    py::module_ m_torch;
+    py::object m_torch_generator, m_float32;
+
+    void create_torch_generator(size_t seed) {
+        m_torch_generator = m_torch.attr("Generator")("device"_a="cpu").attr("manual_seed")(seed);
+    }
+public:
+    explicit TorchGenerator(uint32_t seed) : CppStdGenerator(seed) {
+        try {
+            m_torch = py::module_::import("torch");
+        } catch (const py::error_already_set& e) {
+            if (e.matches(PyExc_ModuleNotFoundError)) {
+                throw std::runtime_error("The 'torch' package is not installed. Please, call 'pip install torch' or use 'rng_seed' parameter.");
+            } else {
+                // Re-throw other exceptions
+                throw;
+            }
+        }
+
+        m_float32 = m_torch.attr("float32");
+        create_torch_generator(seed);
+    }
+
+    float next() override {
+        return m_torch.attr("randn")(1, "generator"_a=m_torch_generator, "dtype"_a=m_float32).attr("item")().cast<float>();
+    }
+
+    ov::Tensor randn_tensor(const ov::Shape& shape) override {
+        py::object torch_tensor = m_torch.attr("randn")(to_py_list(shape), "generator"_a=m_torch_generator, "dtype"_a=m_float32);
+        py::object numpy_tensor = torch_tensor.attr("numpy")();
+        py::array numpy_array = py::cast<py::array>(numpy_tensor);
+
+        if (!numpy_array.dtype().is(py::dtype::of<float>())) {
+            throw std::runtime_error("Expected a NumPy array with dtype float32");
+        }
+
+        class TorchTensorAllocator {
+            size_t m_total_size;
+            void * m_mutable_data;
+            py::object m_torch_tensor; // we need to hold torch.Tensor to avoid memory destruction
+
+        public:
+            TorchTensorAllocator(size_t total_size, void * mutable_data, py::object torch_tensor) :
+                m_total_size(total_size), m_mutable_data(mutable_data), m_torch_tensor(torch_tensor) { }
+
+            void* allocate(size_t bytes, size_t) const {
+                if (m_total_size == bytes) {
+                    return m_mutable_data;
+                }
+                throw std::runtime_error{"Unexpected number of bytes was requested to allocate."};
+            }
+
+            void deallocate(void*, size_t bytes, size_t) {
+                if (m_total_size != bytes) {
+                    throw std::runtime_error{"Unexpected number of bytes was requested to deallocate."};
+                }
+            }
+
+            bool is_equal(const TorchTensorAllocator& other) const noexcept {
+                return this == &other;
+            }
+        };
+
+        return ov::Tensor(ov::element::f32, shape,
+            TorchTensorAllocator(ov::shape_size(shape) * ov::element::f32.size(), numpy_array.mutable_data(), torch_tensor));
+    }
+
+    void seed(size_t new_seed) override {
+        create_torch_generator(new_seed);
+    }
+};
 
 } // namespace
 
@@ -81,16 +162,24 @@ void init_flux_transformer_2d_model(py::module_& m);
 void init_autoencoder_kl(py::module_& m);
 
 void init_image_generation_pipelines(py::module_& m) {
-    py::class_<ov::genai::Generator, ov::genai::PyGenerator, std::shared_ptr<ov::genai::Generator>>(m, "Generator", "This class is used for storing pseudo-random generator.")
+    py::class_<ov::genai::Generator, ::PyGenerator, std::shared_ptr<ov::genai::Generator>>(m, "Generator", "This class is used for storing pseudo-random generator.")
         .def(py::init<>());
 
     py::class_<ov::genai::CppStdGenerator, ov::genai::Generator, std::shared_ptr<ov::genai::CppStdGenerator>>(m, "CppStdGenerator", "This class wraps std::mt19937 pseudo-random generator.")
         .def(py::init([](uint32_t seed) {
             return std::make_unique<ov::genai::CppStdGenerator>(seed);
-        }), 
-        py::arg("seed"))
+        }), py::arg("seed"))
         .def("next", &ov::genai::CppStdGenerator::next)
-        .def("randn_tensor", &ov::genai::CppStdGenerator::randn_tensor, py::arg("shape"));
+        .def("randn_tensor", &ov::genai::CppStdGenerator::randn_tensor, py::arg("shape"))
+        .def("seed", &ov::genai::CppStdGenerator::seed, py::arg("new_seed"));
+
+    py::class_<::TorchGenerator, ov::genai::CppStdGenerator, std::shared_ptr<::TorchGenerator>>(m, "TorchGenerator", "This class provides OpenVINO GenAI Generator wrapper for torch.Generator")
+        .def(py::init([](uint32_t seed) {
+            return std::make_unique<::TorchGenerator>(seed);
+        }), py::arg("seed"))
+        .def("next", &::TorchGenerator::next)
+        .def("randn_tensor", &::TorchGenerator::randn_tensor, py::arg("shape"))
+        .def("seed", &::TorchGenerator::seed, py::arg("new_seed"));
 
     // init image generation models
     init_clip_text_model(m);
@@ -122,6 +211,7 @@ void init_image_generation_pipelines(py::module_& m) {
         .def_readwrite("negative_prompt_2", &ov::genai::ImageGenerationConfig::negative_prompt_2)
         .def_readwrite("negative_prompt_3", &ov::genai::ImageGenerationConfig::negative_prompt_3)
         .def_readwrite("generator", &ov::genai::ImageGenerationConfig::generator)
+        .def_readwrite("rng_seed", &ov::genai::ImageGenerationConfig::rng_seed)
         .def_readwrite("guidance_scale", &ov::genai::ImageGenerationConfig::guidance_scale)
         .def_readwrite("height", &ov::genai::ImageGenerationConfig::height)
         .def_readwrite("width", &ov::genai::ImageGenerationConfig::width)
diff --git a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
index 1ff7ff5e21..0cced117e4 100644
--- a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
@@ -27,17 +27,6 @@
 }
 
 
-class Generator(openvino_genai.Generator):
-    def __init__(self, seed, rng, mu=0.0, sigma=1.0):
-        openvino_genai.Generator.__init__(self)
-        self.mu = mu
-        self.sigma = sigma
-        self.rng = rng
-
-    def next(self):
-        return torch.randn(1, generator=self.rng, dtype=torch.float32).item()
-
-
 @register_evaluator("text-to-image")
 class Text2ImageEvaluator(BaseEvaluator):
     def __init__(
@@ -171,7 +160,7 @@ def default_gen_image_fn(model, prompt, num_inference_steps, generator=None):
                 model,
                 prompt,
                 self.num_inference_steps,
-                generator=Generator(self.seed, rng) if self.is_genai else rng
+                generator=openvino_genai.TorchGenerator(self.seed) if self.is_genai else rng
             )
             image_path = os.path.join(image_dir, f"{i}.png")
             image.save(image_path)

From 2a52e869e16b47a06b17b3f21428207a61c1e8ea Mon Sep 17 00:00:00 2001
From: Pawel Raasz <pawel.raasz@intel.com>
Date: Mon, 16 Dec 2024 17:40:31 +0100
Subject: [PATCH 016/110] Fix test fails after PPP stops move tensor names
 (#1390)

###  Description:
- Fix the GENAI test after PPP don't move Node's name and tensor names.

### Related PRs:
- openvinotoolkit/openvino_tokenizers#352
- openvinotoolkit/openvino#28069

---------

Signed-off-by: Raasz, Pawel <pawel.raasz@intel.com>
---
 .github/workflows/linux.yml                    | 2 +-
 .github/workflows/llm_bench-python.yml         | 4 ++--
 .github/workflows/stable_diffusion_1_5_cpp.yml | 8 ++++----
 thirdparty/openvino_tokenizers                 | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 8d596aed56..0bb0c1af6e 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -52,7 +52,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868
+        revision: latest_available_commit
 
     - name: Clone docker tag from OpenVINO repo
       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index 8b022f27e0..f87cd76126 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -34,7 +34,7 @@ jobs:
     runs-on: aks-linux-2-cores-8gb
     container:
       image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0'
-      volumes: 
+      volumes:
         - /mount:/mount
         - ${{ github.workspace }}:${{ github.workspace }}
 
@@ -44,7 +44,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868
+        revision: latest_available_commit
 
   build:
     defaults:
diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml
index 497bfbff3e..34c5a0f87e 100644
--- a/.github/workflows/stable_diffusion_1_5_cpp.yml
+++ b/.github/workflows/stable_diffusion_1_5_cpp.yml
@@ -35,7 +35,7 @@ jobs:
     runs-on: aks-linux-2-cores-8gb
     container:
       image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0'
-      volumes: 
+      volumes:
         - /mount:/mount
         - ${{ github.workspace }}:${{ github.workspace }}
 
@@ -45,7 +45,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868
+        revision: latest_available_commit
 
   openvino_download_windows:
     name: Download OpenVINO for Windows
@@ -61,7 +61,7 @@ jobs:
     runs-on: aks-linux-2-cores-8gb
     container:
       image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0'
-      volumes: 
+      volumes:
         - /mount:/mount
         - ${{ github.workspace }}:${{ github.workspace }}
 
@@ -71,7 +71,7 @@ jobs:
       with:
         platform: windows
         commit_packages_to_provide: wheels
-        revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868
+        revision: latest_available_commit
 
   stable_diffusion_1_5_cpp-linux:
     runs-on: ubuntu-22.04-8-cores
diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers
index 1da0d2c705..bcfd3eda25 160000
--- a/thirdparty/openvino_tokenizers
+++ b/thirdparty/openvino_tokenizers
@@ -1 +1 @@
-Subproject commit 1da0d2c705016ad3f04c160ac9338f06505a07c1
+Subproject commit bcfd3eda25ae3ec423502a4074e35c774506c732

From a651292a803f184b9de957a44252c41f364d68ab Mon Sep 17 00:00:00 2001
From: Helena Kloosterman <helena.kloosterman@intel.com>
Date: Tue, 17 Dec 2024 04:47:36 +0100
Subject: [PATCH 017/110] Fix optimum-cli command for VLM example in README
 (#1348)

With the existing command users get an error: Channel size 4304 should
be divisible by size of group 128.

---------

Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>
Co-authored-by: Nikita Savelyev <nikita.savelyev@intel.com>
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 680bc9bc65..c2509528c3 100644
--- a/README.md
+++ b/README.md
@@ -107,12 +107,12 @@ For more examples check out our [Generative AI workflow](https://docs.openvino.a
 
 ### Converting and compressing the model from Hugging Face library
 
-```sh
-#(Basic) download and convert to OpenVINO MiniCPM-V-2_6 model
-optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format fp16 MiniCPM-V-2_6
+To convert the [OpenGVLab/InternVL2-1B](https://huggingface.co/OpenGVLab/InternVL2-1B) model, `timm` and `einops` are required: `pip install timm einops`.
 
-#(Recommended) Same as above but with compression: language model is compressed to int4, other model components are compressed to int8
-optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format int4 MiniCPM-V-2_6
+```sh
+# Download and convert the OpenGVLab/InternVL2-1B model to OpenVINO with int4 weight-compression for the language model
+# Other components are compressed to int8
+optimum-cli export openvino -m OpenGVLab/InternVL2-1B --trust-remote-code --weight-format int4 InternVL2-1B
 ```
 
 ### Run generation using VLMPipeline API in Python
@@ -132,7 +132,7 @@ import openvino_genai as ov_genai
 from PIL import Image
 
 # Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
-pipe = ov_genai.VLMPipeline("./MiniCPM-V-2_6/", "CPU")
+pipe = ov_genai.VLMPipeline("./InternVL2-1B", "CPU")
 
 image = Image.open("dog.jpg")
 image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)

From 1d4b1039a95c9f8817f412248656a83d463d3376 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova <ekaterina.aidova@intel.com>
Date: Tue, 17 Dec 2024 10:48:02 +0400
Subject: [PATCH 018/110] [llm_bench] enable text2img callback only if
 supported (#1392)

CVS-159282
---
 tools/llm_bench/task/image_generation.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/tools/llm_bench/task/image_generation.py b/tools/llm_bench/task/image_generation.py
index f227898ef6..b870c7ec98 100644
--- a/tools/llm_bench/task/image_generation.py
+++ b/tools/llm_bench/task/image_generation.py
@@ -25,7 +25,7 @@
 stable_diffusion_hook = StableDiffusionHook()
 
 
-def collects_input_args(image_param, model_type, model_name):
+def collects_input_args(image_param, model_type, model_name, callback=None):
     input_args = {}
     input_args["width"] = image_param.get('width', DEFAULT_IMAGE_WIDTH)
     input_args["height"] = image_param.get('height', DEFAULT_IMAGE_HEIGHT)
@@ -37,6 +37,19 @@ def collects_input_args(image_param, model_type, model_name):
     else:
         if 'turbo' in model_name:
             input_args["guidance_scale"] = 0.0
+    if callback is not None:
+        from openvino import get_version
+        from packaging.version import parse
+
+        version = get_version()
+        # avoid invalid format
+        if "-" in version:
+            ov_major_version, dev_info = version.split("-", 1)
+            commit_id = dev_info.split("-")[0]
+            version = f"{ov_major_version}-{commit_id}"
+        is_callback_supported = parse(version) >= parse("2025.0.0")
+        if is_callback_supported:
+            input_args["callback"] = callback
 
     return input_args
 
@@ -107,7 +120,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list,
 def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None):
     set_seed(args['seed'])
     input_text = image_param['prompt']
-    input_args = collects_input_args(image_param, args['model_type'], args['model_name'])
+    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], callback)
     out_str = f"Input params: Batch_size={args['batch_size']}, " \
               f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}"
     if 'guidance_scale' in input_args:
@@ -127,7 +140,7 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data
             llm_bench_utils.output_file.output_image_input_text(in_text, args, image_id, bs_idx, proc_id)
     callback.reset()
     start = time.perf_counter()
-    res = pipe.generate(input_text, **input_args, callback=callback).data
+    res = pipe.generate(input_text, **input_args).data
     end = time.perf_counter()
     callback.duration = end - start
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
@@ -157,7 +170,7 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data
         max_rss_mem=max_rss_mem_consumption,
         max_shared_mem=max_shared_mem_consumption,
         max_uss_mem=max_uss_mem_consumption,
-        stable_diffusion=callback,
+        stable_diffusion=callback if "callback" in input_args else None,
         prompt_idx=image_id
     )
     metrics_print.print_generated(num, warm_up=(num == 0), generated=rslt_img_fn, prompt_idx=image_id)

From f177ffc9799ef34a57b257e1811a60c68c167eb2 Mon Sep 17 00:00:00 2001
From: Anna Likholat <anna.likholat@intel.com>
Date: Tue, 17 Dec 2024 09:45:41 +0100
Subject: [PATCH 019/110] [ImageGeneration] PNDMScheduler support (#1393)

![image](https://github.com/user-attachments/assets/3ca9c44b-ec2e-49ae-afba-2e56d5bf51f7)

![image](https://github.com/user-attachments/assets/8999eac8-6acb-41ec-85f1-d6dab910aa44)

![image](https://github.com/user-attachments/assets/ee8e1461-5953-4c64-8c01-340cf6c3916b)

![image](https://github.com/user-attachments/assets/5ad73a32-ef2a-479d-b7ee-37543fd8d235)
---
 .../genai/image_generation/scheduler.hpp      |   3 +-
 .../src/image_generation/schedulers/pndm.cpp  | 277 ++++++++++++++++++
 .../src/image_generation/schedulers/pndm.hpp  |  67 +++++
 .../image_generation/schedulers/scheduler.cpp |   3 +
 .../src/image_generation/schedulers/types.cpp |   2 +
 src/docs/SUPPORTED_MODELS.md                  |  19 ++
 6 files changed, 370 insertions(+), 1 deletion(-)
 create mode 100644 src/cpp/src/image_generation/schedulers/pndm.cpp
 create mode 100644 src/cpp/src/image_generation/schedulers/pndm.hpp

diff --git a/src/cpp/include/openvino/genai/image_generation/scheduler.hpp b/src/cpp/include/openvino/genai/image_generation/scheduler.hpp
index 9b038ccd56..21c266aa50 100644
--- a/src/cpp/include/openvino/genai/image_generation/scheduler.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/scheduler.hpp
@@ -18,7 +18,8 @@ class OPENVINO_GENAI_EXPORTS Scheduler {
         LMS_DISCRETE,
         DDIM,
         EULER_DISCRETE,
-        FLOW_MATCH_EULER_DISCRETE
+        FLOW_MATCH_EULER_DISCRETE,
+        PNDM
     };
 
     static std::shared_ptr<Scheduler> from_config(const std::filesystem::path& scheduler_config_path,
diff --git a/src/cpp/src/image_generation/schedulers/pndm.cpp b/src/cpp/src/image_generation/schedulers/pndm.cpp
new file mode 100644
index 0000000000..a760283b97
--- /dev/null
+++ b/src/cpp/src/image_generation/schedulers/pndm.cpp
@@ -0,0 +1,277 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cassert>
+#include <random>
+#include <fstream>
+#include <iterator>
+
+#include "image_generation/schedulers/pndm.hpp"
+#include "image_generation/numpy_utils.hpp"
+
+namespace ov {
+namespace genai {
+
+PNDMScheduler::Config::Config(const std::filesystem::path& scheduler_config_path) {
+    std::ifstream file(scheduler_config_path);
+    OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path);
+
+    nlohmann::json data = nlohmann::json::parse(file);
+    using utils::read_json_param;
+
+    read_json_param(data, "num_train_timesteps", num_train_timesteps);
+    read_json_param(data, "beta_start", beta_start);
+    read_json_param(data, "beta_end", beta_end);
+    read_json_param(data, "beta_schedule", beta_schedule);
+    read_json_param(data, "trained_betas", trained_betas);
+    read_json_param(data, "set_alpha_to_one", set_alpha_to_one);
+    read_json_param(data, "skip_prk_steps", skip_prk_steps);
+    read_json_param(data, "steps_offset", steps_offset);
+    read_json_param(data, "prediction_type", prediction_type);
+    read_json_param(data, "timestep_spacing", timestep_spacing);
+}
+
+PNDMScheduler::PNDMScheduler(const std::filesystem::path& scheduler_config_path) 
+    : PNDMScheduler(Config(scheduler_config_path)) {
+}
+
+PNDMScheduler::PNDMScheduler(const Config& scheduler_config): m_config(scheduler_config) {
+
+    std::vector<float> alphas, betas;
+
+    using numpy_utils::linspace;
+
+    if (!m_config.trained_betas.empty()) {
+        betas = m_config.trained_betas;
+    } else if (m_config.beta_schedule == BetaSchedule::LINEAR) {
+        betas = linspace<float>(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps);
+    } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) {
+        float start = std::sqrt(m_config.beta_start);
+        float end = std::sqrt(m_config.beta_end);
+        betas = linspace<float>(start, end, m_config.num_train_timesteps);
+        std::for_each(betas.begin(), betas.end(), [] (float & x) { x *= x; });
+        // TODO: elif beta_schedule == "squaredcos_cap_v2":
+    } else {
+        OPENVINO_THROW("'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. Please, add support of other types");
+    }
+
+    std::transform(betas.begin(), betas.end(), std::back_inserter(alphas), [] (float b) { return 1.0f - b; });
+
+    for (size_t i = 1; i <= alphas.size(); i++) {
+        float alpha_cumprod =
+            std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies<float>{});
+        m_alphas_cumprod.push_back(alpha_cumprod);
+    }
+
+    m_final_alpha_cumprod = m_config.set_alpha_to_one ? 1 : m_alphas_cumprod[0];
+
+    // running values
+    m_ets = {};
+    m_counter = 0;
+
+    // setable values
+    m_num_inference_steps = -1;
+    m_prk_timesteps = {};
+    m_plms_timesteps = {};
+    m_timesteps = {};
+}
+
+void PNDMScheduler::set_timesteps(size_t num_inference_steps, float strength) {
+    m_timesteps.clear(), m_prk_timesteps.clear(), m_plms_timesteps.clear();
+
+    OPENVINO_ASSERT(num_inference_steps <= m_config.num_train_timesteps,
+                    "`num_inference_steps` cannot be larger than `m_config.num_train_timesteps`");
+
+    m_num_inference_steps = num_inference_steps;
+
+    switch (m_config.timestep_spacing) {
+        case TimestepSpacing::LINSPACE:
+        {
+            using numpy_utils::linspace;
+            float end = static_cast<float>(m_config.num_train_timesteps - 1);
+            auto linspaced = linspace<float>(0.0f, end, num_inference_steps, true);
+            for (float val : linspaced) {
+                m_timesteps.push_back(static_cast<int64_t>(std::round(val)));
+            }
+            break;
+        }
+        case TimestepSpacing::LEADING:
+        {
+            size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps;
+            for (size_t i = 0; i < m_num_inference_steps; ++i) {
+                m_timesteps.push_back(i * step_ratio + m_config.steps_offset);
+            }
+            break;
+        }
+        case TimestepSpacing::TRAILING:
+        {
+            float step_ratio = static_cast<float>(m_config.num_train_timesteps) / static_cast<float>(m_num_inference_steps);
+            for (float i = m_config.num_train_timesteps; i > 0; i-=step_ratio){
+                m_timesteps.push_back(static_cast<int64_t>(std::round(i)) - 1);
+            }
+            std::reverse(m_timesteps.begin(), m_timesteps.end());
+            break;
+        }
+        default:
+            OPENVINO_THROW("Unsupported value for 'timestep_spacing'. Please make sure to choose one of 'linspace', 'leading' or 'trailing'.");
+    }
+
+    if (m_config.skip_prk_steps) {
+        m_prk_timesteps = {};
+        std::copy(m_timesteps.begin(), m_timesteps.end() - 1, std::back_inserter(m_plms_timesteps));
+        m_plms_timesteps.push_back(m_timesteps[m_timesteps.size() - 2]);
+        m_plms_timesteps.push_back(m_timesteps[m_timesteps.size() - 1]);
+        std::reverse(m_plms_timesteps.begin(), m_plms_timesteps.end());
+    } else {
+        OPENVINO_THROW("'skip_prk_steps=false' case isn't supported. Please, add support.");
+    }
+
+    m_timesteps = m_prk_timesteps;
+    m_timesteps.insert(m_timesteps.end(), m_plms_timesteps.begin(), m_plms_timesteps.end());
+
+    m_ets = {};
+    m_counter = 0;
+    m_cur_sample = ov::Tensor(ov::element::f32, {});
+}
+
+std::map<std::string, ov::Tensor> PNDMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) {
+    // noise_pred - model_output
+    // latents - sample
+    // inference_step
+
+    if (m_counter < m_prk_timesteps.size() && !m_config.skip_prk_steps) {
+        OPENVINO_THROW("'skip_prk_steps=false' case isn't supported. Please, add support.");
+    } else {
+        return step_plms(noise_pred, latents, m_timesteps[inference_step]);
+    }
+}
+
+std::map<std::string, ov::Tensor> PNDMScheduler::step_plms(ov::Tensor model_output, ov::Tensor sample, size_t timestep) {
+    OPENVINO_ASSERT(m_num_inference_steps != -1,
+                    "Number of inference steps isn't set, you need to run `set_timesteps` after creating the scheduler");
+
+    int prev_timestep = timestep - m_config.num_train_timesteps / m_num_inference_steps;
+
+    if (m_counter != 1) {
+        if (m_ets.size() > 3) {
+            m_ets = std::vector<ov::Tensor>(m_ets.end() - 3, m_ets.end());
+        }
+        ov::Tensor ets_last(model_output.get_element_type(), model_output.get_shape());
+        model_output.copy_to(ets_last);
+        m_ets.push_back(ets_last);
+    } else {
+        prev_timestep = timestep;
+        timestep = timestep + m_config.num_train_timesteps / m_num_inference_steps;
+    }
+
+    float* model_output_data = model_output.data<float>();
+
+    size_t m_ets_size = m_ets.size();
+
+    if (m_ets_size == 1 && m_counter == 0) {
+        m_cur_sample = ov::Tensor(sample.get_element_type(), sample.get_shape());
+        sample.copy_to(m_cur_sample);
+    } else if (m_ets_size == 1 && m_counter == 1) {
+        const float* ets_data = m_ets[0].data<float>();
+        for (size_t i = 0; i < model_output.get_size(); ++i) {
+            model_output_data[i] = (model_output_data[i] + ets_data[i]) / 2.0f;
+        }
+        sample = ov::Tensor(m_cur_sample.get_element_type(), m_cur_sample.get_shape());
+        m_cur_sample.copy_to(sample);
+        m_cur_sample = ov::Tensor(ov::element::f32, {});
+    } else if (m_ets_size == 2) {
+        const float* ets_data_1 = m_ets[1].data<float>();
+        const float* ets_data_2 = m_ets[0].data<float>();
+        for (size_t i = 0; i < model_output.get_size(); ++i) {
+            model_output_data[i] = (3.0f * ets_data_1[i] - ets_data_2[i]) / 2.0f;
+        }
+    } else if (m_ets_size == 3) {
+        const float* ets_data_1 = m_ets[2].data<float>();
+        const float* ets_data_2 = m_ets[1].data<float>();
+        const float* ets_data_3 = m_ets[0].data<float>();
+        for (size_t i = 0; i < model_output.get_size(); ++i) {
+            model_output_data[i] = (23.0f * ets_data_1[i] - 16.0f * ets_data_2[i] + 5.0f * ets_data_3[i]) / 12.0f;
+        }
+    } else if (m_ets_size == 4) {
+        const float* ets_data_1 = m_ets[3].data<float>();
+        const float* ets_data_2 = m_ets[2].data<float>();
+        const float* ets_data_3 = m_ets[1].data<float>();
+        const float* ets_data_4 = m_ets[0].data<float>();
+
+        for (size_t i = 0; i < model_output.get_size(); ++i) {
+            model_output_data[i] = (1.0f / 24.0f)
+                                   * (55.0f * ets_data_1[i] - 59.0f * ets_data_2[i] + 37.0f * ets_data_3[i] - 9.0f * ets_data_4[i]);
+        }
+    } else {
+        OPENVINO_THROW("PNDMScheduler: Unsupported step_plms case.");
+    }
+
+    ov::Tensor prev_sample = get_prev_sample(sample, timestep, prev_timestep, model_output);
+    m_counter++;
+
+    std::map<std::string, ov::Tensor> result{{"latent", prev_sample}};
+    return result;
+}
+
+ov::Tensor PNDMScheduler::get_prev_sample(ov::Tensor sample, size_t timestep, int prev_timestep, ov::Tensor model_output) {
+    float alpha_prod_t = m_alphas_cumprod[timestep];
+    float alpha_prod_t_prev = (prev_timestep >= 0) ? m_alphas_cumprod[prev_timestep] : m_final_alpha_cumprod;
+    float beta_prod_t = 1 - alpha_prod_t;
+    float beta_prod_t_prev = 1 - alpha_prod_t_prev;
+
+    float sample_coeff = std::sqrt((alpha_prod_t_prev / alpha_prod_t));
+    float model_output_denom_coeff = alpha_prod_t * std::sqrt(beta_prod_t_prev) +
+                                     std::sqrt((alpha_prod_t * beta_prod_t * alpha_prod_t_prev));
+
+    float* model_output_data = model_output.data<float>();
+    float* sample_data = sample.data<float>();
+
+    switch (m_config.prediction_type) {
+        case PredictionType::EPSILON:
+            break;
+        case PredictionType::V_PREDICTION:
+            for (size_t i = 0; i < model_output.get_size(); ++i) {
+                model_output_data[i] = std::sqrt(alpha_prod_t) * model_output_data[i] + std::sqrt(beta_prod_t) * sample_data[i];
+            }
+            break;
+        default:
+            OPENVINO_THROW("Unsupported value for 'PredictionType'");
+    }
+
+    ov::Tensor prev_sample = ov::Tensor(model_output.get_element_type(), model_output.get_shape());
+    float* prev_sample_data = prev_sample.data<float>();
+
+    for (size_t i = 0; i < prev_sample.get_size(); ++i) {
+        prev_sample_data[i] = sample_coeff * sample_data[i] - (alpha_prod_t_prev - alpha_prod_t) * model_output_data[i] / model_output_denom_coeff;
+    }
+
+    return prev_sample;
+}
+
+void PNDMScheduler::add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const {
+    float sqrt_alpha_prod = std::sqrt(m_alphas_cumprod[latent_timestep]);
+    float sqrt_one_minus_alpha_prod = std::sqrt(1.0 - m_alphas_cumprod[latent_timestep]);
+
+    float * init_latent_data = init_latent.data<float>();
+    const float * noise_data = noise.data<float>();
+
+    for (size_t i = 0; i < init_latent.get_size(); ++i) {
+        init_latent_data[i] = sqrt_alpha_prod * init_latent_data[i] + sqrt_one_minus_alpha_prod * noise_data[i];
+    }
+}
+
+std::vector<int64_t> PNDMScheduler::get_timesteps() const {
+    return m_timesteps;
+}
+
+void PNDMScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) {
+    return;
+}
+
+float PNDMScheduler::get_init_noise_sigma() const {
+    return 1.0f;
+}
+
+
+} // namespace genai
+} // namespace ov
diff --git a/src/cpp/src/image_generation/schedulers/pndm.hpp b/src/cpp/src/image_generation/schedulers/pndm.hpp
new file mode 100644
index 0000000000..4e346f58b3
--- /dev/null
+++ b/src/cpp/src/image_generation/schedulers/pndm.hpp
@@ -0,0 +1,67 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <filesystem>
+#include <list>
+#include <string>
+
+#include "image_generation/schedulers/types.hpp"
+#include "image_generation/schedulers/ischeduler.hpp"
+
+namespace ov {
+namespace genai {
+
+class PNDMScheduler : public IScheduler {
+public:
+    struct Config {
+        int32_t num_train_timesteps = 1000;
+        float beta_start = 0.0001f, beta_end = 0.02f;
+        BetaSchedule beta_schedule = BetaSchedule::LINEAR;
+        std::vector<float> trained_betas = {};
+        bool set_alpha_to_one = false, skip_prk_steps = false;
+        PredictionType prediction_type = PredictionType::EPSILON;
+        TimestepSpacing timestep_spacing = TimestepSpacing::LEADING;
+        size_t steps_offset = 0;
+
+        Config() = default;
+        explicit Config(const std::filesystem::path& scheduler_config_path);
+    };
+
+    explicit PNDMScheduler(const std::filesystem::path& scheduler_config_path);
+    explicit PNDMScheduler(const Config& scheduler_config);
+
+    void set_timesteps(size_t num_inference_steps, float strength) override;
+
+    std::vector<std::int64_t> get_timesteps() const override;
+
+    float get_init_noise_sigma() const override;
+
+    void scale_model_input(ov::Tensor sample, size_t inference_step) override;
+
+    std::map<std::string, ov::Tensor> step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) override;
+
+    void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t timestep) const override;
+
+private:
+    Config m_config;
+
+    float m_final_alpha_cumprod;
+    size_t m_num_inference_steps;
+    size_t m_counter;
+
+    std::vector<float> m_alphas_cumprod;
+    std::vector<int64_t> m_timesteps;
+    std::vector<int64_t> m_prk_timesteps;
+    std::vector<int64_t> m_plms_timesteps;
+    std::vector<ov::Tensor> m_ets;
+
+    ov::Tensor m_cur_sample;
+
+    std::map<std::string, ov::Tensor> step_plms(ov::Tensor model_output, ov::Tensor sample, size_t timestep);
+    ov::Tensor get_prev_sample(ov::Tensor sample, size_t timestep, int prev_timestep, ov::Tensor model_output);
+};
+
+} // namespace genai
+} // namespace ov
diff --git a/src/cpp/src/image_generation/schedulers/scheduler.cpp b/src/cpp/src/image_generation/schedulers/scheduler.cpp
index 3a7556b6d9..f9cd098346 100644
--- a/src/cpp/src/image_generation/schedulers/scheduler.cpp
+++ b/src/cpp/src/image_generation/schedulers/scheduler.cpp
@@ -10,6 +10,7 @@
 #include "image_generation/schedulers/ddim.hpp"
 #include "image_generation/schedulers/euler_discrete.hpp"
 #include "image_generation/schedulers/flow_match_euler_discrete.hpp"
+#include "image_generation/schedulers/pndm.hpp"
 
 namespace ov {
 namespace genai {
@@ -38,6 +39,8 @@ std::shared_ptr<Scheduler> Scheduler::from_config(const std::filesystem::path& s
         scheduler = std::make_shared<EulerDiscreteScheduler>(scheduler_config_path);
     } else if (scheduler_type == Scheduler::Type::FLOW_MATCH_EULER_DISCRETE) {
         scheduler = std::make_shared<FlowMatchEulerDiscreteScheduler>(scheduler_config_path);
+    } else if (scheduler_type == Scheduler::Type::PNDM) {
+        scheduler = std::make_shared<PNDMScheduler>(scheduler_config_path);
     } else {
         OPENVINO_THROW("Unsupported scheduler type '", scheduler_type, ". Please, manually create scheduler via supported one");
     }
diff --git a/src/cpp/src/image_generation/schedulers/types.cpp b/src/cpp/src/image_generation/schedulers/types.cpp
index f7d21b12af..2f7c6d3f25 100644
--- a/src/cpp/src/image_generation/schedulers/types.cpp
+++ b/src/cpp/src/image_generation/schedulers/types.cpp
@@ -55,6 +55,8 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Schedu
             param = Scheduler::EULER_DISCRETE;
         else if (scheduler_type_str == "FlowMatchEulerDiscreteScheduler")
             param = Scheduler::FLOW_MATCH_EULER_DISCRETE;
+        else if (scheduler_type_str == "PNDMScheduler")
+            param = Scheduler::PNDM;
         else if (!scheduler_type_str.empty()) {
             OPENVINO_THROW("Unsupported value for 'scheduler' ", scheduler_type_str);
         }
diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md
index fe13e5848f..8c922ee644 100644
--- a/src/docs/SUPPORTED_MODELS.md
+++ b/src/docs/SUPPORTED_MODELS.md
@@ -183,10 +183,29 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
       <td>Supported</td>
       <td>
         <ul>
+          <li><a href="https://huggingface.co/CompVis/stable-diffusion-v1-1"><code>CompVis/stable-diffusion-v1-1</code></a></li>
+          <li><a href="https://huggingface.co/CompVis/stable-diffusion-v1-2"><code>CompVis/stable-diffusion-v1-2</code></a></li>
+          <li><a href="https://huggingface.co/CompVis/stable-diffusion-v1-3"><code>CompVis/stable-diffusion-v1-3</code></a></li>
+          <li><a href="https://huggingface.co/CompVis/stable-diffusion-v1-4"><code>CompVis/stable-diffusion-v1-4</code></a></li>
+          <li><a href="https://huggingface.co/junnyu/stable-diffusion-v1-4-paddle"><code>junnyu/stable-diffusion-v1-4-paddle</code></a></li>
+          <li><a href="https://huggingface.co/jcplus/stable-diffusion-v1-5"><code>jcplus/stable-diffusion-v1-5</code></a></li>
+          <li><a href="https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5"><code>stable-diffusion-v1-5/stable-diffusion-v1-5</code></a></li>
           <li><a href="https://huggingface.co/botp/stable-diffusion-v1-5"><code>botp/stable-diffusion-v1-5</code></a></li>
           <li><a href="https://huggingface.co/dreamlike-art/dreamlike-anime-1.0"><code>dreamlike-art/dreamlike-anime-1.0</code></a></li>
           <li><a href="https://huggingface.co/stabilityai/stable-diffusion-2"><code>stabilityai/stable-diffusion-2</code></a></li>
+          <li><a href="https://huggingface.co/stabilityai/stable-diffusion-2-base"><code>stabilityai/stable-diffusion-2-base</code></a></li>
           <li><a href="https://huggingface.co/stabilityai/stable-diffusion-2-1"><code>stabilityai/stable-diffusion-2-1</code></a></li>
+          <li><a href="https://huggingface.co/bguisard/stable-diffusion-nano-2-1"><code>bguisard/stable-diffusion-nano-2-1</code></a></li>
+          <li><a href="https://huggingface.co/justinpinkney/pokemon-stable-diffusion"><code>justinpinkney/pokemon-stable-diffusion</code></a></li>
+          <li><a href="https://huggingface.co/stablediffusionapi/architecture-tuned-model"><code>stablediffusionapi/architecture-tuned-model</code></a></li>
+          <li><a href="https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1"><code>IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1</code></a></li>
+          <li><a href="https://huggingface.co/ZeroCool94/stable-diffusion-v1-5"><code>ZeroCool94/stable-diffusion-v1-5</code></a></li>
+          <li><a href="https://huggingface.co/pcuenq/stable-diffusion-v1-4"><code>pcuenq/stable-diffusion-v1-4</code></a></li>
+          <li><a href="https://huggingface.co/rinna/japanese-stable-diffusion"><code>rinna/japanese-stable-diffusion</code></a></li>
+          <li><a href="https://huggingface.co/benjamin-paine/stable-diffusion-v1-5"><code>benjamin-paine/stable-diffusion-v1-5</code></a></li>
+          <li><a href="https://huggingface.co/philschmid/stable-diffusion-v1-4-endpoints"><code>philschmid/stable-diffusion-v1-4-endpoints</code></a></li>
+          <li><a href="https://huggingface.co/naclbit/trinart_stable_diffusion_v2"><code>naclbit/trinart_stable_diffusion_v2</code></a></li>
+          <li><a href="https://huggingface.co/Fictiverse/Stable_Diffusion_PaperCut_Model"><code>Fictiverse/Stable_Diffusion_PaperCut_Model</code></a></li>
         </ul>
       </td>
     </tr>

From 973b26b2b1fed25b878ea6108b4d7c5ae825dc12 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova <ekaterina.aidova@intel.com>
Date: Tue, 17 Dec 2024 13:20:32 +0400
Subject: [PATCH 020/110] add VLM support in llm bench (#1318)

TO DO:

- [x] add test
- [x] check correctness of num_input_tokens after
https://github.com/openvinotoolkit/openvino.genai/pull/1317
- [x] move unsupported pipelines to optimum

---------

Co-authored-by: Andrei Kochin <andrei.kochin@intel.com>
---
 .github/workflows/llm_bench-python.yml        |   6 +-
 tools/llm_bench/benchmark.py                  |   2 +
 tools/llm_bench/doc/PROMPT.md                 |   7 +-
 .../llm_bench/llm_bench_utils/config_class.py |   6 +-
 .../llm_bench_utils/gen_output_data.py        |   2 +
 .../llm_bench_utils/hook_beam_search.py       |  26 +-
 .../llm_bench/llm_bench_utils/hook_common.py  |   2 +
 .../llm_bench_utils/hook_greedy_search.py     |  30 +-
 .../llm_bench_utils/metrics_print.py          |   2 +
 .../llm_bench/llm_bench_utils/model_utils.py  |  74 ++--
 tools/llm_bench/llm_bench_utils/ov_utils.py   |  81 ++++
 .../llm_bench_utils/parse_json_data.py        |  17 +
 .../task/visual_language_generation.py        | 366 ++++++++++++++++++
 13 files changed, 586 insertions(+), 35 deletions(-)
 create mode 100644 tools/llm_bench/task/visual_language_generation.py

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index f87cd76126..3d31649cea 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -137,12 +137,16 @@ jobs:
           optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny ./ov_models/whisper-tiny
           python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --optimum
           python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1
+      - name: Text InternVL2-1B on Linux
+        run: |
+          optimum-cli export openvino --model OpenGVLab/InternVL2-1B ./ov_models/internvl2-1B --task image-text-to-text --trust-remote-code
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20 --optimum
       - name: WWB Tests
         run: |
           pip install git+https://github.com/huggingface/optimum-intel.git
           GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
           python -m pytest -v ${{ env.WWB_PATH }}/tests
-
   stateful:
     defaults:
       run:
diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py
index bd5a5716a7..5fa22497c1 100644
--- a/tools/llm_bench/benchmark.py
+++ b/tools/llm_bench/benchmark.py
@@ -12,6 +12,7 @@
 from llm_bench_utils.memory_profile import MemConsumption
 import llm_bench_utils.output_csv
 import llm_bench_utils.output_json
+import task.visual_language_generation as bench_vlm
 import task.text_generation as bench_text
 import task.image_generation as bench_image
 import task.super_resolution_generation as bench_ldm_sr
@@ -167,6 +168,7 @@ def get_argprser():
     'code_gen': bench_text.run_text_generation_benchmark,
     'ldm_super_resolution': bench_ldm_sr.run_ldm_super_resolution_benchmark,
     'speech2text': bench_speech.run_speech_2_txt_benchmark,
+    "vlm": bench_vlm.run_visual_language_generation_benchmark
 }
 
 
diff --git a/tools/llm_bench/doc/PROMPT.md b/tools/llm_bench/doc/PROMPT.md
index 4ee28d47fa..5418bf0bb5 100644
--- a/tools/llm_bench/doc/PROMPT.md
+++ b/tools/llm_bench/doc/PROMPT.md
@@ -36,4 +36,9 @@ Supported parameters that can be set are:
 * `timestamp` - timestamp for whisper (default true)
 Prompt file example：
 {"media": "./audio/intel_ad_90s_128kbps.mp3", "language": "<|en|>", "timestamp":false}
-{"media": "./audio/intel_ad_120s_128kbps.mp3", "language": "<|en|>", "timestamp":true}
\ No newline at end of file
+{"media": "./audio/intel_ad_120s_128kbps.mp3", "language": "<|en|>", "timestamp":true}
+
+## 5. Visual Language Models
+Supported parameters that can be set are:
+* `media` - imge file path
+* `prompt`- input text prompt
\ No newline at end of file
diff --git a/tools/llm_bench/llm_bench_utils/config_class.py b/tools/llm_bench/llm_bench_utils/config_class.py
index 12385d2879..7dd27b198b 100644
--- a/tools/llm_bench/llm_bench_utils/config_class.py
+++ b/tools/llm_bench/llm_bench_utils/config_class.py
@@ -8,7 +8,8 @@
     OVModelForCausalLM,
     OVModelForSeq2SeqLM,
     OVDiffusionPipeline,
-    OVModelForSpeechSeq2Seq
+    OVModelForSpeechSeq2Seq,
+    OVModelForVisualCausalLM
 )
 from llm_bench_utils.ov_model_classes import OVMPTModel, OVLDMSuperResolutionPipeline, OVChatGLMModel
 
@@ -36,6 +37,7 @@
     'chatglm3': OVModelForCausalLM,
     'chatglm': OVChatGLMModel,
     'whisper': OVModelForSpeechSeq2Seq,
+    "vlm": OVModelForVisualCausalLM,
 }
 
 PT_MODEL_CLASSES_MAPPING = {
@@ -51,6 +53,7 @@
 
 USE_CASES = {
     'image_gen': ['stable-diffusion-', 'ssd-', 'tiny-sd', 'small-sd', 'lcm-', 'sdxl', 'dreamlike', "flux"],
+    "vlm": ["llava", "llava-next", "qwen2-vl", "llava-qwen2", "internvl-chat", "minicpmv", "phi3-v"],
     'speech2text': ['whisper'],
     'image_cls': ['vit'],
     'code_gen': ['replit', 'codegen2', 'codegen', 'codet5', "stable-code"],
@@ -111,4 +114,5 @@
     'speech2text': 'whisper',
     'code_gen': 'decoder',
     'ldm_super_resolution': 'ldm_super_resolution',
+    "vlm": "vlm"
 }
diff --git a/tools/llm_bench/llm_bench_utils/gen_output_data.py b/tools/llm_bench/llm_bench_utils/gen_output_data.py
index 3b7c668c82..b65e7b5c8c 100644
--- a/tools/llm_bench/llm_bench_utils/gen_output_data.py
+++ b/tools/llm_bench/llm_bench_utils/gen_output_data.py
@@ -16,6 +16,7 @@ def gen_iterate_data(
     max_uss_mem='',
     prompt_idx='',
     tokenization_time=[],
+    mm_embeddings_preparation_time=''
 ):
     iter_data = {}
     iter_data['iteration'] = iter_idx
@@ -35,4 +36,5 @@ def gen_iterate_data(
     iter_data['prompt_idx'] = prompt_idx
     iter_data['tokenization_time'] = tokenization_time[0] if len(tokenization_time) > 0 else ''
     iter_data['detokenization_time'] = tokenization_time[1] if len(tokenization_time) > 1 else ''
+    iter_data["mm_embeddings_preparation_time"] = mm_embeddings_preparation_time
     return iter_data
diff --git a/tools/llm_bench/llm_bench_utils/hook_beam_search.py b/tools/llm_bench/llm_bench_utils/hook_beam_search.py
index 99b0a9e5c3..d933acc3a5 100644
--- a/tools/llm_bench/llm_bench_utils/hook_beam_search.py
+++ b/tools/llm_bench/llm_bench_utils/hook_beam_search.py
@@ -5,6 +5,7 @@
 import time
 import torch
 import warnings
+import types
 import logging as log
 from torch import nn
 from typing import Optional, Tuple, Union, List
@@ -54,6 +55,7 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput):
 
 tm_list = []
 tm_infer_list = []
+tm_mm_embeddings = []
 
 
 # Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99
@@ -455,6 +457,15 @@ def new_beam_search(
         else:
             return sequence_outputs["sequences"]
 
+def new_get_multimodal_embeddings(
+        self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, **kwargs
+    ):
+
+    start = time.perf_counter()
+    result = self._orig_get_multimodal_embeddings(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, position_ids=position_ids, **kwargs)
+    end = time.perf_counter()
+    tm_mm_embeddings.append(end - start)
+    return result
 
 class BeamSearchHook:
     def __init__(self):
@@ -483,6 +494,19 @@ def get_time_infer_list(self):
         global tm_infer_list
         return tm_infer_list
 
+    def get_mm_embeddings_time_list(self):
+        global tm_mm_embeddings
+        return tm_mm_embeddings
+
+    def clear_mm_embeddins_time_list(self):
+        """Clear the infer time list."""
+        global tm_mm_embeddings
+        tm_mm_embeddings.clear()
+
     def new_forward(self, model):
         """Define a new beam search function."""
-        model._beam_search = new_beam_search.__get__(model, model.__class__)
\ No newline at end of file
+        model._beam_search = new_beam_search.__get__(model, model.__class__)
+
+    def new_get_multimodal_embeddings(self, model):
+        model._orig_get_multimodal_embeddings = model.get_multimodal_embeddings
+        model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model)
\ No newline at end of file
diff --git a/tools/llm_bench/llm_bench_utils/hook_common.py b/tools/llm_bench/llm_bench_utils/hook_common.py
index 4751ed7d4d..c805680cee 100644
--- a/tools/llm_bench/llm_bench_utils/hook_common.py
+++ b/tools/llm_bench/llm_bench_utils/hook_common.py
@@ -21,6 +21,8 @@ def get_bench_hook(num_beams, ov_model):
         else:
             bench_hook = llm_bench_utils.hook_greedy_search.GreedySearchHook()
         bench_hook.new_forward(ov_model)
+        if hasattr(ov_model, "get_multimodal_embeddings"):
+            bench_hook.new_get_multimodal_embeddings(ov_model)
     else:
         log.warning(f'The minimum version of transformers to get 1st and 2nd tokens latency of {search_type} is: {min_version}')
         bench_hook = None
diff --git a/tools/llm_bench/llm_bench_utils/hook_greedy_search.py b/tools/llm_bench/llm_bench_utils/hook_greedy_search.py
index 03bbd55ea4..9039a99e69 100644
--- a/tools/llm_bench/llm_bench_utils/hook_greedy_search.py
+++ b/tools/llm_bench/llm_bench_utils/hook_greedy_search.py
@@ -4,6 +4,7 @@
 # flake8: noqa
 import time
 import torch
+import types
 import warnings
 import logging as log
 import transformers
@@ -50,7 +51,7 @@ class GenerateEncoderDecoderOutput(ModelOutput):
 
 tm_list = []
 tm_infer_list = []
-
+tm_mm_embeddings = []
 # Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99
 # Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2310
 # Add the function of collecting latency
@@ -328,6 +329,17 @@ def new_greedy_search(
             return input_ids
 
 
+def new_get_multimodal_embeddings(
+        self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, **kwargs
+    ):
+
+    start = time.perf_counter()
+    result = self._orig_get_multimodal_embeddings(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, position_ids=position_ids, **kwargs)
+    end = time.perf_counter()
+    tm_mm_embeddings.append(end - start)
+    return result
+
+
 class GreedySearchHook:
     def __init__(self):
         """Clear the time list."""
@@ -355,6 +367,16 @@ def get_time_infer_list(self):
         global tm_infer_list
         return tm_infer_list
 
+
+    def get_mm_embeddings_time_list(self):
+        global tm_mm_embeddings
+        return tm_mm_embeddings
+
+    def clear_mm_embeddins_time_list(self):
+        """Clear the infer time list."""
+        global tm_mm_embeddings
+        tm_mm_embeddings.clear()
+
     def new_forward(self, model):
         """Define a new greedy search function."""
         model._greedy_search = new_greedy_search.__get__(model, model.__class__)
@@ -363,4 +385,8 @@ def new_forward(self, model):
         if trans_version >= version.parse('4.45.0'):
             model._sample = hook_sample_v45.new_sample.__get__(model, model.__class__)
         elif trans_version >= version.parse('4.43.0'):
-            model._sample = hook_sample_v43.new_sample.__get__(model, model.__class__)         
+            model._sample = hook_sample_v43.new_sample.__get__(model, model.__class__)  
+       
+    def new_get_multimodal_embeddings(self, model):
+        model._orig_get_multimodal_embeddings = model.get_multimodal_embeddings
+        model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model)
\ No newline at end of file
diff --git a/tools/llm_bench/llm_bench_utils/metrics_print.py b/tools/llm_bench/llm_bench_utils/metrics_print.py
index 73e83dc672..740d3b9bcc 100644
--- a/tools/llm_bench/llm_bench_utils/metrics_print.py
+++ b/tools/llm_bench/llm_bench_utils/metrics_print.py
@@ -26,6 +26,8 @@ def print_metrics(
         output_str += 'Tokenization Time: {:.2f}ms, '.format(tokenization_time[0])
         if len(tokenization_time) > 1:
             output_str += 'Detokenization Time: {:.2f}ms, '.format(tokenization_time[1])
+    if iter_data['mm_embeddings_preparation_time'] != '':
+        output_str += ' Multimodal Embeddings Preparation Time: {:.2f}ms, '.format(iter_data['mm_embeddings_preparation_time'])
     if iter_data['generation_time'] != '':
         output_str += 'Generation Time: {:.2f}s, '.format(iter_data['generation_time'])
     if iter_data['latency'] != '':
diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index f72557b6c5..f3e7d21777 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -13,38 +13,54 @@
 def get_param_from_file(args, input_key):
     is_json_data = False
     data_list = []
-    if args[input_key] is None and args['prompt_file'] is None:
-        if args['use_case'] == 'text_gen':
-            data_list.append('What is OpenVINO?')
-        elif args['use_case'] == 'code_gen':
-            data_list.append('def print_hello_world():')
-        elif args['use_case'] == 'image_gen':
-            data_list.append('sailing ship in storm by Leonardo da Vinci')
-        else:
-            raise RuntimeError(f'== {input_key} and prompt file is empty ==')
-    elif args[input_key] is not None and args['prompt_file'] is not None:
-        raise RuntimeError(f'== {input_key} and prompt file should not exist together ==')
-    else:
-        if args[input_key] is not None:
-            if args[input_key] != '':
-                data_list.append(args[input_key])
+    if args['prompt_file'] is None:
+        if not isinstance(input_key, (list, tuple)):
+            if args[input_key] is None:
+                if args['use_case'] == 'text_gen':
+                    data_list.append('What is OpenVINO?')
+                elif args['use_case'] == 'code_gen':
+                    data_list.append('def print_hello_world():')
+                elif args['use_case'] == 'image_gen':
+                    data_list.append('sailing ship in storm by Leonardo da Vinci')
+                else:
+                    raise RuntimeError(f'== {input_key} and prompt file is empty ==')
+
+            elif args[input_key] is not None and args['prompt_file'] is not None:
+                raise RuntimeError(f'== {input_key} and prompt file should not exist together ==')
             else:
-                raise RuntimeError(f'== {input_key} path should not be empty string ==')
-        else:
-            input_prompt_list = args['prompt_file']
-            is_json_data = True
-            for input_prompt in input_prompt_list:
-                if input_prompt.endswith('.jsonl'):
-                    if os.path.exists(input_prompt):
-                        log.info(f'Read prompts from {input_prompt}')
-                        with open(input_prompt, 'r', encoding='utf-8') as f:
-                            for line in f:
-                                data = json.loads(line)
-                                data_list.append(data)
+                if args[input_key] is not None:
+                    if args[input_key] != '':
+                        data_list.append(args[input_key])
                     else:
-                        raise RuntimeError(f'== The prompt file:{input_prompt} does not exist ==')
+                        raise RuntimeError(f'== {input_key} path should not be empty string ==')
+        else:
+            if args["use_case"] != "vlm":
+                raise RuntimeError("Multiple sources for benchmarking supported only for Visual Language Models")
+            data_dict = {}
+            if args["media"] is None:
+                log.warn("Input image is not provided. Only text generation part will be evaluated")
+            else:
+                data_dict["media"] = args["media"]
+            if args["prompt"] is None:
+                data_dict["prompt"] = "What is OpenVINO?" if args["media"] is None else "Describe image"
+            else:
+                data_dict["prompt"] = args["prompt"]
+            data_list.append(data_dict)
+    else:
+        input_prompt_list = args['prompt_file']
+        is_json_data = True
+        for input_prompt in input_prompt_list:
+            if input_prompt.endswith('.jsonl'):
+                if os.path.exists(input_prompt):
+                    log.info(f'Read prompts from {input_prompt}')
+                    with open(input_prompt, 'r', encoding='utf-8') as f:
+                        for line in f:
+                            data = json.loads(line)
+                            data_list.append(data)
                 else:
-                    raise RuntimeError(f'== The prompt file:{input_prompt} should be ended with .jsonl ==')
+                    raise RuntimeError(f'== The prompt file:{input_prompt} does not exist ==')
+            else:
+                raise RuntimeError(f'== The prompt file:{input_prompt} should be ended with .jsonl ==')
     return data_list, is_json_data
 
 
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index 8a28fbe355..427f1c84f3 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -23,6 +23,8 @@
 import queue
 from transformers.generation.streamers import BaseStreamer
 
+GENAI_SUPPORTED_VLM = ["llava", "llava-next", "internvl-chat", "minicpmv"]
+
 
 def generate_simplified(self, *args, **kwargs):
     if len(args):
@@ -523,6 +525,85 @@ def create_speech_2txt_model(model_path, device, **kwargs):
     return pipe, processor, from_pretrained_time, False
 
 
+def get_vlm_processor(model_path):
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    model_type = config.model_type
+    if model_type == "llava-qwen2":
+        processor = AutoProcessor.from_pretrained(config.mm_vision_tower, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        preprocessors = {"processor": processor, "tokenizer": tokenizer}
+    elif model_type == "internvl_chat":
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        preprocessors = {"processor": None, "tokenizer": tokenizer, "config": config}
+    else:
+        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+        preprocessors = {"processor": processor, "tokenizer": processor}
+    return preprocessors
+
+
+def create_genai_image_text_gen_model(model_path, device, ov_config, **kwargs):
+    import openvino_genai
+
+    if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists():
+        convert_ov_tokenizer(model_path)
+
+    processor_config = get_vlm_processor(model_path)
+
+    start = time.perf_counter()
+    llm_pipe = openvino_genai.VLMPipeline(model_path, device.upper(), **ov_config)
+    end = time.perf_counter()
+    log.info(f'Pipeline initialization time: {end - start:.2f}s')
+
+    return llm_pipe, processor_config, end - start, None, True
+
+
+def create_image_text_gen_model(model_path, device, **kwargs):
+    model_path = Path(model_path)
+    # specify the model path
+    if model_path.name.endswith('xml'):
+        model_path = model_path.parents[2]
+
+    ov_config = kwargs['config']
+
+    model_path_existed = Path(model_path).exists()
+    # load model
+    if not model_path_existed:
+        raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')
+    else:
+        remote_code = False
+        try:
+            model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=False)
+        except Exception:
+            model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+            remote_code = True
+        if kwargs.get("genai", True) and is_genai_available(log_msg=True):
+            if model_config.model_type.replace("_", "-") in GENAI_SUPPORTED_VLM:
+                log.info("Selected OpenVINO GenAI for benchmarking")
+                return create_genai_image_text_gen_model(model_path, device, ov_config, **kwargs)
+            else:
+                log.warning(
+                    f"Model type `{model_config.model_type}` is not supported by OpenVINO GenAI. "
+                    "Benchmark will be switched to Optimum Intel pipeline realization"
+                )
+
+        log.info("Selected Optimum Intel for benchmarking")
+        model_class = OV_MODEL_CLASSES_MAPPING.get(DEFAULT_MODEL_CLASSES[kwargs['use_case']])
+        start = time.perf_counter()
+        ov_model = model_class.from_pretrained(
+            model_path,
+            device=device,
+            ov_config=ov_config,
+            config=model_config,
+            trust_remote_code=remote_code
+        )
+        end = time.perf_counter()
+    bench_hook = get_bench_hook(kwargs['num_beams'], ov_model)
+    from_pretrained_time = end - start
+    log.info(f'From pretrained time: {from_pretrained_time:.2f}s')
+    processor_config = get_vlm_processor(model_path)
+    return ov_model, processor_config, from_pretrained_time, bench_hook, False
+
+
 def is_genai_available(log_msg=False):
     import importlib
     try:
diff --git a/tools/llm_bench/llm_bench_utils/parse_json_data.py b/tools/llm_bench/llm_bench_utils/parse_json_data.py
index 6e2978d9d6..28fbd298cd 100644
--- a/tools/llm_bench/llm_bench_utils/parse_json_data.py
+++ b/tools/llm_bench/llm_bench_utils/parse_json_data.py
@@ -16,6 +16,23 @@ def parse_text_json_data(json_data_list):
     return text_param_list
 
 
+def parse_vlm_json_data(json_data_list):
+    text_param_list = []
+    for json_data in json_data_list:
+        prompt_data = {}
+        if 'prompt' in json_data:
+            if json_data['prompt'] != '':
+                prompt_data["prompt"] = json_data['prompt']
+            else:
+                raise RuntimeError('== prompt should not be empty string ==')
+        else:
+            raise RuntimeError('== key word "prompt" does not exist ==')
+        if "media" in json_data_list:
+            prompt_data["media"] = json_data["media"]
+        text_param_list.append(prompt_data)
+    return text_param_list
+
+
 def parse_image_json_data(json_data_list):
     image_param_list = []
     for data in json_data_list:
diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py
new file mode 100644
index 0000000000..c4144366b4
--- /dev/null
+++ b/tools/llm_bench/task/visual_language_generation.py
@@ -0,0 +1,366 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2023-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+import os
+import time
+import datetime
+from pathlib import Path
+import logging as log
+import llm_bench_utils.ov_utils
+import llm_bench_utils.pt_utils
+import llm_bench_utils.model_utils as model_utils
+import numpy as np
+import openvino as ov
+import hashlib
+import llm_bench_utils.metrics_print as metrics_print
+import llm_bench_utils.output_csv
+from transformers import set_seed
+from transformers.image_utils import load_image
+import llm_bench_utils.output_json
+import llm_bench_utils.output_file
+import llm_bench_utils.gen_output_data as gen_output_data
+import llm_bench_utils.parse_json_data as parse_json_data
+
+FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils}
+
+DEFAULT_OUTPUT_TOKEN_SIZE = 512
+
+
+def run_visual_language_generation_optimum(
+    inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index, bench_hook, model_precision, proc_id, mem_consumption
+):
+    set_seed(args['seed'])
+    if args['batch_size'] != 1:
+        log.warning("Only batch size 1 available for benchmarking")
+        args["batch_size"] = 1
+    images = []
+    prompts = []
+    for input_data in inputs:
+        if "media" in input_data:
+            images.append(load_image(input_data["media"]))
+        prompts.append(input_data["prompt"])
+
+    if args["output_dir"] is not None and num == 0:
+        for bs_index, in_text in enumerate(prompts):
+            llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
+    tok_encode_start = time.perf_counter()
+    input_data = model.preprocess_inputs(text=prompts[0], image=images[0], **processor)
+    tok_encode_end = time.perf_counter()
+    tok_encode_time = (tok_encode_end - tok_encode_start) * 1000
+    # Remove `token_type_ids` from inputs
+    input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data
+    input_token_size = input_tokens[0].numel()
+    if args['batch_size'] > 1:
+        out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
+        out_str += " Batch_size={}, ".format(args['batch_size'])
+        out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size'])
+        if args['infer_count'] is not None:
+            out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
+        log.info(out_str)
+
+    max_rss_mem_consumption = ''
+    max_uss_mem_consumption = ''
+    max_shared_mem_consumption = ''
+    if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
+        mem_consumption.start_collect_memory_consumption()
+    max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
+    start = time.perf_counter()
+    if args['infer_count'] is not None and args['end_token_stopping'] is False:
+        model.generation_config.eos_token_id = None
+        model.config.eos_token_id = None
+        result = model.generate(
+            **input_data,
+            max_new_tokens=int(max_gen_tokens),
+            num_beams=args['num_beams'],
+            use_cache=True,
+            eos_token_id=None,
+            do_sample=False
+        )
+    else:
+        result = model.generate(
+            **input_data,
+            max_new_tokens=int(max_gen_tokens),
+            num_beams=args['num_beams'],
+            use_cache=True,
+            do_sample=False
+        )
+    end = time.perf_counter()
+    if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
+        mem_consumption.end_collect_momory_consumption()
+        max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
+        mem_consumption.clear_max_memory_consumption()
+
+    generation_time = end - start
+    tok_decode_start = time.perf_counter()
+    generated_text = processor["tokenizer"].batch_decode(result[:, input_data["input_ids"].shape[1]:], skip_special_tokens=True)
+    tok_decode_end = time.perf_counter()
+    tok_decode_time = (tok_decode_end - tok_decode_start) * 1000
+    # Only text_gen need to minus length of input_data, because generated_text may include input_text
+    num_tokens = 0
+    result_md5_list = []
+    for bs_idx in range(args['batch_size']):
+        generated_token_size = len(result[bs_idx]) - input_data["input_ids"][bs_idx].numel()
+        num_tokens += generated_token_size
+        if generated_token_size > max_gen_tokens:
+            log.error('Output token size is over max output token size!')
+        result_text = generated_text[bs_idx]
+        if args["output_dir"] is not None:
+            llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id)
+        result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest())
+    if len(md5_list[num]) == 0:
+        md5_list[num] = {prompt_index : result_md5_list}
+    else:
+        md5_list[num][prompt_index] = result_md5_list
+    per_token_time = ""
+    if num_tokens > 0:
+        per_token_time = generation_time * 1000 / (num_tokens / args['batch_size'])
+    else:
+        log.warning("No generated tokens")
+    tm_list = []
+    tm_infer_list = []
+    tm_mm_embeddings = ""
+    if bench_hook is not None:
+        tm_list = bench_hook.get_time_list()
+        tm_mm_embeddings = np.mean(bench_hook.get_mm_embeddings_time_list()) * 1000 * 1000
+        log.debug('latency of all tokens:')
+        [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
+        tm_infer_list = bench_hook.get_time_infer_list()
+        log.debug('latency of all infers:')
+        [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_infer_list)]
+        if args['num_beams'] == 1 and generated_token_size != len(tm_infer_list):
+            log.warning(f'Output token size({generated_token_size}) is not equal to infer count({len(tm_infer_list)})')
+    iter_data = gen_output_data.gen_iterate_data(
+        iter_idx=num,
+        in_size=input_token_size * args['batch_size'],
+        infer_count=len(tm_infer_list),
+        out_size=num_tokens,
+        gen_time=generation_time,
+        latency=per_token_time,
+        res_md5=result_md5_list,
+        max_rss_mem=max_rss_mem_consumption,
+        max_shared_mem=max_shared_mem_consumption,
+        max_uss_mem=max_uss_mem_consumption,
+        prompt_idx=prompt_index,
+        tokenization_time=(tok_encode_time, tok_decode_time),
+        mm_embeddings_preparation_time=tm_mm_embeddings
+    )
+    iter_data_list.append(iter_data)
+    metrics_print.print_metrics(
+        num,
+        iter_data,
+        tm_list,
+        tm_infer_list,
+        warm_up=(num == 0),
+        max_rss_mem=max_rss_mem_consumption,
+        max_shared_mem=max_shared_mem_consumption,
+        max_uss_mem=max_uss_mem_consumption,
+        tokenization_time=(tok_encode_time, tok_decode_time),
+        batch_size=args['batch_size'],
+        prompt_idx=prompt_index
+    )
+    if num > 0:
+        prev_md5 = md5_list[num - 1][prompt_index]
+        if result_md5_list != prev_md5:
+            log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
+                        f"is different from md5 of the {num - 1} iteration {prev_md5}")
+            metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
+            if not args.get("use_cb", False):
+                if num == 1:
+                    # if the device is CPU, throw exception
+                    if args['devices'].lower().startswith('cpu') is True:
+                        assert (result_md5_list == prev_md5)
+                else:
+                    # throw exception
+                    assert (result_md5_list == prev_md5)
+    else:
+        metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
+    if bench_hook is not None:
+        bench_hook.clear_time_list()
+        bench_hook.clear_time_infer_list()
+        bench_hook.clear_mm_embeddins_time_list()
+
+
+def load_image_genai(image_path):
+    pil_image = load_image(image_path)
+    image_data = np.array(pil_image.getdata()).reshape(1, pil_image.size[1], pil_image.size[0], 3).astype(np.uint8)
+    return ov.Tensor(image_data)
+
+
+def run_visual_language_generation_genai(
+    inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index, streamer, model_precision, proc_id, mem_consumption
+):
+    if args['batch_size'] != 1:
+        log.warning("Only batch size 1 available for benchmarking")
+        args["batch_size"] = 1
+    images = []
+    prompts = []
+    for input_data in inputs:
+        if "media" in input_data:
+            images.append(load_image_genai(input_data["media"]))
+        prompts.append(input_data["prompt"])
+    if args["output_dir"] is not None and num == 0:
+        for bs_index, in_text in enumerate(prompts):
+            llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
+    max_rss_mem_consumption = ''
+    max_uss_mem_consumption = ''
+    max_shared_mem_consumption = ''
+    if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
+        mem_consumption.start_collect_memory_consumption()
+    max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
+    gen_config = model.get_generation_config()
+    gen_config.max_new_tokens = max_gen_tokens
+    gen_config.num_beams = args["num_beams"]
+    gen_config.do_sample = False
+    start = time.perf_counter()
+    generation_result = model.generate(prompts[0], images=images[0], generation_config=gen_config)
+    end = time.perf_counter()
+    generated_text = generation_result.texts
+    perf_metrics = generation_result.perf_metrics
+    if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
+        mem_consumption.end_collect_momory_consumption()
+        max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
+        mem_consumption.clear_max_memory_consumption()
+
+    generation_time = end - start
+    result_md5_list = []
+    generated_text_len = perf_metrics.get_num_generated_tokens()
+    if generated_text_len > max_gen_tokens:
+        log.error('Output token size is over max output token size!')
+    result_text = generated_text[0]
+    if args["output_dir"] is not None:
+        llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, 0, proc_id)
+    result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest())
+    if len(md5_list[num]) == 0:
+        md5_list[num] = {prompt_index : result_md5_list}
+    else:
+        md5_list[num][prompt_index] = result_md5_list
+    per_token_time = ""
+    if generated_text_len > 0:
+        per_token_time = generation_time * 1000 / (generated_text_len / args['batch_size'])
+    else:
+        log.warning("No generated tokens")
+    first_token_time = (perf_metrics.get_ttft().mean - perf_metrics.raw_metrics.tokenization_durations[-1] / 1000) * args["batch_size"]
+    second_tokens_durations = (
+        np.array(perf_metrics.raw_metrics.m_new_token_times[1:])
+        - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
+    ).tolist()
+
+    tm_list = np.array([first_token_time] + second_tokens_durations) / 1000
+    log.debug('latency of all tokens:')
+    [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
+    tokenization_time = (
+        np.mean(perf_metrics.raw_metrics.tokenization_durations) / 1000,
+        np.mean(perf_metrics.raw_metrics.detokenization_durations) / 1000
+    )
+    iter_data = gen_output_data.gen_iterate_data(
+        iter_idx=num,
+        in_size=args['batch_size'] * perf_metrics.get_num_input_tokens(),
+        infer_count=len(tm_list),
+        out_size=generated_text_len,
+        gen_time=generation_time,
+        latency=per_token_time,
+        res_md5=result_md5_list,
+        max_rss_mem=max_rss_mem_consumption,
+        max_shared_mem=max_shared_mem_consumption,
+        max_uss_mem=max_uss_mem_consumption,
+        prompt_idx=prompt_index,
+        tokenization_time=tokenization_time,
+        mm_embeddings_preparation_time=perf_metrics.get_prepare_embeddings_duration().mean
+    )
+    iter_data_list.append(iter_data)
+    metrics_print.print_metrics(
+        num,
+        iter_data,
+        tm_list.tolist(),
+        None,
+        warm_up=(num == 0),
+        max_rss_mem=max_rss_mem_consumption,
+        max_shared_mem=max_shared_mem_consumption,
+        max_uss_mem=max_uss_mem_consumption,
+        tokenization_time=tokenization_time,
+        batch_size=args['batch_size'],
+        prompt_idx=prompt_index
+    )
+    if num > 0:
+        prev_md5 = md5_list[num - 1][prompt_index]
+        if result_md5_list != prev_md5:
+            log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
+                        f"is different from md5 of the {num - 1} iteration {prev_md5}")
+            metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
+    else:
+        metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
+
+
+def run_visual_language_generation_benchmark(model_path, framework, device, args, num_iters, mem_consumption):
+    model, processor, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_image_text_gen_model(model_path, device, **args)
+    model_precision = model_utils.get_model_precision(model_path.parts)
+    iter_data_list = []
+    md5_list = {num : {} for num in range(num_iters + 1)}
+    input_image_text_list = get_image_text_prompt(args)
+    if args['prompt_index'] is None:
+        prompt_idx_list = list(range(0, len(input_image_text_list)))
+        image_text_list = input_image_text_list
+    else:
+        prompt_idx_list = []
+        image_text_list = []
+        for i in args['prompt_index']:
+            if 0 <= i < len(input_image_text_list):
+                image_text_list.append(input_image_text_list[i])
+                prompt_idx_list.append(i)
+    if len(input_image_text_list) == 0:
+        raise RuntimeError('==Failure prompts is empty ==')
+    log.info(f"Numbeams: {args['num_beams']}, benchmarking iter nums(exclude warm-up): {num_iters}, "
+             f'prompt nums: {len(image_text_list)}, prompt idx: {prompt_idx_list}')
+
+    if not use_genai:
+        gen_fn = run_visual_language_generation_optimum
+    else:
+        gen_fn = run_visual_language_generation_genai
+
+    proc_id = os.getpid()
+    iter_timestamp = model_utils.init_timestamp(num_iters, image_text_list, prompt_idx_list)
+    if args['subsequent'] is False:
+        for num in range(num_iters + 1):
+            for idx, input_text in enumerate(image_text_list):
+                p_idx = prompt_idx_list[idx]
+                if num == 0:
+                    log.info(f'[warm-up][P{p_idx}] Input text: {input_text}')
+                iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat()
+                gen_fn(
+                    input_text, num, model, processor, args, iter_data_list, md5_list,
+                    p_idx, bench_hook, model_precision, proc_id, mem_consumption)
+                iter_timestamp[num][p_idx]['end'] = datetime.datetime.now().isoformat()
+                prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
+                log.info(f"{prefix}[P{p_idx}] start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}")
+    else:
+        for idx, input_text in enumerate(image_text_list):
+            p_idx = prompt_idx_list[idx]
+            for num in range(num_iters + 1):
+                if num == 0:
+                    log.info(f'[warm-up][P{p_idx}] Input text: {input_text}')
+                iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat()
+                gen_fn(
+                    input_text, num, model, processor, args, iter_data_list, md5_list,
+                    prompt_idx_list[idx], bench_hook, model_precision, proc_id, mem_consumption)
+                iter_timestamp[num][p_idx]['end'] = datetime.datetime.now().isoformat()
+                prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
+                log.info(f"{prefix}[P{p_idx}] start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}")
+
+    metrics_print.print_average(iter_data_list, prompt_idx_list, args['batch_size'], True)
+    return iter_data_list, pretrain_time, iter_timestamp
+
+
+def get_image_text_prompt(args):
+    vlm_file_list = []
+    output_data_list, is_json_data = model_utils.get_param_from_file(args, ['media', "prompt"])
+    if is_json_data:
+        vlm_param_list = parse_json_data.parse_vlm_json_data(output_data_list)
+        if len(vlm_param_list) > 0:
+            for vlm_file in vlm_param_list:
+                if args['prompt_file'] is not None and len(args['prompt_file']) > 0:
+                    vlm_file['media'] = os.path.join(os.path.dirname(args['prompt_file'][0]), vlm_file['media'].replace('./', ''))
+                    vlm_file['media'] = Path(vlm_file['media'])
+                vlm_file_list.append(vlm_file)
+    else:
+        vlm_file_list.append(output_data_list)
+    return vlm_file_list

From f598639fa23530457aece2d0f9a0527f82be0c3f Mon Sep 17 00:00:00 2001
From: tongqiu <tong.qiu@intel.com>
Date: Tue, 17 Dec 2024 22:52:43 +0800
Subject: [PATCH 021/110] Add workaround for MSVC mutex constructor issue 
 (#1367)

This issue is a MSVC compiler bug affecting certain versions of Visual
Studio 2022. When using `std::mutex` a null dereference may occur,
leading to a silent crash in Release mode, as illustrated in the image
below.

![mutex](https://github.com/user-attachments/assets/07331f59-7e6d-47b4-a72a-887e01817fa8)

Adding the compiler option `/D"_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR"
`serves as a workaround for this problem.
Reference:
https://hydrogenaud.io/index.php/topic,126070.0.html
https://github.com/microsoft/STL/wiki/Changelog#vs-2022-1710
---
 CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 35ca895abc..fec8df34af 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,13 @@ if(WIN32 OR APPLE)
   set(CMAKE_DEBUG_POSTFIX "d")
 endif()
 
+# Workaround for an MSVC compiler issue in some versions of Visual Studio 2022.
+# The issue involves a null dereference to a mutex. For details, refer to link https://github.com/microsoft/STL/wiki/Changelog#vs-2022-1710
+if(MSVC AND MSVC_VERSION GREATER_EQUAL 1930 AND MSVC_VERSION LESS 1941)
+    add_compile_definitions(_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)
+endif()
+
+
 add_subdirectory(thirdparty)
 add_subdirectory(src)
 if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/samples")

From c6af2f12d7e85f14c5b8260f43d03a7a32508ddc Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 17 Dec 2024 18:53:13 +0400
Subject: [PATCH 022/110] [Image to image] PNDM support (#1394)

Continuation for
https://github.com/openvinotoolkit/openvino.genai/pull/1393

CVS-158967
---
 samples/cpp/image_generation/README.md           | 4 ++++
 samples/cpp/image_generation/inpainting.bmp      | 3 +++
 samples/python/image_generation/README.md        | 4 ++++
 src/cpp/src/image_generation/schedulers/pndm.cpp | 8 ++++++++
 src/python/openvino_genai/py_openvino_genai.pyi  | 5 ++++-
 src/python/py_image_generation_pipelines.cpp     | 3 ++-
 tools/llm_bench/llm_bench_utils/ov_utils.py      | 3 ++-
 7 files changed, 27 insertions(+), 3 deletions(-)
 create mode 100644 samples/cpp/image_generation/inpainting.bmp

diff --git a/samples/cpp/image_generation/README.md b/samples/cpp/image_generation/README.md
index f8dc21cc39..3dcb64b97c 100644
--- a/samples/cpp/image_generation/README.md
+++ b/samples/cpp/image_generation/README.md
@@ -143,4 +143,8 @@ And run the sample:
 
 `./inpainting ./stable-diffusion-2-inpainting 'Face of a yellow cat, high resolution, sitting on a park bench' image.png mask_image.png`
 
+The resuling image is:
+
+   ![](./inpainting.bmp)
+
 Note, that LoRA, heterogeneous execution and other features of `Text2ImagePipeline` are applicable for `InpaintingPipeline`.
diff --git a/samples/cpp/image_generation/inpainting.bmp b/samples/cpp/image_generation/inpainting.bmp
new file mode 100644
index 0000000000..b93292e075
--- /dev/null
+++ b/samples/cpp/image_generation/inpainting.bmp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:527cee8f7d451c7e5004bc58c079d4c853443644eaeb2d84a343016cd25214c1
+size 786486
diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md
index 3e53f40fc4..13b4ea6ee0 100644
--- a/samples/python/image_generation/README.md
+++ b/samples/python/image_generation/README.md
@@ -142,4 +142,8 @@ And run the sample:
 
 `python inpainting.py ./stable-diffusion-2-inpainting 'Face of a yellow cat, high resolution, sitting on a park bench' image.png mask_image.png`
 
+The resuling image is:
+
+   ![](./../../cpp/image_generation/inpainting.bmp)
+
 Note, that LoRA, heterogeneous execution and other features of `Text2ImagePipeline` are applicable for `InpaintingPipeline`.
diff --git a/src/cpp/src/image_generation/schedulers/pndm.cpp b/src/cpp/src/image_generation/schedulers/pndm.cpp
index a760283b97..4ddc099d0e 100644
--- a/src/cpp/src/image_generation/schedulers/pndm.cpp
+++ b/src/cpp/src/image_generation/schedulers/pndm.cpp
@@ -132,6 +132,14 @@ void PNDMScheduler::set_timesteps(size_t num_inference_steps, float strength) {
     m_ets = {};
     m_counter = 0;
     m_cur_sample = ov::Tensor(ov::element::f32, {});
+
+    // apply 'strength' used in image generation
+    // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L711
+    {
+        size_t init_timestep = std::min<size_t>(num_inference_steps * strength, num_inference_steps);
+        size_t t_start = std::max<size_t>(num_inference_steps - init_timestep, 0);
+        m_timesteps = std::vector<int64_t>(m_timesteps.begin() + t_start, m_timesteps.end());
+    }
 }
 
 std::map<std::string, ov::Tensor> PNDMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) {
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 8b8eb76b12..6135a187eb 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -1338,6 +1338,8 @@ class Scheduler:
           EULER_DISCRETE
         
           FLOW_MATCH_EULER_DISCRETE
+        
+          PNDM
         """
         AUTO: typing.ClassVar[Scheduler.Type]  # value = <Type.AUTO: 0>
         DDIM: typing.ClassVar[Scheduler.Type]  # value = <Type.DDIM: 3>
@@ -1345,7 +1347,8 @@ class Scheduler:
         FLOW_MATCH_EULER_DISCRETE: typing.ClassVar[Scheduler.Type]  # value = <Type.FLOW_MATCH_EULER_DISCRETE: 5>
         LCM: typing.ClassVar[Scheduler.Type]  # value = <Type.LCM: 1>
         LMS_DISCRETE: typing.ClassVar[Scheduler.Type]  # value = <Type.LMS_DISCRETE: 2>
-        __members__: typing.ClassVar[dict[str, Scheduler.Type]]  # value = {'AUTO': <Type.AUTO: 0>, 'LCM': <Type.LCM: 1>, 'LMS_DISCRETE': <Type.LMS_DISCRETE: 2>, 'DDIM': <Type.DDIM: 3>, 'EULER_DISCRETE': <Type.EULER_DISCRETE: 4>, 'FLOW_MATCH_EULER_DISCRETE': <Type.FLOW_MATCH_EULER_DISCRETE: 5>}
+        PNDM: typing.ClassVar[Scheduler.Type]  # value = <Type.PNDM: 6>
+        __members__: typing.ClassVar[dict[str, Scheduler.Type]]  # value = {'AUTO': <Type.AUTO: 0>, 'LCM': <Type.LCM: 1>, 'LMS_DISCRETE': <Type.LMS_DISCRETE: 2>, 'DDIM': <Type.DDIM: 3>, 'EULER_DISCRETE': <Type.EULER_DISCRETE: 4>, 'FLOW_MATCH_EULER_DISCRETE': <Type.FLOW_MATCH_EULER_DISCRETE: 5>, 'PNDM': <Type.PNDM: 6>}
         def __eq__(self, other: typing.Any) -> bool:
             ...
         def __getstate__(self) -> int:
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index da6ce6d21b..f5347c279d 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -197,7 +197,8 @@ void init_image_generation_pipelines(py::module_& m) {
         .value("LMS_DISCRETE", ov::genai::Scheduler::Type::LMS_DISCRETE)
         .value("DDIM", ov::genai::Scheduler::Type::DDIM)
         .value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE)
-        .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE);
+        .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE)
+        .value("PNDM", ov::genai::Scheduler::Type::PNDM);
     image_generation_scheduler.def_static("from_config",
         &ov::genai::Scheduler::from_config,
         py::arg("scheduler_config_path"),
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index 427f1c84f3..c3df84925b 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -420,7 +420,8 @@ def get_vae_decoder_step_count(self):
     start = time.perf_counter()
 
     scheduler_type = data.get("scheduler", ["", ""])[1]
-    if (scheduler_type not in ["LCMScheduler", "DDIMScheduler", "LMSDiscreteScheduler", "EulerDiscreteScheduler", "FlowMatchEulerDiscreteScheduler"]):
+    if (scheduler_type not in ["LCMScheduler", "DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler", "EulerDiscreteScheduler",
+                               "FlowMatchEulerDiscreteScheduler"]):
         scheduler = openvino_genai.Scheduler.from_config(model_path / "scheduler/scheduler_config.json", openvino_genai.Scheduler.Type.DDIM)
         log.warning(f'Type of scheduler {scheduler_type} is unsupported. Please, be aware that it will be replaced to DDIMScheduler')
 

From 79f64a6541558a66e7d55b36990b26dbcf5ebf4b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 17 Dec 2024 18:53:50 +0400
Subject: [PATCH 023/110] [Inpainting] Added single channel mask support
 (#1398)

Current PR brings a single channel masks support (both GRAY and BINARY;
GRAY is converted in BINARY anyway within mask image processor)
Based on passed mask type, we dynamically select proper mask processor
and convert all mask images types to BINARY.

CVS-159222
---
 .../image_generation/image2image_pipeline.hpp    |  9 ++++++++-
 .../image_generation/inpainting_pipeline.hpp     |  9 ++++++++-
 .../image_generation/text2image_pipeline.hpp     |  2 +-
 src/cpp/src/image_generation/image_processor.cpp | 16 ++++++----------
 src/cpp/src/image_generation/image_processor.hpp |  4 ++--
 .../stable_diffusion_pipeline.hpp                | 14 ++++++++------
 6 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
index ea02969c5e..c6c1f59c88 100644
--- a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
@@ -67,7 +67,14 @@ class OPENVINO_GENAI_EXPORTS Image2ImagePipeline {
         return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
     }
 
-    // Returns a tensor with the following dimensions [num_images_per_prompt, height, width, 3]
+    /**
+     * Peforms initial image editing conditioned on a text prompt.
+     * @param positive_prompt Prompt to generate image(s) from
+     * @param initial_image RGB/BGR image of [1, height, width, 3] shape used to initialize latent image
+     * @param properties Image generation parameters specified as properties. Values in 'properties' override default value for generation parameters.
+     * @returns A tensor which has dimensions [num_images_per_prompt, height, width, 3]
+     * @note Output image size is the same as initial image size, but rounded down to be divisible by VAE scale factor (usually, 8)
+     */
     ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, const ov::AnyMap& properties = {});
 
     template <typename... Properties>
diff --git a/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp
index 6eead673e4..03dd9468f7 100644
--- a/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp
@@ -89,7 +89,14 @@ class OPENVINO_GENAI_EXPORTS InpaintingPipeline {
         return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
     }
 
-    // Returns a tensor with the following dimensions [num_images_per_prompt, height, width, 3]
+    /**
+     * Inpaints an initial image within an area defined by mask and conditioned on prompt
+     * @param positive_prompt Prompt to generate image(s) from
+     * @param initial_image RGB/BGR image of [1, height, width, 3] shape used to initialize latent image
+     * @param mask_image RGB/BGR or GRAY/BINARY image of [1, height, width, 3 or 1] shape used as a mask
+     * @param properties Image generation parameters specified as properties. Values in 'properties' override default value for generation parameters.
+     * @returns A tensor which has dimensions [num_images_per_prompt, height, width, 3]
+     */
     ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask_image, const ov::AnyMap& properties = {});
 
     template <typename... Properties>
diff --git a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp
index 34b9d6e341..3dc1fc0803 100644
--- a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp
@@ -200,7 +200,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
     }
 
     /**
-     * Generates image(s) based on prompt and other image generarion parameters
+     * Generates image(s) based on prompt and other image generation parameters
      * @param positive_prompt Prompt to generate image(s) from
      * @param properties Image generation parameters specified as properties. Values in 'properties' override default value for generation parameters.
      * @returns A tensor which has dimensions [num_images_per_prompt, height, width, 3]
diff --git a/src/cpp/src/image_generation/image_processor.cpp b/src/cpp/src/image_generation/image_processor.cpp
index 8c73ee2da0..1e168da33b 100644
--- a/src/cpp/src/image_generation/image_processor.cpp
+++ b/src/cpp/src/image_generation/image_processor.cpp
@@ -41,34 +41,30 @@ void IImageProcessor::compile(std::shared_ptr<ov::Model> model) {
     m_request = utils::singleton_core().compile_model(model, m_device).create_infer_request();
 }
 
-ImageProcessor::ImageProcessor(const std::string& device, bool do_normalize, bool do_binarize) :
+ImageProcessor::ImageProcessor(const std::string& device, bool do_normalize, bool do_binarize, bool gray_scale_source) :
     IImageProcessor(device) {
     auto image_processor_model = create_empty_model();
-    merge_image_preprocessing(image_processor_model, do_normalize, do_binarize);
+    merge_image_preprocessing(image_processor_model, do_normalize, do_binarize, gray_scale_source);
 
     compile(image_processor_model);
 }
 
-void ImageProcessor::merge_image_preprocessing(std::shared_ptr<ov::Model> model, bool do_normalize, bool do_binarize) {
+void ImageProcessor::merge_image_preprocessing(std::shared_ptr<ov::Model> model, bool do_normalize, bool do_binarize, bool gray_scale_source) {
     OPENVINO_ASSERT(do_normalize ^ do_binarize, "Both binarize and normalize are not supported");
 
     // https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L90-L110
     ov::preprocess::PrePostProcessor ppp(model);
 
+    ov::preprocess::ColorFormat source_color_format = gray_scale_source ? ov::preprocess::ColorFormat::GRAY : ov::preprocess::ColorFormat::RGB;
+
     ppp.input().tensor()
         .set_layout("NHWC")
         .set_element_type(ov::element::u8)
-        .set_color_format(ov::preprocess::ColorFormat::BGR);
+        .set_color_format(source_color_format);
     ppp.input().model()
         .set_layout("NCHW");
 
     if (do_normalize) {
-        ppp.input().tensor().set_layout("NHWC");
-        ppp.input().model().set_layout("NCHW");
-
-        ppp.input().tensor()
-            .set_element_type(ov::element::u8);
-
         ppp.input().preprocess()
             .convert_layout()
             .convert_element_type(ov::element::f32)
diff --git a/src/cpp/src/image_generation/image_processor.hpp b/src/cpp/src/image_generation/image_processor.hpp
index d0ef7532aa..8c62742006 100644
--- a/src/cpp/src/image_generation/image_processor.hpp
+++ b/src/cpp/src/image_generation/image_processor.hpp
@@ -28,9 +28,9 @@ class IImageProcessor {
 
 class ImageProcessor : public IImageProcessor {
 public:
-    explicit ImageProcessor(const std::string& device, bool do_normalize = true, bool do_binarize = false);
+    explicit ImageProcessor(const std::string& device, bool do_normalize = true, bool do_binarize = false, bool gray_scale_source = false);
 
-    static void merge_image_preprocessing(std::shared_ptr<ov::Model> model, bool do_normalize = true, bool do_binarize = false);
+    static void merge_image_preprocessing(std::shared_ptr<ov::Model> model, bool do_normalize = true, bool do_binarize = false, bool gray_scale_source = false);
 };
 
 class ImageResizer {
diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
index 7549b67919..3801c855fd 100644
--- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
@@ -33,14 +33,15 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         const std::string device = "CPU";
 
         if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) {
-            const bool do_normalize = true, do_binarize = false;
-            m_image_processor = std::make_shared<ImageProcessor>(device, do_normalize, do_binarize);
+            const bool do_normalize = true, do_binarize = false, gray_scale_source = false;
+            m_image_processor = std::make_shared<ImageProcessor>(device, do_normalize, do_binarize, gray_scale_source);
             m_image_resizer = std::make_shared<ImageResizer>(device, ov::element::u8, "NHWC", ov::op::v11::Interpolate::InterpolateMode::BICUBIC_PILLOW);
         }
 
         if (m_pipeline_type == PipelineType::INPAINTING) {
-            const bool do_normalize = false, do_binarize = true;
-            m_mask_processor = std::make_shared<ImageProcessor>(device, do_normalize, do_binarize);
+            bool do_normalize = false, do_binarize = true;
+            m_mask_processor_rgb = std::make_shared<ImageProcessor>(device, do_normalize, do_binarize, false);
+            m_mask_processor_gray = std::make_shared<ImageProcessor>(device, do_normalize, do_binarize, true);
             m_mask_resizer = std::make_shared<ImageResizer>(device, ov::element::f32, "NCHW", ov::op::v11::Interpolate::InterpolateMode::NEAREST);
         }
     }
@@ -267,7 +268,8 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         ov::Shape target_shape = processed_image.get_shape();
 
         ov::Tensor mask_condition = m_image_resizer->execute(mask_image, target_shape[2], target_shape[3]);
-        mask_condition = m_mask_processor->execute(mask_condition);
+        std::shared_ptr<IImageProcessor> mask_processor = mask_condition.get_shape()[3] == 1 ? m_mask_processor_gray : m_mask_processor_rgb;
+        mask_condition = mask_processor->execute(mask_condition);
 
         // resize mask to shape of latent space
         ov::Tensor mask = m_mask_resizer->execute(mask_condition, target_shape[2] / vae_scale_factor, target_shape[3] / vae_scale_factor);
@@ -501,7 +503,7 @@ class StableDiffusionPipeline : public DiffusionPipeline {
     std::shared_ptr<CLIPTextModel> m_clip_text_encoder = nullptr;
     std::shared_ptr<UNet2DConditionModel> m_unet = nullptr;
     std::shared_ptr<AutoencoderKL> m_vae = nullptr;
-    std::shared_ptr<IImageProcessor> m_image_processor = nullptr, m_mask_processor = nullptr;
+    std::shared_ptr<IImageProcessor> m_image_processor = nullptr, m_mask_processor_rgb = nullptr, m_mask_processor_gray = nullptr;
     std::shared_ptr<ImageResizer> m_image_resizer = nullptr, m_mask_resizer = nullptr;
 };
 

From b31b6a152c3771bb92427b85cd85cc5ebd514f36 Mon Sep 17 00:00:00 2001
From: guozhong wang <guozhong.wang@intel.com>
Date: Wed, 18 Dec 2024 01:08:54 +0800
Subject: [PATCH 024/110] Enable print properties of compiled model in genai
 API (#1289)

When setting the environment variable OPENVINO_LOG_LEVEL >
ov::log::Level::WARNING, the properties of the compiled model can be
printed in genai API.

When the device is CPU, the properties of the compiled model are as
follows:
Model: Stateful LLM model
  NETWORK_NAME: Model0
  OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1
  NUM_STREAMS: 1
  INFERENCE_NUM_THREADS: 48
  PERF_COUNT: NO
  INFERENCE_PRECISION_HINT: bf16
  PERFORMANCE_HINT: LATENCY
  EXECUTION_MODE_HINT: PERFORMANCE
  PERFORMANCE_HINT_NUM_REQUESTS: 0
  ENABLE_CPU_PINNING: YES
  SCHEDULING_CORE_TYPE: ANY_CORE
  MODEL_DISTRIBUTION_POLICY:
  ENABLE_HYPER_THREADING: NO
  EXECUTION_DEVICES: CPU
  CPU_DENORMALS_OPTIMIZATION: NO
  LOG_LEVEL: LOG_NONE
  CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1
  DYNAMIC_QUANTIZATION_GROUP_SIZE: 32
  KV_CACHE_PRECISION: f16
  AFFINITY: CORE
EXECUTION_DEVICES:
 CPU: Intel(R) Xeon(R) Platinum 8468


[stable_diffusion_compiled_model_log.txt](https://github.com/user-attachments/files/18120641/stable_diffusion_compiled_model_log.txt)

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .github/workflows/llm_bench-python.yml        |  1 +
 src/README.md                                 |  4 ++
 src/cpp/src/continuous_batching_impl.cpp      |  4 +-
 .../models/autoencoder_kl.cpp                 |  2 +
 .../models/clip_text_model.cpp                |  1 +
 .../clip_text_model_with_projection.cpp       |  1 +
 .../models/flux_transformer_2d_model.cpp      |  1 +
 .../models/sd3_transformer_2d_model.cpp       |  1 +
 .../models/t5_encoder_model.cpp               |  1 +
 .../models/unet_inference_dynamic.hpp         |  1 +
 .../models/unet_inference_static_bs1.hpp      |  1 +
 src/cpp/src/llm_pipeline.cpp                  |  8 +++-
 src/cpp/src/llm_pipeline_static.cpp           | 13 +++---
 src/cpp/src/lora_adapter.cpp                  |  4 +-
 src/cpp/src/tokenizer.cpp                     |  2 +
 src/cpp/src/utils.cpp                         | 37 ++++++++++++++++
 src/cpp/src/utils.hpp                         |  2 +
 .../src/visual_language/embedding_model.cpp   |  1 +
 .../src/visual_language/inputs_embedder.cpp   |  7 +--
 src/cpp/src/visual_language/pipeline.cpp      |  2 +-
 .../src/visual_language/vision_encoder.cpp    | 10 +++--
 src/cpp/src/whisper_pipeline.cpp              | 21 +++++----
 src/cpp/src/whisper_pipeline_static.cpp       | 13 ++++--
 src/docs/DEBUG_LOG.md                         | 43 +++++++++++++++++++
 24 files changed, 152 insertions(+), 29 deletions(-)
 create mode 100644 src/docs/DEBUG_LOG.md

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index 3d31649cea..6903882ca0 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -61,6 +61,7 @@ jobs:
       SRC_DIR: ${{ github.workspace }}
       LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
       WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
+      OPENVINO_LOG_LEVEL: 3
 
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
diff --git a/src/README.md b/src/README.md
index c90bc8f4e4..6466b431d0 100644
--- a/src/README.md
+++ b/src/README.md
@@ -403,3 +403,7 @@ For information on how OpenVINO™ GenAI works, refer to the [How It Works Secti
 ## Supported Models
 
 For a list of supported models, refer to the [Supported Models Section](./docs/SUPPORTED_MODELS.md).
+
+## Debug Log
+
+For using debug log, refer to [DEBUG Log](./doc/DEBUG_LOG.md).
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 1e42f5b2d9..bf0c979d39 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -46,7 +46,9 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
     const ov::AnyMap& properties,
     const DeviceConfig& device_config,
     ov::Core& core) {
-    ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), properties).create_infer_request();
+    auto compiled_model = core.compile_model(model, device_config.get_device(), properties);
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "LLM with Paged Attention");
+    ov::InferRequest infer_request = compiled_model.create_infer_request();
 
     // setup KV caches
     m_cache_manager = std::make_shared<CacheManager>(device_config, core);
diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
index e0d6a44189..d3dd7324ee 100644
--- a/src/cpp/src/image_generation/models/autoencoder_kl.cpp
+++ b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
@@ -212,12 +212,14 @@ AutoencoderKL& AutoencoderKL::compile(const std::string& device, const ov::AnyMa
 
     if (m_encoder_model) {
         ov::CompiledModel encoder_compiled_model = core.compile_model(m_encoder_model, device, properties);
+        ov::genai::utils::print_compiled_model_properties(encoder_compiled_model, "Auto encoder KL encoder model");
         m_encoder_request = encoder_compiled_model.create_infer_request();
         // release the original model
         m_encoder_model.reset();
     }
 
     ov::CompiledModel decoder_compiled_model = core.compile_model(m_decoder_model, device, properties);
+    ov::genai::utils::print_compiled_model_properties(decoder_compiled_model, "Auto encoder KL decoder model");
     m_decoder_request = decoder_compiled_model.create_infer_request();
     // release the original model
     m_decoder_model.reset();
diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp
index d2dab30bcf..efbc840d4f 100644
--- a/src/cpp/src/image_generation/models/clip_text_model.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model.cpp
@@ -97,6 +97,7 @@ CLIPTextModel& CLIPTextModel::compile(const std::string& device, const ov::AnyMa
     } else {
         compiled_model = core.compile_model(m_model, device, properties);
     }
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "Clip Text model");
     m_request = compiled_model.create_infer_request();
     // release the original model
     m_model.reset();
diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
index 13c7f5a442..982800a701 100644
--- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
@@ -88,6 +88,7 @@ CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::str
     } else {
         compiled_model = core.compile_model(m_model, device, properties);
     }
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "Clip Text with projection model");
     m_request = compiled_model.create_infer_request();
     // release the original model
     m_model.reset();
diff --git a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
index 6b28b116b0..b09f099655 100644
--- a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
@@ -108,6 +108,7 @@ FluxTransformer2DModel& FluxTransformer2DModel::reshape(int batch_size,
 FluxTransformer2DModel& FluxTransformer2DModel::compile(const std::string& device, const ov::AnyMap& properties) {
     OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
     ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, properties);
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "Flux Transformer 2D model");
     m_request = compiled_model.create_infer_request();
     // release the original model
     m_model.reset();
diff --git a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
index 70dddb0476..33771f2316 100644
--- a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
@@ -105,6 +105,7 @@ SD3Transformer2DModel& SD3Transformer2DModel::reshape(int batch_size,
 SD3Transformer2DModel& SD3Transformer2DModel::compile(const std::string& device, const ov::AnyMap& properties) {
     OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
     ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, properties);
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "SD3 Transformer 2D model");
     m_request = compiled_model.create_infer_request();
     // release the original model
     m_model.reset();
diff --git a/src/cpp/src/image_generation/models/t5_encoder_model.cpp b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
index 8c6df34667..21df456d46 100644
--- a/src/cpp/src/image_generation/models/t5_encoder_model.cpp
+++ b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
@@ -63,6 +63,7 @@ T5EncoderModel& T5EncoderModel::compile(const std::string& device, const ov::Any
     ov::Core core = utils::singleton_core();
     ov::CompiledModel compiled_model;
     compiled_model = core.compile_model(m_model, device, properties);
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "T5 encoder model");
     m_request = compiled_model.create_infer_request();
     // release the original model
     m_model.reset();
diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
index c8658a1c1a..6dc285f76d 100644
--- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
@@ -20,6 +20,7 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::
         ov::Core core = utils::singleton_core();
 
         ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
+        ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition dynamic model");
         m_request = compiled_model.create_infer_request();
     }
 
diff --git a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
index fcde31e9ee..7aa6f6301c 100644
--- a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
@@ -40,6 +40,7 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel
 
         ov::Core core = utils::singleton_core();
         ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
+        ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition batch-1 model");
 
         for (int i = 0; i < m_native_batch_size; i++)
         {
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index f663b27dd9..6d9aae30fa 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -77,6 +77,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         const ov::genai::GenerationConfig& generation_config
     ) : LLMPipelineImplBase(tokenizer, generation_config) {
         ov::Core core;
+        ov::CompiledModel compiled_model;
         auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
         utils::slice_matmul_statefull_model(model);
         m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
@@ -84,10 +85,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
             m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
             m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device);   // TODO: Make the prefix name configurable
-            m_model_runner = core.compile_model(model, device, *filtered_plugin_config).create_infer_request();
+            compiled_model = core.compile_model(model, device, *filtered_plugin_config);
+            m_model_runner = compiled_model.create_infer_request();
         } else {
-            m_model_runner = core.compile_model(model, device, plugin_config).create_infer_request();
+            compiled_model = core.compile_model(model, device, plugin_config);
+            m_model_runner = compiled_model.create_infer_request();
         }
+        ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
 
         // If eos_token_id was not provided, take value
         if (m_generation_config.eos_token_id == -1)
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index cb83209b4b..090aed9650 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -777,12 +777,15 @@ void StaticLLMPipeline::setupAndCompileModels(
     set_npuw_cache_dir(prefill_config);
     set_npuw_cache_dir(generate_config);
 
-    m_kvcache_request = core.compile_model(
+    auto kv_compiled_model = core.compile_model(
         kvcache_model, device, generate_config
-    ).create_infer_request();
-    m_prefill_request = core.compile_model(
-        prefill_model, device, prefill_config
-    ).create_infer_request();
+    );
+    ov::genai::utils::print_compiled_model_properties(kv_compiled_model, "Static LLM kv compiled model");
+    m_kvcache_request = kv_compiled_model.create_infer_request();
+
+    auto prefill_compiled_model = core.compile_model(prefill_model, device, prefill_config);
+    m_prefill_request = prefill_compiled_model.create_infer_request();
+    ov::genai::utils::print_compiled_model_properties(prefill_compiled_model, "Static LLM prefill compiled model");
 }
 
 void StaticLLMPipeline::setupAndImportModels(
diff --git a/src/cpp/src/lora_adapter.cpp b/src/cpp/src/lora_adapter.cpp
index 5e8839513e..fd446ef708 100644
--- a/src/cpp/src/lora_adapter.cpp
+++ b/src/cpp/src/lora_adapter.cpp
@@ -637,7 +637,9 @@ class InferRequestSignatureCache {
 
         ov::Core core = ov::genai::utils::singleton_core();
         auto model = std::make_shared<ov::Model>(request_results, request_parameters);
-        rwb.request = core.compile_model(model, device).create_infer_request();
+        auto compiled_model = core.compile_model(model, device);
+        ov::genai::utils::print_compiled_model_properties(compiled_model, "Infer Request Signature Cache");
+        rwb.request = compiled_model.create_infer_request();
         requests.emplace(signature, rwb);
     }
 
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index cff25f07f8..642236d32a 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -203,6 +203,7 @@ class Tokenizer::TokenizerImpl {
             manager.register_pass<MakeCombineSegmentsSatateful>();
             manager.run_passes(ov_tokenizer);
             m_tokenizer = core.compile_model(ov_tokenizer, device, properties);
+            ov::genai::utils::print_compiled_model_properties(m_tokenizer, "OV Tokenizer");
 
             m_ireq_queue_tokenizer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
                 m_tokenizer.get_property(ov::optimal_number_of_infer_requests),
@@ -216,6 +217,7 @@ class Tokenizer::TokenizerImpl {
             manager_detok.register_pass<MakeVocabDecoderSatateful>();
             manager_detok.run_passes(ov_detokenizer);
             m_detokenizer = core.compile_model(ov_detokenizer, device, properties);
+            ov::genai::utils::print_compiled_model_properties(m_detokenizer, "OV Detokenizer");
 
             m_ireq_queue_detokenizer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
                 m_detokenizer.get_property(ov::optimal_number_of_infer_requests),
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index 3690920295..9fa14b7f9f 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -381,6 +381,43 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t se
     }
 }
 
+void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const char* model_title) {
+    // Specify the name of the environment variable
+    const char* env_var_name = "OPENVINO_LOG_LEVEL";
+    const char* env_var_value = std::getenv(env_var_name);
+
+    // Check if the environment variable was found
+    if (env_var_value != nullptr && atoi(env_var_value) > static_cast<int>(ov::log::Level::WARNING)) {
+        // output of the actual settings that the device selected
+        auto supported_properties = compiled_Model.get_property(ov::supported_properties);
+        std::cout << "Model: " << model_title << std::endl;
+        for (const auto& cfg : supported_properties) {
+            if (cfg == ov::supported_properties)
+                continue;
+            auto prop = compiled_Model.get_property(cfg);
+            if (cfg == ov::device::properties) {
+                auto devices_properties = prop.as<ov::AnyMap>();
+                for (auto& item : devices_properties) {
+                    std::cout << "  " << item.first << ": " << std::endl;
+                    for (auto& item2 : item.second.as<ov::AnyMap>()) {
+                        std::cout << "    " << item2.first << ": " << item2.second.as<std::string>() << std::endl;
+                    }
+                }
+            } else {
+                std::cout << "  " << cfg << ": " << prop.as<std::string>() << std::endl;
+            }
+        }
+
+        ov::Core core;
+        std::vector<std::string> exeTargets;
+        exeTargets = compiled_Model.get_property(ov::execution_devices);
+        std::cout << "EXECUTION_DEVICES:" << std::endl;
+        for (const auto& device : exeTargets) {
+            std::cout << " " << device << ": " << core.get_property(device, ov::device::full_name) << std::endl;
+        }
+    }
+}
+
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 57728cd0dc..5342ac427c 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -104,6 +104,8 @@ size_t get_seq_len_axis(std::shared_ptr<const ov::Model> model);
 
 void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional<AdapterController> adapter_controller);
 
+void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const char* model_title);
+
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/visual_language/embedding_model.cpp b/src/cpp/src/visual_language/embedding_model.cpp
index 88ddfc39cd..307bdcebac 100644
--- a/src/cpp/src/visual_language/embedding_model.cpp
+++ b/src/cpp/src/visual_language/embedding_model.cpp
@@ -26,6 +26,7 @@ EmbeddingsModel::EmbeddingsModel(const std::filesystem::path& model_dir,
     merge_postprocess(m_model, scale_emb);
 
     ov::CompiledModel compiled_model = core.compile_model(m_model, device, properties);
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "text embeddings model");
     m_request = compiled_model.create_infer_request();
 }
 
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index dfdb1521ef..cf77dfce3c 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -259,9 +259,10 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
         const std::string& device,
         const ov::AnyMap device_config) :
         IInputsEmbedder(vlm_config, model_dir, device, device_config) {
-        m_resampler = utils::singleton_core().compile_model(
-            model_dir / "openvino_resampler_model.xml", device, device_config
-        ).create_infer_request();
+        auto compiled_model =
+            utils::singleton_core().compile_model(model_dir / "openvino_resampler_model.xml", device, device_config);
+        ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM resampler model");
+        m_resampler = compiled_model.create_infer_request();
 
         m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
     }
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index b8e89a8e04..1ce0cbf210 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -92,7 +92,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         auto compiled_language_model = utils::singleton_core().compile_model(
             models_dir / "openvino_language_model.xml", device, properties
         );
-
+        ov::genai::utils::print_compiled_model_properties(compiled_language_model, "VLM language model");
         auto language_model = compiled_language_model.get_runtime_model();
         m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(language_model);
 
diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp
index 0842524820..9f8f9b0498 100644
--- a/src/cpp/src/visual_language/vision_encoder.cpp
+++ b/src/cpp/src/visual_language/vision_encoder.cpp
@@ -648,10 +648,12 @@ ov::Tensor get_pixel_values_internvl(const ov::Tensor& image, const ProcessorCon
 
 VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config) :
     model_type(model_type) {
-        m_vision_encoder = utils::singleton_core().compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
-        m_processor_config = utils::from_config_json_if_exists<ProcessorConfig>(
-            model_dir, "preprocessor_config.json"
-        );
+    auto compiled_model = utils::singleton_core().compile_model(model_dir / "openvino_vision_embeddings_model.xml",
+                                                                device,
+                                                                device_config);
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM vision embeddings model");
+    m_vision_encoder = compiled_model.create_infer_request();
+    m_processor_config = utils::from_config_json_if_exists<ProcessorConfig>(model_dir, "preprocessor_config.json");
 }
 
 VisionEncoder::VisionEncoder(
diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp
index 5c31d85fec..d472a20238 100644
--- a/src/cpp/src/whisper_pipeline.cpp
+++ b/src/cpp/src/whisper_pipeline.cpp
@@ -56,15 +56,18 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi
         auto [core_properties, compile_properties] = ov::genai::utils::split_core_compile_config(properties);
         core.set_property(core_properties);
 
-        m_models.encoder =
-            core.compile_model((models_path / "openvino_encoder_model.xml").string(), device, compile_properties)
-                .create_infer_request();
-        m_models.decoder =
-            core.compile_model((models_path / "openvino_decoder_model.xml").string(), device, compile_properties)
-                .create_infer_request();
-        m_models.decoder_with_past =
-            core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, compile_properties)
-                .create_infer_request();
+        ov::CompiledModel compiled_model;
+        compiled_model =
+            core.compile_model((models_path / "openvino_encoder_model.xml").string(), device, compile_properties);
+        ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper encoder model");
+        m_models.encoder = compiled_model.create_infer_request();
+        compiled_model =
+            core.compile_model((models_path / "openvino_decoder_model.xml").string(), device, compile_properties);
+        ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder model");
+        m_models.decoder = compiled_model.create_infer_request();
+        compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, compile_properties);
+        m_models.decoder_with_past = compiled_model.create_infer_request();
+        ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model");
 
         // If eos_token_id was not provided, take value
         if (m_generation_config.eos_token_id == -1) {
diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp
index 9937082a81..136819fa01 100644
--- a/src/cpp/src/whisper_pipeline_static.cpp
+++ b/src/cpp/src/whisper_pipeline_static.cpp
@@ -555,9 +555,16 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys
     preprocess_decoder(decoder_model);
     preprocess_decoder(decoder_with_past_model);
 
-    m_models.encoder = core.compile_model(encoder_model, "NPU").create_infer_request();
-    m_models.decoder = core.compile_model(decoder_model, "NPU").create_infer_request();
-    m_models.decoder_with_past = core.compile_model(decoder_with_past_model, "NPU").create_infer_request();
+    ov::CompiledModel compiled_model;
+    compiled_model = core.compile_model(encoder_model, "NPU");
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper encoder model");
+    m_models.encoder = compiled_model.create_infer_request();
+    compiled_model = core.compile_model(decoder_model, "NPU");
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder model");
+    m_models.decoder = compiled_model.create_infer_request();
+    compiled_model = core.compile_model(decoder_with_past_model, "NPU");
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder with past model");
+    m_models.decoder_with_past = compiled_model.create_infer_request();
 
     // If eos_token_id was not provided, take value
     if (m_generation_config.eos_token_id == -1) {
diff --git a/src/docs/DEBUG_LOG.md b/src/docs/DEBUG_LOG.md
new file mode 100644
index 0000000000..5ed3f35d17
--- /dev/null
+++ b/src/docs/DEBUG_LOG.md
@@ -0,0 +1,43 @@
+## 1. Using Debug Log
+
+There are six levels of logs, which can be called explicitly or set via the ``OPENVINO_LOG_LEVEL`` environment variable:
+
+0 - ``ov::log::Level::NO``
+1 - ``ov::log::Level::ERR``
+2 - ``ov::log::Level::WARNING``
+3 - ``ov::log::Level::INFO``
+4 - ``ov::log::Level::DEBUG``
+5 - ``ov::log::Level::TRACE``
+
+When setting the environment variable OPENVINO_LOG_LEVEL > ov::log::Level::WARNING, the properties of the compiled model can be printed.
+
+For example:
+
+Linux - export OPENVINO_LOG_LEVEL=3
+Windows - set OPENVINO_LOG_LEVEL=3
+
+the properties of the compiled model are printed as follows:
+```sh
+    NETWORK_NAME: Model0
+    OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1
+    NUM_STREAMS: 1
+    INFERENCE_NUM_THREADS: 48
+    PERF_COUNT: NO
+    INFERENCE_PRECISION_HINT: bf16
+    PERFORMANCE_HINT: LATENCY
+    EXECUTION_MODE_HINT: PERFORMANCE
+    PERFORMANCE_HINT_NUM_REQUESTS: 0
+    ENABLE_CPU_PINNING: YES
+    SCHEDULING_CORE_TYPE: ANY_CORE
+    MODEL_DISTRIBUTION_POLICY:
+    ENABLE_HYPER_THREADING: NO
+    EXECUTION_DEVICES: CPU
+    CPU_DENORMALS_OPTIMIZATION: NO
+    LOG_LEVEL: LOG_NONE
+    CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1
+    DYNAMIC_QUANTIZATION_GROUP_SIZE: 32
+    KV_CACHE_PRECISION: f16
+    AFFINITY: CORE
+    EXECUTION_DEVICES:
+    CPU: Intel(R) Xeon(R) Platinum 8468
+```
\ No newline at end of file

From 7d2a303270ac2c6f34754edff5611a6e8c23c854 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova <ekaterina.aidova@intel.com>
Date: Wed, 18 Dec 2024 07:13:06 +0400
Subject: [PATCH 025/110] remove test models and fix order of checks (#1401)

      * added removing test models after llm bench tests passing to reduce
disk space
* fixed order of checks in wwb tests (csv file is result of successful
execution of cli command, test trying to open file and only after that
check cli command return code)
* reduces inference counts in llm bench tests (reduces execution time in
2 times, from 72 min to 36 min)
---
 .github/workflows/llm_bench-python.yml        | 22 ++++++++++++-------
 tools/llm_bench/task/image_generation.py      | 11 ++++++----
 .../who_what_benchmark/tests/test_cli_text.py |  9 +++-----
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index 6903882ca0..1999bafcfe 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -61,7 +61,6 @@ jobs:
       SRC_DIR: ${{ github.workspace }}
       LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
       WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
-      OPENVINO_LOG_LEVEL: 3
 
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -103,30 +102,34 @@ jobs:
       - name: Test native pytorch model on Linux
         run: |
           git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen
-          python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt
+          python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt -ic 20
+          rm -rf tiny-random-qwen
         env:
           GIT_LFS_SKIP_SMUDGE: 0
       - name: Test tiny-random-baichuan2 on Linux Optimum Intel
         run: |
           optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum -ic 10
+          rm -rf ./ov_models/tiny-random-baichuan2
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel
         run: |
           huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum -ic 4
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI
         run: |
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 -ic 4
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA
         run: |
           wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 -ic 4
+          rm -rf ./ov_models/lcm_dreamshaper_v7/
       - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
         run: |
           optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
           optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4 -ic 20
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5 -ic 20
+          rm -rf ov_models/TinyLlama-1.1B-Chat-v1.0
       - name: Test whisper-tiny on Linux
         run: |
           GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
@@ -138,11 +141,14 @@ jobs:
           optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny ./ov_models/whisper-tiny
           python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --optimum
           python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1
+          rm -rf ./ov_models/whisper-tiny
+          rm -rf multilingual_librispeech
       - name: Text InternVL2-1B on Linux
         run: |
           optimum-cli export openvino --model OpenGVLab/InternVL2-1B ./ov_models/internvl2-1B --task image-text-to-text --trust-remote-code
           python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20
           python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20 --optimum
+          rm -rf ./ov_models/internvl2-1B
       - name: WWB Tests
         run: |
           pip install git+https://github.com/huggingface/optimum-intel.git
diff --git a/tools/llm_bench/task/image_generation.py b/tools/llm_bench/task/image_generation.py
index b870c7ec98..7f43afe6e2 100644
--- a/tools/llm_bench/task/image_generation.py
+++ b/tools/llm_bench/task/image_generation.py
@@ -25,11 +25,14 @@
 stable_diffusion_hook = StableDiffusionHook()
 
 
-def collects_input_args(image_param, model_type, model_name, callback=None):
+def collects_input_args(image_param, model_type, model_name, infer_count=None, callback=None):
     input_args = {}
     input_args["width"] = image_param.get('width', DEFAULT_IMAGE_WIDTH)
     input_args["height"] = image_param.get('height', DEFAULT_IMAGE_HEIGHT)
-    input_args["num_inference_steps"] = image_param.get('steps', DEFAULT_INFERENCE_STEPS if 'lcm' not in model_name else LCM_DEFAULT_INFERENCE_STEPS)
+    if infer_count is None:
+        input_args["num_inference_steps"] = image_param.get('steps', DEFAULT_INFERENCE_STEPS if 'lcm' not in model_name else LCM_DEFAULT_INFERENCE_STEPS)
+    else:
+        input_args["num_inference_steps"] = infer_count
     guidance_scale = image_param.get('guidance_scale', None)
 
     if guidance_scale is not None:
@@ -57,7 +60,7 @@ def collects_input_args(image_param, model_type, model_name, callback=None):
 def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None):
     set_seed(args['seed'])
     input_text = image_param['prompt']
-    input_args = collects_input_args(image_param, args['model_type'], args['model_name'])
+    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["infer_count"])
     out_str = f"Input params: Batch_size={args['batch_size']}, " \
               f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}"
     if 'guidance_scale' in input_args:
@@ -120,7 +123,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list,
 def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None):
     set_seed(args['seed'])
     input_text = image_param['prompt']
-    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], callback)
+    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["infer_count"], callback)
     out_str = f"Input params: Batch_size={args['batch_size']}, " \
               f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}"
     if 'guidance_scale' in input_args:
diff --git a/tools/who_what_benchmark/tests/test_cli_text.py b/tools/who_what_benchmark/tests/test_cli_text.py
index 79335d46eb..9973cd357f 100644
--- a/tools/who_what_benchmark/tests/test_cli_text.py
+++ b/tools/who_what_benchmark/tests/test_cli_text.py
@@ -94,9 +94,8 @@ def test_text_gt_data():
                 "CPU",
             ]
         )
+        assert result.returncode == 0
         data = pd.read_csv(temp_file_name)
-
-    assert result.returncode == 0
     assert len(data["questions"].values) == 2
 
 
@@ -174,9 +173,8 @@ def test_text_language_autodetect():
                 "CPU",
             ]
         )
+        assert result.returncode == 0
         data = pd.read_csv(temp_file_name)
-
-    assert result.returncode == 0
     assert "马克" in data["prompts"].values[0]
 
 
@@ -196,9 +194,8 @@ def test_text_hf_model():
                 "--hf",
             ]
         )
+        assert result.returncode == 0
         data = pd.read_csv(temp_file_name)
-
-    assert result.returncode == 0
     assert len(data["prompts"].values) == 2
 
 
From 9bcadf7ffdcfe5b133605847d964759593949fac Mon Sep 17 00:00:00 2001
From: Irina Efode <irina.efode@intel.com>
Date: Wed, 18 Dec 2024 08:33:47 +0400
Subject: [PATCH 026/110] [Prompt lookup] (#1245)

*Description:*
* Implementation of Prompt lookup decoding based on continuous batching
pipeline (cb_promp_lookup_impl + prompt_lookup_impl)
* Update `prompt_lookup_sample` to use new API
* Update statistic to make of printing more usable

*Ticket:*
* https://jira.devtools.intel.com/browse/CVS-137987

*Example of usage:*
* **Input:** `return 0;`
* **Result Prompt lookup:**
```
===============================
Total duration, ms: 3.02267
Draft model duration, ms: 0.000724718
Main model duration, ms: 3.02195
Draft model duration, %: 0.0239761
Main model duration, %: 99.976
AVG acceptance rate, %: 10.8333
===============================
Request_id: 0 ||| 0 0 0 0 0 0 0 0 20 20 0 0 0 0 20 100 80 0 0 0 0 0 0 60 0 0 20 0 0 0 0 0 20 0 0 50
```
* **Result Greedy:**
```
===============================
Total duration, ms: 3.18111
Draft model duration, ms: 1.538e-06
Main model duration, ms: 3.18111
Draft model duration, %: 4.83479e-05
Main model duration, %: 100
AVG acceptance rate, %: -nan
===============================
```
* **Speedup**: 100 Generated tokens: 5.24% && 300 Generated tokens: 81%
(9.42 vs 5.19)

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .github/workflows/causal_lm_cpp.yml           |  31 +-
 .../prompt_lookup_decoding_lm/CMakeLists.txt  |  21 +-
 .../prompt_lookup_decoding_lm.cpp             | 357 ++----------------
 .../speculative_decoding_lm.cpp               |   1 -
 .../prompt_lookup_decoding_lm/README.md       |  41 ++
 .../prompt_lookup_decoding_lm.py              |  39 ++
 .../genai/continuous_batching_pipeline.hpp    |   4 +
 .../openvino/genai/generation_config.hpp      |  13 +-
 .../include/openvino/genai/llm_pipeline.hpp   |   7 +
 src/cpp/src/continuous_batching_impl.cpp      |   4 +-
 src/cpp/src/continuous_batching_impl.hpp      |   3 +-
 src/cpp/src/continuous_batching_pipeline.cpp  |  28 +-
 src/cpp/src/generation_config.cpp             |  11 +-
 .../continuous_batching_for_prompt_lookup.cpp |  85 +++++
 .../continuous_batching_for_prompt_lookup.hpp |  40 ++
 .../src/prompt_lookup/prompt_lookup_impl.cpp  | 159 ++++++++
 .../src/prompt_lookup/prompt_lookup_impl.hpp  |  49 +++
 ...batching_for_speculative_decoding_impl.cpp |   4 +-
 .../speculative_decoding_impl.cpp             |  23 +-
 .../speculative_decoding_metrics.cpp          |  59 +++
 .../speculative_decoding_metrics.hpp          |   7 +
 src/python/openvino_genai/__init__.py         |   2 +-
 .../openvino_genai/py_openvino_genai.pyi      |  13 +-
 src/python/py_generation_config.cpp           |   4 +-
 src/python/py_llm_pipeline.cpp                |   5 +-
 src/python/py_openvino_genai.cpp              |   1 -
 tests/cpp/CMakeLists.txt                      |   1 +
 27 files changed, 606 insertions(+), 406 deletions(-)
 create mode 100644 samples/python/prompt_lookup_decoding_lm/README.md
 create mode 100755 samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
 create mode 100644 src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp
 create mode 100644 src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp
 create mode 100644 src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp
 create mode 100644 src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 107777bf74..2e9d72e263 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -491,7 +491,6 @@ jobs:
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
-          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat --task text-generation-with-past
       - name: run and compare
         run: |
           source ./ov/setupvars.sh
@@ -505,36 +504,22 @@ jobs:
 
           ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt
           ./build/samples/cpp/text_generation/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_greedy.txt
+          python ./samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_py.txt
           python -c "
           with open('predictions_greedy.txt', 'r') as f:
               predicted_greedy = f.readline()
           with open('predictions_prompt_lookup.txt', 'r') as f:
               predicted_prompt_lookup = f.readline()
+          with open('predictions_py.txt', 'r') as f:
+              predicted_prompt_lookup_py = f.readline()
           assert predicted_greedy == predicted_prompt_lookup
+          assert predicted_greedy == predicted_prompt_lookup_py
+          assert predicted_prompt_lookup == predicted_prompt_lookup_py
           "
           echo "Prompt lookup" passed
-      - name: run and compare (model with seq_length_axis = 1)
-        run: |
-          source ./ov/setupvars.sh
-
-          echo 'Code:```python
-          def add(a, b):
-              return a + b
-          ```
-          Question: Can you please add 2 and 3
-          A:' > ./prompt.txt
-
-          ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./Qwen-7B-Chat/ "$(<prompt.txt)" > predictions_prompt_lookup.txt
-          ./build/samples/cpp/text_generation/greedy_causal_lm ./Qwen-7B-Chat/ "$(<prompt.txt)" > predictions_greedy.txt
-          python -c "
-          with open('predictions_greedy.txt', 'r') as f:
-              predicted_greedy = f.readline()
-          with open('predictions_prompt_lookup.txt', 'r') as f:
-              predicted_prompt_lookup = f.readline()
-          assert predicted_greedy == predicted_prompt_lookup
-          "
-          echo "Prompt lookup" passed
-
+        env:
+          PYTHONPATH: "./build/:$PYTHONPATH"
+          LD_LIBRARY_PATH: "./build/openvino_genai/:$LD_LIBRARY_PATH"
   cpp-Phi-1_5:
     runs-on: ubuntu-20.04-16-cores
     defaults:
diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt
index c899c6e47b..b0ce8b1b60 100644
--- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt
+++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt
@@ -1,8 +1,6 @@
 # Copyright (C) 2023-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)
-
 find_package(OpenVINOGenAI REQUIRED
     PATHS
         "${CMAKE_BINARY_DIR}"  # Reuse the package from the build.
@@ -10,21 +8,16 @@ find_package(OpenVINOGenAI REQUIRED
     NO_CMAKE_FIND_ROOT_PATH
 )
 
-add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp)
-target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime openvino::threading)
-set_target_properties(prompt_lookup_decoding_lm PROPERTIES
-    COMPILE_PDB_NAME prompt_lookup_decoding_lm
+set(TARGET_NAME prompt_lookup_decoding_lm)
+add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp)
+target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai)
+
+set_target_properties(${TARGET_NAME} PROPERTIES
+    COMPILE_PDB_NAME ${TARGET_NAME} 
     # Ensure out of box LC_RPATH on macOS with SIP
     INSTALL_RPATH_USE_LINK_PATH ON)
-target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17)
-
-get_target_property(genai_imported openvino::genai IMPORTED_LOCATION)
-set(OPENVINO_TOKENIZERS_PATH $<IF:$<BOOL:${genai_imported}>,${genai_imported},$<TARGET_FILE_DIR:openvino::genai>>)
-set(OPENVINO_TOKENIZERS_FILENAME "${CMAKE_SHARED_LIBRARY_PREFIX}openvino_tokenizers${CMAKE_SHARED_LIBRARY_SUFFIX}")
-target_compile_definitions(prompt_lookup_decoding_lm PRIVATE
-    OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}/${OPENVINO_TOKENIZERS_FILENAME}")
 
-install(TARGETS prompt_lookup_decoding_lm
+install(TARGETS ${TARGET_NAME} 
         RUNTIME DESTINATION samples_bin/
         COMPONENT samples_bin
         EXCLUDE_FROM_ALL)
diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
index 282220a4b1..e692110027 100644
--- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
+++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
@@ -1,338 +1,45 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#include <openvino/core/parallel.hpp>
 #include <openvino/openvino.hpp>
-#include <string_view>
 
-namespace {
-
-// only batch_size = 1 currently supported
-constexpr size_t BATCH_SIZE = 1;
-
-size_t get_seq_len_axis(std::shared_ptr<ov::Model> model) {
-    // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size],
-    // therefore usually seq_length_axis = 2
-    size_t seq_length_axis = 2;
-
-    // "ReadValue" node is KV cache representation in stateful model
-    std::string kv_node_type_name = std::string(ov::op::v6::ReadValue::get_type_info_static().name);
-
-    for (const auto op : model->get_ops()) {
-        if (op->get_type_name() != kv_node_type_name) {
-            continue;
-        }
-
-        // Shape example: [-1,4,0,64]
-        auto shape = op->get_input_partial_shape(0);
-
-        for (size_t i = 0; i < shape.rank().get_length(); i++) {
-            // Find axis = 0. This would be sequence length axis.
-            if (shape[i] == 0) {
-                seq_length_axis = i;
-            }
-        }
-        break;
-    }
-
-    return seq_length_axis;
-}
-
-std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest& tokenizer, std::string&& prompt) {
-    tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt});
-    tokenizer.infer();
-    return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")};
-}
-
-std::string detokenize(ov::InferRequest& detokenizer, std::vector<int64_t>& tokens) {
-    detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()});
-    detokenizer.infer();
-    return detokenizer.get_output_tensor().data<std::string>()[0];
-}
-
-// The following reasons require TextStreamer to keep a cache of previous tokens:
-// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a",
-// but detokenize(tokenize("prefix a")) == "prefix a"
-// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�"
-struct TextStreamer {
-    ov::InferRequest detokenizer;
-    std::vector<int64_t> token_cache;
-    size_t print_len = 0;
-
-    void put(int64_t token) {
-        token_cache.push_back(token);
-        std::string text = detokenize(detokenizer, token_cache);
-        if (!text.empty() && '\n' == text.back() && text.size() > print_len) {
-            // Flush the cache after the new line symbol
-            std::cout << std::string_view{text.data() + print_len, text.size() - print_len};
-            token_cache.clear();
-            print_len = 0;
-            return;
-        }
-        constexpr char replacement[] = "\xef\xbf\xbd";  // MSVC with /utf-8 fails to compile � directly with newline in string literal error.
-        if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) {
-            // Don't print incomplete text
-            return;
-        } else if (text.size() > print_len) {
-            // It is possible to have a shorter text after adding new token.
-            // Print to output only if text length is increaeseds.
-            std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
-            print_len = text.size();
-        }
-    }
-
-    void end() {
-        std::string text = detokenize(detokenizer, token_cache);
-        if (text.size() <= print_len)
-            return;
-        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n';
-        token_cache.clear();
-        print_len = 0;
-    }
-};
-
-ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) {
-    // Copy elements from the old to a new tensor and return it.
-    // Trim kv tensor on sequence length axis
-    // key/values tensor shape example: [BATCH_SIZE, num_kv_heads, seq_len, head_size]
-    // Sequence length axis position may vary from one model to another
-
-    auto shape = tensor.get_shape();
-
-    OPENVINO_ASSERT(seq_len_axis < shape.size(),
-                    "Sequence length axis: ",
-                    seq_len_axis,
-                    " should be less than shape size: ",
-                    shape.size());
-
-    size_t old_seq_len = shape[seq_len_axis];
-
-    OPENVINO_ASSERT(new_seq_len <= old_seq_len);
-
-    // if new_seq_len equal to old one no need to copy tensor, return as is
-    if (old_seq_len == new_seq_len)
-        return tensor;
-
-    shape[seq_len_axis] = new_seq_len;
-
-    if (seq_len_axis == 0) {
-        tensor.set_shape(shape);
-        return tensor;
-    }
-
-    ov::Coordinate new_shape_begin{0, 0, 0, 0};
-    ov::Coordinate new_shape_end{shape};
-
-    auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end);
-
-    return new_tensor;
-}
-
-void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) {
-    // trim kv_cache values up to the new_seq_len
-    auto states = request.query_state();
-    ov::parallel_for(states.size(), [&](size_t i) {
-        ov::Tensor old_tensor = states.at(i).get_state();
-        states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len));
-    });
-}
-
-class PromptLookupCandidateGenerator {
-private:
-    const size_t max_ngram_size = 3;
-    size_t num_pred_tokens = 5;
-    const size_t max_pred_tokens = 20;
-
-public:
-    PromptLookupCandidateGenerator(const size_t max_ngram_size, const size_t num_pred_tokens)
-        : max_ngram_size{max_ngram_size},
-          num_pred_tokens{num_pred_tokens} {};
-
-    std::vector<int64_t> generate_candidates(const std::vector<int64_t>& input_ids) {
-        const size_t input_length = input_ids.size();
-
-        for (int32_t ngram_size = max_ngram_size; ngram_size > 0; ngram_size--) {
-            // extract last ngram_size tokens as search ngram
-            std::vector<int64_t> ngram = std::vector<int64_t>{input_ids.cend() - ngram_size, input_ids.cend()};
-
-            // find ngram match in input_ids
-            size_t ngram_i = 0;
-            for (size_t input_i = 0; input_i < input_length - ngram_size; input_i++) {
-                if (ngram[ngram_i] != input_ids[input_i]) {
-                    ngram_i = 0;
-                    continue;
-                }
-
-                ngram_i++;
-
-                if (ngram_i < ngram_size) {
-                    continue;
-                }
-
-                // match found with the end at input_i
-                size_t avaliable_num_pred = std::min(input_length - (input_i + 1), num_pred_tokens);
-
-                // return candidates with length of avaliable_num_pred
-                return std::vector<int64_t>{input_ids.cbegin() + input_i + 1,
-                                            input_ids.cbegin() + input_i + 1 + avaliable_num_pred};
-            }
-        }
-
-        return std::vector<int64_t>{};
-    }
-
-    void update_candidate_strategy(const size_t num_matches) {
-        // dynamically adjust number of generated candidates based on number of matches
-        // we want to balance the benefits of getting assistant tokens correct with the
-        // cost of forecasting incorrect assistant tokens.
-        if (num_matches == num_pred_tokens) {
-            num_pred_tokens = std::min(num_pred_tokens + 2, max_pred_tokens);
-        } else {
-            num_pred_tokens = std::max(num_pred_tokens - 1, size_t(1));
-        }
-    }
-};
-
-int64_t get_eos_token(const std::shared_ptr<ov::Model> tokenizer) {
-    auto rt_info = tokenizer->get_rt_info();  // Get the runtime info for the model
-
-    auto it = rt_info.find("eos_token_id");
-    if (it == rt_info.end()) {
-        throw std::runtime_error("EOS token ID not found in model's runtime information.");
-    }
-    return it->second.as<int64_t>();
-}
-
-}  // namespace
+#include "openvino/genai/llm_pipeline.hpp"
 
 int main(int argc, char* argv[]) try {
-    if (argc != 3) {
+    if (3 != argc) {
         throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT>'");
     }
 
-    // tokenizer model
-    ov::Core core;
-    core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
-
-    const std::string model_dir = std::string{argv[1]};
-
-    auto tokenizer_model = core.read_model(model_dir + "/openvino_tokenizer.xml");
-    // tokenizer and detokenizer work on CPU only
-    ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request();
-    auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
-
-    std::vector<int64_t> full_input_ids{input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size()};
-
-    ov::InferRequest detokenizer =
-        core.compile_model(model_dir + "/openvino_detokenizer.xml", "CPU").create_infer_request();
-    TextStreamer text_streamer{std::move(detokenizer)};
-
-    std::shared_ptr<ov::Model> ov_model = core.read_model(model_dir + "/openvino_model.xml");
-
-    size_t seq_len_axis = get_seq_len_axis(ov_model);
-
-    ov::InferRequest model = core.compile_model(ov_model, "CPU").create_infer_request();
-
-    model.set_tensor("input_ids", input_ids);
-    model.set_tensor("attention_mask", attention_mask);
-
-    ov::Tensor position_ids = model.get_tensor("position_ids");
-    position_ids.set_shape(input_ids.get_shape());
-    std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
-    size_t seq_len = input_ids.get_shape()[1];
-
-    // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1
-    model.get_tensor("beam_idx").set_shape({BATCH_SIZE});
-    model.get_tensor("beam_idx").data<int32_t>()[0] = 0;
-
-    // To collect kv-cache for the <PROMPT> and to get the next token run the very first infer request
-    model.infer();
-
-    // logits shape is [BATCH_SIZE, seq_len, vocab_size]
-    auto logits = model.get_tensor("logits");
-    size_t vocab_size = logits.get_shape().back();
-    auto data_logits = logits.data<float>() + (seq_len - 1) * vocab_size;
-    int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits;
-
-    full_input_ids.push_back(out_token);
-
-    auto first_token = out_token;
-    text_streamer.put(out_token);
-
-    const int64_t EOS_TOKEN = get_eos_token(tokenizer_model);
-
-    // Prompt lookup decoding is a speculative decoding technique where the draft model replaced
-    // with string matching in the prompt to generate candidate token sequences.
-    int max_sequence_length = 100;
-    PromptLookupCandidateGenerator candidateGenerator{3, 5};
-
-    while (out_token != EOS_TOKEN && seq_len < max_sequence_length) {
-        auto candidates = candidateGenerator.generate_candidates(full_input_ids);
-
-        // cut redundant candidates on last iteration
-        size_t tokens_to_generate = max_sequence_length - seq_len;
-        candidates.resize(std::min(candidates.size(), tokens_to_generate - 1));
-        size_t candidates_size = candidates.size();
-
-        // candidates_size + 1 tokens will be fed at once in a single infer request.
-        input_ids.set_shape({BATCH_SIZE, candidates_size + 1});
-        input_ids.data<int64_t>()[0] = first_token;
-        std::copy_n(candidates.begin(), candidates_size, input_ids.data<int64_t>() + 1);
-
-        attention_mask.set_shape({BATCH_SIZE, seq_len + candidates_size + 1});
-        std::fill_n(attention_mask.data<int64_t>(), attention_mask.get_size(), 1);
-
-        position_ids.set_shape({BATCH_SIZE, candidates_size + 1});
-        std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), seq_len);
-
-        model.infer();
-
-        data_logits = logits.data<float>();  // [BATCH_SIZE, 1 + candidates_size, vocab_size]
-
-        // 1. accept current out token (if not eos)
-        // 2. check if it matches appropriate candidate
-        //      2.1 if it's match, continue - accept next token
-        //      2.2 it it's mismatch, stop iteration but still accept current token as it was last token generated by
-        //      model from a valid sequence.
-        size_t accepted_tokens_number = 0;
-        for (size_t i = 0; i < candidates_size + 1; i++) {
-            auto start = data_logits + vocab_size * i;
-            auto stop = data_logits + vocab_size * (i + 1);
-            out_token = std::max_element(start, stop) - start;
-
-            if (out_token == EOS_TOKEN) {
-                break;
-            }
-
-            text_streamer.put(out_token);
-            full_input_ids.push_back(out_token);
-            accepted_tokens_number++;
-
-            if (i == candidates_size || out_token != candidates[i]) {
-                break;
-            }
-        }
-
-        if (accepted_tokens_number > 0) {
-            candidateGenerator.update_candidate_strategy(accepted_tokens_number - 1);
-        }
-
-        // After the inference request, key/values have shape [BATCH_SIZE, seq_len + candidates_size, vocab_size].
-        // Increment the sequence length by the number of matched tokens, and
-        // trim the KV cache to match the new sequence length.
-        seq_len += accepted_tokens_number;
-        update_kv_cache(model, seq_len_axis, seq_len);
-
-        first_token = out_token;
-    }
-
-    text_streamer.end();
-    // Model is stateful which means that context (kv-cache) which belongs to a particular
-    // text sequence is accumulated inside the model during the generation loop above.
-    // This context should be reset before processing the next text sequence.
-    // While it is not required to reset context in this sample as only one sequence is processed,
-    // it is called for education purposes:
-    model.reset_state();
+    ov::genai::GenerationConfig config;
+    config.max_new_tokens = 100;
+    // Define candidates number for candidate generation
+    config.num_assistant_tokens = 5;
+    // Define max_ngram_size
+    config.max_ngram_size = 3;
+
+    std::string model_path = argv[1];
+    std::string prompt = argv[2];
+    
+    std::string device = "CPU";
+
+    ov::genai::SchedulerConfig scheduler_config;
+    scheduler_config.cache_size = 5;
+
+    ov::genai::LLMPipeline pipe(
+        model_path,
+        device,
+        ov::genai::prompt_lookup(true),
+        ov::genai::scheduler_config(scheduler_config));
+
+    auto streamer = [](std::string subword) {
+        std::cout << subword << std::flush;
+        return false;
+    };
+
+    // Since the streamer is set, the results will
+    // be printed each time a new token is generated.
+    pipe.generate(prompt, config, streamer);
+    std::cout << std::endl;
 } catch (const std::exception& error) {
     try {
         std::cerr << error.what() << '\n';
diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
index dc6761879c..487296566b 100644
--- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
+++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
@@ -29,7 +29,6 @@ int main(int argc, char* argv[]) try {
     ov::genai::SchedulerConfig scheduler_config;
     scheduler_config.cache_size = 5;
 
-    // Different devices require different block sizes, so different scheduler configs need to be set.
     ov::genai::LLMPipeline pipe(
         main_model_path,
         main_device,
diff --git a/samples/python/prompt_lookup_decoding_lm/README.md b/samples/python/prompt_lookup_decoding_lm/README.md
new file mode 100644
index 0000000000..1e5f4003d4
--- /dev/null
+++ b/samples/python/prompt_lookup_decoding_lm/README.md
@@ -0,0 +1,41 @@
+# prompt_lookup_decoding_lm Python sample that supports most popular models like LLaMA 3
+
+[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality.
+
+This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code.  Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
+
+## Download and convert the model and tokenizers
+
+The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
+
+It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported.
+
+```sh
+source <INSTALL_DIR>/setupvars.sh
+pip install --upgrade-strategy eager -r ../../requirements.txt
+optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
+```
+
+## Run
+
+Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample:
+
+`python prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"`
+
+
+Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU.
+
+See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
+
+### Troubleshooting
+
+#### Unicode characters encoding error on Windows
+
+Example error:
+```
+UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined>
+```
+
+If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this:
+1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot.
+2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`.
diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
new file mode 100755
index 0000000000..557897b6b1
--- /dev/null
+++ b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import openvino_genai
+
+def streamer(subword): 
+        print(subword, end='', flush=True) 
+        # Return flag corresponds whether generation should be stopped. 
+        # False means continue generation. 
+        return False
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('model_dir')
+    parser.add_argument('prompt')
+    args = parser.parse_args()
+
+    device = 'CPU'
+    scheduler_config = openvino_genai.SchedulerConfig()
+    # cache params
+    scheduler_config.cache_size = 2
+
+    pipe = openvino_genai.LLMPipeline(args.model_dir, device, scheduler_config=scheduler_config, prompt_lookup=True)
+    
+    config = openvino_genai.GenerationConfig()
+    config.max_new_tokens = 100
+    # add parameter to enable prompt lookup decoding to generate `num_assistant_tokens` candidates per iteration
+    config.num_assistant_tokens = 5
+    # Define max_ngram_size
+    config.max_ngram_size = 3
+
+    # Since the streamer is set, the results will be printed 
+    # every time a new token is generated and put into the streamer queue.
+    pipe.generate(args.prompt, config, streamer)
+
+if '__main__' == __name__:
+    main()
diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
index 4a0637f2d9..74466ee488 100644
--- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -55,10 +55,14 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
     class ImplInterface;
     class ContinuousBatchingImpl;
     class ContinuousBatchingForSpeculativeDecodingImpl;
+    class ContinuousBatchingForPromptLookupImpl;
     class SpeculativeDecodingImpl;
+    class PromptLookupImpl;
 
     friend class ContinuousBatchingForSpeculativeDecodingImpl;
+    friend class ContinuousBatchingForPromptLookupImpl;
     friend class SpeculativeDecodingImpl;
+    friend class PromptLookupImpl;
 
     std::shared_ptr<ImplInterface> m_impl;
 
diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index 9d79240aa8..b8b222e347 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -71,9 +71,10 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
  * @param frequency_penalty reduces absolute log prob as many times as the token was generated.
  * @param rng_seed initializes random generator.
  *
- * Speculative decoding parameters:
- * @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of static strategy candidates number update.
- * @param num_assistant_tokens the defined candidates number to be generated by draft model in case of dynamic strategy candidates number update.
+ * Assisting generation parameters:
+ * @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of dynamic strategy candidates number update.
+ * @param num_assistant_tokens the defined candidates number to be generated by draft model/prompt lookup in case of static strategy candidates number update.
+ * @param max_ngram_size is maximum ngram to use when looking for matches in the prompt.
  */
 
 class OPENVINO_GENAI_EXPORTS GenerationConfig {
@@ -114,9 +115,10 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     float frequency_penalty = 0.0f;
     size_t rng_seed = 0;
 
-    // Speculative decoding
+    // Assisting generation parameters
     float assistant_confidence_threshold = 0.f;
     size_t num_assistant_tokens = 0;
+    size_t max_ngram_size = 0;
 
     // EOS special token
     int64_t eos_token_id = -1;
@@ -132,7 +134,10 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     bool is_greedy_decoding() const;
     bool is_beam_search() const;
     bool is_multinomial() const;
+    OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2025.0.0 release")
     bool is_speculative_decoding() const;
+    bool is_assisting_generation() const;
+    bool is_prompt_lookup() const;
     void update_generation_config(const ov::AnyMap& config_map);
 
     template <typename... Properties>
diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp
index 44427d45b1..948baab6f4 100644
--- a/src/cpp/include/openvino/genai/llm_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -320,5 +320,12 @@ inline std::pair<std::string, Any> draft_model(
 */
 static constexpr ov::Property<SchedulerConfig> scheduler_config{"scheduler_config"};
 
+/**
+* @brief enable prompt_lookup property serves to activate prompt lookup decoding.
+* Set `true` to activate this mode.
+* And create LLMPipeline instance with this config.
+*/
+static constexpr ov::Property<bool> prompt_lookup{"prompt_lookup"};
+
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index bf0c979d39..6e7e982a4c 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -16,10 +16,12 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
     const SchedulerConfig& scheduler_config,
     const std::string& device,
     const ov::AnyMap& properties,
-    const ov::genai::GenerationConfig& generation_config
+    const ov::genai::GenerationConfig& generation_config,
+    bool is_validation_mode_enabled
     ) {
     m_tokenizer = tokenizer;
     m_generation_config = generation_config;
+    m_is_validation_mode_enabled = is_validation_mode_enabled;
     
     ov::Core core;
 
diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp
index 780bff6a31..8da05c6dfa 100644
--- a/src/cpp/src/continuous_batching_impl.hpp
+++ b/src/cpp/src/continuous_batching_impl.hpp
@@ -58,7 +58,8 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
                            const SchedulerConfig& scheduler_config,
                            const std::string& device,
                            const ov::AnyMap& properties,
-                           const ov::genai::GenerationConfig& generation_config);
+                           const ov::genai::GenerationConfig& generation_config,
+                           bool is_validation_mode_enabled = false);
 
     GenerationHandle add_request(uint64_t request_id,
                                  const ov::Tensor& input_ids,
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index 2faad4354e..148eb2fa9f 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -11,6 +11,7 @@
 #include "openvino/genai/tokenizer.hpp"
 #include "continuous_batching_impl.hpp"
 #include "speculative_decoding/speculative_decoding_impl.hpp"
+#include "prompt_lookup/prompt_lookup_impl.hpp"
 #include "timer.hpp"
 #include "utils.hpp"
 #include "debug_utils.hpp"
@@ -28,6 +29,15 @@ extract_draft_model_from_config(ov::AnyMap& config) {
     return draft_model;
 }
 
+inline bool
+extract_prompt_lookup_from_config(ov::AnyMap& config) {
+    bool res = false;
+    if (config.find(ov::genai::prompt_lookup.name()) != config.end()) {
+        res = config.at(ov::genai::prompt_lookup.name()).as<bool>();
+        config.erase(ov::genai::prompt_lookup.name());
+    }
+    return res;
+}
 
 ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::path& models_path,
                                                         const SchedulerConfig& scheduler_config,
@@ -36,12 +46,16 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
                                                         const ov::AnyMap& tokenizer_properties) {
     auto properties_without_draft_model = properties;
     auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
+    auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
     
     std::filesystem::path openvino_model_name = "openvino_model.xml";
     auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
     auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties);
     auto generation_config = utils::from_config_json_if_exists(models_path);
-    if (draft_model_desr.model == nullptr) {
+    if (is_prompt_lookup_enabled) {
+        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+        m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
+    } else if (draft_model_desr.model == nullptr) {
         m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     } else {
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
@@ -57,11 +71,15 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     const ov::AnyMap& properties) {
     auto properties_without_draft_model = properties;
     auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
+    auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
     std::filesystem::path openvino_model_name = "openvino_model.xml";
     auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
     auto generation_config = utils::from_config_json_if_exists(models_path);
 
-    if (draft_model_desr.model == nullptr) {
+    if (is_prompt_lookup_enabled) {
+        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+        m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
+    } else if (draft_model_desr.model == nullptr) {
         m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     } else {
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
@@ -79,9 +97,13 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     const ov::genai::GenerationConfig& generation_config) {
     auto properties_without_draft_model = properties;
     auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
+    auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
     auto model = utils::singleton_core().read_model(model_str, weights_tensor);
 
-    if (draft_model_desr.model == nullptr) {
+    if (is_prompt_lookup_enabled) {
+        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+        m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
+    } else if (draft_model_desr.model == nullptr) {
         m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     } else {
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 189cfeded7..35ae92d605 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -132,9 +132,17 @@ bool GenerationConfig::is_multinomial() const {
 }
 
 bool GenerationConfig::is_speculative_decoding() const {
+    return is_assisting_generation();
+}
+
+bool GenerationConfig::is_assisting_generation() const {
     return (assistant_confidence_threshold > 0 || num_assistant_tokens > 0);
 }
 
+bool GenerationConfig::is_prompt_lookup() const {
+    return (max_ngram_size > 0 && num_assistant_tokens > 0);
+}
+
 void GenerationConfig::validate() const {
     OPENVINO_ASSERT(eos_token_id == -1 || stop_token_ids.find(eos_token_id) != stop_token_ids.end(),
         "'stop_token_ids' must contain 'eos_token_id'. Please, call 'set_eos_token_id' with 'eos_token_id' value");
@@ -181,9 +189,10 @@ void GenerationConfig::validate() const {
         OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]");
         OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]");
     }
-    if (is_speculative_decoding()) {
+    if (is_assisting_generation()) {
         if (assistant_confidence_threshold != 0.f) {
             OPENVINO_ASSERT(num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`");
+            OPENVINO_ASSERT(!is_prompt_lookup(), "Parameters `assistant_confidence_threshold` cannot be used while Prompt Lookup decoding");
         } else {
             OPENVINO_ASSERT(num_assistant_tokens > 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`");
         };
diff --git a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp
new file mode 100644
index 0000000000..8c9e520728
--- /dev/null
+++ b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp
@@ -0,0 +1,85 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "continuous_batching_for_prompt_lookup.hpp"
+
+namespace ov::genai {
+
+std::map<uint64_t, ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::SequenceLen>
+ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::get_generated_request_len() {
+    std::map<uint64_t, ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::SequenceLen> result;
+    for (const auto& request : m_requests) {
+        const auto request_id = request->get_request_id();
+        auto validation_len = request->get_num_tokens_to_validate();
+        auto generated_len = request->get_num_processed_tokens() - request->get_prompt_len() + 1;
+        result.insert({ request_id, { generated_len, validation_len } });
+    }
+    return result;
+}
+
+TokenIds ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::generate_candidates(const TokenIds& input_ids, size_t num_pred_tokens, size_t max_ngram_size) {
+    if (num_pred_tokens == 0) {
+        return std::vector<int64_t>{};
+    }
+
+    const size_t input_length = input_ids.size();
+
+    for (int32_t ngram_size = max_ngram_size; ngram_size > 0; ngram_size--) {
+        // extract last ngram_size tokens as search ngram
+        std::vector<int64_t> ngram = std::vector<int64_t>{input_ids.cend() - ngram_size, input_ids.cend()};
+
+        // find ngram match in input_ids
+        size_t ngram_i = 0;
+        for (size_t input_i = 0; input_i < input_length - ngram_size; input_i++) {
+            if (ngram[ngram_i] != input_ids[input_i]) {
+                ngram_i = 0;
+                continue;
+            }
+
+            ngram_i++;
+
+            if (ngram_i < ngram_size) {
+                continue;
+            }
+
+            // match found with the end at input_i
+            size_t avaliable_num_pred = std::min(input_length - (input_i + 1), num_pred_tokens);
+
+            // return candidates with length of avaliable_num_pred
+            return std::vector<int64_t>{input_ids.cbegin() + input_i + 1,
+                                        input_ids.cbegin() + input_i + 1 + avaliable_num_pred};
+        }
+    }
+
+    return std::vector<int64_t>{};
+}
+
+void ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::generate_candidates() {
+    for (auto& request : m_requests) {
+        const auto prompt = request->get_prompt_ids();
+        size_t max_validation_len = 0;
+        for (auto& running_sequence : request->get_running_sequences()) {
+            const auto generated_tokens = running_sequence->get_generated_ids();
+            TokenIds full_input_ids = prompt;
+            full_input_ids.insert(full_input_ids.end(), generated_tokens.begin(), generated_tokens.end());
+
+            size_t min_num_assistant_tokens = 0;
+            const auto sampling_params = request->get_sampling_parameters();
+            {
+                const auto generated_len = running_sequence->get_generated_len();
+                const auto left_generated_len = std::min(sampling_params.max_new_tokens, sampling_params.max_length) - generated_len - 1;
+                min_num_assistant_tokens = std::min(sampling_params.num_assistant_tokens, left_generated_len);
+            }
+            TokenIds candidates = generate_candidates(full_input_ids, min_num_assistant_tokens, sampling_params.max_ngram_size);
+
+            if (!candidates.empty()) {
+                for (const auto& candidate : candidates) {
+                    running_sequence->append_token(candidate, 0);
+                }
+                max_validation_len = std::max(max_validation_len, candidates.size());
+            }
+        }
+        request->set_num_validated_tokens(max_validation_len);
+    }
+}
+}
\ No newline at end of file
diff --git a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp
new file mode 100644
index 0000000000..8962aba0f2
--- /dev/null
+++ b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp
@@ -0,0 +1,40 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "openvino/genai/continuous_batching_pipeline.hpp"
+
+#include "continuous_batching_impl.hpp"
+
+namespace ov::genai {
+class ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl : public ContinuousBatchingPipeline::ContinuousBatchingImpl {
+public:
+    ContinuousBatchingForPromptLookupImpl() = default;
+
+    ContinuousBatchingForPromptLookupImpl(
+        const std::shared_ptr<ov::Model>& model,
+        const Tokenizer& tokenizer,
+        const SchedulerConfig& scheduler_config,
+        const std::string& device,
+        const ov::AnyMap& properties,
+        const ov::genai::GenerationConfig& generation_config,
+        bool is_validation_mode_enabled = false) :
+    ContinuousBatchingImpl{ model,
+                            tokenizer,
+                            scheduler_config,
+                            device,
+                            properties,
+                            generation_config,
+                            true } {};
+                            
+    void generate_candidates();
+
+    // { generated_len, validation_len }
+    using SequenceLen = std::pair<uint64_t, uint64_t>;
+    std::map<uint64_t, SequenceLen> get_generated_request_len();
+
+protected:
+    TokenIds generate_candidates(const TokenIds& input_ids, size_t num_pred_tokens, size_t max_ngram_size);
+};
+}
\ No newline at end of file
diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp
new file mode 100644
index 0000000000..f934a56939
--- /dev/null
+++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp
@@ -0,0 +1,159 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "prompt_lookup_impl.hpp"
+#include "text_callback_streamer.hpp"
+
+namespace ov::genai {
+template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
+template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+
+GenerationHandle
+ContinuousBatchingPipeline::PromptLookupImpl::add_request(uint64_t request_id,
+                                                          const ov::Tensor& input_ids,
+                                                          ov::genai::GenerationConfig sampling_params) {
+    OPENVINO_ASSERT(sampling_params.is_prompt_lookup(), "`max_ngram_size` && `num_assistant_tokens` should be specified for `prompt lookup decoding`");
+    return m_pipeline->add_request(request_id, input_ids, sampling_params);
+};
+
+GenerationHandle
+ContinuousBatchingPipeline::PromptLookupImpl::add_request(uint64_t request_id,
+                                                          const std::string& prompt,
+                                                          ov::genai::GenerationConfig sampling_params) {
+    OPENVINO_ASSERT(sampling_params.is_prompt_lookup(), "`max_ngram_size` && `num_assistant_tokens` should be specified for `prompt lookup decoding`");
+    return m_pipeline->add_request(request_id, prompt, sampling_params);
+}
+
+bool ContinuousBatchingPipeline::PromptLookupImpl::has_non_finished_requests() {
+    return m_pipeline->has_non_finished_requests();
+}
+
+void ContinuousBatchingPipeline::PromptLookupImpl::step() {
+    ManualTimer candidates_timer("prompt_lookup_decoding: generate_candidates()");
+    candidates_timer.start();
+    m_pipeline->generate_candidates();
+    candidates_timer.end();
+    m_sd_metrics.draft_duration += candidates_timer.get_duration();
+    auto generated_len_before = m_pipeline->get_generated_request_len();
+
+    ManualTimer main_timer("prompt_lookup_decoding: step()");
+    main_timer.start();
+    m_pipeline->step();
+    main_timer.end();
+    m_sd_metrics.main_duration += main_timer.get_duration();
+    m_pipeline_metrics = m_pipeline->get_metrics();
+    auto generated_len_after = m_pipeline->get_generated_request_len();
+
+    for (const auto request : generated_len_before) {
+        auto request_id = request.first;
+        auto prev_validation_len = request.second.second;
+        if (prev_validation_len == 0) {
+            continue;
+        }
+        size_t num_matches = prev_validation_len;
+        float acceptance_rate = 1.f;
+        if (generated_len_after.count(request.first)) {
+            auto present_req_len = generated_len_after.at(request.first).first;
+            auto prev_full_req_len = request.second.first;
+
+            num_matches = (present_req_len - prev_full_req_len - 1);
+            acceptance_rate = static_cast<float>(num_matches) / static_cast<float>(prev_validation_len);
+        }        
+        m_sd_metrics.update_acceptance_rate(request_id, acceptance_rate * 100);
+        m_sd_metrics.update_draft_accepted_tokens(request_id, num_matches);
+    }
+
+    if (generated_len_after.empty() && 0) {
+        m_sd_metrics.print(true);
+        m_sd_metrics.clean_up();
+    }
+}
+
+std::vector<EncodedGenerationResult>
+ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector<ov::Tensor>& input_ids,
+                                                       const std::vector<GenerationConfig>& sampling_params,
+                                                       const StreamerVariant& streamer) {
+    ManualTimer generate_timer("speculative_decoding: generate()");
+    generate_timer.start();
+    OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request");
+    OPENVINO_ASSERT(input_ids.size() == sampling_params.size());
+    const std::shared_ptr<StreamerBase>& streamer_ptr = std::visit(overloaded{
+        [](std::monostate) -> std::shared_ptr<StreamerBase> {
+            return nullptr;
+        },
+        [](const std::shared_ptr<StreamerBase>& streamer) {
+            return streamer;
+        },
+        [this](const std::function<bool(std::string)>& streamer) -> std::shared_ptr<StreamerBase> {
+            return std::make_unique<TextCallbackStreamer>(m_tokenizer, streamer);
+        }
+    }, streamer);
+
+    OPENVINO_ASSERT(streamer_ptr == nullptr || input_ids.size() == 1 && (sampling_params[0].is_greedy_decoding() || sampling_params[0].is_multinomial()),
+        "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
+
+    std::vector<GenerationHandle> main_generations;
+    for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
+        OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");   
+        OPENVINO_ASSERT(sampling_params[request_id].is_prompt_lookup(), "`max_ngram_size` && `num_assistant_tokens` should be specified for `prompt lookup decoding`"); 
+        main_generations.push_back(m_pipeline->add_request(request_id, input_ids[request_id], sampling_params[request_id]));
+    }
+
+    std::vector<EncodedGenerationResult> results;
+    results.reserve(input_ids.size());
+
+    bool continue_generation = true;
+    while (has_non_finished_requests() && continue_generation) {
+        step();
+        if (streamer_ptr) {
+            // not generated tokens like several prompt phase
+            if (!main_generations.at(0).get()->can_read()) {
+                continue;
+            }
+            std::unordered_map<uint64_t, GenerationOutput> token = main_generations.at(0).get()->back();
+            OPENVINO_ASSERT(1 <= token.size());
+            OPENVINO_ASSERT(1 <= token.begin()->second.generated_ids.size());
+            for (const auto& gen_token : token.begin()->second.generated_ids) {
+                continue_generation = !streamer_ptr->put(gen_token);
+                if (!continue_generation) {
+                    break;
+                }
+            }
+        }
+    }
+    if (streamer_ptr) {
+        streamer_ptr->end();
+    }
+
+    for (size_t generation_idx = 0; generation_idx < main_generations.size(); ++generation_idx) {
+        const auto& generation = main_generations[generation_idx];
+        EncodedGenerationResult result;
+        result.m_request_id = 1;
+        std::vector<GenerationOutput> generation_outputs = generation->read_all();
+        std::sort(generation_outputs.begin(), generation_outputs.end(), [=] (GenerationOutput& r1, GenerationOutput& r2) {
+            return r1.score > r2.score;
+        });
+
+        auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size());
+        for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) {
+            const auto& generation_output = generation_outputs[generation_output_idx];
+            m_sd_metrics.set_generated_len(generation_idx, generation_outputs[generation_output_idx].generated_ids.size());
+            result.m_generation_ids.push_back(std::move(generation_output.generated_ids));
+            result.m_scores.push_back(generation_output.score);
+        }
+        result.m_status = generation->get_status();
+        results.push_back(std::move(result));
+    }
+
+    OPENVINO_ASSERT(results.size() == input_ids.size());
+    generate_timer.end();
+    m_sd_metrics.total_duration = generate_timer.get_duration();
+
+    return results;
+}
+
+SpeculativeDecodingMetrics
+ContinuousBatchingPipeline::PromptLookupImpl::get_metrics() {
+    return m_sd_metrics;
+};
+}
diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
new file mode 100644
index 0000000000..dae721741b
--- /dev/null
+++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
@@ -0,0 +1,49 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "openvino/genai/continuous_batching_pipeline.hpp"
+#include "continuous_batching_impl.hpp"
+#include "continuous_batching_for_prompt_lookup.hpp"
+#include "speculative_decoding/speculative_decoding_metrics.hpp"
+#include "utils.hpp"
+
+namespace ov::genai {
+
+class ContinuousBatchingPipeline::PromptLookupImpl : public ContinuousBatchingPipeline::ImplInterface {
+protected:
+    std::shared_ptr<ContinuousBatchingForPromptLookupImpl> m_pipeline;
+    SpeculativeDecodingMetrics m_sd_metrics;
+    
+public:
+    PromptLookupImpl(const std::shared_ptr<ov::Model>& model,
+                     const Tokenizer& tokenizer,
+                     const SchedulerConfig& scheduler_config,
+                     const std::string& device,
+                     const ov::AnyMap& properties,
+                     const ov::genai::GenerationConfig& generation_config) {
+        m_tokenizer = tokenizer;
+        m_pipeline = std::make_shared<ContinuousBatchingForPromptLookupImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
+    };
+
+    GenerationHandle add_request(uint64_t request_id,
+                                 const ov::Tensor& input_ids,
+                                 ov::genai::GenerationConfig sampling_params) override;
+    GenerationHandle add_request(uint64_t request_id,
+                                 const std::string& prompt,
+                                 ov::genai::GenerationConfig sampling_params) override;
+
+    bool has_non_finished_requests() override;
+
+    void step() override;
+
+    std::vector<EncodedGenerationResult>
+    generate(const std::vector<ov::Tensor>& input_ids,
+             const std::vector<GenerationConfig>& sampling_params,
+             const StreamerVariant& streamer) override;
+
+    SpeculativeDecodingMetrics get_metrics();
+};
+
+}
\ No newline at end of file
diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp
index 06a51b38be..36f274f30f 100644
--- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp
@@ -141,7 +141,7 @@ init_request(
     LogitProcessor& logit_processor,
     bool is_update_logit_processor,
     bool is_init_all_sequences_in_request = false) {
-    OPENVINO_ASSERT(request->get_sampling_parameters().is_speculative_decoding(),
+    OPENVINO_ASSERT(request->get_sampling_parameters().is_assisting_generation(),
                     "Speculative decoding should have initialized options `assistant_confidence_threshold` xor `num_assistant_tokens` in `GenerationConfig`.");
     if (candidates.begin()->second.token_ids.empty() && !is_init_all_sequences_in_request) {
         return 0;
@@ -303,7 +303,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::m
         to_generate = false;
         for (auto& request : m_requests) {
             const auto& sampling_params = request->get_sampling_parameters();
-            if (!sampling_params.is_speculative_decoding()) {
+            if (!sampling_params.is_assisting_generation()) {
                 // generate only one token in case of non speculative decoding
                 request->pause_generation(true);
             } else if (request->get_num_processed_tokens() >= request->get_prompt_len() &&
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
index e4f3b1ad1f..4a0748b5c0 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -182,6 +182,11 @@ void ContinuousBatchingPipeline::SpeculativeDecodingImpl::step() {
         m_sd_metrics.update_acceptance_rate(request_id, acceptance_rate * 100);
         m_sd_metrics.update_draft_accepted_tokens(request_id, (updated_seq_info.inserted_tokens_cnt - updated_seq_info.removed_tokens_cnt));
     }
+
+    if (main_generated_requests.empty() && 0) {
+        m_sd_metrics.print(true);
+        m_sd_metrics.clean_up();
+    }
 }
 
 std::vector<EncodedGenerationResult>
@@ -266,24 +271,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
 
     OPENVINO_ASSERT(results.size() == input_ids.size());
     generate_timer.end();
-    m_sd_metrics.total_duration = generate_timer.get_duration();
-
-    // Print Speculative decoding metrics
-    if (0) {
-        std::cout << std::endl;
-        std::cout << "Total duration, ms: " << m_sd_metrics.total_duration << std::endl;
-        std::cout << "Draft model duration, ms: " << m_sd_metrics.draft_duration << std::endl;
-        std::cout << "Main model duration, ms: " << m_sd_metrics.main_duration << std::endl;
-        std::cout << "Draft model duration, %: " << m_sd_metrics.get_draft_duration_percentage() << std::endl;
-        std::cout << "Main model duration, %: " << m_sd_metrics.get_main_duration_percentage() << std::endl;
-        std::cout << "Main model iterations: " << m_sd_metrics.get_iteration_number(0) << std::endl;
-        std::cout << "Token per sec: " << float(sampling_params[0].max_new_tokens) / m_sd_metrics.total_duration << std::endl;
-        std::cout << "AVG acceptance rate, %: " << m_sd_metrics.get_avg_acceptance_rate(0) << std::endl;
-        std::cout << "Accepted tokens by draft model: " << m_sd_metrics.get_draft_accepted_tokens_counter(0) << std::endl;
-        std::cout << "Generated tokens: " << sampling_params[0].max_new_tokens << std::endl;
-        std::cout << "Accepted token rate, %: " << m_sd_metrics.get_draft_accepted_tokens_percentage(0) << std::endl;
-    }
-
     return results;
 }
 
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp
index 42d3f0b750..4e5602482a 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp
@@ -95,4 +95,63 @@ void SpeculativeDecodingMetrics::set_generated_len(int64_t request_id, size_t ge
     m_generated_len.insert({ request_id, generated_len });
 }
 
+size_t SpeculativeDecodingMetrics::get_generated_len(int64_t request_id) {
+    return m_generated_len.at(request_id);
+}
+
+std::vector<int64_t> SpeculativeDecodingMetrics::get_requests_id() {
+    std::vector<int64_t> result;
+    for (const auto& req : m_generated_len) {
+        result.push_back(req.first);
+    }
+    return result;
+}
+
+void SpeculativeDecodingMetrics::print_acceptance_rates() {
+    for (const auto& a : m_acceptance_rate) {
+        std::cout << "Request_id: " << a.first << " ||| ";
+        for (const auto& b : a.second) {
+            std::cout << b << " ";
+        }
+        std::cout << std::endl;
+    }
+}
+
+void SpeculativeDecodingMetrics::print(bool is_printing_per_request) {
+    if (total_duration == 0) {
+        total_duration = draft_duration + main_duration;
+    }
+    std::cout << "\n=============================== " << std::endl;
+    std::cout << "Total duration, ms: " << total_duration << std::endl;
+    std::cout << "Draft model duration, ms: " << draft_duration << std::endl;
+    std::cout << "Main model duration, ms: " << main_duration << std::endl;
+    std::cout << "Draft model duration, %: " << get_draft_duration_percentage() << std::endl;
+    std::cout << "Main model duration, %: " << get_main_duration_percentage() << std::endl;
+    std::cout << "AVG acceptance rate, %: " << get_avg_acceptance_rate(-1) << std::endl;
+    std::cout << "=============================== " << std::endl;
+    if (is_printing_per_request) {
+        for (const auto& i : get_requests_id()) {
+            std::cout << "REQUEST_ID: " << i << std::endl;
+            std::cout << "Main model iterations: " << get_iteration_number(i) << std::endl;
+            std::cout << "Token per sec: " << float(get_generated_len(i)) / total_duration << std::endl;
+            std::cout << "AVG acceptance rate, %: " << get_avg_acceptance_rate(i) << std::endl;
+            std::cout << "Accepted tokens by draft model: " << get_draft_accepted_tokens_counter(i) << std::endl;
+            std::cout << "Generated tokens: " << get_generated_len(i) << std::endl;
+            std::cout << "Accepted token rate, %: " << get_draft_accepted_tokens_percentage(i) << std::endl;
+            std::cout << "=============================== " << std::endl;
+        }
+        print_acceptance_rates();
+    }
+
+}
+
+void SpeculativeDecodingMetrics::clean_up() {
+    m_acceptance_rate.clear();
+    m_draft_accepted_tokens.clear();
+    m_generated_len.clear();
+    draft_duration = 0;
+    main_duration = 0;
+    total_duration = 0;
+}
+
 }
\ No newline at end of file
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp
index 5256128277..0d9173b99f 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp
@@ -28,6 +28,7 @@ class SpeculativeDecodingMetrics {
     void update_draft_accepted_tokens(int64_t request_id, size_t num_matches);
 
     void set_generated_len(int64_t request_id, size_t generated_len);
+    size_t get_generated_len(int64_t request_id);
 
     size_t get_iteration_number(int64_t request_id);
 
@@ -35,5 +36,11 @@ class SpeculativeDecodingMetrics {
     float get_main_duration_percentage();
     float get_inference_duration_percentage();
 
+    std::vector<int64_t> get_requests_id();
+
+    void print_acceptance_rates();
+    void print(bool is_printing_per_request = false);
+
+    void clean_up();
 };
 }
\ No newline at end of file
diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py
index 470ddd0cd8..a0b0faf58c 100644
--- a/src/python/openvino_genai/__init__.py
+++ b/src/python/openvino_genai/__init__.py
@@ -28,7 +28,7 @@
 # LLM pipeline
 from .py_openvino_genai import (
     LLMPipeline, 
-    draft_model
+    draft_model,
 )
 
 # LoRA
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 6135a187eb..524ff0f921 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -575,6 +575,7 @@ class GenerationConfig:
     logprobs: int
     max_length: int
     max_new_tokens: int
+    max_ngram_size: int
     min_new_tokens: int
     no_repeat_ngram_size: int
     num_assistant_tokens: int
@@ -598,11 +599,13 @@ class GenerationConfig:
     @typing.overload
     def __init__(self, **kwargs) -> None:
         ...
+    def is_assisting_generation(self) -> bool:
+        ...
     def is_beam_search(self) -> bool:
         ...
     def is_greedy_decoding(self) -> bool:
         ...
-    def is_speculative_decoding(self) -> bool:
+    def is_prompt_lookup(self) -> bool:
         ...
     def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None:
         ...
@@ -2122,11 +2125,7 @@ class WhisperRawPerfMetrics:
     @property
     def features_extraction_durations(self) -> list[float]:
         ...
-class draft_model:
+def draft_model(models_path: os.PathLike, device: str = '', **kwargs) -> openvino._pyopenvino.OVAny:
     """
-    This class is used to enable Speculative Decoding
+    device on which inference will be performed
     """
-    def __init__(self, models_path: os.PathLike, device: str = '', **kwargs) -> None:
-        """
-        device on which inference will be performed
-        """
diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp
index d24a915dd6..b1a5c6cd2e 100644
--- a/src/python/py_generation_config.cpp
+++ b/src/python/py_generation_config.cpp
@@ -107,12 +107,14 @@ void init_generation_config(py::module_& m) {
         .def_readwrite("logprobs", &GenerationConfig::logprobs)
         .def_readwrite("assistant_confidence_threshold", &GenerationConfig::assistant_confidence_threshold)
         .def_readwrite("num_assistant_tokens", &GenerationConfig::num_assistant_tokens)
+        .def_readwrite("max_ngram_size", &GenerationConfig::max_ngram_size)
         .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output)
         .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids)
         .def_readwrite("adapters", &GenerationConfig::adapters)
         .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id"))
         .def("is_beam_search", &GenerationConfig::is_beam_search)
         .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding)
-        .def("is_speculative_decoding", &GenerationConfig::is_speculative_decoding)
+        .def("is_assisting_generation", &GenerationConfig::is_assisting_generation)
+        .def("is_prompt_lookup", &GenerationConfig::is_prompt_lookup)
         .def("update_generation_config", static_cast<void (GenerationConfig::*)(const ov::AnyMap&)>(&ov::genai::GenerationConfig::update_generation_config), py::arg("config_map"));
    }
diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp
index b53cc56f10..b1d5136253 100644
--- a/src/python/py_llm_pipeline.cpp
+++ b/src/python/py_llm_pipeline.cpp
@@ -195,15 +195,14 @@ void init_llm_pipeline(py::module_& m) {
         .def("get_generation_config", &LLMPipeline::get_generation_config, py::return_value_policy::copy)
         .def("set_generation_config", &LLMPipeline::set_generation_config, py::arg("config"));
 
-    py::class_<ov::Any>(m, "draft_model", py::module_local(), "This class is used to enable Speculative Decoding")
-        .def(py::init([](
+    m.def("draft_model", [](
             const std::filesystem::path& models_path,
             const std::string& device,
             const py::kwargs& kwargs
         ) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
             return draft_model(models_path, device, pyutils::kwargs_to_any_map(kwargs)).second;
-        }),
+        },
         py::arg("models_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files",
         py::arg("device") = "", "device on which inference will be performed");
 }
diff --git a/src/python/py_openvino_genai.cpp b/src/python/py_openvino_genai.cpp
index e821c1cfdc..429f48f30d 100644
--- a/src/python/py_openvino_genai.cpp
+++ b/src/python/py_openvino_genai.cpp
@@ -21,7 +21,6 @@ using ov::genai::DecodedResults;
 using ov::genai::EncodedResults;
 using ov::genai::StreamerBase;
 using ov::genai::StringInputs;
-using ov::genai::draft_model;
 
 void init_lora_adapter(py::module_& m);
 void init_perf_metrics(py::module_& m);
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index f404e63cff..093cd993de 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -19,6 +19,7 @@ file(GLOB src_files "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sequence_group.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/cache_eviction.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sampler.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/speculative_decoding/*.cpp"
+                    "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/prompt_lookup/*.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/utils/*.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/utils.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/continuous_batching*.cpp"

From 499096a22d73e4365695f0d45077cd163aa45a1e Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova <ekaterina.aidova@intel.com>
Date: Wed, 18 Dec 2024 20:27:04 +0400
Subject: [PATCH 027/110] [llm bench]: add infer latency for genai (#1397)

CVS-158466
port from 2024.6 to master
https://github.com/openvinotoolkit/openvino.genai/pull/1391
---
 tools/llm_bench/task/speech_to_text_generation.py  | 2 +-
 tools/llm_bench/task/text_generation.py            | 3 ++-
 tools/llm_bench/task/visual_language_generation.py | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/llm_bench/task/speech_to_text_generation.py b/tools/llm_bench/task/speech_to_text_generation.py
index f1e7ac54a0..15a47a8b6a 100644
--- a/tools/llm_bench/task/speech_to_text_generation.py
+++ b/tools/llm_bench/task/speech_to_text_generation.py
@@ -57,7 +57,7 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list):
             - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
         ).tolist()
         tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
-        tm_infer_list = None
+        tm_infer_list = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
         result_text = result_text.texts[0]
     else:
         start = time.perf_counter()
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 3f5b5ed301..485de94996 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -302,6 +302,7 @@ def token_printer():
     ).tolist()
 
     tm_list = np.array([first_token_time] + second_tokens_durations) / 1000
+    inference_durations = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
     log.debug('latency of all tokens:')
     [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
     iter_data = gen_output_data.gen_iterate_data(
@@ -323,7 +324,7 @@ def token_printer():
         num,
         iter_data,
         tm_list.tolist(),
-        None,
+        inference_durations.tolist(),
         warm_up=(num == 0),
         max_rss_mem=max_rss_mem_consumption,
         max_shared_mem=max_shared_mem_consumption,
diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py
index c4144366b4..068ae0cf60 100644
--- a/tools/llm_bench/task/visual_language_generation.py
+++ b/tools/llm_bench/task/visual_language_generation.py
@@ -268,11 +268,12 @@ def run_visual_language_generation_genai(
         mm_embeddings_preparation_time=perf_metrics.get_prepare_embeddings_duration().mean
     )
     iter_data_list.append(iter_data)
+    inference_durations = np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000
     metrics_print.print_metrics(
         num,
         iter_data,
         tm_list.tolist(),
-        None,
+        inference_durations.tolist(),
         warm_up=(num == 0),
         max_rss_mem=max_rss_mem_consumption,
         max_shared_mem=max_shared_mem_consumption,

From 1542c60f9a07de77eb3485b3589b309d3c5b5347 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov <kozzzloff@list.ru>
Date: Thu, 19 Dec 2024 10:39:36 +0300
Subject: [PATCH 028/110] Removed generator patching (#1408)

---
 tools/who_what_benchmark/whowhatbench/wwb.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 026a6cc69b..04813f5fd8 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -1,7 +1,3 @@
-from .utils import patch_diffusers
-
-patch_diffusers()
-
 import argparse
 import difflib
 import numpy as np

From 7a02d2bca6cf29dfe8fdcd796fca0d33ef275426 Mon Sep 17 00:00:00 2001
From: Anna Likholat <anna.likholat@intel.com>
Date: Thu, 19 Dec 2024 08:41:00 +0100
Subject: [PATCH 029/110] [ImageGeneration] EulerAncestralDiscreteScheduler
 (#1407)

![image](https://github.com/user-attachments/assets/6b688510-50d9-4f32-b80d-cb8cfa0b4b79)

CVS-156803
CVS-158965

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .../genai/image_generation/scheduler.hpp      |   3 +-
 .../schedulers/euler_ancestral_discrete.cpp   | 261 ++++++++++++++++++
 .../schedulers/euler_ancestral_discrete.hpp   |  61 ++++
 .../image_generation/schedulers/scheduler.cpp |   3 +
 .../src/image_generation/schedulers/types.cpp |   2 +
 src/docs/SUPPORTED_MODELS.md                  |   1 +
 .../openvino_genai/py_openvino_genai.pyi      |   5 +-
 src/python/py_image_generation_pipelines.cpp  |   3 +-
 tools/llm_bench/llm_bench_utils/ov_utils.py   |   2 +-
 9 files changed, 337 insertions(+), 4 deletions(-)
 create mode 100644 src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
 create mode 100644 src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp

diff --git a/src/cpp/include/openvino/genai/image_generation/scheduler.hpp b/src/cpp/include/openvino/genai/image_generation/scheduler.hpp
index 21c266aa50..25c5e07a2f 100644
--- a/src/cpp/include/openvino/genai/image_generation/scheduler.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/scheduler.hpp
@@ -19,7 +19,8 @@ class OPENVINO_GENAI_EXPORTS Scheduler {
         DDIM,
         EULER_DISCRETE,
         FLOW_MATCH_EULER_DISCRETE,
-        PNDM
+        PNDM,
+        EULER_ANCESTRAL_DISCRETE
     };
 
     static std::shared_ptr<Scheduler> from_config(const std::filesystem::path& scheduler_config_path,
diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
new file mode 100644
index 0000000000..a63a073cfc
--- /dev/null
+++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
@@ -0,0 +1,261 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cassert>
+#include <random>
+#include <fstream>
+#include <iterator>
+
+#include "image_generation/schedulers/euler_ancestral_discrete.hpp"
+#include "image_generation/numpy_utils.hpp"
+
+namespace ov {
+namespace genai {
+
+EulerAncestralDiscreteScheduler::Config::Config(const std::filesystem::path& scheduler_config_path) {
+    std::ifstream file(scheduler_config_path);
+    OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path);
+
+    nlohmann::json data = nlohmann::json::parse(file);
+    using utils::read_json_param;
+    
+    read_json_param(data, "num_train_timesteps", num_train_timesteps);
+    read_json_param(data, "beta_start", beta_start);
+    read_json_param(data, "beta_end", beta_end);
+    read_json_param(data, "beta_schedule", beta_schedule);
+    read_json_param(data, "trained_betas", trained_betas);
+    read_json_param(data, "steps_offset", steps_offset);
+    read_json_param(data, "prediction_type", prediction_type);
+    read_json_param(data, "timestep_spacing", timestep_spacing);
+    read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr);
+}
+
+EulerAncestralDiscreteScheduler::EulerAncestralDiscreteScheduler(const std::filesystem::path& scheduler_config_path) 
+    : EulerAncestralDiscreteScheduler(Config(scheduler_config_path)) {
+}
+
+EulerAncestralDiscreteScheduler::EulerAncestralDiscreteScheduler(const Config& scheduler_config): m_config(scheduler_config) {
+    std::vector<float> alphas, betas;
+
+    using numpy_utils::linspace;
+
+    if (!m_config.trained_betas.empty()) {
+        betas = m_config.trained_betas;
+    } else if (m_config.beta_schedule == BetaSchedule::LINEAR) {
+        betas = linspace<float>(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps);
+    } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) {
+        float start = std::sqrt(m_config.beta_start);
+        float end = std::sqrt(m_config.beta_end);
+        betas = linspace<float>(start, end, m_config.num_train_timesteps);
+        std::for_each(betas.begin(), betas.end(), [](float& x) {
+            x *= x;
+        });
+    // TODO: else if beta_schedule == "squaredcos_cap_v2"
+    } else {
+        OPENVINO_THROW(
+            "'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. Please, add support of other types");
+    }
+
+    if (m_config.rescale_betas_zero_snr) {
+        using numpy_utils::rescale_zero_terminal_snr;
+        rescale_zero_terminal_snr(betas);
+    }
+
+    std::transform(betas.begin(), betas.end(), std::back_inserter(alphas), [](float b) {
+        return 1.0f - b;
+    });
+
+    for (size_t i = 1; i <= alphas.size(); ++i) {
+        float alpha_cumprod =
+            std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies<float>{});
+        m_alphas_cumprod.push_back(alpha_cumprod);
+    }
+
+    if (m_config.rescale_betas_zero_snr) {
+        m_alphas_cumprod.back() = std::pow(2, -24);
+    }
+
+    for (auto it = m_alphas_cumprod.rbegin(); it != m_alphas_cumprod.rend(); ++it) {
+        float sigma = std::pow(((1 - (*it)) / (*it)), 0.5);
+        m_sigmas.push_back(sigma);
+    }
+    m_sigmas.push_back(0);
+
+    // setable values
+    auto linspaced =
+        linspace<float>(0.0f, static_cast<float>(m_config.num_train_timesteps - 1), m_config.num_train_timesteps, true);
+    for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) {
+        m_timesteps.push_back(static_cast<int64_t>(std::round(*it)));
+    }
+    m_num_inference_steps = -1;
+    m_step_index = -1;
+    m_begin_index = -1;
+    m_is_scale_input_called = false;
+}
+
+void EulerAncestralDiscreteScheduler::set_timesteps(size_t num_inference_steps, float strength) {
+    m_timesteps.clear();
+    m_sigmas.clear();
+    m_step_index = m_begin_index = -1;
+    m_num_inference_steps = num_inference_steps;
+    std::vector<float> sigmas;
+
+    switch (m_config.timestep_spacing) {
+    case TimestepSpacing::LINSPACE: {
+        using numpy_utils::linspace;
+        float end = static_cast<float>(m_config.num_train_timesteps - 1);
+        auto linspaced = linspace<float>(0.0f, end, num_inference_steps, true);
+        for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) {
+            m_timesteps.push_back(static_cast<int64_t>(std::round(*it)));
+        }
+        break;
+    }
+    case TimestepSpacing::LEADING: {
+        size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps;
+        for (size_t i = num_inference_steps - 1; i != -1; --i) {
+            m_timesteps.push_back(i * step_ratio + m_config.steps_offset);
+        }
+        break;
+    }
+    case TimestepSpacing::TRAILING: {
+        float step_ratio = static_cast<float>(m_config.num_train_timesteps) / static_cast<float>(m_num_inference_steps);
+        for (float i = m_config.num_train_timesteps; i > 0; i -= step_ratio) {
+            m_timesteps.push_back(static_cast<int64_t>(std::round(i)) - 1);
+        }
+        break;
+    }
+    default:
+        OPENVINO_THROW("Unsupported value for 'timestep_spacing'");
+    }
+
+    for (const float& i : m_alphas_cumprod) {
+        float sigma = std::pow(((1 - i) / i), 0.5);
+        sigmas.push_back(sigma);
+    }
+
+    using numpy_utils::interp;
+    std::vector<size_t> x_data_points(sigmas.size());
+    std::iota(x_data_points.begin(), x_data_points.end(), 0);
+    m_sigmas = interp(m_timesteps, x_data_points, sigmas);
+    m_sigmas.push_back(0.0f);
+
+    // apply 'strength' used in image generation
+    // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L650
+    {
+        size_t init_timestep = std::min<size_t>(num_inference_steps * strength, num_inference_steps);
+        size_t t_start = std::max<size_t>(num_inference_steps - init_timestep, 0);
+        // keep original timesteps
+        m_schedule_timesteps = m_timesteps;
+        // while return patched ones by 'strength' parameter
+        m_timesteps = std::vector<int64_t>(m_timesteps.begin() + t_start, m_timesteps.end());
+        m_begin_index = t_start;
+    }
+}
+
+std::map<std::string, ov::Tensor> EulerAncestralDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) {
+    // noise_pred - model_output
+    // latents - sample
+    // inference_step
+
+    size_t timestep = m_timesteps[inference_step];
+
+    if (m_step_index == -1)
+        m_step_index = m_begin_index;
+
+    float sigma = m_sigmas[m_step_index];
+
+    float* model_output_data = noise_pred.data<float>();
+    float* sample_data = latents.data<float>();
+
+    ov::Tensor pred_original_sample(noise_pred.get_element_type(), noise_pred.get_shape());
+    float* pred_original_sample_data = pred_original_sample.data<float>();
+
+    switch (m_config.prediction_type) {
+    case PredictionType::EPSILON:
+        for (size_t i = 0; i < noise_pred.get_size(); ++i) {
+            pred_original_sample_data[i] = sample_data[i] - sigma * model_output_data[i];
+        }
+        break;
+    case PredictionType::V_PREDICTION:
+        for (size_t i = 0; i < noise_pred.get_size(); ++i) {
+            pred_original_sample_data[i] = model_output_data[i] * (-sigma / std::pow((std::pow(sigma, 2) + 1), 0.5)) +
+                                           (sample_data[i] / (std::pow(sigma, 2) + 1));
+        }
+        break;
+    default:
+        OPENVINO_THROW("Unsupported value for 'PredictionType': must be one of `epsilon`, or `v_prediction`");
+    }
+
+    float sigma_from = m_sigmas[m_step_index];
+    float sigma_to = m_sigmas[m_step_index + 1];
+    float sigma_up = std::sqrt(std::pow(sigma_to, 2) * (std::pow(sigma_from, 2) - std::pow(sigma_to, 2)) / std::pow(sigma_from, 2));
+    float sigma_down = std::sqrt(std::pow(sigma_to, 2) - std::pow(sigma_up, 2));
+    float dt = sigma_down - sigma;
+
+    ov::Tensor prev_sample = ov::Tensor(latents.get_element_type(), latents.get_shape());
+    float* prev_sample_data = prev_sample.data<float>();
+
+    ov::Tensor noise = generator->randn_tensor(noise_pred.get_shape());
+    const float* noise_data = noise.data<float>();
+
+    for (size_t i = 0; i < prev_sample.get_size(); ++i) {
+        float derivative = (sample_data[i] - pred_original_sample_data[i]) / sigma;
+        prev_sample_data[i] = (sample_data[i] + derivative * dt) + noise_data[i] * sigma_up;
+    }
+
+    m_step_index++;
+
+    return {{"latent", prev_sample}, {"denoised", pred_original_sample}};
+}
+
+size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const{
+    for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) {
+        if (timestep == m_schedule_timesteps[i]) {
+            return i;
+        }
+    }
+
+    OPENVINO_THROW("Failed to find index for timestep ", timestep);
+}
+
+void EulerAncestralDiscreteScheduler::add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const {
+    size_t index_for_timestep = _index_for_timestep(latent_timestep);
+    const float sigma = m_sigmas[index_for_timestep];
+
+    float * init_latent_data = init_latent.data<float>();
+    const float * noise_data = noise.data<float>();
+
+    for (size_t i = 0; i < init_latent.get_size(); ++i) {
+        init_latent_data[i] = init_latent_data[i] + sigma * noise_data[i];
+    }
+}
+
+std::vector<int64_t> EulerAncestralDiscreteScheduler::get_timesteps() const {
+    return m_timesteps;
+}
+
+void EulerAncestralDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) {
+    if (m_step_index == -1)
+        m_step_index = m_begin_index;
+
+    float sigma = m_sigmas[m_step_index];
+    float* sample_data = sample.data<float>();
+    for (size_t i = 0; i < sample.get_size(); i++) {
+        sample_data[i] /= std::pow((std::pow(sigma, 2) + 1), 0.5);
+    }
+    m_is_scale_input_called = true;
+}
+
+float EulerAncestralDiscreteScheduler::get_init_noise_sigma() const {
+    float max_sigma = *std::max_element(m_sigmas.begin(), m_sigmas.end());
+
+    if (m_config.timestep_spacing == TimestepSpacing::LINSPACE ||
+        m_config.timestep_spacing == TimestepSpacing::TRAILING) {
+        return max_sigma;
+    }
+
+    return std::sqrt(std::pow(max_sigma, 2) + 1);
+}
+
+} // namespace genai
+} // namespace ov
diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp
new file mode 100644
index 0000000000..9d82c9a0a9
--- /dev/null
+++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp
@@ -0,0 +1,61 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <filesystem>
+#include <list>
+#include <string>
+
+#include "image_generation/schedulers/types.hpp"
+#include "image_generation/schedulers/ischeduler.hpp"
+
+namespace ov {
+namespace genai {
+
+class EulerAncestralDiscreteScheduler : public IScheduler {
+public:
+    struct Config {
+        int32_t num_train_timesteps = 1000;
+        float beta_start = 0.0001f, beta_end = 0.02f;
+        BetaSchedule beta_schedule = BetaSchedule::LINEAR;
+        std::vector<float> trained_betas = {};
+        size_t steps_offset = 0;
+        PredictionType prediction_type = PredictionType::EPSILON;
+        TimestepSpacing timestep_spacing = TimestepSpacing::LEADING;
+        bool rescale_betas_zero_snr = false;
+
+        Config() = default;
+        explicit Config(const std::filesystem::path& scheduler_config_path);
+    };
+
+    explicit EulerAncestralDiscreteScheduler(const std::filesystem::path& scheduler_config_path);
+    explicit EulerAncestralDiscreteScheduler(const Config& scheduler_config);
+
+    void set_timesteps(size_t num_inference_steps, float strength) override;
+
+    std::vector<std::int64_t> get_timesteps() const override;
+
+    float get_init_noise_sigma() const override;
+
+    void scale_model_input(ov::Tensor sample, size_t inference_step) override;
+
+    std::map<std::string, ov::Tensor> step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) override;
+
+    void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override;
+
+private:
+    Config m_config;
+
+    std::vector<float> m_alphas_cumprod, m_sigmas;
+    std::vector<int64_t> m_timesteps, m_schedule_timesteps;
+    size_t m_num_inference_steps;
+
+    int m_step_index, m_begin_index;
+    bool m_is_scale_input_called;
+
+    size_t _index_for_timestep(int64_t timestep) const;
+};
+
+} // namespace genai
+} // namespace ov
diff --git a/src/cpp/src/image_generation/schedulers/scheduler.cpp b/src/cpp/src/image_generation/schedulers/scheduler.cpp
index f9cd098346..868f6f05cf 100644
--- a/src/cpp/src/image_generation/schedulers/scheduler.cpp
+++ b/src/cpp/src/image_generation/schedulers/scheduler.cpp
@@ -11,6 +11,7 @@
 #include "image_generation/schedulers/euler_discrete.hpp"
 #include "image_generation/schedulers/flow_match_euler_discrete.hpp"
 #include "image_generation/schedulers/pndm.hpp"
+#include "image_generation/schedulers/euler_ancestral_discrete.hpp"
 
 namespace ov {
 namespace genai {
@@ -41,6 +42,8 @@ std::shared_ptr<Scheduler> Scheduler::from_config(const std::filesystem::path& s
         scheduler = std::make_shared<FlowMatchEulerDiscreteScheduler>(scheduler_config_path);
     } else if (scheduler_type == Scheduler::Type::PNDM) {
         scheduler = std::make_shared<PNDMScheduler>(scheduler_config_path);
+    } else if (scheduler_type == Scheduler::Type::EULER_ANCESTRAL_DISCRETE) {
+        scheduler = std::make_shared<EulerAncestralDiscreteScheduler>(scheduler_config_path);
     } else {
         OPENVINO_THROW("Unsupported scheduler type '", scheduler_type, ". Please, manually create scheduler via supported one");
     }
diff --git a/src/cpp/src/image_generation/schedulers/types.cpp b/src/cpp/src/image_generation/schedulers/types.cpp
index 2f7c6d3f25..5a9e5b6865 100644
--- a/src/cpp/src/image_generation/schedulers/types.cpp
+++ b/src/cpp/src/image_generation/schedulers/types.cpp
@@ -57,6 +57,8 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Schedu
             param = Scheduler::FLOW_MATCH_EULER_DISCRETE;
         else if (scheduler_type_str == "PNDMScheduler")
             param = Scheduler::PNDM;
+        else if (scheduler_type_str == "EulerAncestralDiscreteScheduler")
+            param = Scheduler::EULER_ANCESTRAL_DISCRETE;
         else if (!scheduler_type_str.empty()) {
             OPENVINO_THROW("Unsupported value for 'scheduler' ", scheduler_type_str);
         }
diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md
index 8c922ee644..9762874596 100644
--- a/src/docs/SUPPORTED_MODELS.md
+++ b/src/docs/SUPPORTED_MODELS.md
@@ -217,6 +217,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
         <ul>
           <li><a href="https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9"><code>stabilityai/stable-diffusion-xl-base-0.9</code></a></li>
           <li><a href="https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0"><code>stabilityai/stable-diffusion-xl-base-1.0</code></a></li>
+          <li><a href="https://huggingface.co/stabilityai/sdxl-turbo"><code>stabilityai/sdxl-turbo</code></a></li>
         </ul>
       </td>
     </tr>
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 524ff0f921..bfcb869157 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -1343,15 +1343,18 @@ class Scheduler:
           FLOW_MATCH_EULER_DISCRETE
         
           PNDM
+        
+          EULER_ANCESTRAL_DISCRETE
         """
         AUTO: typing.ClassVar[Scheduler.Type]  # value = <Type.AUTO: 0>
         DDIM: typing.ClassVar[Scheduler.Type]  # value = <Type.DDIM: 3>
+        EULER_ANCESTRAL_DISCRETE: typing.ClassVar[Scheduler.Type]  # value = <Type.EULER_ANCESTRAL_DISCRETE: 7>
         EULER_DISCRETE: typing.ClassVar[Scheduler.Type]  # value = <Type.EULER_DISCRETE: 4>
         FLOW_MATCH_EULER_DISCRETE: typing.ClassVar[Scheduler.Type]  # value = <Type.FLOW_MATCH_EULER_DISCRETE: 5>
         LCM: typing.ClassVar[Scheduler.Type]  # value = <Type.LCM: 1>
         LMS_DISCRETE: typing.ClassVar[Scheduler.Type]  # value = <Type.LMS_DISCRETE: 2>
         PNDM: typing.ClassVar[Scheduler.Type]  # value = <Type.PNDM: 6>
-        __members__: typing.ClassVar[dict[str, Scheduler.Type]]  # value = {'AUTO': <Type.AUTO: 0>, 'LCM': <Type.LCM: 1>, 'LMS_DISCRETE': <Type.LMS_DISCRETE: 2>, 'DDIM': <Type.DDIM: 3>, 'EULER_DISCRETE': <Type.EULER_DISCRETE: 4>, 'FLOW_MATCH_EULER_DISCRETE': <Type.FLOW_MATCH_EULER_DISCRETE: 5>, 'PNDM': <Type.PNDM: 6>}
+        __members__: typing.ClassVar[dict[str, Scheduler.Type]]  # value = {'AUTO': <Type.AUTO: 0>, 'LCM': <Type.LCM: 1>, 'LMS_DISCRETE': <Type.LMS_DISCRETE: 2>, 'DDIM': <Type.DDIM: 3>, 'EULER_DISCRETE': <Type.EULER_DISCRETE: 4>, 'FLOW_MATCH_EULER_DISCRETE': <Type.FLOW_MATCH_EULER_DISCRETE: 5>, 'PNDM': <Type.PNDM: 6>, 'EULER_ANCESTRAL_DISCRETE': <Type.EULER_ANCESTRAL_DISCRETE: 7>}
         def __eq__(self, other: typing.Any) -> bool:
             ...
         def __getstate__(self) -> int:
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index f5347c279d..311f3f3760 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -198,7 +198,8 @@ void init_image_generation_pipelines(py::module_& m) {
         .value("DDIM", ov::genai::Scheduler::Type::DDIM)
         .value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE)
         .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE)
-        .value("PNDM", ov::genai::Scheduler::Type::PNDM);
+        .value("PNDM", ov::genai::Scheduler::Type::PNDM)
+        .value("EULER_ANCESTRAL_DISCRETE", ov::genai::Scheduler::Type::EULER_ANCESTRAL_DISCRETE);
     image_generation_scheduler.def_static("from_config",
         &ov::genai::Scheduler::from_config,
         py::arg("scheduler_config_path"),
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index c3df84925b..316c9d0b89 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -421,7 +421,7 @@ def get_vae_decoder_step_count(self):
 
     scheduler_type = data.get("scheduler", ["", ""])[1]
     if (scheduler_type not in ["LCMScheduler", "DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler", "EulerDiscreteScheduler",
-                               "FlowMatchEulerDiscreteScheduler"]):
+                               "FlowMatchEulerDiscreteScheduler", "EulerAncestralDiscreteScheduler"]):
         scheduler = openvino_genai.Scheduler.from_config(model_path / "scheduler/scheduler_config.json", openvino_genai.Scheduler.Type.DDIM)
         log.warning(f'Type of scheduler {scheduler_type} is unsupported. Please, be aware that it will be replaced to DDIMScheduler')
 

From 17f4eb32a1586aec8f42183e0667348b2cbd2fef Mon Sep 17 00:00:00 2001
From: Sofya Balandina <sofya.balandina@intel.com>
Date: Thu, 19 Dec 2024 07:45:17 +0000
Subject: [PATCH 030/110] fill prompt for sampler analysis with real tokens in
 VLM pipeline (#1247)

+ add missed token, if prev generation was finished because max length
was reached
---
 src/cpp/src/utils.cpp                         |  8 +++
 src/cpp/src/utils.hpp                         |  2 +
 .../src/visual_language/inputs_embedder.cpp   | 56 ++++++++++++-------
 .../src/visual_language/inputs_embedder.hpp   |  8 ++-
 src/cpp/src/visual_language/pipeline.cpp      | 12 ++--
 5 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index 9fa14b7f9f..be9fc972dc 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -381,6 +381,14 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t se
     }
 }
 
+ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, int64_t add_to_front) {
+    ov::Tensor new_tensor = ov::Tensor{ov::element::i64, {base_tensor.get_shape().at(0), base_tensor.get_shape().at(1) + 1}};
+    auto new_tensor_data = new_tensor.data<int64_t>();
+    new_tensor_data[0] = add_to_front;
+    std::copy_n(base_tensor.data<int64_t>(), base_tensor.get_size(), new_tensor_data + 1);
+    return new_tensor;
+}
+
 void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const char* model_title) {
     // Specify the name of the environment variable
     const char* env_var_name = "OPENVINO_LOG_LEVEL";
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 5342ac427c..96191387cd 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -104,6 +104,8 @@ size_t get_seq_len_axis(std::shared_ptr<const ov::Model> model);
 
 void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional<AdapterController> adapter_controller);
 
+ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, int64_t add_to_front);
+
 void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const char* model_title);
 
 }  // namespace utils
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index cf77dfce3c..8175d44b16 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -10,6 +10,7 @@
 
 #include "utils.hpp"
 
+
 namespace {
 
 constexpr size_t BATCH_SIZE = 1;
@@ -40,10 +41,12 @@ class InputsEmbedder::IInputsEmbedder {
     // Templated chat history
     std::string m_templated_chat_history;
     // Tokenized chat history
-    std::vector<int64_t> m_tokenized_chat_history;
+    std::vector<int64_t> m_tokenized_history;
     // The number of elements, which need to remove from the end of KV cache
     // removed elements will be added to inputs_ids
     size_t m_to_remove_from_hist = 0;
+    // Tail of previous output for LM in chat mode is missing in KV cache.
+    std::optional<int64_t> m_last_disappeared_token = std::nullopt;
 
 public:
     virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
@@ -56,26 +59,30 @@ class InputsEmbedder::IInputsEmbedder {
         return m_tokenizer;
     }
 
-    std::vector<int64_t> get_tokenized_chat_history() const {
-        return m_tokenized_chat_history;
+    std::vector<int64_t> get_tokenized_history() const {
+        return m_tokenized_history;
     }
 
     size_t get_amount_to_remove_from_hist() const {
         return m_to_remove_from_hist;
     }
 
-    void update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
-        std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_chat_history));
+    void update_tokenized_history(std::vector<int64_t> encoded_result, bool token_will_disappear) {
+        std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history));
         m_to_remove_from_hist = 0;
+        if (token_will_disappear)
+            m_last_disappeared_token = encoded_result.back();
+        else
+            m_last_disappeared_token = std::nullopt;
     }
 
     virtual void start_chat(const std::string& system_message) {
         m_is_chat_conversation = true;
         m_to_remove_from_hist = 0;
-        if (!m_tokenized_chat_history.empty()) {
+        if (!m_tokenized_history.empty()) {
             m_history.clear();
             m_templated_chat_history.clear();
-            m_tokenized_chat_history.clear();
+            m_tokenized_history.clear();
         }
         if (system_message.empty()) {
             return;
@@ -98,7 +105,7 @@ class InputsEmbedder::IInputsEmbedder {
 
         m_history.clear();
         m_templated_chat_history.clear();
-        m_tokenized_chat_history.clear();
+        m_tokenized_history.clear();
     }
 
 protected:
@@ -165,37 +172,46 @@ class InputsEmbedder::IInputsEmbedder {
             // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
             // so let's check it out, find the trusted part and use it in on the next step
             size_t last_same_hist_token = 0;
-            if (!m_tokenized_chat_history.empty()) {
+            if (!m_tokenized_history.empty()) {
                 std::set<int64_t> stop_tokens = {m_tokenizer.get_eos_token_id()};
-                last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
+                last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens);
             }
 
-            if (m_tokenized_chat_history.empty()) {
+            if (m_tokenized_history.empty()) {
                 encoded_input_ids = new_chat_tokens;
             } else if (last_same_hist_token != SIZE_MAX) {
-                m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;
+                m_to_remove_from_hist = m_tokenized_history.size() - last_same_hist_token;
+                // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it
+                m_to_remove_from_hist -= m_last_disappeared_token.has_value() ? 1 : 0;
 
                 ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(),
                                                    {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token},
                                                    new_chat_tokens.data<int64_t>() + last_same_hist_token);
-                encoded_input_ids = new_tensor;
+                encoded_input_ids = ov::Tensor(new_chat_tokens.get_element_type(),
+                                                    {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token});
+                new_tensor.copy_to(encoded_input_ids);
             } else {
                 encoded_input_ids = utils::subtract_chat_tokenized_inputs(
                     {new_chat_tokens}, prev_chat_tokens
                 ).input_ids;
+
+                if (m_last_disappeared_token.has_value())
+                    encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token);
             }
             auto end_tokenizer_time = std::chrono::steady_clock::now();
             metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
             m_templated_chat_history = std::move(new_templated_chat_history);
-            m_tokenized_chat_history.clear();
-            std::copy(new_chat_tokens.data<int64_t>(), new_chat_tokens.data<int64_t>() + new_chat_tokens.get_size(),
-                        std::back_inserter(m_tokenized_chat_history));
+            m_tokenized_history.clear();
+            std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
         } else {
             auto start_tokenizer_time = std::chrono::steady_clock::now();
             encoded_input_ids = m_tokenizer.encode(prompt).input_ids;
             auto end_tokenizer_time = std::chrono::steady_clock::now();
             metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
+            m_tokenized_history.clear();
+            std::copy_n(encoded_input_ids.data<int64_t>(), encoded_input_ids.get_size(), std::back_inserter(m_tokenized_history));
         }
+
         return encoded_input_ids;
     }
 
@@ -1172,12 +1188,12 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const {
     return m_impl->get_embedding_model();
 }
 
-std::vector<int64_t> InputsEmbedder::get_tokenized_chat_history() const {
-    return m_impl->get_tokenized_chat_history();
+std::vector<int64_t> InputsEmbedder::get_tokenized_history() const {
+    return m_impl->get_tokenized_history();
 }
 
-void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
-    return m_impl->update_tokenized_chat_history(encoded_result);
+void InputsEmbedder::update_tokenized_history(std::vector<int64_t> encoded_result, bool token_will_disappear) {
+    return m_impl->update_tokenized_history(encoded_result, token_will_disappear);
 }
 
 size_t InputsEmbedder::get_amount_to_remove_from_hist() const {
diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp
index 5c5b9d2b81..8c84c6ad43 100644
--- a/src/cpp/src/visual_language/inputs_embedder.hpp
+++ b/src/cpp/src/visual_language/inputs_embedder.hpp
@@ -41,16 +41,20 @@ class InputsEmbedder {
     Tokenizer get_tokenizer() const;
 
     // returns tokenized chat history
-    std::vector<int64_t> get_tokenized_chat_history() const;
+    std::vector<int64_t> get_tokenized_history() const;
+
     // add new results to tokenized chat history
-    void update_tokenized_chat_history(std::vector<int64_t> encoded_result);
+    void update_tokenized_history(std::vector<int64_t> encoded_result, bool token_will_disappear);
+
     // returns amount of elements, which need to remove from the end of the KV cache
     size_t get_amount_to_remove_from_hist() const;
 
     // starts chat and adds optional system_message to chat history
     void start_chat(const std::string& system_message);
+
     // adds currently generated text to chat history
     void update_chat_history(const std::string& decoded_results);
+
     // finishes chat and clears a chat history 
     void finish_chat();
 private:
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 1ce0cbf210..0d7aebc506 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -17,6 +17,7 @@
 #include "utils.hpp"
 #include "lm_encoding.hpp"
 
+
 using namespace ov::genai;
 
 namespace {
@@ -163,19 +164,18 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         auto to_remove_from_hist = m_inputs_embedder->get_amount_to_remove_from_hist();
         ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, m_kv_cache_seq_length_axis, std::nullopt);
 
-        Sampler sampler = Sampler(m_tokenizer);
-
         std::vector<SequenceGroup::Ptr> requests;
         size_t request_id = 0;
         size_t block_size = 1; // not used
         bool enable_prefix_caching = false;
 
-        auto tokenized_chat_history = m_inputs_embedder->get_tokenized_chat_history();
         size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - to_remove_from_hist;
         size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);
 
+        auto tokenized_history = m_inputs_embedder->get_tokenized_history();
         ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size });
-        std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), 0);
+        std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), m_tokenizer.get_pad_token_id());
+        std::copy(tokenized_history.begin(), tokenized_history.end(), prompt_ids.data<int64_t>());
 
         SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching);
         sequence_group->set_sequence_group_ptr(sequence_group);
@@ -204,6 +204,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
         std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), history_size);
 
+        Sampler sampler = Sampler(m_tokenizer);
+
         ov::genai::EncodedResults encoded_result;
         int32_t m_selected_beam = 0;
         std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler, requests,
@@ -243,7 +245,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         decoded.perf_metrics.m_evaluated = false;
         decoded.perf_metrics.evaluate_statistics(generate_start_time);
 
-        m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0]);
+        m_inputs_embedder->update_tokenized_history(encoded_result.tokens[0], requests[0]->get_finished_sequences()[0]->get_finish_reason() == GenerationFinishReason::LENGTH);
 
         return decoded;
     }

From e1f910ddef54728cc1147c9f839a09cdc176c2dd Mon Sep 17 00:00:00 2001
From: Alexander Suvorov <alexander.suvorov@intel.com>
Date: Thu, 19 Dec 2024 11:48:09 +0100
Subject: [PATCH 031/110] Whisper pipeline: cache models in python tests
 (#1389)

Ticket: 159277
---
 tests/python_tests/test_whisper_generate_api.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py
index 5a68dd98b6..9a117bc939 100644
--- a/tests/python_tests/test_whisper_generate_api.py
+++ b/tests/python_tests/test_whisper_generate_api.py
@@ -25,7 +25,9 @@ def run_gc_after_test():
     yield
     gc.collect()
 
-@functools.lru_cache(1)
+# used whisper models are relatively small
+# cache them in memory to speedup tests
+@functools.lru_cache(3)
 def read_whisper_model(params, **tokenizer_kwargs):
     model_id, path = params
 

From 0be7b3df3d28fa6c9009f1f070851b21bac4a4bf Mon Sep 17 00:00:00 2001
From: Alexander Suvorov <alexander.suvorov@intel.com>
Date: Thu, 19 Dec 2024 12:06:50 +0100
Subject: [PATCH 032/110] Whisper pipeline: implement 'initial_prompt' and
 'hotwords' parameters (#1378)

Adds:
* `initial_prompt` parameter ([faster_whisper
reference](https://github.com/SYSTRAN/faster-whisper/blob/203dddb047fd2c3ed2a520fe1416467a527e0f37/faster_whisper/transcribe.py#L732))
- injects initial prompt tokens as a previous transcription into the
first processing window
* `hotwords` parameter ([faster_whisper
reference](https://github.com/SYSTRAN/faster-whisper/blob/203dddb047fd2c3ed2a520fe1416467a527e0f37/faster_whisper/transcribe.py#L768))
- injects hotwords tokens as a previous transcription into the all
processing windows
* Whisper pipeline usage notes in samples

Closes https://github.com/openvinotoolkit/openvino.genai/issues/1150
Ticket: 156888
---
 .../cpp/whisper_speech_recognition/README.md  | 85 ++++++++++++++++++
 .../whisper_speech_recognition.cpp            |  1 +
 .../whisper_speech_recognition/README.md      | 87 ++++++++++++++++++
 .../whisper_speech_recognition.py             |  7 +-
 .../genai/whisper_generation_config.hpp       | 34 ++++++-
 src/cpp/src/whisper/context_tokens.cpp        | 89 +++++++++++++++++++
 src/cpp/src/whisper/context_tokens.hpp        | 25 ++++++
 src/cpp/src/whisper/whisper.cpp               | 24 +++--
 src/cpp/src/whisper/whisper.hpp               |  2 +
 src/cpp/src/whisper_generation_config.cpp     |  5 +-
 src/cpp/src/whisper_pipeline.cpp              |  6 ++
 src/cpp/src/whisper_pipeline_static.cpp       |  3 +
 .../openvino_genai/py_openvino_genai.pyi      | 53 +++++++++++
 src/python/py_whisper_pipeline.cpp            | 28 ++++++
 .../python_tests/test_whisper_generate_api.py | 25 ++++++
 15 files changed, 460 insertions(+), 14 deletions(-)
 create mode 100644 src/cpp/src/whisper/context_tokens.cpp
 create mode 100644 src/cpp/src/whisper/context_tokens.hpp

diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md
index 773135b648..d649266613 100644
--- a/samples/cpp/whisper_speech_recognition/README.md
+++ b/samples/cpp/whisper_speech_recognition/README.md
@@ -33,6 +33,91 @@ timestamps: [0, 2] text:  How are you doing today?
 
 See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
 
+# Whisper pipeline usage
+
+```c++
+#include "openvino/genai/whisper_pipeline.hpp"
+
+ov::genai::WhisperPipeline pipeline(model_dir, "CPU");
+// Pipeline expects normalized audio with Sample Rate of 16kHz
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech);
+//  How are you doing today?
+```
+
+### Transcription
+
+Whisper pipeline predicts the language of the source audio automatically.
+
+```c++
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech);
+//  How are you doing today?
+
+raw_speech = read_wav("fr_sample.wav");
+result = pipeline.generate(raw_speech);
+//  Il s'agit d'une entité très complexe qui consiste...
+```
+
+If the source audio languange is know in advance, it can be specified as an argument to `generate` method:
+
+```c++
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech, ov::genai::language("<|en|>"));
+//  How are you doing today?
+
+raw_speech = read_wav("fr_sample.wav");
+result = pipeline.generate(raw_speech, ov::genai::language("<|fr|>"));
+//  Il s'agit d'une entité très complexe qui consiste...
+```
+
+### Translation
+
+By default, Whisper performs the task of speech transcription, where the source audio language is the same as the target text language. To perform speech translation, where the target text is in English, set the task to "translate":
+
+```c++
+ov::genai::RawSpeechInput raw_speech = read_wav("fr_sample.wav");
+auto result = pipeline.generate(raw_speech, ov::genai::task("translate"));
+//  It is a very complex entity that consists...
+```
+
+### Timestamps prediction
+
+The model can predict timestamps. For sentence-level timestamps, pass the `return_timestamps` argument:
+
+```C++
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech, ov::genai::return_timestamps(true));
+
+std::cout << std::setprecision(2);
+for (auto& chunk : *result.chunks) {
+    std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
+}
+// timestamps: [0, 2] text:  How are you doing today?
+```
+
+### Long-Form audio Transcription
+
+The Whisper model is designed to work on audio samples of up to 30s in duration. Whisper pipeline uses sequential chunking algorithm to transcribe audio samples of arbitrary length.
+Sequential chunking algorithm uses a "sliding window", transcribing 30-second slices one after the other.
+
+### Initial prompt and hotwords
+
+Whisper pipeline has `initial_prompt` and `hotwords` generate arguments:
+* `initial_prompt`: initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window
+* `hotwords`: hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows
+
+The Whisper model can use that context to better understand the speech and maintain a consistent writing style. However, prompts do not need to be genuine transcripts from prior audio segments. Such prompts can be used to steer the model to use particular spellings or styles:
+
+```c++
+auto result = pipeline.generate(raw_speech);
+//  He has gone and gone for good answered Paul Icrom who...
+
+result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+//  He has gone and gone for good answered Polychrome who...
+```
+
+
 ### Troubleshooting
 
 #### Empty or rubbish output
diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
index 31d3f8c551..3df17a77f5 100644
--- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
+++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
@@ -28,6 +28,7 @@ int main(int argc, char* argv[]) try {
 
     std::cout << result << "\n";
 
+    std::cout << std::setprecision(2);
     for (auto& chunk : *result.chunks) {
         std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
     }
diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md
index 158bd18311..aeb46444bf 100644
--- a/samples/python/whisper_speech_recognition/README.md
+++ b/samples/python/whisper_speech_recognition/README.md
@@ -40,6 +40,93 @@ timestamps: [0, 2] text:  How are you doing today?
 
 See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
 
+# Whisper pipeline usage
+
+```python
+import openvino_genai
+import librosa
+
+def read_wav(filepath):
+    raw_speech, samplerate = librosa.load(filepath, sr=16000)
+    return raw_speech.tolist()
+
+pipe = openvino_genai.WhisperPipeline(model_dir, "CPU")
+# Pipeline expects normalized audio with Sample Rate of 16kHz
+raw_speech = read_wav('how_are_you_doing_today.wav')
+result = pipe.generate(raw_speech)
+#  How are you doing today?
+```
+
+### Transcription
+
+Whisper pipeline predicts the language of the source audio automatically.
+
+```python
+raw_speech = read_wav('how_are_you_doing_today.wav')
+result = pipe.generate(raw_speech)
+#  How are you doing today?
+
+raw_speech = read_wav('fr_sample.wav')
+result = pipe.generate(raw_speech)
+#  Il s'agit d'une entité très complexe qui consiste...
+```
+
+If the source audio languange is know in advance, it can be specified as an argument to `generate` method:
+
+```python
+raw_speech = read_wav("how_are_you_doing_today.wav")
+result = pipe.generate(raw_speech, language="<|en|>")
+#  How are you doing today?
+
+raw_speech = read_wav("fr_sample.wav")
+result = pipe.generate(raw_speech, language="<|fr|>")
+#  Il s'agit d'une entité très complexe qui consiste...
+```
+
+### Translation
+
+By default, Whisper performs the task of speech transcription, where the source audio language is the same as the target text language. To perform speech translation, where the target text is in English, set the task to "translate":
+
+```python
+raw_speech = read_wav("fr_sample.wav")
+result = pipe.generate(raw_speech, task="translate")
+# It is a very complex entity that consists...
+```
+
+### Timestamps prediction
+
+The model can predict timestamps. For sentence-level timestamps, pass the `return_timestamps` argument:
+
+```python
+raw_speech = read_wav("how_are_you_doing_today.wav")
+result = pipe.generate(raw_speech, return_timestamps=True)
+
+for chunk in result.chunks:
+    print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")
+# timestamps: [0.00, 2.00] text:  How are you doing today?
+```
+
+### Long-Form audio Transcription
+
+The Whisper model is designed to work on audio samples of up to 30s in duration. Whisper pipeline uses sequential chunking algorithm to transcribe audio samples of arbitrary length.
+Sequential chunking algorithm uses a "sliding window", transcribing 30-second slices one after the other.
+
+### Initial prompt and hotwords
+
+Whisper pipeline has `initial_prompt` and `hotwords` generate arguments:
+* `initial_prompt`: initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window
+* `hotwords`: hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows
+
+The Whisper model can use that context to better understand the speech and maintain a consistent writing style. However, prompts do not need to be genuine transcripts from prior audio segments. Such prompts can be used to steer the model to use particular spellings or styles:
+
+```python
+result = pipe.generate(raw_speech)
+#  He has gone and gone for good answered Paul Icrom who...
+
+result = pipe.generate(raw_speech, initial_prompt="Polychrome")
+#  He has gone and gone for good answered Polychrome who...
+```
+
 ### Troubleshooting
 
 #### Empty or rubbish output
diff --git a/samples/python/whisper_speech_recognition/whisper_speech_recognition.py b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py
index 3fddfc8ffa..9cf3be5fa1 100755
--- a/samples/python/whisper_speech_recognition/whisper_speech_recognition.py
+++ b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py
@@ -18,7 +18,7 @@ def main():
     parser.add_argument("wav_file_path")
     args = parser.parse_args()
 
-    device = "CPU"  # GPU can be used as well
+    device = "CPU"  # GPU, NPU can be used as well
     pipe = openvino_genai.WhisperPipeline(args.model_dir, device)
 
     config = pipe.get_generation_config()
@@ -34,8 +34,9 @@ def main():
 
     print(result)
 
-    for chunk in result.chunks:
-        print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}")
+    if result.chunks:
+        for chunk in result.chunks:
+            print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")
 
 
 if "__main__" == __name__:
diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp
index 37b23cde74..44d611923d 100644
--- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp
+++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <optional>
 #include <filesystem>
+#include <optional>
 
 #include "openvino/genai/tokenizer.hpp"
 #include "openvino/runtime/compiled_model.hpp"
@@ -46,6 +46,9 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig {
     // Transcribe token id.
     int64_t transcribe_token_id = 50359;
 
+    // Corresponds to the ”<|startofprev|>” token.
+    int64_t prev_sot_token_id = 50361;
+
     // No timestamps token id.
     int64_t no_timestamps_token_id = 50363;
 
@@ -75,6 +78,32 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig {
     // Note that a segment of text refers to a sequence of one or more words, rather than individual words.
     bool return_timestamps = false;
 
+    /*
+     * Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing
+     * window. Can be used to steer the model to use particular spellings or styles.
+     *
+     * Example:
+     *  auto result = pipeline.generate(raw_speech);
+     *  //  He has gone and gone for good answered Paul Icrom who...
+     *
+     *  auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+     *  //  He has gone and gone for good answered Polychrome who...
+     */
+    std::optional<std::string> initial_prompt = std::nullopt;
+
+    /*
+     * Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows.
+     * Can be used to steer the model to use particular spellings or styles.
+     *
+     * Example:
+     *  auto result = pipeline.generate(raw_speech);
+     *  //  He has gone and gone for good answered Paul Icrom who...
+     *
+     *  auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome"));
+     *  //  He has gone and gone for good answered Polychrome who...
+     */
+    std::optional<std::string> hotwords = std::nullopt;
+
     // A list containing tokens that will be suppressed at the beginning of the sampling process.
     std::vector<int64_t> begin_suppress_tokens;
 
@@ -111,9 +140,12 @@ static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
 static constexpr ov::Property<int64_t> transcribe_token_id{"transcribe_token_id"};
 static constexpr ov::Property<int64_t> translate_token_id{"translate_token_id"};
 static constexpr ov::Property<int64_t> no_timestamps_token_id{"no_timestamps_token_id"};
+static constexpr ov::Property<int64_t> prev_sot_token_id{"prev_sot_token_id"};
 static constexpr ov::Property<std::string> language{"language"};
 static constexpr ov::Property<std::string> task{"task"};
 static constexpr ov::Property<bool> return_timestamps{"return_timestamps"};
+static constexpr ov::Property<std::string> initial_prompt{"initial_prompt"};
+static constexpr ov::Property<std::string> hotwords{"hotwords"};
 static constexpr ov::Property<std::map<std::string, int64_t>> lang_to_id{"lang_to_id"};
 
 }  // namespace genai
diff --git a/src/cpp/src/whisper/context_tokens.cpp b/src/cpp/src/whisper/context_tokens.cpp
new file mode 100644
index 0000000000..75ee442551
--- /dev/null
+++ b/src/cpp/src/whisper/context_tokens.cpp
@@ -0,0 +1,89 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "context_tokens.hpp"
+
+namespace {
+std::pair<std::vector<int64_t>, float> tokenize(std::string&& text,
+                                                const ov::genai::WhisperGenerationConfig& config,
+                                                ov::genai::Tokenizer& tokenizer) {
+    if (text.empty()) {
+        return {{}, 0.0f};
+    }
+
+    auto start_time = std::chrono::steady_clock::now();
+    auto encoded = tokenizer.encode(text, ov::genai::add_special_tokens(false));
+    auto duration = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - start_time);
+
+    auto input_ids = encoded.input_ids;
+    auto input_ids_data = input_ids.data<int64_t>();
+
+    std::vector<int64_t> prompt_tokens;
+    prompt_tokens.reserve(input_ids.get_size());
+
+    // even with ov::genai::add_special_tokens(false) tokenizer adds next special tokens. Ticket: 159569
+    std::set<int64_t> special_tokens{config.decoder_start_token_id, config.eos_token_id, config.no_timestamps_token_id};
+
+    for (size_t i = 0; i < input_ids.get_size(); i++) {
+        if (special_tokens.count(input_ids_data[i])) {
+            continue;
+        }
+
+        prompt_tokens.emplace_back(input_ids_data[i]);
+    }
+
+    return {prompt_tokens, duration};
+}
+}  // namespace
+
+namespace ov {
+namespace genai {
+
+std::pair<WhisperContextTokens, float> prepare_context_tokens(const WhisperGenerationConfig& config,
+                                                              Tokenizer& tokenizer) {
+    WhisperContextTokens context_tokens;
+    float duration = 0.0f;
+
+    if (config.initial_prompt.has_value()) {
+        auto [initial_prompt_tokens, initial_prompt_duration] =
+            tokenize(" " + *config.initial_prompt, config, tokenizer);
+        context_tokens.initial_prompt = std::move(initial_prompt_tokens);
+        duration += initial_prompt_duration;
+    }
+
+    if (config.hotwords.has_value()) {
+        auto [hotwords_tokens, hotwords_duration] = tokenize(" " + *config.hotwords, config, tokenizer);
+        context_tokens.hotwords = std::move(hotwords_tokens);
+        duration += hotwords_duration;
+    }
+
+    return {context_tokens, duration};
+}
+
+std::vector<int64_t> get_prompt_tokens(const WhisperContextTokens& context_tokens,
+                                       const WhisperGenerationConfig& config,
+                                       size_t chunk_offset) {
+    bool should_add_initial_prompt = !context_tokens.initial_prompt.empty() && chunk_offset == 0;
+    bool should_add_hotwords = !context_tokens.hotwords.empty();
+
+    if (!should_add_initial_prompt && !should_add_hotwords) {
+        return {};
+    }
+
+    std::vector<int64_t> prompt_tokens{config.prev_sot_token_id};
+
+    if (should_add_initial_prompt) {
+        prompt_tokens.insert(prompt_tokens.end(),
+                             context_tokens.initial_prompt.begin(),
+                             context_tokens.initial_prompt.end());
+    }
+
+    if (should_add_hotwords) {
+        prompt_tokens.insert(prompt_tokens.end(), context_tokens.hotwords.begin(), context_tokens.hotwords.end());
+    }
+
+    return prompt_tokens;
+}
+
+}  // namespace genai
+}  // namespace ov
diff --git a/src/cpp/src/whisper/context_tokens.hpp b/src/cpp/src/whisper/context_tokens.hpp
new file mode 100644
index 0000000000..0042ba8136
--- /dev/null
+++ b/src/cpp/src/whisper/context_tokens.hpp
@@ -0,0 +1,25 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "openvino/genai/perf_metrics.hpp"
+#include "openvino/genai/whisper_generation_config.hpp"
+
+namespace ov {
+namespace genai {
+
+struct WhisperContextTokens {
+    std::vector<int64_t> initial_prompt;
+    std::vector<int64_t> hotwords;
+};
+
+std::pair<WhisperContextTokens, float> prepare_context_tokens(const WhisperGenerationConfig& config,
+                                                              Tokenizer& tokenizer);
+
+std::vector<int64_t> get_prompt_tokens(const WhisperContextTokens& context_tokens,
+                                       const WhisperGenerationConfig& config,
+                                       size_t chunk_offset);
+
+}  // namespace genai
+}  // namespace ov
diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp
index 355ccc619b..9d6aa698ce 100644
--- a/src/cpp/src/whisper/whisper.cpp
+++ b/src/cpp/src/whisper/whisper.cpp
@@ -8,6 +8,7 @@
 #include <regex>
 #include <thread>
 
+#include "context_tokens.hpp"
 #include "logit_processor.hpp"
 #include "openvino/genai/perf_metrics.hpp"
 #include "openvino/genai/whisper_generation_config.hpp"
@@ -175,11 +176,11 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state,
     return output_token;
 }
 
-std::vector<int64_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
-                                      ov::InferRequest decoder,
-                                      const ov::genai::WhisperGenerationConfig& config,
-                                      const bool return_timestamps,
-                                      ov::genai::RawPerfMetrics& raw_metrics) {
+std::vector<int64_t> prepare_init_tokens(ov::Tensor& encoder_hidden_state,
+                                         ov::InferRequest decoder,
+                                         const ov::genai::WhisperGenerationConfig& config,
+                                         const bool return_timestamps,
+                                         ov::genai::RawPerfMetrics& raw_metrics) {
     if (!config.is_multilingual) {
         if (return_timestamps) {
             return std::vector<int64_t>{config.decoder_start_token_id};
@@ -290,6 +291,7 @@ namespace genai {
 
 WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config,
                                        const ov::genai::WhisperConfig& model_config,
+                                       const WhisperContextTokens& context_tokens,
                                        const RawSpeechInput& raw_speech,
                                        ov::genai::WhisperInitializedModels& models,
                                        WhisperFeatureExtractor& feature_extractor,
@@ -313,7 +315,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig&
     // long-form audio processing requires timestamps to be enabled
     const bool return_timestamps = config.return_timestamps || !is_shortform;
 
-    std::vector<int64_t> init_ids;
+    std::vector<int64_t> init_tokens;
     std::vector<int64_t>& output_tokens = result.output_tokens;
     std::vector<Segment> segments;
 
@@ -335,14 +337,18 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig&
                                                 raw_metrics);
 
         // prepare init_ids just once for whole input
-        if (init_ids.empty()) {
-            init_ids = prepare_init_ids(hidden_state_tensor, models.decoder, config, return_timestamps, raw_metrics);
+        if (init_tokens.empty()) {
+            init_tokens =
+                prepare_init_tokens(hidden_state_tensor, models.decoder, config, return_timestamps, raw_metrics);
         }
 
+        std::vector<int64_t> chunk_init_tokens = ov::genai::get_prompt_tokens(context_tokens, config, chunk_offset);
+        chunk_init_tokens.insert(chunk_init_tokens.end(), init_tokens.begin(), init_tokens.end());
+
         auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor,
                                                             config,
                                                             models,
-                                                            init_ids,
+                                                            chunk_init_tokens,
                                                             max_new_tokens - output_tokens.size(),
                                                             return_timestamps,
                                                             raw_metrics,
diff --git a/src/cpp/src/whisper/whisper.hpp b/src/cpp/src/whisper/whisper.hpp
index 4904edf925..81f559db9f 100644
--- a/src/cpp/src/whisper/whisper.hpp
+++ b/src/cpp/src/whisper/whisper.hpp
@@ -5,6 +5,7 @@
 
 #include <openvino/openvino.hpp>
 
+#include "context_tokens.hpp"
 #include "openvino/genai/whisper_generation_config.hpp"
 #include "openvino/genai/whisper_pipeline.hpp"
 #include "whisper_config.hpp"
@@ -28,6 +29,7 @@ struct WhisperGenerateResult {
 
 WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config,
                                        const ov::genai::WhisperConfig& model_config,
+                                       const WhisperContextTokens& context_tokens,
                                        const ov::genai::RawSpeechInput& raw_speech,
                                        ov::genai::WhisperInitializedModels& models,
                                        ov::genai::WhisperFeatureExtractor& feature_extractor,
diff --git a/src/cpp/src/whisper_generation_config.cpp b/src/cpp/src/whisper_generation_config.cpp
index 0fba4e962f..beb663caaf 100644
--- a/src/cpp/src/whisper_generation_config.cpp
+++ b/src/cpp/src/whisper_generation_config.cpp
@@ -8,8 +8,8 @@
 #include <nlohmann/json.hpp>
 #include <openvino/runtime/core.hpp>
 
-#include "utils.hpp"
 #include "json_utils.hpp"
+#include "utils.hpp"
 
 namespace ov {
 namespace genai {
@@ -31,6 +31,7 @@ WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& js
     read_json_param(data, "pad_token_id", pad_token_id);
     read_json_param(data, "no_timestamps_token_id", no_timestamps_token_id);
     read_json_param(data, "max_initial_timestamp_index", max_initial_timestamp_index);
+    read_json_param(data, "prev_sot_token_id", prev_sot_token_id);
 
     read_json_param(data, "is_multilingual", is_multilingual);
     if (is_multilingual) {
@@ -73,6 +74,8 @@ void WhisperGenerationConfig::update_generation_config(const ov::AnyMap& config_
     read_anymap_param(config_map, "lang_to_id", lang_to_id);
     read_anymap_param(config_map, "task", task);
     read_anymap_param(config_map, "return_timestamps", return_timestamps);
+    read_anymap_param(config_map, "initial_prompt", initial_prompt);
+    read_anymap_param(config_map, "hotwords", hotwords);
 }
 
 size_t WhisperGenerationConfig::get_max_new_tokens(size_t prompt_length) const {
diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp
index d472a20238..f0fb34cdf6 100644
--- a/src/cpp/src/whisper_pipeline.cpp
+++ b/src/cpp/src/whisper_pipeline.cpp
@@ -9,6 +9,7 @@
 #include <variant>
 
 #include "utils.hpp"
+#include "whisper/context_tokens.hpp"
 #include "whisper/streamer.hpp"
 #include "whisper/whisper.hpp"
 #include "whisper/whisper_config.hpp"
@@ -91,8 +92,11 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi
             streamer_ptr = std::make_shared<ChunkTextCallbackStreamer>(m_tokenizer, *callback);
         }
 
+        auto [context_tokens, tokenization_duration_microseconds] = prepare_context_tokens(config, m_tokenizer);
+
         auto generate_result = ov::genai::whisper_generate(config,
                                                            m_model_config,
+                                                           context_tokens,
                                                            raw_speech_input,
                                                            m_models,
                                                            m_feature_extractor,
@@ -102,6 +106,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi
         generate_result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
             PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
 
+        result.perf_metrics.raw_metrics.tokenization_durations.emplace_back(tokenization_duration_microseconds);
+
         result.perf_metrics = generate_result.perf_metrics;
         auto& segments = generate_result.segments;
 
diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp
index 136819fa01..dc26789846 100644
--- a/src/cpp/src/whisper_pipeline_static.cpp
+++ b/src/cpp/src/whisper_pipeline_static.cpp
@@ -579,6 +579,9 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
     WhisperGenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
     config.validate();
 
+    OPENVINO_ASSERT(!config.initial_prompt.has_value(), "'initial_prompt' parameter is not supported on NPU device.");
+    OPENVINO_ASSERT(!config.hotwords.has_value(), "'hotwords' parameter is not supported on NPU device.");
+
     std::shared_ptr<ChunkStreamerBase> streamer_ptr;
     if (auto streamer_obj = std::get_if<std::monostate>(&streamer)) {
         streamer_ptr = nullptr;
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index bfcb869157..3d27b23052 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -1948,6 +1948,9 @@ class WhisperGenerationConfig:
         :param no_timestamps_token_id: No timestamps token id.
         :type no_timestamps_token_id: int
     
+        :param prev_sot_token_id: Corresponds to the ”<|startofprev|>” token.
+        :type prev_sot_token_id: int
+    
         :param is_multilingual:
         :type is_multilingual: bool
     
@@ -1976,10 +1979,34 @@ class WhisperGenerationConfig:
                            then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
                            Note that a segment of text refers to a sequence of one or more words, rather than individual words.
         :type return_timestamps: bool
+    
+        :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing
+        window. Can be used to steer the model to use particular spellings or styles.
+    
+        Example:
+          auto result = pipeline.generate(raw_speech);
+          //  He has gone and gone for good answered Paul Icrom who...
+    
+          auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+          //  He has gone and gone for good answered Polychrome who...
+        :type initial_prompt: Optional[str]
+    
+        :param hotwords:  Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows.
+        Can be used to steer the model to use particular spellings or styles.
+    
+        Example:
+          auto result = pipeline.generate(raw_speech);
+          //  He has gone and gone for good answered Paul Icrom who...
+    
+          auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome"));
+          //  He has gone and gone for good answered Polychrome who...
+        :type hotwords: Optional[str]
     """
     begin_suppress_tokens: list[int]
     decoder_start_token_id: int
     eos_token_id: int
+    hotwords: str | None
+    initial_prompt: str | None
     is_multilingual: bool
     lang_to_id: dict[str, int]
     language: str | None
@@ -1988,6 +2015,7 @@ class WhisperGenerationConfig:
     max_new_tokens: int
     no_timestamps_token_id: int
     pad_token_id: int
+    prev_sot_token_id: int
     return_timestamps: bool
     suppress_tokens: list[int]
     task: str | None
@@ -2080,6 +2108,9 @@ class WhisperPipeline:
             :param no_timestamps_token_id: No timestamps token id.
             :type no_timestamps_token_id: int
         
+            :param prev_sot_token_id: Corresponds to the ”<|startofprev|>” token.
+            :type prev_sot_token_id: int
+        
             :param is_multilingual:
             :type is_multilingual: bool
         
@@ -2108,6 +2139,28 @@ class WhisperPipeline:
                                then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
                                Note that a segment of text refers to a sequence of one or more words, rather than individual words.
             :type return_timestamps: bool
+        
+            :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing
+            window. Can be used to steer the model to use particular spellings or styles.
+        
+            Example:
+              auto result = pipeline.generate(raw_speech);
+              //  He has gone and gone for good answered Paul Icrom who...
+        
+              auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+              //  He has gone and gone for good answered Polychrome who...
+            :type initial_prompt: Optional[str]
+        
+            :param hotwords:  Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows.
+            Can be used to steer the model to use particular spellings or styles.
+        
+            Example:
+              auto result = pipeline.generate(raw_speech);
+              //  He has gone and gone for good answered Paul Icrom who...
+        
+              auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome"));
+              //  He has gone and gone for good answered Polychrome who...
+            :type hotwords: Optional[str]
         """
     def get_generation_config(self) -> WhisperGenerationConfig:
         ...
diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp
index 49152c03f4..cd42dcf58d 100644
--- a/src/python/py_whisper_pipeline.cpp
+++ b/src/python/py_whisper_pipeline.cpp
@@ -103,6 +103,9 @@ auto whisper_generation_config_docstring = R"(
     :param no_timestamps_token_id: No timestamps token id.
     :type no_timestamps_token_id: int
 
+    :param prev_sot_token_id: Corresponds to the ”<|startofprev|>” token.
+    :type prev_sot_token_id: int
+
     :param is_multilingual:
     :type is_multilingual: bool
 
@@ -131,6 +134,28 @@ auto whisper_generation_config_docstring = R"(
                        then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
                        Note that a segment of text refers to a sequence of one or more words, rather than individual words.
     :type return_timestamps: bool
+
+    :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing
+    window. Can be used to steer the model to use particular spellings or styles.
+
+    Example:
+      auto result = pipeline.generate(raw_speech);
+      //  He has gone and gone for good answered Paul Icrom who...
+
+      auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+      //  He has gone and gone for good answered Polychrome who...
+    :type initial_prompt: Optional[str]
+
+    :param hotwords:  Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows.
+    Can be used to steer the model to use particular spellings or styles.
+
+    Example:
+      auto result = pipeline.generate(raw_speech);
+      //  He has gone and gone for good answered Paul Icrom who...
+
+      auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome"));
+      //  He has gone and gone for good answered Polychrome who...
+    :type hotwords: Optional[str]
 )";
 
 auto streamer_base_docstring = R"(
@@ -262,11 +287,14 @@ void init_whisper_pipeline(py::module_& m) {
         .def_readwrite("transcribe_token_id", &WhisperGenerationConfig::transcribe_token_id)
         .def_readwrite("max_initial_timestamp_index", &WhisperGenerationConfig::max_initial_timestamp_index)
         .def_readwrite("no_timestamps_token_id", &WhisperGenerationConfig::no_timestamps_token_id)
+        .def_readwrite("prev_sot_token_id", &WhisperGenerationConfig::prev_sot_token_id)
         .def_readwrite("is_multilingual", &WhisperGenerationConfig::is_multilingual)
         .def_readwrite("language", &WhisperGenerationConfig::language)
         .def_readwrite("lang_to_id", &WhisperGenerationConfig::lang_to_id)
         .def_readwrite("task", &WhisperGenerationConfig::task)
         .def_readwrite("return_timestamps", &WhisperGenerationConfig::return_timestamps)
+        .def_readwrite("initial_prompt", &WhisperGenerationConfig::initial_prompt)
+        .def_readwrite("hotwords", &WhisperGenerationConfig::hotwords)
         .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id"));
 
     py::class_<WhisperRawPerfMetrics>(m, "WhisperRawPerfMetrics", raw_perf_metrics_docstring)
diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py
index 9a117bc939..1450ef1f2e 100644
--- a/tests/python_tests/test_whisper_generate_api.py
+++ b/tests/python_tests/test_whisper_generate_api.py
@@ -570,6 +570,31 @@ def test_longform_audio(model_descr, test_sample):
     assert genai_result.chunks == None
 
 
+@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
+@pytest.mark.parametrize(
+    "test_sample",
+    get_samples_from_dataset(length=1),
+)
+@pytest.mark.precommit
+def test_initial_prompt_hotwords(model_descr, test_sample):
+    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
+
+    result = pipe.generate(test_sample)
+
+    assert "Joel Keaton" in result.texts[0]
+    assert "Joel Kyton" not in result.texts[0]
+
+    result = pipe.generate(test_sample, initial_prompt="Joel Kyton")
+
+    assert "Joel Keaton" not in result.texts[0]
+    assert "Joel Kyton" in result.texts[0]
+
+    result = pipe.generate(test_sample, hotwords="Joel Kyton")
+
+    assert "Joel Keaton" not in result.texts[0]
+    assert "Joel Kyton" in result.texts[0]
+
+
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
     "test_sample",

From c13e8e5a2effdb7834a40a10586dfdd39e72bd2a Mon Sep 17 00:00:00 2001
From: Irina Efode <irina.efode@intel.com>
Date: Thu, 19 Dec 2024 19:34:57 +0400
Subject: [PATCH 033/110] [ SD ] Fix of scheduler config for main_pipeline
 (#1406)

---
 .../speculative_decoding_impl.cpp               | 17 +++++++++--------
 .../utils/paged_attention_transformations.cpp   |  2 +-
 .../utils/paged_attention_transformations.hpp   |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
index 4a0748b5c0..46b7b106a6 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -46,14 +46,15 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
                                draft_scheduler_config = is_scheduler_undefined ? main_scheduler_config : draft_model_desc.scheduler_config;
     if (is_scheduler_undefined) {
         // split KV cache to 2 caches for main and draft models
-        size_t main_model_cache_size = utils::get_kv_cache_size(main_model),
-            draft_model_cache_size = utils::get_kv_cache_size(draft_model);
-        auto k = static_cast<float>(draft_model_cache_size) / (main_model_cache_size + draft_model_cache_size);
+        size_t main_model_hidden_size = utils::get_hidden_size(main_model),
+               draft_model_hidden_size = utils::get_hidden_size(draft_model);
+        auto k = static_cast<float>(draft_model_hidden_size) / (main_model_hidden_size + draft_model_hidden_size);
 
-        size_t main_cache_size = main_scheduler_config.cache_size * (1 - k),
+        size_t main_cache_size = std::ceil(main_scheduler_config.cache_size * (1.f - k)),
                draft_cache_size = main_scheduler_config.cache_size - main_cache_size;
+        OPENVINO_ASSERT(main_cache_size > 0, "KV cache model cache size should be > 0");
         if (draft_cache_size == 0) {
-            main_cache_size -= main_cache_size > 1 ? 1 : 0;
+            main_cache_size -= (main_cache_size > 1 ? 1 : 0);
             draft_cache_size = 1;
         }
 
@@ -63,7 +64,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
 
     ov::AnyMap draft_properties = draft_model_desc.properties == ov::AnyMap{} ? compile_properties : draft_model_desc.properties;
 
-    DeviceConfig main_device_config(core, main_scheduler_config, main_device, compile_properties),
+    DeviceConfig main_device_config(core, main_scheduler_config_updated, main_device, compile_properties),
                  draft_device_config(core, draft_scheduler_config, draft_device, draft_properties);
 
     utils::set_kv_cache_type_and_shape(main_model, main_device_config);
@@ -82,7 +83,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
     // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode
     m_main_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
         main_model, main_model_tokenizer, main_model_desc.generation_config,
-        main_device_config, main_scheduler_config, main_device, compile_properties, true);
+        main_device_config, main_scheduler_config_updated, main_device, compile_properties, true);
     m_draft_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
         draft_model, draft_model_tokenizer, draft_model_desc.generation_config,
         draft_device_config, draft_scheduler_config, draft_device, draft_properties, false);
@@ -278,4 +279,4 @@ SpeculativeDecodingMetrics
 ContinuousBatchingPipeline::SpeculativeDecodingImpl::get_speculative_decoding_metrics() {
     return m_sd_metrics;
 };
-}
\ No newline at end of file
+}
diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp
index 53690f770c..16c9556151 100644
--- a/src/cpp/src/utils/paged_attention_transformations.cpp
+++ b/src/cpp/src/utils/paged_attention_transformations.cpp
@@ -16,7 +16,7 @@ inline ov::PartialShape to_partial_with_dyn_0_dim(const ov::Shape& static_shape)
     return partial_shape;
 }
 
-size_t get_kv_cache_size(const std::shared_ptr<ov::Model> model) {
+size_t get_hidden_size(const std::shared_ptr<ov::Model> model) {
     const auto& parameters = model->get_parameters();
     // extract num_kv_heads and head_size
     size_t kv_caches_inputs_offset = 2;
diff --git a/src/cpp/src/utils/paged_attention_transformations.hpp b/src/cpp/src/utils/paged_attention_transformations.hpp
index 3bc423d7bc..88ac0876c5 100644
--- a/src/cpp/src/utils/paged_attention_transformations.hpp
+++ b/src/cpp/src/utils/paged_attention_transformations.hpp
@@ -23,7 +23,7 @@ void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, Dev
 
 void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, bool per_layer_cache_control = false);
 
-size_t get_kv_cache_size(const std::shared_ptr<ov::Model> model);
+size_t get_hidden_size(const std::shared_ptr<ov::Model> model);
 
 void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig& device_config);
 

From 19c66f5d3c316f0d54b1e4f2594d72b3a4add018 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@gmail.com>
Date: Thu, 19 Dec 2024 18:06:27 +0100
Subject: [PATCH 034/110] Fail gracefully when openvino_tokenizer.xml is not
 available (#1413)

Is was failing with segfault. Now fails more gracefully
```
Check 'm_ireq_queue_tokenizer' failed at .../src/cpp/src/tokenizer.cpp:387:
Either openvino_tokenizer.xml was not provided or it was not loaded correctly. Tokenizer::encode is not available
```
CVS-158884
---
 src/cpp/src/tokenizer.cpp | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 642236d32a..ed6fbc0a06 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -194,10 +194,16 @@ class Tokenizer::TokenizerImpl {
 
     void setupTokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models,  const ov::AnyMap& properties) {
         auto [ov_tokenizer, ov_detokenizer] = models;
+        OPENVINO_ASSERT(ov_tokenizer || ov_detokenizer, "Neither tokenizer nor detokenzier models were provided");
 
-        m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1;
         auto core = get_core_singleton();
         std::string device = "CPU"; // only CPU is supported for now
+        
+        std::string version_str;
+        utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str);
+        // Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5
+        m_older_than_24_5 = version_str.empty();
+
         if (ov_tokenizer) {
             ov::pass::Manager manager;
             manager.register_pass<MakeCombineSegmentsSatateful>();
@@ -230,7 +236,8 @@ class Tokenizer::TokenizerImpl {
         if (m_tokenizer) {
             // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
             encode("non empty string").input_ids;
-        if (m_detokenizer)
+        }
+        if (m_detokenizer) {
             decode({1, 33, 199, 42, 42});
         }
 
@@ -377,6 +384,9 @@ class Tokenizer::TokenizerImpl {
     }
 
     TokenizedInputs encode(std::string prompt, const ov::AnyMap& tokenization_params = {}) {
+        OPENVINO_ASSERT(m_ireq_queue_tokenizer, "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. "
+                                                "Tokenizer::encode is not available");
+
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
         set_state_if_necessary(infer_request_guard, tokenization_params);
         size_t batch_size = 1;
@@ -390,6 +400,8 @@ class Tokenizer::TokenizerImpl {
     }
 
     TokenizedInputs encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params = {}) {
+        OPENVINO_ASSERT(m_ireq_queue_tokenizer, "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. "
+                                                "Tokenizer::encode is not available");
         TokenizedInputs unpadded;
         {
             CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());

From 4d18f8b264c79ddce3c2dc0997992c26ab5c6c5f Mon Sep 17 00:00:00 2001
From: Sofya Balandina <sofya.balandina@intel.com>
Date: Fri, 20 Dec 2024 08:03:49 +0000
Subject: [PATCH 035/110] Make Sampler a member of the class for llm/vlm
 pipelines (#1412)

cherry-pick https://github.com/openvinotoolkit/openvino.genai/pull/1347
to master
---
 src/cpp/src/llm_pipeline.cpp                 | 12 +++++++++---
 src/cpp/src/lm_encoding.cpp                  |  3 +++
 src/cpp/src/sampler.hpp                      |  7 ++++++-
 src/cpp/src/visual_language/pipeline.cpp     | 14 ++++++++++++--
 tests/python_tests/test_chat_generate_api.py |  7 +++++--
 5 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 6d9aae30fa..6fdb8ac1cd 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -45,6 +45,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
     ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
     size_t m_to_remove_from_hist = 0;
     size_t m_kv_cache_seq_length_axis = 2;
+    Sampler m_sampler;
 
     StatefulLLMPipeline(
         const ov::InferRequest& request,
@@ -75,7 +76,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         const std::string& device,
         const ov::AnyMap& config,
         const ov::genai::GenerationConfig& generation_config
-    ) : LLMPipelineImplBase(tokenizer, generation_config) {
+    ) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
         ov::Core core;
         ov::CompiledModel compiled_model;
         auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
@@ -96,6 +97,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         // If eos_token_id was not provided, take value
         if (m_generation_config.eos_token_id == -1)
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
+
+        m_sampler.set_seed(m_generation_config.rng_seed);
     }
 
     StatefulLLMPipeline(
@@ -358,9 +361,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                 requests.push_back(sequence_group);
             }
 
-            Sampler sampler = Sampler(m_tokenizer);
+            if (m_sampler.get_seed() != config.rng_seed) {
+                m_sampler.set_seed(config.rng_seed);
+            }
+
             std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr,
-                                                                                  sampler, requests, position_ids, std::nullopt, m_selected_beam);
+                                                                                  m_sampler, requests, position_ids, std::nullopt, m_selected_beam);
         }
 
         if (is_chat_conversation) {
diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index 3ab041fa58..62c53cace4 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -247,6 +247,9 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
         // next_selected_beam = sampler.last_selected_beam(request);
     }
 
+    for (SequenceGroup::Ptr sequence_group : sequence_groups)
+        sampler.clear_request_info(sequence_group->get_request_id());
+
     return {results, next_selected_beam};
 }
 
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index 0f7876cbf9..08a9863e0a 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -55,6 +55,7 @@ class Sampler {
     std::map<uint64_t, GroupBeamSearcher> m_beam_search_info;
 
     std::mt19937 rng_engine;
+    size_t seed = rng_engine.default_seed;
     // { request_id, logit_processor }
     std::map<uint64_t, LogitProcessor> m_logit_processors;
 
@@ -65,7 +66,11 @@ class Sampler {
     Sampler(Tokenizer & tokenizer) : m_tokenizer(tokenizer) {};
 
     SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false);
-    void set_seed(size_t seed) { rng_engine.seed(seed); }
+    void set_seed(size_t new_seed) {
+        rng_engine.seed(new_seed);
+        seed = new_seed;
+    }
+    size_t get_seed() { return seed; }
 
     void clear_request_info(uint64_t request_id);
 
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 0d7aebc506..7bf1c1070a 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -67,6 +67,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
     float m_load_time_ms = 0;
     // Axis num in kv cache from m_language model, which contains information about history len
     size_t m_kv_cache_seq_length_axis = 2;
+    // Component for applying sampling to lm outputs
+    Sampler m_sampler;
 
     VLMPipelineImpl(
         const std::filesystem::path& models_dir,
@@ -105,6 +107,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         if (m_generation_config.eos_token_id == -1) {
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
         }
+
+        m_sampler = Sampler(m_tokenizer);
+        m_sampler.set_seed(m_generation_config.rng_seed);
     }
 
     VLMPipelineImpl(
@@ -140,6 +145,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         if (m_generation_config.eos_token_id == -1) {
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
         }
+
+        m_sampler = Sampler(m_tokenizer);
+        m_sampler.set_seed(m_generation_config.rng_seed);
     }
 
     VLMDecodedResults generate(
@@ -204,11 +212,13 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
         std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), history_size);
 
-        Sampler sampler = Sampler(m_tokenizer);
+        if (m_sampler.get_seed() != generation_config.rng_seed) {
+            m_sampler.set_seed(generation_config.rng_seed);
+        }
 
         ov::genai::EncodedResults encoded_result;
         int32_t m_selected_beam = 0;
-        std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler, requests,
+        std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
                                                                                       position_ids, m_embedding, std::nullopt);
 
         auto decode_start_time = std::chrono::steady_clock::now();
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
index 9260e671d6..d9661e538b 100644
--- a/tests/python_tests/test_chat_generate_api.py
+++ b/tests/python_tests/test_chat_generate_api.py
@@ -187,10 +187,13 @@ def test_set_chat_template():
     model_descr = get_chat_models_list()[0]
     model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
     pipe.get_tokenizer().set_chat_template("{% for message in messages %}{{ message['content'] }}{% endfor %}")
+    config = ov_genai.GenerationConfig()
+    config.max_new_tokens = 1
+    config.do_sample = False
     pipe.start_chat()
-    generated = pipe.generate("a", max_new_tokens=1)
+    generated = pipe.generate("a", config)
     pipe.finish_chat()
-    reference = pipe.generate("a", max_new_tokens=1)
+    reference = pipe.generate("a", config)
     assert generated == reference
 
 prompts = [

From 04d97283263de2303a9df61cf43d20a624e07d0d Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Fri, 20 Dec 2024 13:33:54 +0400
Subject: [PATCH 036/110] [LLM/VLM] Stop generation when streaming callback
 returns true (#1410)

Affects only stateful VLM and LLM pipelines and CB, SD implementation
should be fixed separately as 2 pipelines should be aborted in case of
exception / cancel via streaming callback
---
 src/cpp/src/continuous_batching_impl.cpp |  81 ++++++++++-------
 src/cpp/src/generation_handle.cpp        |   2 +-
 src/cpp/src/generation_stream.hpp        |   5 +-
 src/cpp/src/llm_pipeline.cpp             |  15 ++--
 src/cpp/src/lm_encoding.cpp              | 105 ++++++++++-------------
 src/cpp/src/sequence_group.hpp           |  14 +--
 src/cpp/src/visual_language/pipeline.cpp |   5 +-
 7 files changed, 113 insertions(+), 114 deletions(-)

diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 6e7e982a4c..e1ffd062de 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -22,7 +22,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
     m_tokenizer = tokenizer;
     m_generation_config = generation_config;
     m_is_validation_mode_enabled = is_validation_mode_enabled;
-    
+
     ov::Core core;
 
     auto [core_properties, compile_properties] = utils::split_core_compile_config(properties);
@@ -255,18 +255,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
         }
     }, streamer);
 
-    OPENVINO_ASSERT(streamer_ptr == nullptr || input_ids.size() == 1 && (sampling_params[0].is_greedy_decoding() || sampling_params[0].is_multinomial()),
-        "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
-
-    std::vector<GenerationHandle> generations;
-    for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
-        OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");
-        generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id]));
-    }
-
-    std::vector<EncodedGenerationResult> results;
-    results.reserve(m_awaiting_requests.size());
-
     auto drop_requests = [&] () {
         for (const std::shared_ptr<ov::genai::SequenceGroup> request : m_requests) {
             for (const auto& sequence: request->get_sequences()) {
@@ -279,25 +267,40 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
         m_requests.clear();
     };
 
+    OPENVINO_ASSERT(streamer_ptr == nullptr || input_ids.size() == 1 && sampling_params[0].num_return_sequences == 1 &&
+        (sampling_params[0].is_greedy_decoding() || sampling_params[0].is_multinomial()),
+        "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
+
+    std::vector<GenerationHandle> generations;
+    for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
+        OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");
+        generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id]));
+    }
+    auto all_requests = m_awaiting_requests; // we need to store all requests to get results from them once generation has finished
+
     bool continue_generation = true;
     while (has_non_finished_requests() && continue_generation) {
         try {
             step();
         } catch (...) {
-            drop_requests();
+            drop_requests(); // remove all requests from pipeline state in case of exception
             throw;
         }
-        if (streamer_ptr && generations.at(0)->can_read()) {
-            std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back();
+
+        auto & generation = generations.at(0);
+        if (streamer_ptr && generation->can_read()) {
+            std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
             for (const auto& gen_token : token.begin()->second.generated_ids) {
-                if (!streamer_ptr->put(gen_token)) {
+                continue_generation = !streamer_ptr->put(gen_token);
+                if (!continue_generation) {
+                    generation->drop();
                     break;
                 }
             }
         }
     }
 
-    if (streamer_ptr) {
+    if (streamer_ptr) { // push streamer's cache
         streamer_ptr->end();
     }
 
@@ -307,16 +310,32 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
         OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed");
     }
 
-    for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) {
-        const auto& generation = generations[generation_idx];
+    std::vector<EncodedGenerationResult> results;
+    results.reserve(all_requests.size());
+
+    for (size_t request_id = 0; request_id < all_requests.size(); ++request_id) {
+        const auto& request = all_requests[request_id];
+        auto sampling_params = request->get_sampling_parameters();
+        const auto& sequences = request->get_finished_sequences();
+        size_t num_outputs = std::min(sampling_params.num_return_sequences, sequences.size());
+
         EncodedGenerationResult result;
-        result.m_request_id = 1;
-        std::vector<GenerationOutput> generation_outputs = generation->read_all();
-        for (const auto& generation_output : generation_outputs) {
-            result.m_generation_ids.push_back(std::move(generation_output.generated_ids));
-            result.m_scores.push_back(generation_output.score);
+        result.m_request_id = request_id;
+        result.m_generation_ids.resize(num_outputs);
+        result.m_scores.resize(num_outputs);
+
+        for (size_t i = 0; i < num_outputs; ++i) {
+            const auto & sequence = sequences[i];
+            const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_probs();
+            const auto & generated_ids = sequence->get_generated_ids();
+
+            if (sampling_params.echo)
+                result.m_generation_ids[i] = request->get_prompt_ids();
+            std::copy(generated_ids.begin(), generated_ids.end(), std::back_inserter(result.m_generation_ids[i]));
+            result.m_scores[i] = score;
         }
-        result.m_status = generation->get_status();
+
+        result.m_status = generations[request_id]->get_status();
         results.push_back(std::move(result));
     }
 
@@ -408,7 +427,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
     for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
         SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id];
         // requests not scheduled, in decoding phase or not echoing are not processed
-        if (!sequence_group->is_scheduled() || sequence_group->get_context_len() > sequence_group->get_prompt_len() || 
+        if (!sequence_group->is_scheduled() || sequence_group->get_context_len() > sequence_group->get_prompt_len() ||
             !sequence_group->get_sampling_parameters().echo)
             continue;
 
@@ -421,10 +440,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
 
         size_t num_prompt_tokens_processed = sequence_group->get_num_processed_tokens();
         OPENVINO_ASSERT(num_prompt_tokens_processed + actual_seq_len <= sequence_group->get_prompt_len());
-        
+
         // if we processed the whole prompt we don't include last logprob as it will be processed by the sampler (it's already completion)
-        // otherwise we include it as it will be used in the next part of the prompt 
-        int exclude_last_logprob = 1; 
+        // otherwise we include it as it will be used in the next part of the prompt
+        int exclude_last_logprob = 1;
         if (num_prompt_tokens_processed + actual_seq_len < sequence_group->get_prompt_len())
             exclude_last_logprob = 0;
 
@@ -435,7 +454,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
         for (int token_logits_offset = 0, token_id_offset = num_prompt_tokens_processed + 1;
              token_logits_offset < actual_seq_len - exclude_last_logprob;
              token_logits_offset++, token_id_offset++) {
-            
+
             const float* token_logits = (sequence_group_logits_data + token_logits_offset * vocab_size);
             int64_t token_id = sequence_group->get_prompt_ids()[token_id_offset];
             float token_logit = token_logits[token_id];
diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp
index a1dd467523..0f10a85a86 100644
--- a/src/cpp/src/generation_handle.cpp
+++ b/src/cpp/src/generation_handle.cpp
@@ -17,7 +17,7 @@ GenerationStatus GenerationHandleImpl::get_status() {
 }
 
 bool GenerationHandleImpl::can_read() {
-    return !is_dropped() &&  m_generation_stream->can_read();
+    return !is_dropped() && m_generation_stream->can_read();
 }
 
 bool GenerationHandleImpl::is_dropped() {
diff --git a/src/cpp/src/generation_stream.hpp b/src/cpp/src/generation_stream.hpp
index 4d41f160e4..518699ba36 100644
--- a/src/cpp/src/generation_stream.hpp
+++ b/src/cpp/src/generation_stream.hpp
@@ -14,8 +14,6 @@ class GenerationStream {
     GenerationStatus m_status = GenerationStatus::RUNNING;
     SynchronizedQueue<GenerationOutputs> m_output_queue;
 
-    std::vector<uint64_t> last_sequence_ids;
-
 public:
     using Ptr = std::shared_ptr<GenerationStream>;
 
@@ -30,10 +28,11 @@ class GenerationStream {
         m_output_queue.push(std::move(outputs));
     }
 
-    // Retrieving vector of pairs <sequence_id, token_id> as we can generate multiple outputs for a single prompt
+    // Retrieving vector of pairs <sequence_id, token_ids> as we can generate multiple outputs for a single prompt
     GenerationOutputs back() {
         return m_output_queue.back();
     }
+
     GenerationOutputs read() {
         return m_output_queue.pull();
     }
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 6fdb8ac1cd..623333e349 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -284,10 +284,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         }
 
         auto batch_size = input_ids.get_shape().at(0);
-        if ((batch_size != 1 || !(config.is_greedy_decoding() || config.is_multinomial())) && streamer_ptr) {
-            OPENVINO_THROW("Currently streaming is possible only with batch size=1 and "
-                            "only for greedy or multinomial decoding");
-        }
+        OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
+            (config.is_greedy_decoding() || config.is_multinomial()),
+            "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
 
         auto num_inputs = m_model_runner.get_compiled_model().inputs().size();
         OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: "
@@ -587,9 +586,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         std::vector<std::string> plain_replies;
         std::vector<float> plain_scores;
         for (GenerationResult& res : generated) {
-            if (GenerationStatus::FINISHED != res.m_status) {
-                OPENVINO_THROW("Got unfinished GenerationStatus");
-            }
+            OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
             std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies));
             std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
         }
@@ -645,9 +642,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         std::vector<std::vector<int64_t>> plain_tokens;
         std::vector<float> plain_scores;
         for (EncodedGenerationResult& res : generated) {
-            if (GenerationStatus::FINISHED != res.m_status) {
-                OPENVINO_THROW("Got unfinished GenerationStatus");
-            }
+            OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
             std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens));
             std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
         }
diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index 62c53cace4..8ef993e09f 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -67,33 +67,49 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
         generations.push_back(std::make_shared<GenerationHandleImpl>(sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters()));
     }
 
+    auto active_sequence_groups{sequence_groups};
+
+    auto stream_generated_tokens = [&streamer_ptr, &generations, &active_sequence_groups]() {
+        GenerationHandle& handle = generations.at(0);
+        if (streamer_ptr && handle->can_read()) {
+            std::unordered_map<uint64_t, GenerationOutput> token = handle->back();
+            for (const auto& gen_token : token.begin()->second.generated_ids) {
+                if (streamer_ptr->put(gen_token)) {
+                    handle->drop();
+                    break;
+                }
+            }
+        }
+
+        // free non running requests
+        auto removed_it = std::remove_if(active_sequence_groups.begin(), active_sequence_groups.end(),
+            [](SequenceGroup::Ptr sg) -> bool {
+                return sg->has_finished() || sg->out_of_memory() || sg->handle_dropped();
+            });
+        active_sequence_groups.erase(removed_it, active_sequence_groups.end());
+    };
+
     ov::Shape prompts_shape = input_ids.get_shape();
     const size_t batch_size = prompts_shape[0];
 
     // Initialize results and performance metrics.
+
     EncodedResults results;
     auto& raw_perf_counters = results.perf_metrics.raw_metrics;
     raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }};
 
     // Initialize inputs
-    if (m_embedding.has_value())
-        m_llm.set_tensor("inputs_embeds", input_ids);
-    else
-        m_llm.set_tensor("input_ids", input_ids);
-
+    m_llm.set_tensor(m_embedding.has_value() ? "inputs_embeds" : "input_ids", input_ids);
     m_llm.set_tensor("attention_mask", attention_mask);
-    
     if (position_ids.has_value())
         m_llm.set_tensor("position_ids", *position_ids);
 
     ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size});
-    auto beam_data = beam_idx.data<int32_t>();
-    if (selected_beam_idx.has_value())
-        beam_data[0] = *selected_beam_idx;
-    else
-        std::fill_n(beam_data, batch_size, 0);
+    std::fill_n(beam_idx.data<int32_t>(), batch_size, selected_beam_idx.has_value() ? *selected_beam_idx : 0);
     m_llm.set_tensor("beam_idx", beam_idx);
 
+    // "Prompt" phase
+
     const auto infer_start = std::chrono::steady_clock::now();
     m_llm.infer();
     const auto infer_end = std::chrono::steady_clock::now();
@@ -109,7 +125,6 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
     for (auto& sequence_group : sequence_groups) {
         sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - sequence_len);
         sequence_group->schedule_tokens(sequence_len);
-
     }
 
     std::map<size_t, size_t> beam_offets;
@@ -117,27 +132,11 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
         beam_offets.insert({sequence_groups.at(i)->get_request_id(), i});
 
     SamplerOutput sampler_output = sampler.sample(sequence_groups, logits);
+    stream_generated_tokens();
 
-    auto active_sequence_groups{sequence_groups};
-    auto get_active_sequence_groups = [](SequenceGroup::Ptr sg) { return sg->has_finished(); };
-
-    active_sequence_groups.erase(std::remove_if(active_sequence_groups.begin(),
-                                                active_sequence_groups.end(),
-                                                get_active_sequence_groups),
-                                 active_sequence_groups.end());
-    
-    auto stream_generated_tokens = [&streamer_ptr, &generations]() {
-        if (streamer_ptr && generations.at(0).get()->can_read()) {
-            std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back();
-            for (const auto& gen_token : token.begin()->second.generated_ids) {
-                if (!streamer_ptr->put(gen_token)) {
-                    break;
-                }
-            }
-        }
-    };
+    // "Generation" phase
 
-    while (active_sequence_groups.size() > 0) {
+    while (!active_sequence_groups.empty()) {
         size_t total_num_tokens = 0;
 
         for (auto& sequence_group : active_sequence_groups) {
@@ -178,20 +177,13 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
         }
 
         for (size_t i = 0; i < sequence_groups.size(); i++) {
-            if (i == 0)
-                beam_offets[sequence_groups.at(i)->get_request_id()] = 0;
-            else {
-                beam_offets[sequence_groups.at(i)->get_request_id()] = sequence_groups.at(i - 1)->num_running_seqs() + beam_offets[i -1];
-            }
+            beam_offets[sequence_groups.at(i)->get_request_id()] = i == 0 ? 0 : (sequence_groups.at(i - 1)->num_running_seqs() + beam_offets[i - 1]);
         }
 
         if (m_embedding.has_value()) {
             const ov::Tensor& embed_prompt_tensor = (*m_embedding).infer(new_input_ids);
-
-            m_llm.get_tensor("inputs_embeds").set_shape(embed_prompt_tensor.get_shape());
             m_llm.set_tensor("inputs_embeds", embed_prompt_tensor);
         } else {
-            m_llm.get_tensor("input_ids").set_shape(new_input_ids.get_shape());
             m_llm.set_tensor("input_ids", new_input_ids);
         }
 
@@ -201,7 +193,6 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
             update_position_ids(m_llm.get_tensor("position_ids"), m_llm.get_tensor("attention_mask"));
         }
 
-        m_llm.get_tensor("beam_idx").set_shape({ total_num_tokens });
         m_llm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {total_num_tokens}, next_beams.data()});
 
         const auto infer_start = std::chrono::steady_clock::now();
@@ -213,36 +204,30 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
         raw_perf_counters.m_new_token_times.emplace_back(infer_end);
         raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
 
-        stream_generated_tokens();
-
         sampler_output = sampler.sample(active_sequence_groups, m_llm.get_tensor("logits"));
-
-        active_sequence_groups.erase(std::remove_if(active_sequence_groups.begin(),
-                                                    active_sequence_groups.end(),
-                                                    get_active_sequence_groups),
-                                    active_sequence_groups.end());
+        stream_generated_tokens();
     }
 
-    // to stream last token
-    stream_generated_tokens();
-    if (streamer_ptr) {
+    if (streamer_ptr) { // push streamer's cache
         streamer_ptr->end();
     }
-    
+
+    // Collect results
+
     size_t next_selected_beam = 0;
     for (size_t i = 0; i < sequence_groups.size(); i++) {
         auto request = sequence_groups[i];
-        auto generation_outputs = generations[i]->read_all();
+        std::vector<GenerationOutput> generation_outputs;
+        auto sampling_params = request->get_sampling_parameters();
+        const auto& sequences = request->get_finished_sequences();
+        size_t num_outputs = std::min(request->get_sampling_parameters().num_return_sequences, sequences.size());
 
-        std::sort(generation_outputs.begin(), generation_outputs.end(), [] (const GenerationOutput& r1, const GenerationOutput& r2) {
-            return r1.score > r2.score;
-        });
+        for (size_t seq_id = 0; seq_id < num_outputs; ++seq_id) {
+            const auto & sequence = sequences[seq_id];
+            const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_probs();
 
-        auto num_outputs = std::min(request->get_sampling_parameters().num_return_sequences, generation_outputs.size());
-        for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) {
-            const auto& generation_output = generation_outputs[generation_output_idx];
-            results.tokens.push_back(std::move(generation_output.generated_ids));
-            results.scores.push_back(generation_output.score);
+            results.tokens.push_back(sequence->get_generated_ids());
+            results.scores.push_back(score);
         }
         // next_selected_beam = sampler.last_selected_beam(request);
     }
diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp
index 6755255fe8..c32e4a1189 100644
--- a/src/cpp/src/sequence_group.hpp
+++ b/src/cpp/src/sequence_group.hpp
@@ -173,8 +173,6 @@ class Sequence {
         return score;
     }
 
-
-
     // Each KV block can be uniquely identified by
     void set_sequence_group_ptr(std::shared_ptr<SequenceGroup> sequence_group) {
         m_sequence_group = sequence_group;
@@ -332,14 +330,16 @@ class SequenceGroup {
     std::vector<Sequence::CPtr> get_finished_sequences() const {
         std::vector<Sequence::CPtr> finished_seqs;
         for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) {
-            if (m_sequences[seq_id]->has_finished() || m_sequences[seq_id]->out_of_memory()) {
+            if (m_sequences[seq_id]->has_finished() || m_sequences[seq_id]->out_of_memory() || handle_dropped()) {
                 finished_seqs.push_back(m_sequences[seq_id]);
             }
         }
 
-        // do we need to sort sequences here or sampler can handle it for us?
-        std::sort(finished_seqs.begin(), finished_seqs.end(), [=] (Sequence::CPtr s1, Sequence::CPtr s2) {
-            return s1->get_beam_search_score(m_sampling_params) > s2->get_beam_search_score(m_sampling_params);
+        std::sort(finished_seqs.begin(), finished_seqs.end(), [=] (Sequence::CPtr s1, Sequence::CPtr s2) -> bool {
+            bool is_beam_search = m_sampling_params.is_beam_search();
+            const float score_1 = is_beam_search ? s1->get_beam_search_score(m_sampling_params) : s1->get_cumulative_log_probs();
+            const float score_2 = is_beam_search ? s2->get_beam_search_score(m_sampling_params) : s2->get_cumulative_log_probs();
+            return score_1 > score_2;
         });
 
         return finished_seqs;
@@ -571,7 +571,7 @@ class SequenceGroup {
         m_generation_stream->set_generation_status(status);
     }
 
-    bool handle_dropped() {
+    bool handle_dropped() const {
         return m_generation_stream->get_status() == GenerationStatus::DROPPED_BY_HANDLE;
     }
 
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 7bf1c1070a..ad4529e22f 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -203,8 +203,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             },
         }, streamer);
 
-        OPENVINO_ASSERT((generation_config.is_greedy_decoding() || generation_config.is_multinomial() || !streamer_ptr),
-                        "Currently streaming is possible only for greedy or multinomial decoding");
+        OPENVINO_ASSERT(streamer_ptr == nullptr || generation_config.num_return_sequences == 1 &&
+            (generation_config.is_greedy_decoding() || generation_config.is_multinomial()),
+            "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
 
         ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, { 1, history_size + inputs_embeds_size }};
         std::fill_n(new_atten_mask.data<int64_t>(), new_atten_mask.get_size(), 1);

From 9e612b8f96d32c1dfaab9d877e477c224da7048e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Fri, 20 Dec 2024 21:52:32 +0400
Subject: [PATCH 037/110] =?UTF-8?q?llm=5Fpipeline=5Fstatic:=20flush=20stre?=
 =?UTF-8?q?amer=20after=20generation=20loop=20is=20complete=E2=80=A6=20(#1?=
 =?UTF-8?q?418)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

… (#1350)

Without these changes, chat_sample with NPU device produces responses
that are clipped by 4 characters:


![image](https://github.com/user-attachments/assets/e841bf36-948b-4899-820f-6b52460076e9)

Flushing the streamer (as

[get_lm_encoded_results()](https://github.com/openvinotoolkit/openvino.genai/blob/71ea7aae7357fa0bb21a5161ef078bef8ce7af7c/src/cpp/src/lm_encoding.cpp#L224)
does in non-static LLM cases) seems to resolve the issue.

Signed-off-by: Ryan Metcalfe <ryan.metcalfe@intel.com>
Co-authored-by: Ryan Metcalfe <107415876+RyanMetcalfeInt8@users.noreply.github.com>
---
 src/cpp/src/llm_pipeline_static.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 090aed9650..42430f70a6 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -1102,6 +1102,11 @@ EncodedResults StaticLLMPipeline::generate(
             m_kvcache_request.get_tensor(output_name).copy_to(kvcache_in_slice);
         }
     }
+
+    if (streamer_ptr) {
+        streamer_ptr->end();
+    }
+
     auto stop_time = std::chrono::steady_clock::now();
     // If is called without tokenization then that stat will not be reported.
     auto& metrics = results.perf_metrics;

From 930ec7eaa00ecf78058656eb08a6d5bedcf38539 Mon Sep 17 00:00:00 2001
From: Anatoliy Talamanov <anatoliy.talamanov@intel.com>
Date: Fri, 20 Dec 2024 20:22:29 +0000
Subject: [PATCH 038/110] StaticLLMPipeline: Cherry-pick num_key_value_heads
 not present in config.json (#1409)

Original: https://github.com/openvinotoolkit/openvino.genai/pull/1355
---
 src/cpp/src/llm_pipeline_static.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 42430f70a6..6f4f124894 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -407,7 +407,8 @@ ov::genai::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path&
     if (config_data.contains("_name_or_path")) {
         desc.name_or_path = config_data["_name_or_path"].get<std::string>();
     }
-    desc.num_key_value_heads = config_data["num_key_value_heads"].get<int>();
+    desc.num_key_value_heads = config_data.contains("num_key_value_heads")
+        ? config_data["num_key_value_heads"].get<int>() : -1;
     return desc;
 }
 

From 7d7134580ba15519578eb9968c5d9d4845192363 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Sat, 21 Dec 2024 00:50:54 +0400
Subject: [PATCH 039/110] Pin optimum-intel commit (#1420)

Optimum-intel's main broke image generations models
See
https://github.com/openvinotoolkit/openvino.genai/actions/runs/12436082329/job/34723277088
---
 .github/workflows/llm_bench-python.yml | 4 ++--
 samples/export-requirements.txt        | 2 +-
 tests/python_tests/requirements.txt    | 2 +-
 tools/llm_bench/requirements.txt       | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index 1999bafcfe..8356805e19 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -151,7 +151,7 @@ jobs:
           rm -rf ./ov_models/internvl2-1B
       - name: WWB Tests
         run: |
-          pip install git+https://github.com/huggingface/optimum-intel.git
+          pip install git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a
           GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
           python -m pytest -v ${{ env.WWB_PATH }}/tests
   stateful:
@@ -190,7 +190,7 @@ jobs:
       - name: WWB Tests
         run: |
           pip install pytest
-          pip install git+https://github.com/huggingface/optimum-intel.git
+          pip install git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a
           GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
           python -m pytest -v ${{ env.WWB_PATH }}/tests
 
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index 797b680b9a..d75fdbacee 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0  # For Qwen
 transformers_stream_generator==0.0.5  # For Qwen
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index 3dac3f8b00..bc5324b211 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a
 numpy<2.0.0; sys_platform == 'darwin'
 onnx==1.17.0
 pytest
diff --git a/tools/llm_bench/requirements.txt b/tools/llm_bench/requirements.txt
index f5f4a3fdeb..acbc668c52 100644
--- a/tools/llm_bench/requirements.txt
+++ b/tools/llm_bench/requirements.txt
@@ -10,7 +10,7 @@ torch
 transformers>=4.40.0
 diffusers>=0.22.0
 #optimum is in dependency list of optimum-intel 
-git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
+git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a#egg=optimum-intel
 git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf
 packaging
 psutil

From 74cdfc900fa60e6f0473895641cda0c9d0416738 Mon Sep 17 00:00:00 2001
From: Irina Efode <irina.efode@intel.com>
Date: Sat, 21 Dec 2024 01:51:52 +0400
Subject: [PATCH 040/110] [ CB ][ SD ] Support streaming with using
 `stop_strings` and `include_stop_strings` (#1382)

*Details:*:
* Implement streaming with using `stop_strings` in CB like pipelines
* Change `stop_string_match` logic to encode them only once per request
* Do not stream tokens which are matched to the part of a `stop_string`
(Tests was a bit changes in this case according HF does not support
exclude `stop_strings`)

*Tickets:*
* CVS-158463

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 src/cpp/src/lm_encoding.cpp            |   2 +-
 src/cpp/src/sampler.cpp                | 166 +++++++++++++------------
 src/cpp/src/sampler.hpp                |   4 +-
 src/cpp/src/sequence_group.hpp         |  61 ++++++---
 src/cpp/src/text_callback_streamer.cpp |   2 +-
 src/cpp/src/text_callback_streamer.hpp |   2 +-
 tests/python_tests/common.py           |  39 +++++-
 tests/python_tests/test_sampling.py    |  10 +-
 8 files changed, 180 insertions(+), 106 deletions(-)

diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index 8ef993e09f..031214468e 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -239,4 +239,4 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
 }
 
 }  // namespace genai
-}  // namespace ov
\ No newline at end of file
+}  // namespace ov
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
index f77463d767..9c18dc7721 100644
--- a/src/cpp/src/sampler.cpp
+++ b/src/cpp/src/sampler.cpp
@@ -85,75 +85,63 @@ std::string clean_wrapped_text(const std::string& wrapped_text, const std::strin
     return clean_text;
 }
 
+std::vector<int64_t> encode_and_process_string(const std::string& stop_string, ov::genai::Tokenizer& tokenizer) {
+    // encode stop_string
+    std::string stop_string_copy = stop_string;
+    ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string_copy, ov::genai::add_special_tokens(false)).input_ids;
+    size_t tensor_size = ov_encoded_stop_string.get_size();
+    std::vector<int64_t> encoded_stop_string(tensor_size);
+    std::copy_n(ov_encoded_stop_string.data<int64_t>(), tensor_size, encoded_stop_string.begin());
+    return encoded_stop_string;
+}
+
+struct MatchStopStringResult {
+    size_t to_remove = 0;
+    // int64_t last_token_id = 0;
+    // bool is_to_update_last_token = false;
+    bool is_matched = false;
+};
+
 // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned.
-int match_stop_string(Tokenizer & tokenizer, const TokenIds & generated_tokens, const std::set<std::string> & stop_strings) {
-    /*
-    For catching stop_string hit we run comparisons character-wise to catch cases where stop string 
-    overlaps with part of another token on both sides or is just a part of a single token. 
-    For every stop_string we iterate over generated tokens starting from the last one and going backwards. 
-    Every token is wrapped with prefix tokens to ensure tokenizer doesn't remove prefix whitespace of the actual token.
-    After that all tokens are decoded and prefix is removed from the decoded text, so we end up with decoded token.
-    Its characters are compared to the stop_string character at a current_position 
-    (position of a character in the stop_string counting from the last one) - at the beginning position is 0.
-    When characters match we increase current_position and check if we have a full match already, if not we continue.
-    If we have already matched some characters (current_position > 0) and next character is not matching 
-    before we reach the full match, then we reset current_position to 0. 
-    */ 
-    std::string prefix = "a";
-    auto prefix_ov = tokenizer.encode(prefix).input_ids;
-    std::vector<int64_t> prefix_tokens(prefix_ov.data<int64_t>(), prefix_ov.data<int64_t>() + prefix_ov.get_size());
-    std::string suffix = "b";
-    auto suffix_ov = tokenizer.encode(suffix).input_ids;
-    std::vector<int64_t> suffix_tokens(suffix_ov.data<int64_t>(), suffix_ov.data<int64_t>() + suffix_ov.get_size());
-
-    // Since whitespace can be added at the beginning of the suffix we also try to capture that behavior here
-    // and get suffix string that will actually be part of the decoded string so we can remove it correctly
-    auto wrapped_suffix_tokens = suffix_tokens;
-    wrapped_suffix_tokens.insert(wrapped_suffix_tokens.begin(), prefix_tokens.begin(), prefix_tokens.end());
-    std::string wrapped_suffix = tokenizer.decode(wrapped_suffix_tokens);
-    auto wrapper_pos = wrapped_suffix.find(prefix);
-    suffix = wrapped_suffix.substr(wrapper_pos + prefix.size());
-    
-    for (auto stop_string: stop_strings) {
-        int current_position = 0;
-        int num_matched_tokens = 0; 
-        // Getting reverse iterator to check tokens starting from the last one generated and going backwards
-        auto generated_tokens_rit = generated_tokens.rbegin();
-        std::vector<int64_t> tokens_buffer;
-        while (generated_tokens_rit != generated_tokens.rend()) {
-            num_matched_tokens++;
-            tokens_buffer.insert(tokens_buffer.begin(), *generated_tokens_rit);
-
-            std::vector<int64_t> wrapped_tokens = wrap_tokens(tokens_buffer, prefix_tokens, suffix_tokens);
-            std::string wrapped_text = tokenizer.decode(wrapped_tokens);
-            std::string clean_text = clean_wrapped_text(wrapped_text, prefix, suffix);
-
-            if (clean_text == "" || (clean_text.size() >= 3 && (clean_text.compare(clean_text.size() - 3, 3, "�") == 0))) { 
-                generated_tokens_rit++;
-                continue;
-            } else {
-                tokens_buffer.clear();
-            }
-            // Checking clean_text characters starting from the last one
-            for (auto clean_text_rit = clean_text.rbegin(); clean_text_rit != clean_text.rend(); clean_text_rit++) {
-                // On character match increment current_position for the next comparisons
-                if (*clean_text_rit == *(stop_string.rbegin() + current_position)) {
-                    current_position++;
-                    // If this is the last character from the stop_string we have a match
-                    if ((stop_string.rbegin() + current_position) == stop_string.rend()) {
-                        return num_matched_tokens;
-                    } 
-                } else if (current_position) {
-                    // Already found matching characters, but the last one didn't match, so we reset current_position
-                    current_position = 0;
-                    // Looking for the match will start over from this character so we decrement iterator
-                    clean_text_rit--;
+MatchStopStringResult match_stop_string(Tokenizer& tokenizer,
+                      const TokenIds& generated_tokens,
+                      const std::pair<size_t, std::set<std::string>>& stop_strings,
+                      bool is_include_to_output) {
+    MatchStopStringResult result;
+    if (generated_tokens.size() >= stop_strings.first) {
+        size_t offset = generated_tokens.size() - stop_strings.first;
+        TokenIds buffer(generated_tokens.begin() + offset, generated_tokens.end());
+        std::string decoded_buffer = tokenizer.decode(buffer);
+        for (const auto& stop_string : stop_strings.second) {
+            auto pos = decoded_buffer.find(stop_string);
+            if (pos != std::string::npos) {
+                result.is_matched = true;
+
+                auto stop_string_len = is_include_to_output ? stop_string.length() : 0;
+                decoded_buffer = decoded_buffer.substr(0, pos + stop_string_len);
+                // to remove word splitting symbols from tail
+                while (decoded_buffer.back() == ' ' || decoded_buffer.back() == '\n') {
+                    decoded_buffer.pop_back();
+                }
+                if (decoded_buffer.empty()) {
+                    result.to_remove = buffer.size();
+                    return result;
                 }
+
+                // find token cnt to be removed from sequence by decoding token by token
+                std::string decoded_partially_string = "";
+                for (size_t i = 0; i < buffer.size(); ++i) {
+                    decoded_partially_string += tokenizer.decode(TokenIds{buffer[i]});
+                    if (decoded_partially_string.find(decoded_buffer) != std::string::npos) {
+                        result.to_remove = buffer.size() - i - 1;
+                        break;
+                    }
+                }
+                return result;
             }
-            generated_tokens_rit++;
         }
     }
-    return 0;
+    return result;
 }
 
 // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned.
@@ -245,7 +233,9 @@ std::map<size_t, int32_t> Sampler::GroupBeamSearcher::get_beam_idxs() {
     return next_beams;
 }
 
-void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output) {
+void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits,
+    SamplerOutput& sampler_output,
+    const std::pair<size_t, std::set<std::string>>& stop_strings) {
     assert(m_parameters.num_beams % m_parameters.num_beam_groups == 0 &&
         "number of beams should be divisible by number of groups");
     size_t group_size = m_parameters.num_beams / m_parameters.num_beam_groups;
@@ -392,19 +382,17 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, Sa
                 // There's probably a better way to do that, than copying whole vector...
                 std::vector<int64_t> token_ids = candidate.m_sequence->get_generated_ids();
                 token_ids.push_back(candidate.m_token_id);
-                int num_last_matched_tokens = match_stop_string(m_tokenizer, token_ids, m_sequence_group->get_sampling_parameters().stop_strings);
-                if (num_last_matched_tokens) {
+                auto match_result = match_stop_string(m_tokenizer, token_ids, stop_strings, m_parameters.include_stop_str_in_output);
+                if (match_result.is_matched) {
                     // If beam_token does not belong to top num_beams tokens, it should not be added
                     if (cand_idx >= group_size)
                         continue;
 
-                    if(!m_parameters.include_stop_str_in_output) {
-                        // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point)
-                        candidate.m_sequence->remove_last_tokens(num_last_matched_tokens - 1);
-                    }
+                    // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point)
+                    candidate.m_sequence->remove_last_tokens(match_result.to_remove);
 
                     // try to finish candidate
-                    try_to_finish_candidate(group, candidate, m_parameters.include_stop_str_in_output);
+                    try_to_finish_candidate(group, candidate);
                     continue;
                 }
             }
@@ -576,10 +564,11 @@ std::vector<int64_t> Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen
         }
 
         if (!sampling_params.stop_strings.empty()) {
-            int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings);
-            if (num_matched_last_tokens) {
-                if (!sampling_params.include_stop_str_in_output)
-                    running_sequence->remove_last_tokens(num_matched_last_tokens);
+            auto& stop_strings = m_stop_strings.at(sequence_group->get_request_id());
+            auto match_result = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), stop_strings, sampling_params.include_stop_str_in_output);
+            if (match_result.is_matched) {
+                running_sequence->remove_last_tokens(match_result.to_remove);
+
                 running_sequence->set_status(SequenceStatus::FINISHED);
                 running_sequence->set_finish_reason(GenerationFinishReason::STOP);
                 dropped_seq_ids.push_back(running_sequence->get_id());
@@ -741,6 +730,19 @@ float get_p_prime(Sequence::Ptr& running_sequence,
     return p_prime;
 }
 
+std::pair<size_t, std::set<std::string>>
+process_stop_strings(const std::set<std::string>& stop_strings, Tokenizer& tokenizer) {
+    std::pair<size_t, std::set<std::string>> result;
+    for (const auto& stop_string : stop_strings) {
+        auto encoded_stop_string = encode_and_process_string(stop_string, tokenizer);
+        if (result.first < encoded_stop_string.size()) {
+            result.first = encoded_stop_string.size();
+        }
+        result.second.insert(stop_string);
+    }
+    return result;
+}
+
 SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
                               ov::Tensor logits,
                               bool is_validation_mode_enabled) {
@@ -764,6 +766,12 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
         if (!m_logit_processors.count(request_id)) {
             m_logit_processors.insert({request_id, LogitProcessor(sampling_params, sequence_group->get_prompt_ids())});
         }
+        if (!m_stop_strings.count(request_id)) {
+            auto processed_stop_string = process_stop_strings(sampling_params.stop_strings, m_tokenizer);
+            m_stop_strings.insert({request_id, processed_stop_string});
+            sequence_group->set_stream_window_size(processed_stop_string.first);
+        }
+        auto& stop_strings = m_stop_strings.at(request_id);
         auto& logit_processor = m_logit_processors.at(request_id);
         const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens;
         ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data);
@@ -873,7 +881,7 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
                 }
 
                 // current algorithm already adds new tokens to running sequences and
-                m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output);
+                m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output, stop_strings);
 
                 // check max length stop criteria
                 std::vector<Sequence::Ptr> running_sequences = sequence_group->get_running_sequences();
@@ -886,8 +894,7 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
             // Notify handle after sampling is done. 
             // For non-streaming this is effective only when the generation is finished.
             OPENVINO_ASSERT(num_tokens_to_process >= max_removed_tokens_per_request);
-            size_t num_output_token_to_push = num_tokens_to_process - max_removed_tokens_per_request + 1;
-            sequence_group->notify_handle(num_output_token_to_push);
+            sequence_group->notify_handle();
         } else {
             // we are in prompt processing phase when prompt is split into chunks and processed step by step
         }
@@ -926,6 +933,7 @@ void Sampler::create_logit_processor(uint64_t request_id, const GenerationConfig
 void Sampler::clear_request_info(uint64_t request_id) { 
     m_beam_search_info.erase(request_id);
     m_logit_processors.erase(request_id);
+    m_stop_strings.erase(request_id);
 }
 
 int64_t Sampler::GroupBeamSearcher::Group::finish(Beam beam, const ov::genai::GenerationConfig& sampling_params) {
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index 08a9863e0a..981e11560f 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -58,6 +58,8 @@ class Sampler {
     size_t seed = rng_engine.default_seed;
     // { request_id, logit_processor }
     std::map<uint64_t, LogitProcessor> m_logit_processors;
+    // { request_id, { max_encoded_len, { stop_strings }}}
+    std::map<int64_t, std::pair<size_t, std::set<std::string>>> m_stop_strings;
 
     Tokenizer m_tokenizer;
 
@@ -120,7 +122,7 @@ class Sampler::GroupBeamSearcher {
 public:
     explicit GroupBeamSearcher(SequenceGroup::Ptr sequence_group, Tokenizer tokenizer);
 
-    void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output);
+    void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output, const std::pair<size_t, std::set<std::string>>& stop_strings);
     void finalize(SamplerOutput& sampler_output);
     std::map<size_t, int32_t> get_beam_idxs();
 };
diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp
index c32e4a1189..220e93c032 100644
--- a/src/cpp/src/sequence_group.hpp
+++ b/src/cpp/src/sequence_group.hpp
@@ -126,23 +126,28 @@ class Sequence {
         }
     }
 
-    GenerationOutput get_last_generation_output(size_t token_cnt = 1) {
+    GenerationOutput get_last_generation_output(size_t token_cnt = 1, size_t num_token_to_ignore = 0) {
         GenerationOutput output;
-        OPENVINO_ASSERT(m_generated_ids.size());
-        output.score = get_cumulative_log_probs();
+        if (token_cnt > 0) {
+            OPENVINO_ASSERT(m_generated_ids.size());
+            output.score = get_cumulative_log_probs();
 
-        auto generated_token_id = get_generated_ids();
-        auto generated_log_probs = get_generated_log_probs();
+            auto generated_token_id = get_generated_ids();
+            auto generated_log_probs = get_generated_log_probs();
 
-        OPENVINO_ASSERT(get_generated_len() >= token_cnt);
-        auto offset = get_generated_len() - token_cnt;
+            OPENVINO_ASSERT(get_generated_len() >= token_cnt);
+            if (get_generated_len() > num_token_to_ignore) {
+                auto offset = get_generated_len() - token_cnt - num_token_to_ignore;
+                auto offset_back = get_generated_len() - num_token_to_ignore;
 
-        std::vector<int64_t> token_id(generated_token_id.begin() + offset, generated_token_id.end());
-        std::vector<float> log_probs(generated_log_probs.begin() + offset, generated_log_probs.end());
+                std::vector<int64_t> token_id(generated_token_id.begin() + offset, generated_token_id.begin() + offset_back);
+                std::vector<float> log_probs(generated_log_probs.begin() + offset, generated_log_probs.begin() + offset_back);
 
-        output.generated_ids = token_id;
-        output.generated_log_probs = log_probs;
-        output.finish_reason = get_finish_reason();
+                output.generated_ids = token_id;
+                output.generated_log_probs = log_probs;
+                output.finish_reason = get_finish_reason();
+            }
+        }
         return output;
     }
 
@@ -219,6 +224,8 @@ class SequenceGroup {
     // flag to enable/disable token generation, e.g. in speculative decoding scenario
     bool m_is_gen_paused = false;
 
+    size_t m_num_streamed_tokens = 0, m_stream_window_size = 0;
+
 
     SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching)
         : m_request_id(request_id),
@@ -454,6 +461,10 @@ class SequenceGroup {
     size_t get_num_tokens_to_validate() {
         return m_num_validation_tokens;
     }
+    
+    void set_stream_window_size(size_t k) {
+        m_stream_window_size = k;
+    }
 
     size_t get_num_available_tokens_for_batching() const {
         OPENVINO_ASSERT(!has_finished(), "Internal error: this function cannot be called on finished sequence group");
@@ -601,7 +612,7 @@ class SequenceGroup {
         for (auto& sequence : m_sequences) {
             // todo: check seq.is_finished() to generate without several </s>
             // or is it ok to use padding?
-            auto output = sequence->get_last_generation_output(token_cnt);
+            auto output = sequence->get_last_generation_output(token_cnt, m_stream_window_size);
             if (m_sampling_params.echo && !m_has_echoed) {
                 output.generated_ids.insert(output.generated_ids.begin(), m_prompt_ids.begin(), m_prompt_ids.end());
                 output.generated_log_probs.insert(output.generated_log_probs.begin(), m_prompt_log_probs.begin(), m_prompt_log_probs.end());
@@ -612,24 +623,36 @@ class SequenceGroup {
         m_generation_stream->push(std::move(outputs));
     }
 
-    void notify_handle(size_t num_output_token_to_push = 0) {
+    void notify_handle() {
         if (out_of_memory()) {
             set_generation_status(GenerationStatus::IGNORED);
         } else if (has_finished()) {
             set_generation_status(GenerationStatus::FINISHED);
         }
         // For beam search streaming is not available, so we notify only upon finishing
-        if(m_sampling_params.is_beam_search()) {
+        if (m_sampling_params.is_beam_search()) {
             if (has_finished() || out_of_memory()) {
                 push_outputs();
             }
         } else if (m_sampling_params.is_greedy_decoding() || m_sampling_params.is_multinomial()) {
             // We can stream only when one sequence is returned and we don't use stop strings that would be excluded from the output
             // (after stop string is detected its tokens are already sent)
-            if (num_total_seqs() == 1 &&
-                (m_sampling_params.stop_strings.empty() || m_sampling_params.include_stop_str_in_output)) {
-                if (num_output_token_to_push)
-                    push_partial_outputs(num_output_token_to_push);
+            if (num_total_seqs() == 1) {
+                const auto generated_len = m_sequences.front()->get_generated_len();
+                if (has_finished()) {
+                    m_stream_window_size = 0;
+                }
+                if (generated_len <= (m_num_streamed_tokens + m_stream_window_size)) {
+                    return;
+                }
+                // speculative decoding draft handling
+                if (generated_len < m_num_streamed_tokens) {
+                    m_num_streamed_tokens = generated_len;
+                }
+                OPENVINO_ASSERT(generated_len >= (m_num_streamed_tokens + m_stream_window_size));
+                size_t num_output_token_to_push = generated_len - m_num_streamed_tokens - m_stream_window_size;
+                push_partial_outputs(num_output_token_to_push);
+                m_num_streamed_tokens += (num_output_token_to_push);
             } else if (has_finished() || out_of_memory()) {
                 push_outputs();
             }
diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp
index 314a7ffa4d..5938b55f6c 100644
--- a/src/cpp/src/text_callback_streamer.cpp
+++ b/src/cpp/src/text_callback_streamer.cpp
@@ -52,4 +52,4 @@ void TextCallbackStreamer::end() {
 ov::genai::StreamerBase::~StreamerBase() = default;
 
 }  // namespace genai
-}  // namespace ov
+}  // namespace ov
\ No newline at end of file
diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp
index a03b0deccb..6f0872ad1b 100644
--- a/src/cpp/src/text_callback_streamer.hpp
+++ b/src/cpp/src/text_callback_streamer.hpp
@@ -25,4 +25,4 @@ class TextCallbackStreamer: public StreamerBase {
 };
 
 }  // namespace genai
-}  // namespace ov
+}  // namespace ov
\ No newline at end of file
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index 50ee452f5c..163a00192e 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -125,6 +125,34 @@ def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig:
     generation_config.include_stop_str_in_output = True
     return generation_config
 
+def get_greedy_stop_strings_exclude_from_output() -> GenerationConfig:
+    generation_config = GenerationConfig()
+    generation_config.max_new_tokens = 30
+    generation_config.stop_strings = { "machines" }
+    generation_config.include_stop_str_in_output = False
+    return generation_config
+
+def get_greedy_stop_strings_include_to_output() -> GenerationConfig:
+    generation_config = GenerationConfig()
+    generation_config.max_new_tokens = 30
+    generation_config.stop_strings = { "machines" }
+    generation_config.include_stop_str_in_output = True
+    return generation_config
+
+def get_greedy_n_stop_strings_exclude_from_output() -> GenerationConfig:
+    generation_config = GenerationConfig()
+    generation_config.max_new_tokens = 30
+    generation_config.stop_strings = { "machines", "manage" }
+    generation_config.include_stop_str_in_output = False
+    return generation_config
+
+def get_greedy_n_stop_strings_include_to_output() -> GenerationConfig:
+    generation_config = GenerationConfig()
+    generation_config.max_new_tokens = 30
+    generation_config.stop_strings = { "machines", "manage" }
+    generation_config.include_stop_str_in_output = True
+    return generation_config
+
 def get_multinomial_temperature() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.do_sample = True
@@ -359,9 +387,14 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge
             # Note, that for fp32 / fp16 models scores are different less than 0.001
             assert abs(hf_score - ov_score) < 0.02
 
-    assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids)
-    for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids):
-        assert hf_text == ov_text
+    if not generation_config.include_stop_str_in_output and len(generation_config.stop_strings) > 0:
+        assert len(hf_result.m_generation_ids) >= len(ov_result.m_generation_ids)
+        for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids):
+            assert ov_text in hf_text
+    else:
+        assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids)
+        for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids):
+            assert hf_text == ov_text
 
 def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path):
     model.save_pretrained(models_path)
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index 9aa6931d85..d5df28bfd6 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -21,6 +21,8 @@
     get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \
     get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \
     get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \
+    get_greedy_stop_strings_exclude_from_output, get_greedy_stop_strings_include_to_output, \
+    get_greedy_n_stop_strings_exclude_from_output, get_greedy_n_stop_strings_include_to_output, \
     generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config, \
     run_continuous_batching
 
@@ -77,7 +79,9 @@ def test_eos_greedy(tmp_path):
 @pytest.mark.precommit
 @pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(),
                                                get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), 
-                                               get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), ],
+                                               get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(),
+                                               get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(),
+                                               get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ],
         ids=[
             "greedy",
             "greedy_with_min_and_max_tokens",
@@ -88,6 +92,10 @@ def test_eos_greedy(tmp_path):
             "beam",
             "beam_search_min_and_max_tokens",
             "beam_search_with_multiple_stop_strings_no_match",
+            "get_greedy_stop_strings_exclude_from_output",
+            "get_greedy_stop_strings_include_to_output",
+            "get_greedy_n_stop_strings_exclude_from_output",
+            "get_greedy_n_stop_strings_include_to_output"
             ])
 def test_individual_generation_configs_deterministic(tmp_path, generation_config):
     prompts = [

From 05d01ac415ce35703a343017f6fe1f49acec9477 Mon Sep 17 00:00:00 2001
From: Sofya Balandina <sofya.balandina@intel.com>
Date: Fri, 20 Dec 2024 23:31:43 +0000
Subject: [PATCH 041/110] Move beam search in case of chat scenario to
 sampler.cpp (#1215)

Task [CVS-156578](https://jira.devtools.intel.com/browse/CVS-156578)

- add missed token, if prev generation was finished because max length
was reached
---
 src/cpp/src/group_beam_searcher.cpp           | 455 ------------------
 src/cpp/src/llm_pipeline.cpp                  | 134 +++---
 src/cpp/src/lm_encoding.cpp                   |  39 +-
 src/cpp/src/lm_encoding.hpp                   |  10 +-
 src/cpp/src/utils.hpp                         |  15 +
 .../src/visual_language/inputs_embedder.cpp   |  65 ++-
 .../src/visual_language/inputs_embedder.hpp   |   6 +-
 src/cpp/src/visual_language/pipeline.cpp      |  13 +-
 8 files changed, 161 insertions(+), 576 deletions(-)
 delete mode 100644 src/cpp/src/group_beam_searcher.cpp

diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp
deleted file mode 100644
index a0262c0dc8..0000000000
--- a/src/cpp/src/group_beam_searcher.cpp
+++ /dev/null
@@ -1,455 +0,0 @@
-// Copyright (C) 2023-2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include <openvino/runtime/tensor.hpp>
-
-#include <cassert>
-
-#include "openvino/genai/llm_pipeline.hpp"
-#include "utils.hpp"
-#include "lm_encoding.hpp"
-
-namespace {
-
-// Modified Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurrence in haystack
-std::vector<int64_t> kmp_search(const std::vector<int64_t>& haystack, const std::vector<int64_t>& needle) {
-    if (needle.empty()) {  // no_repeat_ngram_size == 1, ban every token
-        return {haystack.begin(), haystack.end()};
-    }
-    std::vector<int> partial_match_table(needle.size() + 1, -1);
-    int cnd = 0;
-    for (size_t pos = 1; pos < needle.size(); ++pos) {
-        if (needle.at(pos) == needle.at(size_t(cnd))) {
-            partial_match_table.at(pos) = partial_match_table.at(size_t(cnd));
-        } else {
-            partial_match_table.at(pos) = cnd;
-            while (cnd >= 0 && needle.at(pos) != needle.at(size_t(cnd))) {
-                cnd = partial_match_table.at(size_t(cnd));
-            }
-        }
-        ++cnd;
-    }
-    partial_match_table.back() = cnd;
-    std::vector<int64_t> res;
-    size_t haystack_id = 0;
-    int needle_id = 0;
-    while (haystack_id < haystack.size() - 1) {
-        if (needle.at(size_t(needle_id)) == haystack.at(haystack_id)) {
-            ++haystack_id;
-            ++needle_id;
-            if (needle_id == int(needle.size())) {
-                res.push_back(haystack.at(haystack_id));
-                needle_id = partial_match_table.at(size_t(needle_id));
-            }
-        } else {
-            needle_id = partial_match_table.at(size_t(needle_id));
-            if (needle_id < 0) {
-                ++haystack_id;
-                ++needle_id;
-            }
-        }
-    }
-    return res;
-}
-
-struct Token {
-    float log_prob;
-    int64_t idx;
-};
-
-std::vector<Token> log_softmax(const ov::Tensor& logits, const size_t batch_idx) {
-    if (logits.get_shape().at(0) <= batch_idx) {
-        throw std::runtime_error("logits batch size doesn't match the number of beams");
-    }
-    size_t vocab_size = logits.get_shape().back();
-    size_t batch_offset = batch_idx * logits.get_shape().at(1) * vocab_size;
-    size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size;
-    const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset;
-    float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size);
-    float log_sum = std::log(
-        std::accumulate(beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
-            return accumulated + std::exp(to_add - max_logit);
-        }));
-    std::vector<Token> tokens;
-    tokens.reserve(vocab_size);
-    for (size_t idx = 0; idx < vocab_size; ++idx) {
-        tokens.push_back({beam_logits[idx] - max_logit - log_sum, int64_t(idx)});
-    }
-    return tokens;
-}
-
-struct Beam {
-    float score = -std::numeric_limits<float>::infinity();  // The bigger, the better
-    std::vector<int64_t> tokens;
-    size_t global_beam_idx = 0;
-};
-
-bool greater(const Beam& left, const Beam& right) {
-    return left.score > right.score;
-}
-
-struct Parameters {
-    std::vector<std::vector<int64_t>> prompts;
-    int64_t eos_token_id;
-    size_t n_groups = 3;
-    size_t group_size = 5;
-    float diversity_penalty = 1.0;
-    size_t max_new_tokens = 20;
-    ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::HEURISTIC;
-    float length_penalty = 1.0;
-    size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
-
-    std::function<bool(const Beam&)> early_finish = [](const Beam&) {
-        return false;
-    };
-};
-
-struct Group {
-    std::vector<Beam> ongoing;   // Best beams in front
-    std::vector<Beam> min_heap;  // The worst of the best completed beams is the first
-    bool done = false;
-
-    void finish(Beam&& beam, const Parameters& parameters) {
-        beam.score /= std::pow(float(beam.tokens.size()), parameters.length_penalty);
-
-        min_heap.push_back(std::move(beam));
-        std::push_heap(min_heap.begin(), min_heap.end(), greater);
-        if (min_heap.size() > parameters.group_size) {
-            std::pop_heap(min_heap.begin(), min_heap.end(), greater);
-            min_heap.pop_back();
-        }
-    }
-    void is_done(const Parameters& parameters) {
-        if (min_heap.size() < parameters.group_size) {
-            return;
-        }
-        size_t cur_len = ongoing.front().tokens.size();
-        float best_sum_logprobs = ongoing.front().score;
-        float worst_score = min_heap.front().score;
-        switch (parameters.stop_criteria) {
-        case ov::genai::StopCriteria::EARLY:
-            done = true;
-            return;
-        case ov::genai::StopCriteria::HEURISTIC: {
-            float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
-            done = worst_score >= highest_attainable_score;
-            return;
-        }
-        case ov::genai::StopCriteria::NEVER: {
-            size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
-            float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
-            done = worst_score >= highest_attainable_score;
-            return;
-        }
-        default:
-            throw std::runtime_error("Never reached");
-        }
-    }
-};
-
-// GroupBeamSearcher processes logits prduced by a language model and accumulates beams using group beam search
-// algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values
-// are used for next inference. select_next_tokens() returns empty, if all groups are completed
-struct GroupBeamSearcher {
-    Parameters parameters;
-    std::vector<std::vector<Group>> prompts_groups;
-
-    GroupBeamSearcher(Parameters parameters) : parameters{parameters}, prompts_groups{parameters.prompts.size()} {
-        if (parameters.no_repeat_ngram_size == 0) {
-            throw std::runtime_error("no_repeat_ngram_size must be positive");
-        }
-        for (std::vector<Group>& prompts_groups : prompts_groups) {
-            prompts_groups.resize(parameters.n_groups);
-            for (Group& group : prompts_groups) {
-                group.ongoing.resize(parameters.group_size);
-                group.ongoing.front().score = 0.0;
-            }
-        }
-    }
-
-    std::pair<std::vector<int64_t>, std::vector<int32_t>> select_next_tokens(const ov::Tensor& logits) {
-        std::vector<int64_t> next_tokens;
-        std::vector<int32_t> next_beams;
-
-        const size_t promts_size = parameters.prompts.size();
-
-        next_tokens.reserve(promts_size * parameters.n_groups * parameters.group_size);
-        next_beams.reserve(promts_size * parameters.n_groups * parameters.group_size);
-
-        size_t beam_count = 0;
-        size_t prompt_id = 0;
-        for (std::vector<Group>& groups : prompts_groups) {
-            for (Group& group : groups) {
-                if (group.done) {
-                    continue;
-                }
-                for (Beam& beam : group.ongoing) {
-                    // beam.tokens.empty() holds for the first select_next_tokens() call.
-                    // Every beam is constructed from the single batch at first call
-                    if (beam.tokens.empty()) {
-                        beam.global_beam_idx = prompt_id;
-                    } else {
-                        beam.global_beam_idx = beam_count;
-                        ++beam_count;
-                    }
-                }
-            }
-
-            prompt_id += 1;
-        }
-
-        for (int prompt_id = 0; prompt_id < promts_size; prompt_id++) {
-            const std::vector<int64_t> prompt = parameters.prompts[prompt_id];
-            std::vector<Group>& groups = prompts_groups[prompt_id];
-            auto [prompt_next_tokens, prompt_next_beams] = select_prompt_next_tokens(logits, prompt, groups);
-
-            next_tokens.insert(next_tokens.end(), prompt_next_tokens.begin(), prompt_next_tokens.end());
-            next_beams.insert(next_beams.end(), prompt_next_beams.begin(), prompt_next_beams.end());
-        }
-
-        return {next_tokens, next_beams};
-    }
-
-    std::pair<std::vector<int64_t>, std::vector<int32_t>> select_prompt_next_tokens(const ov::Tensor& logits,
-                                                                                    const std::vector<int64_t>& prompt,
-                                                                                    std::vector<Group>& groups) {
-        std::vector<int64_t> next_tokens;
-        std::vector<int32_t> next_beams;
-        next_tokens.reserve(parameters.n_groups * parameters.group_size);
-        next_beams.reserve(parameters.n_groups * parameters.group_size);
-
-        for (auto group = groups.begin(); group != groups.end(); ++group) {
-            if (group->done) {
-                continue;
-            }
-            std::vector<Beam> candidates;
-            candidates.reserve(parameters.group_size * 2 * parameters.group_size);
-            for (const Beam& beam : group->ongoing) {
-                std::vector<Token> tokens = log_softmax(logits, beam.global_beam_idx);
-                for (auto prev_group = groups.cbegin(); prev_group != group; ++prev_group) {
-                    for (const Beam& prev_beam : prev_group->ongoing) {
-                        if (prev_beam.tokens.size() > beam.tokens.size()) {
-                            tokens.at(size_t(prev_beam.tokens.back())).log_prob -= parameters.diversity_penalty;
-                        }
-                    }
-                }
-                std::vector<int64_t> full_text{prompt};
-                full_text.insert(full_text.end(), beam.tokens.begin(), beam.tokens.end());
-                if (full_text.size() > 1 && full_text.size() >= parameters.no_repeat_ngram_size) {
-                    auto tail_start = full_text.end() - ptrdiff_t(parameters.no_repeat_ngram_size) + 1;
-                    for (int64_t banned_token : kmp_search(full_text, {tail_start, full_text.end()})) {
-                        tokens.at(size_t(banned_token)).log_prob = -std::numeric_limits<float>::infinity();
-                    }
-                }
-                std::sort(tokens.begin(), tokens.end(), [](Token left, Token right) {
-                    return left.log_prob > right.log_prob;  // Most probable tokens in front
-                });
-                size_t add_count = 0;
-                for (Token token : tokens) {
-                    Beam new_candidate = beam;
-                    new_candidate.score += token.log_prob;
-                    new_candidate.tokens.push_back(token.idx);
-                    if (parameters.early_finish(new_candidate)) {
-                        group->finish(std::move(new_candidate), parameters);
-                    } else {
-                        candidates.push_back(std::move(new_candidate));
-                        ++add_count;
-                        if (add_count == 2 * parameters.group_size) {
-                            break;
-                        }
-                    }
-                }
-            }
-            // Sample 2 * group_size highest score tokens to get at least 1 non EOS token per beam
-            if (candidates.size() < 2 * parameters.group_size) {
-                throw std::runtime_error("No beams left to search");
-            }
-            auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size);
-            std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater);
-            group->ongoing.clear();
-            for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) {
-                if (parameters.eos_token_id == candidates.at(cand_idx).tokens.back()) {
-                    // If beam_token does not belong to top num_beams tokens, it should not be added
-                    if (cand_idx >= parameters.group_size) {
-                        continue;
-                    }
-                    group->finish(std::move(candidates.at(cand_idx)), parameters);
-                } else {
-                    group->ongoing.push_back(std::move(candidates.at(cand_idx)));
-                    if (group->ongoing.size() == parameters.group_size) {
-                        break;
-                    }
-                }
-            }
-            group->is_done(parameters);
-            if (!group->done) {
-                for (const Beam& beam : group->ongoing) {
-                    next_tokens.push_back(beam.tokens.back());
-                    next_beams.push_back(int32_t(beam.global_beam_idx));
-                }
-            }
-        }
-        return {next_tokens, next_beams};
-    }
-};
-
-// Consume group_beam_searcher because beams are consumed
-std::vector<std::vector<std::vector<Beam>>> finalize(GroupBeamSearcher&& group_beam_searcher) {
-    std::vector<std::vector<std::vector<Beam>>> finalized;
-    finalized.resize(group_beam_searcher.prompts_groups.size());
-
-    for (size_t prompt_id = 0; prompt_id < group_beam_searcher.prompts_groups.size(); prompt_id++) {
-        std::vector<Group>& groups = group_beam_searcher.prompts_groups.at(prompt_id);
-        finalized.at(prompt_id).reserve(groups.size());
-
-        for (Group& group : groups) {
-            if (!group.done) {
-                for (Beam& beam : group.ongoing) {
-                    group.finish(std::move(beam), group_beam_searcher.parameters);
-                }
-            }
-            finalized.at(prompt_id).push_back(std::move(group.min_heap));
-        }
-    }
-
-    return finalized;
-}
-
-void reset_all_inputs_to_empty_tensors(ov::InferRequest& request) {
-    request.set_tensor("input_ids", ov::Tensor(ov::element::i64, {0, 0}));
-    request.set_tensor("beam_idx", ov::Tensor(ov::element::i32, {0}));
-    if (request.get_compiled_model().inputs().size() == 4)
-        request.set_tensor("position_ids", ov::Tensor(ov::element::i64, {0, 0}));
-}
-}  // namespace
-
-namespace ov {
-namespace genai {
-
-std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm,
-                           ov::Tensor input_ids,
-                           ov::Tensor attention_mask,
-                           GenerationConfig config, 
-                           std::optional<ov::Tensor> position_ids,
-                           std::optional<int32_t> selected_beam_idx) {
-    OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0,
-                    "number of beams should be divisible by number of groups");
-    
-    auto batch_size = input_ids.get_shape().at(0);
-    auto sequence_length = input_ids.get_shape().at(1);
-    
-    // Initialize beam search.
-    const int64_t* prompt_data = input_ids.data<const int64_t>();
-    std::vector<std::vector<int64_t>> prompts;
-    prompts.reserve(batch_size);
-    for (size_t batch = 0; batch < batch_size; batch++) {
-        size_t batch_offset = batch * sequence_length;
-        const int64_t* prompt_start = prompt_data + batch_offset;
-        prompts.push_back(std::vector<int64_t>{prompt_start, prompt_start + sequence_length});
-    }
-
-    lm.set_tensor("input_ids", input_ids);
-    lm.set_tensor("attention_mask", attention_mask);
-    if (position_ids.has_value())
-        lm.set_tensor("position_ids", *position_ids);
-
-    ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size});
-    auto beam_data = beam_idx.data<int32_t>();
-    if (selected_beam_idx.has_value())
-        beam_data[0] = *selected_beam_idx;
-    else
-        std::fill_n(beam_data, batch_size, 0);
-    lm.set_tensor("beam_idx", beam_idx);
-
-    Parameters parameters{std::move(prompts)};
-    parameters.max_new_tokens = config.get_max_new_tokens(sequence_length);
-    parameters.eos_token_id = config.eos_token_id;
-    parameters.n_groups = config.num_beam_groups;
-    parameters.group_size = config.num_beams / config.num_beam_groups;
-    parameters.diversity_penalty = config.diversity_penalty;
-    parameters.length_penalty = config.length_penalty;
-    parameters.stop_criteria = config.stop_criteria;
-    parameters.no_repeat_ngram_size = config.no_repeat_ngram_size;
-    GroupBeamSearcher group_beam_searcher{parameters};
-
-    std::vector<int64_t> next_tokens;
-    std::vector<int32_t> next_beams;
-
-    // Reserve for performance counters.
-    std::vector<std::chrono::steady_clock::time_point> new_token_times;
-    std::vector<size_t> batch_sizes;
-    new_token_times.reserve(parameters.max_new_tokens);
-    batch_sizes.reserve(parameters.max_new_tokens);
-
-    for (size_t length_count = 0; ; ++length_count) {
-        lm.infer();
-
-        std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits"));
-        new_token_times.emplace_back(std::chrono::steady_clock::now());
-        batch_sizes.emplace_back(batch_size);
-
-        if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) {
-            // Break the cycle before masks are extended in update_attention_mask_with_beams.
-            // If generation is continued, attention_mask length should be equal to KV cache size.
-            break;
-        }
-        
-        size_t running_batch_size = next_tokens.size();
-        // Set pointers
-        lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {running_batch_size, 1}, next_tokens.data()});
-        lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {running_batch_size}, next_beams.data()});
-
-        // Set auxiliary inputs
-        update_attention_mask_with_beams(lm.get_tensor("attention_mask"), next_beams);
-        if (position_ids.has_value())
-            update_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask"));
-    }
-
-    reset_all_inputs_to_empty_tensors(lm);
-
-    auto scores_comparator = [](Beam& left, Beam& right) {
-        return (left.score > right.score);
-    };
-
-    auto result = finalize(std::move(group_beam_searcher));
-    ov::genai::EncodedResults results;
-    int32_t res_selected_beam_idx = 0;
-    results.scores.reserve(config.num_return_sequences * result.size());
-    results.tokens.reserve(config.num_return_sequences * result.size());
-    auto& raw_perf_counters = results.perf_metrics.raw_metrics;
-    raw_perf_counters.m_new_token_times = new_token_times;
-    raw_perf_counters.m_batch_sizes = batch_sizes;
-    
-    // align output with HF
-    for (size_t prompt_id = 0; prompt_id < result.size(); prompt_id++) {
-        auto prompt_group = result.at(prompt_id);
-        std::vector<std::reference_wrapper<Beam>> plain_beams;
-        plain_beams.reserve(parameters.n_groups * parameters.group_size);
-        for (std::vector<Beam>& group : prompt_group) {
-            for (Beam& beam : group) {
-                plain_beams.push_back(beam);
-            }
-        }
-        assert(config.num_return_sequences <= plain_beams.size());
-        std::partial_sort(
-            plain_beams.begin(),
-            plain_beams.begin() + config.num_return_sequences,
-            plain_beams.end(),
-            scores_comparator
-        );
-        res_selected_beam_idx = plain_beams.at(0).get().global_beam_idx;
-        for (
-            auto beam = plain_beams.begin();
-            beam != plain_beams.begin() + config.num_return_sequences;
-            ++beam
-        ) {
-            results.scores.push_back(beam->get().score);
-            results.tokens.push_back(std::move(beam->get().tokens));
-        }
-    }
-    
-    return {results, res_selected_beam_idx};
-}
-
-}  // namespace genai
-}  // namespace ov
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 623333e349..33180a9199 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -24,28 +24,23 @@
 namespace ov {
 namespace genai {
 
-std::pair<EncodedResults, int32_t> beam_search(
-    ov::InferRequest& lm,
-    ov::Tensor prompts,
-    ov::Tensor attention_mask,
-    GenerationConfig config,
-    std::optional<ov::Tensor> position_ids,
-    std::optional<int32_t> selected_beam_idx
-);
-
 class StatefulLLMPipeline final : public LLMPipelineImplBase {
 public:
     ov::InferRequest m_model_runner;
     bool is_chat_conversation = false;
     bool m_trust_encoded_history = true;
-    std::optional<int32_t> m_selected_beam = std::nullopt;
     ChatHistory m_history;
     std::string m_templated_chat_history = {};
     std::vector<int64_t> m_tokenized_chat_history;
     ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
-    size_t m_to_remove_from_hist = 0;
     size_t m_kv_cache_seq_length_axis = 2;
     Sampler m_sampler;
+    // Tail of previous output in chat mode is missing in KV cache, let's keep it
+    std::optional<int64_t> m_last_disappeared_token = std::nullopt;
+    // If sequence contains some symbols, which could be ambiguously encoded by tokenizer, we need to trim kv cache
+    // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history 
+    // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history
+    ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
 
     StatefulLLMPipeline(
         const ov::InferRequest& request,
@@ -154,35 +149,44 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                 // some symbols combinations can be encoded by the tokenizer in different ways
                 // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
                 // so let's check it out, find the trusted part and use it in on the next step
-                size_t last_same_hist_token = 0;
+                size_t trusted_history_length = 0;
                 if (!m_tokenized_chat_history.empty()) {
                     std::set<int64_t> stop_tokens = config.stop_token_ids;
-                    last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
-                    m_trust_encoded_history = last_same_hist_token == SIZE_MAX;
+                    trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
+                    m_trust_encoded_history = trusted_history_length == SIZE_MAX;
                 }
 
                 if (m_tokenized_chat_history.empty()) {
                     encoded_input = new_chat_tokens;
-                } else if (last_same_hist_token != SIZE_MAX) {
-                    m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;
+                } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) {
+                    // does_kv_cache_need_to_update will be true here if beam search is activated
+                    // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly
+                    // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager
+                    if (m_kv_history_manager.does_kv_cache_need_to_update()) {
+                        trusted_history_length = m_kv_history_manager.trusted_history_length;
+                    } else {
+                        m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_chat_history.size() - trusted_history_length;
+                        // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it
+                        m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0;
+                    }
 
                     ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
-                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token},
-                                                       new_chat_tokens.input_ids.data<int64_t>() + last_same_hist_token);
+                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length},
+                                                       new_chat_tokens.input_ids.data<int64_t>() + trusted_history_length);
 
                     ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape());
                     std::fill_n(new_attention_mask.data<int64_t>(), new_tensor.get_shape()[1], 1);
 
                     encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
-                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token});
+                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length});
                     new_tensor.copy_to(encoded_input.input_ids);
                     encoded_input.attention_mask = new_attention_mask;
-
-                    m_selected_beam = std::nullopt;
+                    m_last_disappeared_token = std::nullopt;
                 } else {
                     encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
                 }
                 m_templated_chat_history = new_templated_chat_history;
+
                 m_tokenized_chat_history.clear();
                 m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
                 std::copy_n(new_chat_tokens.input_ids.data<int64_t>(), new_chat_tokens.input_ids.get_size(),
@@ -264,6 +268,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
             std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
 
+        // Tail of previous output in chat mode is missing in KV cache.
+        if (m_last_disappeared_token.has_value()) {
+            attention_mask = ov::genai::utils::push_front_inputs(attention_mask, 1);
+            input_ids = ov::genai::utils::push_front_inputs(input_ids, *m_last_disappeared_token);
+        }
+
         GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
 
         // If eos_token_id was not provided, take value from default m_generation_config
@@ -294,7 +304,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                         "(input_ids, attention_mask, position_ids, beam_idx) "
                         "but you have '" + std::to_string(num_inputs) + "' inputs");
 
-        ov::genai::utils::trim_kv_cache(m_model_runner, m_to_remove_from_hist, m_kv_cache_seq_length_axis, m_adapter_controller);
+        ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_history_manager.num_tokens_to_remove_from_kv_cache, m_kv_cache_seq_length_axis, m_adapter_controller);
 
         size_t kv_cache_len = 0;
         ov::Tensor concatenated_attention_mask;
@@ -304,10 +314,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             // Between subsequent runs attention_mask should not be modified.
             auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
             auto prompt_len = attention_mask.get_shape()[1];
-            kv_cache_len = atten_mask_history.get_shape()[1] - m_to_remove_from_hist;
+
+            kv_cache_len = atten_mask_history.get_shape()[1] - m_kv_history_manager.num_tokens_to_remove_from_kv_cache;
 
             ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
-            auto start_atten_hst = atten_mask_history.data<int64_t>() + kv_cache_len * (*m_selected_beam);
+            auto start_atten_hst = atten_mask_history.data<int64_t>();
+
             std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
                     new_atten_mask.data<int64_t>());
             std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
@@ -317,6 +329,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             concatenated_attention_mask = attention_mask;
         }
 
+        size_t prev_attn_mask_size = concatenated_attention_mask.get_shape()[1];
+
         bool position_ids_available = (num_inputs == 4);
         std::optional<ov::Tensor> position_ids = std::nullopt;
         if (position_ids_available) {
@@ -330,51 +344,55 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
         if (is_chat_conversation && !m_trust_encoded_history) {
             m_trust_encoded_history = true;
-            m_to_remove_from_hist = 0;
+            m_kv_history_manager.reset();
         }
 
-        ov::genai::EncodedResults result;
-        if (config.is_beam_search() && is_chat_conversation) {
-            std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_ids, concatenated_attention_mask,
-                                                            config, position_ids, m_selected_beam);
-        } else {
-            std::vector<SequenceGroup::Ptr> requests;
-            size_t block_size = 1;
-            bool enable_prefix_caching = false;
-
-            for (size_t request_id = 0; request_id < batch_size; request_id++) {
-                SequenceGroup::Ptr sequence_group;
-                if (is_chat_conversation) {
-                    ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
-                    sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
-                } else {
-                    size_t seq_len = input_ids.get_shape().at(1);
-                    size_t batch_offset = request_id * seq_len;
-                    const int64_t* prompt_start = input_ids.data<const int64_t>() + batch_offset;
-                    std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
+        std::vector<SequenceGroup::Ptr> requests;
+        size_t block_size = 1;
+        bool enable_prefix_caching = false;
 
-                    sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
-                }
+        for (size_t request_id = 0; request_id < batch_size; request_id++) {
+            SequenceGroup::Ptr sequence_group;
+            if (is_chat_conversation) {
+                ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
+                sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
+            } else {
+                size_t seq_len = input_ids.get_shape().at(1);
+                size_t batch_offset = request_id * seq_len;
+                const int64_t* prompt_start = input_ids.data<const int64_t>() + batch_offset;
+                std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
 
-                sequence_group->set_sequence_group_ptr(sequence_group);
-                requests.push_back(sequence_group);
+                sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
             }
 
-            if (m_sampler.get_seed() != config.rng_seed) {
-                m_sampler.set_seed(config.rng_seed);
-            }
+            sequence_group->set_sequence_group_ptr(sequence_group);
+            requests.push_back(sequence_group);
+        }
 
-            std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr,
-                                                                                  m_sampler, requests, position_ids, std::nullopt, m_selected_beam);
+        if (m_sampler.get_seed() != config.rng_seed) {
+            m_sampler.set_seed(config.rng_seed);
         }
 
+        ov::genai::EncodedResults result;
+        std::tie(result, m_last_disappeared_token) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask,
+                                                                                       streamer_ptr, m_sampler, requests, position_ids, std::nullopt);
+
         if (is_chat_conversation) {
+            // force remove from kv_cache last answer
+            if (config.is_beam_search() && m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
+                m_kv_history_manager.trusted_history_length = m_tokenized_chat_history.size();
+                m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
+            }
+
             std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
         } else {
             reset_kv_state();
-            m_selected_beam = std::nullopt;
+            m_last_disappeared_token = std::nullopt;
         }
 
+        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+            std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+
         auto stop_time = std::chrono::steady_clock::now();
 
         // If is called without tokenization then that stat will not be reported.
@@ -388,10 +406,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
     void start_chat(const std::string& system_message) override {
         is_chat_conversation = true;
-        m_selected_beam = std::nullopt;
         m_trust_encoded_history = true;
-        m_to_remove_from_hist = 0;
+        m_kv_history_manager.reset();
         m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+        m_last_disappeared_token = std::nullopt;
         if (!m_tokenized_chat_history.empty()) {
             reset_kv_state();
             m_history = {};
@@ -409,10 +427,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
     void finish_chat() override {
         is_chat_conversation = false;
-        m_selected_beam = std::nullopt;
         m_trust_encoded_history = true;
-        m_to_remove_from_hist = 0;
+        m_kv_history_manager.reset();
         m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+        m_last_disappeared_token = std::nullopt;
         if (!m_tokenized_chat_history.empty()) {
             reset_kv_state();
             m_history.clear();
diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index 031214468e..17a20dd961 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -9,12 +9,11 @@
 #include <regex>
 #include <vector>
 
+#include "utils.hpp"
+#include "debug_utils.hpp"
 #include "lm_encoding.hpp"
 #include "openvino/genai/perf_metrics.hpp"
 
-#include "debug_utils.hpp"
-
-#include "utils.hpp"
 
 namespace ov {
 namespace genai {
@@ -51,7 +50,7 @@ void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector<i
 }
 
 
-std::pair<EncodedResults, int32_t> get_lm_encoded_results(
+std::pair<EncodedResults, std::optional<int64_t>> get_lm_encoded_results(
     ov::InferRequest& m_llm,
     const ov::Tensor& input_ids,
     const ov::Tensor& attention_mask,
@@ -59,8 +58,7 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
     Sampler& sampler,
     std::vector<SequenceGroup::Ptr> sequence_groups,
     std::optional<ov::Tensor> position_ids,
-    std::optional<EmbeddingsModel> m_embedding,
-    std::optional<int32_t> selected_beam_idx
+    std::optional<EmbeddingsModel> m_embedding
 ) {
     std::vector<GenerationHandle> generations;
     for (SequenceGroup::Ptr sequence_group : sequence_groups) {
@@ -105,7 +103,7 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
         m_llm.set_tensor("position_ids", *position_ids);
 
     ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size});
-    std::fill_n(beam_idx.data<int32_t>(), batch_size, selected_beam_idx.has_value() ? *selected_beam_idx : 0);
+    std::fill_n(beam_idx.data<int32_t>(), batch_size, 0);
     m_llm.set_tensor("beam_idx", beam_idx);
 
     // "Prompt" phase
@@ -171,13 +169,13 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
                 // apply strides to shift to a next sequence
                 input_ids_data += num_scheduled_tokens;
 
-                // for different sequences iteration of beams started from 0, but we collect it to one input_ids#
+                // for different sequences iteration of beams started from 0, but we collect it to one input_ids
                 next_beams.push_back(beam_idxs[sequence->get_id()] + beam_offets.at(sequence_group->get_request_id()));
             }
         }
 
-        for (size_t i = 0; i < sequence_groups.size(); i++) {
-            beam_offets[sequence_groups.at(i)->get_request_id()] = i == 0 ? 0 : (sequence_groups.at(i - 1)->num_running_seqs() + beam_offets[i - 1]);
+        for (size_t i = 0; i < active_sequence_groups.size(); i++) {
+            beam_offets[active_sequence_groups.at(i)->get_request_id()] = i == 0 ? 0 : (active_sequence_groups.at(i - 1)->num_running_seqs() + beam_offets[i - 1]);
         }
 
         if (m_embedding.has_value()) {
@@ -212,15 +210,10 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
         streamer_ptr->end();
     }
 
-    // Collect results
-
-    size_t next_selected_beam = 0;
-    for (size_t i = 0; i < sequence_groups.size(); i++) {
-        auto request = sequence_groups[i];
-        std::vector<GenerationOutput> generation_outputs;
-        auto sampling_params = request->get_sampling_parameters();
-        const auto& sequences = request->get_finished_sequences();
-        size_t num_outputs = std::min(request->get_sampling_parameters().num_return_sequences, sequences.size());
+    for (auto& sequence_group : sequence_groups) {
+        auto sampling_params = sequence_group->get_sampling_parameters();
+        const auto& sequences = sequence_group->get_finished_sequences();
+        size_t num_outputs = std::min(sequence_group->get_sampling_parameters().num_return_sequences, sequences.size());
 
         for (size_t seq_id = 0; seq_id < num_outputs; ++seq_id) {
             const auto & sequence = sequences[seq_id];
@@ -229,13 +222,17 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
             results.tokens.push_back(sequence->get_generated_ids());
             results.scores.push_back(score);
         }
-        // next_selected_beam = sampler.last_selected_beam(request);
     }
 
     for (SequenceGroup::Ptr sequence_group : sequence_groups)
         sampler.clear_request_info(sequence_group->get_request_id());
 
-    return {results, next_selected_beam};
+    // it is not saved in KV cache, we need to add it for some cases
+    std::optional<int64_t> last_token_of_best_sequence = std::nullopt;
+    if (sequence_groups[0]->get_finished_sequences()[0]->get_finish_reason() == GenerationFinishReason::LENGTH || sequence_groups[0]->handle_dropped())
+        last_token_of_best_sequence = results.tokens[0].back();
+
+    return {results, last_token_of_best_sequence};
 }
 
 }  // namespace genai
diff --git a/src/cpp/src/lm_encoding.hpp b/src/cpp/src/lm_encoding.hpp
index fa6692ede0..c31cffb9bc 100644
--- a/src/cpp/src/lm_encoding.hpp
+++ b/src/cpp/src/lm_encoding.hpp
@@ -8,13 +8,9 @@
 namespace ov {
 namespace genai {
 
-std::pair<EncodedResults, int32_t> get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask,
-                                                          const std::shared_ptr<StreamerBase>& streamer_ptr, Sampler& sampler, std::vector<SequenceGroup::Ptr> sequence_groups,
-                                                          std::optional<ov::Tensor> position_ids, std::optional<EmbeddingsModel> m_embedding, std::optional<int32_t> selected_beam_idx);
-
-void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector<int32_t> next_beams);
-
-void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask);
+std::pair<EncodedResults, std::optional<int64_t>> get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask,
+                                                                         const std::shared_ptr<StreamerBase>& streamer_ptr, Sampler& sampler, std::vector<SequenceGroup::Ptr> sequence_groups,
+                                                                         std::optional<ov::Tensor> position_ids, std::optional<EmbeddingsModel> m_embedding);
 
 }
 }
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 96191387cd..57225e60ff 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -28,6 +28,21 @@ enum class GenerationChatInputsType {
     ENCODED_INPUTS = 2, // Type of inputs is EncodedInputs
 };
 
+struct HistoryRemoveManager
+{
+    size_t num_tokens_to_remove_from_kv_cache = 0;
+    size_t trusted_history_length = 0;
+
+    bool does_kv_cache_need_to_update() {
+        return (trusted_history_length > 0 || num_tokens_to_remove_from_kv_cache > 0);
+    }
+
+    void reset() {
+        num_tokens_to_remove_from_kv_cache = 0;
+        trusted_history_length = 0;
+    }
+};
+
 Tensor init_attention_mask(const Tensor& position_ids);
 
 void print_tensor(const ov::Tensor& tensor);
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index 8175d44b16..e53be4e1cd 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -42,11 +42,12 @@ class InputsEmbedder::IInputsEmbedder {
     std::string m_templated_chat_history;
     // Tokenized chat history
     std::vector<int64_t> m_tokenized_history;
-    // The number of elements, which need to remove from the end of KV cache
-    // removed elements will be added to inputs_ids
-    size_t m_to_remove_from_hist = 0;
     // Tail of previous output for LM in chat mode is missing in KV cache.
     std::optional<int64_t> m_last_disappeared_token = std::nullopt;
+    // If sequence contains some symbols, which could be ambiguous encoded by tokenizer, we need to trim kv cache
+    // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history 
+    // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history
+    ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
 
 public:
     virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
@@ -63,22 +64,26 @@ class InputsEmbedder::IInputsEmbedder {
         return m_tokenized_history;
     }
 
-    size_t get_amount_to_remove_from_hist() const {
-        return m_to_remove_from_hist;
+    size_t get_num_tokens_to_remove_from_hist() const {
+        return m_kv_history_manager.num_tokens_to_remove_from_kv_cache;
     }
 
-    void update_tokenized_history(std::vector<int64_t> encoded_result, bool token_will_disappear) {
+    void update_tokenized_history(const std::vector<int64_t>& encoded_result, std::optional<int64_t> last_disappeared_token, bool is_beam_search, size_t last_answer_len) {
+        if (is_beam_search) {
+            m_kv_history_manager.trusted_history_length = m_tokenized_history.size();
+            m_kv_history_manager.num_tokens_to_remove_from_kv_cache = last_answer_len;
+        } else {
+            m_kv_history_manager.reset();
+        }
+
+        m_last_disappeared_token = last_disappeared_token;
+  
         std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history));
-        m_to_remove_from_hist = 0;
-        if (token_will_disappear)
-            m_last_disappeared_token = encoded_result.back();
-        else
-            m_last_disappeared_token = std::nullopt;
     }
 
     virtual void start_chat(const std::string& system_message) {
         m_is_chat_conversation = true;
-        m_to_remove_from_hist = 0;
+        m_kv_history_manager.reset();
         if (!m_tokenized_history.empty()) {
             m_history.clear();
             m_templated_chat_history.clear();
@@ -101,7 +106,7 @@ class InputsEmbedder::IInputsEmbedder {
 
     virtual void finish_chat() {
         m_is_chat_conversation = false;
-        m_to_remove_from_hist = 0;
+        m_kv_history_manager.reset();
 
         m_history.clear();
         m_templated_chat_history.clear();
@@ -171,24 +176,32 @@ class InputsEmbedder::IInputsEmbedder {
             // some symbols combinations can be encoded by the tokenizer in different ways
             // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
             // so let's check it out, find the trusted part and use it in on the next step
-            size_t last_same_hist_token = 0;
+            size_t trusted_history_length = 0;
             if (!m_tokenized_history.empty()) {
                 std::set<int64_t> stop_tokens = {m_tokenizer.get_eos_token_id()};
-                last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens);
+                trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens);
             }
 
             if (m_tokenized_history.empty()) {
                 encoded_input_ids = new_chat_tokens;
-            } else if (last_same_hist_token != SIZE_MAX) {
-                m_to_remove_from_hist = m_tokenized_history.size() - last_same_hist_token;
-                // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it
-                m_to_remove_from_hist -= m_last_disappeared_token.has_value() ? 1 : 0;
+
+            } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) {
+                // does_kv_cache_need_to_update will be true here if beam search is activated
+                // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly
+                // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager
+                if (m_kv_history_manager.does_kv_cache_need_to_update()) {
+                    trusted_history_length = m_kv_history_manager.trusted_history_length;
+                } else {
+                    m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_history.size() - trusted_history_length;
+                    // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it
+                    m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0;
+                }
 
                 ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(),
-                                                   {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token},
-                                                   new_chat_tokens.data<int64_t>() + last_same_hist_token);
+                                                   {1, new_chat_tokens.get_shape().at(1) - trusted_history_length},
+                                                   new_chat_tokens.data<int64_t>() + trusted_history_length);
                 encoded_input_ids = ov::Tensor(new_chat_tokens.get_element_type(),
-                                                    {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token});
+                                                    {1, new_chat_tokens.get_shape().at(1) - trusted_history_length});
                 new_tensor.copy_to(encoded_input_ids);
             } else {
                 encoded_input_ids = utils::subtract_chat_tokenized_inputs(
@@ -1192,12 +1205,12 @@ std::vector<int64_t> InputsEmbedder::get_tokenized_history() const {
     return m_impl->get_tokenized_history();
 }
 
-void InputsEmbedder::update_tokenized_history(std::vector<int64_t> encoded_result, bool token_will_disappear) {
-    return m_impl->update_tokenized_history(encoded_result, token_will_disappear);
+void InputsEmbedder::update_tokenized_history(const std::vector<int64_t>& encoded_result, std::optional<int64_t> last_disappeared_token, bool is_beam_search, size_t last_answer_len) {
+    return m_impl->update_tokenized_history(encoded_result, last_disappeared_token, is_beam_search, last_answer_len);
 }
 
-size_t InputsEmbedder::get_amount_to_remove_from_hist() const {
-    return m_impl->get_amount_to_remove_from_hist();
+size_t InputsEmbedder::get_num_tokens_to_remove_from_hist() const {
+    return m_impl->get_num_tokens_to_remove_from_hist();
 }
 
 Tokenizer InputsEmbedder::get_tokenizer() const {
diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp
index 8c84c6ad43..1d72b742ab 100644
--- a/src/cpp/src/visual_language/inputs_embedder.hpp
+++ b/src/cpp/src/visual_language/inputs_embedder.hpp
@@ -43,11 +43,11 @@ class InputsEmbedder {
     // returns tokenized chat history
     std::vector<int64_t> get_tokenized_history() const;
 
-    // add new results to tokenized chat history
-    void update_tokenized_history(std::vector<int64_t> encoded_result, bool token_will_disappear);
+    // add new results to tokenized history
+    void update_tokenized_history(const std::vector<int64_t>& encoded_result, std::optional<int64_t> last_disappeared_token, bool is_beam_search, size_t last_answer_len);
 
     // returns amount of elements, which need to remove from the end of the KV cache
-    size_t get_amount_to_remove_from_hist() const;
+    size_t get_num_tokens_to_remove_from_hist() const;
 
     // starts chat and adds optional system_message to chat history
     void start_chat(const std::string& system_message);
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index ad4529e22f..d625485205 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -169,7 +169,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics);
         auto end_get_inputs_embeds = std::chrono::steady_clock::now();
 
-        auto to_remove_from_hist = m_inputs_embedder->get_amount_to_remove_from_hist();
+        auto to_remove_from_hist = m_inputs_embedder->get_num_tokens_to_remove_from_hist();
         ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, m_kv_cache_seq_length_axis, std::nullopt);
 
         std::vector<SequenceGroup::Ptr> requests;
@@ -218,9 +218,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         }
 
         ov::genai::EncodedResults encoded_result;
-        int32_t m_selected_beam = 0;
-        std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
-                                                                                      position_ids, m_embedding, std::nullopt);
+        std::optional<int64_t> last_disappeared_token;
+        std::tie(encoded_result, last_disappeared_token) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
+                                                                                             position_ids, m_embedding);
 
         auto decode_start_time = std::chrono::steady_clock::now();
         VLMDecodedResults decoded;
@@ -230,6 +230,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         }
         auto decode_end_time = std::chrono::steady_clock::now();
 
+        m_inputs_embedder->update_tokenized_history(encoded_result.tokens[0], last_disappeared_token, generation_config.is_beam_search(),
+                                                    m_language.get_tensor("attention_mask").get_shape()[1] - (history_size + inputs_embeds_size));
+
         std::string decoded_results = decoded.texts.at(0);
         if (m_is_chat_conversation) {
             m_inputs_embedder->update_chat_history(decoded_results);
@@ -256,8 +259,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         decoded.perf_metrics.m_evaluated = false;
         decoded.perf_metrics.evaluate_statistics(generate_start_time);
 
-        m_inputs_embedder->update_tokenized_history(encoded_result.tokens[0], requests[0]->get_finished_sequences()[0]->get_finish_reason() == GenerationFinishReason::LENGTH);
-
         return decoded;
     }
 

From 2fb56d40cc3623c54538cfb8e72b3fa9b71708f3 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Mon, 23 Dec 2024 10:14:33 +0100
Subject: [PATCH 042/110] Add a command for whisper quantization (#1422)

      Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>
---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c2509528c3..be3de5e8ce 100644
--- a/README.md
+++ b/README.md
@@ -331,10 +331,14 @@ For more examples check out our [Generative AI workflow](https://docs.openvino.a
 
 NOTE: Whisper Pipeline requires preprocessing of audio input (to adjust sampling rate and normalize)
  
- ### Converting and compressing image generation model from Hugging Face library
+ ### Converting and quantizing speech-to-text model from Hugging Face library
 ```sh
 #Download and convert to OpenVINO whisper-base model
 optimum-cli export openvino --trust-remote-code --model openai/whisper-base whisper-base
+
+#Download, convert and apply int8 static quantization to whisper-base model
+optimum-cli export openvino --trust-remote-code --model openai/whisper-base \
+--quant-mode int8 --dataset librispeech --num-samples 32 whisper-base-int8
 ```
 
 ### Run generation using Whisper Pipeline API in Python

From 3ed69638c56cd4164681f33cf0a24296de65e439 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@gmail.com>
Date: Mon, 23 Dec 2024 10:16:31 +0100
Subject: [PATCH 043/110] remove redundant `.tolist()` (#1419)

![image](https://github.com/user-attachments/assets/77013e49-d1bd-4f3a-99aa-1d17e9b8f6b5)


- To fix remove redundant `.tolist()` since it was already done above.

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 tools/llm_bench/task/text_generation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 485de94996..4822b228ca 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -301,7 +301,7 @@ def token_printer():
         - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
     ).tolist()
 
-    tm_list = np.array([first_token_time] + second_tokens_durations) / 1000
+    tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
     inference_durations = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
     log.debug('latency of all tokens:')
     [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
@@ -323,8 +323,8 @@ def token_printer():
     metrics_print.print_metrics(
         num,
         iter_data,
-        tm_list.tolist(),
-        inference_durations.tolist(),
+        tm_list,
+        inference_durations,
         warm_up=(num == 0),
         max_rss_mem=max_rss_mem_consumption,
         max_shared_mem=max_shared_mem_consumption,

From eac4f376e9fc509a68fc3c1f6a3637d9f19b7526 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 23 Dec 2024 14:11:37 +0400
Subject: [PATCH 044/110] [Image generation] Added i64 support for text
 encoders inputs (#1424)

Can be required for new optimum versions
---
 .../image_generation/models/clip_text_model.cpp  | 16 +++++++++++-----
 .../models/clip_text_model_with_projection.cpp   | 16 +++++++++++-----
 .../image_generation/models/t5_encoder_model.cpp | 10 +++++++---
 .../models/unet_inference_dynamic.hpp            | 16 ++++------------
 .../models/unet_inference_static_bs1.hpp         |  3 +--
 5 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp
index efbc840d4f..72fdc63082 100644
--- a/src/cpp/src/image_generation/models/clip_text_model.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model.cpp
@@ -118,13 +118,20 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
     const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
 
     auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) {
-        std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
-
         ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids;
-        std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<std::int32_t>());
+
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int64_t>());
+        }
     };
 
-    ov::Tensor input_ids(ov::element::i32, {text_embedding_batch_size, m_config.max_position_embeddings});
+    ov::Tensor input_ids = m_request.get_input_tensor();
+    input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
+
     size_t current_batch_idx = 0;
 
     if (do_classifier_free_guidance) {
@@ -141,7 +148,6 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
                                                {current_batch_idx + 1, m_config.max_position_embeddings}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
index 982800a701..1160c30b6a 100644
--- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
@@ -109,13 +109,20 @@ ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, con
     const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
 
     auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) {
-        std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
-
         ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids;
-        std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<std::int64_t>());
+
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int64_t>());
+        }
     };
 
-    ov::Tensor input_ids(ov::element::i64, {text_embedding_batch_size, m_config.max_position_embeddings});
+    ov::Tensor input_ids = m_request.get_input_tensor();
+    input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
+
     size_t current_batch_idx = 0;
 
     if (do_classifier_free_guidance) {
@@ -132,7 +139,6 @@ ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, con
                                                {current_batch_idx + 1, m_config.max_position_embeddings}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
diff --git a/src/cpp/src/image_generation/models/t5_encoder_model.cpp b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
index 21df456d46..a83697b2e6 100644
--- a/src/cpp/src/image_generation/models/t5_encoder_model.cpp
+++ b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
@@ -80,8 +80,13 @@ ov::Tensor T5EncoderModel::infer(const std::string& pos_prompt, const std::strin
         ov::Tensor input_ids_token = m_tokenizer.encode(prompt).input_ids;
         size_t min_length = std::min(input_ids.get_size(), input_ids_token.get_size());
 
-        std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
-        std::copy_n(input_ids_token.data<std::int64_t>(), min_length, input_ids.data<std::int32_t>());
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), min_length, input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), min_length, input_ids.data<int64_t>());
+        }
     };
 
     ov::Tensor input_ids = m_request.get_input_tensor();
@@ -114,7 +119,6 @@ ov::Tensor T5EncoderModel::infer(const std::string& pos_prompt, const std::strin
                                                {current_batch_idx + 1, input_ids.get_shape()[1]}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
index 6dc285f76d..914fbcf50b 100644
--- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
@@ -12,11 +12,8 @@ namespace genai {
 
 
 class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::UNetInference {
-
 public:
-
-    virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override
-    {
+    virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override {
         ov::Core core = utils::singleton_core();
 
         ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
@@ -24,20 +21,17 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::
         m_request = compiled_model.create_infer_request();
     }
 
-    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override
-    {
+    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
         m_request.set_tensor(tensor_name, encoder_hidden_states);
     }
 
-    virtual void set_adapters(AdapterController &adapter_controller, const AdapterConfig& adapters) override
-    {
+    virtual void set_adapters(AdapterController &adapter_controller, const AdapterConfig& adapters) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
         adapter_controller.apply(m_request, adapters);
     }
 
-    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override
-    {
+    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first. Cannot infer non-compiled model");
 
         m_request.set_tensor("sample", sample);
@@ -49,10 +43,8 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::
     }
 
 private:
-
     ov::InferRequest m_request;
 };
 
-
 }  // namespace genai
 }  // namespace ov
\ No newline at end of file
diff --git a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
index 7aa6f6301c..f63a8ea237 100644
--- a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
@@ -42,8 +42,7 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel
         ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
         ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition batch-1 model");
 
-        for (int i = 0; i < m_native_batch_size; i++)
-        {
+        for (int i = 0; i < m_native_batch_size; i++) {
             m_requests[i] = compiled_model.create_infer_request();
         }
     }

From 1179cb611fa65910180e260cf31b98742113a896 Mon Sep 17 00:00:00 2001
From: Aamir Nazir <aamir.nazir@intel.com>
Date: Mon, 23 Dec 2024 15:21:46 +0400
Subject: [PATCH 045/110] [LLM Bench] Allow Image Generation Models to Run in
 BF16 (#1368)

This change allows setting image generation models to BF16 using config
passed while running benchmark.

Co-authored-by: Ekaterina Aidova <ekaterina.aidova@intel.com>
Co-authored-by: guozhong wang <guozhong.wang@intel.com>
---
 tools/llm_bench/llm_bench_utils/pt_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/llm_bench/llm_bench_utils/pt_utils.py b/tools/llm_bench/llm_bench_utils/pt_utils.py
index 4c41efad01..dc2c6d05f5 100644
--- a/tools/llm_bench/llm_bench_utils/pt_utils.py
+++ b/tools/llm_bench/llm_bench_utils/pt_utils.py
@@ -131,6 +131,7 @@ def create_image_gen_model(model_path, device, **kwargs):
             model_class = PT_MODEL_CLASSES_MAPPING[model_type]
             start = time.perf_counter()
             pipe = model_class.from_pretrained(model_path)
+            pipe = set_bf16(pipe, device, **kwargs)
             end = time.perf_counter()
             from_pretrain_time = end - start
         else:

From 5d68567484594c915d6047cd9a31a95eab40962d Mon Sep 17 00:00:00 2001
From: Aamir Nazir <aamir.nazir@intel.com>
Date: Mon, 23 Dec 2024 15:22:05 +0400
Subject: [PATCH 046/110] [LLM Bench] Defining Framework in Torch Compile
 Benchmarking (#1354)

It looks like the framework needs to be specified as pytorch for the
models to be compile with torch compile, otherwise it takes the OV
framework route and never hits the torch compile code.
Although the following
[line](https://github.com/openvinotoolkit/openvino.genai/blob/b26fc8b7a484e0f66accba89ea9f972c6d23fda7/tools/llm_bench/llm_bench_utils/pt_utils.py#L157)
tries to use torch compile on the entire image generation pipeline which
causes issues since it is expected to compile the models within the
pipeline.

---------

Co-authored-by: Ekaterina Aidova <ekaterina.aidova@intel.com>
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 tools/llm_bench/llm_bench_utils/model_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index f3e7d21777..78f72147c7 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -137,6 +137,9 @@ def analyze_args(args):
 
     model_framework = args.framework
     model_path = Path(args.model)
+    if model_args["torch_compile_backend"]:
+        log.info("Setting Framework to PyTorch Since torch_compile_backend is provided.")
+        model_framework = 'pt'
     if not model_path.exists():
         raise RuntimeError(f'==Failure FOUND==: Incorrect model path:{model_path}')
     if model_framework in ('ov', 'pt'):

From c09207cd497e250e8b3e7ad442cec3bc4181827e Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@gmail.com>
Date: Mon, 23 Dec 2024 12:33:47 +0100
Subject: [PATCH 047/110] [test] Ensure that the first token generation is not
 included into TPOT (#1414)

CVS-155098
---
 src/cpp/src/perf_metrics.cpp            |  2 +-
 tests/python_tests/conftest.py          |  3 ++-
 tests/python_tests/test_generate_api.py | 10 +++++++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp
index 3bd6252c78..3725dc0cfc 100644
--- a/src/cpp/src/perf_metrics.cpp
+++ b/src/cpp/src/perf_metrics.cpp
@@ -101,7 +101,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
 
         auto ttft = tok_times[0] - start_time_val;
         raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
-        raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
+        raw_metrics.m_times_to_first_token.emplace_back(ttft);
         num_generated_tokens = batch_sizes[0];
         
         // The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens.
diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py
index f98f47ecf3..e159045601 100644
--- a/tests/python_tests/conftest.py
+++ b/tests/python_tests/conftest.py
@@ -3,7 +3,8 @@
 
 def pytest_make_parametrize_id(config, val, argname):
     if argname in ['prompt', 'prompts', 'batched_prompts']:
-        return f'{val}'
+        # Print only first 1000 characters of long prompts.
+        return f'{val[:1000]}'
     elif argname == 'model_descr':
         return f"{val[0]}"
     elif argname == 'chat_config':
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index d15747be63..9bb9eff49c 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -798,6 +798,12 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std)
     assert mean_ttft > 0 and mean_ttft < 1000.0
 
+    raw_metrics = perf_metrics.raw_metrics
+    durations = np.array(raw_metrics.m_durations) / 1000
+    # Check that prefill is not included in durations for TPOT calculation.
+    # For the very long prompt prefill is slow and TTFT is much larger than any other token genration duration.
+    assert np.all(mean_ttft > durations * 2)
+
     mean_tpot, std_tpot = perf_metrics.get_tpot()
     assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std)
     assert mean_tpot > 0 and mean_ttft < 1000.0
@@ -822,7 +828,9 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     assert std_detok_duration == 0
     
     # assert that calculating statistics manually from the raw counters we get the same restults as from PerfMetrics
-    raw_metrics = perf_metrics.raw_metrics
+    assert np.allclose(mean_tpot, np.mean(durations))
+    assert np.allclose(std_tpot, np.std(durations))
+
     raw_dur = np.array(raw_metrics.generate_durations) / 1000
     assert np.allclose(mean_gen_duration, np.mean(raw_dur))
     assert np.allclose(std_gen_duration, np.std(raw_dur))

From 3496d453ee2a2dd1a0340247076ab64787094446 Mon Sep 17 00:00:00 2001
From: Ekaterina Shiryaeva <ekaterina.shiriaeva@intel.com>
Date: Mon, 23 Dec 2024 12:48:23 +0100
Subject: [PATCH 048/110] Add perf metrics support for WhisperStaticPipeline
 (#1337)

---
 src/cpp/src/whisper/whisper.cpp         | 37 ++-----------
 src/cpp/src/whisper/whisper_utils.cpp   | 46 ++++++++++++++++
 src/cpp/src/whisper/whisper_utils.hpp   | 22 ++++++++
 src/cpp/src/whisper_pipeline_static.cpp | 70 +++++++++++++++++++++----
 4 files changed, 131 insertions(+), 44 deletions(-)
 create mode 100644 src/cpp/src/whisper/whisper_utils.cpp
 create mode 100644 src/cpp/src/whisper/whisper_utils.hpp

diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp
index 9d6aa698ce..04993f288c 100644
--- a/src/cpp/src/whisper/whisper.cpp
+++ b/src/cpp/src/whisper/whisper.cpp
@@ -18,6 +18,7 @@
 #include "whisper_config.hpp"
 #include "whisper_feature_extractor.hpp"
 #include "whisper_models.hpp"
+#include "whisper_utils.hpp"
 
 using ov::genai::MicroSeconds;
 
@@ -79,17 +80,6 @@ void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) {
     }
 }
 
-void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
-    const auto infer_start = std::chrono::steady_clock::now();
-    request.infer();
-    const auto infer_end = std::chrono::steady_clock::now();
-    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
-    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
-    raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
-    raw_metrics.m_new_token_times.emplace_back(infer_end);
-    raw_metrics.m_batch_sizes.emplace_back(1);
-}
-
 int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
                std::vector<int64_t>& input_ids,
@@ -102,7 +92,7 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
     ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data());
     decoder.set_tensor("input_ids", input_ids_tensor);
 
-    infer_with_perf_metrics(decoder, raw_metrics);
+    ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);
 
     auto output_tensor = decoder.get_tensor("logits");
 
@@ -138,7 +128,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
     cache_position_tensor.set_shape({1});
     cache_position_tensor.data<int64_t>()[0] = cache_position;
 
-    infer_with_perf_metrics(decoder_with_past, raw_metrics);
+    ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);
 
     auto output_tensor = decoder_with_past.get_tensor("logits");
 
@@ -265,25 +255,6 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
     return {false, output_tokens};
 }
 
-template <typename T>
-void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
-    OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
-    std::vector<T> result{value.begin(), value.begin() + offset};
-    for (auto [start, end] : ranges) {
-        result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
-    }
-
-    value = result;
-}
-
-void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
-                                size_t offset,
-                                std::vector<std::pair<size_t, size_t>>& ranges) {
-    filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
-    filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
-    filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
-}
-
 }  // namespace
 
 namespace ov {
@@ -362,7 +333,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig&
                                                                   feature_extractor.nb_max_frames,
                                                                   time_precision);
 
-            filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
+            ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
 
             segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());
 
diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp
new file mode 100644
index 0000000000..6e56a1439d
--- /dev/null
+++ b/src/cpp/src/whisper/whisper_utils.cpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "whisper_utils.hpp"
+
+namespace {
+
+template <typename T>
+void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
+    OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
+    std::vector<T> result{value.begin(), value.begin() + offset};
+    for (auto [start, end] : ranges) {
+        result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
+    }
+
+    value = result;
+}
+
+}  // namespace
+
+namespace ov {
+namespace genai {
+namespace utils {
+
+void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
+    const auto infer_start = std::chrono::steady_clock::now();
+    request.infer();
+    const auto infer_end = std::chrono::steady_clock::now();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
+    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
+    raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
+    raw_metrics.m_new_token_times.emplace_back(infer_end);
+    raw_metrics.m_batch_sizes.emplace_back(1);
+}
+
+void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
+                                size_t offset,
+                                std::vector<std::pair<size_t, size_t>>& ranges) {
+    filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
+    filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
+    filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
+}
+
+}  // namespace utils
+}  // namespace genai
+}  // namespace ov
diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp
new file mode 100644
index 0000000000..234feed6a8
--- /dev/null
+++ b/src/cpp/src/whisper/whisper_utils.hpp
@@ -0,0 +1,22 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <openvino/openvino.hpp>
+
+#include "openvino/genai/perf_metrics.hpp"
+
+namespace ov {
+namespace genai {
+namespace utils {
+
+void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics);
+
+void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
+                                size_t offset,
+                                std::vector<std::pair<size_t, size_t>>& ranges);
+
+}  // namespace utils
+}  // namespace genai
+}  // namespace ov
diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp
index dc26789846..cc61eb0659 100644
--- a/src/cpp/src/whisper_pipeline_static.cpp
+++ b/src/cpp/src/whisper_pipeline_static.cpp
@@ -14,6 +14,7 @@
 #include "whisper/timestamps.hpp"
 #include "whisper/whisper.hpp"
 #include "whisper/whisper_config.hpp"
+#include "whisper/whisper_utils.hpp"
 
 #include "openvino/core/layout.hpp"
 #include "openvino/core/preprocess/pre_post_process.hpp"
@@ -26,6 +27,8 @@
 #include "openvino/op/convert.hpp"
 #include "openvino/op/parameter.hpp"
 
+using ov::genai::MicroSeconds;
+
 namespace {
 
 template <typename T>
@@ -44,7 +47,8 @@ void copy_to_tensor(const std::vector<T>& src_vec, ov::Tensor dst_tensor) {
 ov::Tensor encode(ov::InferRequest& request,
                   std::vector<float>& mel_data,
                   const size_t feature_size,
-                  const size_t nb_max_frames) {
+                  const size_t nb_max_frames,
+                  ov::genai::RawPerfMetrics& raw_metrics) {
     OPENVINO_ASSERT(mel_data.size() == feature_size * nb_max_frames,
                     "Mel spectrogram required size: ",
                     feature_size,
@@ -54,7 +58,12 @@ ov::Tensor encode(ov::InferRequest& request,
                     mel_data.size(),
                     ".");
     copy_to_tensor(mel_data, request.get_tensor("input_features"));
+
+    const auto infer_start = std::chrono::steady_clock::now();
     request.infer();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
+
     return request.get_tensor("last_hidden_state");
 }
 
@@ -140,13 +149,14 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
                const std::vector<int32_t>& init_ids,
                const ov::genai::WhisperGenerationConfig& config,
+               ov::genai::RawPerfMetrics& raw_metrics,
                const bool apply_logit_processors = true,
                const bool return_timestamps = false) {
     // NB: Fill decoder inputs
     encoder_hidden_state.copy_to(decoder.get_tensor("encoder_hidden_states"));
     set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);
 
-    decoder.infer();
+    ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);
 
     auto output_tensor = decoder.get_tensor("logits");
 
@@ -167,6 +177,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
                          const int64_t input_id,
                          const int64_t position_id,
                          const ov::genai::WhisperGenerationConfig& config,
+                         ov::genai::RawPerfMetrics& raw_metrics,
                          const bool return_timestamps,
                          const std::vector<int64_t>& generated_tokens) {
     // FIXME: Avoid this cast to i32. Why it's not i64 precision in model?
@@ -175,7 +186,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
     // FIXME: Is "attention_mask" supposed to be f16?
     decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 0u;
 
-    decoder_with_past.infer();
+    ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);
 
     auto output_tensor = decoder_with_past.get_tensor("logits");
     ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens);
@@ -217,13 +228,17 @@ void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferReq
 
 int64_t detect_language(ov::Tensor& encoder_hidden_state,
                         ov::InferRequest decoder,
-                        const ov::genai::WhisperGenerationConfig& config) {
+                        const ov::genai::WhisperGenerationConfig& config,
+                        ov::genai::RawPerfMetrics& raw_metrics) {
     decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});
 
     std::vector<int32_t> init_ids{static_cast<int32_t>(config.decoder_start_token_id)};
     set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);
 
+    const auto infer_start = std::chrono::steady_clock::now();
     decoder.infer();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
 
     auto output_tensor = decoder.get_tensor("logits");
 
@@ -246,7 +261,8 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state,
 std::vector<int32_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
                                       ov::InferRequest& decoder,
                                       const ov::genai::WhisperGenerationConfig& config,
-                                      const bool return_timestamps) {
+                                      const bool return_timestamps,
+                                      ov::genai::RawPerfMetrics& raw_metrics) {
     if (!config.is_multilingual) {
         if (return_timestamps) {
             return std::vector<int32_t>{static_cast<int32_t>(config.decoder_start_token_id)};
@@ -263,7 +279,7 @@ std::vector<int32_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
             language_token_id = static_cast<int32_t>(config.lang_to_id.at(language));
         }
     } else {
-        language_token_id = detect_language(encoder_hidden_state, decoder, config);
+        language_token_id = detect_language(encoder_hidden_state, decoder, config, raw_metrics);
     }
 
     int32_t task_token_id = static_cast<int32_t>(config.transcribe_token_id);
@@ -289,8 +305,9 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
                                                   std::vector<int32_t> init_ids,
                                                   const size_t max_new_tokens,
                                                   const bool return_timestamps,
+                                                  ov::genai::RawPerfMetrics& raw_metrics,
                                                   const std::shared_ptr<ov::genai::ChunkStreamerBase> streamer) {
-    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps);
+    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, raw_metrics, true, return_timestamps);
     std::vector<int64_t> output_tokens{output_token};
 
     if (!return_timestamps && streamer && streamer->put(output_token)) {
@@ -308,6 +325,7 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
                                              output_tokens.back(),
                                              i + init_ids.size(),
                                              config,
+                                             raw_metrics,
                                              return_timestamps,
                                              output_tokens);
         update_past_key_value(models.decoder_with_past, models.decoder_with_past, i + init_ids.size());
@@ -576,6 +594,7 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
     const RawSpeechInput& raw_speech_input,
     OptionalWhisperGenerationConfig generation_config,
     ChunkStreamerVariant streamer) {
+    auto start_time = std::chrono::steady_clock::now();
     WhisperGenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
     config.validate();
 
@@ -591,14 +610,25 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         streamer_ptr = std::make_shared<ChunkTextCallbackStreamer>(m_tokenizer, *callback);
     }
 
+    size_t max_new_tokens = config.get_max_new_tokens();
+
+    WhisperPerfMetrics perf_metrics;
+    perf_metrics.num_input_tokens = 0;
+    RawPerfMetrics& raw_metrics = perf_metrics.raw_metrics;
+    raw_metrics.m_new_token_times.reserve(max_new_tokens);
+    raw_metrics.m_batch_sizes.reserve(max_new_tokens);
+    raw_metrics.m_token_infer_durations.reserve(max_new_tokens);
+    raw_metrics.m_inference_durations = {{MicroSeconds(0.0f)}};
+
+    const auto extract_start = std::chrono::steady_clock::now();
     auto input_features = m_feature_extractor.extract(raw_speech_input);
+    const auto extract_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - extract_start);
+    perf_metrics.whisper_raw_metrics.features_extraction_durations.emplace_back(extract_ms);
 
     const bool is_shortform = input_features.n_frames <= m_feature_extractor.nb_max_frames;
     // long-form audio processing requires timestamps to be enabled
     const bool return_timestamps = config.return_timestamps || !is_shortform;
 
-    size_t max_new_tokens = config.get_max_new_tokens();
-
     std::vector<int32_t> init_ids;
     std::vector<int64_t> output_tokens;
     std::vector<Segment> segments;
@@ -619,11 +649,12 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         ov::Tensor hidden_state_tensor = encode(m_models.encoder,
                                                 input_features_chunk,
                                                 m_feature_extractor.feature_size,
-                                                m_feature_extractor.nb_max_frames);
+                                                m_feature_extractor.nb_max_frames,
+                                                raw_metrics);
 
         // prepare init_ids just once for whole input
         if (init_ids.empty()) {
-            init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps);
+            init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps, raw_metrics);
         }
 
         auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor,
@@ -632,6 +663,7 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
                                                             init_ids,
                                                             max_new_tokens - output_tokens.size(),
                                                             return_timestamps,
+                                                            raw_metrics,
                                                             streamer_ptr);
 
         if (return_timestamps) {
@@ -640,6 +672,8 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
                                                                   m_feature_extractor.nb_max_frames,
                                                                   time_precision);
 
+            ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
+
             segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());
 
             output_tokens.insert(output_tokens.end(),
@@ -669,7 +703,11 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         streamer_ptr->end();
     }
 
+    auto decode_start_time = std::chrono::steady_clock::now();
     WhisperDecodedResults result{std::vector{m_tokenizer.decode(output_tokens)}, std::vector{1.f}};
+    result.perf_metrics = perf_metrics;
+    result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
+            PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
 
     // if return_timestamps wasn't enabled by user
     if (!config.return_timestamps) {
@@ -681,13 +719,23 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         chunks.reserve(segments.size());
 
         for (auto& segment : segments) {
+            decode_start_time = std::chrono::steady_clock::now();
             chunks.push_back(
                 WhisperDecodedResultChunk{segment.m_start, segment.m_end, m_tokenizer.decode(segment.m_tokens)});
+            result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
+                    PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
         }
 
         result.chunks = chunks;
     }
 
+    auto& metrics = result.perf_metrics;
+    metrics.load_time = this->m_load_time_ms;
+    auto stop_time = std::chrono::steady_clock::now();
+    metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+    metrics.raw_metrics.tokenization_durations.emplace_back(MicroSeconds(0.0f));
+    metrics.evaluate_statistics(start_time);
+
     return result;
 }
 

From d5921487836103b7e9f32c8577021ac2a4d9d912 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 24 Dec 2024 13:30:44 +0400
Subject: [PATCH 049/110] [Inpainting] Update stable_diffusion_xl_pipeline.hpp
 (#1427)

---
 src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
index 15f15219c2..c3ebcdf1f4 100644
--- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
@@ -320,7 +320,7 @@ class StableDiffusionXLPipeline : public StableDiffusionPipeline {
             } else if (m_pipeline_type == PipelineType::INPAINTING) {
                 m_generation_config.guidance_scale = 7.5f;
                 m_generation_config.num_inference_steps = 50;
-                m_generation_config.strength == 0.9999f;
+                m_generation_config.strength = 0.9999f;
             }
         } else {
             OPENVINO_THROW("Unsupported class_name '", class_name, "'. Please, contact OpenVINO GenAI developers");

From db28c8c5775fe61a03519355f433f9885460e9e3 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 24 Dec 2024 13:31:24 +0400
Subject: [PATCH 050/110] Fix compile warnings in tokenizer.cpp (#1428)

```
  /Users/runner/work/openvino.genai/openvino.genai/src/cpp/src/tokenizer.cpp:238:40: warning: expression result unused [-Wunused-value]
              encode("non empty string").input_ids;
```
---
 src/cpp/src/tokenizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index ed6fbc0a06..5364acfd91 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -235,7 +235,7 @@ class Tokenizer::TokenizerImpl {
         // Initialize tokenizer's cache to save time later.
         if (m_tokenizer) {
             // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
-            encode("non empty string").input_ids;
+            encode("non empty string");
         }
         if (m_detokenizer) {
             decode({1, 33, 199, 42, 42});

From 0da48cd1fdb3dd9620b0a0f4d494d64d78d3a491 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 24 Dec 2024 21:34:34 +0400
Subject: [PATCH 051/110] Revert "Pin optimum-intel commit" (#1426)

Reverts openvinotoolkit/openvino.genai#1420

Fixed here https://github.com/huggingface/optimum-intel/pull/1091
---
 .github/workflows/llm_bench-python.yml | 4 ++--
 samples/export-requirements.txt        | 2 +-
 tests/python_tests/requirements.txt    | 5 +++--
 tools/llm_bench/requirements.txt       | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index 8356805e19..1999bafcfe 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -151,7 +151,7 @@ jobs:
           rm -rf ./ov_models/internvl2-1B
       - name: WWB Tests
         run: |
-          pip install git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a
+          pip install git+https://github.com/huggingface/optimum-intel.git
           GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
           python -m pytest -v ${{ env.WWB_PATH }}/tests
   stateful:
@@ -190,7 +190,7 @@ jobs:
       - name: WWB Tests
         run: |
           pip install pytest
-          pip install git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a
+          pip install git+https://github.com/huggingface/optimum-intel.git
           GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
           python -m pytest -v ${{ env.WWB_PATH }}/tests
 
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index d75fdbacee..797b680b9a 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0  # For Qwen
 transformers_stream_generator==0.0.5  # For Qwen
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index bc5324b211..00bffb6646 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,6 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a
-numpy<2.0.0; sys_platform == 'darwin'
+diffusers==0.31.0
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest
 
diff --git a/tools/llm_bench/requirements.txt b/tools/llm_bench/requirements.txt
index acbc668c52..f5f4a3fdeb 100644
--- a/tools/llm_bench/requirements.txt
+++ b/tools/llm_bench/requirements.txt
@@ -10,7 +10,7 @@ torch
 transformers>=4.40.0
 diffusers>=0.22.0
 #optimum is in dependency list of optimum-intel 
-git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a#egg=optimum-intel
+git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
 git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf
 packaging
 psutil

From 021d88059d1367ef5ccc7938183de3dcdaafe82f Mon Sep 17 00:00:00 2001
From: Anastasiia Pnevskaia <anastasiia.pnevskaia@intel.com>
Date: Tue, 24 Dec 2024 19:43:58 +0100
Subject: [PATCH 052/110] Dynamic KV cache allocation (#1364)

Dynamic KV cache allocation
Ticket: CVS-158409

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .../prompt_lookup_decoding_lm.cpp             |   6 +-
 .../speculative_decoding_lm.cpp               |   6 +-
 .../prompt_lookup_decoding_lm.py              |   5 +-
 .../speculative_decoding_lm.py                |   6 +-
 src/cpp/src/block_manager.hpp                 |  51 ++++++-
 src/cpp/src/cache_manager.hpp                 | 124 +++++++++++++++---
 src/cpp/src/continuous_batching_impl.cpp      |  10 +-
 src/cpp/src/device_config.hpp                 |  36 ++---
 src/cpp/src/llm_pipeline.cpp                  |  13 +-
 src/cpp/src/scheduler.hpp                     | 120 ++++++++++++++++-
 .../speculative_decoding_impl.cpp             |   3 +-
 .../utils/paged_attention_transformations.cpp |  10 +-
 tests/cpp/cache_manager.cpp                   | 114 ++++++++++++++--
 tests/cpp/scheduler.cpp                       |  59 ++++++---
 tests/python_tests/common.py                  |   1 -
 tests/python_tests/ov_genai_test_utils.py     |   1 -
 .../python_tests/test_cache_optimizations.py  |  27 +++-
 17 files changed, 480 insertions(+), 112 deletions(-)

diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
index e692110027..8b48dbade0 100644
--- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
+++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
@@ -22,14 +22,10 @@ int main(int argc, char* argv[]) try {
     
     std::string device = "CPU";
 
-    ov::genai::SchedulerConfig scheduler_config;
-    scheduler_config.cache_size = 5;
-
     ov::genai::LLMPipeline pipe(
         model_path,
         device,
-        ov::genai::prompt_lookup(true),
-        ov::genai::scheduler_config(scheduler_config));
+        ov::genai::prompt_lookup(true));
 
     auto streamer = [](std::string subword) {
         std::cout << subword << std::flush;
diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
index 487296566b..e10228863f 100644
--- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
+++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
@@ -26,14 +26,10 @@ int main(int argc, char* argv[]) try {
     // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft.
     std::string main_device = "CPU", draft_device = "CPU";
 
-    ov::genai::SchedulerConfig scheduler_config;
-    scheduler_config.cache_size = 5;
-
     ov::genai::LLMPipeline pipe(
         main_model_path,
         main_device,
-        ov::genai::draft_model(draft_model_path, draft_device),
-        ov::genai::scheduler_config(scheduler_config));
+        ov::genai::draft_model(draft_model_path, draft_device));
 
     auto streamer = [](std::string subword) {
         std::cout << subword << std::flush;
diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
index 557897b6b1..726391ba9b 100755
--- a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
+++ b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
@@ -18,11 +18,8 @@ def main():
     args = parser.parse_args()
 
     device = 'CPU'
-    scheduler_config = openvino_genai.SchedulerConfig()
-    # cache params
-    scheduler_config.cache_size = 2
 
-    pipe = openvino_genai.LLMPipeline(args.model_dir, device, scheduler_config=scheduler_config, prompt_lookup=True)
+    pipe = openvino_genai.LLMPipeline(args.model_dir, device, prompt_lookup=True)
     
     config = openvino_genai.GenerationConfig()
     config.max_new_tokens = 100
diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
index 612e59474e..217b8a2730 100755
--- a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
+++ b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
@@ -25,13 +25,9 @@ def main():
     main_device = 'CPU'  # GPU can be used as well
     draft_device = 'CPU'
 
-    scheduler_config = openvino_genai.SchedulerConfig()
-    # cache params
-    scheduler_config.cache_size = 2
-
     draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device)
 
-    pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, scheduler_config=scheduler_config, draft_model=draft_model)
+    pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model)
     
     config = openvino_genai.GenerationConfig()
     config.max_new_tokens = 100
diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp
index dc82897dc8..4ca263777b 100644
--- a/src/cpp/src/block_manager.hpp
+++ b/src/cpp/src/block_manager.hpp
@@ -205,14 +205,20 @@ class BlockAllocator {
      * Blocks returned will be vectors with this size, each vector entry to be associated with a separate layer's KV cache.
      */
     BlockAllocator(size_t num_blocks, bool enable_prefix_caching, size_t num_layers = 1) :
-            m_free_blocks_num(num_layers, num_blocks), m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
+            m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
         OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero");
         m_free_blocks.resize(m_num_layers);
-        for (auto& per_layer_block_list : m_free_blocks) {
-            for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) {
-                per_layer_block_list.push_back(std::make_shared<KVCacheBlock>(block_id));
+        if (num_blocks > 0) {
+            m_free_blocks_num = std::vector<size_t>(num_layers, num_blocks);
+            for (auto& per_layer_block_list : m_free_blocks) {
+                for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) {
+                    per_layer_block_list.push_back(std::make_shared<KVCacheBlock>(block_id));
+                }
             }
         }
+        else {
+            m_free_blocks_num = std::vector<size_t>(m_num_layers, 0);
+        }
     }
 
     ~BlockAllocator() {
@@ -220,6 +226,21 @@ class BlockAllocator {
         // OPENVINO_ASSERT(m_total_num_blocks == m_free_blocks.size());
     }
 
+    void increase_kv_blocks_number(size_t new_kv_blocks_count) {
+        OPENVINO_ASSERT(new_kv_blocks_count > m_total_num_blocks, "New blocks number should be more than previous blocks number.");
+        size_t added_blocks = new_kv_blocks_count - m_total_num_blocks;
+        for (auto idx = 0; idx < m_free_blocks_num.size(); idx++) {
+            m_free_blocks_num[idx] += added_blocks;
+        }
+        for (auto& per_layer_block_list : m_free_blocks) {
+            for (int block_id = m_total_num_blocks; block_id < new_kv_blocks_count; ++block_id) {
+                per_layer_block_list.push_back(std::make_shared<KVCacheBlock>(block_id));
+            }
+        }
+        m_total_num_blocks = new_kv_blocks_count;
+    }
+
+
     /**
      * Returns the number of free blocks for a given layer.
      * @param layer_idx Index of the layer.
@@ -459,6 +480,13 @@ class BlockAllocator {
         for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) sum += num_free_blocks(layer_idx);
         return static_cast<float>(m_num_layers * m_total_num_blocks - sum) / (m_num_layers * m_total_num_blocks) * 100;
     }
+
+    /**
+     * @return The total number of KV blocks .
+     */
+    size_t get_total_number_of_kv_blocks() const {
+        return m_total_num_blocks;
+    }
 };
 
 /**
@@ -713,6 +741,21 @@ class BlockManager {
         return m_allocator.get_used_percentage();
     }
 
+    /**
+     * Increases the number of KV blocks.
+     * @param num_blocks The new number of KV-blocks.
+     */
+    void increase_kv_blocks_number(size_t num_blocks) {
+        m_allocator.increase_kv_blocks_number(num_blocks);
+    }
+
+    /**
+     * @return The total number of KV blocks .
+     */
+    size_t get_total_number_of_kv_blocks() const {
+        return m_allocator.get_total_number_of_kv_blocks();
+    }
+
     /**
      * @brief Forks a sequence, establishing a new sequence from an existing one, reusing
      * currently allocated blocks of the existing sequence.
diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp
index a7444555ab..0c04823f4f 100644
--- a/src/cpp/src/cache_manager.hpp
+++ b/src/cpp/src/cache_manager.hpp
@@ -15,38 +15,118 @@ class CacheManager {
     DeviceConfig m_device_config;
     std::vector<ov::Tensor> m_key_cache;
     std::vector<ov::Tensor> m_value_cache;
+    size_t m_num_allocated_kv_blocks = 0;
     ov::Core m_core;
+    ov::InferRequest m_request;
+
+    ov::Shape set_first_dim_and_make_static(const ov::PartialShape& shape, size_t dim) {
+        ov::PartialShape res_shape = shape;
+        res_shape[0] = dim;
+        OPENVINO_ASSERT(res_shape.is_static());
+        return res_shape.to_shape();
+    }
+
+    void update_request_tensor(size_t decoder_layer_id) {
+        m_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_key_cache[decoder_layer_id]);
+        m_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_value_cache[decoder_layer_id]);
+    }
 
 public:
-    explicit CacheManager(const DeviceConfig &device_config, ov::Core core) :
+    explicit CacheManager(const DeviceConfig &device_config, ov::InferRequest request, ov::Core core) :
             m_device_config(device_config),
+            m_request(request),
             m_core(core) {
         m_key_cache.reserve(m_device_config.get_num_layers());
         m_value_cache.reserve(m_device_config.get_num_layers());
+    }
+
+    void allocate_cache_if_needed(size_t num_kv_blocks) {
+        if (m_num_allocated_kv_blocks >= num_kv_blocks) {
+            return;
+        }
+        OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size());
+        m_num_allocated_kv_blocks = num_kv_blocks;
+        ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks);
+        ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks);
+
+        const std::string device_name = m_device_config.get_device();
+
+        ov::Coordinate start_key{0,0,0,0};
+        ov::Coordinate start_value{0,0,0,0};
 
-        const std::string device_name = device_config.get_device();
         if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches
             for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
-                ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape());
-                ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape());
+                ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape);
+                ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape);
+
+                auto key_cache_roi_end = static_cast<unsigned char*>(key_cache.data());
+                auto value_cache_roi_end = static_cast<unsigned char*>(value_cache.data());
+                size_t key_roi_size_byte = 0;
+                size_t value_roi_size_byte = 0;
+
+                if (m_key_cache.size() > decoder_layer_id) {
+                    ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape();
+                    ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape();
+
+                    key_roi_size_byte = m_key_cache[decoder_layer_id].get_byte_size();
+                    value_roi_size_byte = m_value_cache[decoder_layer_id].get_byte_size();
+                    key_cache_roi_end = static_cast<unsigned char*>(key_cache.data()) + key_roi_size_byte;
+                    value_cache_roi_end = static_cast<unsigned char*>(value_cache.data()) + value_roi_size_byte;
+                    
+                    // copy current cache data
+                    ov::Tensor dst_key_roi(key_cache, start_key, end_key);
+                    ov::Tensor dst_value_roi(value_cache, start_value, end_value);
+
+                    m_key_cache[decoder_layer_id].copy_to(dst_key_roi);
+                    m_value_cache[decoder_layer_id].copy_to(dst_value_roi);
+
+                }
 
-                // force allocation
-                std::memset(key_cache.data(), 0, key_cache.get_byte_size());
-                std::memset(value_cache.data(), 0, value_cache.get_byte_size());
+                // Some optimizations like AVX2, AVX512, AMX require a minimal shape and 
+                // perform multiplying by zero on the excess data. Uninitialized tensor data contain NAN's, 
+                // so NAN * 0 returns non-zero invalid data.
+                // So we need to set zeros to all newly allocated tensors data.
+                std::memset(key_cache_roi_end, 0, key_cache.get_byte_size() - key_roi_size_byte);
+                std::memset(value_cache_roi_end, 0, value_cache.get_byte_size() - value_roi_size_byte);
+                
+                // set new cache tensors
+                if (m_key_cache.size() > decoder_layer_id) {
+                    m_key_cache[decoder_layer_id] = key_cache;
+                    m_value_cache[decoder_layer_id] = value_cache;
+                }
+                else {
+                    m_key_cache.emplace_back(key_cache);
+                    m_value_cache.emplace_back(value_cache);
+                }
 
-                m_key_cache.emplace_back(key_cache);
-                m_value_cache.emplace_back(value_cache);
+                update_request_tensor(decoder_layer_id);
             }
         } else {
             auto remote_context = m_core.get_default_context(device_name);
             for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
-                ov::Tensor key_cache = remote_context.create_tensor(device_config.get_cache_precision(),
-                                                                    device_config.get_key_cache_shape());
-                ov::Tensor value_cache = remote_context.create_tensor(device_config.get_cache_precision(),
-                                                                      device_config.get_value_cache_shape());
-
-                m_key_cache.emplace_back(key_cache);
-                m_value_cache.emplace_back(value_cache);
+                ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
+                                                                    key_cache_shape);
+                ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
+                                                                      value_cache_shape);
+                
+                if (m_key_cache.size() > decoder_layer_id) {
+                    ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape();
+                    ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape();
+
+                    // copy current cache data
+                    ov::RemoteTensor dst_key_roi(key_cache, start_key, end_key);
+                    ov::RemoteTensor dst_value_roi(value_cache, start_value, end_value);
+                    dst_key_roi.copy_from(m_key_cache[decoder_layer_id]);
+                    dst_value_roi.copy_from(m_value_cache[decoder_layer_id]);
+
+                    m_key_cache[decoder_layer_id] = key_cache;
+                    m_value_cache[decoder_layer_id] = value_cache;
+                }
+                else {
+                    m_key_cache.emplace_back(key_cache);
+                    m_value_cache.emplace_back(value_cache);
+                }
+                update_request_tensor(decoder_layer_id);
             }
         }
     }
@@ -62,8 +142,8 @@ class CacheManager {
     }
 
     void copy_blocks(const std::map<size_t, std::list<size_t>>& block_copy_map) {
-        ov::Shape key_shape = m_device_config.get_key_cache_shape();
-        ov::Shape value_shape = m_device_config.get_value_cache_shape();
+        ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), m_num_allocated_kv_blocks);
+        ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), m_num_allocated_kv_blocks);
 
         ov::Coordinate key_src_start_roi(key_shape.size(), 0);
         ov::Coordinate key_src_end_roi = key_shape;
@@ -98,5 +178,13 @@ class CacheManager {
             }
         }
     }
+
+    std::shared_ptr<Core> get_core() {
+        return std::make_shared<Core>(m_core);
+    }
+
+    std::shared_ptr<DeviceConfig> get_device_config() {
+        return std::make_shared<DeviceConfig>(m_device_config);
+    }
 };
 }
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index e1ffd062de..52ec6a8302 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -53,11 +53,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
     ov::InferRequest infer_request = compiled_model.create_infer_request();
 
     // setup KV caches
-    m_cache_manager = std::make_shared<CacheManager>(device_config, core);
-    for (size_t decoder_layer_id = 0; decoder_layer_id < device_config.get_num_layers(); ++decoder_layer_id) {
-        infer_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_key_cache(decoder_layer_id));
-        infer_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_value_cache(decoder_layer_id));
-    }
+    m_cache_manager = std::make_shared<CacheManager>(device_config, infer_request, core);
 
     SchedulerConfig updated_config = scheduler_config;
     // update KV blocks number in scheduler config
@@ -71,8 +67,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
         // as it may lead to performance slowdown
         can_use_partial_preemption = false;
     }
-
-    m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), updated_config, device_config.get_num_layers(), can_use_partial_preemption);
+    m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), m_cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption);
     // and finally create model runner
     bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction;
     m_model_runner = std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction);
@@ -133,7 +128,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
     _pull_awaiting_requests();
 
     m_pipeline_metrics.requests = m_requests.size();
-
     Scheduler::Output scheduler_output;
     {
         static ManualTimer timer("scheduling");
diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index 2af4559ef1..371142701c 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -12,7 +12,7 @@
 namespace ov::genai {
 class DeviceConfig {
     ov::element::Type m_kv_cache_type;
-    ov::Shape m_key_cache_shape, m_value_cache_shape;
+    ov::PartialShape m_key_cache_shape, m_value_cache_shape;
     ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers;
     size_t m_num_kv_blocks = 0;
     size_t m_block_size = 0;
@@ -80,11 +80,10 @@ class DeviceConfig {
             OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching");
         }
 
-        OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be more than zero.");
         if (scheduling_config.num_kv_blocks > 0) {
             m_num_kv_blocks = scheduling_config.num_kv_blocks;
         }
-        else {
+        else if (scheduling_config.cache_size > 0) {
             m_cache_size = scheduling_config.cache_size;
         }
     }
@@ -104,23 +103,22 @@ class DeviceConfig {
                 m_head_size += 8;
         }
 
-        if (m_num_kv_blocks == 0) {
-            OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be more than zero.");
+        if (m_num_kv_blocks == 0 && m_cache_size > 0) {
             size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024;
             m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size());
         }
 
-        m_key_cache_shape = m_value_cache_shape = ov::Shape{m_num_kv_blocks,
-                                                            m_num_kv_heads,
-                                                            m_block_size,
-                                                            m_head_size};
+        m_key_cache_shape = m_value_cache_shape = ov::PartialShape{ov::Dimension::dynamic(),
+                                                                   ov::Dimension(m_num_kv_heads),
+                                                                   ov::Dimension(m_block_size),
+                                                                   ov::Dimension(m_head_size)};
 
         if (m_device.find("GPU") != std::string::npos) {
             // Update key shape, as the key's shape is different from the value's shape
-            m_key_cache_shape = ov::Shape{m_num_kv_blocks,
-                                          m_num_kv_heads,
-                                          m_head_size,
-                                          m_block_size};
+            m_key_cache_shape = ov::PartialShape{ov::Dimension::dynamic(),
+                                                 ov::Dimension(m_num_kv_heads),
+                                                 ov::Dimension(m_head_size),
+                                                 ov::Dimension(m_block_size)};
         }
     }
 
@@ -136,13 +134,13 @@ class DeviceConfig {
         return m_num_decoder_layers;
     }
 
-    ov::Shape get_key_cache_shape() const {
-        OPENVINO_ASSERT(!m_key_cache_shape.empty());
+    ov::PartialShape get_key_cache_shape() const {
+        OPENVINO_ASSERT(m_key_cache_shape.size());
         return m_key_cache_shape;
     }
 
-    ov::Shape get_value_cache_shape() const {
-        OPENVINO_ASSERT(!m_value_cache_shape.empty());
+    ov::PartialShape get_value_cache_shape() const {
+        OPENVINO_ASSERT(m_value_cache_shape.size());
         return m_value_cache_shape;
     }
 
@@ -153,5 +151,9 @@ class DeviceConfig {
     size_t get_block_size() const {
         return m_block_size;
     }
+
+    size_t get_block_size_in_bytes() const {
+        return m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * get_cache_precision().size();
+    }
 };
 }
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 33180a9199..be5ecf17fa 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -718,7 +718,9 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::AnyMap& properties
 ){
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end()) {
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || 
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+        properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
     } else if (device == "NPU") {
@@ -737,7 +739,9 @@ ov::genai::LLMPipeline::LLMPipeline(
 ){
     auto start_time = std::chrono::steady_clock::now();
 
-    if (config.find(ov::genai::scheduler_config.name()) != config.end()) {
+    if (config.find(ov::genai::scheduler_config.name()) != config.end() || 
+        config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end() || 
+        config.find(ov::genai::prompt_lookup.name()) != config.end()) {
         auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, plugin_config);
     } else if (device == "NPU") {
@@ -760,7 +764,10 @@ ov::genai::LLMPipeline::LLMPipeline(
     auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config);
 
     auto start_time = std::chrono::steady_clock::now();
-    if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) {
+    if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end() || 
+        plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end() || 
+        plugin_config.find(ov::genai::prompt_lookup.name()) != plugin_config.end()){
+
         auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
                                                               tokenizer, scheduler_config, device, plugin_config_, generation_config);
diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp
index 6de4adaa47..da65c68bec 100644
--- a/src/cpp/src/scheduler.hpp
+++ b/src/cpp/src/scheduler.hpp
@@ -7,10 +7,12 @@
 #include <cstdlib>
 #include <vector>
 
+#include "openvino/runtime/intel_gpu/properties.hpp"
 #include "openvino/genai/scheduler_config.hpp"
 #include "device_config.hpp"
 #include "block_manager.hpp"
 #include "sequence_group.hpp"
+#include "cache_manager.hpp"
 
 namespace ov::genai {
 class Scheduler {
@@ -20,6 +22,13 @@ class Scheduler {
     BlockManager m_block_manager;
     friend class CacheStateDumper;
 
+    bool m_dynamic_memory_allocation = false;
+
+    // Dynamic KV-cache allocation params
+    size_t m_kv_blocks_initial_multiplier = 2;
+    const float m_cache_growth_factor = 2; // commmon values 1.5 or 2
+
+    std::shared_ptr<CacheManager> m_cache_manager;
 public:
     struct Output {
         // IDs of scheduled groups
@@ -36,15 +45,20 @@ class Scheduler {
         float m_cache_usage = 0.0;
     };
 
-    explicit Scheduler(size_t block_size, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) :
+    explicit Scheduler(size_t block_size, std::shared_ptr<CacheManager> cache_manager, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) :
+            m_cache_manager(cache_manager),
             m_can_use_partial_preemption(can_use_partial_preemption),
             m_config(config),
             m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, block_size, num_layers) {
+        
         OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero");
     }
 
     Output schedule(std::vector<SequenceGroup::Ptr>& sequence_groups) {
         Output scheduler_output;
+        if (m_block_manager.get_total_number_of_kv_blocks() == 0) {
+            _initialize_cache(sequence_groups);
+        }
 
         if (m_config.dynamic_split_fuse) {
             // deepspeed-mii case
@@ -64,9 +78,9 @@ class Scheduler {
             }
         }
 
+        m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks());
         _clear_waiting_sequences(sequence_groups);
         scheduler_output.m_cache_usage = m_block_manager.get_used_percentage();
-
         return scheduler_output;
     }
 
@@ -236,8 +250,13 @@ class Scheduler {
                 OPENVINO_ASSERT(currently_allocated_token_slots >= occupied_token_slots, "internal error");
                 size_t available_slots = currently_allocated_token_slots - occupied_token_slots,
                        required_slots = num_scheduled_tokens > available_slots ? num_scheduled_tokens - available_slots : 0;
-                size_t num_required_blocks = (required_slots + block_size - 1) / block_size, num_free_blocks = m_block_manager.num_free_blocks();
-                size_t num_scheduled_blocks = std::min(num_required_blocks, num_free_blocks);
+                size_t num_required_blocks = (required_slots + block_size - 1) / block_size;
+                while (num_required_blocks > m_block_manager.num_free_blocks()) {
+                    if (!_try_increase_cache()) {
+                        break;
+                    }
+                }
+                size_t num_scheduled_blocks = std::min(num_required_blocks, m_block_manager.num_free_blocks());
                 // some scheduled blocks can be no fully occupied, so we need to take min between num_scheduled_blocks
                 // and total "scheduled capacity"
                 num_scheduled_tokens = std::min(num_scheduled_tokens, available_slots + num_scheduled_blocks * block_size);
@@ -289,10 +308,16 @@ class Scheduler {
                 size_t num_scheduled_tokens_per_seq = std::min(available_tokens_per_seq_in_megabatch, num_available_tokens_per_seq);
                 sequence_group->schedule_tokens(num_scheduled_tokens_per_seq);
 
+                while (!m_block_manager.can_append_slots(sequence_group)){
+                    if (!_try_increase_cache()) {
+                        break;
+                    }
+                }
+
                 _apply_preemption(sequence_group_id, sequence_groups);
 
                 // if we can't preemt any more sequences, clear scheduled tokens and move to next sequence
-                if (!m_block_manager.can_append_slots(sequence_group)){
+                if (!m_block_manager.can_append_slots(sequence_group)) {
                     sequence_group->clear_scheduled_tokens();
                     continue;
                 }
@@ -370,6 +395,11 @@ class Scheduler {
                 // apply KV cache limitations
                 size_t block_size = get_block_size();
                 const size_t num_required_blocks = (sequence_len + block_size - 1) / block_size;
+                while (!m_block_manager.can_allocate_blocks(num_required_blocks)){
+                    if (!_try_increase_cache()) {
+                        break;
+                    }
+                }
                 if (!m_block_manager.can_allocate_blocks(num_required_blocks))
                     break;
 
@@ -405,6 +435,86 @@ class Scheduler {
             sequence_groups[sequence_group_id]->clear_waiting_sequences();
         }
     }
+
+    size_t _get_available_gpu_memory() {
+        auto device_config = m_cache_manager->get_device_config();
+        auto core = m_cache_manager->get_core();
+        auto device = device_config->get_device();
+        OPENVINO_ASSERT(device.find("GPU") != std::string::npos, "_get_available_gpu_memory() is applicable for GPU only.");
+        auto memory_statistics = core->get_property(device, ov::intel_gpu::memory_statistics);
+        auto device_type = core->get_property(device, ov::device::type);
+
+        // sum up all used device memory
+        std::vector<std::string> device_memory_types = {"cl_mem", "usm_device"};
+        size_t used_device_mem = 0;
+        for (auto mem_type: device_memory_types) {
+            used_device_mem += memory_statistics[mem_type];
+        }
+
+        if (device_type == ov::device::Type::INTEGRATED) {
+            used_device_mem += memory_statistics["usm_host"];
+        }
+
+        // there could be unaccounted extra memory reserved by kernels, kept
+        // in memory pools, etc
+        // therefore, add a threshold to account for this
+        float used_memory_threshold = 1.1;
+        used_device_mem *= used_memory_threshold;
+
+        // total device memory in bytes
+        auto total_device_memory = core->get_property(device, ov::intel_gpu::device_total_mem_size);
+
+        return total_device_memory - used_device_mem;
+    }
+
+    void _initialize_cache(const std::vector<SequenceGroup::Ptr>& sequence_groups) {
+        size_t blocks_sum = 0;
+        for (auto idx = 0; idx < sequence_groups.size(); idx++) {
+            auto seq_length = sequence_groups[idx]->get_prompt_len() * m_kv_blocks_initial_multiplier;
+            auto gen_config = sequence_groups[idx]->get_sampling_parameters();
+            seq_length = std::min(seq_length, sequence_groups[idx]->get_prompt_len() + gen_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len()));
+            size_t blocks_num = std::ceil((float)seq_length / m_block_manager.get_block_size());
+            if (gen_config.is_beam_search()) {
+                blocks_num *= gen_config.num_beams;
+            } else if (gen_config.is_multinomial()) {
+                blocks_num *= gen_config.num_return_sequences;
+            }
+            blocks_sum  += blocks_num;
+        }
+        m_block_manager.increase_kv_blocks_number(blocks_sum);
+        m_dynamic_memory_allocation = true;
+    }
+
+    bool _try_increase_cache() {
+        if (!m_dynamic_memory_allocation) {
+            return false;
+        }
+        auto device_config = m_cache_manager->get_device_config();
+        auto device = device_config->get_device();
+        size_t current_num_of_kv_blocks = m_block_manager.get_total_number_of_kv_blocks();
+        size_t new_blocks_num = current_num_of_kv_blocks * m_cache_growth_factor;
+
+        if (device.find("GPU") == std::string::npos) {
+            m_block_manager.increase_kv_blocks_number(new_blocks_num);
+        }
+        else {
+            size_t available_gpu_memory = _get_available_gpu_memory();
+            size_t required_memory = (new_blocks_num - current_num_of_kv_blocks) * device_config->get_block_size_in_bytes();
+            if (required_memory <= available_gpu_memory) {
+                m_block_manager.increase_kv_blocks_number(new_blocks_num);
+            } else {
+                size_t possible_blocks_to_add = available_gpu_memory / device_config->get_block_size_in_bytes();
+                if (possible_blocks_to_add > 0) {
+                    m_block_manager.increase_kv_blocks_number(current_num_of_kv_blocks + possible_blocks_to_add);
+                }
+                else {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
 };
 
 }
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
index 46b7b106a6..257c20bf01 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -52,8 +52,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
 
         size_t main_cache_size = std::ceil(main_scheduler_config.cache_size * (1.f - k)),
                draft_cache_size = main_scheduler_config.cache_size - main_cache_size;
-        OPENVINO_ASSERT(main_cache_size > 0, "KV cache model cache size should be > 0");
-        if (draft_cache_size == 0) {
+        if (draft_cache_size == 0 && main_cache_size > 0) {
             main_cache_size -= (main_cache_size > 1 ? 1 : 0);
             draft_cache_size = 1;
         }
diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp
index 16c9556151..4dedcf989a 100644
--- a/src/cpp/src/utils/paged_attention_transformations.cpp
+++ b/src/cpp/src/utils/paged_attention_transformations.cpp
@@ -10,11 +10,6 @@ namespace ov {
 namespace genai {
 namespace utils {
 
-inline ov::PartialShape to_partial_with_dyn_0_dim(const ov::Shape& static_shape) {
-    ov::PartialShape partial_shape = static_shape;
-    partial_shape[0] = ov::Dimension::dynamic();
-    return partial_shape;
-}
 
 size_t get_hidden_size(const std::shared_ptr<ov::Model> model) {
     const auto& parameters = model->get_parameters();
@@ -65,9 +60,8 @@ void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig&
     for (auto it_k = key_cache_params.begin(), it_v = value_cache_params.begin(); it_k != key_cache_params.end();++it_k, ++it_v) {
         it_k->second->set_element_type(device_config.get_cache_precision());
         it_v->second->set_element_type(device_config.get_cache_precision());
-        // TODO: CVS-145270
-        it_k->second->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_key_cache_shape()));
-        it_v->second->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_value_cache_shape()));
+        it_k->second->set_partial_shape(device_config.get_key_cache_shape());
+        it_v->second->set_partial_shape(device_config.get_value_cache_shape());
     }
 
     model->validate_nodes_and_infer_types();
diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp
index b2a5396d5f..7f07980389 100644
--- a/tests/cpp/cache_manager.cpp
+++ b/tests/cpp/cache_manager.cpp
@@ -7,8 +7,43 @@
 #include "scheduler.hpp"
 #include "device_config.hpp"
 #include "cache_manager.hpp"
+#include "openvino/op/concat.hpp"
 
-TEST(TestCacheManager, general_test) {
+using namespace ov::genai;
+
+std::shared_ptr<ov::Model> get_dummy_model(size_t num_layers) {
+    ov::NodeVector keys;
+    ov::NodeVector values;
+    ov::ParameterVector params;
+    auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
+    for (size_t i = 0; i < num_layers; i++) {
+        auto key = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
+        auto value = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
+        key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
+        value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
+        keys.push_back(key);
+        values.push_back(value);
+        params.push_back(key);
+        params.push_back(value);
+    }
+    const auto& concat1 = std::make_shared<ov::op::v0::Concat>(keys, 1);
+    const auto& concat2 = std::make_shared<ov::op::v0::Concat>(values, 1);
+    auto model = std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
+    return std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
+}
+
+size_t get_total_allocated_bytes(std::shared_ptr<ov::genai::CacheManager> cache_manager, size_t num_decoder_layers) {
+    size_t allocated_bytes = 0;
+    for (size_t i = 0; i < num_decoder_layers; i++) {
+        auto key_cache = cache_manager->get_key_cache(i);
+        auto value_cache = cache_manager->get_value_cache(i);
+        allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size();
+    }
+    return allocated_bytes;
+}
+
+
+TEST(TestCacheManager, test_cache_size_param) {
     ov::Core core;
     ov::genai::SchedulerConfig scheduler_config;
     scheduler_config.max_num_batched_tokens = 32;
@@ -21,14 +56,73 @@ TEST(TestCacheManager, general_test) {
     size_t num_decoder_layers = 12;
     device_config.set_model_params(12, 64, num_decoder_layers);
 
-    auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, core);
-
-    size_t allocated_bytes = 0;
-    for (size_t i = 0; i < num_decoder_layers; i++) {
-        auto key_cache = cache_manager->get_key_cache(i);
-        auto value_cache = cache_manager->get_value_cache(i);
-        allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size();
-    }
+    ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
+    auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
+    auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
+    cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
     
-    ASSERT_EQ(allocated_bytes, 2146959360);
+    ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 2146959360);
 }
+
+
+TEST(TestCacheManager, test_kv_blocks_param) {
+    ov::Core core;
+    ov::genai::SchedulerConfig scheduler_config;
+    scheduler_config.max_num_batched_tokens = 32;
+    scheduler_config.num_kv_blocks = 150;
+    scheduler_config.cache_size = 0;
+    scheduler_config.max_num_seqs = 2;
+
+    const std::string device = "CPU";
+    ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
+    size_t num_decoder_layers = 12;
+    device_config.set_model_params(12, 64, num_decoder_layers);
+
+    ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
+    auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
+    auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
+    OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), scheduler_config.num_kv_blocks);
+}
+
+
+TEST(TestCacheManager, test_dynamic_cache_increase) {
+    ov::Core core;
+    ov::genai::SchedulerConfig scheduler_config;
+    scheduler_config.max_num_batched_tokens = 32;
+    scheduler_config.num_kv_blocks = 0;
+    scheduler_config.cache_size = 0;
+    scheduler_config.max_num_seqs = 2;
+
+    const std::string device = "CPU";
+    ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
+    size_t num_decoder_layers = 12;
+    size_t head_size = 64;
+    size_t num_kv_heads = 12;
+    device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers);
+    size_t block_size_in_bytes = num_decoder_layers * 2 * num_kv_heads * device_config.get_block_size() * head_size * device_config.get_cache_precision().size();
+
+
+    ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
+    auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
+    auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
+
+    // check initial cache allocation
+    block_manager.increase_kv_blocks_number(100);
+    OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 100);
+
+    cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
+    OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 100 * block_size_in_bytes);
+
+
+    // check cache increase
+    block_manager.increase_kv_blocks_number(200);
+    OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 200);
+
+    cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
+    OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes);
+
+
+    // check that cache does not increase if new blocks were not allocated
+    cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
+    OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes);
+}
\ No newline at end of file
diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp
index 40c3e73747..ea1720faa2 100644
--- a/tests/cpp/scheduler.cpp
+++ b/tests/cpp/scheduler.cpp
@@ -4,6 +4,7 @@
 
 #include <gtest/gtest.h>
 #include "openvino/runtime/core.hpp"
+#include "openvino/op/concat.hpp"
 #include "openvino/genai/continuous_batching_pipeline.hpp"
 #include "openvino/genai/generation_config.hpp"
 #include "sequence_group.hpp"
@@ -17,6 +18,37 @@ void clear_finished_sequences(std::vector<SequenceGroup::Ptr>& requests) {
     });
     requests.erase(new_end, requests.end());
 }
+std::shared_ptr<ov::Model> get_model(size_t num_layers) {
+    ov::NodeVector keys;
+    ov::NodeVector values;
+    ov::ParameterVector params;
+    auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
+    for (size_t i = 0; i < num_layers; i++) {
+        auto key = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
+        auto value = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
+        key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
+        value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
+        keys.push_back(key);
+        values.push_back(value);
+        params.push_back(key);
+        params.push_back(value);
+    }
+    const auto& concat1 = std::make_shared<ov::op::v0::Concat>(keys, 1);
+    const auto& concat2 = std::make_shared<ov::op::v0::Concat>(values, 1);
+    auto model = std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
+    return std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
+}
+
+std::shared_ptr<CacheManager> init_cache_manager(SchedulerConfig scheduler_config) {
+    ov::Core core = ov::Core();
+    size_t num_decoder_layers = 12;
+    ov::InferRequest request = core.compile_model(get_model(num_decoder_layers)).create_infer_request();
+    size_t head_size = 64, head_size_u8 = head_size + 8;
+    size_t num_kv_heads = 12;
+    ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
+    device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
+    return std::make_shared<CacheManager>(device_config, request, core);  
+}
 
 TEST(TestScheduler, general_test) {
     std::array<SchedulerConfig, 2> configs = {SchedulerConfig(), SchedulerConfig()};
@@ -40,10 +72,9 @@ TEST(TestScheduler, general_test) {
                                                                                 ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
         auto idx2 = (*sequence_group3)[0]->get_id();
         std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2, sequence_group3};
-                                                                        
         
         // schedule 3 sequence groups that use 6 kv blocks 
-        Scheduler scheduler = Scheduler(4, scheduler_config);
+        Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
         auto out1 = scheduler.schedule(requests);
 
         std::vector<uint64_t> ref_ids = {0, 1, 2};
@@ -144,7 +175,7 @@ TEST_P(AppendSlotsSchedulerTest, test_append_slots_considers_all_sequences) {
     auto idx1 = (*sequence_group2)[0]->get_id();
     std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
 
-    Scheduler scheduler = Scheduler(4, scheduler_config);
+    Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
     auto out1 = scheduler.schedule(requests);
 
     std::vector<uint64_t> ref_ids = {0, 1};
@@ -212,7 +243,7 @@ TEST_P(PartialPreemptionSchedulerTest, test_partial_preemption) {
 
 
     // schedule 2 sequence groups that use 5 kv blocks
-    Scheduler scheduler = Scheduler(4, scheduler_config);
+    Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
     auto out0 = scheduler.schedule(requests);
 
     for (auto seq: requests) {
@@ -297,7 +328,7 @@ TEST(TestScheduler, test_partial_preemption_beam_search) {
         sequence_group->set_sequence_group_ptr(sequence_group);
         std::vector<SequenceGroup::Ptr> requests = {sequence_group};
 
-        Scheduler scheduler = Scheduler(4, scheduler_config);
+        Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
         auto out = scheduler.schedule(requests);
         for (auto sequence: sequence_group->get_not_finished_sequences()) {
             sequence->append_token(token, 0.7);
@@ -405,11 +436,10 @@ TEST(TestScheduler, test_partially_preempted_prompt) {
         SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
                                                                                 ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
         auto idx1 = (*sequence_group2)[0]->get_id();
-        std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
-                                                                        
+        std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};                                                
         
         // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks.
-        Scheduler scheduler = Scheduler(4, scheduler_config);
+        Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
         auto out1 = scheduler.schedule(requests);
 
         for (auto seq: requests) {
@@ -503,7 +533,7 @@ TEST(TestScheduler, prefix_caching_test) {
         std::vector<uint64_t> prompt_tokens = {0,1,2,3,4,5,6,7};
         std::vector<uint64_t> histrory_tokens = {};
         // schedule prompt
-        Scheduler scheduler = Scheduler(4, scheduler_config);
+        Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
 
         size_t chat_iterations = 10;
 
@@ -566,7 +596,7 @@ TEST(TestScheduler, prefix_caching_test_two_identical_sequences) {
         std::vector<uint64_t> prompt_tokens = {0,1,2,3,4,5,6,7};
         std::vector<uint64_t> histrory_tokens = {};
         // schedule prompt
-        Scheduler scheduler = Scheduler(4, scheduler_config);
+        Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
 
         size_t chat_iterations = 10;
 
@@ -640,7 +670,7 @@ TEST(TestScheduler, prefix_caching_with_max_new_tokens_equal_1) {
     for (auto scheduler_config: configs) {
         std::vector<uint64_t> prompt_tokens = {0,1,2,3,4,5,6,7};
         // schedule prompt
-        Scheduler scheduler = Scheduler(32, scheduler_config);
+        Scheduler scheduler = Scheduler(32, init_cache_manager(scheduler_config), scheduler_config);
 
         size_t chat_iterations = 2;
 
@@ -701,7 +731,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed) {
 
     // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks.
     const bool can_use_partial_preemption = false;
-    Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption);
+    Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption);
     auto out1 = scheduler.schedule(requests);
 
     for (auto req : requests)
@@ -775,7 +805,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed2) {
 
     // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks.
     const bool can_use_partial_preemption = false;
-    Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption);
+    Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption);
     scheduler.schedule(requests);
     for (auto req: requests)
         req->finish_iteration();
@@ -874,7 +904,6 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) {
     scheduler_config.use_cache_eviction = true;
     scheduler_config.cache_eviction_config = ov::genai::CacheEvictionConfig(2, 2, 6, ov::genai::AggregationMode::NORM_SUM);
 
-
     std::vector<uint64_t> tokens1 = {0, 1};  // 1 full block
     SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0,
                                                                          ov::Tensor(ov::element::i64, {tokens1.size()},
@@ -890,7 +919,7 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) {
     std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
 
 
-    Scheduler scheduler = Scheduler(2, scheduler_config);
+    Scheduler scheduler = Scheduler(2, init_cache_manager(scheduler_config), scheduler_config);
     // prompt phase - schedules 1 block for seq 1, 5 blocks for seq 2
     auto out = scheduler.schedule(requests);
 
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index 163a00192e..cf5fbb3403 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -266,7 +266,6 @@ def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]:
 
 def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig:
     scheduler_config = SchedulerConfig()
-    scheduler_config.cache_size = 1
     if scheduler_params is None:
         scheduler_config.dynamic_split_fuse = True
         # vLLM specific
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index b633497d32..5f2702a774 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -283,5 +283,4 @@ def load_pipe(configs: List[Tuple], temp_path):
 @functools.lru_cache(1)
 def get_continuous_batching(path):
     scheduler_config = ov_genai.SchedulerConfig()
-    scheduler_config.cache_size = 1
     return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU', **{"scheduler_config": scheduler_config})
diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py
index 45704f9dc6..3c09d34756 100644
--- a/tests/python_tests/test_cache_optimizations.py
+++ b/tests/python_tests/test_cache_optimizations.py
@@ -15,7 +15,7 @@
 from openvino import serialize
 from transformers import AutoTokenizer
 
-from common import TESTS_ROOT
+from common import TESTS_ROOT, run_test_pipeline
 
 
 def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -145,3 +145,28 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     del model_cb_noopt
 
 
+def get_greedy_seq_len_300() -> GenerationConfig:
+    generation_config = GenerationConfig()
+    generation_config.num_return_sequences = 3
+    generation_config.max_new_tokens = 300
+    return generation_config
+
+def get_beam_search_seq_len_300() -> GenerationConfig:
+    generation_config = GenerationConfig()
+    generation_config.num_beam_groups = 3
+    generation_config.num_beams = 6
+    generation_config.max_new_tokens = 300
+    generation_config.num_return_sequences = generation_config.num_beams
+    return generation_config
+
+scheduler_params_list = [
+                         ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()),
+                         ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()),
+                         ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": False}, get_greedy_seq_len_300()),
+                         ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False}, get_beam_search_seq_len_300()),
+                         ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy_seq_len_300())]
+@pytest.mark.parametrize("params", scheduler_params_list)
+@pytest.mark.precommit
+def test_dynamic_memory_allocation(tmp_path, params):
+    run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1])
+

From c83f8160896994d5c2a917d7dbc7465c368d1c8e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 25 Dec 2024 06:08:42 +0400
Subject: [PATCH 053/110] [GHA] Updated OpenVINO nightly (#1433)

To catch up https://github.com/openvinotoolkit/openvino/pull/28067
---
 .github/workflows/causal_lm_cpp.yml        | 8 ++++----
 .github/workflows/job_vlm_sample_llava.yml | 2 +-
 .github/workflows/lcm_dreamshaper_cpp.yml  | 4 ++--
 src/cpp/src/tokenizer.cpp                  | 8 ++++----
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 2e9d72e263..4aad3d4bc3 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -16,10 +16,10 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241205_x86_64.tgz
-  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz
-  m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241205_x86_64.tgz
-  w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/w_openvino_toolkit_windows_2025.0.0.dev20241205_x86_64.zip
+  l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241224_x86_64.tgz
+  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
+  m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241224_x86_64.tgz
+  w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
 jobs:
   cpp-multinomial-greedy_causal_lm-ubuntu:
     runs-on: ubuntu-20.04-8-cores
diff --git a/.github/workflows/job_vlm_sample_llava.yml b/.github/workflows/job_vlm_sample_llava.yml
index 166284bd4b..5f4634616a 100644
--- a/.github/workflows/job_vlm_sample_llava.yml
+++ b/.github/workflows/job_vlm_sample_llava.yml
@@ -11,7 +11,7 @@ on:
         type: string
 
 env:
-  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz
+  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
 
 jobs:
   visual_language_chat_sample-ubuntu-llava:
diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml
index 258184e9e4..c525b0be68 100644
--- a/.github/workflows/lcm_dreamshaper_cpp.yml
+++ b/.github/workflows/lcm_dreamshaper_cpp.yml
@@ -18,8 +18,8 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.9'
-  LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz
-  WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/w_openvino_toolkit_windows_2025.0.0.dev20241205_x86_64.zip
+  LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
+  WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
   OV_INSTALL_DIR: ${{ github.workspace }}/ov
 
 jobs:
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 5364acfd91..b098f96fe6 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -394,8 +394,8 @@ class Tokenizer::TokenizerImpl {
         infer_request_guard.get().start_async();
         infer_request_guard.get().wait();
         return get_copied_results(
-            infer_request_guard.get().get_tensor("input_ids"),
-            infer_request_guard.get().get_tensor("attention_mask")
+            infer_request_guard.get().get_output_tensor(0),
+            infer_request_guard.get().get_output_tensor(1)
         );
     }
 
@@ -412,8 +412,8 @@ class Tokenizer::TokenizerImpl {
             infer_request_guard.get().wait();
 
             unpadded = get_copied_results(
-                infer_request_guard.get().get_tensor("input_ids"),
-                infer_request_guard.get().get_tensor("attention_mask")
+                infer_request_guard.get().get_output_tensor(0),
+                infer_request_guard.get().get_output_tensor(1)
             );
         }
         return pad_left(unpadded.input_ids, unpadded.attention_mask);

From fabb5b312f92c3cf3bfae86f80c6a3bfbba95d78 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova <ekaterina.aidova@intel.com>
Date: Wed, 25 Dec 2024 06:15:31 +0400
Subject: [PATCH 054/110] temporary use num_steps instead of infer_count for
 image generation (#1432)

workaround for CVS-159838
proper fix required on validation pipeline side

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .github/workflows/llm_bench-python.yml         |  6 +++---
 tools/llm_bench/benchmark.py                   |  4 +++-
 tools/llm_bench/llm_bench_utils/model_utils.py |  3 +++
 tools/llm_bench/task/image_generation.py       | 12 ++++++------
 4 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index 1999bafcfe..56145c080c 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -114,14 +114,14 @@ jobs:
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel
         run: |
           huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum -ic 4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum --num_steps 4
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI
         run: |
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 -ic 4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --num_steps 4
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA
         run: |
           wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 -ic 4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 --num_steps 4
           rm -rf ./ov_models/lcm_dreamshaper_v7/
       - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
         run: |
diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py
index 5fa22497c1..39b6306e7f 100644
--- a/tools/llm_bench/benchmark.py
+++ b/tools/llm_bench/benchmark.py
@@ -158,7 +158,9 @@ def get_argprser():
     parser.add_argument('--set_torch_thread', default=0, type=num_infer_count_type, help='Set the number of Torch thread. ')
     parser.add_argument('-tl', '--tokens_len', type=int, required=False, help='The length of tokens print each time in streaming mode, chunk streaming.')
     parser.add_argument('--streaming', action='store_true', help='Set whether to use streaming mode, only applicable to LLM.')
-
+    parser.add_argument("--num_steps", type=int, required=False, help="Number of inference steps for image generation")
+    parser.add_argument("--height", type=int, required=False, help="Generated image height. Applicable only for Image Generation.")
+    parser.add_argument("--width", type=int, required=False, help="Generated image width. Applicable only for Image Generation.")
     return parser.parse_args()
 
 
diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index 78f72147c7..b3e2f23f0b 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -97,6 +97,9 @@ def analyze_args(args):
     model_args['prompt'] = args.prompt
     model_args['prompt_file'] = args.prompt_file
     model_args['infer_count'] = args.infer_count
+    model_args["num_steps"] = args.num_steps
+    model_args["height"] = args.height
+    model_args["width"] = args.width
     model_args['images'] = args.images
     model_args['seed'] = args.seed
     model_args['mem_consumption'] = args.memory_consumption
diff --git a/tools/llm_bench/task/image_generation.py b/tools/llm_bench/task/image_generation.py
index 7f43afe6e2..125794704d 100644
--- a/tools/llm_bench/task/image_generation.py
+++ b/tools/llm_bench/task/image_generation.py
@@ -25,10 +25,10 @@
 stable_diffusion_hook = StableDiffusionHook()
 
 
-def collects_input_args(image_param, model_type, model_name, infer_count=None, callback=None):
+def collects_input_args(image_param, model_type, model_name, infer_count=None, height=None, width=None, callback=None):
     input_args = {}
-    input_args["width"] = image_param.get('width', DEFAULT_IMAGE_WIDTH)
-    input_args["height"] = image_param.get('height', DEFAULT_IMAGE_HEIGHT)
+    input_args["width"] = image_param.get('width', width or DEFAULT_IMAGE_WIDTH)
+    input_args["height"] = image_param.get('height', height or DEFAULT_IMAGE_HEIGHT)
     if infer_count is None:
         input_args["num_inference_steps"] = image_param.get('steps', DEFAULT_INFERENCE_STEPS if 'lcm' not in model_name else LCM_DEFAULT_INFERENCE_STEPS)
     else:
@@ -60,7 +60,7 @@ def collects_input_args(image_param, model_type, model_name, infer_count=None, c
 def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None):
     set_seed(args['seed'])
     input_text = image_param['prompt']
-    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["infer_count"])
+    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["num_steps"], args.get("height"), args.get("width"))
     out_str = f"Input params: Batch_size={args['batch_size']}, " \
               f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}"
     if 'guidance_scale' in input_args:
@@ -84,7 +84,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list,
         for bs_idx, in_text in enumerate(input_text_list):
             llm_bench_utils.output_file.output_image_input_text(in_text, args, image_id, bs_idx, proc_id)
     start = time.perf_counter()
-    res = pipe(input_text_list, **input_args).images
+    res = pipe(input_text_list, **input_args, num_images_per_prompt=2).images
     end = time.perf_counter()
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.end_collect_momory_consumption()
@@ -123,7 +123,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list,
 def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None):
     set_seed(args['seed'])
     input_text = image_param['prompt']
-    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["infer_count"], callback)
+    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["num_steps"], args.get("height"), args.get("width"), callback)
     out_str = f"Input params: Batch_size={args['batch_size']}, " \
               f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}"
     if 'guidance_scale' in input_args:

From ca4460a71c95982177f5e119f74ac6e2ee33830e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 25 Dec 2024 14:02:42 +0400
Subject: [PATCH 055/110] [GHA] Use latest OV on macos and windows (#1434)

---
 .github/workflows/mac.yml     | 2 +-
 .github/workflows/windows.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 7a4ee31beb..5cc8772ac5 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -17,7 +17,7 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.9'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: master
   OV_TARBALL: ''
 
 jobs:
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 649d678c02..7e1aacc715 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -17,7 +17,7 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.11'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: master
   OV_TARBALL: ''
 
 jobs:

From 0789c7b8273343908fb717824d52a74e73efd668 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 25 Dec 2024 15:20:14 +0400
Subject: [PATCH 056/110] [Text generation] Enable tests with
 Qwen2-0.5B-Instruct (#1438)

---
 .github/workflows/linux.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 0bb0c1af6e..6c94a907ea 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -270,7 +270,7 @@ jobs:
           - name: 'Whisper'
             cmd: 'tests/python_tests/test_whisper_generate_api.py'
           - name: 'LLM & VLM'
-            cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py -k "not Qwen2-0.5B-Instruct"' # Skip failed tests Qwen2-0.5B-Instruct
+            cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py'
     defaults:
       run:
         shell: bash

From 812163a2e15e31e94fa1261010c07f9a106f774a Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 25 Dec 2024 18:24:05 +0400
Subject: [PATCH 057/110] Moved tokenizers tests to a dedicated file (#1436)

---
 .github/workflows/mac.yml                     |    2 +-
 .github/workflows/windows.yml                 |    2 +-
 .../openvino/genai/generation_config.hpp      |   14 +-
 src/cpp/include/openvino/genai/tokenizer.hpp  |   30 +-
 src/cpp/src/generation_config.cpp             |    3 +
 src/cpp/src/tokenizer.cpp                     |   48 +-
 .../openvino_genai/py_openvino_genai.pyi      |   60 +-
 .../py_continuous_batching_pipeline.cpp       |    4 +-
 src/python/py_generation_config.cpp           |   20 +-
 tests/python_tests/common.py                  |   65 +-
 tests/python_tests/ov_genai_test_utils.py     |  112 +-
 .../python_tests/test_cache_optimizations.py  |    4 +-
 tests/python_tests/test_chat_generate_api.py  |  202 +--
 tests/python_tests/test_generate_api.py       |  391 ++---
 tests/python_tests/test_preemption.py         |    6 +-
 tests/python_tests/test_sampling.py           |   22 +-
 tests/python_tests/test_tokenizer.py          |  360 ++++
 .../python_tests/test_whisper_generate_api.py |   31 +-
 tests/python_tests/tokenizer_configs.py       | 1536 ++++++++---------
 19 files changed, 1461 insertions(+), 1451 deletions(-)
 create mode 100644 tests/python_tests/test_tokenizer.py

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 5cc8772ac5..a9af13bc66 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -225,7 +225,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
 
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 7e1aacc715..f88bc4c6f3 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -236,7 +236,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
 
diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index b8b222e347..4ea75e94c5 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -45,6 +45,10 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
  * @param logprobs number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
  *                 Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
  *
+ * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
+ * @param presence_penalty reduces absolute log prob if the token was generated at least once.
+ * @param frequency_penalty reduces absolute log prob as many times as the token was generated.
+ *
  * Beam search specific parameters:
  * @param num_beams number of beams for beam search. 1 disables beam search.
  * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -61,15 +65,13 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
  *        "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
  *        "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
  *
- * Random sampling parameters:
+ * Random (or multinomial) sampling parameters:
+ * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
  * @param temperature the value used to modulate token probabilities for random sampling.
  * @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
  * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
- * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
- * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
- * @param presence_penalty reduces absolute log prob if the token was generated at least once.
- * @param frequency_penalty reduces absolute log prob as many times as the token was generated.
  * @param rng_seed initializes random generator.
+ * @param num_return_sequences the number of sequences to generate from a single prompt.
  *
  * Assisting generation parameters:
  * @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of dynamic strategy candidates number update.
@@ -90,7 +92,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     size_t min_new_tokens = 0;
     bool echo = false;
     size_t logprobs = 0;
-    
+
     std::set<std::string> stop_strings;
     // Default setting in vLLM (and OpenAI API) is not to include stop string in the output
     bool include_stop_str_in_output = false;
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 38fc0aaf8c..548e4dc332 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -36,9 +36,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
 
     /**
      * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
-     * 
-     * This constructor is used when tokenizer and detokenizer are separate models already loaded into memory. 
-     * When this constructor is used bos, eos, pad token ids are expected to be in IR. 
+     *
+     * This constructor is used when tokenizer and detokenizer are separate models already loaded into memory.
+     * When this constructor is used bos, eos, pad token ids are expected to be in IR.
      * If an IR is older (< 2024.3) then this tokens are default initialized to be ignored.
      * @param tokenizer_model_str tokenizer model string
      * @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
@@ -55,9 +55,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     );
 
     /**
-     * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights. 
-     * 
-     * This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's 
+     * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
+     *
+     * This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's
      * tokenizer or detokenizer is defined from model input signature. When this constructor is used bos, eos, pad token ids
      * are expected to be in IR. If an IR is older (< 2024.3) then this tokens are default initialized to be ignored.
      * @param model_str model string
@@ -82,7 +82,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
         ov::Tensor& detokenizer_weights_tensor,
         Properties&&... properties
         ) : Tokenizer(tokenizer_model_str, tokenizer_weights_tensor, detokenizer_model_str, detokenizer_weights_tensor, ov::AnyMap{std::forward<Properties>(properties)...}) { }
-    
+
     /**
      * @brief ov::genai::Tokenizer constructor with variable number of properties
      * @param model_str model string
@@ -93,7 +93,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor,
               Properties&&... properties)
         : Tokenizer(model_str, weights_tensor, ov::AnyMap{std::forward<Properties>(properties)...}) { }
-    
+
     /**
      * @brief ov::genai::Tokenizer constructor with variable number of properties
      * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
@@ -111,7 +111,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     * @return pair of [input_ids, attention_mask]
     */
     TokenizedInputs encode(const std::string prompt, const ov::AnyMap& tokenization_params = {});
-    
+
     /**
     * @brief encode batch of prompts. Left padding will be applied by default
     * @param prompts vector storing batch of prompts
@@ -127,7 +127,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     * @param prompt std::string with input prompt
     * @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false)
     * @return pair of [input_ids, attention_mask]
-    */    
+    */
     template <typename... Properties>
     util::EnableIfAllStringAny<TokenizedInputs, Properties...> encode(std::string& prompt, Properties&&... properties) {
         return encode(prompt, AnyMap{std::forward<Properties>(properties)...});
@@ -164,7 +164,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     }
 
     /**
-    * @brief decode tokens. 
+    * @brief decode tokens.
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
     * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size = batch_size
@@ -183,7 +183,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     }
 
     /**
-    * @brief batched decoding of tokens. 
+    * @brief batched decoding of tokens.
     * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size
     * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size equal to batch_size
@@ -203,8 +203,8 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
 
     /**
      * @brief Embeds input prompts with special tags for a chat scenario.
-     * 
-     * For example, for Qwen family models, the prompt "1+1=" would be transformed into 
+     *
+     * For example, for Qwen family models, the prompt "1+1=" would be transformed into
      * <|im_start|>user\n1+1=<|im_end|>\n<|im_start|>assistant\n.
      *
      * @param history A vector of maps, with chat history, e.g. [{"role": "user", "content": "prompt"}, ...].
@@ -214,7 +214,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
      * @throws Exception if the chat template was unable to parse the input history.
      */
     std::string apply_chat_template(ChatHistory history,
-                                    bool add_generation_prompt, 
+                                    bool add_generation_prompt,
                                     const std::string& chat_template = {}) const;
 
     /// @brief Override a chat_template read from tokenizer_config.json.
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 35ae92d605..4ff184547e 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -185,6 +185,9 @@ void GenerationConfig::validate() const {
                     "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined.");
     if (is_beam_search()) {
         OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive");
+        if (num_beam_groups > 1) {
+            OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, it it fallbacks to non-grouped beam search");
+        }
     } else {
         OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]");
         OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]");
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index b098f96fe6..82c0a17a55 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -89,15 +89,16 @@ class Tokenizer::TokenizerImpl {
 public:
     ov::CompiledModel m_tokenizer;
     ov::CompiledModel m_detokenizer;
-    
+
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_tokenizer;
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_detokenizer;
-    // To change the adding special tokens mode we use a statefull subgraph, 
+
+    // To change the adding special tokens mode we use a statefull subgraph,
     // this flag holds the current state value of the CompiledModel.
     bool m_add_special_tokens = true;
     bool m_skip_special_tokens = true;
     bool m_older_than_24_5 = false;
-    
+
     int64_t m_pad_token_id = -1;
     int64_t m_bos_token_id = -1;
     int64_t m_eos_token_id = -1;
@@ -111,6 +112,7 @@ class Tokenizer::TokenizerImpl {
     void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
         bool add_special_tokens_flag = m_add_special_tokens;
         bool skip_special_tokens_flag = m_skip_special_tokens;
+
         ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
         ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
 
@@ -126,11 +128,11 @@ class Tokenizer::TokenizerImpl {
             // state but the effect is incorrect.
             return;
         }
-        
+
         // add_special_tokens is managed by Select op with a bool input.
         ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {});
         *add_special_tensor.data<bool>() = add_special_tokens_flag;
-        
+
         // skip_special_tokens is managed by multiplication with a number, therefore i32.
         ov::Tensor skip_special_tensor = ov::Tensor(ov::element::i32, {1});
         *skip_special_tensor.data<int>() = skip_special_tokens_flag;
@@ -148,19 +150,19 @@ class Tokenizer::TokenizerImpl {
 
     TokenizerImpl() = default;
 
-    TokenizerImpl(const std::filesystem::path& models_papth,  const ov::AnyMap& properties) {
-        setupTokenizer(models_papth, properties);
+    TokenizerImpl(const std::filesystem::path& models_path,  const ov::AnyMap& properties) {
+        setup_tokenizer(models_path, properties);
     }
 
     TokenizerImpl(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models,  const ov::AnyMap& properties) {
-        setupTokenizer(models, properties);
+        setup_tokenizer(models, properties);
     }
 
-    void setupTokenizer(const std::filesystem::path& models_path,  const ov::AnyMap& properties) {
+    void setup_tokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
         ScopedVar env_manager(tokenizers_relative_to_genai().string());
         auto core = get_core_singleton();
 
-        OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_papth' parameter should be a path to a dir not a xml file");
+        OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_path' parameter should be a path to a dir not a xml file");
 
         std::shared_ptr<ov::Model> ov_tokenizer = nullptr;
         std::shared_ptr<ov::Model> ov_detokenizer = nullptr;
@@ -168,12 +170,12 @@ class Tokenizer::TokenizerImpl {
         if (std::filesystem::exists(models_path / "openvino_tokenizer.xml")) {
             ov_tokenizer = core.read_model(models_path / "openvino_tokenizer.xml");
         }
-        
+
         if (std::filesystem::exists(models_path / "openvino_detokenizer.xml")) {
             ov_detokenizer = core.read_model(models_path / "openvino_detokenizer.xml");
         }
 
-        setupTokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
+        setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
 
         // If special tokens were not found from IR, try to read them from config.
         // This will be triggered only for IRs older than 2024.3.
@@ -184,21 +186,20 @@ class Tokenizer::TokenizerImpl {
             // Try to read tokenizer_config if some token ids or token str are not defined.
             read_tokenizer_config_if_necessary(models_path);
         }
-        
+
         // If chat_template was not found in IR, try to read them from config.
         if (m_chat_template.empty()) {
             m_chat_template = chat_template_from_tokenizer_json_if_exists(models_path);
         }
     }
-    
 
-    void setupTokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models,  const ov::AnyMap& properties) {
+    void setup_tokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
         auto [ov_tokenizer, ov_detokenizer] = models;
         OPENVINO_ASSERT(ov_tokenizer || ov_detokenizer, "Neither tokenizer nor detokenzier models were provided");
 
         auto core = get_core_singleton();
         std::string device = "CPU"; // only CPU is supported for now
-        
+
         std::string version_str;
         utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str);
         // Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5
@@ -231,7 +232,7 @@ class Tokenizer::TokenizerImpl {
                     return std::move(this->m_detokenizer.create_infer_request());
                 });
         }
-        
+
         // Initialize tokenizer's cache to save time later.
         if (m_tokenizer) {
             // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
@@ -286,10 +287,11 @@ class Tokenizer::TokenizerImpl {
 
         nlohmann::json data = nlohmann::json::parse(f);
 
-        using ov::genai::utils::read_json_param;
         // they are in the format {"bos_token": { "content": "<s>",... }}
-        auto read_token_content_str = [&data](std::string key_name, std::string& val) {
-            if (val == "" && data.contains(key_name)) { read_json_param(data[key_name], "content", val); }
+        auto read_token_content_str = [&data](const std::string& key_name, std::string& val) {
+            if (val.empty() && data.contains(key_name)) {
+                utils::read_json_param(data[key_name], "content", val);
+            }
         };
         read_token_content_str(pad_token_key_name, m_pad_token);
         read_token_content_str(bos_token_key_name, m_bos_token);
@@ -494,7 +496,7 @@ class Tokenizer::TokenizerImpl {
             {"is none", "is undefined"},
             {"= none", "= undefined"},
             // Jinja2Cpp does not support Python-style slicing, e.g. [1:].
-            // If chat template contains such slicing, we replace it with 
+            // If chat template contains such slicing, we replace it with
             // a placeholder at the moment.
             {"messages[1:]", "slice(messages, 1)"},
         };
@@ -537,7 +539,7 @@ class Tokenizer::TokenizerImpl {
         env.GetSettings().trimBlocks = true;
         jinja2::Template tpl(&env);
         tpl.Load(chat_tpl);
-        
+
         jinja2::UserCallable slice_callable = jinja2::MakeCallable(
             [](const jinja2::GenericList& messages, const size_t& start) {
                 jinja2::ValuesList result;
@@ -607,7 +609,7 @@ Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, c
     ScopedVar env_manager(tokenizers_relative_to_genai().string());
     auto core = get_core_singleton();
     auto model = core.read_model(model_str, weights_tensor);
-    
+
     auto parameters = model->get_parameters();
     OPENVINO_ASSERT(!parameters.empty());
     if (parameters.front()->get_element_type() == ov::element::string) {
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 3d27b23052..8510a8389f 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -361,10 +361,10 @@ class ContinuousBatchingPipeline:
     This class is used for generation with LLMs with continuous batchig
     """
     @typing.overload
-    def __init__(self, models_path: str, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, models_path: os.PathLike, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None:
         ...
     @typing.overload
-    def __init__(self, models_path: str, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
         ...
     @typing.overload
     def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, sampling_params: GenerationConfig) -> GenerationHandle:
@@ -522,17 +522,17 @@ class FluxTransformer2DModel:
 class GenerationConfig:
     """
     
-        Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group 
-        and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will 
+        Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
+        and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
         be used while greedy and beam search parameters will not affect decoding at all.
     
-        Parameters: 
+        Parameters:
         max_length:    the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
                        max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
         max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+        min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
         ignore_eos:    if set to true, then generation will not stop even if <eos> token is met.
         eos_token_id:  token_id of <eos> (end of sentence)
-        min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
         stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
         include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
         stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
@@ -540,6 +540,10 @@ class GenerationConfig:
         logprobs:       number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
                         Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
     
+        repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+        presence_penalty: reduces absolute log prob if the token was generated at least once.
+        frequency_penalty: reduces absolute log prob as many times as the token was generated.
+    
         Beam search specific parameters:
         num_beams:         number of beams for beam search. 1 disables beam search.
         num_beam_groups:   number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -550,8 +554,8 @@ class GenerationConfig:
             length_penalty < 0.0 encourages shorter sequences.
         num_return_sequences: the number of sequences to return for grouped beam search decoding.
         no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
-        stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values: 
-            "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; 
+        stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values:
+            "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
             "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
             "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
     
@@ -560,7 +564,7 @@ class GenerationConfig:
         top_p:              if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
         top_k:              the number of highest probability vocabulary tokens to keep for top-k-filtering.
         do_sample:          whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
-        repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.    
+        num_return_sequences: the number of sequences to generate from a single prompt.
     """
     adapters: AdapterConfig | None
     assistant_confidence_threshold: float
@@ -951,17 +955,17 @@ class LLMPipeline:
             :rtype: DecodedResults, EncodedResults, str
          
          
-            Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group 
-            and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will 
+            Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
+            and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
             be used while greedy and beam search parameters will not affect decoding at all.
         
-            Parameters: 
+            Parameters:
             max_length:    the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
                            max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
             max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+            min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
             ignore_eos:    if set to true, then generation will not stop even if <eos> token is met.
             eos_token_id:  token_id of <eos> (end of sentence)
-            min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
             stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
             include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
             stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
@@ -969,6 +973,10 @@ class LLMPipeline:
             logprobs:       number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
                             Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
         
+            repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+            presence_penalty: reduces absolute log prob if the token was generated at least once.
+            frequency_penalty: reduces absolute log prob as many times as the token was generated.
+        
             Beam search specific parameters:
             num_beams:         number of beams for beam search. 1 disables beam search.
             num_beam_groups:   number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -979,8 +987,8 @@ class LLMPipeline:
                 length_penalty < 0.0 encourages shorter sequences.
             num_return_sequences: the number of sequences to return for grouped beam search decoding.
             no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
-            stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values: 
-                "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; 
+            stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values:
+                "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
                 "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
                 "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
         
@@ -989,7 +997,7 @@ class LLMPipeline:
             top_p:              if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
             top_k:              the number of highest probability vocabulary tokens to keep for top-k-filtering.
             do_sample:          whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
-            repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.    
+            num_return_sequences: the number of sequences to generate from a single prompt.
         """
     @typing.overload
     def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None:
@@ -1032,17 +1040,17 @@ class LLMPipeline:
             :rtype: DecodedResults, EncodedResults, str
          
          
-            Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group 
-            and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will 
+            Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
+            and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
             be used while greedy and beam search parameters will not affect decoding at all.
         
-            Parameters: 
+            Parameters:
             max_length:    the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
                            max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
             max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+            min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
             ignore_eos:    if set to true, then generation will not stop even if <eos> token is met.
             eos_token_id:  token_id of <eos> (end of sentence)
-            min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
             stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
             include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
             stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
@@ -1050,6 +1058,10 @@ class LLMPipeline:
             logprobs:       number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
                             Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
         
+            repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+            presence_penalty: reduces absolute log prob if the token was generated at least once.
+            frequency_penalty: reduces absolute log prob as many times as the token was generated.
+        
             Beam search specific parameters:
             num_beams:         number of beams for beam search. 1 disables beam search.
             num_beam_groups:   number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -1060,8 +1072,8 @@ class LLMPipeline:
                 length_penalty < 0.0 encourages shorter sequences.
             num_return_sequences: the number of sequences to return for grouped beam search decoding.
             no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
-            stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values: 
-                "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; 
+            stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values:
+                "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
                 "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
                 "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
         
@@ -1070,7 +1082,7 @@ class LLMPipeline:
             top_p:              if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
             top_k:              the number of highest probability vocabulary tokens to keep for top-k-filtering.
             do_sample:          whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
-            repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.    
+            num_return_sequences: the number of sequences to generate from a single prompt.
         """
     def get_generation_config(self) -> GenerationConfig:
         ...
@@ -1420,7 +1432,7 @@ class StopCriteria:
     """
     
         StopCriteria controls the stopping condition for grouped beam search.
-        
+    
         The following values are possible:
             "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates.
             "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates.
diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp
index 772ba0af8a..be7a72481f 100644
--- a/src/python/py_continuous_batching_pipeline.cpp
+++ b/src/python/py_continuous_batching_pipeline.cpp
@@ -212,7 +212,7 @@ void init_continuous_batching_pipeline(py::module_& m) {
             .def_readonly("max_cache_usage", &PipelineMetrics::max_cache_usage);
 
     py::class_<ContinuousBatchingPipeline>(m, "ContinuousBatchingPipeline", "This class is used for generation with LLMs with continuous batchig")
-        .def(py::init([](const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& llm_plugin_config, const std::map<std::string, py::object>& tokenizer_plugin_config) {
+        .def(py::init([](const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& llm_plugin_config, const std::map<std::string, py::object>& tokenizer_plugin_config) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
             return std::make_unique<ContinuousBatchingPipeline>(models_path, scheduler_config, device, pyutils::properties_to_any_map(llm_plugin_config), pyutils::properties_to_any_map(tokenizer_plugin_config));
         }),
@@ -222,7 +222,7 @@ void init_continuous_batching_pipeline(py::module_& m) {
         py::arg("properties") = ov::AnyMap({}),
         py::arg("tokenizer_properties") = ov::AnyMap({}))
 
-        .def(py::init([](const std::string& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
+        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
             return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::properties_to_any_map(plugin_config));
         }),
diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp
index b1a5c6cd2e..f49bcf29bd 100644
--- a/src/python/py_generation_config.cpp
+++ b/src/python/py_generation_config.cpp
@@ -20,7 +20,7 @@ namespace {
 
 auto stop_criteria_docstring =  R"(
     StopCriteria controls the stopping condition for grouped beam search.
-    
+
     The following values are possible:
         "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates.
         "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates.
@@ -30,17 +30,17 @@ auto stop_criteria_docstring =  R"(
 } // namespace
 
 char generation_config_docstring[] = R"(
-    Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group 
-    and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will 
+    Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
+    and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
     be used while greedy and beam search parameters will not affect decoding at all.
 
-    Parameters: 
+    Parameters:
     max_length:    the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
                    max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
     max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
     ignore_eos:    if set to true, then generation will not stop even if <eos> token is met.
     eos_token_id:  token_id of <eos> (end of sentence)
-    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
     stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
     include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
     stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
@@ -48,6 +48,10 @@ char generation_config_docstring[] = R"(
     logprobs:       number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
                     Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
 
+    repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+    presence_penalty: reduces absolute log prob if the token was generated at least once.
+    frequency_penalty: reduces absolute log prob as many times as the token was generated.
+
     Beam search specific parameters:
     num_beams:         number of beams for beam search. 1 disables beam search.
     num_beam_groups:   number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -58,8 +62,8 @@ char generation_config_docstring[] = R"(
         length_penalty < 0.0 encourages shorter sequences.
     num_return_sequences: the number of sequences to return for grouped beam search decoding.
     no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
-    stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values: 
-        "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; 
+    stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values:
+        "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
         "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
         "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
 
@@ -68,7 +72,7 @@ char generation_config_docstring[] = R"(
     top_p:              if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
     top_k:              the number of highest probability vocabulary tokens to keep for top-k-filtering.
     do_sample:          whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
-    repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.    
+    num_return_sequences: the number of sequences to generate from a single prompt.
 )";
 
 void init_generation_config(py::module_& m) {
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index cf5fbb3403..7e3c075405 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -42,13 +42,6 @@ def get_greedy_with_penalties() -> GenerationConfig:
     generation_config.max_new_tokens = 30
     return generation_config
 
-def get_greedy_with_min_and_max_tokens() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.min_new_tokens = 15
-    generation_config.max_new_tokens = 30
-    return generation_config
-
 def get_greedy_with_single_stop_string() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_return_sequences = 1
@@ -296,10 +289,12 @@ def convert_to_hf(
     kwargs['max_length'] = generation_config.max_length
     # has higher priority than 'max_length'
     kwargs['max_new_tokens'] = generation_config.max_new_tokens
+    kwargs['min_new_tokens'] = generation_config.min_new_tokens
     if generation_config.stop_strings:
         kwargs['stop_strings'] = generation_config.stop_strings
 
     # copy default parameters
+    kwargs['bos_token_id'] = default_generation_config.bos_token_id
     kwargs['eos_token_id'] = default_generation_config.eos_token_id
     kwargs['pad_token_id'] = default_generation_config.pad_token_id
     kwargs['repetition_penalty'] = generation_config.repetition_penalty
@@ -308,11 +303,12 @@ def convert_to_hf(
         # beam search case
         kwargs['num_beam_groups'] = generation_config.num_beam_groups
         kwargs['num_beams'] = generation_config.num_beams
-        kwargs['diversity_penalty'] = generation_config.diversity_penalty
         kwargs['length_penalty'] = generation_config.length_penalty
         kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size
         kwargs['num_return_sequences'] = generation_config.num_return_sequences
         kwargs['output_scores'] = True
+        if generation_config.num_beam_groups > 1:
+            kwargs['diversity_penalty'] = generation_config.diversity_penalty
     elif generation_config.do_sample:
         # mulitinomial
         kwargs['temperature'] = generation_config.temperature
@@ -328,7 +324,7 @@ def convert_to_hf(
 
 
 def run_hugging_face(
-    model,
+    opt_model,
     hf_tokenizer,
     prompts: List[str],
     generation_configs: List[GenerationConfig],
@@ -337,8 +333,9 @@ def run_hugging_face(
     for prompt, generation_config in zip(prompts, generation_configs):
         inputs = hf_tokenizer(prompt, return_tensors="pt")
         prompt_len = inputs['input_ids'].numel()
-        generate_outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=convert_to_hf(model.generation_config, generation_config),
-                                        return_dict_in_generate=True, tokenizer=hf_tokenizer)
+        generate_outputs = opt_model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'],
+                                              generation_config=convert_to_hf(opt_model.generation_config, generation_config),
+                                              return_dict_in_generate=True, tokenizer=hf_tokenizer)
         all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
 
         generation_result = GenerationResult()
@@ -349,7 +346,7 @@ def run_hugging_face(
         generation_results.append(generation_result)
 
     del hf_tokenizer
-    del model
+    del opt_model
 
     return generation_results
 
@@ -360,14 +357,14 @@ def run_continuous_batching(
     prompts: List[str],
     generation_configs : List[GenerationConfig]
 ) -> List[GenerationResult]:
-    pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {}, {})
+    pipe = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
     output = pipe.generate(prompts, generation_configs)
     del pipe
     shutil.rmtree(models_path)
     return output
 
 
-def get_models_list(file_name: str):
+def read_models_list(file_name: str):
     models = []
     with open(file_name) as f:
         for model_name in f:
@@ -395,6 +392,14 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge
         for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids):
             assert hf_text == ov_text
 
+
+def get_hugging_face_model_and_tokenizer(model_id: str, use_optimum = True):
+    hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \
+                AutoModelForCausalLM.from_pretrained(model_id)
+    return opt_model, hf_tokenizer
+
+
 def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path):
     model.save_pretrained(models_path)
     # convert tokenizers as well
@@ -404,23 +409,6 @@ def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path):
     serialize(tokenizer, models_path / "openvino_tokenizer.xml")
     serialize(detokenizer, models_path / "openvino_detokenizer.xml")
 
-def get_model_and_tokenizer(model_id: str, use_optimum = True):
-    hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \
-            AutoModelForCausalLM.from_pretrained(model_id)
-    return model, hf_tokenizer
-
-def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path):
-    use_optimum = True
-    models_path : Path = tmp_path / model_id
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum)
-
-    if use_optimum:
-        save_ov_model_from_optimum(model, hf_tokenizer, models_path)
-
-    hf_results = run_hugging_face(model=model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs)
-    _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config)
-
 
 def _generate_and_compare_with_reference_results(models_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig):
     ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs)
@@ -433,19 +421,32 @@ def _generate_and_compare_with_reference_results(models_path: Path, prompts: Lis
         compare_results(ref_result, ov_result, generation_config)
 
 
+def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path):
+    use_optimum = True
+    models_path : Path = tmp_path / model_id
+    opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum)
+
+    if use_optimum:
+        save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path)
+
+    hf_results = run_hugging_face(opt_model=opt_model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs)
+    _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config)
+
+
 def generate_and_compare_with_reference_text(models_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig):
     ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs)
 
     assert len(prompts) == len(reference_texts_per_prompt)
     assert len(prompts) == len(ov_results)
 
-    for prompt, ref_texts_for_this_prompt, ov_result, generation_config in zip(prompts, reference_texts_per_prompt, ov_results, generation_configs):
+    for prompt, ref_texts_for_this_prompt, ov_result in zip(prompts, reference_texts_per_prompt, ov_results):
         print(f"Prompt = {prompt}\nref text = {ref_texts_for_this_prompt}\nOV result = {ov_result.m_generation_ids}")
 
         assert len(ref_texts_for_this_prompt) == len(ov_result.m_generation_ids)
         for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids):
             assert ref_text == ov_text
 
+
 def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None):
     prompts, generation_configs = get_test_dataset()
     scheduler_config = get_scheduler_config(scheduler_params)
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 5f2702a774..87b2147bcd 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -57,33 +57,6 @@ def get_models_list():
     return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
 
 
-def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False):
-    precommit_models = [
-        "openai/whisper-tiny",
-        "openai/whisper-tiny.en",
-        "distil-whisper/distil-small.en",
-    ]
-    if multilingual:
-        precommit_models = ["openai/whisper-tiny"]
-    if en_only:
-        precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"]
-    if tiny_only:
-        precommit_models = ["openai/whisper-tiny"]
-
-    nightly_models = []
-
-    if pytest.run_marker == "precommit":
-        model_ids = precommit_models
-    else:
-        model_ids = nightly_models
-
-    if pytest.selected_model_ids:
-        model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
-
-    prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
-    return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
-
-
 def get_chat_models_list():
     precommit_models = [
         "Qwen/Qwen2-0.5B-Instruct",
@@ -101,90 +74,31 @@ def get_chat_models_list():
         model_ids = precommit_models
     else:
         model_ids = nightly_models
-    
+
     prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
     return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
 
 
-def get_chat_templates():
-    # Returns chat templates saved in tokenizer_configs.py, 
-    # but skips some models that currently are not processed correctly.
-
-    skipped_models = {
-        # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template.
-        # Need to enable and unskip, since it's preset in continuous batching and has >100 000 downloads.
-        "openchat/openchat-3.5-0106",
-        
-        # These models fail even on HF so no need to check if applying chat matches.
-        "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy",
-        "codellama/CodeLlama-34b-Instruct-hf",
-        "deepseek-ai/deepseek-math-7b-rl",
-        "allenai/tulu-2-7b",
-        "alexsobolev/IcaroLM",
-        "tokyotech-llm/Swallow-7b-instruct-v0.1",
-        "bofenghuang/vigogne-2-7b-chat",
-        "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k",
-        "AliAbdelrasheed/maqa_llama_4bit",
-        "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored",
-
-        # TODO: Need to support chat templates in more models: CVS-145963
-        # Either ov_genai is unable to parse chat_template or results do not match with HF.
-        "meta-llama/Meta-Llama-3-8B-Instruct",
-        "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp
-        "mosaicml/mpt-30b-chat",
-        "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp
-        "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp
-        "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp
-        "openchat/openchat-3.5-0106",
-        "casperhansen/llama-3-70b-instruct-awq",
-        "TheBloke/deepseek-coder-33B-instruct-GPTQ",
-        "AI-Sweden-Models/gpt-sw3-356m-instruct",
-        "google/gemma-7b-it",
-        "THUDM/cogvlm2-llama3-chat-19B",
-        "KnutJaegersberg/internlm-20b-llama",
-        "maywell/Synatra-Mixtral-8x7B",
-        "MediaTek-Research/Breeze-7B-Instruct-v1_0",
-        "bofenghuang/vigostral-7b-chat",
-        "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp
-        "openchat/openchat-3.6-8b-20240522",
-        "tenyx/TenyxChat-7B-v1",
-        "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2",
-        "yam-peleg/Hebrew-Gemma-11B-V2",
-        "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError
-        "nlpai-lab/KULLM3",
-        "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1",
-        "MediaTek-Research/Breeze-7B-Instruct-v0_1", 
-        "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError
-        "MLP-KTLim/llama-3-Korean-Bllossom-8B",
-        "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp
-        "codellama/CodeLlama-70b-Instruct-hf",
-        "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp
-        "BramVanroy/Llama-2-13b-chat-dutch"
-    }
-    from tokenizer_configs import get_tokenizer_configs
-    return [(k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models]
-
-
 @functools.lru_cache(1)
 def read_model(params, **tokenizer_kwargs):
     model_id, path = params
     
     from optimum.intel.openvino import OVModelForCausalLM
     from transformers import AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 
     if (path / "openvino_model.xml").exists():
         opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, 
                                                        compile=False, device='CPU')
     else:
-        ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, 
+        ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, 
                                                                              with_detokenizer=True,
                                                                              **tokenizer_kwargs)
         openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml")
         openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml")
         
         # to store tokenizer config jsons with special tokens
-        tokenizer.save_pretrained(path)
+        hf_tokenizer.save_pretrained(path)
         
         opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, 
                                                        compile=False, device='CPU', load_in_8bit=False)
@@ -195,7 +109,7 @@ def read_model(params, **tokenizer_kwargs):
     return (
         model_id,
         path,
-        tokenizer,
+        hf_tokenizer,
         opt_model,
         ov_genai.LLMPipeline(path, 'CPU', **{'ENABLE_MMAP': False}),
     )
@@ -256,20 +170,8 @@ def model_tokenizers_path_tmp_path(tmpdir_factory):
     yield model_id, Path(temp_path)
 
 
-def load_tok(configs: List[Tuple], temp_path):
-    # load Tokenizer where all configs are cleared.
-    # remove existing jsons from previous tests
-    for json_file in temp_path.glob("*.json"):
-        json_file.unlink()
-
-    for config_json, config_name in configs:
-        with (temp_path / config_name).open('w') as f:
-            json.dump(config_json, f)
-    return ov_genai.Tokenizer(temp_path)
-
-
-def load_pipe(configs: List[Tuple], temp_path):
-    # Load LLMPipline where all configs are cleared.
+def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):
+    # Load LLMPipeline where all configs are cleared.
     # remove existing jsons from previous tests
     for json_file in temp_path.glob("*.json"):
         json_file.unlink()
diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py
index 3c09d34756..d89697ba42 100644
--- a/tests/python_tests/test_cache_optimizations.py
+++ b/tests/python_tests/test_cache_optimizations.py
@@ -112,8 +112,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     scheduler_config_opt.enable_prefix_caching = enable_prefix_caching
 
     models_path = converted_model.models_path
-    model_cb_noopt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {})
-    model_cb_opt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config_opt, "CPU", {})
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU")
 
     tokenizer = converted_model.tokenizer
 
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
index d9661e538b..07b4f7c15f 100644
--- a/tests/python_tests/test_chat_generate_api.py
+++ b/tests/python_tests/test_chat_generate_api.py
@@ -4,24 +4,21 @@
 import openvino_genai as ov_genai
 import pytest
 from typing import Dict, Tuple
+
 from ov_genai_test_utils import (
-    get_models_list,
     get_chat_models_list,
     read_model,
-    load_tok,
-    model_tmp_path,
-    get_chat_templates,
     get_continuous_batching,
 )
 
 
-configs = [
+generation_configs = [
     dict(do_sample=False, max_new_tokens=20),
     dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0)
 ]
 
 
-quenstions = [
+questions = [
     '1+1=',
     'What is the previous answer?',
     'Why is the Sun yellow?',
@@ -29,7 +26,7 @@
 ]
 
 
-@pytest.mark.parametrize("generation_config", configs)
+@pytest.mark.parametrize("generation_config", generation_configs)
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
@@ -37,18 +34,18 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
     chat_history_hf = []
     chat_history_ov = []
     chat_prompt = ''
-    
+
     # Will set add_special_tokens=False inside pipeline when start_chat() is called.
     model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
 
-    pipe.start_chat()    
-    for prompt in quenstions:
+    pipe.start_chat()
+    for prompt in questions:
         chat_history_hf.append({'role': 'user', 'content': prompt})
         chat_history_ov.append({'role': 'user', 'content': prompt})
-        
+
         chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
         tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
-        
+
         answer = model_opt.generate(**tokenized, **generation_config)
         answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
         chat_history_hf.append({'role': 'assistant', 'content': answer_str})
@@ -57,14 +54,15 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
         chat_history_ov.append({'role': 'assistant', 'content': answer_ov})
 
     pipe.finish_chat()
-    
+
     if chat_history_ov != chat_history_hf:
         print(f'hf_output: {chat_history_hf}')
         print(f'ov_output: {chat_history_ov}')
+
     assert chat_history_ov == chat_history_hf
 
 
-@pytest.mark.parametrize("generation_config", configs)
+@pytest.mark.parametrize("generation_config", generation_configs)
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
@@ -73,172 +71,48 @@ def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict)
     chat_history_hf = []
     chat_history_ov = []
     chat_prompt = ''
-    
+
     # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True.
     # Need to regenerate openvino_tokenizer/detokenizer.
-    model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False)
-    
-    for prompt in quenstions:
+    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False)
+    ov_tokenizer = ov_pipe.get_tokenizer()
+
+    for prompt in questions:
         chat_history_hf.append({'role': 'user', 'content': prompt})
         chat_history_ov.append({'role': 'user', 'content': prompt})
-        
-        chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
-        tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
-        
+
+        chat_prompt = hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
+        tokenized = hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
+
         answer = model_opt.generate(**tokenized, **generation_config)
-        answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
+        answer_str = hf_tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
         chat_history_hf.append({'role': 'assistant', 'content': answer_str})
-        
-        chat_prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True)
-        answer_ov = pipe.generate(chat_prompt, **generation_config)
+
+        chat_prompt = ov_tokenizer.apply_chat_template(chat_history_ov, add_generation_prompt=True)
+        answer_ov = ov_pipe.generate(chat_prompt, **generation_config)
         chat_history_ov.append({'role': 'assistant', 'content': answer_ov})
-  
+
     if chat_history_ov != chat_history_hf:
         print(f'hf_output: {chat_history_hf}')
         print(f'ov_output: {chat_history_ov}')
+
     assert chat_history_ov == chat_history_hf
 
 
-@pytest.mark.parametrize("generation_config", configs)
-@pytest.mark.parametrize("model_descr", get_chat_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict):
-    # Check that when history is stored in KV cache results are the same as when history stored in a text.
-    device ='CPU'
-    
-    chat_history_with_kv_cache = []
-    chat_history_ov = []
-    
-    # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True.
-    # Need to regenerate openvino_tokenizer/detokenizer.
-    model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False)
-    pipe_with_kv_cache = ov_genai.LLMPipeline(path, device, **{"ENABLE_MMAP": False})
-  
-    pipe_with_kv_cache.start_chat()
-    for question in quenstions:
-        chat_history_with_kv_cache.append({'role': 'user', 'content': question})
-        answer = pipe_with_kv_cache.generate(question, **generation_config)
-        chat_history_with_kv_cache.append({'role': 'assistant', 'content': answer})
-        
-        chat_history_ov.append({'role': 'user', 'content': question})
-        prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True)
-        answer = pipe.generate(prompt, **generation_config)
-        chat_history_ov.append({'role': 'assistant', 'content': answer})
-    pipe_with_kv_cache.finish_chat()
-
-    if chat_history_ov != chat_history_with_kv_cache:
-        print(f'kvcache_hist: {chat_history_with_kv_cache}')
-        print(f'text_history: {chat_history_ov}')
-    assert chat_history_ov == chat_history_with_kv_cache
-
-
-conversation = [
-    {'role': 'user', 'content': '1+1='},
-    {'role': 'assistant', 'content': '1 + 1 = 2'},
-    {'role': 'user', 'content': 'What is the previous answer?'},
-    {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. Please ask me your next question.'},
-    {'role': 'user', 'content': 'Why is the sun yellow?'},
-    {'role': 'assistant', 'content': 'Because it emits yeloow light.'},
-    {'role': 'user', 'content': 'What was my first question?'},
-]
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.parametrize('chat_config', get_chat_templates())
-def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
-    tokenizer_config = chat_config[1]
-
-    # Will load openvino_model for tiny-random-phi as a placeholder
-    # but indeed only Tokenizer and apply_chat_template will be tested.
-    model_id, path, tokenizer, opt_model, pipe = read_model(get_models_list()[0])
-    
-    full_history_str_hf = tokenizer.apply_chat_template(conversation, 
-        add_generation_prompt=False,
-        tokenize=False,
-        **tokenizer_config)
-    
-    tok = load_tok([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1])
-    tok.set_chat_template(tokenizer_config['chat_template'])
-    full_history_str = tok.apply_chat_template(conversation, add_generation_prompt=False)
-    if full_history_str != full_history_str_hf:
-        print(f'hf reference: {full_history_str_hf}')
-        print(f'ov_genai out: {full_history_str}')
-    assert full_history_str == full_history_str_hf
-
-
-@pytest.mark.parametrize("generation_config", configs[1:])
+@pytest.mark.parametrize("generation_config", generation_configs[1:])
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
 def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict):
-    model_id, path, tokenizer, model, stateful = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-    cb = get_continuous_batching(path)
-    stateful.start_chat()
-    cb.start_chat()
-    for question in quenstions:
-        generated = cb.generate(question, **generation_config)
-        reference = stateful.generate(question, **generation_config)
-        assert generated == reference
-    # Test that finish_chat() doesn't fail just in case.
-    cb.finish_chat()
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_set_chat_template():
-    model_descr = get_chat_models_list()[0]
-    model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-    pipe.get_tokenizer().set_chat_template("{% for message in messages %}{{ message['content'] }}{% endfor %}")
-    config = ov_genai.GenerationConfig()
-    config.max_new_tokens = 1
-    config.do_sample = False
-    pipe.start_chat()
-    generated = pipe.generate("a", config)
-    pipe.finish_chat()
-    reference = pipe.generate("a", config)
-    assert generated == reference
+    model_id, path, hf_tokenizer, opt_model, ov_stateful_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    cb_pipe = get_continuous_batching(path)
 
-prompts = [
-    '1+1=',
-    'What is the previous answer?',
-    'Why is the Sun yellow?',
-    'What was my first question?',
-    ['Why is the Sun yellow?'],
-    "若我有一亿美元，在人工智能盛行的今天，我怎样投资才能收益最大化？",
-    "מחרוזת בדיקה",
-    "Multiline\nstring!\nWow!",
-]
+    ov_stateful_pipe.start_chat()
+    cb_pipe.start_chat()
 
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.parametrize("add_special_tokens", [True, False])
-@pytest.mark.parametrize("prompt", prompts)
-def test_add_special_tokens(add_special_tokens, prompt):
-    import numpy as np
-    model_descr = get_chat_models_list()[0]
-    model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-    genai_tokenzier = pipe.get_tokenizer()
-    
-    # Calling encode with add_special_tokens will set state flag.
-    res_genai = genai_tokenzier.encode(prompt, add_special_tokens).input_ids.data
-    res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
-    assert np.all(res_genai == res_hf)
+    for question in questions:
+        generated = cb_pipe.generate(question, **generation_config)
+        reference = ov_stateful_pipe.generate(question, **generation_config)
+        assert generated == reference
 
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.parametrize("add_special_tokens", [True, False])
-@pytest.mark.parametrize("skip_special_tokens", [True, False])
-@pytest.mark.parametrize("prompt", prompts)
-def test_add_special_tokens(add_special_tokens, skip_special_tokens, prompt):
-    import numpy as np
-    model_descr = get_chat_models_list()[0]
-    model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-    genai_tokenizer = pipe.get_tokenizer()
-    
-    # Calling encode with add_special_tokens will set state flag.
-    res_genai = genai_tokenizer.encode(prompt, add_special_tokens).input_ids.data
-    res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
-    assert np.all(res_genai == res_hf)
-    
-    # Decode with skip_special_tokens
-    decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens)[0]
-    decoded_hf = hf_tokenizer.decode(res_hf[0], skip_special_tokens=skip_special_tokens)
-    assert decoded_genai == decoded_hf
+    # Test that finish_chat() doesn't fail just in case.
+    cb_pipe.finish_chat()
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index 9bb9eff49c..824a3cca26 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -4,7 +4,6 @@
 import openvino_genai as ov_genai
 from openvino_genai import StopCriteria
 import pytest
-import transformers
 from typing import Union, List, Dict, Optional
 import numpy as np
 import openvino as ov
@@ -15,8 +14,7 @@
 from ov_genai_test_utils import (
     get_models_list, 
     read_model, 
-    load_pipe,
-    load_tok, 
+    load_genai_pipe_with_configs,
     model_tmp_path, 
     STOP_CRITERIA_MAP, 
     get_continuous_batching,
@@ -24,7 +22,7 @@
 
 
 def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]):
-    model_id, path, tokenizer, model, pipe = model_descr
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
     config = generation_config.copy()  # to avoid side effects
     num_beams = config['num_beams'] if 'num_beams' in config else 1
     config['num_return_sequences'] = num_beams
@@ -39,25 +37,25 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro
         # Do not apply 'repetition_penalty' if sampling is not used.
         config['do_sample'] = False
         config['repetition_penalty'] = 1.0 # 1.0 means no penalty
-    
+
     generation_config_hf = config.copy()
     if generation_config_hf.get('stop_criteria'):
         generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')]
     generation_config_hf.pop('ignore_eos', None)
 
     # Encode the batch of prompts
-    tokenizer.padding_side = "left"
-    encoded_prompts = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True)
+    hf_tokenizer.padding_side = "left"
+    encoded_prompts = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True)
     prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask']
-    
-    hf_encoded_outputs = model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf)
+
+    hf_encoded_outputs = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf)
 
     hf_outputs = []
     for idx, hf_encoded_out in enumerate(hf_encoded_outputs):
         prompt_count = idx // num_beams
-        hf_outputs.append(tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True))
+        hf_outputs.append(hf_tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True))
 
-    ov_outputs = pipe.generate(prompts, **config).texts
+    ov_outputs = ov_pipe.generate(prompts, **config).texts
 
     hf_outputs.sort()
     ov_outputs.sort()
@@ -67,8 +65,9 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro
             print(f'ov_output: {ov_output}')
         assert hf_output == ov_output
 
-def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str):
-    model_id, path, tokenizer, model, pipe = model_descr
+
+def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict, prompt: str):
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
 
     config = generation_config.copy()  # to avoid side effects
 
@@ -85,12 +84,12 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str
         generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')]
     generation_config_hf.pop('ignore_eos', None)
 
-    encoded_prompt = tokenizer([prompt], return_tensors='pt', add_special_tokens=True)
+    encoded_prompt = hf_tokenizer([prompt], return_tensors='pt', add_special_tokens=True)
     prompt_ids, attention_mask = encoded_prompt['input_ids'], encoded_prompt['attention_mask']
-    hf_encoded_output = model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf)
-    hf_output = tokenizer.decode(hf_encoded_output[0, prompt_ids.shape[1]:], skip_special_tokens=True)
+    hf_encoded_output = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf)
+    hf_output = hf_tokenizer.decode(hf_encoded_output[0, prompt_ids.shape[1]:], skip_special_tokens=True)
 
-    ov_output = pipe.generate(prompt, **config)
+    ov_output = ov_pipe.generate(prompt, **config)
     if config.get('num_return_sequences', 1) > 1:
         assert hf_output in ov_output.texts
     else:
@@ -100,14 +99,15 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str
 
         assert hf_output == ov_output
 
-def hf_ov_genai_tensors_comparison(
+
+def run_hf_ov_genai_comparison_encoded_inputs(
         model_descr, 
         generation_config: Dict, 
         input_ids: np.ndarray, 
         attention_mask: Optional[np.array] = None
     ):
     device = 'CPU'
-    model_id, path, tokenizer, model, pipe = model_descr
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
 
     config = generation_config.copy()  # to avoid side effects
 
@@ -131,10 +131,8 @@ def hf_ov_genai_tensors_comparison(
         inputs_hf = dict(inputs=torch.tensor(input_ids))
         inputs_ov = ov.Tensor(input_ids)
 
-    hf_output = model.generate(**inputs_hf, **generation_config_hf)
-
-    pipe = ov_genai.LLMPipeline(path, device)
-    ov_output = pipe.generate(inputs_ov, **config)
+    hf_output = opt_model.generate(**inputs_hf, **generation_config_hf)
+    ov_output = ov_pipe.generate(inputs_ov, **config)
 
     hf_res = hf_output[0, input_ids.shape[1]:].numpy()
     ov_res = np.array(ov_output.tokens, dtype=np.int64)
@@ -154,7 +152,8 @@ def hf_ov_genai_tensors_comparison(
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_decoding(model_descr, generation_config, prompt):
-    run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
+
 
 input_tensors_list = [
     # input_ids, attention_mask
@@ -165,62 +164,8 @@ def test_decoding(model_descr, generation_config, prompt):
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_ov_tensors(model_descr, inputs):
-    hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs)
-
-
-prompts = [
-    'table is made of',
-    '你好！ 你好嗎？',
-    'Alan Turing was a',
-    'The Sun is yellow because',
-    ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a']
-]
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_genai_tokenizer_encode(model_descr, prompt):
-    model_id, path, tokenizer, model, pipe = read_model(model_descr)
-    tok = pipe.get_tokenizer()
-    
-    encoded_ov = tok.encode(prompt).input_ids.data
-    if isinstance(prompt, list):
-        encoded_hf = tokenizer.batch_encode_plus(prompt)['input_ids']
-        for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf):
-            assert np.all(tokens_ov == tokens_hf)
-    else:
-        encoded_hf = tokenizer.encode(prompt)
-        assert np.all(encoded_hf == encoded_ov[0])
-
-encoded_prompts = [
-    [1, 1591, 338, 1754, 310],
-    [1, 17102,   323,  3864,   471,   263],
-    
-    # chineze characters
-    [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882],
-
-    # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer  after removing the last token
-    [3113, 264, 364, 267],
-
-    # batched tokens
-    [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102,   323,  3864,   471,   263]]
-]
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.parametrize("encoded_prompt", encoded_prompts)
-@pytest.mark.precommit
-def test_genai_tokenizer_decode(model_descr, encoded_prompt):
-    model_id, path, tokenizer, model, pipe = read_model(model_descr)
-    tok = pipe.get_tokenizer()
-    decoded_ov = tok.decode(encoded_prompt)
-    
-    if isinstance(encoded_prompt[0], list):
-        decoded_hf = tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True)
-        for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf):
-            assert np.all(tokens_ov == tokens_hf)
-    else:
-        decoded_hf = tokenizer.decode(encoded_prompt, skip_special_tokens=True)
-        assert decoded_hf == decoded_ov
+def test_encoded_inputs(model_descr, inputs):
+    run_hf_ov_genai_comparison_encoded_inputs(read_model(model_descr), dict(max_new_tokens=20), *inputs)
 
 
 test_configs = [
@@ -239,7 +184,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt):
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_multibatch(model_descr, generation_config, prompts):
+def test_batch_text_input(model_descr, generation_config, prompts):
     run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts)
 
 
@@ -261,7 +206,7 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
         num_return_sequences=num_beam_groups * group_size, 
         max_new_tokens=max_new_tokens, 
     )
-    run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
 
 
 @pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC])
@@ -283,7 +228,7 @@ def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
         max_new_tokens=max_new_tokens, 
         stop_criteria=stop_criteria,
     )
-    run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
 
 
 # test long sequences
@@ -302,7 +247,7 @@ def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size,
         num_return_sequences=num_beam_groups * group_size, 
         max_new_tokens=max_new_tokens, 
     )
-    run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
 
 
 @pytest.mark.parametrize("prompt", prompts)
@@ -317,17 +262,17 @@ def test_greedy_repetition_penalty(model_descr, prompt):
         max_new_tokens=20,
         do_sample=False
     )
-    run_hf_ov_genai_comparison((model_id, path, tokenizer, model, pipe), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt)
 
     generation_config = dict(
         repetition_penalty=1.0,
         max_new_tokens=20,
         do_sample=False
     )
-    run_hf_ov_genai_comparison((model_id, path, tokenizer, model, pipe), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt)
 
     ov_output = pipe.generate(prompt, **generation_config)
-    
+
     generation_config = dict(
         repetition_penalty=0.5,
         max_new_tokens=20,
@@ -346,19 +291,19 @@ def user_defined_callback(subword):
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_callback_one_string(callback):
-    pipe = read_model(get_models_list()[0])[4]
-    generation_config = pipe.get_generation_config()
+    ov_pipe = read_model(get_models_list()[0])[4]
+    generation_config = ov_pipe.get_generation_config()
     generation_config.max_new_tokens = 10
-    pipe.generate('table is made of', generation_config, callback)
+    ov_pipe.generate('table is made of', generation_config, callback)
 
 
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_callback_batch_fail(callback):
-    pipe = read_model(get_models_list()[0])[4]
+def test_callback_batch_throws(callback):
+    ov_pipe = read_model(get_models_list()[0])[4]
     with pytest.raises(RuntimeError):
-        pipe.generate(['1', '2'], ov_genai.GenerationConfig(), callback)
+        ov_pipe.generate(['1', '2'], ov_pipe.get_generation_config(), callback)
 
 
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@@ -368,24 +313,25 @@ def test_callback_kwargs_one_string(callback):
     pipe = read_model(get_models_list()[0])[4]
     pipe.generate('table is made of', max_new_tokens=10, streamer=callback)
 
+
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 @pytest.mark.precommit
 @pytest.mark.nightly
 @pytest.mark.parametrize("model_descr", get_models_list())
 def test_callback_decoding_metallama(model_descr, callback):
-    # On metallam this prompt generates output which can shorten after adding new tokens.
+    # On metallama this prompt generates output which can shorten after adding new tokens.
     # Test that streamer correctly handles such cases.
     prompt = 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature'
     if model_descr[0] != 'meta-llama/Meta-Llama-3-8B-Instruct':
         pytest.skip()
-    pipe = read_model(model_descr)[4]
-    pipe.generate(prompt, max_new_tokens=300, streamer=callback)
+    ov_pipe = read_model(model_descr)[4]
+    ov_pipe.generate(prompt, max_new_tokens=300, streamer=callback)
 
 
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_callback_kwargs_batch_fail(callback):
+def test_callback_kwargs_batch_throws(callback):
     pipe = read_model(get_models_list()[0])[4]
     with pytest.raises(RuntimeError):
         pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback)
@@ -408,200 +354,73 @@ def end(self):
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_streamer_one_string():
-    pipe = read_model(get_models_list()[0])[4]
-    generation_config = pipe.get_generation_config()
+    ov_pipe = read_model(get_models_list()[0])[4]
+    generation_config = ov_pipe.get_generation_config()
     generation_config.max_new_tokens = 10
-    printer = Printer(pipe.get_tokenizer())
-    pipe.generate('table is made of', generation_config, printer)
+    printer = Printer(ov_pipe.get_tokenizer())
+    ov_pipe.generate('table is made of', generation_config, printer)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_streamer_batch_fail():
-    pipe = read_model(get_models_list()[0])[4]
-    printer = Printer(pipe.get_tokenizer())
+def test_streamer_batch_throws():
+    ov_pipe = read_model(get_models_list()[0])[4]
+    printer = Printer(ov_pipe.get_tokenizer())
     with pytest.raises(RuntimeError):
-        pipe.generate(['1', '2'], ov_genai.GenerationConfig(), printer)
+        ov_pipe.generate(['1', '2'], ov_pipe.get_generation_config(), printer)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_streamer_kwargs_one_string():
-    pipe = read_model(get_models_list()[0])[4]
-    printer = Printer(pipe.get_tokenizer())
-    pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer)
+    ov_pipe = read_model(get_models_list()[0])[4]
+    printer = Printer(ov_pipe.get_tokenizer())
+    ov_pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_streamer_kwargs_batch_fail():
-    pipe = read_model(get_models_list()[0])[4]
-    printer = Printer(pipe.get_tokenizer())
+def test_streamer_kwargs_batch_throws():
+    ov_pipe = read_model(get_models_list()[0])[4]
+    printer = Printer(ov_pipe.get_tokenizer())
     with pytest.raises(RuntimeError):
-        pipe.generate('', num_beams=2, streamer=printer)
+        ov_pipe.generate('', num_beams=2, streamer=printer)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 def test_operator_with_callback_one_string(callback):
-    pipe = read_model(get_models_list()[0])[4]
-    ten_tokens = pipe.get_generation_config()
+    ov_pipe = read_model(get_models_list()[0])[4]
+    ten_tokens = ov_pipe.get_generation_config()
     ten_tokens.max_new_tokens = 10
-    pipe('talbe is made of', ten_tokens, callback)
+    ov_pipe('talbe is made of', ten_tokens, callback)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
-def test_operator_with_callback_batch_fail(callback):
-    pipe = read_model(get_models_list()[0])[4]
+def test_operator_with_callback_batch_throws(callback):
+    ov_pipe = read_model(get_models_list()[0])[4]
     with pytest.raises(RuntimeError):
-        pipe(['1', '2'], ov_genai.GenerationConfig(), callback)
+        ov_pipe(['1', '2'], ov_pipe.get_generation_config(), callback)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_operator_with_streamer_kwargs_one_string():
-    pipe = read_model(get_models_list()[0])[4]
-    printer = Printer(pipe.get_tokenizer())
-    pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer)
+    ov_pipe = read_model(get_models_list()[0])[4]
+    printer = Printer(ov_pipe.get_tokenizer())
+    ov_pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_operator_with_streamer_kwargs_batch_fail():
-    pipe = read_model(get_models_list()[0])[4]
-    printer = Printer(pipe.get_tokenizer())
+def test_operator_with_streamer_kwargs_batch_throws():
+    ov_pipe = read_model(get_models_list()[0])[4]
+    printer = Printer(ov_pipe.get_tokenizer())
     with pytest.raises(RuntimeError):
-        pipe('', num_beams=2, streamer=printer)
-
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_load_special_tokens_ids_1(model_tmp_path):
-    # test when there is an available config.json
-    config_json = { 
-        "pad_token_id": 422,
-        "bos_token_id": 42, 
-        "eos_token_id": 37,
-    }
-    tok = load_tok([(config_json, "config.json")], model_tmp_path[1])
-    assert tok.get_pad_token_id() == config_json['pad_token_id']
-    assert tok.get_bos_token_id() == config_json['bos_token_id']
-    assert tok.get_eos_token_id() == config_json['eos_token_id']
-
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_load_special_tokens_str_2(model_tmp_path):
-    # test with special_tokens_map
-    special_tokens_map_json = { 
-        "pad_token": {"content": "<custom_pad>"},
-        "bos_token": {"content": "<custom_bos>"},
-        "eos_token": {"content": "<custom_eos>"},
-    }
-    tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1])
-    assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"]
-    assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"]
-    assert tok.get_eos_token() == special_tokens_map_json['eos_token']["content"]
-
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons")
-def test_load_special_tokens_3_(model_tokenizers_path_tmp_path):
-    # special_tokens_map is not available 
-    # but tokenize_config.json exists
-    # will load both string and integer representations
-    tok_config_json = {
-        "added_tokens_decoder": {
-            "422": {"content": "<pad>"},
-            "37": {"content": "<s>"},
-            "42": {"content": "</s>"},
-        },
-        "pad_token": "<pad>",
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-    }
-
-    tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1])
-    assert tok.get_pad_token() == tok_config_json['pad_token']
-    assert tok.get_bos_token() == tok_config_json['bos_token']
-    assert tok.get_eos_token() == tok_config_json['eos_token']
-
-    assert tok.get_pad_token_id() == 422
-    assert tok.get_bos_token_id() == 37
-    assert tok.get_eos_token_id() == 42
-
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_load_special_tokens_3(model_tmp_path):
-    # both config.json is available and tokenizer_config.json available
-    # check that it does not read int values from tokenizer_config.json if they are in config.json
-    tok_config_json = {
-    "added_tokens_decoder": {
-        # integers differ from config.json to check they don't override config.json
-        "777": {"content": "<pad>"},
-        "888": {"content": "<s>"},
-        "656": {"content": "</s>"},
-    },
-    "pad_token": "<pad>",
-    "bos_token": "<s>",
-    "eos_token": "</s>",
-    }
-    config_json = { 
-        "pad_token_id": 422,
-        "bos_token_id": 42, 
-        "eos_token_id": 37,
-    }
-    configs = [
-        (tok_config_json, "tokenizer_config.json"),
-        (config_json, "config.json")
-    ]
-    tok = load_tok(configs, model_tmp_path[1])
-    assert tok.get_pad_token_id() == config_json['pad_token_id']
-    assert tok.get_bos_token_id() == config_json['bos_token_id']
-    assert tok.get_eos_token_id() == config_json['eos_token_id']
-
-    assert tok.get_pad_token() == tok_config_json['pad_token']
-    assert tok.get_bos_token() == tok_config_json['bos_token']
-    assert tok.get_eos_token() == tok_config_json['eos_token']
-
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.xfail(
-    raises=AssertionError, 
-    reason="CVS-143410 ov tokenizer should be aligned with hf",
-    strict=False,
-)
-def test_load_special_tokens_4(model_tmp_path):
-    # only string representation is provided, find token integers by inference
-    model_id, temp_path = model_tmp_path
-    tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    
-    special_tokens_map_json = {}
-    token_str_int_map = {}
-    special_token_names = ['pad_token', 'bos_token', 'eos_token']
-    for token_str in special_token_names:
-        if hasattr(tokenizer, token_str):
-            token_val = getattr(tokenizer, token_str)
-            special_tokens_map_json.update({token_str: {"content": token_val}})
-            token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0]
-            token_str_int_map.update({token_str: token_id})
-
-    # since only string representations are present in the json will try to get by inference
-    tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], temp_path)
-
-    # check ids inferred correctly for special tokens existing if HF tokenizer
-    if 'pad_token' in token_str_int_map:
-        assert tok.get_pad_token_id() == token_str_int_map['pad_token']
-    if 'bos_token' in token_str_int_map:
-        assert tok.get_bos_token_id() == token_str_int_map['bos_token']
-    if 'eos_token' in token_str_int_map:
-        assert tok.get_eos_token_id() == token_str_int_map['eos_token']
+        ov_pipe('', num_beams=2, streamer=printer)
 
 
 invalid_configs = [
@@ -617,23 +436,24 @@ def test_load_special_tokens_4(model_tmp_path):
 @pytest.mark.parametrize("generation_config", invalid_configs)
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_invalid_configs(model_tmp_path, generation_config):
+def test_invalid_generation_configs_throws(model_tmp_path, generation_config):
     model_id, temp_path = model_tmp_path
     config_json = {}
-    pipe = load_pipe([(config_json, "config.json")], temp_path)
+    ov_pipe = load_genai_pipe_with_configs([(config_json, "config.json")], temp_path)
     with pytest.raises(RuntimeError):
-        pipe.generate('blah blah', **generation_config)
+        ov_pipe.generate('blah blah', **generation_config)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_valid_configs(model_tmp_path):
     model_id, temp_path = model_tmp_path
-    pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path)
+    ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path)
 
     config = ov_genai.GenerationConfig()
     config.do_sample = True  # no eos_token_id but it's loaded from config.json
-    pipe.set_generation_config(config)
+    ov_pipe.set_generation_config(config)
+
 
 invalid_py_configs = [
     dict(num_beam_groups=3, num_beams=15, do_sample=True),
@@ -648,49 +468,48 @@ def test_valid_configs(model_tmp_path):
 @pytest.mark.precommit
 @pytest.mark.nightly
 @pytest.mark.parametrize("generation_config", invalid_py_configs)
-def test_python_generation_config_validation(model_tmp_path, generation_config):
+def test_python_generation_config_validation_throws(model_tmp_path, generation_config):
     model_id, temp_path = model_tmp_path
-    pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path)
-    
+    ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path)
+
     # 'unexisting_key_name' key validity is checked in pybind and ValueError will be returned
     #  instead of RuntimeError, which is returned when GenerationConfig values are validated
     return_exception_type = ValueError if 'unexisting_key_name' in generation_config else RuntimeError
     with pytest.raises(return_exception_type):
-        pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config))
+        ov_pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config))
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_unicode_pybind_decoding_1():
+def test_unicode_pybind_decoding_one_string():
     # On this model this prompt generates unfinished utf string.
     # Test that pybind will not fail.
     model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
-    pipe = read_model((model_id, path))[4]
-    res_str = pipe.generate(',', max_new_tokens=4)
+    ov_pipe = read_model((model_id, path))[4]
+    res_str = ov_pipe.generate(',', max_new_tokens=4)
     assert '�' == res_str[-1]
 
 
-
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_unicode_pybind_decoding_2():
+def test_unicode_pybind_decoding_batched():
     # On this model this prompt generates unfinished utf string.
     # Test that pybind will not fail.
     model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
-    pipe = read_model((model_id, path))[4]
-    res_str = pipe.generate([","], max_new_tokens=4)
+    ov_pipe = read_model((model_id, path))[4]
+    res_str = ov_pipe.generate([","], max_new_tokens=4)
     assert '�' == res_str.texts[0][-1]
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_unicode_pybind_decoding_3():
+def test_unicode_pybind_decoding_one_string_streamer():
     # On this model this prompt generates unfinished utf-8 string
     # and streams it. Test that pybind will not fail while we pass string to python.
     model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
-    pipe = read_model((model_id, path))[4]
+    ov_pipe = read_model((model_id, path))[4]
     res_str = []
-    pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x))
+    ov_pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x))
     assert '�' == res_str[-1]
 
 
@@ -741,22 +560,24 @@ def test_continuous_batching_vs_stateful(prompt, generation_config):
         for gen, ref in zip(generated.scores, reference.scores):
             assert math.isclose(gen, ref, abs_tol=0.0003)
 
+
 @pytest.mark.parametrize("prompt", prompts)
 @pytest.mark.precommit
 def test_cb_streamer_vs_return_vs_stateful(prompt):
-    model_id, path, tokenizer, model, stateful = read_model((
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((
         "facebook/opt-125m",
         Path("opt-125m")
     ))
-    cb = get_continuous_batching(path)
+    cb_pipe = get_continuous_batching(path)
     streamed = []
-    generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword))
-    reference = stateful.generate(prompt, max_new_tokens=20)
+    generated = cb_pipe.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword))
+    reference = ov_pipe.generate(prompt, max_new_tokens=20)
     assert generated == "".join(streamed)
     assert "".join(streamed) == reference
 
+
 def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics:
-    model_id, path, tokenizer, model, pipe = model_descr
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
 
     config = generation_config.copy()  # to avoid side effects
 
@@ -767,7 +588,7 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st
         # Do not apply 'repetition_penalty' if sampling is not used.
         config['do_sample'] = False
         config['repetition_penalty'] = 1.0 # 1.0 means no penalty
-    return pipe.generate([prompt], **config).perf_metrics
+    return ov_pipe.generate([prompt], **config).perf_metrics
 
 
 test_cases = [
@@ -851,19 +672,19 @@ def test_perf_metrics(model_descr, generation_config, prompt):
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_batch_switch():
-    pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    pipe.generate(["a"], max_new_tokens=2)
-    pipe.generate(["1", "2"], max_new_tokens=2)
+    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
+    ov_pipe.generate(["a"], max_new_tokens=2)
+    ov_pipe.generate(["1", "2"], max_new_tokens=2)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_stop_token_ids():
-    pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    res = pipe.generate(
+    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
+    res = ov_pipe.generate(
         ov.Tensor([(1,)]),
         max_new_tokens=3,
-        stop_token_ids={-1, 9935, pipe.get_tokenizer().get_eos_token_id()},
+        stop_token_ids={-1, 9935, ov_pipe.get_tokenizer().get_eos_token_id()},
         include_stop_str_in_output=False
     )
     assert 2 == len(res.tokens[0])
@@ -873,8 +694,8 @@ def test_stop_token_ids():
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_stop_strings():
-    pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    res = pipe.generate(
+    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
+    res = ov_pipe.generate(
         "",
         max_new_tokens=5,
         stop_strings={"ignored", "боль"}
diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_preemption.py
index 49d6c8f6b0..7c648e73dc 100644
--- a/tests/python_tests/test_preemption.py
+++ b/tests/python_tests/test_preemption.py
@@ -4,7 +4,7 @@
 import pytest
 
 from openvino_genai import GenerationConfig
-from common import get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \
+from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \
     get_scheduler_config, run_test_pipeline, get_beam_search, get_greedy, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
@@ -87,7 +87,7 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse):
         config.rng_seed = 0
         config.max_new_tokens = 30
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     models_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, models_path)
@@ -168,7 +168,7 @@ def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse):
     for config in generation_configs:
         config.rng_seed = 0
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     models_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, models_path)
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index d5df28bfd6..fbcce76bf7 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -10,7 +10,7 @@
 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer
 from typing import List, TypedDict
 
-from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \
+from common import run_test_pipeline, read_models_list, get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \
     generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \
     get_greedy_with_penalties, get_multinomial_temperature, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \
@@ -28,18 +28,18 @@
 
 
 @pytest.mark.precommit
-@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
+@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
 def test_sampling_precommit(tmp_path, model_id):
     run_test_pipeline(tmp_path, model_id)
 
 
 @pytest.mark.nightly
-@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly")))
+@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly")))
 def test_sampling_nightly(tmp_path, model_id):
     run_test_pipeline(tmp_path, model_id)
 
 @pytest.mark.real_models
-@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
+@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
 def test_real_models(tmp_path, model_id):
     run_test_pipeline(tmp_path, model_id)
 
@@ -313,7 +313,7 @@ def test_individual_generation_configs_random(tmp_path, test_struct: RandomSampl
     generation_config.rng_seed = 0
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     models_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, models_path)
@@ -337,12 +337,12 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche
     scheduler_config.max_num_batched_tokens = max_num_batched_tokens
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     model_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, model_path)
 
-    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config, "CPU", {})
+    pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
 
     outputs = pipe.generate(["What is OpenVINO?"], generation_configs)
     assert(len(outputs))
@@ -364,12 +364,12 @@ def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_t
     scheduler_config.max_num_batched_tokens = max_num_batched_tokens
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     model_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, model_path)
 
-    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config, "CPU", {})
+    pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
 
     outputs = pipe.generate(["What is OpenVINO?"], generation_configs)
     assert(len(outputs))
@@ -392,12 +392,12 @@ def test_post_oom_health(tmp_path, sampling_config):
     scheduler_config.num_kv_blocks = 10
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     models_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, models_path)
 
-    pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), Tokenizer(models_path.absolute().as_posix()), scheduler_config, "CPU", {})
+    pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")
     # First run should return incomplete response
     output = pipe.generate(["What is OpenVINO?"], generation_configs)
     assert (len(output))
diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py
new file mode 100644
index 0000000000..0c2a106d50
--- /dev/null
+++ b/tests/python_tests/test_tokenizer.py
@@ -0,0 +1,360 @@
+# Copyright (C) 2023-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+import numpy as np
+from transformers import AutoTokenizer
+from typing import Dict, Tuple, List
+import openvino_genai
+import json
+
+from ov_genai_test_utils import (
+    get_models_list,
+    get_chat_models_list,
+    read_model,
+    model_tmp_path
+)
+
+
+def load_genai_tokenizer_with_configs(configs: List[Tuple], temp_path):
+    # load Tokenizer where all configs are cleared.
+    # remove existing jsons from previous tests
+    for json_file in temp_path.glob("*.json"):
+        json_file.unlink()
+
+    for config_json, config_name in configs:
+        with (temp_path / config_name).open('w') as f:
+            json.dump(config_json, f)
+    return openvino_genai.Tokenizer(temp_path)
+
+
+def get_chat_templates():
+    # Returns chat templates saved in tokenizer_configs.py, 
+    # but skips some models that currently are not processed correctly.
+
+    skipped_models = {
+        # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template.
+        # Need to enable and unskip, since it's preset in continuous batching and has >100 000 downloads.
+        "openchat/openchat-3.5-0106",
+        
+        # These models fail even on HF so no need to check if applying chat matches.
+        "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy",
+        "codellama/CodeLlama-34b-Instruct-hf",
+        "deepseek-ai/deepseek-math-7b-rl",
+        "allenai/tulu-2-7b",
+        "alexsobolev/IcaroLM",
+        "tokyotech-llm/Swallow-7b-instruct-v0.1",
+        "bofenghuang/vigogne-2-7b-chat",
+        "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k",
+        "AliAbdelrasheed/maqa_llama_4bit",
+        "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored",
+
+        # TODO: Need to support chat templates in more models: CVS-145963
+        # Either ov_genai is unable to parse chat_template or results do not match with HF.
+        "meta-llama/Meta-Llama-3-8B-Instruct",
+        "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp
+        "mosaicml/mpt-30b-chat",
+        "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp
+        "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp
+        "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp
+        "openchat/openchat-3.5-0106",
+        "casperhansen/llama-3-70b-instruct-awq",
+        "TheBloke/deepseek-coder-33B-instruct-GPTQ",
+        "AI-Sweden-Models/gpt-sw3-356m-instruct",
+        "google/gemma-7b-it",
+        "THUDM/cogvlm2-llama3-chat-19B",
+        "KnutJaegersberg/internlm-20b-llama",
+        "maywell/Synatra-Mixtral-8x7B",
+        "MediaTek-Research/Breeze-7B-Instruct-v1_0",
+        "bofenghuang/vigostral-7b-chat",
+        "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp
+        "openchat/openchat-3.6-8b-20240522",
+        "tenyx/TenyxChat-7B-v1",
+        "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2",
+        "yam-peleg/Hebrew-Gemma-11B-V2",
+        "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError
+        "nlpai-lab/KULLM3",
+        "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1",
+        "MediaTek-Research/Breeze-7B-Instruct-v0_1", 
+        "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError
+        "MLP-KTLim/llama-3-Korean-Bllossom-8B",
+        "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp
+        "codellama/CodeLlama-70b-Instruct-hf",
+        "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp
+        "BramVanroy/Llama-2-13b-chat-dutch"
+    }
+
+    from tokenizer_configs import get_tokenizer_configs
+    return [(k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models]
+
+
+prompts = [
+    'table is made of',
+    '你好！ 你好嗎？',
+    'Alan Turing was a',
+    'The Sun is yellow because',
+    ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a']
+]
+@pytest.mark.parametrize("model_descr", get_models_list())
+@pytest.mark.parametrize("prompt", prompts)
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_encode(model_descr, prompt):
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr)
+    ov_tokenizer = ov_pipe.get_tokenizer()
+
+    encoded_ov = ov_tokenizer.encode(prompt).input_ids.data
+    if isinstance(prompt, list):
+        encoded_hf = hf_tokenizer.batch_encode_plus(prompt)['input_ids']
+        for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf):
+            assert np.all(tokens_ov == tokens_hf)
+    else:
+        encoded_hf = hf_tokenizer.encode(prompt)
+        assert np.all(encoded_hf == encoded_ov[0])
+
+
+encoded_prompts = [
+    [1, 1591, 338, 1754, 310],
+    [1, 17102,   323,  3864,   471,   263],
+
+    # chineze characters
+    [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882],
+
+    # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer  after removing the last token
+    [3113, 264, 364, 267],
+
+    # batched tokens
+    [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102,   323,  3864,   471,   263]]
+]
+@pytest.mark.parametrize("model_descr", get_models_list())
+@pytest.mark.parametrize("encoded_prompt", encoded_prompts)
+@pytest.mark.precommit
+def test_decode(model_descr, encoded_prompt):
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr)
+    ov_tokenizer = ov_pipe.get_tokenizer()
+    decoded_ov = ov_tokenizer.decode(encoded_prompt)
+
+    if isinstance(encoded_prompt[0], list):
+        decoded_hf = hf_tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True)
+        for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf):
+            assert np.all(tokens_ov == tokens_hf)
+    else:
+        decoded_hf = hf_tokenizer.decode(encoded_prompt, skip_special_tokens=True)
+        assert decoded_hf == decoded_ov
+
+
+conversation = [
+    {'role': 'user', 'content': '1+1='},
+    {'role': 'assistant', 'content': '1 + 1 = 2'},
+    {'role': 'user', 'content': 'What is the previous answer?'},
+    {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. Please ask me your next question.'},
+    {'role': 'user', 'content': 'Why is the sun yellow?'},
+    {'role': 'assistant', 'content': 'Because it emits yeloow light.'},
+    {'role': 'user', 'content': 'What was my first question?'},
+]
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize('chat_config', get_chat_templates())
+def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
+    tokenizer_config = chat_config[1]
+
+    # Will load openvino_model for tiny-random-phi as a placeholder
+    # but indeed only Tokenizer and apply_chat_template will be tested.
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(get_models_list()[0])
+
+    hf_full_history_str = hf_tokenizer.apply_chat_template(conversation,
+        add_generation_prompt=False,
+        tokenize=False,
+        **tokenizer_config)
+
+    ov_tokenizer = load_genai_tokenizer_with_configs([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1])
+    ov_tokenizer.set_chat_template(tokenizer_config['chat_template'])
+    ov_full_history_str = ov_tokenizer.apply_chat_template(conversation, add_generation_prompt=False)
+
+    if ov_full_history_str != hf_full_history_str:
+        print(f'hf reference: {hf_full_history_str}')
+        print(f'ov_genai out: {ov_full_history_str}')
+    assert ov_full_history_str == hf_full_history_str
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_set_chat_template():
+    model_descr = get_chat_models_list()[0]
+    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+
+    prompt = "how are you?"
+    dummy_conversation = [
+        {'role': 'user', 'content': prompt},
+    ]
+
+    ov_tokenizer = ov_pipe.get_tokenizer()
+    identity_chat_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}"
+
+    templated_prompt_inline = ov_tokenizer.apply_chat_template(dummy_conversation, add_generation_prompt=False, chat_template=identity_chat_template)
+
+    ov_tokenizer.set_chat_template(identity_chat_template)
+    templated_prompt = ov_tokenizer.apply_chat_template(dummy_conversation, add_generation_prompt=False)
+
+    assert templated_prompt_inline == templated_prompt
+    assert prompt == templated_prompt
+
+
+prompts = [
+    '1+1=',
+    'What is the previous answer?',
+    'Why is the Sun yellow?',
+    'What was my first question?',
+    ['Why is the Sun yellow?'],
+    "若我有一亿美元，在人工智能盛行的今天，我怎样投资才能收益最大化？",
+    "מחרוזת בדיקה",
+    "Multiline\nstring!\nWow!",
+]
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("add_special_tokens", [True, False])
+@pytest.mark.parametrize("skip_special_tokens", [True, False])
+@pytest.mark.parametrize("prompt", prompts)
+def test_encode_decode_with_special_tokens_option(add_special_tokens, skip_special_tokens, prompt):
+    import numpy as np
+    model_descr = get_chat_models_list()[0]
+    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    ov_tokenzier = ov_pipe.get_tokenizer()
+
+    # Calling encode with 'add_special_tokens' will set state flag.
+    ov_res = ov_tokenzier.encode(prompt, add_special_tokens=add_special_tokens).input_ids.data
+    hf_res = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
+    assert np.all(ov_res == hf_res)
+
+    # Decode with 'skip_special_tokens'
+    decoded_genai = ov_tokenzier.decode(ov_res, skip_special_tokens=skip_special_tokens)[0]
+    decoded_hf = hf_tokenizer.decode(hf_res[0], skip_special_tokens=skip_special_tokens)
+    assert decoded_genai == decoded_hf
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_load_special_tokens_from_config_json(model_tmp_path):
+    # test when there is an available config.json
+    config_json = {
+        "pad_token_id": 422,
+        "bos_token_id": 42,
+        "eos_token_id": 37,
+    }
+    tok = load_genai_tokenizer_with_configs([(config_json, "config.json")], model_tmp_path[1])
+    assert tok.get_pad_token_id() == config_json['pad_token_id']
+    assert tok.get_bos_token_id() == config_json['bos_token_id']
+    assert tok.get_eos_token_id() == config_json['eos_token_id']
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_load_special_tokens_from_special_tokens_map_json(model_tmp_path):
+    # test with special_tokens_map
+    special_tokens_map_json = {
+        "pad_token": {"content": "<custom_pad>"},
+        "bos_token": {"content": "<custom_bos>"},
+        "eos_token": {"content": "<custom_eos>"},
+    }
+    tok = load_genai_tokenizer_with_configs([(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1])
+    assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"]
+    assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"]
+    assert tok.get_eos_token() == special_tokens_map_json['eos_token']["content"]
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons")
+def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_path_tmp_path):
+    # special_tokens_map is not available
+    # but tokenize_config.json exists
+    # will load both string and integer representations
+    tok_config_json = {
+        "added_tokens_decoder": {
+            "422": {"content": "<pad>"},
+            "37": {"content": "<s>"},
+            "42": {"content": "</s>"},
+        },
+        "pad_token": "<pad>",
+        "bos_token": "<s>",
+        "eos_token": "</s>",
+    }
+
+    tok = load_genai_tokenizer_with_configs([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1])
+    assert tok.get_pad_token() == tok_config_json['pad_token']
+    assert tok.get_bos_token() == tok_config_json['bos_token']
+    assert tok.get_eos_token() == tok_config_json['eos_token']
+
+    assert tok.get_pad_token_id() == 422
+    assert tok.get_bos_token_id() == 37
+    assert tok.get_eos_token_id() == 42
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_load_special_tokens_from_tokenizer_config_and_config_json(model_tmp_path):
+    # both config.json is available and tokenizer_config.json available
+    # check that it does not read int values from tokenizer_config.json if they are in config.json
+    tok_config_json = {
+    "added_tokens_decoder": {
+        # integers differ from config.json to check they don't override config.json
+        "777": {"content": "<pad>"},
+        "888": {"content": "<s>"},
+        "656": {"content": "</s>"},
+    },
+    "pad_token": "<pad>",
+    "bos_token": "<s>",
+    "eos_token": "</s>",
+    }
+    config_json = {
+        "pad_token_id": 422,
+        "bos_token_id": 42,
+        "eos_token_id": 37,
+    }
+    configs = [
+        (tok_config_json, "tokenizer_config.json"),
+        (config_json, "config.json")
+    ]
+    tok = load_genai_tokenizer_with_configs(configs, model_tmp_path[1])
+    assert tok.get_pad_token_id() == config_json['pad_token_id']
+    assert tok.get_bos_token_id() == config_json['bos_token_id']
+    assert tok.get_eos_token_id() == config_json['eos_token_id']
+
+    assert tok.get_pad_token() == tok_config_json['pad_token']
+    assert tok.get_bos_token() == tok_config_json['bos_token']
+    assert tok.get_eos_token() == tok_config_json['eos_token']
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.xfail(
+    raises=AssertionError,
+    reason="CVS-143410 ov tokenizer should be aligned with hf",
+    strict=False,
+)
+def test_load_special_tokens_from_special_tokens_map_json_with_string_repr(model_tmp_path):
+    # only string representation is provided, find token integers by inference
+    model_id, temp_path = model_tmp_path
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+
+    special_tokens_map_json = {}
+    token_str_int_map = {}
+    special_token_names = ['pad_token', 'bos_token', 'eos_token']
+    for token_str in special_token_names:
+        if hasattr(tokenizer, token_str):
+            token_val = getattr(tokenizer, token_str)
+            special_tokens_map_json.update({token_str: {"content": token_val}})
+            token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0]
+            token_str_int_map.update({token_str: token_id})
+
+    # since only string representations are present in the json will try to get by inference
+    tok = load_genai_tokenizer_with_configs([(special_tokens_map_json, "special_tokens_map.json")], temp_path)
+
+    # check ids inferred correctly for special tokens existing if HF tokenizer
+    if 'pad_token' in token_str_int_map:
+        assert tok.get_pad_token_id() == token_str_int_map['pad_token']
+    if 'bos_token' in token_str_int_map:
+        assert tok.get_bos_token_id() == token_str_int_map['bos_token']
+    if 'eos_token' in token_str_int_map:
+        assert tok.get_eos_token_id() == token_str_int_map['eos_token']
+
diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py
index 1450ef1f2e..aa78666e32 100644
--- a/tests/python_tests/test_whisper_generate_api.py
+++ b/tests/python_tests/test_whisper_generate_api.py
@@ -6,7 +6,6 @@
 import pytest
 import openvino_tokenizers
 import openvino
-from ov_genai_test_utils import get_whisper_models_list
 import datasets
 from transformers import WhisperProcessor, pipeline, AutoTokenizer
 from optimum.intel.openvino import OVModelForSpeechSeq2Seq
@@ -15,6 +14,8 @@
 import time
 import typing
 import numpy as np
+import os
+import pathlib
 
 @pytest.fixture(scope="class", autouse=True)
 def run_gc_after_test():
@@ -25,6 +26,34 @@ def run_gc_after_test():
     yield
     gc.collect()
 
+
+def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False):
+    precommit_models = [
+        "openai/whisper-tiny",
+        "openai/whisper-tiny.en",
+        "distil-whisper/distil-small.en",
+    ]
+    if multilingual:
+        precommit_models = ["openai/whisper-tiny"]
+    if en_only:
+        precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"]
+    if tiny_only:
+        precommit_models = ["openai/whisper-tiny"]
+
+    nightly_models = []
+
+    if pytest.run_marker == "precommit":
+        model_ids = precommit_models
+    else:
+        model_ids = nightly_models
+
+    if pytest.selected_model_ids:
+        model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
+
+    prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
+    return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
+
+
 # used whisper models are relatively small
 # cache them in memory to speedup tests
 @functools.lru_cache(3)
diff --git a/tests/python_tests/tokenizer_configs.py b/tests/python_tests/tokenizer_configs.py
index 45d60f998d..2b51dc2b0d 100644
--- a/tests/python_tests/tokenizer_configs.py
+++ b/tests/python_tests/tokenizer_configs.py
@@ -2,1011 +2,1011 @@
 def get_tokenizer_configs():
     return {
         "meta-llama/Meta-Llama-3-8B-Instruct": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "TheBloke/Mistral-7B-OpenOrca-GPTQ": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "TinyLlama/TinyLlama-1.1B-Chat-v1.0": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "upstage/SOLAR-10.7B-Instruct-v1.0": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}"
         },
         "Nondzu/zephyr-speakleash-010-pl-3072-32-16-0.01": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+             "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+                "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "Qwen/Qwen1.5-0.5B": {
-        "bos_token": None,
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": None,
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "Felladrin/Llama-68M-Chat-v1": {
-        "bos_token": "<|im_start|>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|im_end|>",
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<|im_start|>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|im_end|>",
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "databricks/dbrx-instruct": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|pad|>",
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks \u2014 remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|pad|>",
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks \u2014 remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}"
         },
         "speakleash/Bielik-7B-Instruct-v0.1": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + eos_token }}{% endif %}{% endfor %}"
         },
         "internlm/internlm2-chat-7b": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "Qwen/Qwen2-7B-Instruct": {
-        "bos_token": None,
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": None,
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "codellama/CodeLlama-34b-Instruct-hf": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+                "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "OpenBuddy/openbuddy-llama3-8b-v21.1-8k": {
-        "bos_token": None,
-        "eos_token": "<|end|>",
-        "pad_token": "<|pad|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{{'<|role|>' + message['role'] + '<|says|>' + message['content'] + '<|end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|role|>assistant<|says|>' }}{% endif %}"
+            "bos_token": None,
+            "eos_token": "<|end|>",
+            "pad_token": "<|pad|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{{'<|role|>' + message['role'] + '<|says|>' + message['content'] + '<|end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|role|>assistant<|says|>' }}{% endif %}"
         },
         "mosaicml/mpt-30b-chat": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": None,
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% elif (message['role'] == 'assistant') %}{% endif %}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": None,
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% elif (message['role'] == 'assistant') %}{% endif %}{% endfor %}"
         },
         "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "deepseek-ai/deepseek-coder-6.7b-instruct": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "<|EOT|>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": {
-        "__type": "AddedToken",
-        "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "<|EOT|>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": {
+                "__type": "AddedToken",
+                "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
         },
         "deepseek-ai/deepseek-math-7b-rl": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": {
-        "__type": "AddedToken",
-        "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": {
+                "__type": "AddedToken",
+                "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "unk_token": None,
+             "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
         },
         "FINGU-AI/FinguAI-Chat-v1": {
-        "bos_token": None,
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": None,
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "allenai/tulu-2-7b": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+                "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "maldv/winter-garden-7b-alpha": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{{bos_token}}{% for message in messages %}{% if 'name' in message %}{{message['name'] + ('' if 'to' not in message else ' (to ' + message['to'] + ')') + ': ' + message['content'] + '\n\n'}}{% else %}{{message['content'] + '\n\n '}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{{bos_token}}{% for message in messages %}{% if 'name' in message %}{{message['name'] + ('' if 'to' not in message else ' (to ' + message['to'] + ')') + ': ' + message['content'] + '\n\n'}}{% else %}{{message['content'] + '\n\n '}}{% endif %}{% endfor %}"
         },
         "mlabonne/NeuralMonarch-7B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}"
         },
         "meta-llama/Llama-2-7b-chat-hf": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "GritLM/GritLM-7B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<s>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<s>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "ishorn5/RTLCoder-Deepseek-v1.1": {
-        "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "unk_token": None,
-        "chat_template": "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n"
+            "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+            "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "unk_token": None,
+            "chat_template": "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n"
         },
         "jondurbin/bagel-34b-v0.2": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}"
         },
         "openchat/openchat-3.5-0106": {
-        "bos_token": "<s>",
-        "eos_token": "<|end_of_turn|>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|end_of_turn|>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}"
         },
         "mobiuslabsgmbh/aanaphi2-v0.1": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "[PAD]",
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'### Human: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{'### Assistant: '  + message['content'].strip() + '\n'}}{% endif %}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "[PAD]",
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'### Human: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{'### Assistant: '  + message['content'].strip() + '\n'}}{% endif %}{% endfor %}"
         },
         "typeof/mistral-60m": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}"
         },
         "turboderp/Cat-Llama-3-70B-instruct": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nBelow is a conversation between a curious user and a helpful AI assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nBelow is a conversation between a curious user and a helpful AI assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "saltlux/Ko-Llama3-Luxia-8B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}"
         },
         "h2oai/h2o-danube2-1.8b-chat": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}"
         },
         "abhishek/autotrain-llama3-70b-orpo-v1": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<pad>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<pad>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"
         },
         "casperhansen/llama-3-70b-instruct-awq": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
         },
         "01-ai/Yi-1.5-34B-Chat": {
-        "bos_token": "<|startoftext|>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<|startoftext|>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
         },
         "allenai/OLMo-7B-Instruct": {
-        "bos_token": None,
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|padding|>",
-        "unk_token": None,
-        "chat_template": "{{ eos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": None,
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|padding|>",
+            "unk_token": None,
+            "chat_template": "{{ eos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "TheBloke/deepseek-coder-33B-instruct-GPTQ": {
-        "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "eos_token": "<|EOT|>",
-        "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "unk_token": None,
-        "chat_template": "{%- set found_item = false -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set found_item = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n"
+            "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+            "eos_token": "<|EOT|>",
+            "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "unk_token": None,
+            "chat_template": "{%- set found_item = false -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set found_item = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n"
         },
         "cognitivecomputations/dolphin-2.8-mistral-7b-v02": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "alexsobolev/IcaroLM": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "tokyotech-llm/Swallow-7b-instruct-v0.1": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = '\u3042\u306a\u305f\u306f\u8aa0\u5b9f\u3067\u512a\u79c0\u306a\u65e5\u672c\u4eba\u306e\u30a2\u30b7\u30b9\u30bf\u30f3\u30c8\u3067\u3059\u3002' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ''  + content.strip() + '' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+            "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = '\u3042\u306a\u305f\u306f\u8aa0\u5b9f\u3067\u512a\u79c0\u306a\u65e5\u672c\u4eba\u306e\u30a2\u30b7\u30b9\u30bf\u30f3\u30c8\u3067\u3059\u3002' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ''  + content.strip() + '' + eos_token }}{% endif %}{% endfor %}"
         },
         "instructlab/merlinite-7b-lab": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|pad|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>'+ '\n' + message['content'] + '\n'}}{% elif message['role'] == 'user' %}{{'<|user|>' + '\n' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|pad|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>'+ '\n' + message['content'] + '\n'}}{% elif message['role'] == 'user' %}{{'<|user|>' + '\n' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}{% endif %}{% endfor %}"
         },
         "microsoft/Phi-3-medium-128k-instruct": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|placeholder6|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|placeholder6|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
         },
         "katuni4ka/tiny-random-phi3": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
         },
         "microsoft/Phi-3-mini-128k-instruct": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|placeholder6|>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|placeholder6|>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
         },
         "VAGOsolutions/SauerkrautLM-Qwen-32b": {
-        "bos_token": None,
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{% set system_message = 'Du bist ein freundlicher und hilfsbereiter KI-Assistent.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
+            "bos_token": None,
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{% set system_message = 'Du bist ein freundlicher und hilfsbereiter KI-Assistent.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
         },
         "AI-Sweden-Models/gpt-sw3-356m-instruct": {
-        "bos_token": None,
-        "eos_token": None,
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{{ eos_token }}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}{% else %}{{ 'Bot: ' + message['content']}}{% endif %}{{ message['text'] }}{{ bos_token }}{% endfor %}Bot:"
+            "bos_token": None,
+            "eos_token": None,
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{{ eos_token }}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}{% else %}{{ 'Bot: ' + message['content']}}{% endif %}{{ message['text'] }}{{ bos_token }}{% endfor %}Bot:"
         },
         "google/gemma-7b-it": {
-        "bos_token": "<bos>",
-        "eos_token": "<eos>",
-        "pad_token": "<pad>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
+            "bos_token": "<bos>",
+            "eos_token": "<eos>",
+            "pad_token": "<pad>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
         },
         "ise-uiuc/Magicoder-S-DS-6.7B": {
-        "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "unk_token": None,
-        "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n        {{ raise_exception('System messages are not allowed in this template.') }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'@@ Instruction\n' + message['content'] + '\n\n'}}\n        {%- else %}\n{{'@@ Response\n' + message['content'] + eos_token + '\n\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'@@ Response\n'}}"
+            "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+            "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "unk_token": None,
+            "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n        {{ raise_exception('System messages are not allowed in this template.') }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'@@ Instruction\n' + message['content'] + '\n\n'}}\n        {%- else %}\n{{'@@ Response\n' + message['content'] + eos_token + '\n\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'@@ Response\n'}}"
         },
         "Deci/DeciLM-7B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\n'  + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\n'  + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}"
         },
         "katuni4ka/tiny-random-minicpm": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\u7528\u6237>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\u7528\u6237>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"
         },
         "UnicomLLM/Unichat-llama3-Chinese-8B-28K": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = message['content']  %}{% if loop.index0 == 0  %}{% set content =bos_token + content %}{% endif %}{% if loop.index0 ==1 %}{% set content =  'Human:' + content %}{% endif %}{% if loop.index0 %2!=0 and loop.index0 !=1 %}{% set content =  bos_token+'Human:' + content %}{% endif %}{% if loop.index0 !=0 and loop.index0 %2==0 and  not loop.last %}{% set content = 'Assistant:'+content+ eos_token %}{% endif %}{{ content+'\n' }}{% endfor %}{{ 'Assistant:' }}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = message['content']  %}{% if loop.index0 == 0  %}{% set content =bos_token + content %}{% endif %}{% if loop.index0 ==1 %}{% set content =  'Human:' + content %}{% endif %}{% if loop.index0 %2!=0 and loop.index0 !=1 %}{% set content =  bos_token+'Human:' + content %}{% endif %}{% if loop.index0 !=0 and loop.index0 %2==0 and  not loop.last %}{% set content = 'Assistant:'+content+ eos_token %}{% endif %}{{ content+'\n' }}{% endfor %}{{ 'Assistant:' }}"
         },
         "RLHFlow/LLaMA3-SFT": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] + '<|eot_id|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] + '<|eot_id|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}{% endif %}"
         },
         "bofenghuang/vigogne-2-7b-chat": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|system|>: ' + system_message + '\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>: ' + message['content'].strip() + '\\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>: ' + message['content'].strip() + eos_token + '\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>:' }}{% endif %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+            "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|system|>: ' + system_message + '\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>: ' + message['content'].strip() + '\\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>: ' + message['content'].strip() + eos_token + '\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>:' }}{% endif %}"
         },
         "aisingapore/sea-lion-7b-instruct": {
-        "bos_token": None,
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|padding|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}### USER:\n{{ message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}### RESPONSE:\n{{ message['content'] + '\n\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}### RESPONSE:\n{% endif %}"
+            "bos_token": None,
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|padding|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}### USER:\n{{ message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}### RESPONSE:\n{{ message['content'] + '\n\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}### RESPONSE:\n{% endif %}"
         },
         "microsoft/Phi-3-small-8k-instruct": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"
         },
         "THUDM/cogvlm2-llama3-chat-19B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}"
         },
         "tiiuae/falcon-11B": {
-        "bos_token": ">>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User: \n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ 'System: ' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Falcon:\n'  + message['content']}}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Falcon:' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": ">>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User: \n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ 'System: ' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Falcon:\n'  + message['content']}}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Falcon:' }}\n{% endif %}\n{% endfor %}"
         },
         "Mihaiii/Pallas-0.5": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'SYSTEM:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'USER:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'ASSISTANT:\n'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ 'ASSISTANT:\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'SYSTEM:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'USER:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'ASSISTANT:\n'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ 'ASSISTANT:\n' }}{% endif %}{% endfor %}"
         },
         "prithivida/Asimov-7B-v2": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'### ' + message['role'] + ': ' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '### Assistant: ' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'### ' + message['role'] + ': ' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '### Assistant: ' }}{% endif %}"
         },
         "dreamgen/opus-v1.2-7b": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>'}}{% if message['role']=='assistant' %}{{'text'}}{% else %}{{message['role']}}{% endif %}{{'\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>text\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>'}}{% if message['role']=='assistant' %}{{'text'}}{% else %}{{message['role']}}{% endif %}{{'\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>text\n' }}{% endif %}"
         },
         "KnutJaegersberg/internlm-20b-llama": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '<eoh>\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:'  + message['content'] + '<eoa>\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '<eoh>\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:'  + message['content'] + '<eoa>\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}"
         },
         "alpindale/WizardLM-2-8x22B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' }}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' }}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}"
         },
         "yentinglin/Taiwan-LLM-7B-v2.0-base": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = '\u4f60\u662f\u4eba\u5de5\u667a\u6167\u52a9\u7406\uff0c\u4ee5\u4e0b\u662f\u7528\u6236\u548c\u4eba\u5de5\u667a\u80fd\u52a9\u7406\u4e4b\u9593\u7684\u5c0d\u8a71\u3002\u4f60\u8981\u5c0d\u7528\u6236\u7684\u554f\u984c\u63d0\u4f9b\u6709\u7528\u3001\u5b89\u5168\u3001\u8a73\u7d30\u548c\u79ae\u8c8c\u7684\u56de\u7b54\u3002' %}{% endif %}{{system_message + eos_token}}{% for message in loop_messages %}{% if message['role'] == 'user' %}USER: {{ message['content'].strip() + eos_token }}{% elif message['role'] == 'system' %}{{message['content'].strip() + eos_token}}{% elif message['role'] == 'assistant' %}ASSISTANT: {{ message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'ASSISTANT:'}}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = '\u4f60\u662f\u4eba\u5de5\u667a\u6167\u52a9\u7406\uff0c\u4ee5\u4e0b\u662f\u7528\u6236\u548c\u4eba\u5de5\u667a\u80fd\u52a9\u7406\u4e4b\u9593\u7684\u5c0d\u8a71\u3002\u4f60\u8981\u5c0d\u7528\u6236\u7684\u554f\u984c\u63d0\u4f9b\u6709\u7528\u3001\u5b89\u5168\u3001\u8a73\u7d30\u548c\u79ae\u8c8c\u7684\u56de\u7b54\u3002' %}{% endif %}{{system_message + eos_token}}{% for message in loop_messages %}{% if message['role'] == 'user' %}USER: {{ message['content'].strip() + eos_token }}{% elif message['role'] == 'system' %}{{message['content'].strip() + eos_token}}{% elif message['role'] == 'assistant' %}ASSISTANT: {{ message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'ASSISTANT:'}}{% endif %}"
         },
         "maywell/Synatra-Mixtral-8x7B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}"
         },
         "MediaTek-Research/Breeze-7B-Instruct-v1_0": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "MTSAIR/multi_verse_model": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] + '\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% elif message['role'] == 'system' %}{{ '### System:\n' + message['content'] + '\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] + '\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% elif message['role'] == 'system' %}{{ '### System:\n' + message['content'] + '\n' }}{% endif %}{% endfor %}"
         },
         "bofenghuang/vigostral-7b-chat": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "SeaLLMs/SeaLLM-7B-v2.5": {
-        "bos_token": "<bos>",
-        "eos_token": "<eos>",
-        "pad_token": "<pad>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<eos>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<bos>",
+            "eos_token": "<eos>",
+            "pad_token": "<pad>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<eos>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "qnguyen3/Master-Yi-9B": {
-        "bos_token": "<|startoftext|>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<|startoftext|>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
         },
         "meetkai/functionary-small-v2.5": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + 'name=' + message['name'] + '\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '<|reserved_special_token_249|>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + 'name=' + message['name'] + '\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '<|reserved_special_token_249|>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "h2oai/h2o-danube-1.8b-chat": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}"
         },
         "TheBloke/CodeLlama-70B-Instruct-AWQ": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'].strip() %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'].strip() %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}"
         },
         "FairMind/Phi-3-mini-4k-instruct-bnb-4bit-Ita": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] in ['user', 'system']) %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] in ['user', 'system']) %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
         },
         "ibm-granite/granite-8b-code-instruct": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n'  + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n'  + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}"
         },
         "dicta-il/dictalm2.0-instruct": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "nvidia/Llama3-ChatQA-1.5-8B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}{% set loop_messages = messages[1:] %}{%- else -%}{% set loop_messages = messages %}{% endif %}System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.\n\n{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '\n\n' }}{%- else -%}Assistant: {{ message['content'].strip() + '\n\n' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}{% set loop_messages = messages[1:] %}{%- else -%}{% set loop_messages = messages %}{% endif %}System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.\n\n{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '\n\n' }}{%- else -%}Assistant: {{ message['content'].strip() + '\n\n' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %}"
         },
         "openchat/openchat-3.6-8b-20240522": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] in ['user', 'assistant'] %}{% set content = '<|start_header_id|>GPT4 Correct ' + message['role'].title() + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% elif message['role'] == 'system' %}{% set content = '<|start_header_id|>System<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% else %}{{ raise_exception('Only user, assistant and system roles are supported!') }}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] in ['user', 'assistant'] %}{% set content = '<|start_header_id|>GPT4 Correct ' + message['role'].title() + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% elif message['role'] == 'system' %}{% set content = '<|start_header_id|>System<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% else %}{{ raise_exception('Only user, assistant and system roles are supported!') }}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if loop.last %}{{ 'Assistant: ' + message['content']}}{% else %}{{ 'Assistant: ' + message['content'] + eos_token + '\n' }}{% endif %}{% elif message['role'] == 'system' %}{{ message['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ 'Assistant:' }}{% endif %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+            "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if loop.last %}{{ 'Assistant: ' + message['content']}}{% else %}{{ 'Assistant: ' + message['content'] + eos_token + '\n' }}{% endif %}{% elif message['role'] == 'system' %}{{ message['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ 'Assistant:' }}{% endif %}"
         },
         "tenyx/TenyxChat-7B-v1": {
-        "bos_token": "<s>",
-        "eos_token": "<|end_of_turn|>",
-        "pad_token": "<|end_of_turn|>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }} {% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User:' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 'System:' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Assistant:'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Assistant:' }}{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|end_of_turn|>",
+            "pad_token": "<|end_of_turn|>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }} {% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User:' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 'System:' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Assistant:'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Assistant:' }}{% endif %}\n{% endfor %}"
         },
         "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% if bos_token|length > 0 %}{{ '\n' }}{% endif %}{% if messages[0]['role'] != 'system' %}{{ '### Instruction:\nYou are an unbiased, uncensored, helpful assistant.' }}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\n### Input:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '\n\n### Response:\n' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, assistant, and system roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% if bos_token|length > 0 %}{{ '\n' }}{% endif %}{% if messages[0]['role'] != 'system' %}{{ '### Instruction:\nYou are an unbiased, uncensored, helpful assistant.' }}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\n### Input:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '\n\n### Response:\n' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, assistant, and system roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}"
         },
         "SeaLLMs/SeaLLM-7B-v2": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '</s>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '</s>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|im_end|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|im_end|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}"
         },
         "vaiv/llamion-14b-chat": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
         },
         "yam-peleg/Hebrew-Gemma-11B-V2": {
-        "bos_token": "<bos>",
-        "eos_token": "<eos>",
-        "pad_token": "<pad>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
+            "bos_token": "<bos>",
+            "eos_token": "<eos>",
+            "pad_token": "<pad>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
         },
         "shenzhi-wang/Llama3-8B-Chinese-Chat": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|eot_id|>",
-        "unk_token": None,
-        "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|eot_id|>",
+            "unk_token": None,
+            "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "ericzzz/falcon-rw-1b-chat": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": None,
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '[RESP] '  + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": None,
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '[RESP] '  + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}"
         },
         "NLPark/AnFeng_v3_Avocet": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"
         },
         "microsoft/Phi-3-vision-128k-instruct": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
         },
         "jphme/em_german_leo_mistral": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{message['content'] + ' '}}{% elif message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' '}}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'] + ' '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: '}}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{message['content'] + ' '}}{% elif message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' '}}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'] + ' '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: '}}{% endif %}"
         },
         "nlpai-lab/KULLM3": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = \"\ub2f9\uc2e0\uc740 \uace0\ub824\ub300\ud559\uad50 NLP&AI \uc5f0\uad6c\uc2e4\uc5d0\uc11c \ub9cc\ub4e0 AI \ucc57\ubd07\uc785\ub2c8\ub2e4. \ub2f9\uc2e0\uc758 \uc774\ub984\uc740 'KULLM'\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub85c\ub294 '\uad6c\ub984'\uc744 \ub73b\ud569\ub2c8\ub2e4. \ub2f9\uc2e0\uc740 \ube44\ub3c4\ub355\uc801\uc774\uac70\ub098, \uc131\uc801\uc774\uac70\ub098, \ubd88\ubc95\uc801\uc774\uac70\ub098 \ub610\ub294 \uc0ac\ud68c \ud1b5\ub150\uc801\uc73c\ub85c \ud5c8\uc6a9\ub418\uc9c0 \uc54a\ub294 \ubc1c\uc5b8\uc740 \ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. \uc0ac\uc6a9\uc790\uc640 \uc990\uac81\uac8c \ub300\ud654\ud558\uba70, \uc0ac\uc6a9\uc790\uc758 \uc751\ub2f5\uc5d0 \uac00\ub2a5\ud55c \uc815\ud655\ud558\uace0 \uce5c\uc808\ud558\uac8c \uc751\ub2f5\ud568\uc73c\ub85c\uc368 \ucd5c\ub300\ud55c \ub3c4\uc640\uc8fc\ub824\uace0 \ub178\ub825\ud569\ub2c8\ub2e4. \uc9c8\ubb38\uc774 \uc774\uc0c1\ud558\ub2e4\uba74, \uc5b4\ub5a4 \ubd80\ubd84\uc774 \uc774\uc0c1\ud55c\uc9c0 \uc124\uba85\ud569\ub2c8\ub2e4. \uac70\uc9d3 \uc815\ubcf4\ub97c \ubc1c\uc5b8\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud569\ub2c8\ub2e4.\" %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]'}}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = \"\ub2f9\uc2e0\uc740 \uace0\ub824\ub300\ud559\uad50 NLP&AI \uc5f0\uad6c\uc2e4\uc5d0\uc11c \ub9cc\ub4e0 AI \ucc57\ubd07\uc785\ub2c8\ub2e4. \ub2f9\uc2e0\uc758 \uc774\ub984\uc740 'KULLM'\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub85c\ub294 '\uad6c\ub984'\uc744 \ub73b\ud569\ub2c8\ub2e4. \ub2f9\uc2e0\uc740 \ube44\ub3c4\ub355\uc801\uc774\uac70\ub098, \uc131\uc801\uc774\uac70\ub098, \ubd88\ubc95\uc801\uc774\uac70\ub098 \ub610\ub294 \uc0ac\ud68c \ud1b5\ub150\uc801\uc73c\ub85c \ud5c8\uc6a9\ub418\uc9c0 \uc54a\ub294 \ubc1c\uc5b8\uc740 \ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. \uc0ac\uc6a9\uc790\uc640 \uc990\uac81\uac8c \ub300\ud654\ud558\uba70, \uc0ac\uc6a9\uc790\uc758 \uc751\ub2f5\uc5d0 \uac00\ub2a5\ud55c \uc815\ud655\ud558\uace0 \uce5c\uc808\ud558\uac8c \uc751\ub2f5\ud568\uc73c\ub85c\uc368 \ucd5c\ub300\ud55c \ub3c4\uc640\uc8fc\ub824\uace0 \ub178\ub825\ud569\ub2c8\ub2e4. \uc9c8\ubb38\uc774 \uc774\uc0c1\ud558\ub2e4\uba74, \uc5b4\ub5a4 \ubd80\ubd84\uc774 \uc774\uc0c1\ud55c\uc9c0 \uc124\uba85\ud569\ub2c8\ub2e4. \uac70\uc9d3 \uc815\ubcf4\ub97c \ubc1c\uc5b8\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud569\ub2c8\ub2e4.\" %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]'}}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1": {
-        "bos_token": "<bos>",
-        "eos_token": "<eos>",
-        "pad_token": "<pad>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}"
+            "bos_token": "<bos>",
+            "eos_token": "<eos>",
+            "pad_token": "<pad>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}"
         },
         "MediaTek-Research/Breeze-7B-Instruct-v0_1": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }}{{ system_message }}   {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }}{{ system_message }}   {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "microsoft/DialoGPT-large": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": None,
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": None,
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
         },
         "meta-llama/Meta-Llama-Guard-2-8B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{% if messages|length % 2 == 0 %}{% set role = 'Agent' %}{% else %}{% set role = 'User' %}{% endif %}{{ \"[INST] Task: Check if there is unsafe content in '\" + role + \"' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Specialized Advice. \nS6: Privacy. \nS7: Intellectual Property. \nS8: Indiscriminate Weapons. \nS9: Hate. \nS10: Self-Harm. \nS11: Sexual Content. \n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\n\" }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{% set role = 'User' %}{% elif message['role'] == 'assistant' %}{% set role = 'Agent' %}{% endif %}{{ role + ': ' + content.strip() + '\n\n' }}{% endfor %}{{ \"<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST \" + role + \" message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]\" }}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{% if messages|length % 2 == 0 %}{% set role = 'Agent' %}{% else %}{% set role = 'User' %}{% endif %}{{ \"[INST] Task: Check if there is unsafe content in '\" + role + \"' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Specialized Advice. \nS6: Privacy. \nS7: Intellectual Property. \nS8: Indiscriminate Weapons. \nS9: Hate. \nS10: Self-Harm. \nS11: Sexual Content. \n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\n\" }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{% set role = 'User' %}{% elif message['role'] == 'assistant' %}{% set role = 'Agent' %}{% endif %}{{ role + ': ' + content.strip() + '\n\n' }}{% endfor %}{{ \"<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST \" + role + \" message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]\" }}"
         },
         "chinoll/Yi-6b-200k-dpo": {
-        "bos_token": "<|startoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|Human|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|System|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|Assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|Assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<|startoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|Human|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|System|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|Assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|Assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "shanchen/llama3-8B-slerp-biomed-chat-chinese": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|eot_id|>",
-        "unk_token": None,
-        "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are Llama3-8B-Chinese-Chat-v2, finetuned from Llama3-8B-Instruct on Chinese-English dataset using the ORPO algorithm. You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|eot_id|>",
+            "unk_token": None,
+            "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are Llama3-8B-Chinese-Chat-v2, finetuned from Llama3-8B-Instruct on Chinese-English dataset using the ORPO algorithm. You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "MLP-KTLim/llama-3-Korean-Bllossom-8B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "UnfilteredAI/UNfilteredAI-1B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}{% endfor %}"
         },
         "abacusai/Smaug-Mixtral-v0.1": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{%if message['content'][0] == '$' %} {% endif %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{%if message['content'][0] == '$' %} {% endif %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "ProbeMedicalYonseiMAILab/medllama3-v20": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|eot_id|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\nHuman: ' + message['content'] +  eos_token }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: '  + message['content'] +  eos_token  }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant: ' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|eot_id|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\nHuman: ' + message['content'] +  eos_token }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: '  + message['content'] +  eos_token  }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant: ' }}{% endif %}"
         },
         "vinai/PhoGPT-4B-Chat": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<pad>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'user' and loop.first %}{{ '### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '\n### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '\n### Tr\u1ea3 l\u1eddi: ' + message['content'] + eos_token }}{% endif %}{% if loop.last %}{% if message['role'] == 'user' and add_generation_prompt %}{{ '\n### Tr\u1ea3 l\u1eddi:' }}{% endif %}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<pad>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'user' and loop.first %}{{ '### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '\n### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '\n### Tr\u1ea3 l\u1eddi: ' + message['content'] + eos_token }}{% endif %}{% if loop.last %}{% if message['role'] == 'user' and add_generation_prompt %}{{ '\n### Tr\u1ea3 l\u1eddi:' }}{% endif %}{% endif %}{% endfor %}"
         },
         "lucyknada/microsoft_WizardLM-2-7B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token + (messages[0]['content'].strip() + '\n\n' if messages[0]['role'] == 'system' else '') }}{% for message in (messages[1:] if messages[0]['role'] == 'system' else messages) %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token + (messages[0]['content'].strip() + '\n\n' if messages[0]['role'] == 'system' else '') }}{% for message in (messages[1:] if messages[0]['role'] == 'system' else messages) %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}{% endfor %}"
         },
         "bigcode/starcoder2-15b-instruct-v0.1": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": None,
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n        {{ raise_exception('System messages are not allowed in this template.') }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction\n' + message['content'] + '\n\n'}}\n        {%- else %}\n{{'### Response\n' + message['content'] + eos_token + '\n\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response\n'}}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": None,
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n        {{ raise_exception('System messages are not allowed in this template.') }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction\n' + message['content'] + '\n\n'}}\n        {%- else %}\n{{'### Response\n' + message['content'] + eos_token + '\n\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response\n'}}"
         },
         "AliAbdelrasheed/maqa_llama_4bit": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|reserved_special_token_250|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% elif message['from'] == 'gpt' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% else %}{{ '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|reserved_special_token_250|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% elif message['from'] == 'gpt' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% else %}{{ '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "lightonai/alfred-40b-1023": {
-        "bos_token": None,
-        "eos_token": "<end_message>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<start_user>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'system' %}{{ '<start_system>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'assistant' %}{{ '<start_assistant>'  + message['content'] + '<end_message>' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<start_assistant>' }}{% endif %}{% endfor %}"
+            "bos_token": None,
+            "eos_token": "<end_message>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<start_user>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'system' %}{{ '<start_system>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'assistant' %}{{ '<start_assistant>'  + message['content'] + '<end_message>' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<start_assistant>' }}{% endif %}{% endfor %}"
         },
         "aloobun/CosmicBun-8B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}"
         },
         "Undi95/Mixtral-8x7B-MoE-RP-Story": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n"
         },
         "TIGER-Lab/MAmmoTH2-8B-Plus": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|eot_id|>",
-        "unk_token": None,
-        "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|begin_of_text|>' + '<|start_header_id|>system<|end_header_id|>\\n\\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\\n\\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|eot_id|>",
+            "unk_token": None,
+            "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|begin_of_text|>' + '<|start_header_id|>system<|end_header_id|>\\n\\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\\n\\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}"
         },
         "codellama/CodeLlama-70b-Instruct-hf": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'] | trim %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'] | trim %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}"
         },
         "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "[control_768]",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '<s>' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ ' [INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "[control_768]",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '<s>' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ ' [INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}"
         },
         "gorilla-llm/gorilla-openfunctions-v2": {
-        "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "eos_token": "<|EOT|>",
-        "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
+            "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+            "eos_token": "<|EOT|>",
+            "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
         },
         "ghost-x/ghost-7b-alpha": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'plugins' %}\n{{ '<|plugins|>\n'  + message['content'] + '\n\nStandards for using the tool must comply with the following syntax:\n[execute]({\"type\": string, \"function\": string, \"arguments\": object})' + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% elif message['role'] == 'execute' %}\n{{ '<|assistant|>\n[execute]('  + message['content'] + ')<//>' + eos_token }}\n{% elif message['role'] == 'response' %}\n{{ '<|tool|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'plugins' %}\n{{ '<|plugins|>\n'  + message['content'] + '\n\nStandards for using the tool must comply with the following syntax:\n[execute]({\"type\": string, \"function\": string, \"arguments\": object})' + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% elif message['role'] == 'execute' %}\n{{ '<|assistant|>\n[execute]('  + message['content'] + ')<//>' + eos_token }}\n{% elif message['role'] == 'response' %}\n{{ '<|tool|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "winninghealth/WiNGPT2-Llama-3-8B-Chat": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System\uff1a{% endif %}{% if message['role'] == 'user' %}User\uff1a{% endif %}{% if message['role'] == 'assistant' %}Assistant\uff1a{% endif %}{{ message['content'] }}<|end_of_text|>\n {% endfor %}Assistant\uff1a"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System\uff1a{% endif %}{% if message['role'] == 'user' %}User\uff1a{% endif %}{% if message['role'] == 'assistant' %}Assistant\uff1a{% endif %}{{ message['content'] }}<|end_of_text|>\n {% endfor %}Assistant\uff1a"
         },
         "BramVanroy/Llama-2-13b-chat-dutch": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "THUDM/chatglm3-6b": {
-        "bos_token": None,
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}"
+            "bos_token": None,
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}"
         },
         "microsoft/Phi-3-mini-4k-instruct": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"
         },
         "mistralai/Mistral-7B-Instruct-v0.1": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n        {%- else %}\n            {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- ' ' + message['content'] + eos_token}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n    {%- endif %}\n{%- endfor %}\n"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n        {%- else %}\n            {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- ' ' + message['content'] + eos_token}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n    {%- endif %}\n{%- endfor %}\n"
         },
         "meta-llama/Meta-Llama-3.1-8B-Instruct": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0]['content']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n    {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n    {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n    {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n    {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n    {%- elif 'tool_calls' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + '=\"' + arg_val + '\"' }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- endif %}\n                {%- endfor %}\n            {{- \")\" }}\n        {%- else  %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n            {{- '\"parameters\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \"}\" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we're in ipython mode #}\n            {{- \"<|eom_id|>\" }}\n        {%- else %}\n            {{- \"<|eot_id|>\" }}\n        {%- endif %}\n    {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n        {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n        {%- if message.content is mapping or message.content is iterable %}\n            {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- \"<|eot_id|>\" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0]['content']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n    {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n    {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n    {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n    {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n    {%- elif 'tool_calls' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + '=\"' + arg_val + '\"' }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- endif %}\n                {%- endfor %}\n            {{- \")\" }}\n        {%- else  %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n            {{- '\"parameters\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \"}\" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we're in ipython mode #}\n            {{- \"<|eom_id|>\" }}\n        {%- else %}\n            {{- \"<|eot_id|>\" }}\n        {%- endif %}\n    {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n        {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n        {%- if message.content is mapping or message.content is iterable %}\n            {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- \"<|eot_id|>\" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
         }
     }

From e8db2ef894267760e40cf3066110eadd72880a83 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 26 Dec 2024 10:33:06 +0000
Subject: [PATCH 058/110] Bump diffusers from 0.31.0 to 0.32.1 (#1441)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps [diffusers](https://github.com/huggingface/diffusers) from 0.31.0
to 0.32.1.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/huggingface/diffusers/releases">diffusers's
releases</a>.</em></p>
<blockquote>
<h2>v0.32.1</h2>
<h1>TorchAO Quantizer fixes</h1>
<p>This patch release fixes a few bugs related to the TorchAO Quantizer
introduced in <a href="">v0.32.0</a>.</p>
<ul>
<li>Importing Diffusers would raise an error in PyTorch versions lower
than 2.3.0. This should no longer be a problem.</li>
<li>Device Map does not work as expected when using the quantizer. We
now raise an error if it is used. Support for using device maps with
different quantization backends will be added in the near future.</li>
<li>Quantization was not performed due to faulty logic. This is now
fixed and better tested.</li>
</ul>
<p>Refer to our <a
href="https://huggingface.co/docs/diffusers/">documentation</a> to learn
more about how to use different quantization backends.</p>
<h2>All commits</h2>
<ul>
<li>make style for <a
href="https://redirect.github.com/huggingface/diffusers/pull/10368">huggingface/diffusers#10368</a>
by <a href="https://github.com/yiyixuxu"><code>@​yiyixuxu</code></a> in
<a
href="https://redirect.github.com/huggingface/diffusers/issues/10370">#10370</a></li>
<li>fix test pypi installation in the release workflow by <a
href="https://github.com/sayakpaul"><code>@​sayakpaul</code></a> in <a
href="https://redirect.github.com/huggingface/diffusers/issues/10360">#10360</a></li>
<li>Fix TorchAO related bugs; revert device_map changes by <a
href="https://github.com/a-r-r-o-w"><code>@​a-r-r-o-w</code></a> in <a
href="https://redirect.github.com/huggingface/diffusers/issues/10371">#10371</a></li>
</ul>
<h2>Diffusers 0.32.0: New video pipelines, new image pipelines, new
quantization backends, new training scripts, and more</h2>
<p><a
href="https://github.com/user-attachments/assets/34d5f7ca-8e33-4401-8109-5c245ce7595f">https://github.com/user-attachments/assets/34d5f7ca-8e33-4401-8109-5c245ce7595f</a></p>
<p>This release took a while, but it has many exciting updates. It
contains several new pipelines for image and video generation, new
quantization backends, and more.</p>
<p>Going forward, to provide more transparency to the community about
ongoing developments and releases in Diffusers, we will be making use of
a <a
href="https://github.com/orgs/huggingface/projects/61/views/1">roadmap
tracker</a>.</p>
<h2>New Video Generation Pipelines 📹</h2>
<p>Open video generation models are on the rise, and we’re pleased to
provide comprehensive integration support for all of them. The following
video pipelines are bundled in this release:</p>
<ul>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/mochi">Mochi-1</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/allegro">Allegro</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video">LTXVideo</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuan_video">HunyuanVideo</a></li>
</ul>
<p>Check out <a
href="https://www.notion.so/Diffusers-0-32-0-release-15f1384ebcac8091ac5bf18c128639ab?pvs=21">this
section</a> to learn more about the fine-tuning options available for
these new video models.</p>
<h2>New Image Generation Pipelines</h2>
<ul>
<li>SANA
<ul>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/sana#diffusers.SanaPipeline">Text-to-image</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/sana#diffusers.SanaPAGPipeline">PAG</a></li>
</ul>
</li>
<li>Flux Control (including Control LoRA)
<ul>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#depth-control">Depth
Control</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#canny-control">Canny
Control</a></li>
</ul>
</li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#redux">Flux
Redux</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#fill-inpaintingoutpainting">Flux
Fill Inpainting / Outpainting</a></li>
<li><a
href="https://redirect.github.com/huggingface/diffusers/pull/9816">Flux
RF-Inversion</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_3#image-prompting-with-ip-adapters">SD3.5
ControlNet</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet_union">ControlNet
Union XL</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_3#image-prompting-with-ip-adapters">SD3.5
IP Adapter</a></li>
<li><a
href="https://redirect.github.com/huggingface/diffusers/pull/10261">Flux
IP adapter</a></li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/huggingface/diffusers/commit/e8aacda762e311505ba05ae340af23b149e37af3"><code>e8aacda</code></a>
Release: v0.32.1</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/12184f401566a599db1b905ceb96a9bf5f6b0dfa"><code>12184f4</code></a>
Fix TorchAO related bugs; revert device_map changes (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10371">#10371</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/6e1d2da1948b3308f5d0028bb76787c32cc4ed5a"><code>6e1d2da</code></a>
fix test pypi installation in the release workflow (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10360">#10360</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/11b1151840da2b3e3dd3fd49b2625fff46eecbc2"><code>11b1151</code></a>
make style for <a
href="https://redirect.github.com/huggingface/diffusers/pull/10368">huggingface/diffusers#10368</a>
(<a
href="https://redirect.github.com/huggingface/diffusers/issues/10370">#10370</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/cd4d0d8ffba6a434344c8836f3eebda96e5c49fe"><code>cd4d0d8</code></a>
Release: v0.32.0</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/4b557132ce955d58fd84572c03e79f43bdc91450"><code>4b55713</code></a>
[core] LTX Video 0.9.1 (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10330">#10330</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/851dfa30ae111da62eedc3c2fe1e34e6ad43aa25"><code>851dfa3</code></a>
[Tests] Fix more tests sayak (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10359">#10359</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/ea1ba0ba53bdd6569547e26e518f094745ed9d03"><code>ea1ba0b</code></a>
[LoRA] test fix (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10351">#10351</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/9d27df8071bb39d117755200ace81a3669b4134c"><code>9d27df8</code></a>
Rename LTX blocks and docs title (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10213">#10213</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/055d95543a41a47901195c47462c2976e3de6de7"><code>055d955</code></a>
Fix failing CogVideoX LoRA fuse test (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10352">#10352</a>)</li>
<li>Additional commits viewable in <a
href="https://github.com/huggingface/diffusers/compare/v0.31.0...v0.32.1">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=diffusers&package-manager=pip&previous-version=0.31.0&new-version=0.32.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 samples/export-requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index 797b680b9a..a589696beb 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -6,7 +6,7 @@ optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0  # For Qwen
 transformers_stream_generator==0.0.5  # For Qwen
-diffusers==0.31.0 # For image generation pipelines
+diffusers==0.32.1 # For image generation pipelines
 timm==1.0.12  # For exporting InternVL2
 torchvision  # For visual language models
 transformers>=4.43 # For Whisper

From 94547e9d3bb8afa1d64054db186a334dcf92d6be Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 26 Dec 2024 10:41:17 +0000
Subject: [PATCH 059/110] Bump diffusers from 0.31.0 to 0.32.1 in
 /tests/python_tests (#1442)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps [diffusers](https://github.com/huggingface/diffusers) from 0.31.0
to 0.32.1.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/huggingface/diffusers/releases">diffusers's
releases</a>.</em></p>
<blockquote>
<h2>v0.32.1</h2>
<h1>TorchAO Quantizer fixes</h1>
<p>This patch release fixes a few bugs related to the TorchAO Quantizer
introduced in <a href="">v0.32.0</a>.</p>
<ul>
<li>Importing Diffusers would raise an error in PyTorch versions lower
than 2.3.0. This should no longer be a problem.</li>
<li>Device Map does not work as expected when using the quantizer. We
now raise an error if it is used. Support for using device maps with
different quantization backends will be added in the near future.</li>
<li>Quantization was not performed due to faulty logic. This is now
fixed and better tested.</li>
</ul>
<p>Refer to our <a
href="https://huggingface.co/docs/diffusers/">documentation</a> to learn
more about how to use different quantization backends.</p>
<h2>All commits</h2>
<ul>
<li>make style for <a
href="https://redirect.github.com/huggingface/diffusers/pull/10368">huggingface/diffusers#10368</a>
by <a href="https://github.com/yiyixuxu"><code>@​yiyixuxu</code></a> in
<a
href="https://redirect.github.com/huggingface/diffusers/issues/10370">#10370</a></li>
<li>fix test pypi installation in the release workflow by <a
href="https://github.com/sayakpaul"><code>@​sayakpaul</code></a> in <a
href="https://redirect.github.com/huggingface/diffusers/issues/10360">#10360</a></li>
<li>Fix TorchAO related bugs; revert device_map changes by <a
href="https://github.com/a-r-r-o-w"><code>@​a-r-r-o-w</code></a> in <a
href="https://redirect.github.com/huggingface/diffusers/issues/10371">#10371</a></li>
</ul>
<h2>Diffusers 0.32.0: New video pipelines, new image pipelines, new
quantization backends, new training scripts, and more</h2>
<p><a
href="https://github.com/user-attachments/assets/34d5f7ca-8e33-4401-8109-5c245ce7595f">https://github.com/user-attachments/assets/34d5f7ca-8e33-4401-8109-5c245ce7595f</a></p>
<p>This release took a while, but it has many exciting updates. It
contains several new pipelines for image and video generation, new
quantization backends, and more.</p>
<p>Going forward, to provide more transparency to the community about
ongoing developments and releases in Diffusers, we will be making use of
a <a
href="https://github.com/orgs/huggingface/projects/61/views/1">roadmap
tracker</a>.</p>
<h2>New Video Generation Pipelines 📹</h2>
<p>Open video generation models are on the rise, and we’re pleased to
provide comprehensive integration support for all of them. The following
video pipelines are bundled in this release:</p>
<ul>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/mochi">Mochi-1</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/allegro">Allegro</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video">LTXVideo</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuan_video">HunyuanVideo</a></li>
</ul>
<p>Check out <a
href="https://www.notion.so/Diffusers-0-32-0-release-15f1384ebcac8091ac5bf18c128639ab?pvs=21">this
section</a> to learn more about the fine-tuning options available for
these new video models.</p>
<h2>New Image Generation Pipelines</h2>
<ul>
<li>SANA
<ul>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/sana#diffusers.SanaPipeline">Text-to-image</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/sana#diffusers.SanaPAGPipeline">PAG</a></li>
</ul>
</li>
<li>Flux Control (including Control LoRA)
<ul>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#depth-control">Depth
Control</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#canny-control">Canny
Control</a></li>
</ul>
</li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#redux">Flux
Redux</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#fill-inpaintingoutpainting">Flux
Fill Inpainting / Outpainting</a></li>
<li><a
href="https://redirect.github.com/huggingface/diffusers/pull/9816">Flux
RF-Inversion</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_3#image-prompting-with-ip-adapters">SD3.5
ControlNet</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet_union">ControlNet
Union XL</a></li>
<li><a
href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_3#image-prompting-with-ip-adapters">SD3.5
IP Adapter</a></li>
<li><a
href="https://redirect.github.com/huggingface/diffusers/pull/10261">Flux
IP adapter</a></li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/huggingface/diffusers/commit/e8aacda762e311505ba05ae340af23b149e37af3"><code>e8aacda</code></a>
Release: v0.32.1</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/12184f401566a599db1b905ceb96a9bf5f6b0dfa"><code>12184f4</code></a>
Fix TorchAO related bugs; revert device_map changes (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10371">#10371</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/6e1d2da1948b3308f5d0028bb76787c32cc4ed5a"><code>6e1d2da</code></a>
fix test pypi installation in the release workflow (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10360">#10360</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/11b1151840da2b3e3dd3fd49b2625fff46eecbc2"><code>11b1151</code></a>
make style for <a
href="https://redirect.github.com/huggingface/diffusers/pull/10368">huggingface/diffusers#10368</a>
(<a
href="https://redirect.github.com/huggingface/diffusers/issues/10370">#10370</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/cd4d0d8ffba6a434344c8836f3eebda96e5c49fe"><code>cd4d0d8</code></a>
Release: v0.32.0</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/4b557132ce955d58fd84572c03e79f43bdc91450"><code>4b55713</code></a>
[core] LTX Video 0.9.1 (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10330">#10330</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/851dfa30ae111da62eedc3c2fe1e34e6ad43aa25"><code>851dfa3</code></a>
[Tests] Fix more tests sayak (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10359">#10359</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/ea1ba0ba53bdd6569547e26e518f094745ed9d03"><code>ea1ba0b</code></a>
[LoRA] test fix (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10351">#10351</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/9d27df8071bb39d117755200ace81a3669b4134c"><code>9d27df8</code></a>
Rename LTX blocks and docs title (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10213">#10213</a>)</li>
<li><a
href="https://github.com/huggingface/diffusers/commit/055d95543a41a47901195c47462c2976e3de6de7"><code>055d955</code></a>
Fix failing CogVideoX LoRA fuse test (<a
href="https://redirect.github.com/huggingface/diffusers/issues/10352">#10352</a>)</li>
<li>Additional commits viewable in <a
href="https://github.com/huggingface/diffusers/compare/v0.31.0...v0.32.1">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=diffusers&package-manager=pip&previous-version=0.31.0&new-version=0.32.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 tests/python_tests/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index 00bffb6646..c2c7d634f5 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-diffusers==0.31.0
+diffusers==0.32.1
 optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0

From 8fe0ff595015bae822b4c2867372e219900c4421 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 26 Dec 2024 20:33:10 +0400
Subject: [PATCH 060/110] Added more FLUX supported models (#1444)

---
 src/docs/SUPPORTED_MODELS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md
index 9762874596..44da29ced4 100644
--- a/src/docs/SUPPORTED_MODELS.md
+++ b/src/docs/SUPPORTED_MODELS.md
@@ -243,6 +243,8 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
           <li><a href="https://huggingface.co/Freepik/flux.1-lite-8B-alpha"><code>Freepik/flux.1-lite-8B-alpha</code></a></li>
           <li><a href="https://huggingface.co/black-forest-labs/FLUX.1-dev"><code>black-forest-labs/FLUX.1-dev</code></a></li>
           <li><a href="https://huggingface.co/shuttleai/shuttle-3-diffusion"><code>shuttleai/shuttle-3-diffusion</code></a></li>
+          <li><a href="https://huggingface.co/shuttleai/shuttle-3.1-aesthetic"><code>shuttleai/shuttle-3.1-aesthetic</code></a></li>
+          <li><a href="https://huggingface.co/Shakker-Labs/AWPortrait-FL"><code>Shakker-Labs/AWPortrait-FL</code></a></li>
         </ul>
       </td>
     </tr>

From 82b44fab5b538ef9e11ff47fcd245f7885c1a25f Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Fri, 27 Dec 2024 07:47:50 +0400
Subject: [PATCH 061/110] LLM tests restructuring (#1440)

- Merged chat scenario tests to test_llm_pipeline.py
- Created CB dedicated test_continuous_batching.py file with CB-specific
tests (in addition to test_llm_pipeline.py, which cover basic LLM
pipeline functionality)

CVS-159921
---
 .github/labeler.yml                           |  29 +-
 .github/workflows/linux.yml                   |   4 +-
 .github/workflows/mac.yml                     |   8 +-
 .github/workflows/windows.yml                 |   8 +-
 src/cpp/src/llm_pipeline.cpp                  |  12 +-
 tests/python_tests/common.py                  |  14 +-
 tests/python_tests/ov_genai_test_utils.py     |  29 +-
 tests/python_tests/test_chat_generate_api.py  | 118 --------
 ...emption.py => test_continuous_batching.py} | 165 ++++++++++-
 ...mizations.py => test_kv_cache_eviction.py} |   4 +-
 ...t_generate_api.py => test_llm_pipeline.py} | 273 ++++++++++--------
 .../python_tests/test_llm_pipeline_static.py  |   2 +-
 tests/python_tests/test_sampling.py           | 140 +++------
 .../{test_vlm_api.py => test_vlm_pipeline.py} |   0
 ...nerate_api.py => test_whisper_pipeline.py} |   0
 15 files changed, 418 insertions(+), 388 deletions(-)
 delete mode 100644 tests/python_tests/test_chat_generate_api.py
 rename tests/python_tests/{test_preemption.py => test_continuous_batching.py} (62%)
 rename tests/python_tests/{test_cache_optimizations.py => test_kv_cache_eviction.py} (98%)
 rename tests/python_tests/{test_generate_api.py => test_llm_pipeline.py} (87%)
 rename tests/python_tests/{test_vlm_api.py => test_vlm_pipeline.py} (100%)
 rename tests/python_tests/{test_whisper_generate_api.py => test_whisper_pipeline.py} (100%)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index c162f6aff4..f618bdb7fc 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -13,17 +13,20 @@
 - 'src/python/py_tokenizer.cpp'
 - 'thirdparty/openvino_tokenizers'
 - 'tests/python_tests/tokenizer_configs.py'
+- 'tests/python_tests/test_tokenizer.py'
 
 'category: LLM':
 - 'src/cpp/include/openvino/genai/llm_pipeline.hpp'
 - 'src/cpp/src/llm_pipeline.cpp'
+- 'src/cpp/src/lm_encoding.hpp'
 - 'src/cpp/src/lm_encoding.cpp'
 - 'src/cpp/src/llm_pipeline_base.hpp'
 - 'src/cpp/src/llm_pipeline_static.hpp'
 - 'src/cpp/src/llm_pipeline_static.cpp'
+- 'src/cpp/src/text_callback_streamer.cpp'
+- 'src/cpp/src/text_callback_streamer.hpp'
 - 'src/python/py_llm_pipeline.cpp'
-- 'tests/python_tests/test_generate_api.py'
-- 'tests/python_tests/test_chat_generate_api.py'
+- 'tests/python_tests/test_llm_pipeline.py'
 
 'category: sampling':
 - 'src/cpp/include/openvino/genai/generation_config.hpp'
@@ -35,6 +38,7 @@
 - 'tests/cpp/logit_filtering.cpp'
 - 'tests/cpp/generate_config.cpp'
 - 'tests/cpp/sampler.cpp'
+- 'tests/python_tests/test_sampling.py'
 
 'category: LoRA':
 - 'src/cpp/include/openvino/genai/lora_adapter.hpp'
@@ -54,9 +58,12 @@
 - 'src/cpp/include/openvino/genai/whisper_pipeline.hpp'
 - 'src/cpp/src/whisper/**/*'
 - 'src/cpp/src/whisper_generation_config.cpp'
+- 'src/cpp/src/whisper_pipeline_base.hpp'
 - 'src/cpp/src/whisper_pipeline.cpp'
+- 'src/cpp/src/whisper_pipeline_static.cpp'
+- 'src/cpp/src/whisper_pipeline_static.hpp'
 - 'src/python/py_whisper_pipeline.cpp'
-- 'tests/python_tests/test_whisper_generate_api.py'
+- 'tests/python_tests/test_whisper_pipeline.py'
 
 'category: Python API':
 - 'src/python/**/*'
@@ -65,10 +72,14 @@
 - 'src/include/openvino/genai/visual_language/**/*'
 - 'src/cpp/src/visual_language/**/*'
 - 'src/python/py_vlm_pipeline.cpp'
-- 'tests/python_tests/test_vlm_api.py'
+- 'tests/python_tests/test_vlm_pipeline.py'
 
 'category: speculative decoding':
 - 'src/cpp/src/speculative_decoding/**/*'
+- 'tests/cpp/speculative_decoding.cpp'
+
+'category: prompt lookup':
+- 'src/cpp/src/prompt_lookup/**/*'
 
 'category: continuous batching':
 - 'src/cpp/include/openvino/genai/cache_eviction.hpp'
@@ -91,19 +102,19 @@
 - 'src/cpp/src/generation_handle.cpp'
 - 'src/cpp/src/generation_stream.hpp'
 - 'src/cpp/src/model_runner.hpp'
-- 'src/cpp/src/paged_attention_transformations.cpp'
-- 'src/cpp/src/paged_attention_transformations.hpp'
+- 'src/cpp/src/utils/paged_attention_transformations.cpp'
+- 'src/cpp/src/utils/paged_attention_transformations.hpp'
 - 'src/cpp/src/scheduler.hpp'
 - 'src/cpp/src/sequence_group.cpp'
 - 'src/cpp/src/sequence_group.hpp'
 - 'src/cpp/src/timer.hpp'
 - 'src/python/py_continuous_batching_pipeline.cpp'
-- 'tests/python_tests/test_cache_optimizations.py'
-- 'tests/python_tests/test_preemption.py'
-- 'tests/python_tests/test_sampling.py'
+- 'tests/python_tests/test_continuous_batching.py'
+- 'tests/python_tests/test_kv_cache_eviction.py'
 - 'tests/cpp/block_allocator.cpp'
 - 'tests/cpp/block_hash_store.cpp'
 - 'tests/cpp/block_manager.cpp'
+- 'tests/cpp/cache_eviction.cpp'
 - 'tests/cpp/cache_manager.cpp'
 - 'tests/cpp/device_config.cpp'
 - 'tests/cpp/scheduler.cpp'
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 6c94a907ea..9b21491f9b 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -268,9 +268,9 @@ jobs:
       matrix:
         test:
           - name: 'Whisper'
-            cmd: 'tests/python_tests/test_whisper_generate_api.py'
+            cmd: 'tests/python_tests/test_whisper_pipeline.py'
           - name: 'LLM & VLM'
-            cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py'
+            cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_pipeline.py'
     defaults:
       run:
         shell: bash
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index a9af13bc66..4d9b7f032b 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -178,7 +178,7 @@ jobs:
     if: |
       always() &&
       (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success')
-    timeout-minutes: 90
+    timeout-minutes: 120
     defaults:
       run:
         shell: bash
@@ -235,7 +235,7 @@ jobs:
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
           python -c "from openvino_genai import LLMPipeline"
           python -m pip install ./tools/who_what_benchmark --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+          python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_pipeline.py --ignore ./tests/python_tests/test_vlm_pipeline.py -k "not test_set_chat_template"
 
   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -290,7 +290,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+          python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
 
@@ -300,7 +300,7 @@ jobs:
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
           python -c "from openvino_genai import LLMPipeline"
           python -m pip install ./tools/who_what_benchmark --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+          python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke"
 
   genai_package:
     name: OpenVINO genai extension (install to OpenVINO package)
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index f88bc4c6f3..fc63129281 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -245,7 +245,7 @@ jobs:
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
           python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+          python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_pipeline.py --ignore ./tests/python_tests/test_vlm_pipeline.py -k "not test_set_chat_template"
 
   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -301,7 +301,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+          python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k test_smoke
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
 
@@ -310,7 +310,7 @@ jobs:
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
           python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+          python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke"
 
   genai_python_lib_vlm:
     name: OpenVINO genai VLM tests (cmake + wheel)
@@ -366,7 +366,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_vlm_api.py
+          python -m pytest -v ./tests/python_tests/test_vlm_pipeline.py
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
 
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index be5ecf17fa..5e448fe88c 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -703,8 +703,7 @@ std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::An
 ov::genai::LLMPipeline::LLMPipeline(
     const ov::InferRequest& request,
     const ov::genai::Tokenizer& tokenizer,
-    OptionalGenerationConfig generation_config
-) {
+    OptionalGenerationConfig generation_config) {
     auto start_time = std::chrono::steady_clock::now();
     m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer, generation_config);
     auto stop_time = std::chrono::steady_clock::now();
@@ -715,8 +714,7 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::filesystem::path& models_path,
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
-    const ov::AnyMap& properties
-){
+    const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
     if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || 
         properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
@@ -735,8 +733,7 @@ ov::genai::LLMPipeline::LLMPipeline(
 ov::genai::LLMPipeline::LLMPipeline(
     const std::filesystem::path& models_path,
     const std::string& device,
-    const ov::AnyMap& config
-){
+    const ov::AnyMap& config) {
     auto start_time = std::chrono::steady_clock::now();
 
     if (config.find(ov::genai::scheduler_config.name()) != config.end() || 
@@ -759,8 +756,7 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
     const ov::AnyMap& config,
-    const ov::genai::GenerationConfig& generation_config
-){
+    const ov::genai::GenerationConfig& generation_config) {
     auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config);
 
     auto start_time = std::chrono::steady_clock::now();
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index 7e3c075405..f940d272ed 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -364,18 +364,6 @@ def run_continuous_batching(
     return output
 
 
-def read_models_list(file_name: str):
-    models = []
-    with open(file_name) as f:
-        for model_name in f:
-            model_name = model_name.strip()
-            # skip comment in model scope file
-            if model_name.startswith('#'):
-                continue
-            models.append(model_name)
-    return models
-
-
 def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig):
     if generation_config.is_beam_search():
         assert len(hf_result.m_scores) == len(ov_result.m_scores)
@@ -447,7 +435,7 @@ def generate_and_compare_with_reference_text(models_path: Path, prompts: List[st
             assert ref_text == ov_text
 
 
-def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None):
+def run_continuous_batching_pipeline_test(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None):
     prompts, generation_configs = get_test_dataset()
     scheduler_config = get_scheduler_config(scheduler_params)
 
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 87b2147bcd..3fc89cb8a7 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -32,7 +32,7 @@ def get_models_list():
         "HuggingFaceH4/zephyr-7b-beta",
         "ikala/redpajama-3b-chat",
         "mistralai/Mistral-7B-v0.1",
-        
+
         # "meta-llama/Llama-2-7b-chat-hf",  # Cannot be downloaded without access token
         # "google/gemma-2b-it",  # Cannot be downloaded without access token.
         # "google/gemma-7b-it",  # Cannot be downloaded without access token.
@@ -49,7 +49,7 @@ def get_models_list():
         model_ids = precommit_models
     else:
         model_ids = nightly_models
-    
+
     if pytest.selected_model_ids:
         model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
     # pytest.set_trace()
@@ -82,30 +82,30 @@ def get_chat_models_list():
 @functools.lru_cache(1)
 def read_model(params, **tokenizer_kwargs):
     model_id, path = params
-    
+
     from optimum.intel.openvino import OVModelForCausalLM
     from transformers import AutoTokenizer
     hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 
     if (path / "openvino_model.xml").exists():
-        opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, 
+        opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True,
                                                        compile=False, device='CPU')
     else:
-        ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, 
+        ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer,
                                                                              with_detokenizer=True,
                                                                              **tokenizer_kwargs)
         openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml")
         openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml")
-        
+
         # to store tokenizer config jsons with special tokens
         hf_tokenizer.save_pretrained(path)
-        
-        opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, 
+
+        opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True,
                                                        compile=False, device='CPU', load_in_8bit=False)
         opt_model.generation_config.save_pretrained(path)
         opt_model.config.save_pretrained(path)
         opt_model.save_pretrained(path)
-    
+
     return (
         model_id,
         path,
@@ -116,11 +116,11 @@ def read_model(params, **tokenizer_kwargs):
 
 
 # in OpenVINO GenAI this parameter is called stop_criteria,
-# while in HF it's called early_stopping. 
+# while in HF it's called early_stopping.
 # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER"
 STOP_CRITERIA_MAP = {
-    ov_genai.StopCriteria.NEVER: "never", 
-    ov_genai.StopCriteria.EARLY: True, 
+    ov_genai.StopCriteria.NEVER: "never",
+    ov_genai.StopCriteria.EARLY: True,
     ov_genai.StopCriteria.HEURISTIC: False
 }
 
@@ -137,6 +137,7 @@ def model_tmp_path(tmpdir_factory):
                 shutil.copy(src_file, temp_path / src_file.name)
     yield model_id, Path(temp_path)
 
+
 @pytest.fixture(scope="module")
 def model_tokenizers_path_tmp_path(tmpdir_factory):
     model_id, path, _, _, _ = read_model(get_models_list()[0])
@@ -146,7 +147,7 @@ def model_tokenizers_path_tmp_path(tmpdir_factory):
     # There was no easy way to add tokens to IR in tests, so we remove them
     # and set tokens in configs and to check if they are read and validated correctly.
     import openvino as ov
-    
+
     # copy openvino converted model and tokenizers
     for pattern in ['*.xml', '*.bin']:
         for src_file in path.glob(pattern):
@@ -162,7 +163,7 @@ def model_tokenizers_path_tmp_path(tmpdir_factory):
                     ov_model.set_rt_info("eos_token_id", "")
                     ov_model.set_rt_info("chat_template", "")
                     ov.save_model(ov_model, str(temp_path / src_file.name))
-                    
+
             if src_file in ['openvino_tokenizer.bin', 'openvino_detokenizer.bin']:
                 continue
             if src_file.is_file():
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
deleted file mode 100644
index 07b4f7c15f..0000000000
--- a/tests/python_tests/test_chat_generate_api.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright (C) 2023-2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-import openvino_genai as ov_genai
-import pytest
-from typing import Dict, Tuple
-
-from ov_genai_test_utils import (
-    get_chat_models_list,
-    read_model,
-    get_continuous_batching,
-)
-
-
-generation_configs = [
-    dict(do_sample=False, max_new_tokens=20),
-    dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0)
-]
-
-
-questions = [
-    '1+1=',
-    'What is the previous answer?',
-    'Why is the Sun yellow?',
-    'What was my first question?'
-]
-
-
-@pytest.mark.parametrize("generation_config", generation_configs)
-@pytest.mark.parametrize("model_descr", get_chat_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_chat_compare_with_HF(model_descr, generation_config: Dict):
-    chat_history_hf = []
-    chat_history_ov = []
-    chat_prompt = ''
-
-    # Will set add_special_tokens=False inside pipeline when start_chat() is called.
-    model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-
-    pipe.start_chat()
-    for prompt in questions:
-        chat_history_hf.append({'role': 'user', 'content': prompt})
-        chat_history_ov.append({'role': 'user', 'content': prompt})
-
-        chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
-        tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
-
-        answer = model_opt.generate(**tokenized, **generation_config)
-        answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
-        chat_history_hf.append({'role': 'assistant', 'content': answer_str})
-
-        answer_ov = pipe.generate(prompt, **generation_config)
-        chat_history_ov.append({'role': 'assistant', 'content': answer_ov})
-
-    pipe.finish_chat()
-
-    if chat_history_ov != chat_history_hf:
-        print(f'hf_output: {chat_history_hf}')
-        print(f'ov_output: {chat_history_ov}')
-
-    assert chat_history_ov == chat_history_hf
-
-
-@pytest.mark.parametrize("generation_config", generation_configs)
-@pytest.mark.parametrize("model_descr", get_chat_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict):
-    # compares with HF when history in ov_genai is save as a text
-    chat_history_hf = []
-    chat_history_ov = []
-    chat_prompt = ''
-
-    # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True.
-    # Need to regenerate openvino_tokenizer/detokenizer.
-    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False)
-    ov_tokenizer = ov_pipe.get_tokenizer()
-
-    for prompt in questions:
-        chat_history_hf.append({'role': 'user', 'content': prompt})
-        chat_history_ov.append({'role': 'user', 'content': prompt})
-
-        chat_prompt = hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
-        tokenized = hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
-
-        answer = model_opt.generate(**tokenized, **generation_config)
-        answer_str = hf_tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
-        chat_history_hf.append({'role': 'assistant', 'content': answer_str})
-
-        chat_prompt = ov_tokenizer.apply_chat_template(chat_history_ov, add_generation_prompt=True)
-        answer_ov = ov_pipe.generate(chat_prompt, **generation_config)
-        chat_history_ov.append({'role': 'assistant', 'content': answer_ov})
-
-    if chat_history_ov != chat_history_hf:
-        print(f'hf_output: {chat_history_hf}')
-        print(f'ov_output: {chat_history_ov}')
-
-    assert chat_history_ov == chat_history_hf
-
-
-@pytest.mark.parametrize("generation_config", generation_configs[1:])
-@pytest.mark.parametrize("model_descr", get_chat_models_list())
-@pytest.mark.precommit
-def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict):
-    model_id, path, hf_tokenizer, opt_model, ov_stateful_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-    cb_pipe = get_continuous_batching(path)
-
-    ov_stateful_pipe.start_chat()
-    cb_pipe.start_chat()
-
-    for question in questions:
-        generated = cb_pipe.generate(question, **generation_config)
-        reference = ov_stateful_pipe.generate(question, **generation_config)
-        assert generated == reference
-
-    # Test that finish_chat() doesn't fail just in case.
-    cb_pipe.finish_chat()
diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_continuous_batching.py
similarity index 62%
rename from tests/python_tests/test_preemption.py
rename to tests/python_tests/test_continuous_batching.py
index 7c648e73dc..3a1e9fa092 100644
--- a/tests/python_tests/test_preemption.py
+++ b/tests/python_tests/test_continuous_batching.py
@@ -1,15 +1,172 @@
 # Copyright (C) 2018-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 import pytest
+import math
+from typing import Dict
+
+from pathlib import Path
+from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer
 
-from openvino_genai import GenerationConfig
 from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \
-    get_scheduler_config, run_test_pipeline, get_beam_search, get_greedy, \
+    get_scheduler_config, get_greedy, run_continuous_batching_pipeline_test, get_beam_search, get_greedy, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
 from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts
 
+from ov_genai_test_utils import (
+    get_chat_models_list,
+    read_model,
+    get_continuous_batching,
+)
+
+def read_models_list(file_name: str):
+    models = []
+    with open(file_name) as f:
+        for model_name in f:
+            model_name = model_name.strip()
+            # skip comment in model scope file
+            if model_name.startswith('#'):
+                continue
+            models.append(model_name)
+    return models
+
+#
+# e2e tests on random and real models
+#
+
+@pytest.mark.precommit
+@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
+def test_e2e_precommit(tmp_path, model_id):
+    run_continuous_batching_pipeline_test(tmp_path, model_id)
+
+
+@pytest.mark.nightly
+@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly")))
+def test_e2e_nightly(tmp_path, model_id):
+    run_continuous_batching_pipeline_test(tmp_path, model_id)
+
+
+@pytest.mark.real_models
+@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
+def test_e2e_real_models(tmp_path, model_id):
+    run_continuous_batching_pipeline_test(tmp_path, model_id)
+
+#
+# Comparison with stateful
+# TODO: remove these tests once test_llm_pipeline.py are generalized and parametrized to test both Stateful and PA paths
+#
+
+test_configs = [
+    dict(max_new_tokens=20),
+    dict(max_new_tokens=200, ignore_eos=True),
+    dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0)
+]
+batched_prompts = [
+    ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'],
+    ['hello', 'Here is the longest nowel ever: '],
+    ['Alan Turing was a', 'return 0', '你好！ 你好嗎？'],
+    ['table is made', 'table is made [force left pad tokens]']
+]
+@pytest.mark.parametrize("generation_config", test_configs)
+@pytest.mark.parametrize("prompt", batched_prompts[1:])  # num_beams=15 diverges on the first prompt.
+@pytest.mark.precommit
+def test_continuous_batching_vs_stateful(prompt, generation_config):
+    model_id, path, tokenizer, model, stateful = read_model((
+        "facebook/opt-125m",
+        Path("opt-125m")
+    ))
+    cb = get_continuous_batching(path)
+    generated = cb.generate(prompt, **generation_config)
+    reference = stateful.generate(prompt, **generation_config)
+    assert generated.texts == reference.texts
+    if 1 != generation_config.get("num_return_sequences", 1):
+        # Stateful puts zeroes to generated.scores. Don't compare them.
+        for gen, ref in zip(generated.scores, reference.scores):
+            assert math.isclose(gen, ref, abs_tol=0.0003)
+
+
+prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of']
+@pytest.mark.parametrize("prompt", prompts)
+@pytest.mark.precommit
+def test_cb_streamer_vs_return_vs_stateful(prompt):
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((
+        "facebook/opt-125m",
+        Path("opt-125m")
+    ))
+    cb_pipe = get_continuous_batching(path)
+    streamed = []
+    generated = cb_pipe.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword))
+    reference = ov_pipe.generate(prompt, max_new_tokens=20)
+    assert generated == "".join(streamed)
+    assert "".join(streamed) == reference
+
+
+generation_configs = [
+    dict(do_sample=False, max_new_tokens=20),
+    dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0)
+]
+questions = [
+    '1+1=',
+    'What is the previous answer?',
+    'Why is the Sun yellow?',
+    'What was my first question?'
+]
+@pytest.mark.parametrize("generation_config", generation_configs[1:])
+@pytest.mark.parametrize("model_descr", get_chat_models_list())
+@pytest.mark.precommit
+def test_chat_scenario_vs_stateful(model_descr, generation_config: Dict):
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    cb_pipe = get_continuous_batching(path)
+
+    ov_pipe.start_chat()
+    cb_pipe.start_chat()
+
+    for question in questions:
+        generated = cb_pipe.generate(question, **generation_config)
+        reference = ov_pipe.generate(question, **generation_config)
+        assert generated == reference
+
+    # Test that finish_chat() doesn't fail just in case.
+    cb_pipe.finish_chat()
+
+#
+# Stress tests to check OOM case
+#
+
+@pytest.mark.precommit
+@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()],
+                         ids=["greedy", "beam_search", "multinomial_all_parameters"])
+def test_post_oom_health(tmp_path, sampling_config):
+    generation_config = sampling_config
+    generation_config.ignore_eos = True
+    generation_config.max_new_tokens = 1000000
+
+    scheduler_config = get_scheduler_config()
+    scheduler_config.num_kv_blocks = 10 # Low cache size to trigger OOM quickly
+
+    model_id : str = "facebook/opt-125m"
+    opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
+
+    models_path : Path = tmp_path / model_id
+    save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path)
+
+    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")
+
+    # First run should return incomplete response
+    output = cb_pipe.generate(["What is OpenVINO?"], [generation_config])
+    assert (len(output))
+    assert (len(output[0].m_generation_ids))
+
+    # Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM
+    output = cb_pipe.generate(["What is OpenVINO?"], [generation_config])
+    assert (len(output))
+    assert (len(output[0].m_generation_ids))
+
+#
+# Pre-emption
+#
 
 def get_greedy_seq_len_300() -> GenerationConfig:
     generation_config = GenerationConfig()
@@ -36,7 +193,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
 @pytest.mark.parametrize("params", scheduler_params_list)
 @pytest.mark.precommit
 def test_preemption(tmp_path, params):
-    run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1])
+    run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1])
 
 
 multinomial_params = RandomSamplingTestStruct(
@@ -175,4 +332,4 @@ def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse):
 
     # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq )
     scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256})
-    generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config)
\ No newline at end of file
+    generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config)
diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_kv_cache_eviction.py
similarity index 98%
rename from tests/python_tests/test_cache_optimizations.py
rename to tests/python_tests/test_kv_cache_eviction.py
index d89697ba42..bbd0da6bb2 100644
--- a/tests/python_tests/test_cache_optimizations.py
+++ b/tests/python_tests/test_kv_cache_eviction.py
@@ -15,7 +15,7 @@
 from openvino import serialize
 from transformers import AutoTokenizer
 
-from common import TESTS_ROOT, run_test_pipeline
+from common import TESTS_ROOT, run_continuous_batching_pipeline_test
 
 
 def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -168,5 +168,5 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
 @pytest.mark.parametrize("params", scheduler_params_list)
 @pytest.mark.precommit
 def test_dynamic_memory_allocation(tmp_path, params):
-    run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1])
+    run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", params[0], params[1])
 
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_llm_pipeline.py
similarity index 87%
rename from tests/python_tests/test_generate_api.py
rename to tests/python_tests/test_llm_pipeline.py
index 824a3cca26..9f00996a58 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_llm_pipeline.py
@@ -12,11 +12,12 @@
 import torch
 import math
 from ov_genai_test_utils import (
-    get_models_list, 
-    read_model, 
+    get_models_list,
+    read_model,
     load_genai_pipe_with_configs,
-    model_tmp_path, 
-    STOP_CRITERIA_MAP, 
+    get_chat_models_list,
+    model_tmp_path,
+    STOP_CRITERIA_MAP,
     get_continuous_batching,
 )
 
@@ -26,12 +27,12 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro
     config = generation_config.copy()  # to avoid side effects
     num_beams = config['num_beams'] if 'num_beams' in config else 1
     config['num_return_sequences'] = num_beams
-    
+
     if not isinstance(prompts, list):
         prompts = [prompts]
 
     if 'do_sample' not in config:
-        # Some HF models have default do_sample = True, and if we set beam search generation config 
+        # Some HF models have default do_sample = True, and if we set beam search generation config
         # it conflicts with `diversity_penalty` and/or `num_beam_groups`.
         # Need to set explicitly to False, but only if test arguments omitted this arg.
         # Do not apply 'repetition_penalty' if sampling is not used.
@@ -72,7 +73,7 @@ def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict,
     config = generation_config.copy()  # to avoid side effects
 
     if 'do_sample' not in config:
-        # Some HF models have default do_sample = True, and if we set beam search generation config 
+        # Some HF models have default do_sample = True, and if we set beam search generation config
         # it conflicts with `diversity_penalty` and/or `num_beam_groups`.
         # Need to set explicitly to False, but only if test arguments omitted this arg.
         # Do not apply 'repetition_penalty' if sampling is not used.
@@ -101,9 +102,9 @@ def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict,
 
 
 def run_hf_ov_genai_comparison_encoded_inputs(
-        model_descr, 
-        generation_config: Dict, 
-        input_ids: np.ndarray, 
+        model_descr,
+        generation_config: Dict,
+        input_ids: np.ndarray,
         attention_mask: Optional[np.array] = None
     ):
     device = 'CPU'
@@ -112,18 +113,18 @@ def run_hf_ov_genai_comparison_encoded_inputs(
     config = generation_config.copy()  # to avoid side effects
 
     if 'do_sample' not in config:
-        # Some HF models have default do_sample = True, and if we set beam search generation config 
+        # Some HF models have default do_sample = True, and if we set beam search generation config
         # it conflicts with `diversity_penalty` and/or `num_beam_groups`.
         # Need to set explicitly to False, but only if test arguments omitted this arg.
         # Do not apply 'repetition_penalty' if sampling is not used.
         config['do_sample'] = False
         config['repetition_penalty'] = 1.0 # 1.0 means no penalty
-    
+
     generation_config_hf = config.copy()
     if generation_config_hf.get('stop_criteria'):
         generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')]
     generation_config_hf.pop('ignore_eos', None)
-    
+
     if attention_mask is not None:
         inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask))
         inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask))
@@ -138,6 +139,9 @@ def run_hf_ov_genai_comparison_encoded_inputs(
     ov_res = np.array(ov_output.tokens, dtype=np.int64)
     assert np.all(ov_res == hf_res)
 
+#
+# e2e work
+#
 
 test_cases = [
     (dict(max_new_tokens=20), 'table is made of'),
@@ -197,14 +201,13 @@ def test_batch_text_input(model_descr, generation_config, prompts):
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
-                              max_new_tokens, diversity_penalty, prompt):
+def test_beam_search_decoding(model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt):
     generation_config = dict(
-        num_beam_groups=num_beam_groups, 
-        num_beams=num_beam_groups * group_size, 
-        diversity_penalty=diversity_penalty, 
-        num_return_sequences=num_beam_groups * group_size, 
-        max_new_tokens=max_new_tokens, 
+        num_beam_groups=num_beam_groups,
+        num_beams=num_beam_groups * group_size,
+        diversity_penalty=diversity_penalty,
+        num_return_sequences=num_beam_groups * group_size,
+        max_new_tokens=max_new_tokens,
     )
     run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
 
@@ -215,17 +218,17 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
+def test_beam_search_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
     # todo: with EARLY stop_criteria looks like HF return invalid out with sentence<eos><unk><unk>
     # while genai ends sentence with <eos>
     if (stop_criteria == StopCriteria.EARLY):
         pytest.skip()
     generation_config = dict(
-        num_beam_groups=2, 
-        num_beams=2 * 3, 
-        diversity_penalty=1.0, 
-        num_return_sequences=2 * 3, 
-        max_new_tokens=max_new_tokens, 
+        num_beam_groups=2,
+        num_beams=2 * 3,
+        diversity_penalty=1.0,
+        num_return_sequences=2 * 3,
+        max_new_tokens=max_new_tokens,
         stop_criteria=stop_criteria,
     )
     run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
@@ -241,11 +244,11 @@ def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
 def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size,
                                     max_new_tokens, prompt):
     generation_config = dict(
-        num_beam_groups=num_beam_groups, 
-        num_beams=num_beam_groups * group_size, 
-        diversity_penalty=1.0, 
-        num_return_sequences=num_beam_groups * group_size, 
-        max_new_tokens=max_new_tokens, 
+        num_beam_groups=num_beam_groups,
+        num_beams=num_beam_groups * group_size,
+        diversity_penalty=1.0,
+        num_return_sequences=num_beam_groups * group_size,
+        max_new_tokens=max_new_tokens,
     )
     run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
 
@@ -283,6 +286,72 @@ def test_greedy_repetition_penalty(model_descr, prompt):
     assert(len(set(ov_output.split(' '))) > len(set(ov_output_half_penalty.split(' '))))
 
 
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_batch_size_switch():
+    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
+    ov_pipe.generate(["a"], max_new_tokens=2)
+    ov_pipe.generate(["1", "2"], max_new_tokens=2)
+    ov_pipe.generate(["a"], max_new_tokens=2)
+
+#
+# Chat scenario
+#
+
+generation_configs = [
+    dict(do_sample=False, max_new_tokens=20),
+    dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0)
+]
+
+
+questions = [
+    '1+1=',
+    'What is the previous answer?',
+    'Why is the Sun yellow?',
+    'What was my first question?'
+]
+
+
+@pytest.mark.parametrize("generation_config", generation_configs)
+@pytest.mark.parametrize("model_descr", get_chat_models_list())
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_chat_compare_with_HF(model_descr, generation_config: Dict):
+    chat_history_hf = []
+    chat_history_ov = []
+    chat_prompt = ''
+
+    # Will set add_special_tokens=False inside pipeline when start_chat() is called.
+    model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+
+    ov_pipe.start_chat()
+    for prompt in questions:
+        chat_history_hf.append({'role': 'user', 'content': prompt})
+        chat_history_ov.append({'role': 'user', 'content': prompt})
+
+        chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
+        tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
+
+        answer = opt_model.generate(**tokenized, **generation_config)
+        answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
+        chat_history_hf.append({'role': 'assistant', 'content': answer_str})
+
+        answer_ov = ov_pipe.generate(prompt, **generation_config)
+        chat_history_ov.append({'role': 'assistant', 'content': answer_ov})
+
+    ov_pipe.finish_chat()
+
+    if chat_history_ov != chat_history_hf:
+        print(f'hf_output: {chat_history_hf}')
+        print(f'ov_output: {chat_history_ov}')
+
+    assert chat_history_ov == chat_history_hf
+
+
+#
+# Streaming with callback
+#
+
 def user_defined_callback(subword):
     print(subword)
 
@@ -422,11 +491,14 @@ def test_operator_with_streamer_kwargs_batch_throws():
     with pytest.raises(RuntimeError):
         ov_pipe('', num_beams=2, streamer=printer)
 
+#
+# Tests on generation configs (invalid cases and handling within LLMPipeline)
+#
 
 invalid_configs = [
     dict(num_beam_groups=3, num_beams=15, do_sample=True),
     # TODO: CVS-158682 eos_token_id is still read from tiny-random-phi3 and we cannot modify RTInfo in tests
-    # dict(do_sample=True),  # no eos_token_id no max_new_tokens, no max_len 
+    # dict(do_sample=True),  # no eos_token_id no max_new_tokens, no max_len
     dict(eos_token_id=42, ignore_eos=True),  # no max_new_tokens, no max_len with ignore_eos
     dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty
     dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp
@@ -446,7 +518,7 @@ def test_invalid_generation_configs_throws(model_tmp_path, generation_config):
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_valid_configs(model_tmp_path):
+def test_eos_token_is_inherited_from_default_generation_config(model_tmp_path):
     model_id, temp_path = model_tmp_path
     ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path)
 
@@ -454,6 +526,8 @@ def test_valid_configs(model_tmp_path):
     config.do_sample = True  # no eos_token_id but it's loaded from config.json
     ov_pipe.set_generation_config(config)
 
+    assert 37 == ov_pipe.get_generation_config().eos_token_id
+
 
 invalid_py_configs = [
     dict(num_beam_groups=3, num_beams=15, do_sample=True),
@@ -478,6 +552,9 @@ def test_python_generation_config_validation_throws(model_tmp_path, generation_c
     with pytest.raises(return_exception_type):
         ov_pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config))
 
+#
+# Work with Unicode in Python API
+#
 
 @pytest.mark.precommit
 @pytest.mark.nightly
@@ -512,69 +589,9 @@ def test_unicode_pybind_decoding_one_string_streamer():
     ov_pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x))
     assert '�' == res_str[-1]
 
-
-@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory")
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win")
-def test_left_pad():
-    # test left pad tokenizer post processing implementation
-    prompts = [
-        "The Sun is yellow because",
-        "The Sun is yellow because [force left pad tokens]"
-    ]
-    models = read_model(("microsoft/phi-1_5", Path("phi-1_5/")))
-
-    config = {
-        "max_new_tokens": 20,
-        "num_beam_groups": 2,
-        "num_beams": 2,
-        "num_return_sequences": 2,
-        "do_sample": False,
-        "diversity_penalty": 1.0,
-        # phi 1_5 has no eos_token_id in model configuration
-        # ov genai will detect eos_token_id from tokenizer config
-        # hf implementation doesn't fetch it from tokenizer config and defaults to None
-        # align ov genai and hf by setting eos_token_id explicitly
-        "eos_token_id": 50256,
-    }
-
-    models[2].pad_token = models[2].eos_token
-    run_hf_ov_genai_comparison_batched(models, config, prompts)
-
-
-@pytest.mark.parametrize("generation_config", test_configs)
-@pytest.mark.parametrize("prompt", batched_prompts[1:])  # num_beams=15 diverges on the first prompt.
-@pytest.mark.precommit
-def test_continuous_batching_vs_stateful(prompt, generation_config):
-    model_id, path, tokenizer, model, stateful = read_model((
-        "facebook/opt-125m",
-        Path("opt-125m")
-    ))
-    cb = get_continuous_batching(path)
-    generated = cb.generate(prompt, **generation_config)
-    reference = stateful.generate(prompt, **generation_config)
-    assert generated.texts == reference.texts
-    if 1 != generation_config.get("num_return_sequences", 1):
-        # Stateful puts zeroes to generated.scores. Don't compare them.
-        for gen, ref in zip(generated.scores, reference.scores):
-            assert math.isclose(gen, ref, abs_tol=0.0003)
-
-
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.precommit
-def test_cb_streamer_vs_return_vs_stateful(prompt):
-    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((
-        "facebook/opt-125m",
-        Path("opt-125m")
-    ))
-    cb_pipe = get_continuous_batching(path)
-    streamed = []
-    generated = cb_pipe.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword))
-    reference = ov_pipe.generate(prompt, max_new_tokens=20)
-    assert generated == "".join(streamed)
-    assert "".join(streamed) == reference
-
+#
+# Perf metrics
+#
 
 def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics:
     model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
@@ -582,12 +599,13 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st
     config = generation_config.copy()  # to avoid side effects
 
     if 'do_sample' not in config:
-        # Some HF models have default do_sample = True, and if we set beam search generation config 
+        # Some HF models have default do_sample = True, and if we set beam search generation config
         # it conflicts with `diversity_penalty` and/or `num_beam_groups`.
         # Need to set explicitly to False, but only if test arguments omitted this arg.
         # Do not apply 'repetition_penalty' if sampling is not used.
         config['do_sample'] = False
         config['repetition_penalty'] = 1.0 # 1.0 means no penalty
+
     return ov_pipe.generate([prompt], **config).perf_metrics
 
 
@@ -598,20 +616,21 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
+@pytest.mark.skip(reason="load_time + mean_gen_duration < total_time fails in https://github.com/openvinotoolkit/openvino.genai/actions/runs/12503590506/job/34884840100?pr=1440.")
 def test_perf_metrics(model_descr, generation_config, prompt):
     import time
     start_time = time.perf_counter()
     perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt)
     total_time = (time.perf_counter() - start_time) * 1000
-    
+
     # Check that load time is adequate.
     load_time = perf_metrics.get_load_time()
-    assert load_time > 0 and load_time < 1000.0  
-    
+    assert load_time > 0 and load_time < 1000.0
+
     # Check that num input and generated tokens are adequate.
     num_generated_tokens = perf_metrics.get_num_generated_tokens()
-    assert num_generated_tokens > 0 and num_generated_tokens <= generation_config['max_new_tokens']  
-    
+    assert num_generated_tokens > 0 and num_generated_tokens <= generation_config['max_new_tokens']
+
     num_input_tokens = perf_metrics.get_num_input_tokens()
     assert num_input_tokens > 0 and num_input_tokens <= len(prompt)
 
@@ -622,7 +641,7 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     raw_metrics = perf_metrics.raw_metrics
     durations = np.array(raw_metrics.m_durations) / 1000
     # Check that prefill is not included in durations for TPOT calculation.
-    # For the very long prompt prefill is slow and TTFT is much larger than any other token genration duration.
+    # For the very long prompt prefill is slow and TTFT is much larger than any other token generation duration.
     assert np.all(mean_ttft > durations * 2)
 
     mean_tpot, std_tpot = perf_metrics.get_tpot()
@@ -632,7 +651,7 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     mean_throughput, std_throughput = perf_metrics.get_throughput()
     assert (mean_throughput, std_throughput) == (perf_metrics.get_throughput().mean, perf_metrics.get_throughput().std)
     assert mean_throughput > 0 and mean_throughput < 20000.0
-    
+
     mean_gen_duration, std_gen_duration = perf_metrics.get_generate_duration()
     assert (mean_gen_duration, std_gen_duration) == (perf_metrics.get_generate_duration().mean, perf_metrics.get_generate_duration().std)
     assert mean_gen_duration > 0 and load_time + mean_gen_duration < total_time
@@ -647,7 +666,7 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     assert (mean_detok_duration, std_detok_duration) == (perf_metrics.get_detokenization_duration().mean, perf_metrics.get_detokenization_duration().std)
     assert mean_detok_duration > 0 and mean_detok_duration < mean_gen_duration
     assert std_detok_duration == 0
-    
+
     # assert that calculating statistics manually from the raw counters we get the same restults as from PerfMetrics
     assert np.allclose(mean_tpot, np.mean(durations))
     assert np.allclose(std_tpot, np.std(durations))
@@ -668,15 +687,11 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     assert len(raw_metrics.m_batch_sizes) > 0
     assert len(raw_metrics.m_durations) > 0
 
+#
+# Misc
+#
 
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_batch_switch():
-    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    ov_pipe.generate(["a"], max_new_tokens=2)
-    ov_pipe.generate(["1", "2"], max_new_tokens=2)
-
-
+# TODO: move to test_sampling.py
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_stop_token_ids():
@@ -691,6 +706,7 @@ def test_stop_token_ids():
     assert 9935 in res.tokens[0]
 
 
+# TODO: move to test_sampling.py
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_stop_strings():
@@ -701,3 +717,34 @@ def test_stop_strings():
         stop_strings={"ignored", "боль"}
     )
     assert "боль" not in res
+
+
+# TODO: move this test to test_tokenizer.py
+@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory")
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win")
+def test_left_pad():
+    # test left pad tokenizer post processing implementation
+    prompts = [
+        "The Sun is yellow because",
+        "The Sun is yellow because [force left pad tokens]"
+    ]
+    models = read_model(("microsoft/phi-1_5", Path("phi-1_5/")))
+
+    config = {
+        "max_new_tokens": 20,
+        "num_beam_groups": 2,
+        "num_beams": 2,
+        "num_return_sequences": 2,
+        "do_sample": False,
+        "diversity_penalty": 1.0,
+        # phi 1_5 has no eos_token_id in model configuration
+        # ov genai will detect eos_token_id from tokenizer config
+        # hf implementation doesn't fetch it from tokenizer config and defaults to None
+        # align ov genai and hf by setting eos_token_id explicitly
+        "eos_token_id": 50256,
+    }
+
+    models[2].pad_token = models[2].eos_token
+    run_hf_ov_genai_comparison_batched(models, config, prompts)
diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py
index cad8b0fea0..c3500d15ac 100644
--- a/tests/python_tests/test_llm_pipeline_static.py
+++ b/tests/python_tests/test_llm_pipeline_static.py
@@ -145,7 +145,7 @@ def test_chat_generation(model_descr):
         'What was my first question?'
     ]
 
-    model_path = get_chat_models_lists()[0][1]
+    model_path = get_chat_models_list()[0][1]
 
     chat_history_stateful = generate_chat_history(model_path, "CPU", { }, questions)
     chat_history_static   = generate_chat_history(model_path, "NPU", common_config, questions)
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index fbcce76bf7..25ae9d8afa 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -10,13 +10,13 @@
 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer
 from typing import List, TypedDict
 
-from common import run_test_pipeline, read_models_list, get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \
-    generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \
+from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \
+    get_greedy, get_beam_search, get_multinomial_temperature, \
     get_greedy_with_penalties, get_multinomial_temperature, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \
     get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
-    generate_and_compare_with_reference_text, get_greedy, get_greedy_with_min_and_max_tokens, \
+    get_greedy, get_greedy_with_min_and_max_tokens, \
     get_greedy_with_single_stop_string, get_greedy_with_multiple_stop_strings, get_greedy_with_multiple_stop_strings_no_match, \
     get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \
     get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \
@@ -27,25 +27,9 @@
     run_continuous_batching
 
 
+# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests
 @pytest.mark.precommit
-@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
-def test_sampling_precommit(tmp_path, model_id):
-    run_test_pipeline(tmp_path, model_id)
-
-
-@pytest.mark.nightly
-@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly")))
-def test_sampling_nightly(tmp_path, model_id):
-    run_test_pipeline(tmp_path, model_id)
-
-@pytest.mark.real_models
-@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
-def test_real_models(tmp_path, model_id):
-    run_test_pipeline(tmp_path, model_id)
-
-
-@pytest.mark.precommit
-def test_eos_beam_search(tmp_path):
+def test_beam_search_has_eos_token_at_end(tmp_path):
     '''
     Current test checks that in case of beam search, some generation results
     explicitly have EOS token at the end, which is aligned with HF
@@ -61,8 +45,9 @@ def test_eos_beam_search(tmp_path):
     generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path)
 
 
+# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests
 @pytest.mark.precommit
-def test_eos_greedy(tmp_path):
+def test_greedy_has_eos_token_at_end(tmp_path):
     '''
     Current test checks that in case of gready, some generation results
     explicitly have EOS token at the end, which is aligned with HF:
@@ -76,55 +61,44 @@ def test_eos_greedy(tmp_path):
     scheduler_config = get_scheduler_config()
     generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path)
 
+
+# TODO: consider removing all these functions with generation configs and use Dict with properties, which can be converted to generation config
 @pytest.mark.precommit
-@pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(),
-                                               get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), 
-                                               get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(),
-                                               get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(),
-                                               get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ],
-        ids=[
-            "greedy",
-            "greedy_with_min_and_max_tokens",
-            "greedy_with_repetition_penalty",
-            "greedy_with_single_stop_string",
-            "greedy_with_multiple_stop_strings",
-            "greedy_with_multiple_stop_strings_no_match",
-            "beam",
-            "beam_search_min_and_max_tokens",
-            "beam_search_with_multiple_stop_strings_no_match",
-            "get_greedy_stop_strings_exclude_from_output",
-            "get_greedy_stop_strings_include_to_output",
-            "get_greedy_n_stop_strings_exclude_from_output",
-            "get_greedy_n_stop_strings_include_to_output"
-            ])
-def test_individual_generation_configs_deterministic(tmp_path, generation_config):
-    prompts = [
-            "What is OpenVINO?",
-            ]
+@pytest.mark.parametrize("generation_config",
+                         [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(),
+                          get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(),
+                          get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(),
+                          get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(),
+                          get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output()],
+                         ids=["greedy", "greedy_with_min_and_max_tokens", "greedy_with_repetition_penalty", "greedy_with_single_stop_string",
+                              "greedy_with_multiple_stop_strings", "greedy_with_multiple_stop_strings_no_match", "beam_search", "beam_search_min_and_max_tokens",
+                              "beam_search_with_multiple_stop_strings_no_match", "greedy_stop_strings_exclude_from_output", "greedy_stop_strings_include_to_output",
+                              "greedy_n_stop_strings_exclude_from_output", "greedy_n_stop_strings_include_to_output"])
+def test_sampling_against_optimum(tmp_path, generation_config):
+    prompts = [ "What is OpenVINO?" ]
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
     generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path)
 
+
 @pytest.mark.precommit
 @pytest.mark.xfail(
     raises=AssertionError,
     reason="Stop strings do not seem to work as expected with beam search in HF, so comparison will fail. If it changes, these cases shall be merged to the test above.",
     strict=True,
 )
-@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings(),],
-        ids=[
-            "beam_search_with_single_stop_string",
-            "beam_search_with_multiple_stop_strings",
-            ])
+@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings()],
+                         ids=["beam_search_with_single_stop_string", "beam_search_with_multiple_stop_strings"])
 def test_beam_search_with_stop_string(tmp_path, generation_config):
-    prompts = [
-            "What is OpenVINO?",
-            ]
+    prompts = [ "What is OpenVINO?" ]
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
     generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path)
 
 
+# TODO: remove platform specific reference texts once CVS-159912 is done and use comparison with HF
+# and merge this tests with 'test_sampling_against_optimum' by extending a list of generation configs
+
 class PlatformsRefTexts(TypedDict, total=False):
     linux: List[List[str]]
     win32: List[List[str]]
@@ -306,7 +280,7 @@ class RandomSamplingTestStruct:
              "multinomial_temperature_and_frequence_penalty",
              "greedy_with_penalties",
              "multinomial_max_and_min_token"])
-def test_individual_generation_configs_random(tmp_path, test_struct: RandomSamplingTestStruct):
+def test_multinomial_sampling_against_reference(tmp_path, test_struct: RandomSamplingTestStruct):
     generation_config = test_struct.generation_config
 
     prompts = test_struct.prompts
@@ -326,9 +300,10 @@ def test_individual_generation_configs_random(tmp_path, test_struct: RandomSampl
 
 
 @pytest.mark.precommit
-@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters])
+@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters],
+                         ids=["greedy", "beam_search", "multinomial_all_parameters"])
 @pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256])
-def test_echo_without_completion(tmp_path, get_generation_config, max_num_batched_tokens):
+def test_echo_prompt_phase_only(tmp_path, get_generation_config, max_num_batched_tokens):
     generation_config = get_generation_config()
     generation_config.max_new_tokens = 0
     generation_config.echo = True
@@ -337,14 +312,14 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche
     scheduler_config.max_num_batched_tokens = max_num_batched_tokens
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
+    opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     model_path : Path = tmp_path / model_id
-    save_ov_model_from_optimum(model, hf_tokenizer, model_path)
+    save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path)
 
-    pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
+    cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
 
-    outputs = pipe.generate(["What is OpenVINO?"], generation_configs)
+    outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs)
     assert(len(outputs))
     for output in outputs:
         assert(len(output.m_generation_ids))
@@ -353,9 +328,10 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche
 
 
 @pytest.mark.precommit
-@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters])
+@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters],
+                         ids=["greedy", "beam_search", "multinomial_all_parameters"])
 @pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256])
-def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_tokens):
+def test_echo_with_generation_phase(tmp_path, get_generation_config, max_num_batched_tokens):
     generation_config = get_generation_config()
     generation_config.max_new_tokens = 10
     generation_config.echo = True
@@ -364,45 +340,17 @@ def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_t
     scheduler_config.max_num_batched_tokens = max_num_batched_tokens
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
+    opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     model_path : Path = tmp_path / model_id
-    save_ov_model_from_optimum(model, hf_tokenizer, model_path)
-
-    pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
+    save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path)
 
-    outputs = pipe.generate(["What is OpenVINO?"], generation_configs)
+    cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
+    outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs)
     assert(len(outputs))
+
     for output in outputs:
         assert(len(output.m_generation_ids))
         for sequence in output.m_generation_ids:
             assert(sequence.startswith("What is OpenVINO?"))
             assert(len(sequence) > len("What is OpenVINO?"))
-
-
-@pytest.mark.precommit
-@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()])
-def test_post_oom_health(tmp_path, sampling_config):
-    generation_config = sampling_config
-    generation_config.ignore_eos = True
-    generation_config.max_new_tokens = 1000000
-
-    scheduler_config = get_scheduler_config()
-    # Low cache size to trigger OOM quickly
-    scheduler_config.num_kv_blocks = 10
-    generation_configs = [generation_config]
-    model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
-
-    models_path : Path = tmp_path / model_id
-    save_ov_model_from_optimum(model, hf_tokenizer, models_path)
-
-    pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")
-    # First run should return incomplete response
-    output = pipe.generate(["What is OpenVINO?"], generation_configs)
-    assert (len(output))
-    assert(len(output[0].m_generation_ids))
-    # Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM
-    output = pipe.generate(["What is OpenVINO?"], generation_configs)
-    assert (len(output))
-    assert(len(output[0].m_generation_ids))
diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_pipeline.py
similarity index 100%
rename from tests/python_tests/test_vlm_api.py
rename to tests/python_tests/test_vlm_pipeline.py
diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_pipeline.py
similarity index 100%
rename from tests/python_tests/test_whisper_generate_api.py
rename to tests/python_tests/test_whisper_pipeline.py

From 842c99edb567a701c289677a34a3af87553054e0 Mon Sep 17 00:00:00 2001
From: Mang Guo <mang.guo@intel.com>
Date: Fri, 27 Dec 2024 14:36:19 +0800
Subject: [PATCH 062/110] Support unfixed kv heads number (#1416)

Fix decilm-7b-instruct benchmark test failure. The number heads per
layer is not fixed in decilm-7b-instruct model, current code can not
handle such case. JIRA ticket CVS-157864.

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 src/cpp/src/cache_manager.hpp                 | 41 ++++++-------
 src/cpp/src/device_config.hpp                 | 61 ++++++++++++-------
 .../utils/paged_attention_transformations.cpp | 20 +++---
 tests/cpp/cache_manager.cpp                   | 13 ++--
 tests/cpp/device_config.cpp                   |  2 +-
 tests/cpp/scheduler.cpp                       |  2 +-
 6 files changed, 84 insertions(+), 55 deletions(-)

diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp
index 0c04823f4f..20d4c0c51c 100644
--- a/src/cpp/src/cache_manager.hpp
+++ b/src/cpp/src/cache_manager.hpp
@@ -46,8 +46,6 @@ class CacheManager {
         }
         OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size());
         m_num_allocated_kv_blocks = num_kv_blocks;
-        ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks);
-        ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks);
 
         const std::string device_name = m_device_config.get_device();
 
@@ -56,6 +54,8 @@ class CacheManager {
 
         if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches
             for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
+                ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks);
+                ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks);
                 ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape);
                 ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape);
 
@@ -104,6 +104,8 @@ class CacheManager {
         } else {
             auto remote_context = m_core.get_default_context(device_name);
             for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
+                ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks);
+                ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks);
                 ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
                                                                     key_cache_shape);
                 ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
@@ -142,30 +144,27 @@ class CacheManager {
     }
 
     void copy_blocks(const std::map<size_t, std::list<size_t>>& block_copy_map) {
-        ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), m_num_allocated_kv_blocks);
-        ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), m_num_allocated_kv_blocks);
-
-        ov::Coordinate key_src_start_roi(key_shape.size(), 0);
-        ov::Coordinate key_src_end_roi = key_shape;
-        ov::Coordinate key_dst_start_roi(key_shape.size(), 0);
-        ov::Coordinate key_dst_end_roi = key_shape;
-
-        ov::Coordinate value_src_start_roi(value_shape.size(), 0);
-        ov::Coordinate value_src_end_roi = value_shape;
-        ov::Coordinate value_dst_start_roi(value_shape.size(), 0);
-        ov::Coordinate value_dst_end_roi = value_shape;
-
         for (const auto & blocks_pair : block_copy_map) {
             size_t src_block_id = blocks_pair.first;
-            key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1;
-            value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1;
-
             const std::list<size_t>& dst_block_ids = blocks_pair.second;
             for (size_t dst_block_id : dst_block_ids) {
-                key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1;
-                value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1;
-
                 for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
+                    ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks);
+                    ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks);
+                    ov::Coordinate key_src_start_roi(key_shape.size(), 0);
+                    ov::Coordinate key_src_end_roi = key_shape;
+                    ov::Coordinate key_dst_start_roi(key_shape.size(), 0);
+                    ov::Coordinate key_dst_end_roi = key_shape;
+            
+                    ov::Coordinate value_src_start_roi(value_shape.size(), 0);
+                    ov::Coordinate value_src_end_roi = value_shape;
+                    ov::Coordinate value_dst_start_roi(value_shape.size(), 0);
+                    ov::Coordinate value_dst_end_roi = value_shape;
+                    key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1;
+                    value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1;
+                    key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1;
+                    value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1;
+
                     ov::Tensor key_src_cache_roi(m_key_cache[decoder_layer_id], key_src_start_roi, key_src_end_roi);
                     ov::Tensor key_dst_cache_roi(m_key_cache[decoder_layer_id], key_dst_start_roi, key_dst_end_roi);
 
diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index 371142701c..cc2e21b9a1 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -12,8 +12,9 @@
 namespace ov::genai {
 class DeviceConfig {
     ov::element::Type m_kv_cache_type;
-    ov::PartialShape m_key_cache_shape, m_value_cache_shape;
-    ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers;
+    std::vector<ov::PartialShape> m_key_cache_shape, m_value_cache_shape;
+    std::vector<ov::Shape::value_type> m_num_kv_heads;
+    ov::Shape::value_type m_head_size, m_num_decoder_layers;
     size_t m_num_kv_blocks = 0;
     size_t m_block_size = 0;
     size_t m_cache_size = 0;
@@ -88,11 +89,14 @@ class DeviceConfig {
         }
     }
 
-    void set_model_params(size_t num_kv_heads, size_t head_size, size_t num_decoder_layers) {
-        m_num_kv_heads = num_kv_heads;
+    void set_model_params(std::vector<size_t> num_kv_heads, size_t head_size, size_t num_decoder_layers) {
         m_head_size = head_size;
         m_num_decoder_layers = num_decoder_layers;
 
+        m_num_kv_heads.assign(num_kv_heads.begin(), num_kv_heads.end());
+        m_key_cache_shape.reserve(m_num_decoder_layers);
+        m_value_cache_shape.reserve(m_num_decoder_layers);
+
         if (m_device == "CPU") {
             // Scale, zero point and quantized data will be stored together.
             // The layout for per token per head:
@@ -104,21 +108,32 @@ class DeviceConfig {
         }
 
         if (m_num_kv_blocks == 0 && m_cache_size > 0) {
+            size_t block_size = 0;
             size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024;
-            m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size());
+            for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
+                block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * m_kv_cache_type.size();
+            }
+            m_num_kv_blocks = size_in_bytes / block_size;
         }
 
-        m_key_cache_shape = m_value_cache_shape = ov::PartialShape{ov::Dimension::dynamic(),
-                                                                   ov::Dimension(m_num_kv_heads),
-                                                                   ov::Dimension(m_block_size),
-                                                                   ov::Dimension(m_head_size)};
-
-        if (m_device.find("GPU") != std::string::npos) {
-            // Update key shape, as the key's shape is different from the value's shape
-            m_key_cache_shape = ov::PartialShape{ov::Dimension::dynamic(),
-                                                 ov::Dimension(m_num_kv_heads),
-                                                 ov::Dimension(m_head_size),
-                                                 ov::Dimension(m_block_size)};
+        for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
+            m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
+                                                         ov::Dimension(m_num_kv_heads[layer_id]),
+                                                         ov::Dimension(m_block_size),
+                                                         ov::Dimension(m_head_size)});
+
+            m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
+                                                           ov::Dimension(m_num_kv_heads[layer_id]),
+                                                           ov::Dimension(m_block_size),
+                                                           ov::Dimension(m_head_size)});
+
+            if (m_device.find("GPU") != std::string::npos) {
+                // Update key shape, as the key's shape is different from the value's shape
+                m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
+                                                     ov::Dimension(m_num_kv_heads[layer_id]),
+                                                     ov::Dimension(m_head_size),
+                                                     ov::Dimension(m_block_size)});
+            }
         }
     }
 
@@ -134,14 +149,14 @@ class DeviceConfig {
         return m_num_decoder_layers;
     }
 
-    ov::PartialShape get_key_cache_shape() const {
+    ov::PartialShape get_key_cache_shape(size_t id) const {
         OPENVINO_ASSERT(m_key_cache_shape.size());
-        return m_key_cache_shape;
+        return m_key_cache_shape[id];
     }
 
-    ov::PartialShape get_value_cache_shape() const {
+    ov::PartialShape get_value_cache_shape(size_t id) const {
         OPENVINO_ASSERT(m_value_cache_shape.size());
-        return m_value_cache_shape;
+        return m_value_cache_shape[id];
     }
 
     size_t get_num_kv_blocks() const {
@@ -153,7 +168,11 @@ class DeviceConfig {
     }
 
     size_t get_block_size_in_bytes() const {
-        return m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * get_cache_precision().size();
+        size_t block_size = 0;
+        for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
+            block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * get_cache_precision().size();
+        }
+        return block_size;
     }
 };
 }
diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp
index 4dedcf989a..f564be8f19 100644
--- a/src/cpp/src/utils/paged_attention_transformations.cpp
+++ b/src/cpp/src/utils/paged_attention_transformations.cpp
@@ -53,15 +53,21 @@ void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig&
     OPENVINO_ASSERT(key_cache_params.count(key_cache_param_name) != 0, "key_cache.0 tensor not found among model parameters");
     ov::PartialShape k_shape = key_cache_params[key_cache_param_name]->get_partial_shape();
     OPENVINO_ASSERT(k_shape.rank().get_length() == 3, "KV cache shape is expected to have rank 3, while shape is ", k_shape);
-    size_t num_kv_heads = k_shape[1].get_length(), head_size = k_shape[2].get_length();
-
+    size_t head_size = k_shape[2].get_length();
+    std::vector<size_t> num_kv_heads(num_layers);
+    for (size_t idx = 0; idx < num_layers; idx++) {
+        size_t num_heads = key_cache_params[std::string("key_cache.") + std::to_string(idx)]->get_partial_shape()[1].get_length();
+        num_kv_heads[idx] = num_heads;
+    }
     device_config.set_model_params(num_kv_heads, head_size, num_layers);
 
-    for (auto it_k = key_cache_params.begin(), it_v = value_cache_params.begin(); it_k != key_cache_params.end();++it_k, ++it_v) {
-        it_k->second->set_element_type(device_config.get_cache_precision());
-        it_v->second->set_element_type(device_config.get_cache_precision());
-        it_k->second->set_partial_shape(device_config.get_key_cache_shape());
-        it_v->second->set_partial_shape(device_config.get_value_cache_shape());
+    for (size_t idx = 0; idx < num_layers; idx++) {
+        auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)];
+        auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)];
+        k->set_element_type(device_config.get_cache_precision());
+        v->set_element_type(device_config.get_cache_precision());
+        k->set_partial_shape(device_config.get_key_cache_shape(idx));
+        v->set_partial_shape(device_config.get_value_cache_shape(idx));
     }
 
     model->validate_nodes_and_infer_types();
diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp
index 7f07980389..5dc848aba5 100644
--- a/tests/cpp/cache_manager.cpp
+++ b/tests/cpp/cache_manager.cpp
@@ -54,7 +54,8 @@ TEST(TestCacheManager, test_cache_size_param) {
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
     size_t num_decoder_layers = 12;
-    device_config.set_model_params(12, 64, num_decoder_layers);
+    std::vector<size_t> num_kv_heads(12, 12);
+    device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
 
     ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
@@ -76,7 +77,8 @@ TEST(TestCacheManager, test_kv_blocks_param) {
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
     size_t num_decoder_layers = 12;
-    device_config.set_model_params(12, 64, num_decoder_layers);
+    std::vector<size_t> num_kv_heads(12, 12);
+    device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
 
     ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
@@ -97,9 +99,12 @@ TEST(TestCacheManager, test_dynamic_cache_increase) {
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
     size_t num_decoder_layers = 12;
     size_t head_size = 64;
-    size_t num_kv_heads = 12;
+    std::vector<size_t> num_kv_heads(12, 12);
     device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers);
-    size_t block_size_in_bytes = num_decoder_layers * 2 * num_kv_heads * device_config.get_block_size() * head_size * device_config.get_cache_precision().size();
+    size_t block_size_in_bytes = 0;
+    for (size_t layer_id = 0; layer_id < num_decoder_layers; layer_id++) {
+        block_size_in_bytes += 2 * num_kv_heads[layer_id] * device_config.get_block_size() * head_size * device_config.get_cache_precision().size();
+    }
 
 
     ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
diff --git a/tests/cpp/device_config.cpp b/tests/cpp/device_config.cpp
index 0d7435818f..973648f637 100644
--- a/tests/cpp/device_config.cpp
+++ b/tests/cpp/device_config.cpp
@@ -18,7 +18,7 @@ TEST(TestDeviceConfig, kv_cache_precision_u8) {
     const std::string device = "CPU";
     size_t num_decoder_layers = 12;
     size_t head_size = 64, head_size_u8 = head_size + 8;
-    size_t num_kv_heads = 12;
+    std::vector<size_t> num_kv_heads(12, 12);
 
     ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU");
     device_config_default.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp
index ea1720faa2..cc0b53a433 100644
--- a/tests/cpp/scheduler.cpp
+++ b/tests/cpp/scheduler.cpp
@@ -44,7 +44,7 @@ std::shared_ptr<CacheManager> init_cache_manager(SchedulerConfig scheduler_confi
     size_t num_decoder_layers = 12;
     ov::InferRequest request = core.compile_model(get_model(num_decoder_layers)).create_infer_request();
     size_t head_size = 64, head_size_u8 = head_size + 8;
-    size_t num_kv_heads = 12;
+    std::vector<size_t> num_kv_heads(12, 12);
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
     device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
     return std::make_shared<CacheManager>(device_config, request, core);  

From c9d63b253a8069cf67d3ff6fa1c93b90eae9511c Mon Sep 17 00:00:00 2001
From: Alexander Kozlov <kozzzloff@list.ru>
Date: Fri, 27 Dec 2024 13:15:06 +0300
Subject: [PATCH 063/110] [WWB]: Add ImageText-to-Image pipeline validation
 (#1373)

CVS-159223

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .../tests/test_cli_image.py                   |  24 +-
 .../whowhatbench/__init__.py                  |   2 +
 .../whowhatbench/image2image.py               | 129 ++++++++
 .../whowhatbench/model_loaders.py             | 252 ++++++++++++++++
 .../whowhatbench/text2image_evaluator.py      |  17 +-
 tools/who_what_benchmark/whowhatbench/wwb.py  | 278 ++++--------------
 6 files changed, 464 insertions(+), 238 deletions(-)
 create mode 100644 tools/who_what_benchmark/whowhatbench/image2image.py
 create mode 100644 tools/who_what_benchmark/whowhatbench/model_loaders.py

diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
index b2c2015f80..536d015612 100644
--- a/tools/who_what_benchmark/tests/test_cli_image.py
+++ b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -20,6 +20,8 @@ def run_wwb(args):
 @pytest.mark.parametrize(
     ("model_id", "model_type", "backend"),
     [
+        ("hf-internal-testing/tiny-stable-diffusion-torch", "image-to-image", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-to-image", "hf"),
         ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"),
         ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"),
         ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"),
@@ -40,6 +42,8 @@ def test_image_model_types(model_id, model_type, backend):
         "CPU",
         "--model-type",
         model_type,
+        "--num-inference-steps",
+        "2",
     ]
     if backend == "hf":
         wwb_args.append("--hf")
@@ -65,7 +69,8 @@ def test_image_model_types(model_id, model_type, backend):
 @pytest.mark.parametrize(
     ("model_id", "model_type"),
     [
-        ("echarlaix/tiny-random-stable-diffusion-xl", "text-to-image"),
+        ("OpenVINO/LCM_Dreamshaper_v7-int8-ov", "image-to-image"),
+        ("OpenVINO/LCM_Dreamshaper_v7-int8-ov", "text-to-image"),
     ],
 )
 def test_image_model_genai(model_id, model_type):
@@ -73,15 +78,15 @@ def test_image_model_genai(model_id, model_type):
         GT_FILE = os.path.join(temp_dir, "gt.csv")
         MODEL_PATH = os.path.join(temp_dir, model_id.replace("/", "--"))
 
-        result = subprocess.run(["optimum-cli", "export",
-                                 "openvino", "-m", model_id,
+        result = subprocess.run(["huggingface-cli", "download",
+                                 model_id, "--local-dir",
                                  MODEL_PATH],
                                 capture_output=True, text=True)
         assert result.returncode == 0
 
         wwb_args = [
             "--base-model",
-            MODEL_PATH,
+            model_id,
             "--num-samples",
             "1",
             "--gt-data",
@@ -90,6 +95,8 @@ def test_image_model_genai(model_id, model_type):
             "CPU",
             "--model-type",
             model_type,
+            "--num-inference-steps",
+            "2",
         ]
         result = run_wwb(wwb_args)
         assert result.returncode == 0
@@ -108,6 +115,8 @@ def test_image_model_genai(model_id, model_type):
             "--model-type",
             model_type,
             "--genai",
+            "--num-inference-steps",
+            "2",
         ]
         result = run_wwb(wwb_args)
 
@@ -131,6 +140,9 @@ def test_image_model_genai(model_id, model_type):
             model_type,
             "--output",
             output_dir,
+            "--genai",
+            "--num-inference-steps",
+            "2",
         ]
         result = run_wwb(wwb_args)
         assert result.returncode == 0
@@ -149,6 +161,8 @@ def test_image_model_genai(model_id, model_type):
             "CPU",
             "--model-type",
             model_type,
+            "--num-inference-steps",
+            "2",
         ]
         result = run_wwb(wwb_args)
         assert result.returncode == 0
@@ -182,6 +196,8 @@ def test_image_custom_dataset(model_id, model_type, backend):
         "google-research-datasets/conceptual_captions",
         "--dataset-field",
         "caption",
+        "--num-inference-steps",
+        "2",
     ]
     if backend == "hf":
         wwb_args.append("--hf")
diff --git a/tools/who_what_benchmark/whowhatbench/__init__.py b/tools/who_what_benchmark/whowhatbench/__init__.py
index 278db2c6a1..f608601ec8 100644
--- a/tools/who_what_benchmark/whowhatbench/__init__.py
+++ b/tools/who_what_benchmark/whowhatbench/__init__.py
@@ -3,6 +3,7 @@
 from .text_evaluator import TextEvaluator as Evaluator
 from .text2image_evaluator import Text2ImageEvaluator
 from .visualtext_evaluator import VisualTextEvaluator
+from .image2image import Image2ImageEvaluator
 
 
 __all__ = [
@@ -11,5 +12,6 @@
     "TextEvaluator",
     "Text2ImageEvaluator",
     "VisualTextEvaluator",
+    "Image2ImageEvaluator",
     "EVALUATOR_REGISTRY",
 ]
diff --git a/tools/who_what_benchmark/whowhatbench/image2image.py b/tools/who_what_benchmark/whowhatbench/image2image.py
new file mode 100644
index 0000000000..90eb6c7c87
--- /dev/null
+++ b/tools/who_what_benchmark/whowhatbench/image2image.py
@@ -0,0 +1,129 @@
+import os
+from typing import Any, Union
+
+import datasets
+import pandas as pd
+from tqdm import tqdm
+from transformers import set_seed
+import torch
+import openvino_genai
+
+from .registry import register_evaluator
+from .text2image_evaluator import Text2ImageEvaluator
+
+from .whowhat_metrics import ImageSimilarity
+
+
+def preprocess_fn(example):
+    return {
+        "prompts": example["Instruction_VLM-LLM"],
+        "images": example["source_img"],
+    }
+
+
+def prepare_default_data(num_samples=None):
+    DATASET_NAME = "paint-by-inpaint/PIPE"
+    NUM_SAMPLES = 10 if num_samples is None else num_samples
+    set_seed(42)
+    default_dataset = datasets.load_dataset(
+        DATASET_NAME, split="test", streaming=True
+    ).filter(lambda example: example["Instruction_VLM-LLM"] != "").take(NUM_SAMPLES)
+    return default_dataset.map(
+        lambda x: preprocess_fn(x), remove_columns=default_dataset.column_names
+    )
+
+
+@register_evaluator("image-to-image")
+class Image2ImageEvaluator(Text2ImageEvaluator):
+    def __init__(
+        self,
+        base_model: Any = None,
+        gt_data: str = None,
+        test_data: Union[str, list] = None,
+        metrics="similarity",
+        similarity_model_id: str = "openai/clip-vit-large-patch14",
+        num_inference_steps=4,
+        crop_prompts=True,
+        num_samples=None,
+        gen_image_fn=None,
+        seed=42,
+        is_genai=False,
+    ) -> None:
+        assert (
+            base_model is not None or gt_data is not None
+        ), "Text generation pipeline for evaluation or ground trush data must be defined"
+
+        self.test_data = test_data
+        self.metrics = metrics
+        self.crop_prompt = crop_prompts
+        self.num_samples = num_samples
+        self.num_inference_steps = num_inference_steps
+        self.seed = seed
+        self.similarity = None
+        self.similarity = ImageSimilarity(similarity_model_id)
+        self.last_cmp = None
+        self.gt_dir = os.path.dirname(gt_data)
+        self.generation_fn = gen_image_fn
+        self.is_genai = is_genai
+        self.resolution = None
+
+        if base_model:
+            self.gt_data = self._generate_data(
+                base_model, gen_image_fn, os.path.join(self.gt_dir, "reference")
+            )
+        else:
+            self.gt_data = pd.read_csv(gt_data, keep_default_na=False)
+
+    def _generate_data(self, model, gen_image_fn=None, image_dir="reference"):
+        def default_gen_image_fn(model, prompt, image, num_inference_steps, generator=None):
+            with torch.no_grad():
+                output = model(
+                    prompt,
+                    image=image,
+                    num_inference_steps=num_inference_steps,
+                    output_type="pil",
+                    strength=0.8,
+                    generator=generator,
+                )
+            return output.images[0]
+
+        generation_fn = gen_image_fn or default_gen_image_fn
+
+        if self.test_data:
+            if isinstance(self.test_data, str):
+                data = pd.read_csv(self.test_data)
+            else:
+                if isinstance(self.test_data, dict):
+                    assert "prompts" in self.test_data
+                    assert "images" in self.test_data
+                    data = dict(self.test_data)
+                data = pd.DataFrame.from_dict(data)
+        else:
+            data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+
+        prompts = data["prompts"]
+        images = data["images"]
+        output_images = []
+        rng = torch.Generator(device="cpu")
+
+        if not os.path.exists(image_dir):
+            os.makedirs(image_dir)
+
+        for i, (prompt, image) in tqdm(enumerate(zip(prompts, images)), desc="Evaluate pipeline"):
+            set_seed(self.seed)
+            rng = rng.manual_seed(self.seed)
+            output = generation_fn(
+                model,
+                prompt,
+                image=image,
+                num_inference_steps=self.num_inference_steps,
+                generator=openvino_genai.TorchGenerator(self.seed) if self.is_genai else rng
+            )
+            image_path = os.path.join(image_dir, f"{i}.png")
+            output.save(image_path)
+            output_images.append(image_path)
+
+        res_data = {"prompts": list(prompts), "images": output_images}
+        df = pd.DataFrame(res_data)
+
+        return df
diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
new file mode 100644
index 0000000000..f54d232bc2
--- /dev/null
+++ b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -0,0 +1,252 @@
+import logging
+import json
+
+from transformers import AutoConfig, AutoModelForCausalLM, AutoModel, AutoModelForVision2Seq
+from diffusers import DiffusionPipeline, AutoPipelineForImage2Image
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class GenAIModelWrapper:
+    """
+    A helper class to store additional attributes for GenAI models
+    """
+
+    def __init__(self, model, model_dir, model_type):
+        self.model = model
+        self.model_type = model_type
+
+        if model_type == "text" or model_type == "visual-text":
+            self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
+        elif model_type == "text-to-image":
+            self.config = DiffusionPipeline.load_config(
+                model_dir, trust_remote_code=True)
+
+    def __getattr__(self, attr):
+        if attr in self.__dict__:
+            return getattr(self, attr)
+        else:
+            return getattr(self.model, attr)
+
+
+def load_text_genai_pipeline(model_dir, device="CPU", ov_config=None):
+    try:
+        import openvino_genai
+    except ImportError:
+        logger.error(
+            "Failed to import openvino_genai package. Please install it.")
+        exit(-1)
+    return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device=device, **ov_config), model_dir, "text")
+
+
+def load_text_model(
+    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+):
+    if use_hf:
+        logger.info("Using HF Transformers API")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id, trust_remote_code=True, device_map=device.lower()
+        )
+        model.eval()
+    elif use_genai:
+        logger.info("Using OpenVINO GenAI API")
+        model = load_text_genai_pipeline(model_id, device, ov_config)
+    else:
+        logger.info("Using Optimum API")
+        from optimum.intel.openvino import OVModelForCausalLM
+        try:
+            model = OVModelForCausalLM.from_pretrained(
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config
+            )
+        except ValueError:
+            config = AutoConfig.from_pretrained(
+                model_id, trust_remote_code=True)
+            model = OVModelForCausalLM.from_pretrained(
+                model_id,
+                config=config,
+                trust_remote_code=True,
+                use_cache=True,
+                device=device,
+                ov_config=ov_config,
+            )
+
+    return model
+
+
+def load_text2image_genai_pipeline(model_dir, device="CPU", ov_config=None):
+    try:
+        import openvino_genai
+    except ImportError:
+        logger.error(
+            "Failed to import openvino_genai package. Please install it.")
+        exit(-1)
+
+    return GenAIModelWrapper(
+        openvino_genai.Text2ImagePipeline(model_dir, device=device, **ov_config),
+        model_dir,
+        "text-to-image"
+    )
+
+
+def load_text2image_model(
+    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+):
+    if use_genai:
+        logger.info("Using OpenvINO GenAI API")
+        model = load_text2image_genai_pipeline(model_id, device, ov_config)
+    elif use_hf:
+        logger.info("Using HF Transformers API")
+        model = DiffusionPipeline.from_pretrained(
+            model_id, trust_remote_code=True)
+    else:
+        logger.info("Using Optimum API")
+        from optimum.intel import OVPipelineForText2Image
+        TEXT2IMAGEPipeline = OVPipelineForText2Image
+
+        try:
+            model = TEXT2IMAGEPipeline.from_pretrained(
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config
+            )
+        except ValueError:
+            config = AutoConfig.from_pretrained(
+                model_id, trust_remote_code=True)
+            model = TEXT2IMAGEPipeline.from_pretrained(
+                model_id,
+                config=config,
+                trust_remote_code=True,
+                use_cache=True,
+                device=device,
+                ov_config=ov_config,
+            )
+
+    return model
+
+
+def load_visual_text_genai_pipeline(model_dir, device="CPU", ov_config=None):
+    try:
+        import openvino_genai
+    except ImportError as e:
+        logger.error("Failed to import openvino_genai package. Please install it. Details:\n", e)
+        exit(-1)
+
+    return GenAIModelWrapper(
+        openvino_genai.VLMPipeline(model_dir, device, **ov_config),
+        model_dir,
+        "visual-text"
+    )
+
+
+def load_visual_text_model(
+    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+):
+    if use_hf:
+        logger.info("Using HF Transformers API")
+        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+        try:
+            model = AutoModelForVision2Seq.from_pretrained(
+                model_id, trust_remote_code=True, device_map=device.lower()
+            )
+        except ValueError:
+            try:
+                model = AutoModel.from_pretrained(
+                    model_id, trust_remote_code=True, device_map=device.lower()
+                )
+            except ValueError:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_id, trust_remote_code=True, device_map=device.lower(), _attn_implementation="eager", use_flash_attention_2=False
+                )
+        model.eval()
+    elif use_genai:
+        logger.info("Using OpenVINO GenAI API")
+        model = load_visual_text_genai_pipeline(model_id, device, ov_config)
+    else:
+        logger.info("Using Optimum API")
+        from optimum.intel.openvino import OVModelForVisualCausalLM
+        try:
+            model = OVModelForVisualCausalLM.from_pretrained(
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config
+            )
+        except ValueError:
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+            model = OVModelForVisualCausalLM.from_pretrained(
+                model_id,
+                config=config,
+                trust_remote_code=True,
+                use_cache=True,
+                device=device,
+                ov_config=ov_config,
+            )
+    return model
+
+
+def load_image2image_genai_pipeline(model_dir, device="CPU", ov_config=None):
+    try:
+        import openvino_genai
+    except ImportError as e:
+        logger.error("Failed to import openvino_genai package. Please install it. Details:\n", e)
+        exit(-1)
+
+    return GenAIModelWrapper(
+        openvino_genai.Image2ImagePipeline(model_dir, device, **ov_config),
+        model_dir,
+        "image-to-image"
+    )
+
+
+def load_imagetext2image_model(
+    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+):
+    if use_hf:
+        logger.info("Using HF Transformers API")
+        model = AutoPipelineForImage2Image.from_pretrained(
+            model_id, trust_remote_code=True
+        )
+    elif use_genai:
+        logger.info("Using OpenVINO GenAI API")
+        model = load_image2image_genai_pipeline(model_id, device, ov_config)
+    else:
+        logger.info("Using Optimum API")
+        from optimum.intel.openvino import OVPipelineForImage2Image
+        try:
+            model = OVPipelineForImage2Image.from_pretrained(
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config
+            )
+        except ValueError:
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+            model = OVPipelineForImage2Image.from_pretrained(
+                model_id,
+                config=config,
+                trust_remote_code=True,
+                use_cache=True,
+                device=device,
+                ov_config=ov_config,
+            )
+    return model
+
+
+def load_model(
+    model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+):
+    if model_id is None:
+        return None
+
+    if ov_config:
+        with open(ov_config) as f:
+            ov_options = json.load(f)
+    else:
+        ov_options = {}
+
+    if model_type == "text":
+        return load_text_model(model_id, device, ov_options, use_hf, use_genai)
+    elif model_type == "text-to-image":
+        return load_text2image_model(
+            model_id, device, ov_options, use_hf, use_genai
+        )
+    elif model_type == "visual-text":
+        return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai)
+    elif model_type == "image-to-image":
+        return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai)
+    else:
+        raise ValueError(f"Unsupported model type: {model_type}")
diff --git a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
index 0cced117e4..e930c48b0a 100644
--- a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
@@ -116,14 +116,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
 
     def _generate_data(self, model, gen_image_fn=None, image_dir="reference"):
         def default_gen_image_fn(model, prompt, num_inference_steps, generator=None):
-            output = model(
-                prompt,
-                num_inference_steps=num_inference_steps,
-                output_type="pil",
-                width=self.resolution[0],
-                height=self.resolution[0],
-                generator=generator,
-            )
+            with torch.no_grad():
+                output = model(
+                    prompt,
+                    num_inference_steps=num_inference_steps,
+                    output_type="pil",
+                    width=self.resolution[0],
+                    height=self.resolution[0],
+                    generator=generator,
+                )
             return output.images[0]
 
         generation_fn = gen_image_fn or default_gen_image_fn
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 04813f5fd8..2ff8c45975 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -1,18 +1,17 @@
 import argparse
 import difflib
 import numpy as np
-import json
 import logging
 import os
 
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModel, AutoModelForVision2Seq
+from transformers import AutoTokenizer, AutoProcessor
 import openvino as ov
 
 import pandas as pd
 from datasets import load_dataset
-from diffusers import DiffusionPipeline
 from PIL import Image
 
+from whowhatbench.model_loaders import load_model
 from whowhatbench import EVALUATOR_REGISTRY
 
 # Configure logging
@@ -20,224 +19,6 @@
 logger = logging.getLogger(__name__)
 
 
-class GenAIModelWrapper:
-    """
-    A helper class to store additional attributes for GenAI models
-    """
-
-    def __init__(self, model, model_dir, model_type):
-        self.model = model
-        self.model_type = model_type
-
-        if model_type == "text" or model_type == "visual-text":
-            self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
-        elif model_type == "text-to-image":
-            self.config = DiffusionPipeline.load_config(
-                model_dir, trust_remote_code=True)
-
-    def __getattr__(self, attr):
-        if attr in self.__dict__:
-            return getattr(self, attr)
-        else:
-            return getattr(self.model, attr)
-
-
-def load_text_genai_pipeline(model_dir, device="CPU", ov_config=None):
-    try:
-        import openvino_genai
-    except ImportError:
-        logger.error(
-            "Failed to import openvino_genai package. Please install it.")
-        exit(-1)
-    return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device=device, **ov_config), model_dir, "text")
-
-
-def load_text_model(
-    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
-):
-    if use_hf:
-        logger.info("Using HF Transformers API")
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, device_map=device.lower()
-        )
-        model.eval()
-    elif use_genai:
-        logger.info("Using OpenVINO GenAI API")
-        model = load_text_genai_pipeline(model_id, device, ov_config)
-    else:
-        logger.info("Using Optimum API")
-        from optimum.intel.openvino import OVModelForCausalLM
-        try:
-            model = OVModelForCausalLM.from_pretrained(
-                model_id, trust_remote_code=True, device=device, ov_config=ov_config
-            )
-        except ValueError:
-            config = AutoConfig.from_pretrained(
-                model_id, trust_remote_code=True)
-            model = OVModelForCausalLM.from_pretrained(
-                model_id,
-                config=config,
-                trust_remote_code=True,
-                use_cache=True,
-                device=device,
-                ov_config=ov_config,
-            )
-
-    return model
-
-
-def load_text2image_genai_pipeline(model_dir, device="CPU", ov_config=None):
-    try:
-        import openvino_genai
-    except ImportError:
-        logger.error(
-            "Failed to import openvino_genai package. Please install it.")
-        exit(-1)
-
-    return GenAIModelWrapper(
-        openvino_genai.Text2ImagePipeline(model_dir, device=device, **ov_config),
-        model_dir,
-        "text-to-image"
-    )
-
-
-def load_text2image_model(
-    model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
-):
-    if use_genai:
-        logger.info("Using OpenvINO GenAI API")
-        model = load_text2image_genai_pipeline(model_id, device, ov_config)
-    elif use_hf:
-        logger.info("Using HF Transformers API")
-        model = DiffusionPipeline.from_pretrained(
-            model_id, trust_remote_code=True)
-    else:
-        logger.info("Using Optimum API")
-        from optimum.intel import OVPipelineForText2Image
-        TEXT2IMAGEPipeline = OVPipelineForText2Image
-
-        try:
-            model = TEXT2IMAGEPipeline.from_pretrained(
-                model_id, trust_remote_code=True, device=device, ov_config=ov_config
-            )
-        except ValueError:
-            config = AutoConfig.from_pretrained(
-                model_id, trust_remote_code=True)
-            model = TEXT2IMAGEPipeline.from_pretrained(
-                model_id,
-                config=config,
-                trust_remote_code=True,
-                use_cache=True,
-                device=device,
-                ov_config=ov_config,
-            )
-
-    return model
-
-
-def load_visual_text_genai_pipeline(model_dir, device="CPU", ov_config=None):
-    try:
-        import openvino_genai
-    except ImportError as e:
-        logger.error("Failed to import openvino_genai package. Please install it. Details:\n", e)
-        exit(-1)
-
-    return GenAIModelWrapper(
-        openvino_genai.VLMPipeline(model_dir, device, **ov_config),
-        model_dir,
-        "visual-text"
-    )
-
-
-def load_visual_text_model(
-    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
-):
-    if use_hf:
-        logger.info("Using HF Transformers API")
-        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
-        try:
-            model = AutoModelForVision2Seq.from_pretrained(
-                model_id, trust_remote_code=True, device_map=device.lower()
-            )
-        except ValueError:
-            try:
-                model = AutoModel.from_pretrained(
-                    model_id, trust_remote_code=True, device_map=device.lower()
-                )
-            except ValueError:
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_id, trust_remote_code=True, device_map=device.lower(), _attn_implementation="eager", use_flash_attention_2=False
-                )
-        model.eval()
-    elif use_genai:
-        logger.info("Using OpenVINO GenAI API")
-        model = load_visual_text_genai_pipeline(model_id, device, ov_config)
-    else:
-        logger.info("Using Optimum API")
-        from optimum.intel.openvino import OVModelForVisualCausalLM
-        try:
-            model = OVModelForVisualCausalLM.from_pretrained(
-                model_id, trust_remote_code=True, device=device, ov_config=ov_config
-            )
-        except ValueError:
-            config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
-            model = OVModelForVisualCausalLM.from_pretrained(
-                model_id,
-                config=config,
-                trust_remote_code=True,
-                use_cache=True,
-                device=device,
-                ov_config=ov_config,
-            )
-    return model
-
-
-def load_model(
-    model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
-):
-    if model_id is None:
-        return None
-
-    if ov_config:
-        with open(ov_config) as f:
-            ov_options = json.load(f)
-    else:
-        ov_options = {}
-
-    if model_type == "text":
-        return load_text_model(model_id, device, ov_options, use_hf, use_genai)
-    elif model_type == "text-to-image":
-        return load_text2image_model(
-            model_type, model_id, device, ov_options, use_hf, use_genai
-        )
-    elif model_type == "visual-text":
-        return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai)
-    else:
-        raise ValueError(f"Unsupported model type: {model_type}")
-
-
-def load_prompts(args):
-    if args.dataset is None:
-        return None
-    split = "validation"
-    if args.split is not None:
-        split = args.split
-    if "," in args.dataset:
-        path_name = args.dataset.split(",")
-        path = path_name[0]
-        name = path_name[1]
-    else:
-        path = args.dataset
-        name = None
-    data = load_dataset(path=path, name=name, split=split)
-
-    res = data[args.dataset_field]
-
-    res = {"prompts": list(res)}
-
-    return res
-
-
 def parse_args():
     parser = argparse.ArgumentParser(
         prog="WWB CLI",
@@ -274,9 +55,10 @@ def parse_args():
     parser.add_argument(
         "--model-type",
         type=str,
-        choices=["text", "text-to-image", "visual-text"],
+        choices=["text", "text-to-image", "visual-text", "image-to-image"],
         default="text",
-        help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation.",
+        help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, "
+        "visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt",
     )
     parser.add_argument(
         "--data-encoder",
@@ -385,6 +167,26 @@ def check_args(args):
             "Wether --target-model, --target-data or --gt-data should be provided")
 
 
+def load_prompts(args):
+    if args.dataset is None:
+        return None
+    split = "validation"
+    if args.split is not None:
+        split = args.split
+    if "," in args.dataset:
+        path_name = args.dataset.split(",")
+        path = path_name[0]
+        name = path_name[1]
+    else:
+        path = args.dataset
+        name = None
+    data = load_dataset(path=path, name=name, split=split)
+
+    res = data[args.dataset_field]
+    res = {"prompts": list(res)}
+    return res
+
+
 def load_tokenizer(args):
     tokenizer = None
     if args.tokenizer is not None:
@@ -449,7 +251,7 @@ def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question):
 
 
 def genai_gen_image(model, prompt, num_inference_steps, generator=None):
-    if model.resolution[0] is not None:
+    if model.resolution is not None and model.resolution[0] is not None:
         image_tensor = model.generate(
             prompt,
             width=model.resolution[0],
@@ -467,8 +269,21 @@ def genai_gen_image(model, prompt, num_inference_steps, generator=None):
     return image
 
 
+def genai_gen_image2image(model, prompt, image, num_inference_steps, generator=None):
+    image_data = ov.Tensor(np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8))
+    image_tensor = model.generate(
+        prompt,
+        image=image_data,
+        num_inference_steps=num_inference_steps,
+        strength=0.8,
+        generator=generator,
+    )
+    image = Image.fromarray(image_tensor.data[0])
+    return image
+
+
 def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_tokens, crop_question):
-    image_data = ov.Tensor(np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.byte))
+    image_data = ov.Tensor(np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8))
     config = model.get_generation_config()
     config.max_new_tokens = max_new_tokens
     config.do_sample = False
@@ -529,6 +344,17 @@ def create_evaluator(base_model, args):
                 gen_answer_fn=genai_gen_visual_text if args.genai else None,
                 processor=processor,
             )
+        elif task == "image-to-image":
+            return EvaluatorCLS(
+                base_model=base_model,
+                gt_data=args.gt_data,
+                test_data=prompts,
+                num_samples=args.num_samples,
+                num_inference_steps=args.num_inference_steps,
+                gen_image_fn=genai_gen_image2image if args.genai else None,
+                is_genai=args.genai,
+                seed=args.seed,
+            )
         else:
             raise ValueError(f"Unsupported task: {task}")
 
@@ -637,7 +463,7 @@ def main():
     if args.verbose and (args.target_model or args.target_data):
         if args.model_type == "text" or args.model_type == "visual-text":
             print_text_results(evaluator)
-        elif "text-to-image" in args.model_type:
+        elif "text-to-image" in args.model_type or "image-to-image" in args.model_type:
             print_image_results(evaluator)
 
 
From b7e354f87bb012676401e72c70e20ca45caa1d6f Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 27 Dec 2024 16:03:35 +0400
Subject: [PATCH 064/110] Bump py-build-cmake from 0.3.3 to 0.3.4 (#1447)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps [py-build-cmake](https://github.com/tttapa/py-build-cmake) from
0.3.3 to 0.3.4.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/tttapa/py-build-cmake/releases">py-build-cmake's
releases</a>.</em></p>
<blockquote>
<h2>0.3.4</h2>
<ul>
<li>Added more <code>PY_BUILD_CMAKE_*</code> variables.</li>
<li>Renamed <code>PY_BUILD_CMAKE_MODULE_NAME</code> →
<code>PY_BUILD_CMAKE_IMPORT_NAME</code>,
<code>PY_BUILD_CMAKE_PACKAGE_NAME</code> →
<code>PY_BUILD_CMAKE_PROJECT_NAME</code>,
<code>PY_BUILD_CMAKE_PACKAGE_VERSION</code> →
<code>PY_BUILD_CMAKE_PROJECT_VERSION</code>
(the old variables are still available for backwards
compatibility).</li>
<li>More robust CMake FindPython hints.</li>
<li>New <em>Variables</em> reference: <a
href="https://tttapa.github.io/py-build-cmake/Variables.html">https://tttapa.github.io/py-build-cmake/Variables.html</a></li>
<li>Simplified minimal example CMakeLists.txt.</li>
<li>Improved documentation.</li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/tttapa/py-build-cmake/compare/0.3.3...0.3.4">https://github.com/tttapa/py-build-cmake/compare/0.3.3...0.3.4</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/tttapa/py-build-cmake/commit/3b3a54fb8647fa14bbb2cf642456055b3eb7caa0"><code>3b3a54f</code></a>
Version 0.3.4</li>
<li><a
href="https://github.com/tttapa/py-build-cmake/commit/db5b643fcb7df4fb0c8a79681383043986bc6e3f"><code>db5b643</code></a>
[Test] Only search for Development.SABIModule when using CPython</li>
<li><a
href="https://github.com/tttapa/py-build-cmake/commit/9ce1ed616799eb23440720970e51d100d4abd776"><code>9ce1ed6</code></a>
[Docs] add Variable documentation page</li>
<li><a
href="https://github.com/tttapa/py-build-cmake/commit/cfc8c55f1897258773a7f20e73a7aae7ed6d11f4"><code>cfc8c55</code></a>
Rename PY_BUILD_CMAKE_MODULE_NAME→PY_BUILD_CMAKE_IMPORT_NAME,
PY_BUILD_CMAKE_...</li>
<li><a
href="https://github.com/tttapa/py-build-cmake/commit/690efa7eb06cb24d3bcc13a85a167ee2972b61aa"><code>690efa7</code></a>
Reference CMake discourse thread about Python_ROOT</li>
<li><a
href="https://github.com/tttapa/py-build-cmake/commit/0eabd9665590787642afc525059cdbf71f1f7e9a"><code>0eabd96</code></a>
Add Python_ROOT CMake FindPython hint</li>
<li><a
href="https://github.com/tttapa/py-build-cmake/commit/dcc58dba1cecc85e2b4ce0412f1f8a78da660569"><code>dcc58db</code></a>
Add sanity check in MultiConfigOption</li>
<li><a
href="https://github.com/tttapa/py-build-cmake/commit/f8558a476f39f74803c21be6611b6edc20ee3117"><code>f8558a4</code></a>
More CMake FindPython hints, specify version, include dir and soabi</li>
<li><a
href="https://github.com/tttapa/py-build-cmake/commit/d1e1cd16e7f8c4dbd96ad560b9f1995e3bfa08fc"><code>d1e1cd1</code></a>
Add more PY_BUILD_CMAKE_* variables in CMake environment</li>
<li><a
href="https://github.com/tttapa/py-build-cmake/commit/5f20a0535f7249600aeb1d7cc9500ee24317b8a5"><code>5f20a05</code></a>
Simplify examples/minima-program, use a single CMakeLists.txt</li>
<li>Additional commits viewable in <a
href="https://github.com/tttapa/py-build-cmake/compare/0.3.3...0.3.4">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=py-build-cmake&package-manager=pip&previous-version=0.3.3&new-version=0.3.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5f952010f2..27318d42ed 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,7 @@ options = {"BUILD_TOKENIZERS" = "OFF"}
 
 [build-system]
 requires = [
-    "py-build-cmake==0.3.3",
+    "py-build-cmake==0.3.4",
     "openvino~=2025.0.0.0.dev",
     "pybind11-stubgen==2.5.1",
     "cmake~=3.23.0"

From ad31314a67105cd6a28d30c7f2c0b1a222265b43 Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@intel.com>
Date: Sat, 28 Dec 2024 03:47:12 +0800
Subject: [PATCH 065/110] Use singleton core for StatefulLLMPipeline (#1449)

Use utils::singleton_core() in LLMStatefulLLMPipeline

ticket: CVS-159945
---
 src/cpp/src/llm_pipeline.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 5e448fe88c..3665c92227 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -72,7 +72,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         const ov::AnyMap& config,
         const ov::genai::GenerationConfig& generation_config
     ) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
-        ov::Core core;
+        ov::Core core = utils::singleton_core();
         ov::CompiledModel compiled_model;
         auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
         utils::slice_matmul_statefull_model(model);

From d88dda924775f735a78eaabb3a4f84b6a05c081f Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@intel.com>
Date: Sat, 28 Dec 2024 03:50:15 +0800
Subject: [PATCH 066/110] Fix typo for slice_matmul_stateful_model
 transformation (#1450)

Fix typo for `slice_matmul_statefull_model` to
`slice_matmul_stateful_model`
---
 src/cpp/src/llm_pipeline.cpp | 2 +-
 src/cpp/src/utils.cpp        | 2 +-
 src/cpp/src/utils.hpp        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 3665c92227..81f411020e 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -75,7 +75,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         ov::Core core = utils::singleton_core();
         ov::CompiledModel compiled_model;
         auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
-        utils::slice_matmul_statefull_model(model);
+        utils::slice_matmul_stateful_model(model);
         m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
 
         if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index be9fc972dc..83dbf15376 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -259,7 +259,7 @@ ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::Token
     return {new_input_ids, new_attention_mask};
 }
 
-void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model) {
+void slice_matmul_stateful_model(std::shared_ptr<ov::Model> model) {
     auto last_node = model->output(0).get_node()->input_value(0).get_node();
     ov::Node* matmul = dynamic_cast<ov::op::v0::MatMul*>(last_node);
     if (matmul) {
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 57225e60ff..6207c889a2 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -106,7 +106,7 @@ std::shared_ptr<ov::Model> read_model_with_config(const std::filesystem::path& m
 
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
 
-void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model);
+void slice_matmul_stateful_model(std::shared_ptr<ov::Model> model);
 
 ov::Core singleton_core();
 

From 6c56a7b857447e5612e44a22a3bdc9624dcd527a Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Sat, 28 Dec 2024 08:27:42 +0400
Subject: [PATCH 067/110] Tests for generation config (#1448)

CVS-159946
---
 .../beam_search_causal_lm.cpp                 |   1 +
 .../beam_search_causal_lm.py                  |   1 +
 .../openvino/genai/generation_config.hpp      |  28 +-
 src/cpp/src/generation_config.cpp             | 240 +++++++++++-------
 src/cpp/src/json_utils.hpp                    |  12 +
 src/cpp/src/llm_pipeline.cpp                  |   5 +-
 .../openvino_genai/py_openvino_genai.pyi      |  26 +-
 .../py_continuous_batching_pipeline.cpp       |   8 +-
 src/python/py_generation_config.cpp           |   8 +-
 src/python/py_image_generation_pipelines.cpp  |  14 +-
 src/python/py_llm_pipeline.cpp                |   9 +-
 src/python/py_utils.cpp                       |   5 +-
 src/python/py_vlm_pipeline.cpp                |   6 +-
 src/python/py_whisper_pipeline.cpp            |  12 +-
 tests/cpp/CMakeLists.txt                      |   4 +-
 tests/cpp/generate_config.cpp                 | 143 -----------
 tests/python_tests/common.py                  |   9 +-
 tests/python_tests/ov_genai_test_utils.py     |  15 +-
 .../python_tests/test_continuous_batching.py  |  26 +-
 tests/python_tests/test_generation_config.py  | 142 +++++++++++
 tests/python_tests/test_kv_cache_eviction.py  |   2 +-
 tests/python_tests/test_llm_pipeline.py       |  72 ++----
 tests/python_tests/test_tokenizer.py          |  17 +-
 .../continuous_batching_benchmark.cpp         |   5 -
 24 files changed, 450 insertions(+), 360 deletions(-)
 delete mode 100644 tests/cpp/generate_config.cpp
 create mode 100644 tests/python_tests/test_generation_config.py

diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp
index 236b31b351..fc18fa8e0c 100644
--- a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp
+++ b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp
@@ -17,6 +17,7 @@ int main(int argc, char* argv[]) try {
     config.max_new_tokens = 20;
     config.num_beam_groups = 3;
     config.num_beams = 15;
+    config.diversity_penalty = 1.0f;
     config.num_return_sequences = config.num_beams;
        
     // Since the streamer is set, the results will
diff --git a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py
index 16b8b76175..4e2430a47f 100755
--- a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py
+++ b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py
@@ -19,6 +19,7 @@ def main():
     config.max_new_tokens = 20
     config.num_beam_groups = 3
     config.num_beams = 15
+    config.diversity_penalty = 1
     config.num_return_sequences = config.num_beams
 
     beams = pipe.generate(args.prompts, config)
diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index 4ea75e94c5..164ff29131 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -93,15 +93,22 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     bool echo = false;
     size_t logprobs = 0;
 
+    // EOS special token
+    int64_t eos_token_id = -1;
     std::set<std::string> stop_strings;
     // Default setting in vLLM (and OpenAI API) is not to include stop string in the output
     bool include_stop_str_in_output = false;
     std::set<int64_t> stop_token_ids;
 
+    // penalties (not used in beam search)
+    float repetition_penalty = 1.0f;
+    float presence_penalty = 0.0;
+    float frequency_penalty = 0.0f;
+
     // Beam search specific
     size_t num_beam_groups = 1;
     size_t num_beams = 1;
-    float diversity_penalty = 1.0f;
+    float diversity_penalty = 0.0f;
     float length_penalty = 1.0f;
     size_t num_return_sequences = 1;
     size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
@@ -112,9 +119,6 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     float top_p = 1.0f;
     size_t top_k = std::numeric_limits<size_t>::max();
     bool do_sample = false;
-    float repetition_penalty = 1.0f;
-    float presence_penalty = 0.0;
-    float frequency_penalty = 0.0f;
     size_t rng_seed = 0;
 
     // Assisting generation parameters
@@ -122,9 +126,6 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     size_t num_assistant_tokens = 0;
     size_t max_ngram_size = 0;
 
-    // EOS special token
-    int64_t eos_token_id = -1;
-
     std::optional<AdapterConfig> adapters;
 
     /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0.
@@ -136,11 +137,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     bool is_greedy_decoding() const;
     bool is_beam_search() const;
     bool is_multinomial() const;
-    OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2025.0.0 release")
-    bool is_speculative_decoding() const;
     bool is_assisting_generation() const;
     bool is_prompt_lookup() const;
-    void update_generation_config(const ov::AnyMap& config_map);
+
+    OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2026.0.0 release")
+    bool is_speculative_decoding() const;
+
+    void update_generation_config(const ov::AnyMap& properties);
 
     template <typename... Properties>
     util::EnableIfAllStringAny<void, Properties...> update_generation_config(Properties&&... properties) {
@@ -187,8 +190,13 @@ static constexpr ov::Property<float> assistant_confidence_threshold{"assistant_c
 static constexpr ov::Property<size_t> num_assistant_tokens{"num_assistant_tokens"};
 
 // Predefined Configs
+
+OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
 OPENVINO_GENAI_EXPORTS GenerationConfig beam_search();
+OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
 OPENVINO_GENAI_EXPORTS GenerationConfig greedy();
+OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release")
 OPENVINO_GENAI_EXPORTS GenerationConfig multinomial();
+
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 4ff184547e..59be603fd9 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -24,6 +24,7 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) {
 
     nlohmann::json data = nlohmann::json::parse(f);
 
+    read_json_param(data, "eos_token_id", eos_token_id);
     read_json_param(data, "max_new_tokens", max_new_tokens);
     read_json_param(data, "max_length", max_length);
     // note that ignore_eos is not present in HF GenerationConfig
@@ -32,28 +33,40 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) {
     read_json_param(data, "stop_strings", stop_strings);
     // note that include_stop_str_in_output is not present in HF GenerationConfig
     read_json_param(data, "include_stop_str_in_output", include_stop_str_in_output);
-    // note that stop_token_ids is not present in HF GenerationConfig
-    read_json_param(data, "stop_token_ids", stop_token_ids);
+    // note that stop_token_ids is not present in HF GenerationConfig, but some generation_config.json define
+    // multiple eos_token_id (e.g. https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/generation_config.json)
+    // so, we need to read them as 'stop_token_ids'
+    std::vector<int64_t> ordered_stop_token_ids;
+    read_json_param(data, "eos_token_id", ordered_stop_token_ids);
+
+    if (!ordered_stop_token_ids.empty()) {
+        for (int64_t stop_token_id : ordered_stop_token_ids)
+            stop_token_ids.insert(stop_token_id);
+
+        if (eos_token_id == -1) {
+            eos_token_id = ordered_stop_token_ids[0];
+        }
+    }
+
+    // note that echo is not present in HF GenerationConfig
+    read_json_param(data, "echo", echo);
+    // note that logprobs is not present in HF GenerationConfig
+    read_json_param(data, "logprobs", logprobs);
+
+    // penalties
+    read_json_param(data, "repetition_penalty", repetition_penalty);
+    // note that frequency_penalty is not present in HF GenerationConfig
+    read_json_param(data, "frequency_penalty", frequency_penalty);
+    // note that presence_penalty is not present in HF GenerationConfig
+    read_json_param(data, "presence_penalty", presence_penalty);
+
+    // beam search
     read_json_param(data, "num_beam_groups", num_beam_groups);
     read_json_param(data, "num_beams", num_beams);
     read_json_param(data, "diversity_penalty", diversity_penalty);
     read_json_param(data, "length_penalty", length_penalty);
     read_json_param(data, "num_return_sequences", num_return_sequences);
     read_json_param(data, "no_repeat_ngram_size", no_repeat_ngram_size);
-    read_json_param(data, "temperature", temperature);
-    read_json_param(data, "top_p", top_p);
-    read_json_param(data, "top_k", top_k);
-    read_json_param(data, "do_sample", do_sample);
-    read_json_param(data, "repetition_penalty", repetition_penalty);
-    read_json_param(data, "eos_token_id", eos_token_id);
-    // note that echo is not present in HF GenerationConfig
-    read_json_param(data, "echo", echo);
-    // note that logprobs is not present in HF GenerationConfig
-    read_json_param(data, "logprobs", logprobs);
-
-    // append EOS to stop_token_ids
-    if (eos_token_id != -1)
-        set_eos_token_id(eos_token_id);
 
     if (data.contains("early_stopping")) {
         auto field_type = data["early_stopping"].type();
@@ -65,6 +78,21 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) {
             stop_criteria = StopCriteria::HEURISTIC;
         }
     }
+
+    // multinomial
+    read_json_param(data, "do_sample", do_sample);
+    read_json_param(data, "temperature", temperature);
+    read_json_param(data, "top_p", top_p);
+    read_json_param(data, "top_k", top_k);
+
+    // assistant generation
+    read_json_param(data, "assistant_confidence_threshold", assistant_confidence_threshold);
+    read_json_param(data, "num_assistant_tokens", num_assistant_tokens);
+    read_json_param(data, "max_ngram_size", max_ngram_size);
+
+    // append EOS to stop_token_ids
+    if (eos_token_id != -1)
+        set_eos_token_id(eos_token_id);
 }
 
 void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) {
@@ -79,35 +107,50 @@ void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) {
     stop_token_ids.insert(eos_token_id);
 }
 
-void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) {
+void GenerationConfig::update_generation_config(const ov::AnyMap& properties) {
     using utils::read_anymap_param;
 
-    read_anymap_param(config_map, "max_new_tokens", max_new_tokens);
-    read_anymap_param(config_map, "max_length", max_length);
-    read_anymap_param(config_map, "ignore_eos", ignore_eos);
-    read_anymap_param(config_map, "min_new_tokens", min_new_tokens);
-    read_anymap_param(config_map, "stop_strings", stop_strings);
-    read_anymap_param(config_map, "include_stop_str_in_output", include_stop_str_in_output);
-    read_anymap_param(config_map, "stop_token_ids", stop_token_ids);
-    read_anymap_param(config_map, "num_beam_groups", num_beam_groups);
-    read_anymap_param(config_map, "num_beams", num_beams);
-    read_anymap_param(config_map, "diversity_penalty", diversity_penalty);
-    read_anymap_param(config_map, "length_penalty", length_penalty);
-    read_anymap_param(config_map, "num_return_sequences", num_return_sequences);
-    read_anymap_param(config_map, "no_repeat_ngram_size", no_repeat_ngram_size);
-    read_anymap_param(config_map, "stop_criteria", stop_criteria);
-    read_anymap_param(config_map, "temperature", temperature);
-    read_anymap_param(config_map, "top_p", top_p);
-    read_anymap_param(config_map, "top_k", top_k);
-    read_anymap_param(config_map, "do_sample", do_sample);
-    read_anymap_param(config_map, "repetition_penalty", repetition_penalty);
-    read_anymap_param(config_map, "eos_token_id", eos_token_id);
-    read_anymap_param(config_map, "echo", echo);
-    read_anymap_param(config_map, "logprobs", logprobs);
-    read_anymap_param(config_map, "adapters", adapters);
+    // stop conditions
+    read_anymap_param(properties, "eos_token_id", eos_token_id);
+    read_anymap_param(properties, "max_new_tokens", max_new_tokens);
+    read_anymap_param(properties, "max_length", max_length);
+    read_anymap_param(properties, "ignore_eos", ignore_eos);
+    read_anymap_param(properties, "min_new_tokens", min_new_tokens);
+    read_anymap_param(properties, "stop_strings", stop_strings);
+    read_anymap_param(properties, "include_stop_str_in_output", include_stop_str_in_output);
+    read_anymap_param(properties, "stop_token_ids", stop_token_ids);
+
+    // generic
+    read_anymap_param(properties, "echo", echo);
+    read_anymap_param(properties, "logprobs", logprobs);
+    read_anymap_param(properties, "num_return_sequences", num_return_sequences);
+    read_anymap_param(properties, "adapters", adapters);
 
+    // penalties
+    read_anymap_param(properties, "frequency_penalty", frequency_penalty);
+    read_anymap_param(properties, "presence_penalty", presence_penalty);
+    read_anymap_param(properties, "repetition_penalty", repetition_penalty);
+
+    // beam search
+    read_anymap_param(properties, "num_beam_groups", num_beam_groups);
+    read_anymap_param(properties, "num_beams", num_beams);
+    read_anymap_param(properties, "diversity_penalty", diversity_penalty);
+    read_anymap_param(properties, "length_penalty", length_penalty);
+    read_anymap_param(properties, "stop_criteria", stop_criteria);
+    read_anymap_param(properties, "no_repeat_ngram_size", no_repeat_ngram_size);
+
+    // multinomial
+    read_anymap_param(properties, "do_sample", do_sample);
+    read_anymap_param(properties, "temperature", temperature);
+    read_anymap_param(properties, "top_p", top_p);
+    read_anymap_param(properties, "top_k", top_k);
     // TODO: add support of 'generator' property similar to Image generation
-    read_anymap_param(config_map, "rng_seed", rng_seed);
+    read_anymap_param(properties, "rng_seed", rng_seed);
+
+    // assistant generation
+    read_anymap_param(properties, "assistant_confidence_threshold", assistant_confidence_threshold);
+    read_anymap_param(properties, "num_assistant_tokens", num_assistant_tokens);
+    read_anymap_param(properties, "max_ngram_size", max_ngram_size);
 }
 
 size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const {
@@ -136,69 +179,94 @@ bool GenerationConfig::is_speculative_decoding() const {
 }
 
 bool GenerationConfig::is_assisting_generation() const {
-    return (assistant_confidence_threshold > 0 || num_assistant_tokens > 0);
+    return assistant_confidence_threshold > 0 || num_assistant_tokens > 0;
 }
 
 bool GenerationConfig::is_prompt_lookup() const {
-    return (max_ngram_size > 0 && num_assistant_tokens > 0);
+    return max_ngram_size > 0 && num_assistant_tokens > 0;
 }
 
 void GenerationConfig::validate() const {
+    OPENVINO_ASSERT(num_return_sequences > 0, "num_return_sequences must be greater than 0");
+
+    // Stop conditions
+
     OPENVINO_ASSERT(eos_token_id == -1 || stop_token_ids.find(eos_token_id) != stop_token_ids.end(),
         "'stop_token_ids' must contain 'eos_token_id'. Please, call 'set_eos_token_id' with 'eos_token_id' value");
 
-    OPENVINO_ASSERT(!do_sample || num_beams == 1, 
-                    "Beam search with sampling is not supported yet. "
-                    "Please either set do_sample=false to use beam search "
-                    "or set num_beams=1 if you with to use multinomial sampling.");
-    OPENVINO_ASSERT(num_return_sequences > 0, "num_return_sequences must be greater than 0");
+    auto stop_token_ids_it = std::find_if(stop_token_ids.begin(), stop_token_ids.end(), [] (int64_t stop_token_id) -> bool {
+        return stop_token_id < 0;
+    });
+    OPENVINO_ASSERT(stop_token_ids_it == stop_token_ids.end(), "'stop_token_ids' must be non-negative, but it contains a value ", *stop_token_ids_it);
+
+    OPENVINO_ASSERT(!ignore_eos || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX,
+                    "ignore_eos is true, in this case either 'max_new_tokens', or 'max_length' should be defined.");
+
+    OPENVINO_ASSERT(eos_token_id != -1 || !stop_token_ids.empty() || !stop_strings.empty() || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX,
+                    "Either 'eos_token_id', or 'stop_token_ids', or 'stop_strings', or 'max_new_tokens', or 'max_length' should be defined.");
+
     OPENVINO_ASSERT(max_new_tokens > 0 || (max_new_tokens == 0 && echo), "'max_new_tokens' must be greater than 0, if `echo` is set, 0 is also accepted");
     OPENVINO_ASSERT(min_new_tokens <= max_new_tokens, "min_new_tokens must be less or equal max_new_tokens");
-    OPENVINO_ASSERT(
-        num_beams % num_beam_groups == 0,
-        "number of beams should be divisible by number of groups"
-    );
-    
-    // max_new_tokens has priority over max_length
-    // if max_new_tokens is defined no need to check max_length
-    OPENVINO_ASSERT(max_new_tokens != SIZE_MAX ||  max_length > 0, 
-                    "'max_length' must be greater than 0 or 'max_new_tokens' should be defined");
-
-    OPENVINO_ASSERT(!do_sample || top_k > 0,
-                    "top_k must be a strictly positive, but got ",
-                    top_k);
-    OPENVINO_ASSERT(!do_sample || (top_p > 0 && top_p <= 1.0f),
-                    "top_p must be a positive float > 0 and < 1, but got ",
-                    top_p);
-    OPENVINO_ASSERT(!do_sample || temperature > 0,
-                    "Temperature must be a strictly positive float, but got ",
-                    temperature);
-
-    OPENVINO_ASSERT(repetition_penalty > 0,
-                    "Repetition penalty must be a strictly positive float, but got ",
-                    repetition_penalty);
-    
-    OPENVINO_ASSERT(!ignore_eos || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX,
-                    "ignore_eos == true, in this case either 'max_new_tokens', or 'max_length' should be defined.");
 
-    OPENVINO_ASSERT(eos_token_id != -1 || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX,
-                    "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined.");
+    // Sampling strategies
+
+    OPENVINO_ASSERT(num_return_sequences == 1 || (is_multinomial() || is_beam_search()), 
+        "'num_return_sequences' can be more than 1 only in case of beam search or multinomial sampling, but got ", num_return_sequences);
+
+    // generic penalties, but not supported by beam search currently
+    if (!is_beam_search()) {
+        OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "'frequence_penalty' penalty must be within [-2.0; 2.0], but got ", frequency_penalty);
+        OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "'presence_penalty' penalty must be within [-2.0; 2.0], but got ", presence_penalty);
+        OPENVINO_ASSERT(repetition_penalty > 0.0f, "'repetition_penalty' must be a strictly positive float, but got ", repetition_penalty);
+    } else {
+        OPENVINO_ASSERT(frequency_penalty == 0.0f, "'frequency_penalty' is not currently supported by beam search and should be 0.0f, but got ", frequency_penalty);
+        OPENVINO_ASSERT(presence_penalty == 0.0f, "'presence_penalty' is not currently supported by beam search and should be 0.0f, but got ", presence_penalty);
+        OPENVINO_ASSERT(repetition_penalty == 1.0f, "'repetition_penalty' is not currently supported by beam search and should be 1.0f, but got ", repetition_penalty);
+    }
+
+    if (is_multinomial()) {
+        OPENVINO_ASSERT(top_k >= 0, "When 'do_sample' is true, top_k must be a non-negative, but got ", top_k);
+        OPENVINO_ASSERT(top_p > 0 && top_p <= 1.0f, "When 'do_sample' is true, top_p must be a positive float > 0.0 and <= 1.0, but got ", top_p);
+        OPENVINO_ASSERT(temperature > 0, "When 'do_sample' is true, temperature must be a strictly positive float, but got ", temperature);
+    } else {
+        // parameters requiring multinomial
+        OPENVINO_ASSERT(top_k == std::numeric_limits<size_t>::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k);
+        OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p);
+        OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature);
+    }
+
     if (is_beam_search()) {
-        OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive");
+        OPENVINO_ASSERT(num_beams % num_beam_groups == 0, "'num_beams' (", num_beams, ") should be divisible by 'num_beam_groups' (", num_beam_groups, ")");
+        OPENVINO_ASSERT(num_beams >= num_return_sequences, "'num_beams' (", num_beams, ") must be greater equal than 'num_return_sequences' (", num_return_sequences, ")");
+
+        OPENVINO_ASSERT(!do_sample,
+                        "Beam search with sampling is not supported yet. "
+                        "Please either set do_sample=false to use beam search "
+                        "or set num_beams=1 if you with to use multinomial sampling.");
+
+        OPENVINO_ASSERT(no_repeat_ngram_size > 0, "'no_repeat_ngram_size' must be positive");
         if (num_beam_groups > 1) {
-            OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, it it fallbacks to non-grouped beam search");
+            OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, otherwise it fallbacks to non-grouped beam search");
+        } else {
+            OPENVINO_ASSERT(diversity_penalty == 0.0f, "For beam search 'diversity_penalty' is applicable only when grouped beam search is used, but got 'num_beam_groups' == 1");
         }
     } else {
-        OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]");
-        OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]");
+        // parameters requiring beam search
+        OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups);
+        OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits<size_t>::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size);
+        OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling");
+        OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling");
     }
+
+    // assistant generation
+
     if (is_assisting_generation()) {
-        if (assistant_confidence_threshold != 0.f) {
-            OPENVINO_ASSERT(num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`");
-            OPENVINO_ASSERT(!is_prompt_lookup(), "Parameters `assistant_confidence_threshold` cannot be used while Prompt Lookup decoding");
-        } else {
-            OPENVINO_ASSERT(num_assistant_tokens > 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`");
-        };
+        OPENVINO_ASSERT(!is_beam_search() && num_return_sequences == 1, "Beam search and parallel sampling are not compatible with assistant generation");
+        OPENVINO_ASSERT(assistant_confidence_threshold == 0.0f || num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`");
+    }
+
+    if (num_assistant_tokens == 0) {
+        OPENVINO_ASSERT(max_ngram_size == 0, "'max_ngram_size' should be set to default value 0 when prompt lookup is disabled");
     }
 }
 
diff --git a/src/cpp/src/json_utils.hpp b/src/cpp/src/json_utils.hpp
index 13d792e9db..4a4bb001df 100644
--- a/src/cpp/src/json_utils.hpp
+++ b/src/cpp/src/json_utils.hpp
@@ -4,6 +4,9 @@
 
 #pragma once
 
+#include <vector>
+#include <set>
+
 #include <nlohmann/json.hpp>
 
 namespace ov {
@@ -40,6 +43,15 @@ void read_json_param(const nlohmann::json& data, const std::string& name, std::v
     }
 }
 
+template <typename V>
+void read_json_param(const nlohmann::json& data, const std::string& name, std::set<V>& param) {
+    if (data.contains(name) && data[name].is_array()) {
+        for (const auto elem : data[name]) {
+            param.insert(elem.get<V>());
+        }
+    }
+}
+
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 81f411020e..3e378e78cf 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -72,7 +72,6 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         const ov::AnyMap& config,
         const ov::genai::GenerationConfig& generation_config
     ) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
-        ov::Core core = utils::singleton_core();
         ov::CompiledModel compiled_model;
         auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
         utils::slice_matmul_stateful_model(model);
@@ -81,10 +80,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
             m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
             m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device);   // TODO: Make the prefix name configurable
-            compiled_model = core.compile_model(model, device, *filtered_plugin_config);
+            compiled_model = utils::singleton_core().compile_model(model, device, *filtered_plugin_config);
             m_model_runner = compiled_model.create_infer_request();
         } else {
-            compiled_model = core.compile_model(model, device, plugin_config);
+            compiled_model = utils::singleton_core().compile_model(model, device, plugin_config);
             m_model_runner = compiled_model.create_infer_request();
         }
         ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 8510a8389f..5d82fa89a3 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -367,16 +367,16 @@ class ContinuousBatchingPipeline:
     def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
         ...
     @typing.overload
-    def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, sampling_params: GenerationConfig) -> GenerationHandle:
+    def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, generation_config: GenerationConfig) -> GenerationHandle:
         ...
     @typing.overload
-    def add_request(self, request_id: int, prompt: str, sampling_params: GenerationConfig) -> GenerationHandle:
+    def add_request(self, request_id: int, prompt: str, generation_config: GenerationConfig) -> GenerationHandle:
         ...
     @typing.overload
-    def generate(self, input_ids: list[openvino._pyopenvino.Tensor], sampling_params: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[EncodedGenerationResult]:
+    def generate(self, input_ids: list[openvino._pyopenvino.Tensor], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[EncodedGenerationResult]:
         ...
     @typing.overload
-    def generate(self, prompts: list[str], sampling_params: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[GenerationResult]:
+    def generate(self, prompts: list[str], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[GenerationResult]:
         ...
     def get_config(self) -> GenerationConfig:
         ...
@@ -609,11 +609,15 @@ class GenerationConfig:
         ...
     def is_greedy_decoding(self) -> bool:
         ...
+    def is_multinomial(self) -> bool:
+        ...
     def is_prompt_lookup(self) -> bool:
         ...
     def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None:
         ...
-    def update_generation_config(self, config_map: dict[str, openvino._pyopenvino.OVAny]) -> None:
+    def update_generation_config(self, **kwargs) -> None:
+        ...
+    def validate(self) -> None:
         ...
 class GenerationFinishReason:
     """
@@ -826,7 +830,7 @@ class Image2ImagePipeline:
         ...
     def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None:
         ...
-    def set_generation_config(self, generation_config: ImageGenerationConfig) -> None:
+    def set_generation_config(self, config: ImageGenerationConfig) -> None:
         ...
     def set_scheduler(self, scheduler: Scheduler) -> None:
         ...
@@ -927,7 +931,7 @@ class InpaintingPipeline:
         ...
     def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None:
         ...
-    def set_generation_config(self, generation_config: ImageGenerationConfig) -> None:
+    def set_generation_config(self, config: ImageGenerationConfig) -> None:
         ...
     def set_scheduler(self, scheduler: Scheduler) -> None:
         ...
@@ -1615,7 +1619,7 @@ class Text2ImagePipeline:
         ...
     def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None:
         ...
-    def set_generation_config(self, generation_config: ImageGenerationConfig) -> None:
+    def set_generation_config(self, config: ImageGenerationConfig) -> None:
         ...
     def set_scheduler(self, scheduler: Scheduler) -> None:
         ...
@@ -1865,9 +1869,9 @@ class VLMPipeline:
         ...
     def get_tokenizer(self) -> Tokenizer:
         ...
-    def set_chat_template(self, new_template: str) -> None:
+    def set_chat_template(self, chat_template: str) -> None:
         ...
-    def set_generation_config(self, new_config: GenerationConfig) -> None:
+    def set_generation_config(self, config: GenerationConfig) -> None:
         ...
     def start_chat(self, system_message: str = '') -> None:
         ...
@@ -2043,6 +2047,8 @@ class WhisperGenerationConfig:
         ...
     def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None:
         ...
+    def update_generation_config(self, **kwargs) -> None:
+        ...
 class WhisperPerfMetrics(PerfMetrics):
     """
     
diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp
index be7a72481f..2b48e4d44d 100644
--- a/src/python/py_continuous_batching_pipeline.cpp
+++ b/src/python/py_continuous_batching_pipeline.cpp
@@ -235,22 +235,22 @@ void init_continuous_batching_pipeline(py::module_& m) {
         .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer)
         .def("get_config", &ContinuousBatchingPipeline::get_config)
         .def("get_metrics", &ContinuousBatchingPipeline::get_metrics)
-        .def("add_request", py::overload_cast<uint64_t, const ov::Tensor&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("sampling_params"))
-        .def("add_request", py::overload_cast<uint64_t, const std::string&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("sampling_params"))
+        .def("add_request", py::overload_cast<uint64_t, const ov::Tensor&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("generation_config"))
+        .def("add_request", py::overload_cast<uint64_t, const std::string&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("generation_config"))
         .def("step", &ContinuousBatchingPipeline::step)
         .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests)
         .def(
             "generate",
             py::overload_cast<const std::vector<ov::Tensor>&, const std::vector<ov::genai::GenerationConfig>&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate),
             py::arg("input_ids"),
-            py::arg("sampling_params"),
+            py::arg("generation_config"),
             py::arg("streamer") = std::monostate{}
         )
         .def(
             "generate",
             py::overload_cast<const std::vector<std::string>&, const std::vector<ov::genai::GenerationConfig>&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate),
             py::arg("prompts"),
-            py::arg("sampling_params"),
+            py::arg("generation_config"),
             py::arg("streamer") = std::monostate{}
         );
 }
diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp
index f49bcf29bd..a97a43fc5c 100644
--- a/src/python/py_generation_config.cpp
+++ b/src/python/py_generation_config.cpp
@@ -118,7 +118,13 @@ void init_generation_config(py::module_& m) {
         .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id"))
         .def("is_beam_search", &GenerationConfig::is_beam_search)
         .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding)
+        .def("is_multinomial", &GenerationConfig::is_multinomial)
         .def("is_assisting_generation", &GenerationConfig::is_assisting_generation)
         .def("is_prompt_lookup", &GenerationConfig::is_prompt_lookup)
-        .def("update_generation_config", static_cast<void (GenerationConfig::*)(const ov::AnyMap&)>(&ov::genai::GenerationConfig::update_generation_config), py::arg("config_map"));
+        .def("validate", &GenerationConfig::validate)
+        .def("update_generation_config", [](
+            ov::genai::GenerationConfig& config,
+            const py::kwargs& kwargs) {
+            config.update_generation_config(pyutils::kwargs_to_any_map(kwargs));
+        });
    }
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index 311f3f3760..c246557a97 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -224,7 +224,7 @@ void init_image_generation_pipelines(py::module_& m) {
         .def_readwrite("max_sequence_length", &ov::genai::ImageGenerationConfig::max_sequence_length)
         .def("validate", &ov::genai::ImageGenerationConfig::validate)
         .def("update_generation_config", [](
-            ov::genai::ImageGenerationConfig config,
+            ov::genai::ImageGenerationConfig& config,
             const py::kwargs& kwargs) {
             config.update_generation_config(pyutils::kwargs_to_any_map(kwargs));
         });
@@ -255,8 +255,8 @@ void init_image_generation_pipelines(py::module_& m) {
             device (str): Device to run the model on (e.g., CPU, GPU).
             kwargs: Text2ImagePipeline properties
         )")
-        .def("get_generation_config", &ov::genai::Text2ImagePipeline::get_generation_config)
-        .def("set_generation_config", &ov::genai::Text2ImagePipeline::set_generation_config, py::arg("generation_config"))
+        .def("get_generation_config", &ov::genai::Text2ImagePipeline::get_generation_config, py::return_value_policy::copy)
+        .def("set_generation_config", &ov::genai::Text2ImagePipeline::set_generation_config, py::arg("config"))
         .def("set_scheduler", &ov::genai::Text2ImagePipeline::set_scheduler, py::arg("scheduler"))
         .def("reshape", &ov::genai::Text2ImagePipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale"))
         .def_static("stable_diffusion", &ov::genai::Text2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
@@ -323,8 +323,8 @@ void init_image_generation_pipelines(py::module_& m) {
             device (str): Device to run the model on (e.g., CPU, GPU).
             kwargs: Image2ImagePipeline properties
         )")
-        .def("get_generation_config", &ov::genai::Image2ImagePipeline::get_generation_config)
-        .def("set_generation_config", &ov::genai::Image2ImagePipeline::set_generation_config, py::arg("generation_config"))
+        .def("get_generation_config", &ov::genai::Image2ImagePipeline::get_generation_config, py::return_value_policy::copy)
+        .def("set_generation_config", &ov::genai::Image2ImagePipeline::set_generation_config, py::arg("config"))
         .def("set_scheduler", &ov::genai::Image2ImagePipeline::set_scheduler, py::arg("scheduler"))
         .def("reshape", &ov::genai::Image2ImagePipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale"))
         .def_static("stable_diffusion", &ov::genai::Image2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
@@ -386,8 +386,8 @@ void init_image_generation_pipelines(py::module_& m) {
             device (str): Device to run the model on (e.g., CPU, GPU).
             kwargs: InpaintingPipeline properties
         )")
-        .def("get_generation_config", &ov::genai::InpaintingPipeline::get_generation_config)
-        .def("set_generation_config", &ov::genai::InpaintingPipeline::set_generation_config, py::arg("generation_config"))
+        .def("get_generation_config", &ov::genai::InpaintingPipeline::get_generation_config, py::return_value_policy::copy)
+        .def("set_generation_config", &ov::genai::InpaintingPipeline::set_generation_config, py::arg("config"))
         .def("set_scheduler", &ov::genai::InpaintingPipeline::set_scheduler, py::arg("scheduler"))
         .def("reshape", &ov::genai::InpaintingPipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale"))
         .def_static("stable_diffusion", &ov::genai::InpaintingPipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp
index b1d5136253..7360975a0b 100644
--- a/src/python/py_llm_pipeline.cpp
+++ b/src/python/py_llm_pipeline.cpp
@@ -53,15 +53,10 @@ py::object call_common_generate(
     const pyutils::PyBindStreamerVariant& py_streamer,
     const py::kwargs& kwargs
 ) {
-    ov::genai::GenerationConfig default_config;
-    if (config.has_value()) {
-        default_config = *config;
-    } else {
-        default_config = pipe.get_generation_config();
-    }
+    ov::genai::GenerationConfig default_config = config.has_value() ? *config : pipe.get_generation_config();
     auto updated_config = pyutils::update_config_from_kwargs(default_config, kwargs);
+
     py::object results;
-    EncodedInputs tensor_data;
     StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer);
 
     // Call suitable generate overload for each type of input.
diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp
index 45a0c46174..34522409ea 100644
--- a/src/python/py_utils.cpp
+++ b/src/python/py_utils.cpp
@@ -358,7 +358,10 @@ ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::O
     ov::genai::GenerationConfig res_config;
     if(config.has_value())
         res_config = *config;
-    res_config.update_generation_config(kwargs_to_any_map(kwargs));
+
+    if (!kwargs.empty())
+        res_config.update_generation_config(kwargs_to_any_map(kwargs));
+
     return res_config;
 }
 
diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp
index 340cb3da62..b0cfa0a42a 100644
--- a/src/python/py_vlm_pipeline.cpp
+++ b/src/python/py_vlm_pipeline.cpp
@@ -150,10 +150,10 @@ void init_vlm_pipeline(py::module_& m) {
 
         .def("start_chat", &ov::genai::VLMPipeline::start_chat, py::arg("system_message") = "")
         .def("finish_chat", &ov::genai::VLMPipeline::finish_chat)
-        .def("set_chat_template", &ov::genai::VLMPipeline::set_chat_template, py::arg("new_template"))
+        .def("set_chat_template", &ov::genai::VLMPipeline::set_chat_template, py::arg("chat_template"))
         .def("get_tokenizer", &ov::genai::VLMPipeline::get_tokenizer)
-        .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config)
-        .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config, py::arg("new_config"))
+        .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config, py::return_value_policy::copy)
+        .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config, py::arg("config"))
         .def(
             "generate",
             [](ov::genai::VLMPipeline& pipe,
diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp
index cd42dcf58d..d290612ed6 100644
--- a/src/python/py_whisper_pipeline.cpp
+++ b/src/python/py_whisper_pipeline.cpp
@@ -187,7 +187,10 @@ OptionalWhisperGenerationConfig update_whisper_config_from_kwargs(const Optional
     WhisperGenerationConfig res_config;
     if (config.has_value())
         res_config = *config;
-    res_config.update_generation_config(pyutils::kwargs_to_any_map(kwargs));
+
+    if (!kwargs.empty())
+        res_config.update_generation_config(pyutils::kwargs_to_any_map(kwargs));
+
     return res_config;
 }
 
@@ -295,7 +298,12 @@ void init_whisper_pipeline(py::module_& m) {
         .def_readwrite("return_timestamps", &WhisperGenerationConfig::return_timestamps)
         .def_readwrite("initial_prompt", &WhisperGenerationConfig::initial_prompt)
         .def_readwrite("hotwords", &WhisperGenerationConfig::hotwords)
-        .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id"));
+        .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id"))
+        .def("update_generation_config", [](
+            ov::genai::WhisperGenerationConfig& config,
+            const py::kwargs& kwargs) {
+            config.update_generation_config(pyutils::kwargs_to_any_map(kwargs));
+        });;
 
     py::class_<WhisperRawPerfMetrics>(m, "WhisperRawPerfMetrics", raw_perf_metrics_docstring)
         .def(py::init<>())
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index 093cd993de..b8c2e625c5 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -25,8 +25,8 @@ file(GLOB src_files "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sequence_group.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/continuous_batching*.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/text_callback_streamer.cpp")
 
-add_executable(${TEST_TARGET_NAME} ${tests_src}
-        block_allocator.cpp)
+add_executable(${TEST_TARGET_NAME} ${tests_src})
+
 target_link_libraries(${TEST_TARGET_NAME} PRIVATE openvino::genai gtest_main)
 target_include_directories(${TEST_TARGET_NAME} PRIVATE "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src")
 target_sources(${TEST_TARGET_NAME} PRIVATE ${src_files})
diff --git a/tests/cpp/generate_config.cpp b/tests/cpp/generate_config.cpp
deleted file mode 100644
index 974fd499f8..0000000000
--- a/tests/cpp/generate_config.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-// Copyright (C) 2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include <gtest/gtest.h>
-#include <openvino/core/except.hpp>
-#include "openvino/genai/generation_config.hpp"
-
-
-using namespace ov::genai;
-
-TEST(GenerationConfigTest, invalid_temperature) {
-    GenerationConfig config;
-    config.max_new_tokens = 20;
-    config.temperature = -0.1;
-    config.do_sample = true;
-    EXPECT_THROW(config.validate(), ov::Exception);
-}
-
-TEST(GenerationConfigTest, valid_temperature) {
-    GenerationConfig config;
-    config.max_new_tokens = 20;
-    config.do_sample = true;
-    config.temperature = 0.1;
-    EXPECT_NO_THROW(config.validate());
-}
-
-TEST(GenerationConfigTest, invalid_top_p) {
-    GenerationConfig config;
-    config.max_new_tokens = 20;
-    config.do_sample = true;
-    config.top_p = -0.5;
-    EXPECT_THROW(config.validate(), ov::Exception);
-    config.top_p = 1.1;
-    EXPECT_THROW(config.validate(), ov::Exception);
-}
-
-TEST(GenerationConfigTest, valid_top_p) {
-    GenerationConfig config;
-    config.max_new_tokens = 20;
-    config.do_sample = true;
-    config.top_p = 0.1;
-    EXPECT_NO_THROW(config.validate());
-}
-
-TEST(GenerationConfigTest, invalid_repeatition_penalty) {
-    GenerationConfig config;
-    config.max_new_tokens = 20;
-    config.do_sample = true;
-    config.repetition_penalty = -3.0;
-    EXPECT_THROW(config.validate(), ov::Exception);
-    config.repetition_penalty = -0.1;
-    EXPECT_THROW(config.validate(), ov::Exception);
-}
-
-TEST(GenerationConfigTest, valid_repeatition_penalty) {
-    GenerationConfig config;
-    config.max_new_tokens = 20;
-    config.do_sample = true;
-    config.repetition_penalty = 1.8;
-    EXPECT_NO_THROW(config.validate());
-    config.repetition_penalty = 0.1;
-    EXPECT_NO_THROW(config.validate());
-}
-
-TEST(GenerationConfigTest, invalid_presence_penalty) {
-    GenerationConfig config;
-    config.max_new_tokens = 20;
-    config.do_sample = true;
-    config.presence_penalty = 3.0;
-    EXPECT_THROW(config.validate(), ov::Exception);
-    config.presence_penalty = -3.1;
-    EXPECT_THROW(config.validate(), ov::Exception);
-}
-
-TEST(GenerationConfigTest, valid_presence_penalty) {
-    GenerationConfig config;
-    config.max_new_tokens = 20;
-    config.do_sample = true;
-    config.presence_penalty = 1.8;
-    EXPECT_NO_THROW(config.validate());
-    config.presence_penalty = -2.0;
-    EXPECT_NO_THROW(config.validate());
-}
-
-TEST(GenerationConfigTest, invalid_frequency_penalty) {
-    GenerationConfig config;
-    config.max_new_tokens = 20;
-    config.do_sample = true;
-    config.frequency_penalty = 3.0;
-    EXPECT_THROW(config.validate(), ov::Exception);
-    config.frequency_penalty = -3.1;
-    EXPECT_THROW(config.validate(), ov::Exception);
-}
-
-TEST(GenerationConfigTest, valid_frequency_penalty) {
-    GenerationConfig config;
-    config.max_new_tokens = 20;
-    config.do_sample = true;
-    config.frequency_penalty = 1.8;
-    EXPECT_NO_THROW(config.validate());
-    config.frequency_penalty = -2.0;
-    EXPECT_NO_THROW(config.validate());
-}
-
-ov::genai::GenerationConfig speculative_decoding_multinomial() {
-    auto speculative_decoding_multinomial_config = ov::genai::multinomial();
-    speculative_decoding_multinomial_config.num_assistant_tokens = 5;
-    return speculative_decoding_multinomial_config;
-}
-
-ov::genai::GenerationConfig speculative_decoding_greedy() {
-    auto speculative_decoding_greedy_config = ov::genai::greedy();
-    speculative_decoding_greedy_config.assistant_confidence_threshold = 0.4f;
-    return speculative_decoding_greedy_config;
-}
-
-TEST(GenerationConfigTest, invalid_static_spec_decoding) {
-    GenerationConfig config = speculative_decoding_greedy();
-    config.num_assistant_tokens = 5;
-    config.assistant_confidence_threshold = 0.2;
-    EXPECT_THROW(config.validate(), ov::Exception);
-}
-
-TEST(GenerationConfigTest, valid_static_spec_decoding) {
-    GenerationConfig config = speculative_decoding_greedy();
-    config.num_assistant_tokens = 5;
-    config.assistant_confidence_threshold = 0;
-    EXPECT_NO_THROW(config.validate());
-}
-
-TEST(GenerationConfigTest, invalid_dynamic_spec_decoding) {
-    GenerationConfig config = speculative_decoding_greedy();
-    config.num_assistant_tokens = 5;
-    config.assistant_confidence_threshold = 0.5;
-    EXPECT_THROW(config.validate(), ov::Exception);
-}
-
-TEST(GenerationConfigTest, valid_dynamic_spec_decoding) {
-    GenerationConfig config = speculative_decoding_greedy();
-    config.assistant_confidence_threshold = 0.5;
-    config.num_assistant_tokens = 0;
-    EXPECT_NO_THROW(config.validate());
-}
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index f940d272ed..9040fa435f 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -73,6 +73,7 @@ def get_beam_search() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_beam_groups = 3
     generation_config.num_beams = 6
+    generation_config.diversity_penalty = 1
     generation_config.max_new_tokens = 30
     generation_config.num_return_sequences = 3
     generation_config.num_return_sequences = generation_config.num_beams
@@ -82,6 +83,7 @@ def get_beam_search_min_and_max_tokens() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_beam_groups = 3
     generation_config.num_beams = 6
+    generation_config.diversity_penalty = 1
     generation_config.min_new_tokens = 15
     generation_config.max_new_tokens = 30
     generation_config.num_return_sequences = 3
@@ -92,6 +94,7 @@ def get_beam_search_with_single_stop_string() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_beam_groups = 3
     generation_config.num_beams = 6
+    generation_config.diversity_penalty = 1
     generation_config.max_new_tokens = 50
     generation_config.num_return_sequences = generation_config.num_beams
     generation_config.stop_strings = {"open sour"}  # expected match on "open source"
@@ -102,6 +105,7 @@ def get_beam_search_with_multiple_stop_strings() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_beam_groups = 3
     generation_config.num_beams = 6
+    generation_config.diversity_penalty = 1
     generation_config.max_new_tokens = 50
     generation_config.num_return_sequences = generation_config.num_beams
     generation_config.stop_strings = {".", "software", "Intel"}
@@ -112,6 +116,7 @@ def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_beam_groups = 3
     generation_config.num_beams = 6
+    generation_config.diversity_penalty = 1
     generation_config.max_new_tokens = 30
     generation_config.num_return_sequences = generation_config.num_beams
     generation_config.stop_strings = {"Einstein", "sunny", "geothermal"}
@@ -299,7 +304,7 @@ def convert_to_hf(
     kwargs['pad_token_id'] = default_generation_config.pad_token_id
     kwargs['repetition_penalty'] = generation_config.repetition_penalty
 
-    if generation_config.num_beams > 1:
+    if generation_config.is_beam_search():
         # beam search case
         kwargs['num_beam_groups'] = generation_config.num_beam_groups
         kwargs['num_beams'] = generation_config.num_beams
@@ -309,7 +314,7 @@ def convert_to_hf(
         kwargs['output_scores'] = True
         if generation_config.num_beam_groups > 1:
             kwargs['diversity_penalty'] = generation_config.diversity_penalty
-    elif generation_config.do_sample:
+    elif generation_config.is_multinomial():
         # mulitinomial
         kwargs['temperature'] = generation_config.temperature
         kwargs['top_k'] = generation_config.top_k
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 3fc89cb8a7..9e8e4681f9 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -111,7 +111,7 @@ def read_model(params, **tokenizer_kwargs):
         path,
         hf_tokenizer,
         opt_model,
-        ov_genai.LLMPipeline(path, 'CPU', **{'ENABLE_MMAP': False}),
+        ov_genai.LLMPipeline(path, 'CPU', ENABLE_MMAP=False),
     )
 
 
@@ -139,7 +139,7 @@ def model_tmp_path(tmpdir_factory):
 
 
 @pytest.fixture(scope="module")
-def model_tokenizers_path_tmp_path(tmpdir_factory):
+def model_tokenizers_tmp_path(tmpdir_factory):
     model_id, path, _, _, _ = read_model(get_models_list()[0])
     temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_'))
 
@@ -180,10 +180,15 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):
     for config_json, config_name in configs:
         with (temp_path / config_name).open('w') as f:
             json.dump(config_json, f)
-    return ov_genai.LLMPipeline(temp_path, 'CPU')
+
+    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU')
+
+    for _, config_name in configs:
+        os.remove(temp_path / config_name)
+
+    return ov_pipe
 
 
 @functools.lru_cache(1)
 def get_continuous_batching(path):
-    scheduler_config = ov_genai.SchedulerConfig()
-    return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU', **{"scheduler_config": scheduler_config})
+    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig())
diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py
index 3a1e9fa092..01762bf9e3 100644
--- a/tests/python_tests/test_continuous_batching.py
+++ b/tests/python_tests/test_continuous_batching.py
@@ -105,7 +105,7 @@ def test_cb_streamer_vs_return_vs_stateful(prompt):
 
 generation_configs = [
     dict(do_sample=False, max_new_tokens=20),
-    dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0)
+    dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0, repetition_penalty=1.0)
 ]
 questions = [
     '1+1=',
@@ -113,19 +113,22 @@ def test_cb_streamer_vs_return_vs_stateful(prompt):
     'Why is the Sun yellow?',
     'What was my first question?'
 ]
-@pytest.mark.parametrize("generation_config", generation_configs[1:])
+@pytest.mark.parametrize("generation_config_kwargs", generation_configs[1:])
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
-def test_chat_scenario_vs_stateful(model_descr, generation_config: Dict):
+def test_chat_scenario_vs_stateful(model_descr, generation_config_kwargs: Dict):
     model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
     cb_pipe = get_continuous_batching(path)
 
     ov_pipe.start_chat()
     cb_pipe.start_chat()
 
+    generation_config = GenerationConfig(**generation_config_kwargs)
+    ov_pipe.set_generation_config(generation_config)
+
     for question in questions:
-        generated = cb_pipe.generate(question, **generation_config)
-        reference = ov_pipe.generate(question, **generation_config)
+        generated = cb_pipe.generate(question, generation_config=generation_config)
+        reference = ov_pipe.generate(question)
         assert generated == reference
 
     # Test that finish_chat() doesn't fail just in case.
@@ -168,9 +171,13 @@ def test_post_oom_health(tmp_path, sampling_config):
 # Pre-emption
 #
 
-def get_greedy_seq_len_300() -> GenerationConfig:
+def get_parallel_sampling_seq_len_300() -> GenerationConfig:
     generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 3
+    # TODO: add generation_config.generator and return parameters below
+    # generation_config.num_return_sequences = 3
+    # generation_config.do_sample = True
+    # generation_config.top_k = 10
+    # generation_config.top_p = 0.5
     generation_config.max_new_tokens = 300
     return generation_config
 
@@ -178,14 +185,15 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_beam_groups = 3
     generation_config.num_beams = 6
+    generation_config.diversity_penalty = 1
     generation_config.max_new_tokens = 300
     generation_config.num_return_sequences = generation_config.num_beams
     return generation_config
 
 scheduler_params_list = [({"num_kv_blocks": 2, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()),
                          ({"num_kv_blocks": 2, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()),
-                         ({"num_kv_blocks": 10, "dynamic_split_fuse": True}, get_greedy_seq_len_300()),
-                         ({"num_kv_blocks": 10, "dynamic_split_fuse": False}, get_greedy_seq_len_300()),
+                         ({"num_kv_blocks": 10, "dynamic_split_fuse": True}, get_parallel_sampling_seq_len_300()),
+                         ({"num_kv_blocks": 10, "dynamic_split_fuse": False}, get_parallel_sampling_seq_len_300()),
                          ({"num_kv_blocks": 34, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()),
                          ({"num_kv_blocks": 34, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()),
                          ({"num_kv_blocks": 100, "dynamic_split_fuse": True}, get_beam_search_seq_len_300()),
diff --git a/tests/python_tests/test_generation_config.py b/tests/python_tests/test_generation_config.py
new file mode 100644
index 0000000000..110caaf0e5
--- /dev/null
+++ b/tests/python_tests/test_generation_config.py
@@ -0,0 +1,142 @@
+# Copyright (C) 2023-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from openvino_genai import GenerationConfig
+from typing import Tuple, List
+import json
+import os
+import pytest
+
+configs = [
+    # stop conditions
+    dict(max_new_tokens=12),
+    dict(max_length=12),
+    dict(stop_token_ids={2}),
+    dict(eos_token_id=1, stop_token_ids={1}),
+    dict(stop_strings={"a", "b"}),
+    dict(ignore_eos=True, max_new_tokens=10),
+    dict(ignore_eos=True, max_length=10),
+    dict(max_new_tokens=0, echo=True),
+    dict(min_new_tokens=1, max_new_tokens=1),
+    # multinomial
+    dict(max_new_tokens=1, do_sample=True, num_return_sequences=2),
+    dict(max_new_tokens=1, do_sample=True, top_k=1),
+    dict(max_new_tokens=1, do_sample=True, top_p=0.5),
+    dict(max_new_tokens=1, do_sample=True, temperature=0.5),
+    # beam search
+    dict(max_new_tokens=1, num_beams=2),
+    dict(max_new_tokens=1, num_beams=2, num_return_sequences=1),
+    dict(max_new_tokens=1, num_beams=2, num_return_sequences=2),
+    dict(max_new_tokens=1, num_beams=4, num_beam_groups=2, diversity_penalty=1.0),
+    dict(max_new_tokens=1, num_beams=4, length_penalty=1.0),
+    dict(max_new_tokens=1, num_beams=4, no_repeat_ngram_size=2),
+    # assistant generation
+    dict(max_new_tokens=1, assistant_confidence_threshold=0.5),
+    dict(max_new_tokens=1, num_assistant_tokens=2),
+    dict(max_new_tokens=1, num_assistant_tokens=2, max_ngram_size=2), # prompt lookup
+]
+@pytest.mark.parametrize("generation_config_kwargs", configs)
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_valid_configs(generation_config_kwargs):
+    config = GenerationConfig(**generation_config_kwargs)
+    config.validate()
+
+    config = GenerationConfig()
+    config.update_generation_config(**generation_config_kwargs)
+    config.validate()
+
+
+invalid_configs = [
+    dict(num_return_sequences=0), # no reason to run with empty output
+    dict(num_return_sequences=2), # beam search or multimonial is required
+    # stop conditions
+    dict(), # no stop conditions at all
+    dict(eos_token_id=1), # 'stop_token_ids' does not contain 'eos_token_id'
+    dict(eos_token_id=1, stop_token_ids={2}), # 'stop_token_ids' is not empty, but does not contain 'eos_token_id'
+    dict(ignore_eos=True),  # no 'max_new_tokens', no 'max_length' with 'ignore_eos'
+    dict(stop_token_ids={-1}), # value in 'stop_token_ids' must be non-negative 
+    dict(max_new_tokens=0), # max new tokens cannot be empty (only when 'echo' is True)
+    dict(max_new_tokens=10, min_new_tokens=20), # 'max_new_tokens' must be >= 'min_new_tokens'
+    # penalties
+    dict(max_new_tokens=1, repetition_penalty=-1.0), # invalid repetition_penalty
+    dict(max_new_tokens=1, presence_penalty=-3.0), # invalid presence_penalty
+    dict(max_new_tokens=1, frequency_penalty=3.0), # invalid frequency_penalty
+    # multinomial sampling
+    dict(max_new_tokens=1, do_sample=True, top_p=1.1), # 'top_p' must be within (0, 1] when 'do_sample' is True
+    dict(max_new_tokens=1, do_sample=True, top_p=0), # 'top_p' must be within (0, 1] when 'do_sample' is True
+    dict(max_new_tokens=1, do_sample=True, temperature=-1.0), # invalid temp
+    # parameters requiring multimonial
+    dict(max_new_tokens=1, top_k=1), # requires do_sample=True
+    dict(max_new_tokens=1, top_p=0.5), # requires do_sample=True
+    dict(max_new_tokens=1, temperature=2.0), # requires do_sample=True
+    # beam search
+    dict(max_new_tokens=1, num_beams=2, num_return_sequences=3), # 'num_beams' must be >= 'num_return_sequences'
+    dict(max_new_tokens=1, num_beams=3, num_beam_groups=2), # 'num_beams' must be divisible by 'num_beam_groups'
+    dict(max_new_tokens=1, num_beams=3, do_sample=True), # 'beam sample is not supported
+    dict(max_new_tokens=1, num_beams=3, no_repeat_ngram_size=0), # invalid 'no_repeat_ngram_size'
+    dict(max_new_tokens=1, num_beams=4, num_beam_groups=2, diversity_penalty=0.0), # 'diversity_penalty' should not be a default value
+    dict(max_new_tokens=1, num_beams=4, diversity_penalty=1.0), # 'diversity_penalty' is used only for grouped beam search
+    dict(max_new_tokens=1, num_beams=2, frequency_penalty=1.0), # 'frequency_penalty' is not supported by beam search
+    dict(max_new_tokens=1, num_beams=2, presence_penalty=1.0), # 'presence_penalty' is not supported by beam search
+    dict(max_new_tokens=1, num_beams=2, repetition_penalty=0.0), # 'repetition_penalty' is not supported by beam search
+    # parameters requiring beam search
+    dict(max_new_tokens=1, num_beam_groups=2), # requiring beam search
+    dict(max_new_tokens=1, no_repeat_ngram_size=2), # requiring beam search
+    dict(max_new_tokens=1, diversity_penalty=1.0), # requiring beam search
+    dict(max_new_tokens=1, length_penalty=2), # requiring beam search
+    # assistant generation
+    dict(max_new_tokens=1, num_assistant_tokens=2, do_sample=True, num_return_sequences=2), # 'num_return_sequences' must be 1, as we cannot use different number of tokens per sequence within a group
+    dict(max_new_tokens=1, assistant_confidence_threshold=1.0, do_sample=True, num_return_sequences=2), # 'num_return_sequences' must be 1, as we cannot use different number of tokens per sequence within a group
+    dict(max_new_tokens=1, num_assistant_tokens=2, num_beams=2), # beam search is not compatible with assistant generation
+    dict(max_new_tokens=1, assistant_confidence_threshold=1.0, num_assistant_tokens=2), # 'assistant_confidence_threshold' and 'num_assistant_tokens' are mutually exclusive
+    dict(max_new_tokens=1, max_ngram_size=1), # 'max_ngram_size' is for prompt lookup, but assistant generation is turned off ('num_assistant_tokens' is 0)
+    # TODO: add tests for invalid properties
+]
+@pytest.mark.parametrize("generation_config_kwargs", invalid_configs)
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_invalid_generation_configs_throws(generation_config_kwargs):
+    config = GenerationConfig(**generation_config_kwargs)
+    with pytest.raises(RuntimeError):
+        config.validate()
+
+    config = GenerationConfig()
+    config.update_generation_config(**generation_config_kwargs)
+    with pytest.raises(RuntimeError):
+        config.validate()
+
+
+def load_genai_generation_config_from_file(configs: List[Tuple], temp_path):
+    for json_file in temp_path.glob("*.json"):
+        json_file.unlink()
+
+    for config_json, config_name in configs:
+        with (temp_path / config_name).open('w') as f:
+            json.dump(config_json, f)
+
+    ov_generation_config = GenerationConfig(temp_path / "generation_config.json")
+
+    for _, config_name in configs:
+        os.remove(temp_path / config_name)
+
+    return ov_generation_config
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_multiple_eos_are_read_as_stop_token_ids(tmp_path):
+    generation_config_json = {
+        "eos_token_id": [
+            2,
+            32000,
+            32007
+        ]
+    }
+    configs = [
+        (generation_config_json, "generation_config.json"),
+    ]
+
+    generation_config = load_genai_generation_config_from_file(configs, tmp_path)
+
+    assert generation_config.eos_token_id == 2
+    assert generation_config.stop_token_ids == { 2, 32000, 32007 }
diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py
index bbd0da6bb2..6228f53dd1 100644
--- a/tests/python_tests/test_kv_cache_eviction.py
+++ b/tests/python_tests/test_kv_cache_eviction.py
@@ -147,7 +147,6 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
 
 def get_greedy_seq_len_300() -> GenerationConfig:
     generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 3
     generation_config.max_new_tokens = 300
     return generation_config
 
@@ -155,6 +154,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_beam_groups = 3
     generation_config.num_beams = 6
+    generation_config.diversity_penalty = 1
     generation_config.max_new_tokens = 300
     generation_config.num_return_sequences = generation_config.num_beams
     return generation_config
diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py
index 9f00996a58..6e3cce06d0 100644
--- a/tests/python_tests/test_llm_pipeline.py
+++ b/tests/python_tests/test_llm_pipeline.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import openvino_genai as ov_genai
-from openvino_genai import StopCriteria
+from openvino_genai import StopCriteria, GenerationConfig
 import pytest
 from typing import Union, List, Dict, Optional
 import numpy as np
@@ -18,7 +18,6 @@
     get_chat_models_list,
     model_tmp_path,
     STOP_CRITERIA_MAP,
-    get_continuous_batching,
 )
 
 
@@ -299,11 +298,10 @@ def test_batch_size_switch():
 #
 
 generation_configs = [
-    dict(do_sample=False, max_new_tokens=20),
-    dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0)
+    dict(max_new_tokens=20),
+    dict(max_new_tokens=10, num_beam_groups=3, num_beams=15, num_return_sequences=1, diversity_penalty=1.0)
 ]
 
-
 questions = [
     '1+1=',
     'What is the previous answer?',
@@ -311,12 +309,11 @@ def test_batch_size_switch():
     'What was my first question?'
 ]
 
-
-@pytest.mark.parametrize("generation_config", generation_configs)
+@pytest.mark.parametrize("generation_config_kwargs", generation_configs)
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_chat_compare_with_HF(model_descr, generation_config: Dict):
+def test_chat_compare_with_HF(model_descr, generation_config_kwargs: Dict):
     chat_history_hf = []
     chat_history_ov = []
     chat_prompt = ''
@@ -324,6 +321,10 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
     # Will set add_special_tokens=False inside pipeline when start_chat() is called.
     model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
 
+    from transformers import GenerationConfig as HFGenerationConfig
+    hf_generation_config = HFGenerationConfig(**generation_config_kwargs)
+    ov_generation_config = GenerationConfig(**generation_config_kwargs)
+
     ov_pipe.start_chat()
     for prompt in questions:
         chat_history_hf.append({'role': 'user', 'content': prompt})
@@ -332,11 +333,11 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
         chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
         tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
 
-        answer = opt_model.generate(**tokenized, **generation_config)
+        answer = opt_model.generate(**tokenized, generation_config=hf_generation_config)
         answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
         chat_history_hf.append({'role': 'assistant', 'content': answer_str})
 
-        answer_ov = ov_pipe.generate(prompt, **generation_config)
+        answer_ov = ov_pipe.generate(prompt, generation_config=ov_generation_config)
         chat_history_ov.append({'role': 'assistant', 'content': answer_ov})
 
     ov_pipe.finish_chat()
@@ -492,30 +493,9 @@ def test_operator_with_streamer_kwargs_batch_throws():
         ov_pipe('', num_beams=2, streamer=printer)
 
 #
-# Tests on generation configs (invalid cases and handling within LLMPipeline)
+# Tests on generation configs handling
 #
 
-invalid_configs = [
-    dict(num_beam_groups=3, num_beams=15, do_sample=True),
-    # TODO: CVS-158682 eos_token_id is still read from tiny-random-phi3 and we cannot modify RTInfo in tests
-    # dict(do_sample=True),  # no eos_token_id no max_new_tokens, no max_len
-    dict(eos_token_id=42, ignore_eos=True),  # no max_new_tokens, no max_len with ignore_eos
-    dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty
-    dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp
-    dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p
-    dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k
-]
-@pytest.mark.parametrize("generation_config", invalid_configs)
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_invalid_generation_configs_throws(model_tmp_path, generation_config):
-    model_id, temp_path = model_tmp_path
-    config_json = {}
-    ov_pipe = load_genai_pipe_with_configs([(config_json, "config.json")], temp_path)
-    with pytest.raises(RuntimeError):
-        ov_pipe.generate('blah blah', **generation_config)
-
-
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_eos_token_is_inherited_from_default_generation_config(model_tmp_path):
@@ -529,28 +509,14 @@ def test_eos_token_is_inherited_from_default_generation_config(model_tmp_path):
     assert 37 == ov_pipe.get_generation_config().eos_token_id
 
 
-invalid_py_configs = [
-    dict(num_beam_groups=3, num_beams=15, do_sample=True),
-    # TODO: Currently unexpected params do not cause exceptions. Need to implement it in c++ and return this test
-  #  dict(unexisting_key_name=True),  # no eos_token_id no max_new_tokens, no max_len
-    dict(eos_token_id=42, ignore_eos=True),  # no max_new_tokens, no max_len with ignore_eos
-    dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty
-    dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp
-    dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p
-    dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k
-]
 @pytest.mark.precommit
 @pytest.mark.nightly
-@pytest.mark.parametrize("generation_config", invalid_py_configs)
-def test_python_generation_config_validation_throws(model_tmp_path, generation_config):
-    model_id, temp_path = model_tmp_path
-    ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path)
-
-    # 'unexisting_key_name' key validity is checked in pybind and ValueError will be returned
-    #  instead of RuntimeError, which is returned when GenerationConfig values are validated
-    return_exception_type = ValueError if 'unexisting_key_name' in generation_config else RuntimeError
-    with pytest.raises(return_exception_type):
-        ov_pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config))
+def test_pipeline_validates_generation_config():
+    model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
+    ov_pipe = read_model((model_id, path))[4]
+    invalid_generation_config = dict(num_beam_groups=3, num_beams=15, do_sample=True) # beam sample is not supported
+    with pytest.raises(RuntimeError):
+        ov_pipe.generate("dummy prompt", **invalid_generation_config)
 
 #
 # Work with Unicode in Python API
@@ -699,7 +665,7 @@ def test_stop_token_ids():
     res = ov_pipe.generate(
         ov.Tensor([(1,)]),
         max_new_tokens=3,
-        stop_token_ids={-1, 9935, ov_pipe.get_tokenizer().get_eos_token_id()},
+        stop_token_ids={9935, ov_pipe.get_tokenizer().get_eos_token_id()},
         include_stop_str_in_output=False
     )
     assert 2 == len(res.tokens[0])
diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py
index 0c2a106d50..8129298763 100644
--- a/tests/python_tests/test_tokenizer.py
+++ b/tests/python_tests/test_tokenizer.py
@@ -1,6 +1,7 @@
 # Copyright (C) 2023-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 import pytest
 import numpy as np
 from transformers import AutoTokenizer
@@ -17,15 +18,19 @@
 
 
 def load_genai_tokenizer_with_configs(configs: List[Tuple], temp_path):
-    # load Tokenizer where all configs are cleared.
-    # remove existing jsons from previous tests
     for json_file in temp_path.glob("*.json"):
         json_file.unlink()
 
     for config_json, config_name in configs:
         with (temp_path / config_name).open('w') as f:
             json.dump(config_json, f)
-    return openvino_genai.Tokenizer(temp_path)
+
+    ov_tokenizer = openvino_genai.Tokenizer(temp_path)
+
+    for _, config_name in configs:
+        os.remove(temp_path / config_name)
+
+    return ov_tokenizer
 
 
 def get_chat_templates():
@@ -181,7 +186,7 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
 @pytest.mark.nightly
 def test_set_chat_template():
     model_descr = get_chat_models_list()[0]
-    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
 
     prompt = "how are you?"
     dummy_conversation = [
@@ -265,7 +270,7 @@ def test_load_special_tokens_from_special_tokens_map_json(model_tmp_path):
 @pytest.mark.precommit
 @pytest.mark.nightly
 @pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons")
-def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_path_tmp_path):
+def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_tmp_path):
     # special_tokens_map is not available
     # but tokenize_config.json exists
     # will load both string and integer representations
@@ -280,7 +285,7 @@ def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_path_tm
         "eos_token": "</s>",
     }
 
-    tok = load_genai_tokenizer_with_configs([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1])
+    tok = load_genai_tokenizer_with_configs([(tok_config_json, "tokenizer_config.json")], model_tokenizers_tmp_path[1])
     assert tok.get_pad_token() == tok_config_json['pad_token']
     assert tok.get_bos_token() == tok_config_json['bos_token']
     assert tok.get_eos_token() == tok_config_json['eos_token']
diff --git a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp
index 6cf462fdf8..e0c50cda02 100644
--- a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp
+++ b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp
@@ -123,11 +123,6 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data
         ov::genai::GenerationConfig greedy_search = ov::genai::greedy();
         greedy_search.max_new_tokens = std::min(max_output_len, output_len);
         greedy_search.ignore_eos = true;
-        greedy_search.repetition_penalty = 1.0;
-        greedy_search.frequency_penalty = 0.0;
-        greedy_search.presence_penalty = 0.0;
-        greedy_search.diversity_penalty = 0.0;
-        greedy_search.length_penalty = 0.0;
 
         dataset.push_data(human_question, greedy_search);
         dataset.push_lens(input_len, output_len);

From ba0224fc829370d344d4057cfe5c277e9da12fd0 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 30 Dec 2024 15:36:55 +0400
Subject: [PATCH 068/110] Added LoRA support to CB, SD, PL (#1452)

CVS-159960
---
 .../genai/continuous_batching_pipeline.hpp    |  5 +-
 .../include/openvino/genai/lora_adapter.hpp   |  2 +-
 src/cpp/src/continuous_batching_impl.cpp      | 87 +++++++++++++------
 src/cpp/src/continuous_batching_impl.hpp      | 60 +++++++++----
 src/cpp/src/continuous_batching_pipeline.cpp  | 29 ++++---
 ...interface.cpp => icontinuous_batching.cpp} | 25 +++---
 ...interface.hpp => icontinuous_batching.hpp} | 36 +++++++-
 src/cpp/src/llm_pipeline.cpp                  |  1 -
 src/cpp/src/lora_adapter.cpp                  |  2 +-
 src/cpp/src/model_runner.hpp                  |  2 +-
 .../continuous_batching_for_prompt_lookup.cpp |  1 +
 .../src/prompt_lookup/prompt_lookup_impl.cpp  | 13 ++-
 .../src/prompt_lookup/prompt_lookup_impl.hpp  |  4 +-
 src/cpp/src/scheduler.hpp                     |  8 ++
 ...batching_for_speculative_decoding_impl.cpp |  2 +-
 .../speculative_decoding_impl.cpp             | 14 ++-
 .../speculative_decoding_impl.hpp             |  2 +-
 tests/cpp/CMakeLists.txt                      |  2 +
 18 files changed, 214 insertions(+), 81 deletions(-)
 rename src/cpp/src/{continuous_batching_impl_interface.cpp => icontinuous_batching.cpp} (79%)
 rename src/cpp/src/{continuous_batching_impl_interface.hpp => icontinuous_batching.hpp} (72%)

diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
index 74466ee488..ed9fc3a30d 100644
--- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -52,8 +52,9 @@ struct PipelineMetrics {
 
 class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
 protected:
-    class ImplInterface;
+    class IContinuousBatchingPipeline;
     class ContinuousBatchingImpl;
+
     class ContinuousBatchingForSpeculativeDecodingImpl;
     class ContinuousBatchingForPromptLookupImpl;
     class SpeculativeDecodingImpl;
@@ -64,7 +65,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
     friend class SpeculativeDecodingImpl;
     friend class PromptLookupImpl;
 
-    std::shared_ptr<ImplInterface> m_impl;
+    std::shared_ptr<IContinuousBatchingPipeline> m_impl;
 
     ContinuousBatchingPipeline() = default;
 
diff --git a/src/cpp/include/openvino/genai/lora_adapter.hpp b/src/cpp/include/openvino/genai/lora_adapter.hpp
index 277ec57cc3..b6b91bee20 100644
--- a/src/cpp/include/openvino/genai/lora_adapter.hpp
+++ b/src/cpp/include/openvino/genai/lora_adapter.hpp
@@ -188,7 +188,7 @@ class OPENVINO_GENAI_EXPORTS AdapterController {
     AdapterController(std::shared_ptr<ov::Model> model, const AdapterConfig& config, std::string device);
 
     // Apply adapters configured in the current config set last time, or set and use new config given as optional `config` argument
-    void apply(ov::InferRequest& request, const std::optional<AdapterConfig>& config = std::nullopt);
+    void apply(ov::InferRequest request, const std::optional<AdapterConfig>& config = std::nullopt);
 
     // Returns true if a given name is one of the state names created by this adapter controller for dynamic LoRA
     // Helps to distinguish LoRA states from other states (e.g. KV cache state) in the model for a partial state reset.
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 52ec6a8302..9e20171dcb 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -5,6 +5,7 @@
 #include "continuous_batching_impl.hpp"
 #include "utils.hpp"
 #include "utils/paged_attention_transformations.hpp"
+#include "lora_helper.hpp"
 
 namespace ov::genai {
 template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
@@ -17,8 +18,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
     const std::string& device,
     const ov::AnyMap& properties,
     const ov::genai::GenerationConfig& generation_config,
-    bool is_validation_mode_enabled
-    ) {
+    bool is_validation_mode_enabled) {
     m_tokenizer = tokenizer;
     m_generation_config = generation_config;
     m_is_validation_mode_enabled = is_validation_mode_enabled;
@@ -33,22 +33,33 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
     bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
     utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control);
 
-    init(model, scheduler_config, compile_properties, device_config, core);
+    initialize_pipeline(model, scheduler_config, compile_properties, device_config, core);
 }
 
 void ContinuousBatchingPipeline::ContinuousBatchingImpl::_pull_awaiting_requests() {
     std::lock_guard<std::mutex> lock{m_awaiting_requests_mutex};
     m_requests.insert(m_requests.end(), m_awaiting_requests.begin(), m_awaiting_requests.end());
     m_awaiting_requests.clear();
+    m_pipeline_metrics.requests = m_requests.size();
 }
 
-void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
+void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline(
     std::shared_ptr<ov::Model> model,
     const SchedulerConfig& scheduler_config,
     const ov::AnyMap& properties,
     const DeviceConfig& device_config,
     ov::Core& core) {
-    auto compiled_model = core.compile_model(model, device_config.get_device(), properties);
+    ov::CompiledModel compiled_model;
+
+    // apply LoRA
+    if (auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters)) {
+        m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
+        m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device_config.get_device());   // TODO: Make the prefix name configurable
+        compiled_model = core.compile_model(model, device_config.get_device(), *filtered_properties);
+    } else {
+        compiled_model = core.compile_model(model, device_config.get_device(), properties);
+    }
+
     ov::genai::utils::print_compiled_model_properties(compiled_model, "LLM with Paged Attention");
     ov::InferRequest infer_request = compiled_model.create_infer_request();
 
@@ -68,9 +79,12 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
         can_use_partial_preemption = false;
     }
     m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), m_cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption);
-    // and finally create model runner
+
+    // model runner
     bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction;
     m_model_runner = std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction);
+
+    // sampler
     m_sampler = std::make_shared<Sampler>(m_tokenizer);
     m_sampler->set_seed(m_generation_config.rng_seed);
 
@@ -94,6 +108,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
                                                                         m_scheduler->get_block_size(),
                                                                         m_scheduler->get_config().enable_prefix_caching);
     sequence_group->set_sequence_group_ptr(sequence_group);
+
     if (m_scheduler->get_config().enable_prefix_caching) {
         m_scheduler->restore_cached_blocks(sequence_group);
     }
@@ -102,6 +117,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
         std::lock_guard<std::mutex> lock{m_awaiting_requests_mutex};
         m_awaiting_requests.push_back(sequence_group);
     }
+
     return std::make_shared<GenerationHandleImpl>(sequence_group->get_generation_stream(), sampling_params);
 };
 
@@ -113,6 +129,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
     timer.start();
     ov::Tensor input_ids = m_tokenizer.encode(prompt).input_ids;
     timer.end();
+
     return add_request(request_id, input_ids, sampling_params);
 }
 
@@ -127,24 +144,26 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
 
     _pull_awaiting_requests();
 
-    m_pipeline_metrics.requests = m_requests.size();
     Scheduler::Output scheduler_output;
     {
-        static ManualTimer timer("scheduling");
-        timer.start();
-        m_scheduler->clean_empty_blocks(m_requests);
+        static ManualTimer scheduling_timer("scheduling");
+        scheduling_timer.start();
         scheduler_output = m_scheduler->schedule(m_requests);
+        scheduling_timer.end();
+
         m_pipeline_metrics.scheduled_requests = scheduler_output.m_scheduled_sequence_groups_ids.size();
         m_pipeline_metrics.cache_usage = scheduler_output.m_cache_usage;
-        m_pipeline_metrics.max_cache_usage =
-            std::max(m_pipeline_metrics.max_cache_usage, scheduler_output.m_cache_usage);
+        m_pipeline_metrics.max_cache_usage = std::max(m_pipeline_metrics.max_cache_usage, scheduler_output.m_cache_usage);
         _register_step_cache_usage(scheduler_output.m_cache_usage);
         m_pipeline_metrics.avg_cache_usage = _get_current_running_average_cache_usage();
+
+        static ManualTimer copy_blocks_timer("scheduling");
+        copy_blocks_timer.start();
         m_cache_manager->copy_blocks(scheduler_output.m_block_copy_map);
-        timer.end();
+        copy_blocks_timer.end();
     }
 
-    // if no tokens were scheduled, we are out of memory
+    // if no tokens were scheduled, we are out of memory => free all requests and return
     if (scheduler_output.m_total_num_scheduled_tokens == 0) {
         for (size_t i = 0; i < m_requests.size(); ++i) {
             SequenceGroup::Ptr sequence_group = m_requests[i];
@@ -166,15 +185,14 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
     }
 
 #ifdef DEBUG_CACHE_STATE_DUMP
-
     CacheStateDumper dumper(CacheStateDumper::get_run_id_for_generation_step(step_count, "before_eviction"));
     dumper.dump_cache_state(*m_scheduler, m_requests, step_count);
 #endif
-    const auto& sched_config = m_scheduler->get_config();
 
     // evict unimportant blocks from KV cache, if requested
+    const auto& sched_config = m_scheduler->get_config();
     if (sched_config.use_cache_eviction) {
-        maybe_evict_cache_blocks(sched_config);
+        _maybe_evict_cache_blocks(sched_config);
     }
 
 #ifdef DEBUG_CACHE_STATE_DUMP
@@ -183,6 +201,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
     step_count++;
 #endif
 
+    // process generation_config.echo parameetr
     _fill_prompt_log_probs(m_requests, logits);
 
     SamplerOutput sampler_output;
@@ -195,8 +214,8 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
 
     // process sampler_output (e.g. fork or drop sequences from BlockScheduler)
     {
-        static ManualTimer timer("fork / free sequence");
-        timer.start();
+        static ManualTimer free_fork_timer("fork / free sequence");
+        free_fork_timer.start();
 
         for (const auto& pair : sampler_output.m_forked_sequences) {
             uint64_t parent_id = pair.first;
@@ -208,35 +227,49 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
         for (auto seq_id : sampler_output.m_dropped_sequences)
             m_scheduler->free_sequence(seq_id);
 
-        timer.end();
+        free_fork_timer.end();
     }
 
     // notify requests dropped by handle
     {
-        static ManualTimer timer("notify requests dropped by handle");
-        timer.start();
+        static ManualTimer report_tokens_timer("notify requests dropped by handle");
+        report_tokens_timer.start();
         _notify_requests_dropped_by_handle();
-        timer.end();
+        report_tokens_timer.end();
     }
 
     // free non running requests for current step
 
     {
-        static ManualTimer timer("free non running requests");
-        timer.start();
+        static ManualTimer clean_up_requests_timer("free non running requests");
+        clean_up_requests_timer.start();
         _free_non_running_requests();
-        timer.end();
+        clean_up_requests_timer.end();
     }
 
     step_timer.end();
 }
 
+void ContinuousBatchingPipeline::ContinuousBatchingImpl::set_adapters(const std::optional<AdapterConfig>& adapters) {
+    if (m_adapter_controller) {
+        m_adapter_controller->apply(m_model_runner->get_infer_request(), adapters);
+    }
+}
+
 std::vector<EncodedGenerationResult>
 ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<ov::Tensor>& input_ids,
                                                              const std::vector<GenerationConfig>& sampling_params,
                                                              const StreamerVariant& streamer) {
     OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request");
     OPENVINO_ASSERT(input_ids.size() == sampling_params.size());
+
+    // checks that all requests has the same LoRA adapters property value
+    for (size_t i = 1; i < sampling_params.size(); ++i) {
+        OPENVINO_ASSERT(sampling_params[i - 1].adapters == sampling_params[i].adapters,
+            "LoRA adapters value must be the same for all requests");
+    }
+    set_adapters(sampling_params[0].adapters);
+
     const std::shared_ptr<StreamerBase>& streamer_ptr = std::visit(overloaded{
         [](std::monostate) -> std::shared_ptr<StreamerBase> {
             return nullptr;
@@ -375,7 +408,7 @@ float ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_current_running_a
     return std::accumulate(m_previous_step_cache_usages.begin(), m_previous_step_cache_usages.end(), 0.0) / m_previous_step_cache_usages.size();
 }
 
-void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_blocks(const SchedulerConfig& sched_config) {
+void ContinuousBatchingPipeline::ContinuousBatchingImpl::_maybe_evict_cache_blocks(const SchedulerConfig& sched_config) {
     std::unordered_map<SequenceGroup::Ptr, size_t> seq_group_to_num_blocks_evicted_map;
     auto sequence_attention_scores = m_model_runner->get_last_attention_scores();
     for (auto& seq_id_and_attention_scores : sequence_attention_scores) {
diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp
index 8da05c6dfa..d319147f2c 100644
--- a/src/cpp/src/continuous_batching_impl.hpp
+++ b/src/cpp/src/continuous_batching_impl.hpp
@@ -3,16 +3,19 @@
 
 #pragma once
 
-#include "continuous_batching_impl_interface.hpp"
-#include "openvino/genai/continuous_batching_pipeline.hpp"
+#include "icontinuous_batching.hpp"
+
+#include "openvino/genai/lora_adapter.hpp"
 #include "cache_eviction.hpp"
 
 namespace ov::genai {
-class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatchingPipeline::ImplInterface {
+
+class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatchingPipeline::IContinuousBatchingPipeline {
 protected:
     std::shared_ptr<Scheduler> m_scheduler;
     std::shared_ptr<CacheManager> m_cache_manager;
     std::shared_ptr<ModelRunner> m_model_runner;
+    std::optional<AdapterController> m_adapter_controller;
     std::shared_ptr<Sampler> m_sampler;
 
     // current requests to process
@@ -26,7 +29,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
 
     static const size_t AVG_CACHE_USAGE_WINDOW_SIZE_IN_STEPS = 1000;
     std::deque<float> m_previous_step_cache_usages;
-    
+
     // flag to enable validation mode for sampler
     bool m_is_validation_mode_enabled = false;
 
@@ -37,21 +40,41 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
     // used by tests only
     ContinuousBatchingImpl() = default;
 
+    void initialize_pipeline(std::shared_ptr<ov::Model> model,
+                             const SchedulerConfig& scheduler_config,
+                             const ov::AnyMap& plugin_config,
+                             const DeviceConfig& device_config,
+                             ov::Core& core);
+
+    /**
+     * Pulls requests from awaiting queue to running queue
+     * Should be called within each call of step()
+     */
+    virtual void _pull_awaiting_requests();
+
+    /**
+     * Releases non-running (finished, dropped or OOM) requests from running queue
+     */
     void _free_non_running_requests();
+
+    /**
+     * Notify dropped requests by pushing empty output
+     */
     void _notify_requests_dropped_by_handle();
-    void _register_step_cache_usage(float step_cache_usage);
-    float _get_current_running_average_cache_usage() const;
-    void maybe_evict_cache_blocks(const SchedulerConfig& sched_config);
 
-    void init(std::shared_ptr<ov::Model> model,
-              const SchedulerConfig& scheduler_config,
-              const ov::AnyMap& plugin_config,
-              const DeviceConfig& device_config,
-              ov::Core& core);
+    /**
+     * Handles 'echo' generation parameter
+     */
+    void _fill_prompt_log_probs(std::vector<SequenceGroup::Ptr>& sequence_groups, ov::Tensor& logits);
 
-    virtual void _pull_awaiting_requests();
+    /**
+     * Performs KV cache eviction is enabled / requireed
+     */
+    void _maybe_evict_cache_blocks(const SchedulerConfig& sched_config);
+
+    void _register_step_cache_usage(float step_cache_usage);
+    float _get_current_running_average_cache_usage() const;
 
-    void _fill_prompt_log_probs(std::vector<SequenceGroup::Ptr>& sequence_groups, ov::Tensor& logits);
 public:
     ContinuousBatchingImpl(const std::shared_ptr<ov::Model>& model,
                            const Tokenizer& tokenizer,
@@ -64,6 +87,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
     GenerationHandle add_request(uint64_t request_id,
                                  const ov::Tensor& input_ids,
                                  ov::genai::GenerationConfig sampling_params) override;
+
     GenerationHandle add_request(uint64_t request_id,
                                  const std::string& prompt,
                                  ov::genai::GenerationConfig sampling_params) override;
@@ -76,5 +100,11 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
     generate(const std::vector<ov::Tensor>& input_ids,
              const std::vector<GenerationConfig>& sampling_params,
              const StreamerVariant& streamer) override;
+
+    /**
+     * Updates LoRA adapters for current generation call
+     */
+    void set_adapters(const std::optional<AdapterConfig>& adapters);
 };
-}
\ No newline at end of file
+
+} // namespace ov::genai
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index 148eb2fa9f..8b7003e4ab 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -47,19 +47,20 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
     auto properties_without_draft_model = properties;
     auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
-    
+
     std::filesystem::path openvino_model_name = "openvino_model.xml";
     auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
     auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties);
     auto generation_config = utils::from_config_json_if_exists(models_path);
+
     if (is_prompt_lookup_enabled) {
-        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
         m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
-    } else if (draft_model_desr.model == nullptr) {
-        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
-    } else {
+    } else if (draft_model_desr.model != nullptr) {
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
         m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
+    } else {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     }
 }
 
@@ -77,13 +78,13 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     auto generation_config = utils::from_config_json_if_exists(models_path);
 
     if (is_prompt_lookup_enabled) {
-        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
         m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
-    } else if (draft_model_desr.model == nullptr) {
-        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
-    } else {
+    } else if (draft_model_desr.model != nullptr) {
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
         m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
+    } else {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     }
 }
 
@@ -101,13 +102,13 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     auto model = utils::singleton_core().read_model(model_str, weights_tensor);
 
     if (is_prompt_lookup_enabled) {
-        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded");
+        OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
         m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
-    } else if (draft_model_desr.model == nullptr) {
-        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
-    } else {
+    } else if (draft_model_desr.model != nullptr) {
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
-        m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);    
+        m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
+    } else {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     }
 }
 
diff --git a/src/cpp/src/continuous_batching_impl_interface.cpp b/src/cpp/src/icontinuous_batching.cpp
similarity index 79%
rename from src/cpp/src/continuous_batching_impl_interface.cpp
rename to src/cpp/src/icontinuous_batching.cpp
index 10fc102aa0..e32616b0aa 100644
--- a/src/cpp/src/continuous_batching_impl_interface.cpp
+++ b/src/cpp/src/icontinuous_batching.cpp
@@ -1,40 +1,41 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#include "continuous_batching_impl_interface.hpp"
+#include "icontinuous_batching.hpp"
 
 namespace ov::genai {
 
-GenerationConfig ContinuousBatchingPipeline::ImplInterface::get_config() const {
+GenerationConfig ContinuousBatchingPipeline::IContinuousBatchingPipeline::get_config() const {
     return m_generation_config;
 }
 
-PipelineMetrics ContinuousBatchingPipeline::ImplInterface::get_metrics() const {
+PipelineMetrics ContinuousBatchingPipeline::IContinuousBatchingPipeline::get_metrics() const {
     return m_pipeline_metrics;
 }
 
-Tokenizer ContinuousBatchingPipeline::ImplInterface::get_tokenizer() {
+Tokenizer ContinuousBatchingPipeline::IContinuousBatchingPipeline::get_tokenizer() {
     return m_tokenizer;
 }
 
-void ContinuousBatchingPipeline::ImplInterface::start_chat(const std::string& system_message) {
+void ContinuousBatchingPipeline::IContinuousBatchingPipeline::start_chat(const std::string& system_message) {
     if (!system_message.empty()) {
         m_history.push_back({{"role", "system"}, {"content", system_message}});
     }
     m_is_chat_conversation = true;
 };
 
-void ContinuousBatchingPipeline::ImplInterface::finish_chat() {
+void ContinuousBatchingPipeline::IContinuousBatchingPipeline::finish_chat() {
     m_is_chat_conversation = false;
     m_history.clear();
 };
 
 std::vector<GenerationResult>
-ContinuousBatchingPipeline::ImplInterface::generate(
+ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     const std::vector<std::string>& prompts,
     std::vector<ov::genai::GenerationConfig> sampling_params,
     const StreamerVariant& streamer) {
     std::vector<ov::Tensor> input_ids;
+
     static ManualTimer timer("tokenize");
     if (m_is_chat_conversation) {
         OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts");
@@ -47,13 +48,15 @@ ContinuousBatchingPipeline::ImplInterface::generate(
         timer.end();
     } else {
         input_ids.reserve(prompts.size());
+        timer.start();
         for (const std::string& prompt : prompts) {
-            timer.start();
             input_ids.push_back(m_tokenizer.encode(prompt).input_ids);
-            timer.end();
         }
+        timer.end();
     }
+
     std::vector<EncodedGenerationResult> encoded = generate(input_ids, sampling_params, streamer);
+
     std::vector<GenerationResult> decoded;
     decoded.reserve(encoded.size());
     for (EncodedGenerationResult& res : encoded) {
@@ -65,6 +68,7 @@ ContinuousBatchingPipeline::ImplInterface::generate(
                 m_history.push_back({{"role", "assistant"}, {"content", generated.back()}});
             }
         }
+
         decoded.push_back(GenerationResult{
             res.m_request_id,
             std::move(generated),
@@ -72,6 +76,7 @@ ContinuousBatchingPipeline::ImplInterface::generate(
             res.m_status
         });
     }
+
     return decoded;
 }
-}
\ No newline at end of file
+}
diff --git a/src/cpp/src/continuous_batching_impl_interface.hpp b/src/cpp/src/icontinuous_batching.hpp
similarity index 72%
rename from src/cpp/src/continuous_batching_impl_interface.hpp
rename to src/cpp/src/icontinuous_batching.hpp
index 909383c98a..12030f06f7 100644
--- a/src/cpp/src/continuous_batching_impl_interface.hpp
+++ b/src/cpp/src/icontinuous_batching.hpp
@@ -12,7 +12,10 @@
 
 namespace ov::genai {
 
-class ContinuousBatchingPipeline::ImplInterface {
+/**
+ * Base interface for all continuous batching based pipelines
+ */
+class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
 protected:
     Tokenizer m_tokenizer;
 
@@ -35,6 +38,7 @@ class ContinuousBatchingPipeline::ImplInterface {
             // std::cout << std::endl;
         }
     } m_perf;
+
     bool m_is_chat_conversation = false;
     ChatHistory m_history;
 
@@ -43,27 +47,57 @@ class ContinuousBatchingPipeline::ImplInterface {
     PipelineMetrics get_metrics() const;
     ov::genai::Tokenizer get_tokenizer();
 
+    /**
+     * Adds requests to awaiting queue using encoded inputs
+     */
     virtual GenerationHandle add_request(uint64_t request_id,
                                          const ov::Tensor& input_ids,
                                          ov::genai::GenerationConfig sampling_params) = 0;
+
+    /**
+     * Adds request to running queue based on string input
+     * This step also performs tokenization's encode
+     */
     virtual GenerationHandle add_request(uint64_t request_id,
                                          const std::string& prompt,
                                          ov::genai::GenerationConfig sampling_params) = 0;
     
+    /**
+     * Checks whether server (pipeline) has non-finished requests and step() should be called within a loop
+     */
     virtual bool has_non_finished_requests() = 0;
 
+    /**
+     * Performs a single inference step of all running (and pulls awaiting) requests
+     */
     virtual void step() = 0;
 
+    /**
+     * Performs monolitic generation based on encoded prompts
+     */
     virtual std::vector<EncodedGenerationResult>
     generate(const std::vector<ov::Tensor>& input_ids,
              const std::vector<GenerationConfig>& sampling_params,
              const StreamerVariant& streamer) = 0;
+
+    /**
+     * Performs monolitic generation based on text prompts
+     */
     std::vector<GenerationResult>
     generate(const std::vector<std::string>& prompts,
              std::vector<ov::genai::GenerationConfig> sampling_params,
              const StreamerVariant& streamer);
 
+    /**
+     * Starts chat with a given system prompt
+     * 
+     * In chat scenario prompts passed to `generate` method are accumulated inside the pipeline until `finish_chat` is called
+     */
     void start_chat(const std::string& system_message);
+
+    /**
+     * Ends chat
+     */
     void finish_chat();
 };
 }
\ No newline at end of file
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 3e378e78cf..74fe821a5e 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -15,7 +15,6 @@
 #include "llm_pipeline_static.hpp"
 #include "utils.hpp"
 #include "text_callback_streamer.hpp"
-#include "openvino/genai/lora_adapter.hpp"
 #include "lora_helper.hpp"
 #include "speculative_decoding/speculative_decoding_impl.hpp"
 #include "sampler.hpp"
diff --git a/src/cpp/src/lora_adapter.cpp b/src/cpp/src/lora_adapter.cpp
index fd446ef708..e060e55160 100644
--- a/src/cpp/src/lora_adapter.cpp
+++ b/src/cpp/src/lora_adapter.cpp
@@ -1305,7 +1305,7 @@ AdapterController::AdapterController(std::shared_ptr<ov::Model> model, const Ada
 
 
 // Call it every time when adapter config is changed; if adapter was configured as a static one, this call is not required
-void AdapterController::apply(ov::InferRequest& request, const std::optional<AdapterConfig>& config) {
+void AdapterController::apply(ov::InferRequest request, const std::optional<AdapterConfig>& config) {
     OPENVINO_ASSERT(m_pimpl || !config || !*config,
         "Adapters are passed to AdapterController but it was not configured to use adapters. "
         "Enable using adapters by pass them in the constructor first.");
diff --git a/src/cpp/src/model_runner.hpp b/src/cpp/src/model_runner.hpp
index 1b96cdc505..abc96ac423 100644
--- a/src/cpp/src/model_runner.hpp
+++ b/src/cpp/src/model_runner.hpp
@@ -52,7 +52,7 @@ class ModelRunner {
     /**
      * @return The ov::InferRequest this ModelRunner is handling.
      */
-    ov::InferRequest get_infer_request() const {
+    ov::InferRequest get_infer_request() {
         return m_request;
     }
 
diff --git a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp
index 8c9e520728..ffc8a8aab2 100644
--- a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp
+++ b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp
@@ -82,4 +82,5 @@ void ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::generate
         request->set_num_validated_tokens(max_validation_len);
     }
 }
+
 }
\ No newline at end of file
diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp
index f934a56939..7a893a2603 100644
--- a/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp
+++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp
@@ -73,10 +73,19 @@ std::vector<EncodedGenerationResult>
 ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector<ov::Tensor>& input_ids,
                                                        const std::vector<GenerationConfig>& sampling_params,
                                                        const StreamerVariant& streamer) {
-    ManualTimer generate_timer("speculative_decoding: generate()");
-    generate_timer.start();
     OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request");
     OPENVINO_ASSERT(input_ids.size() == sampling_params.size());
+
+    ManualTimer generate_timer("speculative_decoding: generate()");
+    generate_timer.start();
+
+    // checks that all requests has the same LoRA adapters property value
+    for (size_t i = 1; i < sampling_params.size(); ++i) {
+        OPENVINO_ASSERT(sampling_params[i - 1].adapters == sampling_params[i].adapters,
+            "LoRA adapters value must be the same for all requests");
+    }
+    m_pipeline->set_adapters(sampling_params[0].adapters);
+
     const std::shared_ptr<StreamerBase>& streamer_ptr = std::visit(overloaded{
         [](std::monostate) -> std::shared_ptr<StreamerBase> {
             return nullptr;
diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
index dae721741b..0c05c2afd0 100644
--- a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
+++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
@@ -11,11 +11,11 @@
 
 namespace ov::genai {
 
-class ContinuousBatchingPipeline::PromptLookupImpl : public ContinuousBatchingPipeline::ImplInterface {
+class ContinuousBatchingPipeline::PromptLookupImpl : public ContinuousBatchingPipeline::IContinuousBatchingPipeline {
 protected:
     std::shared_ptr<ContinuousBatchingForPromptLookupImpl> m_pipeline;
     SpeculativeDecodingMetrics m_sd_metrics;
-    
+
 public:
     PromptLookupImpl(const std::shared_ptr<ov::Model>& model,
                      const Tokenizer& tokenizer,
diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp
index da65c68bec..0057b19329 100644
--- a/src/cpp/src/scheduler.hpp
+++ b/src/cpp/src/scheduler.hpp
@@ -56,6 +56,10 @@ class Scheduler {
 
     Output schedule(std::vector<SequenceGroup::Ptr>& sequence_groups) {
         Output scheduler_output;
+
+        // free some blocks taken by non-confirmed condidates in SD / prompt look-up
+        clean_empty_blocks(sequence_groups);
+
         if (m_block_manager.get_total_number_of_kv_blocks() == 0) {
             _initialize_cache(sequence_groups);
         }
@@ -84,6 +88,10 @@ class Scheduler {
         return scheduler_output;
     }
 
+    /**
+     * Some requests can contain empty blocks after prompt look-up or speculative decoding
+     * when candidates are not confirmed by main model and we need to free blocks, taken by these candidates
+     */
     void clean_empty_blocks(std::vector<SequenceGroup::Ptr>& seq_groups) {
         for (const auto& seq_group : seq_groups)
             m_block_manager.free_empty_physical_blocks(seq_group);
diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp
index 36f274f30f..5091218ccd 100644
--- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp
@@ -17,7 +17,7 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::Contin
     m_tokenizer = tokenizer;
     m_generation_config = generation_config;
     m_is_validation_mode_enabled = is_validation_mode_enabled;
-    init(model, scheduler_config, plugin_config, device_config, core);
+    initialize_pipeline(model, scheduler_config, plugin_config, device_config, core);
 }
 
 void
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
index 257c20bf01..4021742961 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -193,10 +193,20 @@ std::vector<EncodedGenerationResult>
 ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<ov::Tensor>& input_ids,
                                                               const std::vector<GenerationConfig>& sampling_params,
                                                               const StreamerVariant& streamer) {
-    ManualTimer generate_timer("speculative_decoding: generate()");
-    generate_timer.start();
     OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request");
     OPENVINO_ASSERT(input_ids.size() == sampling_params.size());
+
+    ManualTimer generate_timer("speculative_decoding: generate()");
+    generate_timer.start();
+
+    // checks that all requests has the same LoRA adapters property value
+    for (size_t i = 1; i < sampling_params.size(); ++i) {
+        OPENVINO_ASSERT(sampling_params[i - 1].adapters == sampling_params[i].adapters,
+            "LoRA adapters value must be the same for all requests");
+    }
+    m_main_pipeline->set_adapters(sampling_params[0].adapters);
+    m_draft_pipeline->set_adapters(sampling_params[0].adapters);
+
     const std::shared_ptr<StreamerBase>& streamer_ptr = std::visit(overloaded{
         [](std::monostate) -> std::shared_ptr<StreamerBase> {
             return nullptr;
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
index 3df02ac394..2f8067cbab 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
@@ -34,7 +34,7 @@ struct ModelDesc {
     ModelDesc() = default;
 };
 
-class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::ImplInterface {
+class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::IContinuousBatchingPipeline {
 protected:
     std::shared_ptr<ContinuousBatchingForSpeculativeDecodingImpl> m_main_pipeline, m_draft_pipeline;
     SpeculativeDecodingMetrics m_sd_metrics;
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index b8c2e625c5..5880010841 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -23,6 +23,8 @@ file(GLOB src_files "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sequence_group.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/utils/*.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/utils.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/continuous_batching*.cpp"
+                    "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/icontinuous_batching.cpp"
+                    "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/lora_helper.cpp"
                     "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/text_callback_streamer.cpp")
 
 add_executable(${TEST_TARGET_NAME} ${tests_src})

From 0c5f03ba05e4332476398108f3681ae865e5d3a1 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 30 Dec 2024 16:41:36 +0400
Subject: [PATCH 069/110] Split LLMPipeline by several files (#1454)

---
 src/cpp/src/continuous_batching_adapter.hpp | 171 +++++
 src/cpp/src/llm_pipeline.cpp                | 723 ++------------------
 src/cpp/src/llm_pipeline_base.hpp           |  28 +-
 src/cpp/src/llm_pipeline_stateful.cpp       | 405 +++++++++++
 src/cpp/src/llm_pipeline_stateful.hpp       |  77 +++
 src/cpp/src/utils.hpp                       |   6 +-
 6 files changed, 746 insertions(+), 664 deletions(-)
 create mode 100644 src/cpp/src/continuous_batching_adapter.hpp
 create mode 100644 src/cpp/src/llm_pipeline_stateful.cpp
 create mode 100644 src/cpp/src/llm_pipeline_stateful.hpp

diff --git a/src/cpp/src/continuous_batching_adapter.hpp b/src/cpp/src/continuous_batching_adapter.hpp
new file mode 100644
index 0000000000..246cb51149
--- /dev/null
+++ b/src/cpp/src/continuous_batching_adapter.hpp
@@ -0,0 +1,171 @@
+
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "llm_pipeline_base.hpp"
+
+#include "openvino/genai/continuous_batching_pipeline.hpp"
+
+namespace ov::genai {
+
+Tokenizer dont_construct() {
+    OPENVINO_THROW("Continuous Batching backend can't be constructed"
+        "from ireq because the model must be transformed");
+}
+
+template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
+template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+
+class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
+    ContinuousBatchingPipeline m_impl;
+public:
+    ContinuousBatchingAdapter(
+        const ov::InferRequest& request,
+        const Tokenizer& tokenizer,
+        OptionalGenerationConfig generation_config
+    ): LLMPipelineImplBase{dont_construct(), GenerationConfig{}},
+        m_impl{std::filesystem::path{}, SchedulerConfig{}, std::string{}} { }
+
+    ContinuousBatchingAdapter(
+        const std::filesystem::path& models_path,
+        const Tokenizer& tokenizer,
+        const SchedulerConfig& scheduler_config,
+        const std::string& device,
+        const ov::AnyMap& plugin_config
+    ): LLMPipelineImplBase{tokenizer, GenerationConfig()}, m_impl{
+        models_path.string(),
+        tokenizer,
+        scheduler_config,
+        device,
+        plugin_config} {
+        m_generation_config = m_impl.get_config();
+    }
+
+    ContinuousBatchingAdapter(
+        const std::string& model_str,
+        const ov::Tensor& weights_tensor,
+        const Tokenizer& tokenizer,
+        const SchedulerConfig& scheduler_config,
+        const std::string& device,
+        const ov::AnyMap& plugin_config,
+        const ov::genai::GenerationConfig& generation_config
+    ): LLMPipelineImplBase{tokenizer, GenerationConfig()}, m_impl{
+        model_str, 
+        weights_tensor,
+        tokenizer,
+        scheduler_config,
+        device,
+        plugin_config,
+        generation_config} {}
+
+    ContinuousBatchingAdapter(
+        const std::filesystem::path& models_path,
+        const SchedulerConfig& scheduler_config,
+        const std::string& device,
+        const ov::AnyMap& plugin_config
+    ): LLMPipelineImplBase{Tokenizer(models_path), GenerationConfig()}, m_impl{
+        models_path.string(),
+        m_tokenizer,
+        scheduler_config,
+        device,
+        plugin_config} {
+        m_generation_config = m_impl.get_config();
+    }
+
+    DecodedResults generate(
+        StringInputs inputs,
+        OptionalGenerationConfig generation_config,
+        StreamerVariant streamer
+    ) override {
+        std::vector<std::string> prompts = std::visit(overloaded{
+            [](const std::string& prompt) {
+                return std::vector{prompt};
+            },
+            [](std::vector<std::string>& prompts) {
+                return prompts;
+            }
+        }, inputs);
+        const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
+        // -1 == config.eos_token_id and config.validate() are handled in m_impl.
+        std::vector<GenerationResult> generated = m_impl.generate(
+            prompts,
+            std::vector<GenerationConfig>{prompts.size(), config},
+            streamer
+        );
+        std::vector<std::string> plain_replies;
+        std::vector<float> plain_scores;
+        for (GenerationResult& res : generated) {
+            OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
+            std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies));
+            std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
+        }
+        return {std::move(plain_replies), std::move(plain_scores)};
+    }
+
+    EncodedResults generate(
+        const EncodedInputs& inputs,
+        OptionalGenerationConfig generation_config,
+        StreamerVariant streamer
+    ) override {
+        std::vector<ov::Tensor> input_ids = std::visit(overloaded{
+            [](const ov::Tensor& inp) {
+                size_t batch_size = inp.get_shape().at(0);
+                if (1 == batch_size) {
+                    return std::vector{inp};
+                }
+                std::vector<ov::Tensor> input_ids;
+                input_ids.reserve(batch_size);
+                size_t max_len = inp.get_shape().at(1);
+                const int64_t* const source = inp.data<const int64_t>();
+                for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
+                    input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
+                    int64_t* destination = input_ids.back().data<int64_t>();
+                    std::copy_n(source + batch_id * max_len, max_len, destination);
+                }
+                return input_ids;
+            },
+            [](const TokenizedInputs& inp) {
+                size_t batch_size = inp.input_ids.get_shape().at(0);
+                std::vector<ov::Tensor> input_ids;
+                input_ids.reserve(batch_size);
+                size_t max_len = inp.input_ids.get_shape().at(1);
+                const int64_t* const source = inp.input_ids.data<const int64_t>();
+                const int64_t* const attention_mask = inp.attention_mask.data<const int64_t>();
+                for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
+                    input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
+                    int64_t* destination = input_ids.back().data<int64_t>();
+                    size_t copy_count = 0;
+                    for (size_t idx = 0; idx < max_len; ++idx) {
+                        if (1 == attention_mask[batch_id * max_len + idx]) {
+                            destination[copy_count++] = source[batch_id * max_len + idx];
+                        }
+                    }
+                    input_ids.back().set_shape({1, copy_count});
+                }
+                return input_ids;
+            }
+        }, inputs);
+
+        const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
+        // -1 == config.eos_token_id and config.validate() are handled in m_impl.
+        std::vector<EncodedGenerationResult> generated = m_impl.generate(input_ids, std::vector<GenerationConfig>{input_ids.size(), config}, streamer);
+        std::vector<std::vector<int64_t>> plain_tokens;
+        std::vector<float> plain_scores;
+        for (EncodedGenerationResult& res : generated) {
+            OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
+            std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens));
+            std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
+        }
+        return {std::move(plain_tokens), std::move(plain_scores)};
+    }
+
+    void start_chat(const std::string& system_message) override {
+        m_impl.start_chat();
+    };
+
+    void finish_chat() override {
+        m_impl.finish_chat();
+    };
+};
+
+} // namespace ov::genai
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 74fe821a5e..5022595da1 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -1,474 +1,48 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#include <filesystem>
 #include <fstream>
-#include <variant>
-#include <algorithm>
+
 #include <nlohmann/json.hpp>
-#include <openvino/openvino.hpp>
-#include "openvino/genai/continuous_batching_pipeline.hpp"
-#include "openvino/genai/generation_config.hpp"
+
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/genai/perf_metrics.hpp"
-#include "llm_pipeline_base.hpp"
+
 #include "llm_pipeline_static.hpp"
-#include "utils.hpp"
-#include "text_callback_streamer.hpp"
-#include "lora_helper.hpp"
+#include "llm_pipeline_stateful.hpp"
+#include "continuous_batching_adapter.hpp"
 #include "speculative_decoding/speculative_decoding_impl.hpp"
-#include "sampler.hpp"
-#include "lm_encoding.hpp"
 
 namespace ov {
 namespace genai {
 
-class StatefulLLMPipeline final : public LLMPipelineImplBase {
-public:
-    ov::InferRequest m_model_runner;
-    bool is_chat_conversation = false;
-    bool m_trust_encoded_history = true;
-    ChatHistory m_history;
-    std::string m_templated_chat_history = {};
-    std::vector<int64_t> m_tokenized_chat_history;
-    ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
-    size_t m_kv_cache_seq_length_axis = 2;
-    Sampler m_sampler;
-    // Tail of previous output in chat mode is missing in KV cache, let's keep it
-    std::optional<int64_t> m_last_disappeared_token = std::nullopt;
-    // If sequence contains some symbols, which could be ambiguously encoded by tokenizer, we need to trim kv cache
-    // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history 
-    // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history
-    ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
-
-    StatefulLLMPipeline(
-        const ov::InferRequest& request,
-        const ov::genai::Tokenizer& tokenizer,
-        OptionalGenerationConfig generation_config=std::nullopt
-    ) : LLMPipelineImplBase(tokenizer),
-       m_model_runner(request) {
-       GenerationConfig default_config;
-       m_generation_config = (generation_config.has_value()) ? *generation_config : default_config;
-    }
-
-    StatefulLLMPipeline(
-        const std::filesystem::path& models_path,
-        const ov::genai::Tokenizer& tokenizer,
-        const std::string& device,
-        const ov::AnyMap& plugin_config
-    ) : StatefulLLMPipeline{
-            ov::genai::utils::read_model_with_config(models_path, plugin_config),
-            tokenizer, 
-            device, 
-            plugin_config, 
-            utils::from_config_json_if_exists(models_path)
-        } {}
-
-    StatefulLLMPipeline(
-        const std::shared_ptr<ov::Model>& model,
-        const ov::genai::Tokenizer& tokenizer,
-        const std::string& device,
-        const ov::AnyMap& config,
-        const ov::genai::GenerationConfig& generation_config
-    ) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
-        ov::CompiledModel compiled_model;
-        auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
-        utils::slice_matmul_stateful_model(model);
-        m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
-
-        if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
-            m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
-            m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device);   // TODO: Make the prefix name configurable
-            compiled_model = utils::singleton_core().compile_model(model, device, *filtered_plugin_config);
-            m_model_runner = compiled_model.create_infer_request();
-        } else {
-            compiled_model = utils::singleton_core().compile_model(model, device, plugin_config);
-            m_model_runner = compiled_model.create_infer_request();
-        }
-        ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
-
-        // If eos_token_id was not provided, take value
-        if (m_generation_config.eos_token_id == -1)
-            m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
-
-        m_sampler.set_seed(m_generation_config.rng_seed);
-    }
-
-    StatefulLLMPipeline(
-        const std::filesystem::path& models_path,
-        const std::string& device,
-        const ov::AnyMap& plugin_config
-    ) : StatefulLLMPipeline{models_path, Tokenizer(models_path), device, plugin_config} {}
-
-    DecodedResults generate(
-        StringInputs inputs,
-        OptionalGenerationConfig generation_config,
-        StreamerVariant streamer
-    ) override {
-        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
-            m_chat_input_type = ov::genai::utils::GenerationChatInputsType::STRING;
-
-        if (is_chat_conversation)
-            OPENVINO_ASSERT(m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS,
-                            "Chat doesn't support switching between input types. Please, continue using EncodedInputs or restart the chat.");
-
-        auto start_time = std::chrono::steady_clock::now();
-        GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
-        // If eos_token_id was not provided, take value from default m_generation_config
-        if (config.eos_token_id == -1)
-            config.set_eos_token_id(m_generation_config.eos_token_id);
-        config.validate();
-
-        TokenizedInputs encoded_input;
-
-        if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
-            OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
-            encoded_input = m_tokenizer.encode(*input_vector);
-        } else if (auto input_prompt = std::get_if<std::string>(&inputs)) {
-            std::string& prompt = *input_prompt;
-
-            if (is_chat_conversation) {
-                // KV cache in model already contains prompts and answers from previous iterations.
-                // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
-                // token_ids = {<bos token>, ...<valuable tokens>}. So if tokenizer applies only to the new prompt,
-                // <bos token> will be inserted on every iteration.
-                // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt
-                // and takes only the difference between them.
-                // The chat history cannot be saved as already encoded tokens because generate call doesn't return <eos> token, but
-                // KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
-
-                m_history.push_back({{"role", "user"}, {"content", prompt}});
-                constexpr bool add_generation_prompt = true;
-                auto new_templated_chat_history  = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
-                // Do not add special tokens in chat scenario to be aligned with HF.
-                auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false));
-                auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
-
-                // some symbols combinations can be encoded by the tokenizer in different ways
-                // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
-                // so let's check it out, find the trusted part and use it in on the next step
-                size_t trusted_history_length = 0;
-                if (!m_tokenized_chat_history.empty()) {
-                    std::set<int64_t> stop_tokens = config.stop_token_ids;
-                    trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
-                    m_trust_encoded_history = trusted_history_length == SIZE_MAX;
-                }
-
-                if (m_tokenized_chat_history.empty()) {
-                    encoded_input = new_chat_tokens;
-                } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) {
-                    // does_kv_cache_need_to_update will be true here if beam search is activated
-                    // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly
-                    // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager
-                    if (m_kv_history_manager.does_kv_cache_need_to_update()) {
-                        trusted_history_length = m_kv_history_manager.trusted_history_length;
-                    } else {
-                        m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_chat_history.size() - trusted_history_length;
-                        // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it
-                        m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0;
-                    }
-
-                    ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
-                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length},
-                                                       new_chat_tokens.input_ids.data<int64_t>() + trusted_history_length);
-
-                    ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape());
-                    std::fill_n(new_attention_mask.data<int64_t>(), new_tensor.get_shape()[1], 1);
-
-                    encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
-                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length});
-                    new_tensor.copy_to(encoded_input.input_ids);
-                    encoded_input.attention_mask = new_attention_mask;
-                    m_last_disappeared_token = std::nullopt;
-                } else {
-                    encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
-                }
-                m_templated_chat_history = new_templated_chat_history;
-
-                m_tokenized_chat_history.clear();
-                m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
-                std::copy_n(new_chat_tokens.input_ids.data<int64_t>(), new_chat_tokens.input_ids.get_size(),
-                            std::back_inserter(m_tokenized_chat_history));
-
-                // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
-            } else {
-                encoded_input = m_tokenizer.encode(prompt);
-            }
-        }
-
-        auto encode_stop_time =  std::chrono::steady_clock::now();
-        auto encoded_results = generate(encoded_input, config, streamer);
-
-        auto decode_start_time =  std::chrono::steady_clock::now();
-        DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
-        auto decode_stop_time =  std::chrono::steady_clock::now();
-
-        if (is_chat_conversation) {
-            // Tail of chat template is missing in KV cache.
-            // Find the tail to concatenate it with the next input prompt.
-            auto answer = decoded_results.texts[0];
-            m_templated_chat_history.append(answer);
-            m_history.push_back({{"role", "assistant"}, {"content", answer}});
-        }
-
-        // generate_durations
-        decoded_results.perf_metrics = encoded_results.perf_metrics;
-
-        auto& raw_counters = decoded_results.perf_metrics.raw_metrics;
-        auto stop_time = std::chrono::steady_clock::now();
-        raw_counters.generate_durations = std::vector<MicroSeconds>();
-        raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
-        raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time));
-        raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time));
-
-        // Added tokenization/detokenization times, and updated generate duration, need to reevaluate statistics.
-        decoded_results.perf_metrics.m_evaluated = false;
-        decoded_results.perf_metrics.evaluate_statistics(start_time);
-        return decoded_results;
-    }
-
-    void reset_kv_state() {
-        if(m_adapter_controller) {
-            for(auto& state: m_model_runner.query_state()) {
-                if(!m_adapter_controller->has_state_name(state.get_name())) {
-                    state.reset();
-                }
-            }
-        } else {
-            m_model_runner.reset_state();
-        }
-    }
-
-    EncodedResults generate(
-        const EncodedInputs& inputs,
-        OptionalGenerationConfig generation_config,
-        StreamerVariant streamer
-    ) override {
-        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
-            m_chat_input_type = ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS;
-
-        if (is_chat_conversation)
-            // if chat was run in StringInputs mode, but it was called EncodedInputs generate, last m_history entry will be with assistant role
-            OPENVINO_ASSERT(m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user",
-                            "Chat doesn't support switching between input types. Please, continue using StringInputs or restart the chat.");
-
-        auto start_time = std::chrono::steady_clock::now();
-        ov::Tensor input_ids;
-        ov::Tensor attention_mask;
-        if (auto data = std::get_if<ov::Tensor>(&inputs)) {
-            input_ids = *data;
-            attention_mask = ov::genai::utils::init_attention_mask(input_ids);
-        } else if (auto data = std::get_if<TokenizedInputs>(&inputs)) {
-            input_ids = data->input_ids;
-            attention_mask = data->attention_mask;
-        }
-
-        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
-            std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
-
-        // Tail of previous output in chat mode is missing in KV cache.
-        if (m_last_disappeared_token.has_value()) {
-            attention_mask = ov::genai::utils::push_front_inputs(attention_mask, 1);
-            input_ids = ov::genai::utils::push_front_inputs(input_ids, *m_last_disappeared_token);
-        }
-
-        GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
-
-        // If eos_token_id was not provided, take value from default m_generation_config
-        if (config.eos_token_id == -1)
-            config.set_eos_token_id(m_generation_config.eos_token_id);
-        config.validate();
-
-        // Stateful pipeline does not provide logprobs for prompt tokens
-        OPENVINO_ASSERT(config.echo == false, "Echo is not supported in the stateful pipeline");
-
-        std::shared_ptr<StreamerBase> streamer_ptr;
-        if (auto streamer_obj = std::get_if<std::monostate>(&streamer)) {
-            streamer_ptr = nullptr;
-        } else if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&streamer)) {
-            streamer_ptr = *streamer_obj;
-        } else if (auto callback = std::get_if<std::function<bool(std::string)>>(&streamer)) {
-            streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
-        }
-
-        auto batch_size = input_ids.get_shape().at(0);
-        OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
-            (config.is_greedy_decoding() || config.is_multinomial()),
-            "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
-
-        auto num_inputs = m_model_runner.get_compiled_model().inputs().size();
-        OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: "
-                        "either (input_ids, attention_mask, beam_idx) or "
-                        "(input_ids, attention_mask, position_ids, beam_idx) "
-                        "but you have '" + std::to_string(num_inputs) + "' inputs");
-
-        ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_history_manager.num_tokens_to_remove_from_kv_cache, m_kv_cache_seq_length_axis, m_adapter_controller);
-
-        size_t kv_cache_len = 0;
-        ov::Tensor concatenated_attention_mask;
-        if (is_chat_conversation && !m_tokenized_chat_history.empty()) {
-            OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
-            // If history is saved in KV cache, concatenate new attention_mask with the already existing.
-            // Between subsequent runs attention_mask should not be modified.
-            auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
-            auto prompt_len = attention_mask.get_shape()[1];
-
-            kv_cache_len = atten_mask_history.get_shape()[1] - m_kv_history_manager.num_tokens_to_remove_from_kv_cache;
-
-            ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
-            auto start_atten_hst = atten_mask_history.data<int64_t>();
-
-            std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
-                    new_atten_mask.data<int64_t>());
-            std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
-                    new_atten_mask.data<int64_t>() + kv_cache_len);
-            concatenated_attention_mask = new_atten_mask;
-        } else {
-            concatenated_attention_mask = attention_mask;
-        }
-
-        size_t prev_attn_mask_size = concatenated_attention_mask.get_shape()[1];
-
-        bool position_ids_available = (num_inputs == 4);
-        std::optional<ov::Tensor> position_ids = std::nullopt;
-        if (position_ids_available) {
-            position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
-            utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
-        }
-
-        if(m_adapter_controller) {
-            m_adapter_controller->apply(m_model_runner, config.adapters);
-        }
-
-        if (is_chat_conversation && !m_trust_encoded_history) {
-            m_trust_encoded_history = true;
-            m_kv_history_manager.reset();
-        }
-
-        std::vector<SequenceGroup::Ptr> requests;
-        size_t block_size = 1;
-        bool enable_prefix_caching = false;
-
-        for (size_t request_id = 0; request_id < batch_size; request_id++) {
-            SequenceGroup::Ptr sequence_group;
-            if (is_chat_conversation) {
-                ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
-                sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
-            } else {
-                size_t seq_len = input_ids.get_shape().at(1);
-                size_t batch_offset = request_id * seq_len;
-                const int64_t* prompt_start = input_ids.data<const int64_t>() + batch_offset;
-                std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
-
-                sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
-            }
-
-            sequence_group->set_sequence_group_ptr(sequence_group);
-            requests.push_back(sequence_group);
-        }
-
-        if (m_sampler.get_seed() != config.rng_seed) {
-            m_sampler.set_seed(config.rng_seed);
-        }
-
-        ov::genai::EncodedResults result;
-        std::tie(result, m_last_disappeared_token) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask,
-                                                                                       streamer_ptr, m_sampler, requests, position_ids, std::nullopt);
-
-        if (is_chat_conversation) {
-            // force remove from kv_cache last answer
-            if (config.is_beam_search() && m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
-                m_kv_history_manager.trusted_history_length = m_tokenized_chat_history.size();
-                m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
-            }
-
-            std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
-        } else {
-            reset_kv_state();
-            m_last_disappeared_token = std::nullopt;
-        }
-
-        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
-            std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
-
-        auto stop_time = std::chrono::steady_clock::now();
-
-        // If is called without tokenization then that stat will not be reported.
-        auto& metrics = result.perf_metrics;
-        metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
-        metrics.load_time = this->m_load_time_ms;
-        metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
-        metrics.evaluate_statistics(start_time);
-        return result;
-    }
-
-    void start_chat(const std::string& system_message) override {
-        is_chat_conversation = true;
-        m_trust_encoded_history = true;
-        m_kv_history_manager.reset();
-        m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
-        m_last_disappeared_token = std::nullopt;
-        if (!m_tokenized_chat_history.empty()) {
-            reset_kv_state();
-            m_history = {};
-            m_templated_chat_history = "";
-            m_tokenized_chat_history.clear();
-        }
-        if (system_message.empty())
-            return;
-
-        m_history.push_back({{"role", "system"}, {"content", system_message}});
-        constexpr bool add_generation_prompt = false;
+namespace {
 
-        m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
-    }
+/* 
+* NPU reads some properties from the config file, but when LLMPipeline is initialized
+* from the model_str and weights_tensor, there are not files. 
+* In the later case ModelDesc is stored in properties.
+* This function pops ModelDescr from the the properties and returns a pair of updated properties and ModelDescr.
+*/
+std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
+    ov::AnyMap main_properties = properties;
+    ov::genai::ModelConfigDesc model_descr;
 
-    void finish_chat() override {
-        is_chat_conversation = false;
-        m_trust_encoded_history = true;
-        m_kv_history_manager.reset();
-        m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
-        m_last_disappeared_token = std::nullopt;
-        if (!m_tokenized_chat_history.empty()) {
-            reset_kv_state();
-            m_history.clear();
-            m_templated_chat_history.clear();
-            m_tokenized_chat_history.clear();
+    auto pop_property = [](ov::AnyMap& orig_propertis, const std::string& key, auto& value) {
+        if (orig_propertis.find(key) != orig_propertis.end()) {
+            value = orig_propertis.at(key).as<std::decay_t<decltype(value)>>();
+            orig_propertis.erase(key);
         }
-    }
-};
-
-DecodedResults LLMPipeline::generate(
-        StringInputs inputs,
-        OptionalGenerationConfig generation_config,
-        StreamerVariant streamer
-) {
-    return m_pimpl->generate(inputs, generation_config, streamer);
-}
-
-DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) {
-    auto config_arg = utils::get_config_from_map(config_map);
-    GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
-    config.update_generation_config(config_map);
-
-    return m_pimpl->generate(text, config, utils::get_streamer_from_map(config_map));
-}
-
-EncodedResults LLMPipeline::generate(
-    const EncodedInputs& inputs,
-    OptionalGenerationConfig generation_config,
-    StreamerVariant streamer
-) {
-    return m_pimpl->generate(inputs, generation_config, streamer);
+    };
+    pop_property(main_properties, "name_or_path", model_descr.name_or_path);
+    pop_property(main_properties, "type", model_descr.type);
+    pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
+    
+    return {main_properties, model_descr};
 }
 
-EncodedResults LLMPipeline::generate(const EncodedInputs& inputs, const ov::AnyMap& config_map) {
-    auto config_arg = utils::get_config_from_map(config_map);
-    GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
-    config.update_generation_config(config_map);
+} // namespace
 
-    return m_pimpl->generate(inputs, config, utils::get_streamer_from_map(config_map));
-}
 
 std::pair<std::string, Any> streamer(StreamerVariant func) {
     if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&func)) {
@@ -509,194 +83,7 @@ std::pair<std::string, Any> draft_model(
     return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
 }
 
-}  // namespace genai
-}  // namespace ov
-
-namespace {
-using namespace ov::genai;
-
-template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
-template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
-
-Tokenizer dont_construct() {
-    OPENVINO_THROW("Continuous Batching backend can't be constructed"
-        "from ireq because the model must be transformed");
-}
-
-class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
-public:
-    ContinuousBatchingPipeline m_impl;
-
-    ContinuousBatchingAdapter(
-        const ov::InferRequest& request,
-        const Tokenizer& tokenizer,
-        OptionalGenerationConfig generation_config
-    ): LLMPipelineImplBase{dont_construct()}, m_impl{{}, {}, {}} {}
-
-    ContinuousBatchingAdapter(
-        const std::filesystem::path& models_path,
-        const Tokenizer& tokenizer,
-        const SchedulerConfig& scheduler_config,
-        const std::string& device,
-        const ov::AnyMap& plugin_config
-    ): LLMPipelineImplBase{tokenizer}, m_impl{
-        models_path.string(),
-        tokenizer,
-        scheduler_config,
-        device,
-        plugin_config} {
-        m_generation_config = m_impl.get_config();
-    }
-
-    ContinuousBatchingAdapter(
-        const std::string& model_str,
-        const ov::Tensor& weights_tensor,
-        const Tokenizer& tokenizer,
-        const SchedulerConfig& scheduler_config,
-        const std::string& device,
-        const ov::AnyMap& plugin_config,
-        const ov::genai::GenerationConfig& generation_config
-    ): LLMPipelineImplBase{tokenizer}, m_impl{
-        model_str, 
-        weights_tensor,
-        tokenizer,
-        scheduler_config,
-        device,
-        plugin_config,
-        generation_config} {}
-
-    ContinuousBatchingAdapter(
-        const std::filesystem::path& models_path,
-        const SchedulerConfig& scheduler_config,
-        const std::string& device,
-        const ov::AnyMap& plugin_config
-    ): LLMPipelineImplBase{Tokenizer(models_path.string())}, m_impl{
-        models_path.string(),
-        m_tokenizer,
-        scheduler_config,
-        device,
-        plugin_config} {
-        m_generation_config = m_impl.get_config();
-    }
-
-    DecodedResults generate(
-        StringInputs inputs,
-        OptionalGenerationConfig generation_config,
-        StreamerVariant streamer
-    ) override {
-        std::vector<std::string> prompts = std::visit(overloaded{
-            [](const std::string& prompt) {
-                return std::vector{prompt};
-            },
-            [](std::vector<std::string>& prompts) {
-                return prompts;
-            }
-        }, inputs);
-        const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
-        // -1 == config.eos_token_id and config.validate() are handled in m_impl.
-        std::vector<GenerationResult> generated = m_impl.generate(
-            prompts,
-            std::vector<GenerationConfig>{prompts.size(), config},
-            streamer
-        );
-        std::vector<std::string> plain_replies;
-        std::vector<float> plain_scores;
-        for (GenerationResult& res : generated) {
-            OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
-            std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies));
-            std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
-        }
-        return {std::move(plain_replies), std::move(plain_scores)};
-    }
-
-    EncodedResults generate(
-        const EncodedInputs& inputs,
-        OptionalGenerationConfig generation_config,
-        StreamerVariant streamer
-    ) override {
-        std::vector<ov::Tensor> input_ids = std::visit(overloaded{
-            [](const ov::Tensor& inp) {
-                size_t batch_size = inp.get_shape().at(0);
-                if (1 == batch_size) {
-                    return std::vector{inp};
-                }
-                std::vector<ov::Tensor> input_ids;
-                input_ids.reserve(batch_size);
-                size_t max_len = inp.get_shape().at(1);
-                const int64_t* const source = inp.data<const int64_t>();
-                for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
-                    input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
-                    int64_t* destination = input_ids.back().data<int64_t>();
-                    std::copy_n(source + batch_id * max_len, max_len, destination);
-                }
-                return input_ids;
-            },
-            [](const TokenizedInputs& inp) {
-                size_t batch_size = inp.input_ids.get_shape().at(0);
-                std::vector<ov::Tensor> input_ids;
-                input_ids.reserve(batch_size);
-                size_t max_len = inp.input_ids.get_shape().at(1);
-                const int64_t* const source = inp.input_ids.data<const int64_t>();
-                const int64_t* const attention_mask = inp.attention_mask.data<const int64_t>();
-                for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) {
-                    input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len));
-                    int64_t* destination = input_ids.back().data<int64_t>();
-                    size_t copy_count = 0;
-                    for (size_t idx = 0; idx < max_len; ++idx) {
-                        if (1 == attention_mask[batch_id * max_len + idx]) {
-                            destination[copy_count++] = source[batch_id * max_len + idx];
-                        }
-                    }
-                    input_ids.back().set_shape({1, copy_count});
-                }
-                return input_ids;
-            }
-        }, inputs);
-        const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config;
-        // -1 == config.eos_token_id and config.validate() are handled in m_impl.
-        std::vector<EncodedGenerationResult> generated = m_impl.generate(input_ids, std::vector<GenerationConfig>{input_ids.size(), config}, streamer);
-        std::vector<std::vector<int64_t>> plain_tokens;
-        std::vector<float> plain_scores;
-        for (EncodedGenerationResult& res : generated) {
-            OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
-            std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens));
-            std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
-        }
-        return {std::move(plain_tokens), std::move(plain_scores)};
-    }
-
-    void start_chat(const std::string& system_message) override {
-        m_impl.start_chat();
-    };
-
-    void finish_chat() override {
-        m_impl.finish_chat();
-    };
-};
-
-/* 
-* NPU reads some properties from the config file, but when LLMPipeline is initialized
-* from the model_str and weights_tensor, there are not files. 
-* In the later case ModelDesc is stored in properties.
-* This function pops ModelDescr from the the properties and returns a pair of updated properties and ModelDescr.
-*/
-std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
-    ov::AnyMap main_properties = properties;
-    ov::genai::ModelConfigDesc model_descr;
-
-    auto pop_property = [](ov::AnyMap& orig_propertis, const std::string& key, auto& value) {
-        if (orig_propertis.find(key) != orig_propertis.end()) {
-            value = orig_propertis.at(key).as<std::decay_t<decltype(value)>>();
-            orig_propertis.erase(key);
-        }
-    };
-    pop_property(main_properties, "name_or_path", model_descr.name_or_path);
-    pop_property(main_properties, "type", model_descr.type);
-    pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
-    
-    return {main_properties, model_descr};
-}
-}
+// Public LLMPipeline
 
 ov::genai::LLMPipeline::LLMPipeline(
     const ov::InferRequest& request,
@@ -704,8 +91,6 @@ ov::genai::LLMPipeline::LLMPipeline(
     OptionalGenerationConfig generation_config) {
     auto start_time = std::chrono::steady_clock::now();
     m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer, generation_config);
-    auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
 }
 
 ov::genai::LLMPipeline::LLMPipeline(
@@ -724,8 +109,7 @@ ov::genai::LLMPipeline::LLMPipeline(
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
-    auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+    m_pimpl->save_load_time(start_time);
 }
 
 ov::genai::LLMPipeline::LLMPipeline(
@@ -744,8 +128,7 @@ ov::genai::LLMPipeline::LLMPipeline(
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, config);
     }
-    auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+    m_pimpl->save_load_time(start_time);
 }
 
 ov::genai::LLMPipeline::LLMPipeline(
@@ -795,16 +178,45 @@ ov::genai::LLMPipeline::LLMPipeline(
             plugin_config,
             generation_config);
     }
-    auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+    m_pimpl->save_load_time(start_time);
+}
+
+DecodedResults LLMPipeline::generate(
+        StringInputs inputs,
+        OptionalGenerationConfig generation_config,
+        StreamerVariant streamer) {
+    return m_pimpl->generate(inputs, generation_config, streamer);
+}
+
+DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) {
+    auto config_arg = utils::get_config_from_map(config_map);
+    GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
+    config.update_generation_config(config_map);
+
+    return m_pimpl->generate(text, config, utils::get_streamer_from_map(config_map));
+}
+
+EncodedResults LLMPipeline::generate(
+    const EncodedInputs& inputs,
+    OptionalGenerationConfig generation_config,
+    StreamerVariant streamer) {
+    return m_pimpl->generate(inputs, generation_config, streamer);
+}
+
+EncodedResults LLMPipeline::generate(const EncodedInputs& inputs, const ov::AnyMap& config_map) {
+    auto config_arg = utils::get_config_from_map(config_map);
+    GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
+    config.update_generation_config(config_map);
+
+    return m_pimpl->generate(inputs, config, utils::get_streamer_from_map(config_map));
 }
 
 ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
-    return m_pimpl->m_generation_config;
+    return m_pimpl->get_generation_config();
 }
 
 ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() {
-    return m_pimpl->m_tokenizer;
+    return m_pimpl->get_tokenizer();
 }
 
 void ov::genai::LLMPipeline::start_chat(const std::string& system_message) {
@@ -816,13 +228,10 @@ void ov::genai::LLMPipeline::finish_chat() {
 }
 
 void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) {
-    int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id;
-    m_pimpl->m_generation_config = config;
-    // if eos_token_id was not provided in config forward from default config
-    if (config.eos_token_id == -1)
-        m_pimpl->m_generation_config.set_eos_token_id(default_eos_token_id);
-
-    m_pimpl->m_generation_config.validate();
+    m_pimpl->set_generation_config(config);
 }
 
 ov::genai::LLMPipeline::~LLMPipeline() = default;
+
+} // namespace genai
+} // namespace ov
diff --git a/src/cpp/src/llm_pipeline_base.hpp b/src/cpp/src/llm_pipeline_base.hpp
index b2ad581e0b..5573272d7e 100644
--- a/src/cpp/src/llm_pipeline_base.hpp
+++ b/src/cpp/src/llm_pipeline_base.hpp
@@ -13,8 +13,26 @@ namespace genai {
 class LLMPipelineImplBase {
 public:
     LLMPipelineImplBase(const Tokenizer& tokenizer,
-                        const GenerationConfig& config = {})
-    : m_tokenizer(tokenizer), m_generation_config(config) {
+                        const GenerationConfig& config)
+    : m_tokenizer(tokenizer), m_generation_config(config) { }
+
+    Tokenizer get_tokenizer() {
+        return m_tokenizer;
+    }
+
+    GenerationConfig get_generation_config() const {
+        return m_generation_config;
+    }
+
+    void set_generation_config(GenerationConfig config) {
+        int64_t default_eos_token_id = m_generation_config.eos_token_id;
+        m_generation_config = config;
+
+        // if eos_token_id was not provided in config forward from default config
+        if (m_generation_config.eos_token_id == -1)
+            m_generation_config.set_eos_token_id(default_eos_token_id);
+
+        m_generation_config.validate();
     }
 
     virtual DecodedResults generate(
@@ -34,6 +52,12 @@ class LLMPipelineImplBase {
 
     virtual ~LLMPipelineImplBase() = default;
 
+    void save_load_time(std::chrono::steady_clock::time_point start_time) {
+        auto stop_time = std::chrono::steady_clock::now();
+        m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+    }
+
+protected:
     Tokenizer m_tokenizer;
     GenerationConfig m_generation_config;
     std::optional<AdapterController> m_adapter_controller;
diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
new file mode 100644
index 0000000000..bdaae50b04
--- /dev/null
+++ b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -0,0 +1,405 @@
+
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "llm_pipeline_stateful.hpp"
+
+#include "lora_helper.hpp"
+#include "lm_encoding.hpp"
+#include "text_callback_streamer.hpp"
+
+
+namespace ov::genai {
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+    const ov::InferRequest& request,
+    const ov::genai::Tokenizer& tokenizer,
+    OptionalGenerationConfig generation_config)
+    : LLMPipelineImplBase(tokenizer, generation_config.has_value() ? *generation_config : GenerationConfig()),
+    m_model_runner(request) {}
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+    const std::filesystem::path& models_path,
+    const ov::genai::Tokenizer& tokenizer,
+    const std::string& device,
+    const ov::AnyMap& plugin_config)
+    : StatefulLLMPipeline{
+        ov::genai::utils::read_model_with_config(models_path, plugin_config),
+        tokenizer,
+        device,
+        plugin_config,
+        utils::from_config_json_if_exists(models_path)
+    } {}
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+    const std::shared_ptr<ov::Model>& model,
+    const ov::genai::Tokenizer& tokenizer,
+    const std::string& device,
+    const ov::AnyMap& config,
+    const ov::genai::GenerationConfig& generation_config)
+    : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
+    ov::CompiledModel compiled_model;
+    auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
+    utils::slice_matmul_stateful_model(model);
+    m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
+
+    if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
+        m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
+        m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device);   // TODO: Make the prefix name configurable
+        compiled_model = utils::singleton_core().compile_model(model, device, *filtered_plugin_config);
+        m_model_runner = compiled_model.create_infer_request();
+    } else {
+        compiled_model = utils::singleton_core().compile_model(model, device, plugin_config);
+        m_model_runner = compiled_model.create_infer_request();
+    }
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
+
+    // If eos_token_id was not provided, take value
+    if (m_generation_config.eos_token_id == -1)
+        m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
+
+    m_sampler.set_seed(m_generation_config.rng_seed);
+}
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+    const std::filesystem::path& models_path,
+    const std::string& device,
+    const ov::AnyMap& plugin_config)
+    : StatefulLLMPipeline{models_path, Tokenizer(models_path), device, plugin_config} {}
+
+DecodedResults StatefulLLMPipeline::generate(
+    StringInputs inputs,
+    OptionalGenerationConfig generation_config,
+    StreamerVariant streamer) {
+    if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
+        m_chat_input_type = ov::genai::utils::GenerationChatInputsType::STRING;
+
+    if (is_chat_conversation)
+        OPENVINO_ASSERT(m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS,
+                        "Chat doesn't support switching between input types. Please, continue using EncodedInputs or restart the chat.");
+
+    auto start_time = std::chrono::steady_clock::now();
+    GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
+    // If eos_token_id was not provided, take value from default m_generation_config
+    if (config.eos_token_id == -1)
+        config.set_eos_token_id(m_generation_config.eos_token_id);
+    config.validate();
+
+    TokenizedInputs encoded_input;
+
+    if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
+        OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
+        encoded_input = m_tokenizer.encode(*input_vector);
+    } else if (auto input_prompt = std::get_if<std::string>(&inputs)) {
+        std::string& prompt = *input_prompt;
+
+        if (is_chat_conversation) {
+            // KV cache in model already contains prompts and answers from previous iterations.
+            // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
+            // token_ids = {<bos token>, ...<valuable tokens>}. So if tokenizer applies only to the new prompt,
+            // <bos token> will be inserted on every iteration.
+            // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt
+            // and takes only the difference between them.
+            // The chat history cannot be saved as already encoded tokens because generate call doesn't return <eos> token, but
+            // KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
+
+            m_history.push_back({{"role", "user"}, {"content", prompt}});
+            constexpr bool add_generation_prompt = true;
+            auto new_templated_chat_history  = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+            // Do not add special tokens in chat scenario to be aligned with HF.
+            auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false));
+            auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
+
+            // some symbols combinations can be encoded by the tokenizer in different ways
+            // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
+            // so let's check it out, find the trusted part and use it in on the next step
+            size_t trusted_history_length = 0;
+            if (!m_tokenized_chat_history.empty()) {
+                std::set<int64_t> stop_tokens = config.stop_token_ids;
+                trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
+                m_trust_encoded_history = trusted_history_length == SIZE_MAX;
+            }
+
+            if (m_tokenized_chat_history.empty()) {
+                encoded_input = new_chat_tokens;
+            } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) {
+                // does_kv_cache_need_to_update will be true here if beam search is activated
+                // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly
+                // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager
+                if (m_kv_history_manager.does_kv_cache_need_to_update()) {
+                    trusted_history_length = m_kv_history_manager.trusted_history_length;
+                } else {
+                    m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_chat_history.size() - trusted_history_length;
+                    // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it
+                    m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0;
+                }
+
+                ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
+                                                    {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length},
+                                                    new_chat_tokens.input_ids.data<int64_t>() + trusted_history_length);
+
+                ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape());
+                std::fill_n(new_attention_mask.data<int64_t>(), new_tensor.get_shape()[1], 1);
+
+                encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
+                                                    {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length});
+                new_tensor.copy_to(encoded_input.input_ids);
+                encoded_input.attention_mask = new_attention_mask;
+                m_last_disappeared_token = std::nullopt;
+            } else {
+                encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
+            }
+            m_templated_chat_history = new_templated_chat_history;
+
+            m_tokenized_chat_history.clear();
+            m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
+            std::copy_n(new_chat_tokens.input_ids.data<int64_t>(), new_chat_tokens.input_ids.get_size(),
+                        std::back_inserter(m_tokenized_chat_history));
+
+            // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
+        } else {
+            encoded_input = m_tokenizer.encode(prompt);
+        }
+    }
+
+    auto encode_stop_time =  std::chrono::steady_clock::now();
+    auto encoded_results = generate(encoded_input, config, streamer);
+
+    auto decode_start_time =  std::chrono::steady_clock::now();
+    DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
+    auto decode_stop_time =  std::chrono::steady_clock::now();
+
+    if (is_chat_conversation) {
+        // Tail of chat template is missing in KV cache.
+        // Find the tail to concatenate it with the next input prompt.
+        auto answer = decoded_results.texts[0];
+        m_templated_chat_history.append(answer);
+        m_history.push_back({{"role", "assistant"}, {"content", answer}});
+    }
+
+    // generate_durations
+    decoded_results.perf_metrics = encoded_results.perf_metrics;
+
+    auto& raw_counters = decoded_results.perf_metrics.raw_metrics;
+    auto stop_time = std::chrono::steady_clock::now();
+    raw_counters.generate_durations = std::vector<MicroSeconds>();
+    raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+    raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time));
+    raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time));
+
+    // Added tokenization/detokenization times, and updated generate duration, need to reevaluate statistics.
+    decoded_results.perf_metrics.m_evaluated = false;
+    decoded_results.perf_metrics.evaluate_statistics(start_time);
+    return decoded_results;
+}
+
+EncodedResults StatefulLLMPipeline::generate(
+    const EncodedInputs& inputs,
+    OptionalGenerationConfig generation_config,
+    StreamerVariant streamer) {
+    if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
+        m_chat_input_type = ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS;
+
+    if (is_chat_conversation)
+        // if chat was run in StringInputs mode, but it was called EncodedInputs generate, last m_history entry will be with assistant role
+        OPENVINO_ASSERT(m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user",
+                        "Chat doesn't support switching between input types. Please, continue using StringInputs or restart the chat.");
+
+    auto start_time = std::chrono::steady_clock::now();
+    ov::Tensor input_ids;
+    ov::Tensor attention_mask;
+    if (auto data = std::get_if<ov::Tensor>(&inputs)) {
+        input_ids = *data;
+        attention_mask = ov::genai::utils::init_attention_mask(input_ids);
+    } else if (auto data = std::get_if<TokenizedInputs>(&inputs)) {
+        input_ids = data->input_ids;
+        attention_mask = data->attention_mask;
+    }
+
+    if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+        std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
+
+    // Tail of previous output in chat mode is missing in KV cache.
+    if (m_last_disappeared_token.has_value()) {
+        attention_mask = ov::genai::utils::push_front_inputs(attention_mask, 1);
+        input_ids = ov::genai::utils::push_front_inputs(input_ids, *m_last_disappeared_token);
+    }
+
+    GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
+
+    // If eos_token_id was not provided, take value from default m_generation_config
+    if (config.eos_token_id == -1)
+        config.set_eos_token_id(m_generation_config.eos_token_id);
+    config.validate();
+
+    // Stateful pipeline does not provide logprobs for prompt tokens
+    OPENVINO_ASSERT(config.echo == false, "Echo is not supported in the stateful pipeline");
+
+    std::shared_ptr<StreamerBase> streamer_ptr;
+    if (auto streamer_obj = std::get_if<std::monostate>(&streamer)) {
+        streamer_ptr = nullptr;
+    } else if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&streamer)) {
+        streamer_ptr = *streamer_obj;
+    } else if (auto callback = std::get_if<std::function<bool(std::string)>>(&streamer)) {
+        streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
+    }
+
+    auto batch_size = input_ids.get_shape().at(0);
+    OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
+        (config.is_greedy_decoding() || config.is_multinomial()),
+        "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
+
+    auto num_inputs = m_model_runner.get_compiled_model().inputs().size();
+    OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: "
+                    "either (input_ids, attention_mask, beam_idx) or "
+                    "(input_ids, attention_mask, position_ids, beam_idx) "
+                    "but you have '" + std::to_string(num_inputs) + "' inputs");
+
+    ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_history_manager.num_tokens_to_remove_from_kv_cache, m_kv_cache_seq_length_axis, m_adapter_controller);
+
+    size_t kv_cache_len = 0;
+    ov::Tensor concatenated_attention_mask;
+    if (is_chat_conversation && !m_tokenized_chat_history.empty()) {
+        OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
+        // If history is saved in KV cache, concatenate new attention_mask with the already existing.
+        // Between subsequent runs attention_mask should not be modified.
+        auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
+        auto prompt_len = attention_mask.get_shape()[1];
+
+        kv_cache_len = atten_mask_history.get_shape()[1] - m_kv_history_manager.num_tokens_to_remove_from_kv_cache;
+
+        ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
+        auto start_atten_hst = atten_mask_history.data<int64_t>();
+
+        std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
+                new_atten_mask.data<int64_t>());
+        std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
+                new_atten_mask.data<int64_t>() + kv_cache_len);
+        concatenated_attention_mask = new_atten_mask;
+    } else {
+        concatenated_attention_mask = attention_mask;
+    }
+
+    size_t prev_attn_mask_size = concatenated_attention_mask.get_shape()[1];
+
+    bool position_ids_available = (num_inputs == 4);
+    std::optional<ov::Tensor> position_ids = std::nullopt;
+    if (position_ids_available) {
+        position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
+        utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
+    }
+
+    if(m_adapter_controller) {
+        m_adapter_controller->apply(m_model_runner, config.adapters);
+    }
+
+    if (is_chat_conversation && !m_trust_encoded_history) {
+        m_trust_encoded_history = true;
+        m_kv_history_manager.reset();
+    }
+
+    std::vector<SequenceGroup::Ptr> requests;
+    size_t block_size = 1;
+    bool enable_prefix_caching = false;
+
+    for (size_t request_id = 0; request_id < batch_size; request_id++) {
+        SequenceGroup::Ptr sequence_group;
+        if (is_chat_conversation) {
+            ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
+            sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
+        } else {
+            size_t seq_len = input_ids.get_shape().at(1);
+            size_t batch_offset = request_id * seq_len;
+            const int64_t* prompt_start = input_ids.data<const int64_t>() + batch_offset;
+            std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
+
+            sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
+        }
+
+        sequence_group->set_sequence_group_ptr(sequence_group);
+        requests.push_back(sequence_group);
+    }
+
+    if (m_sampler.get_seed() != config.rng_seed) {
+        m_sampler.set_seed(config.rng_seed);
+    }
+
+    ov::genai::EncodedResults result;
+    std::tie(result, m_last_disappeared_token) = get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask,
+                                                                        streamer_ptr, m_sampler, requests, position_ids, std::nullopt);
+
+    if (is_chat_conversation) {
+        // force remove from kv_cache last answer
+        if (config.is_beam_search() && m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
+            m_kv_history_manager.trusted_history_length = m_tokenized_chat_history.size();
+            m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
+        }
+
+        std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+    } else {
+        reset_kv_state();
+        m_last_disappeared_token = std::nullopt;
+    }
+
+    if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+        std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+
+    auto stop_time = std::chrono::steady_clock::now();
+
+    // If is called without tokenization then that stat will not be reported.
+    auto& metrics = result.perf_metrics;
+    metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
+    metrics.load_time = m_load_time_ms;
+    metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+    metrics.evaluate_statistics(start_time);
+    return result;
+}
+
+void StatefulLLMPipeline::start_chat(const std::string& system_message) {
+    is_chat_conversation = true;
+    m_trust_encoded_history = true;
+    m_kv_history_manager.reset();
+    m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+    m_last_disappeared_token = std::nullopt;
+    if (!m_tokenized_chat_history.empty()) {
+        reset_kv_state();
+        m_history = {};
+        m_templated_chat_history = "";
+        m_tokenized_chat_history.clear();
+    }
+    if (system_message.empty())
+        return;
+
+    m_history.push_back({{"role", "system"}, {"content", system_message}});
+    constexpr bool add_generation_prompt = false;
+
+    m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+}
+
+void StatefulLLMPipeline::reset_kv_state() {
+    if(m_adapter_controller) {
+        for(auto& state: m_model_runner.query_state()) {
+            if(!m_adapter_controller->has_state_name(state.get_name())) {
+                state.reset();
+            }
+        }
+    } else {
+        m_model_runner.reset_state();
+    }
+}
+
+void StatefulLLMPipeline::finish_chat() {
+    is_chat_conversation = false;
+    m_trust_encoded_history = true;
+    m_kv_history_manager.reset();
+    m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+    m_last_disappeared_token = std::nullopt;
+    if (!m_tokenized_chat_history.empty()) {
+        reset_kv_state();
+        m_history.clear();
+        m_templated_chat_history.clear();
+        m_tokenized_chat_history.clear();
+    }
+}
+
+} // namespace ov::genai
diff --git a/src/cpp/src/llm_pipeline_stateful.hpp b/src/cpp/src/llm_pipeline_stateful.hpp
new file mode 100644
index 0000000000..dbf8d89391
--- /dev/null
+++ b/src/cpp/src/llm_pipeline_stateful.hpp
@@ -0,0 +1,77 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+
+#include "llm_pipeline_base.hpp"
+#include "sampler.hpp"
+#include "utils.hpp"
+
+namespace ov::genai {
+
+class StatefulLLMPipeline final : public LLMPipelineImplBase {
+    ov::InferRequest m_model_runner;
+    Sampler m_sampler;
+
+    // Chat scenario specific parameters
+    bool is_chat_conversation = false;
+    bool m_trust_encoded_history = true;
+    ChatHistory m_history;
+    std::string m_templated_chat_history = {};
+    std::vector<int64_t> m_tokenized_chat_history;
+    ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+    // Tail of previous output in chat mode is missing in KV cache, let's keep it
+    std::optional<int64_t> m_last_disappeared_token = std::nullopt;
+    // If sequence contains some symbols, which could be ambiguously encoded by tokenizer, we need to trim kv cache
+    // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history 
+    // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history
+    ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
+    size_t m_kv_cache_seq_length_axis = 2;
+
+    void reset_kv_state();
+public:
+
+    StatefulLLMPipeline(
+        const ov::InferRequest& request,
+        const ov::genai::Tokenizer& tokenizer,
+        OptionalGenerationConfig generation_config = std::nullopt
+    );
+
+    StatefulLLMPipeline(
+        const std::filesystem::path& models_path,
+        const ov::genai::Tokenizer& tokenizer,
+        const std::string& device,
+        const ov::AnyMap& plugin_config
+    );
+
+    StatefulLLMPipeline(
+        const std::shared_ptr<ov::Model>& model,
+        const ov::genai::Tokenizer& tokenizer,
+        const std::string& device,
+        const ov::AnyMap& config,
+        const ov::genai::GenerationConfig& generation_config
+    );
+
+    StatefulLLMPipeline(
+        const std::filesystem::path& models_path,
+        const std::string& device,
+        const ov::AnyMap& plugin_config
+    );
+
+    DecodedResults generate(
+        StringInputs inputs,
+        OptionalGenerationConfig generation_config,
+        StreamerVariant streamer
+    ) override;
+
+    EncodedResults generate(
+        const EncodedInputs& inputs,
+        OptionalGenerationConfig generation_config,
+        StreamerVariant streamer
+    ) override;
+
+    void start_chat(const std::string& system_message) override;
+
+    void finish_chat() override;
+};
+
+} // namespace ov::genai
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 6207c889a2..8f49bd471e 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -82,11 +82,7 @@ const std::string DRAFT_MODEL_ARG_NAME = "draft_model";
 template<typename Config = ov::genai::GenerationConfig>
 Config from_config_json_if_exists(const std::filesystem::path& models_path, const char config_name[] = "generation_config.json") {
     auto config_file_path = models_path / config_name;
-    if (std::filesystem::exists(config_file_path)) {
-        return Config{(config_file_path).string()};
-    } else {
-        return Config{};
-    }
+    return std::filesystem::exists(config_file_path) ? Config{config_file_path} : Config{};
 }
 
 ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map);

From 4be813ee3b8dacc8fba39b40cba2541089dfd597 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov <kozzzloff@list.ru>
Date: Mon, 30 Dec 2024 19:49:41 +0300
Subject: [PATCH 070/110] [WWB]: Added validation for Inpainting pipeline
 (#1451)

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .../tests/test_cli_image.py                   |  34 +++--
 .../whowhatbench/__init__.py                  |   4 +-
 .../{image2image.py => im2im_evaluator.py}    |   0
 .../whowhatbench/inpaint_evaluator.py         | 133 ++++++++++++++++++
 .../whowhatbench/model_loaders.py             |  57 +++++++-
 tools/who_what_benchmark/whowhatbench/wwb.py  |  27 +++-
 6 files changed, 238 insertions(+), 17 deletions(-)
 rename tools/who_what_benchmark/whowhatbench/{image2image.py => im2im_evaluator.py} (100%)
 create mode 100644 tools/who_what_benchmark/whowhatbench/inpaint_evaluator.py

diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
index 536d015612..7b966f049e 100644
--- a/tools/who_what_benchmark/tests/test_cli_image.py
+++ b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -1,3 +1,4 @@
+import itertools
 import subprocess  # nosec B404
 import os
 import shutil
@@ -9,6 +10,9 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+MODEL_CACHE = tempfile.mkdtemp()
+OV_IMAGE_MODELS = ["OpenVINO/stable-diffusion-v1-5-int8-ov"]
+
 
 def run_wwb(args):
     logger.info(" ".join(["TRANSFOREMRS_VERBOSITY=debug wwb"] + args))
@@ -17,6 +21,19 @@ def run_wwb(args):
     return result
 
 
+def setup_module():
+    for model_id in OV_IMAGE_MODELS:
+        MODEL_PATH = os.path.join(MODEL_CACHE, model_id.replace("/", "--"))
+        subprocess.run(["huggingface-cli", "download",
+                        model_id, "--local-dir",
+                        MODEL_PATH], capture_output=True, text=True)
+
+
+def teardown_module():
+    logger.info("Remove models")
+    shutil.rmtree(MODEL_CACHE)
+
+
 @pytest.mark.parametrize(
     ("model_id", "model_type", "backend"),
     [
@@ -25,6 +42,8 @@ def run_wwb(args):
         ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"),
         ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"),
         ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-torch", "image-inpainting", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-inpainting", "hf"),
     ],
 )
 def test_image_model_types(model_id, model_type, backend):
@@ -68,21 +87,13 @@ def test_image_model_types(model_id, model_type, backend):
 
 @pytest.mark.parametrize(
     ("model_id", "model_type"),
-    [
-        ("OpenVINO/LCM_Dreamshaper_v7-int8-ov", "image-to-image"),
-        ("OpenVINO/LCM_Dreamshaper_v7-int8-ov", "text-to-image"),
-    ],
+    list(itertools.product(OV_IMAGE_MODELS,
+                           ["image-to-image", "text-to-image", "image-inpainting"])),
 )
 def test_image_model_genai(model_id, model_type):
     with tempfile.TemporaryDirectory() as temp_dir:
         GT_FILE = os.path.join(temp_dir, "gt.csv")
-        MODEL_PATH = os.path.join(temp_dir, model_id.replace("/", "--"))
-
-        result = subprocess.run(["huggingface-cli", "download",
-                                 model_id, "--local-dir",
-                                 MODEL_PATH],
-                                capture_output=True, text=True)
-        assert result.returncode == 0
+        MODEL_PATH = os.path.join(MODEL_CACHE, model_id.replace("/", "--"))
 
         wwb_args = [
             "--base-model",
@@ -169,7 +180,6 @@ def test_image_model_genai(model_id, model_type):
 
         shutil.rmtree("reference", ignore_errors=True)
         shutil.rmtree("target", ignore_errors=True)
-        shutil.rmtree(MODEL_PATH, ignore_errors=True)
         shutil.rmtree(output_dir, ignore_errors=True)
 
 
diff --git a/tools/who_what_benchmark/whowhatbench/__init__.py b/tools/who_what_benchmark/whowhatbench/__init__.py
index f608601ec8..194426f208 100644
--- a/tools/who_what_benchmark/whowhatbench/__init__.py
+++ b/tools/who_what_benchmark/whowhatbench/__init__.py
@@ -3,7 +3,8 @@
 from .text_evaluator import TextEvaluator as Evaluator
 from .text2image_evaluator import Text2ImageEvaluator
 from .visualtext_evaluator import VisualTextEvaluator
-from .image2image import Image2ImageEvaluator
+from .im2im_evaluator import Image2ImageEvaluator
+from .inpaint_evaluator import InpaintingEvaluator
 
 
 __all__ = [
@@ -13,5 +14,6 @@
     "Text2ImageEvaluator",
     "VisualTextEvaluator",
     "Image2ImageEvaluator",
+    "InpaintingEvaluator",
     "EVALUATOR_REGISTRY",
 ]
diff --git a/tools/who_what_benchmark/whowhatbench/image2image.py b/tools/who_what_benchmark/whowhatbench/im2im_evaluator.py
similarity index 100%
rename from tools/who_what_benchmark/whowhatbench/image2image.py
rename to tools/who_what_benchmark/whowhatbench/im2im_evaluator.py
diff --git a/tools/who_what_benchmark/whowhatbench/inpaint_evaluator.py b/tools/who_what_benchmark/whowhatbench/inpaint_evaluator.py
new file mode 100644
index 0000000000..c3fe0825f7
--- /dev/null
+++ b/tools/who_what_benchmark/whowhatbench/inpaint_evaluator.py
@@ -0,0 +1,133 @@
+import os
+from typing import Any, Union
+
+import datasets
+import pandas as pd
+from tqdm import tqdm
+from transformers import set_seed
+import torch
+import openvino_genai
+
+from .registry import register_evaluator
+from .text2image_evaluator import Text2ImageEvaluator
+
+from .whowhat_metrics import ImageSimilarity
+
+
+def preprocess_fn(example):
+    return {
+        "prompts": example["inpaint_caption"],
+        "images": example["coco_image"],
+        "masks": example["mask"],
+    }
+
+
+def prepare_default_data(num_samples=None):
+    DATASET_NAME = "phiyodr/InpaintCOCO"
+    NUM_SAMPLES = 10 if num_samples is None else num_samples
+    set_seed(42)
+    default_dataset = datasets.load_dataset(
+        DATASET_NAME, split="test", streaming=True
+    ).filter(lambda example: example["inpaint_caption"] != "").take(NUM_SAMPLES)
+    return default_dataset.map(
+        lambda x: preprocess_fn(x), remove_columns=default_dataset.column_names
+    )
+
+
+@register_evaluator("image-inpainting")
+class InpaintingEvaluator(Text2ImageEvaluator):
+    def __init__(
+        self,
+        base_model: Any = None,
+        gt_data: str = None,
+        test_data: Union[str, list] = None,
+        metrics="similarity",
+        similarity_model_id: str = "openai/clip-vit-large-patch14",
+        num_inference_steps=4,
+        crop_prompts=True,
+        num_samples=None,
+        gen_image_fn=None,
+        seed=42,
+        is_genai=False,
+    ) -> None:
+        assert (
+            base_model is not None or gt_data is not None
+        ), "Text generation pipeline for evaluation or ground trush data must be defined"
+
+        self.test_data = test_data
+        self.metrics = metrics
+        self.crop_prompt = crop_prompts
+        self.num_samples = num_samples
+        self.num_inference_steps = num_inference_steps
+        self.seed = seed
+        self.similarity = None
+        self.similarity = ImageSimilarity(similarity_model_id)
+        self.last_cmp = None
+        self.gt_dir = os.path.dirname(gt_data)
+        self.generation_fn = gen_image_fn
+        self.is_genai = is_genai
+        self.resolution = None
+
+        if base_model:
+            self.gt_data = self._generate_data(
+                base_model, gen_image_fn, os.path.join(self.gt_dir, "reference")
+            )
+        else:
+            self.gt_data = pd.read_csv(gt_data, keep_default_na=False)
+
+    def _generate_data(self, model, gen_image_fn=None, image_dir="reference"):
+        def default_gen_image_fn(model, prompt, image, mask, num_inference_steps, generator=None):
+            with torch.no_grad():
+                output = model(
+                    prompt,
+                    image=image,
+                    mask_image=mask,
+                    num_inference_steps=num_inference_steps,
+                    output_type="pil",
+                    generator=generator,
+                )
+            return output.images[0]
+
+        generation_fn = gen_image_fn or default_gen_image_fn
+
+        if self.test_data:
+            if isinstance(self.test_data, str):
+                data = pd.read_csv(self.test_data)
+            else:
+                if isinstance(self.test_data, dict):
+                    assert "prompts" in self.test_data
+                    assert "images" in self.test_data
+                    assert "masks" in self.test_data
+                    data = dict(self.test_data)
+                data = pd.DataFrame.from_dict(data)
+        else:
+            data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+
+        prompts = data["prompts"]
+        images = data["images"]
+        masks = data["masks"]
+        output_images = []
+        rng = torch.Generator(device="cpu")
+
+        if not os.path.exists(image_dir):
+            os.makedirs(image_dir)
+
+        for i, (prompt, image, mask) in tqdm(enumerate(zip(prompts, images, masks)), desc="Evaluate pipeline"):
+            set_seed(self.seed)
+            rng = rng.manual_seed(self.seed)
+            output = generation_fn(
+                model,
+                prompt,
+                image=image,
+                mask=mask,
+                num_inference_steps=self.num_inference_steps,
+                generator=openvino_genai.TorchGenerator(self.seed) if self.is_genai else rng
+            )
+            image_path = os.path.join(image_dir, f"{i}.png")
+            output.save(image_path)
+            output_images.append(image_path)
+
+        res_data = {"prompts": list(prompts), "images": output_images}
+        df = pd.DataFrame(res_data)
+
+        return df
diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
index f54d232bc2..8a00c70852 100644
--- a/tools/who_what_benchmark/whowhatbench/model_loaders.py
+++ b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -2,7 +2,7 @@
 import json
 
 from transformers import AutoConfig, AutoModelForCausalLM, AutoModel, AutoModelForVision2Seq
-from diffusers import DiffusionPipeline, AutoPipelineForImage2Image
+from diffusers import DiffusionPipeline, AutoPipelineForImage2Image, AutoPipelineForInpainting
 
 
 logging.basicConfig(level=logging.INFO)
@@ -107,7 +107,7 @@ def load_text2image_model(
 
         try:
             model = TEXT2IMAGEPipeline.from_pretrained(
-                model_id, trust_remote_code=True, device=device, ov_config=ov_config
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config, safety_checker=None,
             )
         except ValueError:
             config = AutoConfig.from_pretrained(
@@ -119,6 +119,7 @@ def load_text2image_model(
                 use_cache=True,
                 device=device,
                 ov_config=ov_config,
+                safety_checker=None,
             )
 
     return model
@@ -211,7 +212,7 @@ def load_imagetext2image_model(
         from optimum.intel.openvino import OVPipelineForImage2Image
         try:
             model = OVPipelineForImage2Image.from_pretrained(
-                model_id, trust_remote_code=True, device=device, ov_config=ov_config
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config, safety_checker=None,
             )
         except ValueError:
             config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
@@ -222,6 +223,54 @@ def load_imagetext2image_model(
                 use_cache=True,
                 device=device,
                 ov_config=ov_config,
+                safety_checker=None,
+            )
+    return model
+
+
+def load_inpainting_genai_pipeline(model_dir, device="CPU", ov_config=None):
+    try:
+        import openvino_genai
+    except ImportError as e:
+        logger.error("Failed to import openvino_genai package. Please install it. Details:\n", e)
+        exit(-1)
+
+    return GenAIModelWrapper(
+        openvino_genai.InpaintingPipeline(model_dir, device, **ov_config),
+        model_dir,
+        "image-inpainting"
+    )
+
+
+def load_inpainting_model(
+    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+):
+    if use_hf:
+        logger.info("Using HF Transformers API")
+        model = AutoPipelineForInpainting.from_pretrained(
+            model_id, trust_remote_code=True
+        )
+    elif use_genai:
+        logger.info("Using OpenVINO GenAI API")
+        model = load_inpainting_genai_pipeline(model_id, device, ov_config)
+    else:
+        logger.info("Using Optimum API")
+        from optimum.intel.openvino import OVPipelineForInpainting
+        try:
+            model = OVPipelineForInpainting.from_pretrained(
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config, safety_checker=None,
+            )
+        except ValueError as e:
+            logger.error("Failed to load inpaiting pipeline. Details:\n", e)
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+            model = OVPipelineForInpainting.from_pretrained(
+                model_id,
+                config=config,
+                trust_remote_code=True,
+                use_cache=True,
+                device=device,
+                ov_config=ov_config,
+                safety_checker=None,
             )
     return model
 
@@ -248,5 +297,7 @@ def load_model(
         return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai)
     elif model_type == "image-to-image":
         return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai)
+    elif model_type == "image-inpainting":
+        return load_inpainting_model(model_id, device, ov_options, use_hf, use_genai)
     else:
         raise ValueError(f"Unsupported model type: {model_type}")
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 2ff8c45975..7acf3cf5aa 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -55,7 +55,7 @@ def parse_args():
     parser.add_argument(
         "--model-type",
         type=str,
-        choices=["text", "text-to-image", "visual-text", "image-to-image"],
+        choices=["text", "text-to-image", "visual-text", "image-to-image", "image-inpainting"],
         default="text",
         help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, "
         "visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt",
@@ -282,6 +282,20 @@ def genai_gen_image2image(model, prompt, image, num_inference_steps, generator=N
     return image
 
 
+def genai_gen_inpainting(model, prompt, image, mask, num_inference_steps, generator=None):
+    image_data = ov.Tensor(np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8))
+    mask_data = ov.Tensor(np.array(mask.getdata()).reshape(1, mask.size[1], mask.size[0], 3).astype(np.uint8))
+    image_tensor = model.generate(
+        prompt,
+        image=image_data,
+        mask_image=mask_data,
+        num_inference_steps=num_inference_steps,
+        generator=generator,
+    )
+    image = Image.fromarray(image_tensor.data[0])
+    return image
+
+
 def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_tokens, crop_question):
     image_data = ov.Tensor(np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8))
     config = model.get_generation_config()
@@ -355,6 +369,17 @@ def create_evaluator(base_model, args):
                 is_genai=args.genai,
                 seed=args.seed,
             )
+        elif task == "image-inpainting":
+            return EvaluatorCLS(
+                base_model=base_model,
+                gt_data=args.gt_data,
+                test_data=prompts,
+                num_samples=args.num_samples,
+                num_inference_steps=args.num_inference_steps,
+                gen_image_fn=genai_gen_inpainting if args.genai else None,
+                is_genai=args.genai,
+                seed=args.seed,
+            )
         else:
             raise ValueError(f"Unsupported task: {task}")
 

From 653b2aeb92885eb44f664ad221418ac72eb0d9ab Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 31 Dec 2024 08:11:27 +0400
Subject: [PATCH 071/110] [CB] Simplify SequenceGroup API (#1456)

- Removed `enable_prefix_caching` parameter from `SequenceGroup` ctor
- Removed necessity to call `set_sequence_group_ptr` after creation of
sequence group
- Renamed `get_cumulative_log_probs` to `get_cumulative_log_prob` as it
returns a floating point value
---
 src/cpp/src/continuous_batching_impl.cpp      |   6 +-
 src/cpp/src/llm_pipeline_stateful.cpp         |   6 +-
 src/cpp/src/lm_encoding.cpp                   |  11 +-
 src/cpp/src/sequence_group.hpp                |  89 +++++++-------
 ...batching_for_speculative_decoding_impl.cpp |   2 +-
 src/cpp/src/visual_language/pipeline.cpp      |   4 +-
 tests/cpp/block_manager.cpp                   |  17 +--
 tests/cpp/cache_manager.cpp                   |  15 ++-
 tests/cpp/sampler.cpp                         |  12 +-
 tests/cpp/scheduler.cpp                       | 113 ++++++++----------
 tests/cpp/speculative_decoding.cpp            |   4 +-
 11 files changed, 129 insertions(+), 150 deletions(-)

diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 9e20171dcb..3ab242418e 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -105,9 +105,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
 
     SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, input_ids,
                                                                         sampling_params,
-                                                                        m_scheduler->get_block_size(),
-                                                                        m_scheduler->get_config().enable_prefix_caching);
-    sequence_group->set_sequence_group_ptr(sequence_group);
+                                                                        m_scheduler->get_block_size());
 
     if (m_scheduler->get_config().enable_prefix_caching) {
         m_scheduler->restore_cached_blocks(sequence_group);
@@ -353,7 +351,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
 
         for (size_t i = 0; i < num_outputs; ++i) {
             const auto & sequence = sequences[i];
-            const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_probs();
+            const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_prob();
             const auto & generated_ids = sequence->get_generated_ids();
 
             if (sampling_params.echo)
diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
index bdaae50b04..cbcca62978 100644
--- a/src/cpp/src/llm_pipeline_stateful.cpp
+++ b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -300,23 +300,21 @@ EncodedResults StatefulLLMPipeline::generate(
 
     std::vector<SequenceGroup::Ptr> requests;
     size_t block_size = 1;
-    bool enable_prefix_caching = false;
 
     for (size_t request_id = 0; request_id < batch_size; request_id++) {
         SequenceGroup::Ptr sequence_group;
         if (is_chat_conversation) {
             ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
-            sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
+            sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size);
         } else {
             size_t seq_len = input_ids.get_shape().at(1);
             size_t batch_offset = request_id * seq_len;
             const int64_t* prompt_start = input_ids.data<const int64_t>() + batch_offset;
             std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
 
-            sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
+            sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size);
         }
 
-        sequence_group->set_sequence_group_ptr(sequence_group);
         requests.push_back(sequence_group);
     }
 
diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index 17a20dd961..083c591927 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -119,10 +119,13 @@ std::pair<EncodedResults, std::optional<int64_t>> get_lm_encoded_results(
 
     auto logits = m_llm.get_tensor("logits");
 
-    int64_t sequence_len = logits.get_shape().at(1);
+    // since we have applied `Slice` operationto last MatMul, model output sequence lenght is 1
+    // so, we need to update sequence groups to think that they already have processed all prompt tokens except last ones
+    // and schedule only `output_sequence_len` ones
+    int64_t output_sequence_len = logits.get_shape().at(1);
     for (auto& sequence_group : sequence_groups) {
-        sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - sequence_len);
-        sequence_group->schedule_tokens(sequence_len);
+        sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len);
+        sequence_group->schedule_tokens(output_sequence_len);
     }
 
     std::map<size_t, size_t> beam_offets;
@@ -217,7 +220,7 @@ std::pair<EncodedResults, std::optional<int64_t>> get_lm_encoded_results(
 
         for (size_t seq_id = 0; seq_id < num_outputs; ++seq_id) {
             const auto & sequence = sequences[seq_id];
-            const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_probs();
+            const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_prob();
 
             results.tokens.push_back(sequence->get_generated_ids());
             results.scores.push_back(score);
diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp
index 220e93c032..8f8d5f899e 100644
--- a/src/cpp/src/sequence_group.hpp
+++ b/src/cpp/src/sequence_group.hpp
@@ -4,9 +4,11 @@
 #pragma once
 
 #include <vector>
+#include <cassert>
 #include <set>
 #include <cstdlib>
 #include <string_view>
+#include <memory>
 
 #include "openvino/genai/generation_handle.hpp"
 #include "openvino/genai/generation_config.hpp"
@@ -40,32 +42,32 @@ class Sequence {
     GenerationFinishReason m_finish_reason = GenerationFinishReason::NONE;
     float m_cumulative_log_prob = 0.0f;
     std::vector<int64_t> m_prefix_hashes;
-    std::weak_ptr<SequenceGroup> m_sequence_group;
+    SequenceGroup* m_sequence_group = nullptr;
     static std::mutex m_counter_mutex;
 
     size_t _make_hash(size_t content_length);
-public:
-    using Ptr = std::shared_ptr<Sequence>;
-    using CPtr = std::shared_ptr<const Sequence>;
 
-    // don't use directly
-    Sequence(const uint64_t id) : m_grouped_id(id) {};
+    explicit Sequence(const uint64_t id) : m_grouped_id(id) {}
 
-    // don't use directly
     Sequence(const Sequence& seq, const uint64_t id) :
         m_generated_ids(seq.m_generated_ids),
         m_grouped_id(id),
         m_status(seq.m_status),
-        m_cumulative_log_prob(seq.m_cumulative_log_prob){
+        m_cumulative_log_prob(seq.m_cumulative_log_prob),
+        m_sequence_group(seq.m_sequence_group) {
         OPENVINO_ASSERT(seq.m_id != m_id);
     }
 
+public:
+    using Ptr = std::shared_ptr<Sequence>;
+    using CPtr = std::shared_ptr<const Sequence>;
+
     static Sequence::Ptr create(const uint64_t id) {
-        return std::make_shared<Sequence>(id);
+        return Sequence::Ptr(new Sequence(id));
     }
 
     static Sequence::Ptr fork(Sequence::CPtr sequence, const uint64_t id) {
-        return std::make_shared<Sequence>(*sequence, id);
+        return Sequence::Ptr(new Sequence(*sequence, id));
     }
 
     bool operator ==(const Sequence& other) const {
@@ -130,7 +132,7 @@ class Sequence {
         GenerationOutput output;
         if (token_cnt > 0) {
             OPENVINO_ASSERT(m_generated_ids.size());
-            output.score = get_cumulative_log_probs();
+            output.score = get_cumulative_log_prob();
 
             auto generated_token_id = get_generated_ids();
             auto generated_log_probs = get_generated_log_probs();
@@ -163,7 +165,7 @@ class Sequence {
         return m_generated_log_probs;
     }
 
-    float get_cumulative_log_probs() const {
+    float get_cumulative_log_prob() const {
         return m_cumulative_log_prob;
     }
 
@@ -173,20 +175,18 @@ class Sequence {
     }
 
     float get_beam_search_score(const ov::genai::GenerationConfig& sampling_params) const {
-        float cumulative_log_prob = get_cumulative_log_probs(), current_length = get_generated_len();
+        float cumulative_log_prob = get_cumulative_log_prob(), current_length = get_generated_len();
         float score = cumulative_log_prob / std::pow(current_length, sampling_params.length_penalty);
         return score;
     }
 
     // Each KV block can be uniquely identified by
-    void set_sequence_group_ptr(std::shared_ptr<SequenceGroup> sequence_group) {
+    void set_sequence_group_ptr(SequenceGroup* sequence_group) {
+        assert(sequence_group != nullptr);
         m_sequence_group = sequence_group;
     }
 
-    std::shared_ptr<SequenceGroup> get_sequence_group_ptr() const {
-        OPENVINO_ASSERT(!m_sequence_group.expired());
-        return m_sequence_group.lock();
-    }
+    std::shared_ptr<SequenceGroup> get_sequence_group_ptr() const;
 
     // Each KV block can be uniquely identified by
     // the tokens within the block and the tokens in the prefix before the block.
@@ -198,7 +198,7 @@ class Sequence {
 // - each sequence shares the same prompt and KV-caches for promp
 // - in case of beam search each sequence also shares specific part of generic phase
 //   via reference counter mechanism on BlockManager level
-class SequenceGroup {
+class SequenceGroup  : public std::enable_shared_from_this<SequenceGroup> {
     uint64_t m_request_id;
     std::vector<Sequence::Ptr> m_sequences;
     ov::genai::GenerationConfig m_sampling_params;
@@ -206,7 +206,6 @@ class SequenceGroup {
     TokenIds m_prompt_ids;
     std::vector<float> m_prompt_log_probs;
     GenerationStream::Ptr m_generation_stream;
-    bool m_enable_prefix_caching;
     size_t m_num_evicted_tokens = 0;
     bool m_has_echoed = false;
 
@@ -226,33 +225,32 @@ class SequenceGroup {
 
     size_t m_num_streamed_tokens = 0, m_stream_window_size = 0;
 
-
-    SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching)
+    SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size)
         : m_request_id(request_id),
           m_sampling_params(sampling_params),
           m_block_size(block_size),
-          m_enable_prefix_caching(enable_prefix_caching) {
-            m_generation_stream = GenerationStream::create();    
-           }
+          m_generation_stream(GenerationStream::create()) { }
 
 public:
     using Ptr = std::shared_ptr<SequenceGroup>;
     using CPtr = std::shared_ptr<const SequenceGroup>;
 
-    SequenceGroup(uint64_t request_id, const TokenIds& input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching)
-        : SequenceGroup(request_id, ov::Tensor(ov::element::i64, ov::Shape{input_ids.size()}, (void *)input_ids.data()), sampling_params, block_size, enable_prefix_caching) {
+    SequenceGroup(uint64_t request_id, const TokenIds& input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size)
+        : SequenceGroup(request_id, ov::Tensor(ov::element::i64, ov::Shape{input_ids.size()}, (void *)input_ids.data()), sampling_params, block_size) {
     }
 
-    SequenceGroup(uint64_t request_id, const ov::Tensor input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching)
-        : SequenceGroup(request_id, sampling_params, block_size, enable_prefix_caching) {
-        add_sequence(Sequence::create(m_next_sequence_id++));
-
+    SequenceGroup(uint64_t request_id, const ov::Tensor input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size)
+        : SequenceGroup(request_id, sampling_params, block_size) {
         m_prompt_ids.resize(input_ids.get_size());
         std::copy_n(input_ids.data<int64_t>(), input_ids.get_size(), m_prompt_ids.begin());
         m_prompt_log_probs.reserve(m_prompt_ids.size());
+
+        // create a single sequence
+        add_sequence(Sequence::create(m_next_sequence_id++));
     }
 
     void add_sequence(const Sequence::Ptr & sequence) {
+        sequence->set_sequence_group_ptr(this);
         m_sequences.emplace_back(sequence);
     }
 
@@ -322,7 +320,6 @@ class SequenceGroup {
         return it != m_sequences.end();
     }
 
-
     /**
      * @param seq_id Sequence identifier
      * @return Pointer to the sequence with this ID.
@@ -344,8 +341,8 @@ class SequenceGroup {
 
         std::sort(finished_seqs.begin(), finished_seqs.end(), [=] (Sequence::CPtr s1, Sequence::CPtr s2) -> bool {
             bool is_beam_search = m_sampling_params.is_beam_search();
-            const float score_1 = is_beam_search ? s1->get_beam_search_score(m_sampling_params) : s1->get_cumulative_log_probs();
-            const float score_2 = is_beam_search ? s2->get_beam_search_score(m_sampling_params) : s2->get_cumulative_log_probs();
+            const float score_1 = is_beam_search ? s1->get_beam_search_score(m_sampling_params) : s1->get_cumulative_log_prob();
+            const float score_2 = is_beam_search ? s2->get_beam_search_score(m_sampling_params) : s2->get_cumulative_log_prob();
             return score_1 > score_2;
         });
 
@@ -409,7 +406,6 @@ class SequenceGroup {
         m_num_evicted_tokens += num_evicted_tokens;
     }
 
-
     /**
      * Resets the eviction tracking on this sequence to the state prior to any eviction taking place.
      */
@@ -434,7 +430,6 @@ class SequenceGroup {
         return get_num_processed_tokens() + get_num_scheduled_tokens();
     }
 
-
     bool requires_sampling() const {
         return get_context_len() >= get_prompt_len() && get_context_len() > m_max_content_len && m_sampling_params.max_new_tokens > 0;
     }
@@ -513,7 +508,6 @@ class SequenceGroup {
         return (get_context_len() - get_num_evicted_tokens() + m_block_size - 1) / m_block_size;
     }
 
-
     // requires number of physical blocks for next generation
     size_t get_num_blocks() const {
         return get_num_logical_blocks();
@@ -524,10 +518,9 @@ class SequenceGroup {
     }
 
     Sequence::Ptr fork_sequence(Sequence::CPtr sequence) {
-        auto ptr = sequence->get_sequence_group_ptr();
-        m_sequences.emplace_back(Sequence::fork(std::move(sequence), m_next_sequence_id++));
-        set_sequence_group_ptr(ptr);
-        return m_sequences.back();
+        auto forked_sequence = Sequence::fork(sequence, m_next_sequence_id++);
+        m_sequences.emplace_back(forked_sequence);
+        return forked_sequence;
     }
 
     const ov::genai::GenerationConfig& get_sampling_parameters() const {
@@ -568,12 +561,6 @@ class SequenceGroup {
         return m_is_gen_paused;
     }
 
-    void set_sequence_group_ptr(std::shared_ptr<SequenceGroup> sequence_group) {
-        for (auto sequence: m_sequences) {
-            sequence->set_sequence_group_ptr(sequence_group);
-        }
-    }
-
     GenerationStream::Ptr get_generation_stream() {
         return m_generation_stream;
     }
@@ -600,7 +587,7 @@ class SequenceGroup {
                 output.generated_ids.insert(output.generated_ids.begin(), m_prompt_ids.begin(), m_prompt_ids.end());
                 output.generated_log_probs.insert(output.generated_log_probs.begin(), m_prompt_log_probs.begin(), m_prompt_log_probs.end());
             }
-            output.score = m_sampling_params.is_beam_search() ? sequence->get_beam_search_score(m_sampling_params) : sequence->get_cumulative_log_probs();
+            output.score = m_sampling_params.is_beam_search() ? sequence->get_beam_search_score(m_sampling_params) : sequence->get_cumulative_log_prob();
             output.finish_reason = sequence->get_finish_reason();
             outputs.emplace(sequence->get_grouped_id(), output);
         }
@@ -684,4 +671,10 @@ class SequenceGroup {
         m_generation_stream->push(std::move(outputs));
     } 
 };
+
+inline std::shared_ptr<SequenceGroup> Sequence::get_sequence_group_ptr() const {
+    assert(m_sequence_group != nullptr);
+    return m_sequence_group->shared_from_this();
+}
+
 }
diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp
index 5091218ccd..a1d0e85f17 100644
--- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp
@@ -159,7 +159,7 @@ init_request(
     for (const auto& candidate_sequence : candidates) {
         Sequence::Ptr sequence;
         if (is_init_all_sequences_in_request && candidate_sequence.first > 0) {
-            sequence = Sequence::Ptr(new Sequence(candidate_sequence.first));
+            sequence = Sequence::create(candidate_sequence.first);
             sequence->set_status(ov::genai::SequenceStatus::RUNNING);
             request->add_sequence(sequence);
         } else {
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index d625485205..ebc5c3b5dd 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -175,7 +175,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         std::vector<SequenceGroup::Ptr> requests;
         size_t request_id = 0;
         size_t block_size = 1; // not used
-        bool enable_prefix_caching = false;
 
         size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - to_remove_from_hist;
         size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);
@@ -185,8 +184,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), m_tokenizer.get_pad_token_id());
         std::copy(tokenized_history.begin(), tokenized_history.end(), prompt_ids.data<int64_t>());
 
-        SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching);
-        sequence_group->set_sequence_group_ptr(sequence_group);
+        SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size);
         requests.push_back(sequence_group);
 
         std::shared_ptr<StreamerBase> streamer_ptr = std::visit(overloaded{
diff --git a/tests/cpp/block_manager.cpp b/tests/cpp/block_manager.cpp
index 466cc23864..46c2fdddd7 100644
--- a/tests/cpp/block_manager.cpp
+++ b/tests/cpp/block_manager.cpp
@@ -13,12 +13,11 @@ TEST(TestBlockManager, general_test) {
     ov::genai::TokenIds prompt_ids;
 
     ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared<ov::genai::SequenceGroup>(
-        0, 
+        0,
         ov::Tensor(ov::element::i64, {
         prompt_ids.size()}, prompt_ids.data()),
         ov::genai::beam_search(),
-        4, 
-        false);
+        4);
     auto sequence = sequence_group->get_not_finished_sequences()[0];
     bm.allocate(sequence, 6);
     auto seq_id = sequence->get_id();
@@ -46,13 +45,11 @@ TEST(TestBlockManager, required_blocks_count) {
 
     std::vector<uint64_t> tokens = {0,1,2,3,4};
     ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared<ov::genai::SequenceGroup>(
-        0, 
+        0,
         ov::Tensor(ov::element::i64, {
         tokens.size()}, tokens.data()),
         ov::genai::beam_search(),
-        4,
-        false);
-    sequence_group->set_sequence_group_ptr(sequence_group);
+        4);
     sequence_group->schedule_tokens(5);
     auto required_blocks = bm.required_blocks_count(sequence_group);
     EXPECT_EQ(required_blocks, 2);
@@ -62,7 +59,7 @@ TEST(TestBlockManager, required_blocks_count) {
     EXPECT_EQ(bm.get_number_of_blocks_occupied_by_sequence(sequence_group), 2);
 
     sequence_group->finish_iteration();
-    auto sequence_to_fork = sequence_group->get_running_sequences()[0];    
+    auto sequence_to_fork = sequence_group->get_running_sequences()[0];
     for (size_t i = 0; i < 4; ++i) {
         const auto forked_sequence = sequence_group->fork_sequence(sequence_to_fork);
         bm.fork_sequence(sequence_to_fork->get_id(), forked_sequence->get_id());
@@ -98,9 +95,7 @@ TEST(TestBlockManager, CanFreeBlocksFromSequence) {
             ov::Tensor(ov::element::i64, {
                     tokens.size()}, tokens.data()),
             ov::genai::beam_search(),
-            BLOCK_SIZE,
-            false);
-    sequence_group->set_sequence_group_ptr(sequence_group);
+            BLOCK_SIZE);
     sequence_group->schedule_tokens(5);
     bm.append_slots(sequence_group);
     ASSERT_EQ(bm.num_free_blocks(), 5);
diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp
index 5dc848aba5..095cc39f09 100644
--- a/tests/cpp/cache_manager.cpp
+++ b/tests/cpp/cache_manager.cpp
@@ -11,14 +11,17 @@
 
 using namespace ov::genai;
 
-std::shared_ptr<ov::Model> get_dummy_model(size_t num_layers) {
+std::shared_ptr<ov::Model> get_dummy_model(ov::Core core, size_t num_layers) {
     ov::NodeVector keys;
     ov::NodeVector values;
     ov::ParameterVector params;
+    ov::element::Type inference_precision = core.get_property("CPU", ov::hint::inference_precision);
+    ov::element::Type kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
+
     auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
     for (size_t i = 0; i < num_layers; i++) {
-        auto key = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
-        auto value = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
+        auto key = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
+        auto value = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
         key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
         value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
         keys.push_back(key);
@@ -57,7 +60,7 @@ TEST(TestCacheManager, test_cache_size_param) {
     std::vector<size_t> num_kv_heads(12, 12);
     device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
 
-    ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
+    ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
     auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
     cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
@@ -80,7 +83,7 @@ TEST(TestCacheManager, test_kv_blocks_param) {
     std::vector<size_t> num_kv_heads(12, 12);
     device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
 
-    ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
+    ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
     auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
     OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), scheduler_config.num_kv_blocks);
@@ -107,7 +110,7 @@ TEST(TestCacheManager, test_dynamic_cache_increase) {
     }
 
 
-    ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
+    ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
     auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
 
diff --git a/tests/cpp/sampler.cpp b/tests/cpp/sampler.cpp
index f146ab7426..3741880827 100644
--- a/tests/cpp/sampler.cpp
+++ b/tests/cpp/sampler.cpp
@@ -38,7 +38,7 @@ TEST(SamplerValidationMode, gen_phase_to_cut_whole_seq) {
     std::vector<int64_t> input_vector{0, 1, 2, 3, 4};
     ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data());
     std::vector<SequenceGroup::Ptr> sequence_groups{
-        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)),
+        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32)),
     };
 
     // to emulate processed prompt and add next token [ 0 ]
@@ -82,7 +82,7 @@ TEST(SamplerValidationMode, gen_phase_to_cut_part_seq) {
     std::vector<int64_t> input_vector{0, 1, 2, 3, 4};
     ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data());
     std::vector<SequenceGroup::Ptr> sequence_groups{
-        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)),
+        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32)),
     };
 
     // to emulate processed prompt and add next token [ 0 ]
@@ -127,7 +127,7 @@ TEST(SamplerValidationMode, gen_phase) {
     std::vector<int64_t> input_vector{0, 1, 2, 3, 4};
     ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data());
     std::vector<SequenceGroup::Ptr> sequence_groups{
-        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)),
+        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32)),
     };
 
     // to emulate processed prompt and add next token [ 0 ]
@@ -171,7 +171,7 @@ TEST(SamplerValidationMode, prompt_phase_to_cut_part_seq) {
     std::vector<int64_t> input_vector{0, 1, 2, 3, 4};
     ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data());
     std::vector<SequenceGroup::Ptr> sequence_groups{
-        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)),
+        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32)),
     };
 
     // append candidates [ 0, 1, 1 ]
@@ -217,7 +217,7 @@ TEST(SamplerValidationMode, prompt_phase_to_cut_whole_seq) {
     std::vector<int64_t> input_vector{0, 1, 2, 3, 4};
     ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data());
     std::vector<SequenceGroup::Ptr> sequence_groups{
-        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)),
+        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32)),
     };
 
     // append candidates [ 1, 2, 3 ]
@@ -262,7 +262,7 @@ TEST(SamplerValidationMode, prompt_phase) {
     std::vector<int64_t> input_vector{0, 1, 2, 3, 4};
     ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data());
     std::vector<SequenceGroup::Ptr> sequence_groups{
-        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)),
+        SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32)),
     };
 
     // append candidates [ 0, 1, 2 ]
diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp
index cc0b53a433..23594adf50 100644
--- a/tests/cpp/scheduler.cpp
+++ b/tests/cpp/scheduler.cpp
@@ -18,14 +18,17 @@ void clear_finished_sequences(std::vector<SequenceGroup::Ptr>& requests) {
     });
     requests.erase(new_end, requests.end());
 }
-std::shared_ptr<ov::Model> get_model(size_t num_layers) {
+std::shared_ptr<ov::Model> get_model(ov::Core core, size_t num_layers) {
     ov::NodeVector keys;
     ov::NodeVector values;
     ov::ParameterVector params;
+    ov::element::Type inference_precision = core.get_property("CPU", ov::hint::inference_precision);
+    ov::element::Type kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
+
     auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
     for (size_t i = 0; i < num_layers; i++) {
-        auto key = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
-        auto value = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
+        auto key = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
+        auto value = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
         key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
         value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
         keys.push_back(key);
@@ -42,12 +45,12 @@ std::shared_ptr<ov::Model> get_model(size_t num_layers) {
 std::shared_ptr<CacheManager> init_cache_manager(SchedulerConfig scheduler_config) {
     ov::Core core = ov::Core();
     size_t num_decoder_layers = 12;
-    ov::InferRequest request = core.compile_model(get_model(num_decoder_layers)).create_infer_request();
+    ov::InferRequest request = core.compile_model(get_model(core, num_decoder_layers)).create_infer_request();
     size_t head_size = 64, head_size_u8 = head_size + 8;
     std::vector<size_t> num_kv_heads(12, 12);
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
     device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
-    return std::make_shared<CacheManager>(device_config, request, core);  
+    return std::make_shared<CacheManager>(device_config, request, core);
 }
 
 TEST(TestScheduler, general_test) {
@@ -63,17 +66,17 @@ TEST(TestScheduler, general_test) {
     for (auto scheduler_config: configs) {
         std::vector<uint64_t> tokens = {0,1,2,3,4,5,6,7};
         SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                                ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                                ov::genai::greedy(), 4);
         auto idx0 = (*sequence_group1)[0]->get_id();
         SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                                ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                                ov::genai::greedy(), 4);
         auto idx1 = (*sequence_group2)[0]->get_id();
         SequenceGroup::Ptr sequence_group3 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                                ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                                ov::genai::greedy(), 4);
         auto idx2 = (*sequence_group3)[0]->get_id();
         std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2, sequence_group3};
-        
-        // schedule 3 sequence groups that use 6 kv blocks 
+
+        // schedule 3 sequence groups that use 6 kv blocks
         Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
         auto out1 = scheduler.schedule(requests);
 
@@ -82,7 +85,7 @@ TEST(TestScheduler, general_test) {
         EXPECT_EQ(out1.m_block_tables[idx0][0].size(), 2);
         EXPECT_EQ(out1.m_block_tables[idx1][0].size(), 2);
         EXPECT_EQ(out1.m_block_tables[idx2][0].size(), 2);
-        // tokens.size() * 2 tokens should be scheduled on prompt phase, corresponding to first three sequences 
+        // tokens.size() * 2 tokens should be scheduled on prompt phase, corresponding to first three sequences
         EXPECT_EQ(out1.m_total_num_scheduled_tokens, tokens.size() * 3);
         EXPECT_EQ(out1.is_prompt, !scheduler_config.dynamic_split_fuse);
 
@@ -109,7 +112,7 @@ TEST(TestScheduler, general_test) {
         EXPECT_EQ(out3.m_block_tables[idx0][0].size(), 3);
         EXPECT_EQ(out3.m_block_tables[idx1][0].size(), 3);
         // 2 tokens should be scheduled on generate phase for "0" and "1" sequence, "2" sequence should be preempted
-        EXPECT_EQ(out3.m_total_num_scheduled_tokens, 2); 
+        EXPECT_EQ(out3.m_total_num_scheduled_tokens, 2);
         EXPECT_FALSE(out3.is_prompt);
 
         // check that scheduler has no block table for sequence_group3
@@ -124,7 +127,7 @@ TEST(TestScheduler, general_test) {
 
         auto out4 = scheduler.schedule(requests);
 
-        // check that sequence_group3 is fully scehuled 
+        // check that sequence_group3 is fully scehuled
         EXPECT_EQ(out4.m_block_tables[idx2][0].size(), 2);
         EXPECT_FALSE(out4.m_block_tables[idx2][0][0]->is_free());
         EXPECT_EQ(out4.m_block_tables[idx2][0][0]->get_index(), 0);
@@ -168,10 +171,10 @@ TEST_P(AppendSlotsSchedulerTest, test_append_slots_considers_all_sequences) {
     auto scheduler_config = GetParam();
     std::vector<uint64_t> tokens = {0,1,2,3,4,5,6,7};
     SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                            ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                            ov::genai::greedy(), 4);
     auto idx0 = (*sequence_group1)[0]->get_id();
     SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                            ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                            ov::genai::greedy(), 4);
     auto idx1 = (*sequence_group2)[0]->get_id();
     std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
 
@@ -233,11 +236,11 @@ TEST_P(PartialPreemptionSchedulerTest, test_partial_preemption) {
     auto scheduler_config = GetParam();
     std::vector<uint64_t> tokens1 = {0,1,2,3,4,5,6,7,8,9,10};
     SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens1.size()}, tokens1.data()),
-                                                                            ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                            ov::genai::greedy(), 4);
     std::vector<uint64_t> tokens2 = {0,1,2,3,4,5,6,7};
     auto idx0 = (*sequence_group1)[0]->get_id();
     SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens2.size()}, tokens2.data()),
-                                                                            ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                            ov::genai::greedy(), 4);
     auto idx1 = (*sequence_group2)[0]->get_id();
     std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
 
@@ -324,9 +327,9 @@ TEST(TestScheduler, test_partial_preemption_beam_search) {
 
         // create beam search group
         SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                                ov::genai::beam_search(), 4, scheduler_config.enable_prefix_caching);
-        sequence_group->set_sequence_group_ptr(sequence_group);
+                                                                                ov::genai::beam_search(), 4);
         std::vector<SequenceGroup::Ptr> requests = {sequence_group};
+        EXPECT_NO_THROW(requests[0]->get_running_sequences()[0]->get_sequence_group_ptr());
 
         Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
         auto out = scheduler.schedule(requests);
@@ -336,7 +339,7 @@ TEST(TestScheduler, test_partial_preemption_beam_search) {
         sequence_group->finish_iteration();
 
         // make 2 forked sequence
-        auto sequence_to_fork = sequence_group->get_running_sequences()[0];    
+        auto sequence_to_fork = sequence_group->get_running_sequences()[0];
         for (size_t i = 0; i < 2; ++i) {
             const auto forked_sequence = sequence_group->fork_sequence(sequence_to_fork);
             scheduler.fork_sequence(sequence_to_fork->get_id(), forked_sequence->get_id());
@@ -352,7 +355,7 @@ TEST(TestScheduler, test_partial_preemption_beam_search) {
             }
             sequence_group->finish_iteration();
         }
-        // currently sequence occupies 4 blocks (1 shared, 3 not shared) 
+        // currently sequence occupies 4 blocks (1 shared, 3 not shared)
 
         // make another 2 forked sequence
         for (size_t i = 0; i < 2; ++i) {
@@ -373,8 +376,7 @@ TEST(TestScheduler, test_partial_preemption_beam_search) {
 
         // create group, which requires 1 block
         SequenceGroup::Ptr sequence_group_greedy = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                                ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
-        sequence_group_greedy->set_sequence_group_ptr(sequence_group_greedy);
+                                                                                ov::genai::greedy(), 4);
 
         // set greedy group at the beginning of list to make it higher priority
         std::vector<SequenceGroup::Ptr> new_requests = {sequence_group_greedy, sequence_group};
@@ -386,8 +388,8 @@ TEST(TestScheduler, test_partial_preemption_beam_search) {
 
         EXPECT_EQ(sequence_group->get_num_processed_tokens(), 12);
         EXPECT_EQ(sequence_group->get_context_len(), 12);
-        
-        // beam search group should be partially preempted and 5 blocks should be released 
+
+        // beam search group should be partially preempted and 5 blocks should be released
         out = scheduler.schedule(new_requests);
         sequence_group_greedy->get_sequences()[0]->append_token(token, 0.5);
         sequence_group_greedy->finish_iteration();
@@ -399,8 +401,8 @@ TEST(TestScheduler, test_partial_preemption_beam_search) {
         EXPECT_EQ(scheduler.get_block_tables(*seqs[2])[0].size(), 2);
         EXPECT_EQ(scheduler.get_block_tables(*seqs[3])[0].size(), 2);
         EXPECT_EQ(scheduler.get_block_tables(*seqs[4])[0].size(), 2);
-        
-        // append another 20 tokens to greedy group, this should result in usage of all free blocks and 
+
+        // append another 20 tokens to greedy group, this should result in usage of all free blocks and
         // another partial preemption of beam search group
         for (size_t i = 0; i < 20; i++) {
             out = scheduler.schedule(new_requests);
@@ -431,13 +433,13 @@ TEST(TestScheduler, test_partially_preempted_prompt) {
     for (auto scheduler_config: configs) {
         std::vector<uint64_t> tokens = {0,1,2,3,4,5,6,7,8,9,10,11};
         SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                                ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                                ov::genai::greedy(), 4);
         auto idx0 = (*sequence_group1)[0]->get_id();
         SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                                ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                                ov::genai::greedy(), 4);
         auto idx1 = (*sequence_group2)[0]->get_id();
-        std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};                                                
-        
+        std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
+
         // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks.
         Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
         auto out1 = scheduler.schedule(requests);
@@ -450,7 +452,7 @@ TEST(TestScheduler, test_partially_preempted_prompt) {
 
         // sequence_group2 should be fully preempted
         auto out2 = scheduler.schedule(requests);
-        
+
         // check that sequence_group1 has one more allocated block
         auto block_tables_for_all_layers = scheduler.get_block_tables(*(*sequence_group1)[0]);
         auto block_table1 = block_tables_for_all_layers[0];
@@ -467,7 +469,7 @@ TEST(TestScheduler, test_partially_preempted_prompt) {
 
         std::vector<uint64_t> ref_ids = {0};
         EXPECT_EQ(out2.m_scheduled_sequence_groups_ids, ref_ids);
-        EXPECT_EQ(out2.m_total_num_scheduled_tokens, 1); 
+        EXPECT_EQ(out2.m_total_num_scheduled_tokens, 1);
 
         if (scheduler_config.dynamic_split_fuse) {
             // for dynamic_split_fuse sequence_group2 is preemted partially, part of prompt is left
@@ -479,12 +481,12 @@ TEST(TestScheduler, test_partially_preempted_prompt) {
             // for vllm case sequence_group2 is fully preempted
             EXPECT_FALSE(scheduler.has_block_table(idx1));
         }
-        
+
         for (auto seq: requests) {
             std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences();
             seq->finish_iteration();
         }
-        
+
         // finish first sequence
         requests[0]->get_running_sequences()[0]->set_status(SequenceStatus::FINISHED);
         scheduler.free_sequence(idx0);
@@ -496,11 +498,11 @@ TEST(TestScheduler, test_partially_preempted_prompt) {
 
         if (scheduler_config.dynamic_split_fuse) {
             // remaining part of prompt should be scheduled
-            EXPECT_EQ(out3.m_total_num_scheduled_tokens, 4); 
+            EXPECT_EQ(out3.m_total_num_scheduled_tokens, 4);
         }
         else {
             // prompt should be fully scheduled
-            EXPECT_EQ(out3.m_total_num_scheduled_tokens, 12); 
+            EXPECT_EQ(out3.m_total_num_scheduled_tokens, 12);
         }
 
         EXPECT_EQ(out3.m_block_tables[idx1][0][0]->get_index(), 3);
@@ -541,16 +543,14 @@ TEST(TestScheduler, prefix_caching_test) {
             std::vector<uint64_t> tokens = histrory_tokens;
             tokens.insert(tokens.end(), prompt_tokens.begin(), prompt_tokens.end());
             SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                                    ov::genai::greedy(), 4, 
-                                                                                    scheduler_config.enable_prefix_caching);
-            sequence_group->set_sequence_group_ptr(sequence_group);
+                                                                                    ov::genai::greedy(), 4);
             scheduler.restore_cached_blocks(sequence_group);
             std::vector<SequenceGroup::Ptr> requests = {sequence_group};
 
             auto out1 = scheduler.schedule(requests);
             if (chat_iteration == 0)
                 EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size());
-            else 
+            else
                 EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size() + 1);
             for (auto seq: requests) {
                 std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences();
@@ -604,14 +604,10 @@ TEST(TestScheduler, prefix_caching_test_two_identical_sequences) {
             std::vector<uint64_t> tokens = histrory_tokens;
             tokens.insert(tokens.end(), prompt_tokens.begin(), prompt_tokens.end());
             SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                                    ov::genai::greedy(), 4, 
-                                                                                    scheduler_config.enable_prefix_caching);
+                                                                                    ov::genai::greedy(), 4);
 
             SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                                    ov::genai::greedy(), 4, 
-                                                                                    scheduler_config.enable_prefix_caching);
-            sequence_group1->set_sequence_group_ptr(sequence_group1);
-            sequence_group2->set_sequence_group_ptr(sequence_group2);
+                                                                                    ov::genai::greedy(), 4);
             std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
             // restore cached blocks
             for (auto request: requests) {
@@ -622,7 +618,7 @@ TEST(TestScheduler, prefix_caching_test_two_identical_sequences) {
             auto out1 = scheduler.schedule(requests);
             if (chat_iteration == 0)
                 EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size() * 2);
-            else 
+            else
                 EXPECT_EQ(out1.m_total_num_scheduled_tokens, (prompt_tokens.size() + 1) * 2);
             for (auto seq: requests) {
                 std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences();
@@ -650,7 +646,7 @@ TEST(TestScheduler, prefix_caching_test_two_identical_sequences) {
                 scheduler.free_sequence(idx0);
             }
             auto generated_ids = requests[0]->get_sequences()[0]->get_generated_ids();
-            
+
             histrory_tokens.insert(histrory_tokens.end(), prompt_tokens.begin(), prompt_tokens.end());
             histrory_tokens.insert(histrory_tokens.end(), generated_ids.begin(), generated_ids.end());
         }
@@ -676,10 +672,8 @@ TEST(TestScheduler, prefix_caching_with_max_new_tokens_equal_1) {
 
         for (size_t chat_iteration = 0; chat_iteration < chat_iterations; chat_iteration++) {
             SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {prompt_tokens.size()}, prompt_tokens.data()),
-                                                                                    ov::genai::greedy(), 32, 
-                                                                                    scheduler_config.enable_prefix_caching);
+                                                                                    ov::genai::greedy(), 32);
 
-            sequence_group->set_sequence_group_ptr(sequence_group);
             std::vector<SequenceGroup::Ptr> requests = {sequence_group};
             // restore cached blocks
             for (auto request: requests) {
@@ -690,7 +684,7 @@ TEST(TestScheduler, prefix_caching_with_max_new_tokens_equal_1) {
             auto out1 = scheduler.schedule(requests);
             if (chat_iteration == 0)
                 EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size());
-            else 
+            else
                 EXPECT_EQ(out1.m_total_num_scheduled_tokens, 1);
             for (auto seq: requests) {
                 std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences();
@@ -721,10 +715,10 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed) {
 
     std::vector<uint64_t> tokens = {0,1,2,3,4,5,6,7,8,9,10,11};
     SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                            ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                            ov::genai::greedy(), 4);
     auto idx0 = (*sequence_group1)[0]->get_id();
     SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                            ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                            ov::genai::greedy(), 4);
     auto idx1 = (*sequence_group2)[0]->get_id();
     std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
 
@@ -796,10 +790,10 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed2) {
 
     std::vector<uint64_t> tokens = {0,1,2,3,4,5,6,7,8,9};
     SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                            ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                            ov::genai::greedy(), 4);
     auto idx0 = (*sequence_group1)[0]->get_id();
     SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
-                                                                            ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
+                                                                            ov::genai::greedy(), 4);
     auto idx1 = (*sequence_group2)[0]->get_id();
     std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
 
@@ -909,12 +903,11 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) {
                                                                          ov::Tensor(ov::element::i64, {tokens1.size()},
                                                                                     tokens1.data()),
                                                                          ov::genai::greedy(),
-                                                                         2,
-                                                                         scheduler_config.enable_prefix_caching);
+                                                                         2);
     std::vector<uint64_t> tokens2 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // 5 full blocks, larger than eviction arena size (3 blocks) - will start evicting already at prompt stage
     auto idx1 = (*sequence_group1)[0]->get_id();
     SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens2.size()}, tokens2.data()),
-                                                                         ov::genai::greedy(), 2, scheduler_config.enable_prefix_caching);
+                                                                         ov::genai::greedy(), 2);
     auto idx2 = (*sequence_group2)[0]->get_id();
     std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
 
diff --git a/tests/cpp/speculative_decoding.cpp b/tests/cpp/speculative_decoding.cpp
index bb10c2cc8f..1cf8db0fab 100644
--- a/tests/cpp/speculative_decoding.cpp
+++ b/tests/cpp/speculative_decoding.cpp
@@ -20,9 +20,7 @@ class CBForSDTest : public testing::Test, public ov::genai::ContinuousBatchingPi
 
             ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared<ov::genai::SequenceGroup>(request_id, input_ids,
                                                                                 sampling_params, 
-                                                                                32,
-                                                                                true);
-            sequence_group->set_sequence_group_ptr(sequence_group);
+                                                                                32);
 
             {
                 std::lock_guard<std::mutex> lock{m_awaiting_requests_mutex};

From b041c1561ac91f5e9dca1ebbe53be9e9922469c8 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 31 Dec 2024 13:49:26 +0400
Subject: [PATCH 072/110] [Python API] Clean up utils (#1457)

Hide non-used functions from `src/python/py_utils.hpp`
---
 src/python/py_llm_pipeline.cpp | 2 +-
 src/python/py_utils.cpp        | 9 ++++++---
 src/python/py_utils.hpp        | 8 +-------
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp
index 7360975a0b..2d5e5e6abc 100644
--- a/src/python/py_llm_pipeline.cpp
+++ b/src/python/py_llm_pipeline.cpp
@@ -71,7 +71,7 @@ py::object call_common_generate(
         DecodedResults res = pipe.generate(string_input, updated_config, streamer);
         // If input was a string return a single string otherwise return DecodedResults.
         if (updated_config.has_value() && (*updated_config).num_return_sequences == 1) {
-            results = py::cast<py::object>(pyutils::handle_utf8(res.texts)[0]);
+            results = py::cast<py::object>(pyutils::handle_utf8(res.texts[0]));
         } else {
             results = py::cast(res);
         }
diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp
index 34522409ea..5fdf6adce1 100644
--- a/src/python/py_utils.cpp
+++ b/src/python/py_utils.cpp
@@ -37,6 +37,8 @@ py::list handle_utf8(const std::vector<std::string>& decoded_res) {
     return res;
 }
 
+namespace {
+
 bool py_object_is_any_map(const py::object& py_obj) {
     if (!py::isinstance<py::dict>(py_obj)) {
         return false;
@@ -290,7 +292,9 @@ ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) {
     OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type.");
 }
 
-std::map<std::string, ov::Any> properties_to_any_map(const std::map<std::string, py::object>& properties) {
+} // namespace
+
+ov::AnyMap properties_to_any_map(const std::map<std::string, py::object>& properties) {
     std::map<std::string, ov::Any> properties_to_cpp;
     for (const auto& property : properties) {
         properties_to_cpp[property.first] = py_object_to_any(property.second, property.first);
@@ -298,7 +302,6 @@ std::map<std::string, ov::Any> properties_to_any_map(const std::map<std::string,
     return properties_to_cpp;
 }
 
-
 ov::AnyMap kwargs_to_any_map(const py::kwargs& kwargs) {
     ov::AnyMap params = {};
 
@@ -356,7 +359,7 @@ ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::O
         return std::nullopt;
 
     ov::genai::GenerationConfig res_config;
-    if(config.has_value())
+    if (config.has_value())
         res_config = *config;
 
     if (!kwargs.empty())
diff --git a/src/python/py_utils.hpp b/src/python/py_utils.hpp
index 20094196a6..feb9920dbe 100644
--- a/src/python/py_utils.hpp
+++ b/src/python/py_utils.hpp
@@ -28,13 +28,7 @@ py::list handle_utf8(const std::vector<std::string>& decoded_res);
 
 py::str handle_utf8(const std::string& text);
 
-ov::Any py_object_to_any(const py::object& py_obj, std::string property_name);
-
-bool py_object_is_any_map(const py::object& py_obj);
-
-ov::AnyMap py_object_to_any_map(const py::object& py_obj);
-
-std::map<std::string, ov::Any> properties_to_any_map(const std::map<std::string, py::object>& properties);
+ov::AnyMap properties_to_any_map(const std::map<std::string, py::object>& properties);
 
 ov::AnyMap kwargs_to_any_map(const py::kwargs& kwargs);
 

From afb4ad0735abd237b21156d3ad304add141ddd8b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 31 Dec 2024 15:51:17 +0400
Subject: [PATCH 073/110] Removed WAs for OpenVINO: pass properties as is
 (#1396)

Merge after https://github.com/openvinotoolkit/openvino/pull/28066
---
 .github/workflows/causal_lm_cpp.yml           |  8 ++--
 .github/workflows/job_vlm_sample_llava.yml    |  2 +-
 .github/workflows/lcm_dreamshaper_cpp.yml     |  4 +-
 .github/workflows/linux.yml                   |  4 +-
 .github/workflows/mac.yml                     |  6 +--
 .../workflows/stable_diffusion_1_5_cpp.yml    |  4 ++
 .github/workflows/windows.yml                 |  6 +--
 CMakeLists.txt                                |  1 -
 samples/export-requirements.txt               |  1 +
 src/cpp/CMakeLists.txt                        |  5 ++-
 src/cpp/src/continuous_batching_adapter.hpp   |  4 +-
 src/cpp/src/continuous_batching_impl.cpp      | 10 ++---
 src/cpp/src/continuous_batching_pipeline.cpp  |  5 +--
 .../models/autoencoder_kl.cpp                 | 12 ++----
 .../models/clip_text_model.cpp                |  6 +--
 .../clip_text_model_with_projection.cpp       |  6 +--
 .../models/flux_transformer_2d_model.cpp      |  5 +--
 .../models/sd3_transformer_2d_model.cpp       |  5 +--
 .../models/t5_encoder_model.cpp               | 10 ++---
 .../models/unet2d_condition_model.cpp         |  6 +--
 .../models/unet_inference_dynamic.hpp         |  5 +--
 src/cpp/src/llm_pipeline.cpp                  | 43 ++++++++++---------
 src/cpp/src/llm_pipeline_stateful.cpp         | 19 ++++----
 src/cpp/src/llm_pipeline_static.cpp           |  8 ++--
 .../speculative_decoding_impl.cpp             | 19 ++++----
 src/cpp/src/tokenizer.cpp                     | 37 ++++++++--------
 src/cpp/src/tokenizers_path.hpp               |  6 +--
 src/cpp/src/utils.cpp                         | 29 -------------
 src/cpp/src/utils.hpp                         |  3 --
 .../src/visual_language/embedding_model.cpp   |  2 +-
 .../src/visual_language/processor_config.cpp  |  2 +-
 src/cpp/src/visual_language/vlm_config.cpp    |  2 +-
 src/cpp/src/whisper_pipeline.cpp              | 17 ++++----
 src/cpp/src/whisper_pipeline_static.cpp       |  6 +--
 src/python/py_utils.cpp                       |  9 ++--
 src/python/py_utils.hpp                       |  4 +-
 tests/python_tests/requirements.txt           |  4 +-
 37 files changed, 141 insertions(+), 184 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 4aad3d4bc3..fb0c9c4b0b 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -16,10 +16,10 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241224_x86_64.tgz
-  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
-  m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241224_x86_64.tgz
-  w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
+  l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241230_x86_64.tgz
+  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241230_x86_64.tgz
+  m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241230_x86_64.tgz
+  w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/w_openvino_toolkit_windows_2025.0.0.dev20241230_x86_64.zip
 jobs:
   cpp-multinomial-greedy_causal_lm-ubuntu:
     runs-on: ubuntu-20.04-8-cores
diff --git a/.github/workflows/job_vlm_sample_llava.yml b/.github/workflows/job_vlm_sample_llava.yml
index 5f4634616a..781526f71f 100644
--- a/.github/workflows/job_vlm_sample_llava.yml
+++ b/.github/workflows/job_vlm_sample_llava.yml
@@ -11,7 +11,7 @@ on:
         type: string
 
 env:
-  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
+  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241230_x86_64.tgz
 
 jobs:
   visual_language_chat_sample-ubuntu-llava:
diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml
index c525b0be68..cbd847240d 100644
--- a/.github/workflows/lcm_dreamshaper_cpp.yml
+++ b/.github/workflows/lcm_dreamshaper_cpp.yml
@@ -18,8 +18,8 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.9'
-  LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
-  WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
+  LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241230_x86_64.tgz
+  WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17726-9ab2c1a18e7/w_openvino_toolkit_windows_2025.0.0.dev20241230_x86_64.zip
   OV_INSTALL_DIR: ${{ github.workspace }}/ov
 
 jobs:
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 9b21491f9b..0a991e2a54 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -109,10 +109,10 @@ jobs:
           merge-multiple: true
 
       - name: CMake Build
-        run: |    
+        run: |
           source ${{ env.OV_INSTALL_DIR }}/setupvars.sh
           cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ${{ env.SRC_DIR}} -B ${{ env.BUILD_DIR }}
-          cmake --build ${{ env.BUILD_DIR}} --config ${{ matrix.build-type }} --parallel $(nproc)
+          cmake --build ${{ env.BUILD_DIR}} --config ${{ matrix.build-type }} --parallel $(nproc) --verbose
           cmake --install ${{ env.BUILD_DIR }} --config ${{ matrix.build-type }} --prefix ${{ env.INSTALL_DIR }}
       
       - name: Pack Artifacts
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 4d9b7f032b..fb66271ff7 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -219,7 +219,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
-          cmake --build ./build/ --config Release -j
+          cmake --build ./build/ --config Release --parallel --verbose
 
       - name: Test bindings
         run: |
@@ -284,7 +284,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
-          cmake --build ./build/ --config Release --target py_openvino_genai -j
+          cmake --build ./build/ --config Release --target py_openvino_genai --parallel --verbose
 
       - name: Test bindings
         run: |
@@ -350,7 +350,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/
-          cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j
+          cmake --build ./build/ --config ${{ matrix.build-type }} --target package --parallel --verbose
 
       - name: Build and Install dependencies
         run: |
diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml
index 34c5a0f87e..e0bf5371b3 100644
--- a/.github/workflows/stable_diffusion_1_5_cpp.yml
+++ b/.github/workflows/stable_diffusion_1_5_cpp.yml
@@ -122,6 +122,8 @@ jobs:
           source openvino_sd_cpp/bin/activate
           optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --weight-format fp16 --task stable-diffusion models/dreamlike-art-dreamlike-anime-1.0/FP16
           wget -O ./models/soulcard.safetensors https://civitai.com/api/download/models/72591
+        env:
+          HF_HUB_ENABLE_HF_TRANSFER: 1
 
       - name: Run text2image app
         run: |
@@ -198,6 +200,8 @@ jobs:
           . "./openvino_sd_cpp/Scripts/Activate.ps1"
           optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike-art-dreamlike-anime-1.0/FP16
           Invoke-WebRequest -Uri 'https://civitai.com/api/download/models/72591' -OutFile 'models/soulcard.safetensors'
+        env:
+          HF_HUB_ENABLE_HF_TRANSFER: 1
 
       - name: Run text2image app
         run: |
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index fc63129281..e396671b2c 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -230,7 +230,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
-          cmake --build ./build/ --config Release -j
+          cmake --build ./build/ --config Release --parallel --verbose
 
       - name: Test bindings
         run: |
@@ -295,7 +295,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
-          cmake --build ./build/ --config Release --target py_openvino_genai -j
+          cmake --build ./build/ --config Release --target py_openvino_genai --parallel --verbose
 
       - name: Test bindings
         run: |
@@ -360,7 +360,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
-          cmake --build ./build/ --config Release --target py_openvino_genai -j
+          cmake --build ./build/ --config Release --target py_openvino_genai --parallel --verbose
 
       - name: Test bindings
         run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fec8df34af..181132e210 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,7 +85,6 @@ if(MSVC AND MSVC_VERSION GREATER_EQUAL 1930 AND MSVC_VERSION LESS 1941)
     add_compile_definitions(_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)
 endif()
 
-
 add_subdirectory(thirdparty)
 add_subdirectory(src)
 if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/samples")
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index a589696beb..af38558656 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -10,3 +10,4 @@ diffusers==0.32.1 # For image generation pipelines
 timm==1.0.12  # For exporting InternVL2
 torchvision  # For visual language models
 transformers>=4.43 # For Whisper
+hf_transfer # for faster models download, should used with env var HF_HUB_ENABLE_HF_TRANSFER=1
\ No newline at end of file
diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index d02f32ded9..24367c17ce 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -59,11 +59,13 @@ ov_genai_build_jinja2cpp()
 file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")
 
 set(TARGET_NAME openvino_genai)
+
 add_library(${TARGET_NAME} SHARED ${SOURCE_FILES})
+add_library(openvino::genai ALIAS ${TARGET_NAME})
+
 if(TARGET openvino_tokenizers)
     add_dependencies(${TARGET_NAME} openvino_tokenizers)
 endif()
-add_library(openvino::genai ALIAS ${TARGET_NAME})
 
 target_include_directories(${TARGET_NAME}
     PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>"
@@ -81,6 +83,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES
     LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
     RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
 )
+
 # Extract two last digits from OpenVINOGenAI_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols.
 string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${OpenVINOGenAI_VERSION_MAJOR})
 if(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME AND LINUX)
diff --git a/src/cpp/src/continuous_batching_adapter.hpp b/src/cpp/src/continuous_batching_adapter.hpp
index 246cb51149..0b0065aa1f 100644
--- a/src/cpp/src/continuous_batching_adapter.hpp
+++ b/src/cpp/src/continuous_batching_adapter.hpp
@@ -33,7 +33,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         const std::string& device,
         const ov::AnyMap& plugin_config
     ): LLMPipelineImplBase{tokenizer, GenerationConfig()}, m_impl{
-        models_path.string(),
+        models_path,
         tokenizer,
         scheduler_config,
         device,
@@ -64,7 +64,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         const std::string& device,
         const ov::AnyMap& plugin_config
     ): LLMPipelineImplBase{Tokenizer(models_path), GenerationConfig()}, m_impl{
-        models_path.string(),
+        models_path,
         m_tokenizer,
         scheduler_config,
         device,
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 3ab242418e..15c0e69d58 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -23,17 +23,13 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
     m_generation_config = generation_config;
     m_is_validation_mode_enabled = is_validation_mode_enabled;
 
-    ov::Core core;
-
-    auto [core_properties, compile_properties] = utils::split_core_compile_config(properties);
-    core.set_property(core_properties);
-
-    DeviceConfig device_config(core, scheduler_config, device, compile_properties);
+    ov::Core core = utils::singleton_core();
+    DeviceConfig device_config(core, scheduler_config, device, properties);
 
     bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
     utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control);
 
-    initialize_pipeline(model, scheduler_config, compile_properties, device_config, core);
+    initialize_pipeline(model, scheduler_config, properties, device_config, core);
 }
 
 void ContinuousBatchingPipeline::ContinuousBatchingImpl::_pull_awaiting_requests() {
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index 8b7003e4ab..c1c0677ff3 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -48,8 +48,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
     auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
 
-    std::filesystem::path openvino_model_name = "openvino_model.xml";
-    auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+    auto model = utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, properties);
     auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties);
     auto generation_config = utils::from_config_json_if_exists(models_path);
 
@@ -74,7 +73,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
     std::filesystem::path openvino_model_name = "openvino_model.xml";
-    auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+    auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, properties_without_draft_model);
     auto generation_config = utils::from_config_json_if_exists(models_path);
 
     if (is_prompt_lookup_enabled) {
diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
index d3dd7324ee..ab8b87a13e 100644
--- a/src/cpp/src/image_generation/models/autoencoder_kl.cpp
+++ b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
@@ -91,8 +91,7 @@ AutoencoderKL::Config::Config(const std::filesystem::path& config_path) {
 
 AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_decoder_path)
     : m_config(vae_decoder_path / "config.json") {
-    ov::Core core = utils::singleton_core();
-    m_decoder_model = core.read_model((vae_decoder_path / "openvino_model.xml").string());
+    m_decoder_model = utils::singleton_core().read_model(vae_decoder_path / "openvino_model.xml");
     // apply VaeImageProcessor postprocessing steps by merging them into the VAE decoder model
     merge_vae_image_post_processing();
 }
@@ -100,8 +99,7 @@ AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_decoder_path)
 AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_encoder_path,
                              const std::filesystem::path& vae_decoder_path)
     : AutoencoderKL(vae_decoder_path) {
-    ov::Core core = utils::singleton_core();
-    m_encoder_model = core.read_model((vae_encoder_path / "openvino_model.xml").string());
+    m_encoder_model = utils::singleton_core().read_model(vae_encoder_path / "openvino_model.xml");
 }
 
 AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_decoder_path,
@@ -131,8 +129,7 @@ AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model,
                              const Tensor& vae_decoder_weights,
                              const Config& vae_decoder_config)
     : m_config(vae_decoder_config) {
-    ov::Core core = utils::singleton_core();
-    m_decoder_model = core.read_model(vae_decoder_model, vae_decoder_weights);
+    m_decoder_model = utils::singleton_core().read_model(vae_decoder_model, vae_decoder_weights);
     // apply VaeImageProcessor postprocessing steps by merging them into the VAE decoder model
     merge_vae_image_post_processing();
 }
@@ -143,8 +140,7 @@ AutoencoderKL::AutoencoderKL(const std::string& vae_encoder_model,
                              const Tensor& vae_decoder_weights,
                              const Config& vae_decoder_config)
     : AutoencoderKL(vae_decoder_model, vae_decoder_weights, vae_decoder_config) {
-    ov::Core core = utils::singleton_core();
-    m_encoder_model = core.read_model(vae_encoder_model, vae_encoder_weights);
+    m_encoder_model = utils::singleton_core().read_model(vae_encoder_model, vae_encoder_weights);
 }
 
 AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model,
diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp
index 72fdc63082..c49bd5f000 100644
--- a/src/cpp/src/image_generation/models/clip_text_model.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model.cpp
@@ -37,8 +37,7 @@ CLIPTextModel::Config::Config(const std::filesystem::path& config_path) {
 CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir) :
     m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)),
     m_config(root_dir / "config.json") {
-    ov::Core core = utils::singleton_core();
-    m_model = core.read_model((root_dir / "openvino_model.xml").string());
+    m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
 }
 
 CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir,
@@ -53,8 +52,7 @@ CLIPTextModel::CLIPTextModel(const std::string& model,
                              const Config& config,
                              const Tokenizer& clip_tokenizer) :
     m_clip_tokenizer(clip_tokenizer), m_config(config) {
-    ov::Core core = utils::singleton_core();
-    m_model = core.read_model(model, weights);
+    m_model = utils::singleton_core().read_model(model, weights);
 }
 
 CLIPTextModel::CLIPTextModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
index 1160c30b6a..eb9289ab3e 100644
--- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
@@ -28,8 +28,7 @@ CLIPTextModelWithProjection::Config::Config(const std::filesystem::path& config_
 CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir) :
     m_clip_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)),
     m_config(root_dir / "config.json") {
-    ov::Core core = utils::singleton_core();
-    m_model = core.read_model((root_dir / "openvino_model.xml").string());
+    m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
 }
 
 CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir,
@@ -44,8 +43,7 @@ CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& mode
                                                          const Config& config,
                                                          const Tokenizer& clip_tokenizer) :
     m_clip_tokenizer(clip_tokenizer), m_config(config) {
-    ov::Core core = utils::singleton_core();
-    m_model = core.read_model(model, weights);
+    m_model = utils::singleton_core().read_model(model, weights);
 }
 
 CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
index b09f099655..648eda8ff2 100644
--- a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
@@ -26,7 +26,7 @@ FluxTransformer2DModel::Config::Config(const std::filesystem::path& config_path)
 
 FluxTransformer2DModel::FluxTransformer2DModel(const std::filesystem::path& root_dir)
     : m_config(root_dir / "config.json") {
-    m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string());
+    m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
     m_vae_scale_factor = ov::genai::get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
 }
 
@@ -42,8 +42,7 @@ FluxTransformer2DModel::FluxTransformer2DModel(const std::string& model,
                                                const Config& config,
                                                const size_t vae_scale_factor) :
     m_config(config), m_vae_scale_factor(vae_scale_factor) {
-    ov::Core core = utils::singleton_core();
-    m_model = core.read_model(model, weights);
+    m_model = utils::singleton_core().read_model(model, weights);
 }
 
 FluxTransformer2DModel::FluxTransformer2DModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
index 33771f2316..0a7865b07a 100644
--- a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
@@ -28,7 +28,7 @@ SD3Transformer2DModel::Config::Config(const std::filesystem::path& config_path)
 
 SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir)
     : m_config(root_dir / "config.json") {
-    m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string());
+    m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
     m_vae_scale_factor = get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
 }
 
@@ -44,8 +44,7 @@ SD3Transformer2DModel::SD3Transformer2DModel(const std::string& model,
                                              const Config& config,
                                              const size_t vae_scale_factor) :
     m_config(config), m_vae_scale_factor(vae_scale_factor) {
-    ov::Core core = utils::singleton_core();
-    m_model = core.read_model(model, weights);
+    m_model = utils::singleton_core().read_model(model, weights);
 }
 
 SD3Transformer2DModel::SD3Transformer2DModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/t5_encoder_model.cpp b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
index a83697b2e6..32ae326eca 100644
--- a/src/cpp/src/image_generation/models/t5_encoder_model.cpp
+++ b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
@@ -16,8 +16,7 @@ std::filesystem::path get_tokenizer_path_by_text_encoder(const std::filesystem::
 
 T5EncoderModel::T5EncoderModel(const std::filesystem::path& root_dir) :
     m_tokenizer(get_tokenizer_path_by_text_encoder(root_dir)) {
-    ov::Core core = utils::singleton_core();
-    m_model = core.read_model((root_dir / "openvino_model.xml").string());
+    m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
 }
 
 T5EncoderModel::T5EncoderModel(const std::filesystem::path& root_dir,
@@ -31,8 +30,7 @@ T5EncoderModel::T5EncoderModel(const std::string& model,
                                const Tensor& weights,
                                const Tokenizer& tokenizer) :
     m_tokenizer(tokenizer) {
-    ov::Core core = utils::singleton_core();
-    m_model = core.read_model(model, weights);
+    m_model = utils::singleton_core().read_model(model, weights);
 }
 
 T5EncoderModel::T5EncoderModel(const std::string& model,
@@ -60,9 +58,7 @@ T5EncoderModel& T5EncoderModel::reshape(int batch_size, int max_sequence_length)
 
 T5EncoderModel& T5EncoderModel::compile(const std::string& device, const ov::AnyMap& properties) {
     OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
-    ov::Core core = utils::singleton_core();
-    ov::CompiledModel compiled_model;
-    compiled_model = core.compile_model(m_model, device, properties);
+    ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, properties);
     ov::genai::utils::print_compiled_model_properties(compiled_model, "T5 encoder model");
     m_request = compiled_model.create_infer_request();
     // release the original model
diff --git a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
index ca65c9d9d6..ef35709761 100644
--- a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
+++ b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
@@ -30,8 +30,7 @@ UNet2DConditionModel::Config::Config(const std::filesystem::path& config_path) {
 
 UNet2DConditionModel::UNet2DConditionModel(const std::filesystem::path& root_dir) :
     m_config(root_dir / "config.json") {
-    ov::Core core = utils::singleton_core();
-    m_model = core.read_model((root_dir / "openvino_model.xml").string());
+    m_model = utils::singleton_core().read_model(root_dir / "openvino_model.xml");
     m_vae_scale_factor = get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
 }
 
@@ -47,8 +46,7 @@ UNet2DConditionModel::UNet2DConditionModel(const std::string& model,
                                            const Config& config,
                                            const size_t vae_scale_factor) :
     m_config(config), m_vae_scale_factor(vae_scale_factor) {
-    ov::Core core = utils::singleton_core();
-    m_model = core.read_model(model, weights);
+    m_model = utils::singleton_core().read_model(model, weights);
 }
 
 UNet2DConditionModel::UNet2DConditionModel(const std::string& model,
diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
index 914fbcf50b..dd265e3eca 100644
--- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
@@ -10,13 +10,10 @@
 namespace ov {
 namespace genai {
 
-
 class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::UNetInference {
 public:
     virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override {
-        ov::Core core = utils::singleton_core();
-
-        ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
+        ov::CompiledModel compiled_model = utils::singleton_core().compile_model(model, device, properties);
         ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition dynamic model");
         m_request = compiled_model.create_infer_request();
     }
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 5022595da1..0125479f92 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -64,7 +64,7 @@ std::pair<std::string, Any> draft_model(
     auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
     
     std::filesystem::path openvino_model_name = "openvino_model.xml";
-    auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+    auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
     auto generation_config = utils::from_config_json_if_exists(models_path);
     auto tokenizer = ov::genai::Tokenizer(models_path);
     return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
@@ -115,19 +115,20 @@ ov::genai::LLMPipeline::LLMPipeline(
 ov::genai::LLMPipeline::LLMPipeline(
     const std::filesystem::path& models_path,
     const std::string& device,
-    const ov::AnyMap& config) {
+    const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
 
-    if (config.find(ov::genai::scheduler_config.name()) != config.end() || 
-        config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end() || 
-        config.find(ov::genai::prompt_lookup.name()) != config.end()) {
-        auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config);
-        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, plugin_config);
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || 
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+        properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
+        auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
+        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
     } else if (device == "NPU") {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
+        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, properties);
     } else {
-        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, config);
+        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
     }
+
     m_pimpl->save_load_time(start_time);
 }
 
@@ -136,18 +137,17 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::Tensor& weights_tensor,
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
-    const ov::AnyMap& config,
+    const ov::AnyMap& properties,
     const ov::genai::GenerationConfig& generation_config) {
-    auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config);
-
     auto start_time = std::chrono::steady_clock::now();
-    if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end() || 
-        plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end() || 
-        plugin_config.find(ov::genai::prompt_lookup.name()) != plugin_config.end()){
 
-        auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config);
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || 
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+        properties.find(ov::genai::prompt_lookup.name()) != properties.end()){
+
+        auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
-                                                              tokenizer, scheduler_config, device, plugin_config_, generation_config);
+                                                              tokenizer, scheduler_config, device, device_properties, generation_config);
     } else if (device == "NPU") {
         // TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
         // NPU reads some properties from the config file, but when LLMPipeline is initialized 
@@ -160,24 +160,25 @@ ov::genai::LLMPipeline::LLMPipeline(
         //                                      {"num_key_value_heads", 32}};
         // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
         // This will convert from AnyMap to ModelDesc.
-        auto [properties, model_descr] = split_model_descr(plugin_config);
+        auto [filtered_properties, model_descr] = split_model_descr(properties);
 
         m_pimpl = std::make_unique<StaticLLMPipeline>(
             utils::singleton_core().read_model(model_str, weights_tensor), 
             model_descr,
             tokenizer,
             device,
-            properties,
+            filtered_properties,
             generation_config
         );
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(
-            utils::singleton_core().read_model(model_str, weights_tensor), 
+            utils::singleton_core().read_model(model_str, weights_tensor),
             tokenizer,
             device,
-            plugin_config,
+            properties,
             generation_config);
     }
+
     m_pimpl->save_load_time(start_time);
 }
 
diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
index cbcca62978..890afe2ab9 100644
--- a/src/cpp/src/llm_pipeline_stateful.cpp
+++ b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -7,7 +7,7 @@
 #include "lora_helper.hpp"
 #include "lm_encoding.hpp"
 #include "text_callback_streamer.hpp"
-
+#include "utils.hpp"
 
 namespace ov::genai {
 
@@ -22,12 +22,12 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     const std::filesystem::path& models_path,
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
-    const ov::AnyMap& plugin_config)
+    const ov::AnyMap& properties)
     : StatefulLLMPipeline{
-        ov::genai::utils::read_model_with_config(models_path, plugin_config),
+        utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, properties),
         tokenizer,
         device,
-        plugin_config,
+        properties,
         utils::from_config_json_if_exists(models_path)
     } {}
 
@@ -35,21 +35,20 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     const std::shared_ptr<ov::Model>& model,
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
-    const ov::AnyMap& config,
+    const ov::AnyMap& properties,
     const ov::genai::GenerationConfig& generation_config)
     : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
-    ov::CompiledModel compiled_model;
-    auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
     utils::slice_matmul_stateful_model(model);
     m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
 
-    if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
+    ov::CompiledModel compiled_model;
+    if (auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters)) {
         m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
         m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device);   // TODO: Make the prefix name configurable
-        compiled_model = utils::singleton_core().compile_model(model, device, *filtered_plugin_config);
+        compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
         m_model_runner = compiled_model.create_infer_request();
     } else {
-        compiled_model = utils::singleton_core().compile_model(model, device, plugin_config);
+        compiled_model = utils::singleton_core().compile_model(model, device, properties);
         m_model_runner = compiled_model.create_infer_request();
     }
     ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 6f4f124894..e163dce2df 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -398,7 +398,7 @@ KVAxesPosition get_kv_axes(const std::string& model_type) {
 
 ov::genai::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) {
     std::ifstream file(filepath);
-    OPENVINO_ASSERT(file.is_open(), "Could not open file: " + filepath.string());
+    OPENVINO_ASSERT(file.is_open(), "Could not open file: ", filepath);
     nlohmann::json config_data = nlohmann::json::parse(file);
 
     ov::genai::ModelConfigDesc desc;
@@ -660,7 +660,7 @@ StaticLLMPipeline::StaticLLMPipeline(
     const auto use_blobs = pop_or_default(properties, "USE_BLOBS", false);
     if (!use_blobs) {
         ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
-        auto model = genai::utils::singleton_core().read_model((models_path / "openvino_model.xml").string());
+        auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, properties);
         setupAndCompileModels(model, device, model_desc, properties);
     } else {
         setupAndImportModels(models_path, device, properties);
@@ -727,7 +727,7 @@ void StaticLLMPipeline::setupAndCompileModels(
         9) Compile both models
     */
 
-    ov::Core core;
+    ov::Core core = utils::singleton_core();
 
     // NB: Get information about NPU if available
     auto npudesc = extract_npu_descriptor(core);
@@ -802,7 +802,7 @@ void StaticLLMPipeline::setupAndImportModels(
         3) Import generate model from model directory or specified path
         4) Fill in m_kvcache_desc
     */
-    ov::Core core;
+    ov::Core core = utils::singleton_core();
 
     auto import_blob = [this,
                         &models_path,
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
index 4021742961..f749ac4e81 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -25,10 +25,6 @@ bool are_tokenizers_equal(Tokenizer& lhs, Tokenizer& rhs) {
 
 ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, 
                                                                              const ov::genai::ModelDesc& draft_model_desc) {
-    ov::Core core;
-    auto [core_properties, compile_properties] = utils::split_core_compile_config(main_model_desc.properties);
-    core.set_property(core_properties);
-
     auto main_model = main_model_desc.model;
     auto draft_model = draft_model_desc.model;
 
@@ -39,12 +35,12 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
     utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction);
 
     std::string draft_device = draft_model_desc.device.empty() ? main_model_desc.device : draft_model_desc.device;
-
-    bool is_scheduler_undefined = draft_model_desc.scheduler_config == SchedulerConfig();
+    bool is_draft_scheduler_undefined = draft_model_desc.scheduler_config == SchedulerConfig();
 
     ov::genai::SchedulerConfig main_scheduler_config_updated = main_scheduler_config,
-                               draft_scheduler_config = is_scheduler_undefined ? main_scheduler_config : draft_model_desc.scheduler_config;
-    if (is_scheduler_undefined) {
+                               draft_scheduler_config = is_draft_scheduler_undefined ? main_scheduler_config : draft_model_desc.scheduler_config;
+
+    if (is_draft_scheduler_undefined) {
         // split KV cache to 2 caches for main and draft models
         size_t main_model_hidden_size = utils::get_hidden_size(main_model),
                draft_model_hidden_size = utils::get_hidden_size(draft_model);
@@ -61,9 +57,10 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
         draft_scheduler_config.cache_size = draft_cache_size;
     }
 
-    ov::AnyMap draft_properties = draft_model_desc.properties == ov::AnyMap{} ? compile_properties : draft_model_desc.properties;
+    ov::AnyMap draft_properties = draft_model_desc.properties.empty() ? main_model_desc.properties : draft_model_desc.properties;
 
-    DeviceConfig main_device_config(core, main_scheduler_config_updated, main_device, compile_properties),
+    ov::Core core = utils::singleton_core();
+    DeviceConfig main_device_config(core, main_scheduler_config_updated, main_device, main_model_desc.properties),
                  draft_device_config(core, draft_scheduler_config, draft_device, draft_properties);
 
     utils::set_kv_cache_type_and_shape(main_model, main_device_config);
@@ -82,7 +79,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
     // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode
     m_main_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
         main_model, main_model_tokenizer, main_model_desc.generation_config,
-        main_device_config, main_scheduler_config_updated, main_device, compile_properties, true);
+        main_device_config, main_scheduler_config_updated, main_device, main_model_desc.properties, true);
     m_draft_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
         draft_model, draft_model_tokenizer, draft_model_desc.generation_config,
         draft_device_config, draft_scheduler_config, draft_device, draft_properties, false);
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 82c0a17a55..e1def95931 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -148,18 +148,16 @@ class Tokenizer::TokenizerImpl {
         m_skip_special_tokens = skip_special_tokens_flag;
     }
 
-    TokenizerImpl() = default;
-
-    TokenizerImpl(const std::filesystem::path& models_path,  const ov::AnyMap& properties) {
+    TokenizerImpl(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
         setup_tokenizer(models_path, properties);
     }
 
-    TokenizerImpl(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models,  const ov::AnyMap& properties) {
+    TokenizerImpl(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
         setup_tokenizer(models, properties);
     }
 
     void setup_tokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
-        ScopedVar env_manager(tokenizers_relative_to_genai().string());
+        ScopedVar env_manager(tokenizers_relative_to_genai());
         auto core = get_core_singleton();
 
         OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_path' parameter should be a path to a dir not a xml file");
@@ -168,11 +166,11 @@ class Tokenizer::TokenizerImpl {
         std::shared_ptr<ov::Model> ov_detokenizer = nullptr;
 
         if (std::filesystem::exists(models_path / "openvino_tokenizer.xml")) {
-            ov_tokenizer = core.read_model(models_path / "openvino_tokenizer.xml");
+            ov_tokenizer = core.read_model(models_path / "openvino_tokenizer.xml", {}, properties);
         }
 
         if (std::filesystem::exists(models_path / "openvino_detokenizer.xml")) {
-            ov_detokenizer = core.read_model(models_path / "openvino_detokenizer.xml");
+            ov_detokenizer = core.read_model(models_path / "openvino_detokenizer.xml", {}, properties);
         }
 
         setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
@@ -242,10 +240,12 @@ class Tokenizer::TokenizerImpl {
             decode({1, 33, 199, 42, 42});
         }
 
-        utils::read_rt_info(ov_tokenizer, "chat_template", m_chat_template);
-        utils::read_rt_info(ov_tokenizer, "pad_token_id", m_pad_token_id);
-        utils::read_rt_info(ov_tokenizer, "bos_token_id", m_bos_token_id);
-        utils::read_rt_info(ov_tokenizer, "eos_token_id", m_eos_token_id);
+        if (m_tokenizer) {
+            utils::read_rt_info(ov_tokenizer, "chat_template", m_chat_template);
+            utils::read_rt_info(ov_tokenizer, "pad_token_id", m_pad_token_id);
+            utils::read_rt_info(ov_tokenizer, "bos_token_id", m_bos_token_id);
+            utils::read_rt_info(ov_tokenizer, "eos_token_id", m_eos_token_id);
+        }
 
         m_chat_template = patch_chat_template(m_chat_template);
         if (m_detokenizer) {
@@ -389,12 +389,13 @@ class Tokenizer::TokenizerImpl {
         OPENVINO_ASSERT(m_ireq_queue_tokenizer, "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. "
                                                 "Tokenizer::encode is not available");
 
-        CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
+        CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(m_ireq_queue_tokenizer.get());
         set_state_if_necessary(infer_request_guard, tokenization_params);
         size_t batch_size = 1;
         infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt});
         infer_request_guard.get().start_async();
         infer_request_guard.get().wait();
+
         return get_copied_results(
             infer_request_guard.get().get_output_tensor(0),
             infer_request_guard.get().get_output_tensor(1)
@@ -404,6 +405,7 @@ class Tokenizer::TokenizerImpl {
     TokenizedInputs encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params = {}) {
         OPENVINO_ASSERT(m_ireq_queue_tokenizer, "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. "
                                                 "Tokenizer::encode is not available");
+
         TokenizedInputs unpadded;
         {
             CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
@@ -418,6 +420,7 @@ class Tokenizer::TokenizerImpl {
                 infer_request_guard.get().get_output_tensor(1)
             );
         }
+
         return pad_left(unpadded.input_ids, unpadded.attention_mask);
     }
 
@@ -431,7 +434,7 @@ class Tokenizer::TokenizerImpl {
     }
 
     std::string decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params = {}) {
-        OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available");
+        OPENVINO_ASSERT(m_detokenizer, "Detokenizer model has not been provided. Tokenizer::decode is not available");
 
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get());
         set_state_if_necessary(infer_request_guard, detokenization_params);
@@ -443,7 +446,7 @@ class Tokenizer::TokenizerImpl {
     }
 
     std::vector<std::string> decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {}) {
-        OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available");
+        OPENVINO_ASSERT(m_detokenizer, "Detokenizer model has not been provided. Tokenizer::decode is not available");
         OPENVINO_ASSERT(tokens.get_element_type() == ov::element::i64, "tokens tensor element type should be an i64");
         OPENVINO_ASSERT(tokens.get_shape().size() == 2, "tokens tensor should of rank 2 with shape [batch_size, seq_len]");
 
@@ -459,7 +462,7 @@ class Tokenizer::TokenizerImpl {
     }
 
     std::vector<std::string> decode(std::vector<std::vector<int64_t>> lines, const ov::AnyMap& detokenization_params = {}) {
-        OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available");
+        OPENVINO_ASSERT(m_detokenizer, "Detokenizer model has not been provided. Tokenizer::decode is not available");
 
         auto compare_lengths = [](const std::vector<int64_t>& a, const std::vector<int64_t>& b) {
             return a.size() < b.size();
@@ -597,7 +600,7 @@ Tokenizer::Tokenizer(
     ov::Tensor&  detokenizer_weights_tensor,
     const ov::AnyMap& properties
 ) {
-    ScopedVar env_manager(tokenizers_relative_to_genai().string());
+    ScopedVar env_manager(tokenizers_relative_to_genai());
     auto core = get_core_singleton();
 
     auto ov_tokenizer = core.read_model(tokenizer_model_str, tokenizer_weights_tensor);
@@ -606,7 +609,7 @@ Tokenizer::Tokenizer(
 }
 
 Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties) {
-    ScopedVar env_manager(tokenizers_relative_to_genai().string());
+    ScopedVar env_manager(tokenizers_relative_to_genai());
     auto core = get_core_singleton();
     auto model = core.read_model(model_str, weights_tensor);
 
diff --git a/src/cpp/src/tokenizers_path.hpp b/src/cpp/src/tokenizers_path.hpp
index a8ef1cb214..489542f2aa 100644
--- a/src/cpp/src/tokenizers_path.hpp
+++ b/src/cpp/src/tokenizers_path.hpp
@@ -26,16 +26,16 @@ class ScopedVar {
 public:
     static constexpr char ENVIRONMENT_VARIABLE_NAME[] = "OPENVINO_TOKENIZERS_PATH_GENAI";
 
-    explicit ScopedVar(const std::string& environment_variable_value) {
+    explicit ScopedVar(const std::filesystem::path& environment_variable_value) {
 #ifdef _WIN32
         char* value = nullptr;
         size_t len = 0;
         _dupenv_s(&value, &len, ENVIRONMENT_VARIABLE_NAME);
         if (value == nullptr)
-            _putenv_s(ENVIRONMENT_VARIABLE_NAME, environment_variable_value.c_str());
+            _putenv_s(ENVIRONMENT_VARIABLE_NAME, environment_variable_value.string().c_str());
 #else
         if (!getenv(ENVIRONMENT_VARIABLE_NAME))
-            setenv(ENVIRONMENT_VARIABLE_NAME, environment_variable_value.c_str(), 1);
+            setenv(ENVIRONMENT_VARIABLE_NAME, environment_variable_value.string().c_str(), 1);
 #endif
         else
             was_already_set = true;
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index 83dbf15376..52faae02e9 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -200,27 +200,6 @@ ProcessorConfig from_any_map(
     return extracted_config;
 }
 
-/**
- * Split config by core and compile configs
- * There are not supported by `core.compile` function plugin options like `ENABLE_MMAP`
- * Move this options to `core.set_property` config
- */
-std::pair<ov::AnyMap, ov::AnyMap> split_core_compile_config(const ov::AnyMap& properties) {
-    const std::vector<std::string> unsupported_by_compile_properties{"ENABLE_MMAP"};
-    ov::AnyMap core_properties;
-    ov::AnyMap compile_properties{properties};
-
-    for (const auto option : unsupported_by_compile_properties) {
-        auto iter = properties.find(option);
-        if (iter != properties.end()) {
-            core_properties[option] = iter->second;
-            compile_properties.erase(option);
-        }
-    }
-
-    return {core_properties, compile_properties};
-};
-
 /**
  * scheduler_config is a separate config for continuous batching pipeline. 
  * This routine splits scheduler_config from plugin_config.
@@ -236,14 +215,6 @@ std::pair<ov::AnyMap, SchedulerConfig> split_scheduler_config(const ov::AnyMap&
     return {plugin_config, scheduler_config};
 };
 
-std::shared_ptr<ov::Model> read_model_with_config(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
-    auto [core_properties, compile_properties] = split_core_compile_config(properties);
-    ov::Core core;
-    core.set_property(core_properties);
-    std::filesystem::path openvino_model_name = "openvino_model.xml";
-    return core.read_model((models_path / openvino_model_name).string());
-}
-
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend) {
     auto minuend_size = minuend.input_ids.get_size();
     auto subtrahend_size = subtrahend.input_ids.get_size();
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 8f49bd471e..af9d889115 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -95,11 +95,8 @@ ProcessorConfig from_any_map(
 );
 
 
-std::pair<ov::AnyMap, ov::AnyMap> split_core_compile_config(const ov::AnyMap& properties);
 std::pair<ov::AnyMap, SchedulerConfig> split_scheduler_config(const ov::AnyMap& properties);
 
-std::shared_ptr<ov::Model> read_model_with_config(const std::filesystem::path& models_path, const ov::AnyMap& properties);
-
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
 
 void slice_matmul_stateful_model(std::shared_ptr<ov::Model> model);
diff --git a/src/cpp/src/visual_language/embedding_model.cpp b/src/cpp/src/visual_language/embedding_model.cpp
index 307bdcebac..a2a9750c33 100644
--- a/src/cpp/src/visual_language/embedding_model.cpp
+++ b/src/cpp/src/visual_language/embedding_model.cpp
@@ -21,7 +21,7 @@ EmbeddingsModel::EmbeddingsModel(const std::filesystem::path& model_dir,
                                  const std::string& device,
                                  const ov::AnyMap& properties) {
     ov::Core core = utils::singleton_core();
-    std::shared_ptr<ov::Model> m_model = core.read_model((model_dir / "openvino_text_embeddings_model.xml").string());
+    std::shared_ptr<ov::Model> m_model = core.read_model(model_dir / "openvino_text_embeddings_model.xml", {}, properties);
     // apply embedding postprocessing step by merging them into the model
     merge_postprocess(m_model, scale_emb);
 
diff --git a/src/cpp/src/visual_language/processor_config.cpp b/src/cpp/src/visual_language/processor_config.cpp
index 7b953e5bed..fc524fce9c 100644
--- a/src/cpp/src/visual_language/processor_config.cpp
+++ b/src/cpp/src/visual_language/processor_config.cpp
@@ -8,7 +8,7 @@
 
 ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_path) {
     std::ifstream stream(json_path);
-    OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config");
+    OPENVINO_ASSERT(stream.is_open(), "Failed to open '", json_path, "' with processor config");
     nlohmann::json parsed = nlohmann::json::parse(stream);
     using ov::genai::utils::read_json_param;
     read_json_param(parsed, "patch_size", patch_size); // For llava - stored in config.json vision_config
diff --git a/src/cpp/src/visual_language/vlm_config.cpp b/src/cpp/src/visual_language/vlm_config.cpp
index c4022ab80e..c711998128 100644
--- a/src/cpp/src/visual_language/vlm_config.cpp
+++ b/src/cpp/src/visual_language/vlm_config.cpp
@@ -8,7 +8,7 @@
 
 ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) {
     std::ifstream stream(json_path);
-    OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config");
+    OPENVINO_ASSERT(stream.is_open(), "Failed to open '", json_path, "' with processor config");
     nlohmann::json parsed = nlohmann::json::parse(stream);
     using ov::genai::utils::read_json_param;
     model_type = to_vlm_model_type(parsed.at("model_type"));
diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp
index f0fb34cdf6..70dbc48507 100644
--- a/src/cpp/src/whisper_pipeline.cpp
+++ b/src/cpp/src/whisper_pipeline.cpp
@@ -54,19 +54,16 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi
                                 const ov::AnyMap& properties)
         : WhisperPipelineImplBase{models_path} {
         ov::Core core = utils::singleton_core();
-        auto [core_properties, compile_properties] = ov::genai::utils::split_core_compile_config(properties);
-        core.set_property(core_properties);
 
-        ov::CompiledModel compiled_model;
-        compiled_model =
-            core.compile_model((models_path / "openvino_encoder_model.xml").string(), device, compile_properties);
+        ov::CompiledModel compiled_model = core.compile_model(models_path / "openvino_encoder_model.xml", device, properties);
         ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper encoder model");
         m_models.encoder = compiled_model.create_infer_request();
-        compiled_model =
-            core.compile_model((models_path / "openvino_decoder_model.xml").string(), device, compile_properties);
+
+        compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties);
         ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder model");
         m_models.decoder = compiled_model.create_infer_request();
-        compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, compile_properties);
+
+        compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties);
         m_models.decoder_with_past = compiled_model.create_infer_request();
         ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model");
 
@@ -81,6 +78,10 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi
                                    ChunkStreamerVariant streamer) override {
         auto start_time = std::chrono::steady_clock::now();
         WhisperGenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
+
+        // If eos_token_id was not provided, take value from default m_generation_config
+        if (config.eos_token_id == -1)
+            config.set_eos_token_id(m_generation_config.eos_token_id);
         config.validate();
 
         std::shared_ptr<ChunkStreamerBase> streamer_ptr;
diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp
index cc61eb0659..01fe882187 100644
--- a/src/cpp/src/whisper_pipeline_static.cpp
+++ b/src/cpp/src/whisper_pipeline_static.cpp
@@ -546,9 +546,9 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys
     : WhisperPipelineImplBase{models_path} {
     ov::Core core = utils::singleton_core();
 
-    auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml");
-    auto decoder_model = core.read_model(models_path / "openvino_decoder_model.xml");
-    auto decoder_with_past_model = core.read_model(models_path / "openvino_decoder_with_past_model.xml");
+    auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties);
+    auto decoder_model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties);
+    auto decoder_with_past_model = core.read_model(models_path / "openvino_decoder_with_past_model.xml", {}, properties);
 
     add_attention_mask_input_for_decoder(decoder_model);
     add_attention_mask_input(decoder_with_past_model);
diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp
index 5fdf6adce1..5c042d83d9 100644
--- a/src/python/py_utils.cpp
+++ b/src/python/py_utils.cpp
@@ -6,6 +6,7 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/stl_bind.h>
+#include <pybind11/stl/filesystem.h>
 #include <pybind11/functional.h>
 
 #include <openvino/runtime/auto/properties.hpp>
@@ -295,7 +296,7 @@ ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) {
 } // namespace
 
 ov::AnyMap properties_to_any_map(const std::map<std::string, py::object>& properties) {
-    std::map<std::string, ov::Any> properties_to_cpp;
+    ov::AnyMap properties_to_cpp;
     for (const auto& property : properties) {
         properties_to_cpp[property.first] = py_object_to_any(property.second, property.first);
     }
@@ -324,13 +325,13 @@ ov::AnyMap kwargs_to_any_map(const py::kwargs& kwargs) {
     return params;
 }
 
-std::string ov_tokenizers_module_path() {
+std::filesystem::path ov_tokenizers_module_path() {
     // Try a path relative to build artifacts folder first.
     std::filesystem::path from_relative = tokenizers_relative_to_genai();
     if (std::filesystem::exists(from_relative)) {
-        return from_relative.string();
+        return from_relative;
     }
-    return py::str(py::module_::import("openvino_tokenizers").attr("_ext_path"));
+    return py::module_::import("openvino_tokenizers").attr("_ext_path").cast<std::filesystem::path>();
 }
 
 ov::genai::StreamerVariant pystreamer_to_streamer(const PyBindStreamerVariant& py_streamer) {
diff --git a/src/python/py_utils.hpp b/src/python/py_utils.hpp
index feb9920dbe..9d78ab0930 100644
--- a/src/python/py_utils.hpp
+++ b/src/python/py_utils.hpp
@@ -1,6 +1,8 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
+#define PYBIND11_DETAILED_ERROR_MESSAGES
+
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/stl_bind.h>
@@ -32,7 +34,7 @@ ov::AnyMap properties_to_any_map(const std::map<std::string, py::object>& proper
 
 ov::AnyMap kwargs_to_any_map(const py::kwargs& kwargs);
 
-std::string ov_tokenizers_module_path();
+std::filesystem::path ov_tokenizers_module_path();
 
 ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::OptionalGenerationConfig& config, const py::kwargs& kwargs);
 
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index c2c7d634f5..c851c71ee5 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -4,6 +4,8 @@ optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest
+pytest-html
+hf_transfer
 
 # requirements for specific models
 # - hf-tiny-model-private/tiny-random-RoFormerForCausalLM
@@ -32,4 +34,4 @@ sacremoses
 # - openai/whisper-base
 librosa
 soundfile
-datasets
\ No newline at end of file
+datasets

From 34dc4692c5f8a59eab278483a66fe639a4b0ecbc Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 31 Dec 2024 15:53:30 +0400
Subject: [PATCH 074/110] Fixed typo (#1458)

---
 src/cpp/src/continuous_batching_impl.cpp | 2 +-
 src/cpp/src/lm_encoding.cpp              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 15c0e69d58..7b076504d0 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -195,7 +195,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
     step_count++;
 #endif
 
-    // process generation_config.echo parameetr
+    // process generation_config.echo parameter
     _fill_prompt_log_probs(m_requests, logits);
 
     SamplerOutput sampler_output;
diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index 083c591927..9ef876d8aa 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -119,7 +119,7 @@ std::pair<EncodedResults, std::optional<int64_t>> get_lm_encoded_results(
 
     auto logits = m_llm.get_tensor("logits");
 
-    // since we have applied `Slice` operationto last MatMul, model output sequence lenght is 1
+    // since we have applied `Slice` operation to last MatMul, model output sequence lenght is 1
     // so, we need to update sequence groups to think that they already have processed all prompt tokens except last ones
     // and schedule only `output_sequence_len` ones
     int64_t output_sequence_len = logits.get_shape().at(1);

From 482fa791f926143fec05ae564363d4cd0cf93508 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 1 Jan 2025 14:12:56 +0400
Subject: [PATCH 075/110] Updated real_models list (#1459)

---
 tests/python_tests/models/real_models | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tests/python_tests/models/real_models b/tests/python_tests/models/real_models
index 98fa18bd5e..420f8f53b6 100644
--- a/tests/python_tests/models/real_models
+++ b/tests/python_tests/models/real_models
@@ -11,7 +11,7 @@ EleutherAI/gpt-neo-2.7B
 EleutherAI/gpt-neox-20b
 EleutherAI/pythia-160m
 GAIR/Abel-7B-002
-# OrionStarAI/Orion-14B-Base: pip install flash_attn (https://github.com/huggingface/transformers/pull/30954)
+OrionStarAI/Orion-14B-Base
 PygmalionAI/pygmalion-6b
 Qwen/Qwen-7B
 Qwen/Qwen-7B-Chat
@@ -21,6 +21,8 @@ Qwen/Qwen1.5-7B
 Qwen/Qwen1.5-7B-Chat
 Qwen/Qwen1.5-MoE-A2.7B
 Qwen/Qwen1.5-MoE-A2.7B-Chat
+Qwen/Qwen2-7B
+Qwen/Qwen2-7B-Instruct
 Salesforce/codegen-350M-multi
 Salesforce/codegen-350M-nl
 Salesforce/codegen2-1b
@@ -48,15 +50,16 @@ bigscience/bloomz-1b7
 bigscience/bloomz-560m
 bigscience/bloomz-7b1
 cerebras/Cerebras-GPT-13B
-# core42/jais-13b: wrong output with PA
-# core42/jais-13b-chat: wrong output with PA
+core42/jais-13b
+core42/jais-13b-chat
 databricks/dolly-v1-6b
 databricks/dolly-v2-3b
 # deepseek-ai/deepseek-coder-33b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file
 # deepseek-ai/deepseek-coder-6.7b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file
-# deepseek-ai/deepseek-moe-16b-base: optimum -  Trying to export a deepseek model, that is a custom or unsupported architecture
-# facebook/blenderbot-3B: optimum - IndexError: tuple index out of range
-# facebook/incoder-1B: CB - Failed to detect "eos_token_id" in openvino_tokenizer.xml runtime information
+deepseek-ai/deepseek-moe-16b-base
+deepseek-ai/DeepSeek-V3-Base
+facebook/blenderbot-3B
+facebook/incoder-1B
 facebook/opt-1.3b
 facebook/opt-125m
 facebook/opt-2.7b
@@ -66,6 +69,7 @@ google/gemma-1.1-7b-it
 google/gemma-2b
 google/gemma-2b-it
 google/gemma-7b
+google/gemma-2-9b
 google/pegasus-big_patent
 google/pegasus-large
 gpt2
@@ -86,6 +90,10 @@ microsoft/DialoGPT-medium
 microsoft/Orca-2-7b
 microsoft/Phi-3-mini-128k-instruct
 microsoft/Phi-3-mini-4k-instruct
+microsoft/Phi-3-medium-128k-instruct
+microsoft/Phi-3-small-8k-instruct
+microsoft/Phi-3-small-128k-instruct
+microsoft/Phi-3.5-MoE-instruct
 # microsoft/biogpt: OpenVINO Tokenizers - openvino.runtime.exceptions.OVTypeError: Tokenizer type is not supported: <class 'transformers.models.biogpt.tokenization_biogpt.BioGptTokenizer'>
 microsoft/phi-1_5
 microsoft/phi-2
@@ -106,10 +114,10 @@ openbmb/MiniCPM-2B-dpo-bf16
 openbmb/MiniCPM-2B-sft-bf16
 openchat/openchat_3.5
 openlm-research/open_llama_13b
-# openlm-research/open_llama_3b: CPU - head size must be multiple of 16, current: 100
-# openlm-research/open_llama_3b_v2: CPU - head size must be multiple of 16, current: 100
+openlm-research/open_llama_3b
+openlm-research/open_llama_3b_v2
 # replit/replit-code-v1-3b: OpenVINO Tokenizers - AttributeError: 'ReplitLMTokenizer' object has no attribute 'sp_model'
-# rinna/bilingual-gpt-neox-4b: OpenVINO Tokenizers - trash output (https://jira.devtools.intel.com/browse/CVS-142063)
+rinna/bilingual-gpt-neox-4b
 rinna/youri-7b-chat
 stabilityai/stable-code-3b
 stabilityai/stable-zephyr-3b
@@ -120,3 +128,4 @@ tiiuae/falcon-rw-7b
 togethercomputer/RedPajama-INCITE-Chat-3B-v1
 # xverse/XVERSE-7B-Chat: Transformers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3
 # xverse/XVERSE-MoE-A4.2B: Transformers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3
+Deci/DeciLM-7B
\ No newline at end of file

From 2ab8fa8578e120e5978294fa8a69efac72874a87 Mon Sep 17 00:00:00 2001
From: Gorokhov Dmitriy <dmitry.gorokhov@intel.com>
Date: Fri, 3 Jan 2025 11:57:57 +0400
Subject: [PATCH 076/110] Removed usage of deprecated ov::affinity property
 (#1467)

OpenVINO PR: https://github.com/openvinotoolkit/openvino/pull/28247
---
 src/python/py_utils.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp
index 5c042d83d9..1fc34a36d2 100644
--- a/src/python/py_utils.cpp
+++ b/src/python/py_utils.cpp
@@ -262,8 +262,6 @@ ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) {
         return py::cast<ov::device::Type>(py_obj);
     } else if (py::isinstance<ov::streams::Num>(py_obj)) {
         return py::cast<ov::streams::Num>(py_obj);
-    } else if (py::isinstance<ov::Affinity>(py_obj)) {
-        return py::cast<ov::Affinity>(py_obj);
     } else if (py::isinstance<ov::Tensor>(py_obj)) {
         return py::cast<ov::Tensor>(py_obj);
     } else if (py::isinstance<ov::Output<ov::Node>>(py_obj)) {

From 42f3053afdaa61d36958324aa834c3e2c951eedd Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Fri, 3 Jan 2025 17:35:51 +0400
Subject: [PATCH 077/110] Don't throw when some generation parameters are
 ignored (#1473)

Relaxed some checks introduced in
https://github.com/openvinotoolkit/openvino.genai/pull/1448

Tickets:
- CVS-159996
- CVS-159998
- CVS-160041
- CVS-160009
- CVS-160035
---
 src/cpp/src/generation_config.cpp            | 14 +++++++-------
 tests/python_tests/test_generation_config.py | 18 +++++++++---------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 59be603fd9..25402e22e7 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -230,9 +230,9 @@ void GenerationConfig::validate() const {
         OPENVINO_ASSERT(temperature > 0, "When 'do_sample' is true, temperature must be a strictly positive float, but got ", temperature);
     } else {
         // parameters requiring multinomial
-        OPENVINO_ASSERT(top_k == std::numeric_limits<size_t>::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k);
-        OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p);
-        OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature);
+        // OPENVINO_ASSERT(top_k == std::numeric_limits<size_t>::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k);
+        // OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p);
+        // OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature);
     }
 
     if (is_beam_search()) {
@@ -252,10 +252,10 @@ void GenerationConfig::validate() const {
         }
     } else {
         // parameters requiring beam search
-        OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups);
-        OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits<size_t>::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size);
-        OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling");
-        OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling");
+        // OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups);
+        // OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits<size_t>::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size);
+        // OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling");
+        // OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling");
     }
 
     // assistant generation
diff --git a/tests/python_tests/test_generation_config.py b/tests/python_tests/test_generation_config.py
index 110caaf0e5..0a42685b05 100644
--- a/tests/python_tests/test_generation_config.py
+++ b/tests/python_tests/test_generation_config.py
@@ -23,6 +23,10 @@
     dict(max_new_tokens=1, do_sample=True, top_k=1),
     dict(max_new_tokens=1, do_sample=True, top_p=0.5),
     dict(max_new_tokens=1, do_sample=True, temperature=0.5),
+    # parameters requiring multimonial are ignored when do_sample=False
+    dict(max_new_tokens=1, top_k=1), # requires do_sample=True
+    dict(max_new_tokens=1, top_p=0.5), # requires do_sample=True
+    dict(max_new_tokens=1, temperature=2.0), # requires do_sample=True
     # beam search
     dict(max_new_tokens=1, num_beams=2),
     dict(max_new_tokens=1, num_beams=2, num_return_sequences=1),
@@ -30,6 +34,11 @@
     dict(max_new_tokens=1, num_beams=4, num_beam_groups=2, diversity_penalty=1.0),
     dict(max_new_tokens=1, num_beams=4, length_penalty=1.0),
     dict(max_new_tokens=1, num_beams=4, no_repeat_ngram_size=2),
+    # parameters requiring beam search are ignored when num_beams == 1
+    dict(max_new_tokens=1, num_beam_groups=2), # requiring beam search
+    dict(max_new_tokens=1, no_repeat_ngram_size=2), # requiring beam search
+    dict(max_new_tokens=1, diversity_penalty=1.0), # requiring beam search
+    dict(max_new_tokens=1, length_penalty=2), # requiring beam search
     # assistant generation
     dict(max_new_tokens=1, assistant_confidence_threshold=0.5),
     dict(max_new_tokens=1, num_assistant_tokens=2),
@@ -66,10 +75,6 @@ def test_valid_configs(generation_config_kwargs):
     dict(max_new_tokens=1, do_sample=True, top_p=1.1), # 'top_p' must be within (0, 1] when 'do_sample' is True
     dict(max_new_tokens=1, do_sample=True, top_p=0), # 'top_p' must be within (0, 1] when 'do_sample' is True
     dict(max_new_tokens=1, do_sample=True, temperature=-1.0), # invalid temp
-    # parameters requiring multimonial
-    dict(max_new_tokens=1, top_k=1), # requires do_sample=True
-    dict(max_new_tokens=1, top_p=0.5), # requires do_sample=True
-    dict(max_new_tokens=1, temperature=2.0), # requires do_sample=True
     # beam search
     dict(max_new_tokens=1, num_beams=2, num_return_sequences=3), # 'num_beams' must be >= 'num_return_sequences'
     dict(max_new_tokens=1, num_beams=3, num_beam_groups=2), # 'num_beams' must be divisible by 'num_beam_groups'
@@ -80,11 +85,6 @@ def test_valid_configs(generation_config_kwargs):
     dict(max_new_tokens=1, num_beams=2, frequency_penalty=1.0), # 'frequency_penalty' is not supported by beam search
     dict(max_new_tokens=1, num_beams=2, presence_penalty=1.0), # 'presence_penalty' is not supported by beam search
     dict(max_new_tokens=1, num_beams=2, repetition_penalty=0.0), # 'repetition_penalty' is not supported by beam search
-    # parameters requiring beam search
-    dict(max_new_tokens=1, num_beam_groups=2), # requiring beam search
-    dict(max_new_tokens=1, no_repeat_ngram_size=2), # requiring beam search
-    dict(max_new_tokens=1, diversity_penalty=1.0), # requiring beam search
-    dict(max_new_tokens=1, length_penalty=2), # requiring beam search
     # assistant generation
     dict(max_new_tokens=1, num_assistant_tokens=2, do_sample=True, num_return_sequences=2), # 'num_return_sequences' must be 1, as we cannot use different number of tokens per sequence within a group
     dict(max_new_tokens=1, assistant_confidence_threshold=1.0, do_sample=True, num_return_sequences=2), # 'num_return_sequences' must be 1, as we cannot use different number of tokens per sequence within a group

From 1fd1430af56ad6eb630917018267077464b0d76a Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Sat, 4 Jan 2025 03:12:03 +0400
Subject: [PATCH 078/110] Move tests on sampling to test_sampling.py (#1465)

- Move extensive tests on decoding / sampling from test_llm_pipeline.py
tests to test_sampling.py
- Partially refactored common functions in common.py to be more generic
(to be continued in next PRs)
- Dropped partially predefined functions with generation configs and
replaced them in tests with dict of generation parameters, so you can
better see tests params closer to tests itself and avoid creating
numerous get_** for new generation values combinations.
- Sampling tests are now implemented on top of stateful model for better
comparison with optimum-intel
---
 .github/workflows/mac.yml                     |   4 +-
 .../openvino_genai/py_openvino_genai.pyi      |   1 +
 .../py_continuous_batching_pipeline.cpp       |  15 +-
 tests/python_tests/common.py                  | 346 +++++++++---------
 tests/python_tests/ov_genai_test_utils.py     |  49 ++-
 .../python_tests/test_continuous_batching.py  |  38 +-
 tests/python_tests/test_kv_cache_eviction.py  |   6 +-
 tests/python_tests/test_llm_pipeline.py       | 327 +++--------------
 .../python_tests/test_llm_pipeline_static.py  |  40 +-
 tests/python_tests/test_sampling.py           | 224 +++++-------
 tests/python_tests/test_vlm_pipeline.py       |  17 +-
 .../tests/test_cli_image.py                   |   9 +-
 12 files changed, 410 insertions(+), 666 deletions(-)

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index fb66271ff7..5402b79e70 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -1,4 +1,4 @@
-name: macOS (12, Python 3.9)
+name: macOS (12, Python 3.10)
 on:
   workflow_dispatch:
   pull_request:
@@ -16,7 +16,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  PYTHON_VERSION: '3.9'
+  PYTHON_VERSION: '3.10'
   OV_BRANCH: master
   OV_TARBALL: ''
 
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 5d82fa89a3..9ff28859b9 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -697,6 +697,7 @@ class GenerationResult:
     """
     m_generation_ids: list[str]
     m_scores: list[float]
+    m_status: GenerationStatus
     def __init__(self) -> None:
         ...
     def __repr__(self) -> str:
diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp
index 2b48e4d44d..48eb124255 100644
--- a/src/python/py_continuous_batching_pipeline.cpp
+++ b/src/python/py_continuous_batching_pipeline.cpp
@@ -119,6 +119,13 @@ std::ostream& operator << (std::ostream& stream, const GenerationResult& generat
 } // namespace
 
 void init_continuous_batching_pipeline(py::module_& m) {
+    py::enum_<ov::genai::GenerationStatus>(m, "GenerationStatus")
+        .value("RUNNING", ov::genai::GenerationStatus::RUNNING)
+        .value("FINISHED", ov::genai::GenerationStatus::FINISHED)
+        .value("IGNORED", ov::genai::GenerationStatus::IGNORED)
+        .value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE)
+        .value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE);
+
     py::class_<GenerationResult>(m, "GenerationResult", generation_result_docstring)
         .def(py::init<>())
         .def_readonly("m_request_id", &GenerationResult::m_request_id)
@@ -130,6 +137,7 @@ void init_continuous_batching_pipeline(py::module_& m) {
                 r.m_generation_ids = generation_ids;
             })
         .def_readwrite("m_scores", &GenerationResult::m_scores)
+        .def_readwrite("m_status", &GenerationResult::m_status)
         .def("__repr__",
             [](const GenerationResult &r) -> py::str {
                 std::stringstream stream;
@@ -148,13 +156,6 @@ void init_continuous_batching_pipeline(py::module_& m) {
         .def_readwrite("m_generation_ids", &EncodedGenerationResult::m_generation_ids)
         .def_readwrite("m_scores", &EncodedGenerationResult::m_scores);
 
-    py::enum_<ov::genai::GenerationStatus>(m, "GenerationStatus")
-        .value("RUNNING", ov::genai::GenerationStatus::RUNNING)
-        .value("FINISHED", ov::genai::GenerationStatus::FINISHED)
-        .value("IGNORED", ov::genai::GenerationStatus::IGNORED)
-        .value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE)
-        .value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE);
-
     py::enum_<ov::genai::GenerationFinishReason>(m, "GenerationFinishReason")
         .value("NONE", ov::genai::GenerationFinishReason::NONE)
         .value("STOP", ov::genai::GenerationFinishReason::STOP)
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index 9040fa435f..dc58d1ad2f 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -7,7 +7,7 @@
 
 from optimum.intel import OVModelForCausalLM
 from pathlib import Path
-from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationResult, GenerationConfig
+from openvino_genai import ContinuousBatchingPipeline, LLMPipeline, SchedulerConfig, GenerationResult, GenerationConfig, DecodedResults, StopCriteria
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers import GenerationConfig as HFGenerationConfig
 from typing import List, Tuple
@@ -20,20 +20,6 @@ def get_greedy() -> GenerationConfig:
     generation_config.max_new_tokens = 30
     return generation_config
 
-def get_greedy_with_min_and_max_tokens() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.min_new_tokens = 15
-    generation_config.max_new_tokens = 30
-    return generation_config
-
-def get_greedy_with_repetition_penalty() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.repetition_penalty = 2.0
-    generation_config.max_new_tokens = 30
-    return generation_config
-
 def get_greedy_with_penalties() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_return_sequences = 1
@@ -42,33 +28,6 @@ def get_greedy_with_penalties() -> GenerationConfig:
     generation_config.max_new_tokens = 30
     return generation_config
 
-def get_greedy_with_single_stop_string() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.min_new_tokens = 15
-    generation_config.max_new_tokens = 50
-    generation_config.stop_strings = {"anag"} # expected match on "manage"
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_greedy_with_multiple_stop_strings() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.min_new_tokens = 1
-    generation_config.max_new_tokens = 50
-    generation_config.stop_strings = {".", "software", "Intel"}
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_greedy_with_multiple_stop_strings_no_match() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.min_new_tokens = 1
-    generation_config.max_new_tokens = 50
-    generation_config.stop_strings = {"Einstein", "sunny", "geothermal"}
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
 def get_beam_search() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_beam_groups = 3
@@ -79,78 +38,6 @@ def get_beam_search() -> GenerationConfig:
     generation_config.num_return_sequences = generation_config.num_beams
     return generation_config
 
-def get_beam_search_min_and_max_tokens() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_beam_groups = 3
-    generation_config.num_beams = 6
-    generation_config.diversity_penalty = 1
-    generation_config.min_new_tokens = 15
-    generation_config.max_new_tokens = 30
-    generation_config.num_return_sequences = 3
-    generation_config.num_return_sequences = generation_config.num_beams
-    return generation_config
-
-def get_beam_search_with_single_stop_string() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_beam_groups = 3
-    generation_config.num_beams = 6
-    generation_config.diversity_penalty = 1
-    generation_config.max_new_tokens = 50
-    generation_config.num_return_sequences = generation_config.num_beams
-    generation_config.stop_strings = {"open sour"}  # expected match on "open source"
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_beam_search_with_multiple_stop_strings() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_beam_groups = 3
-    generation_config.num_beams = 6
-    generation_config.diversity_penalty = 1
-    generation_config.max_new_tokens = 50
-    generation_config.num_return_sequences = generation_config.num_beams
-    generation_config.stop_strings = {".", "software", "Intel"}
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_beam_groups = 3
-    generation_config.num_beams = 6
-    generation_config.diversity_penalty = 1
-    generation_config.max_new_tokens = 30
-    generation_config.num_return_sequences = generation_config.num_beams
-    generation_config.stop_strings = {"Einstein", "sunny", "geothermal"}
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_greedy_stop_strings_exclude_from_output() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.max_new_tokens = 30
-    generation_config.stop_strings = { "machines" }
-    generation_config.include_stop_str_in_output = False
-    return generation_config
-
-def get_greedy_stop_strings_include_to_output() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.max_new_tokens = 30
-    generation_config.stop_strings = { "machines" }
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
-def get_greedy_n_stop_strings_exclude_from_output() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.max_new_tokens = 30
-    generation_config.stop_strings = { "machines", "manage" }
-    generation_config.include_stop_str_in_output = False
-    return generation_config
-
-def get_greedy_n_stop_strings_include_to_output() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.max_new_tokens = 30
-    generation_config.stop_strings = { "machines", "manage" }
-    generation_config.include_stop_str_in_output = True
-    return generation_config
-
 def get_multinomial_temperature() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.do_sample = True
@@ -288,8 +175,10 @@ def convert_to_hf(
     default_generation_config : HFGenerationConfig,
     generation_config : GenerationConfig
 ) -> HFGenerationConfig:
-    kwargs = {}
+    if generation_config is None:
+        return
 
+    kwargs = {}
     # generic parameters
     kwargs['max_length'] = generation_config.max_length
     # has higher priority than 'max_length'
@@ -300,8 +189,16 @@ def convert_to_hf(
 
     # copy default parameters
     kwargs['bos_token_id'] = default_generation_config.bos_token_id
-    kwargs['eos_token_id'] = default_generation_config.eos_token_id
     kwargs['pad_token_id'] = default_generation_config.pad_token_id
+
+    if len(generation_config.stop_token_ids) > 0:
+        kwargs['eos_token_id'] = list(generation_config.stop_token_ids)
+    elif generation_config.eos_token_id != -1:
+        kwargs['eos_token_id'] = generation_config.eos_token_id
+    else:
+        kwargs['eos_token_id'] = default_generation_config.eos_token_id
+
+    # copy penalties
     kwargs['repetition_penalty'] = generation_config.repetition_penalty
 
     if generation_config.is_beam_search():
@@ -312,8 +209,20 @@ def convert_to_hf(
         kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size
         kwargs['num_return_sequences'] = generation_config.num_return_sequences
         kwargs['output_scores'] = True
+
         if generation_config.num_beam_groups > 1:
             kwargs['diversity_penalty'] = generation_config.diversity_penalty
+
+        # in OpenVINO GenAI this parameter is called stop_criteria,
+        # while in HF it's called early_stopping.
+        # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER"
+        STOP_CRITERIA_MAP = {
+            StopCriteria.NEVER: "never",
+            StopCriteria.EARLY: True,
+            StopCriteria.HEURISTIC: False
+        }
+
+        kwargs['early_stopping'] = STOP_CRITERIA_MAP[generation_config.stop_criteria]
     elif generation_config.is_multinomial():
         # mulitinomial
         kwargs['temperature'] = generation_config.temperature
@@ -332,23 +241,55 @@ def run_hugging_face(
     opt_model,
     hf_tokenizer,
     prompts: List[str],
-    generation_configs: List[GenerationConfig],
+    generation_configs: List[GenerationConfig] | GenerationConfig,
 ) -> List[GenerationResult]:
     generation_results = []
-    for prompt, generation_config in zip(prompts, generation_configs):
-        inputs = hf_tokenizer(prompt, return_tensors="pt")
-        prompt_len = inputs['input_ids'].numel()
-        generate_outputs = opt_model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'],
-                                              generation_config=convert_to_hf(opt_model.generation_config, generation_config),
-                                              return_dict_in_generate=True, tokenizer=hf_tokenizer)
-        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
 
-        generation_result = GenerationResult()
-        generation_result.m_generation_ids = all_text_batch
-        # sequences_scores are available only for beam search case
-        if generation_config.is_beam_search():
-            generation_result.m_scores = [score for score in generate_outputs.sequences_scores]
-        generation_results.append(generation_result)
+    if type(generation_configs) is list:
+        # process prompt by promp as we have multiple generation configs
+        for prompt, generation_config in zip(prompts, generation_configs):
+            hf_generation_config = convert_to_hf(opt_model.generation_config, generation_config)
+            inputs = hf_tokenizer(prompt, return_tensors="pt")
+            input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
+            prompt_len = 0 if generation_config.echo else input_ids.numel()
+
+            generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config,
+                                                  return_dict_in_generate=True, tokenizer=hf_tokenizer)
+            all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
+
+            generation_result = GenerationResult()
+            generation_result.m_generation_ids = all_text_batch
+            # sequences_scores are available only for beam search case
+            if generation_config.is_beam_search():
+                generation_result.m_scores = [score for score in generate_outputs.sequences_scores]
+            generation_results.append(generation_result)
+    else:
+        # process all prompts as a single batch as we have a single generation config for all prompts
+        inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left')
+        input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
+        hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs)
+        hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config,
+                                                return_dict_in_generate=True, tokenizer=hf_tokenizer)
+
+        generation_ids = []
+        scores = []
+
+        for idx, hf_encoded_out in enumerate(hf_encoded_outputs.sequences):
+            prompt_idx = idx // hf_generation_config.num_return_sequences
+            prompt_len = 0 if generation_configs.echo else input_ids[prompt_idx].numel()
+            decoded_text = hf_tokenizer.decode(hf_encoded_out[prompt_len:], skip_special_tokens=True)
+            generation_ids.append(decoded_text)
+            if generation_configs.is_beam_search():
+                scores.append(hf_encoded_outputs.sequences_scores[idx])
+
+            # if we need to move to next generation result
+            if (idx + 1) // hf_generation_config.num_return_sequences != prompt_idx:
+                generation_result = GenerationResult()
+                generation_result.m_generation_ids = generation_ids
+                generation_result.m_scores = scores
+                generation_results.append(generation_result)
+                generation_ids = []
+                scores = []
 
     del hf_tokenizer
     del opt_model
@@ -360,16 +301,65 @@ def run_continuous_batching(
     models_path : Path,
     scheduler_config : SchedulerConfig,
     prompts: List[str],
-    generation_configs : List[GenerationConfig]
+    generation_configs : List[GenerationConfig] | GenerationConfig 
 ) -> List[GenerationResult]:
-    pipe = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
-    output = pipe.generate(prompts, generation_configs)
-    del pipe
+    if type(generation_configs) is not list:
+        generation_configs = [generation_configs] * len(prompts)
+ 
+    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU')
+    output = cb_pipe.generate(prompts, generation_configs)
+
+    del cb_pipe
     shutil.rmtree(models_path)
+
     return output
 
 
-def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig):
+def get_default_properties():
+    import openvino.properties.hint as hints
+    import openvino as ov
+
+    return {
+        hints.inference_precision : ov.Type.f32,
+        hints.kv_cache_precision : ov.Type.f16,
+    }
+
+
+def run_llm_pipeline(
+    models_path : Path,
+    prompts: List[str],
+    generation_config : GenerationConfig,
+    use_cb : bool = False
+) -> List[GenerationResult]:
+    properties = get_default_properties()
+    if use_cb:
+        properties['scheduler_config'] = SchedulerConfig()
+
+    ov_pipe = LLMPipeline(models_path, device='CPU', **properties)
+
+    generate_outputs : DecodedResults = ov_pipe.generate(inputs=prompts, generation_config=generation_config)
+
+    index = 0
+    generation_results = []
+
+    for _ in prompts:
+        generation_result = GenerationResult()
+
+        generation_result.m_generation_ids = generate_outputs.texts[index : index + generation_config.num_return_sequences]
+        # sequences_scores are available only for beam search case
+        if generation_config.is_beam_search():
+            generation_result.m_scores = generate_outputs.scores[index : index + generation_config.num_return_sequences]
+        generation_results.append(generation_result)
+
+        index += generation_config.num_return_sequences
+
+    del ov_pipe
+    shutil.rmtree(models_path)
+
+    return generation_results
+
+
+def compare_generation_result(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig):
     if generation_config.is_beam_search():
         assert len(hf_result.m_scores) == len(ov_result.m_scores)
         for hf_score, ov_score in zip(hf_result.m_scores, ov_result.m_scores):
@@ -386,46 +376,79 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge
             assert hf_text == ov_text
 
 
-def get_hugging_face_model_and_tokenizer(model_id: str, use_optimum = True):
+def compare_generation_results(prompts: List[str], hf_results: List[GenerationResult], ov_results: List[GenerationResult], generation_configs: List[GenerationConfig] | GenerationConfig):
+    if type(generation_configs) is not list:
+        generation_configs = [generation_configs]
+
+    assert len(prompts) == len(hf_results)
+    assert len(prompts) == len(ov_results)
+
+    for prompt, ref_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs):
+        print(f"Prompt = {prompt}\nReference result = {ref_result}\nOpenVINO result = {ov_result.m_generation_ids}")
+        compare_generation_result(ref_result, ov_result, generation_config)
+
+
+def get_hugging_face_models(model_id: str):
     hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \
-                AutoModelForCausalLM.from_pretrained(model_id)
+    opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, ov_config=get_default_properties())
     return opt_model, hf_tokenizer
 
 
-def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path):
-    model.save_pretrained(models_path)
+def convert_models(opt_model : OVModelForCausalLM, hf_tokenizer : AutoTokenizer, models_path: Path):
+    opt_model.save_pretrained(models_path)
+
+    # to store tokenizer config jsons with special tokens
+    hf_tokenizer.save_pretrained(models_path)
+
+    # save generation config
+    opt_model.generation_config.save_pretrained(models_path)
+
     # convert tokenizers as well
     from openvino_tokenizers import convert_tokenizer
     from openvino import serialize
-    tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, skip_special_tokens=True)
+
+    tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
     serialize(tokenizer, models_path / "openvino_tokenizer.xml")
     serialize(detokenizer, models_path / "openvino_detokenizer.xml")
 
 
-def _generate_and_compare_with_reference_results(models_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig):
-    ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs)
+def run_llm_pipeline_with_ref(model_id: str, prompts: List[str], generation_config: GenerationConfig | dict, tmp_path: Path, use_cb : bool = False):
+    models_path : Path = tmp_path / model_id
+    opt_model, hf_tokenizer = get_hugging_face_models(model_id)
 
-    assert len(prompts) == len(reference_results)
-    assert len(prompts) == len(ov_results)
+    if type(generation_config) is dict:
+        generation_config = GenerationConfig(**generation_config)
+
+    convert_models(opt_model, hf_tokenizer, models_path)
 
-    for prompt, ref_result, ov_result, generation_config in zip(prompts, reference_results, ov_results, generation_configs):
-        print(f"Prompt = {prompt}\nref result = {ref_result}\nOV result = {ov_result.m_generation_ids}")
-        compare_results(ref_result, ov_result, generation_config)
+    ov_results = run_llm_pipeline(models_path, prompts, generation_config, use_cb)
+    hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_config)
 
+    compare_generation_results(prompts, hf_results, ov_results, generation_config)
+
+
+def run_cb_pipeline_with_ref(tmp_path: str, model_id: str, scheduler_params: dict = {}, generation_config : GenerationConfig | dict = None):
+    prompts, generation_configs = get_test_dataset()
+    scheduler_config = get_scheduler_config(scheduler_params)
+
+    # override dataset's generation config
+    if generation_config is not None:
+        if type(generation_config) is dict:
+            generation_config = GenerationConfig(**generation_config)
+        generation_configs = [generation_config] * len(prompts)
 
-def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path):
-    use_optimum = True
     models_path : Path = tmp_path / model_id
-    opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum)
+    opt_model, hf_tokenizer = get_hugging_face_models(model_id)
 
-    if use_optimum:
-        save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path)
+    convert_models(opt_model, hf_tokenizer, models_path)
 
-    hf_results = run_hugging_face(opt_model=opt_model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs)
-    _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config)
+    hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_configs)
+    ov_results = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs)
 
+    compare_generation_results(prompts, hf_results, ov_results, generation_configs)
 
+
+# TODO: remove after Generator property is supported by LLMPipeline / VLMPipeline
 def generate_and_compare_with_reference_text(models_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig):
     ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs)
 
@@ -440,19 +463,6 @@ def generate_and_compare_with_reference_text(models_path: Path, prompts: List[st
             assert ref_text == ov_text
 
 
-def run_continuous_batching_pipeline_test(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None):
-    prompts, generation_configs = get_test_dataset()
-    scheduler_config = get_scheduler_config(scheduler_params)
-
-    if generation_config is not None:
-        generation_config.rng_seed = 0
-        generation_configs = [generation_config] * len(prompts)
-
-    generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path)
-
-
-DEFAULT_SCHEDULER_CONFIG = get_scheduler_config({"num_kv_blocks": 300, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256})
-
 def get_image_by_link(link):
     from PIL import Image
     import requests
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 9e8e4681f9..00c74f6628 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -13,6 +13,8 @@
 import shutil
 import json
 
+import openvino_genai as ov_genai
+
 
 def get_models_list():
     precommit_models = [
@@ -52,6 +54,7 @@ def get_models_list():
 
     if pytest.selected_model_ids:
         model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
+
     # pytest.set_trace()
     prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
     return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
@@ -81,66 +84,57 @@ def get_chat_models_list():
 
 @functools.lru_cache(1)
 def read_model(params, **tokenizer_kwargs):
-    model_id, path = params
+    model_id, models_path = params
 
     from optimum.intel.openvino import OVModelForCausalLM
     from transformers import AutoTokenizer
     hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 
-    if (path / "openvino_model.xml").exists():
-        opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True,
+    if (models_path / "openvino_model.xml").exists():
+        opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True,
                                                        compile=False, device='CPU')
     else:
         ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer,
                                                                              with_detokenizer=True,
                                                                              **tokenizer_kwargs)
-        openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml")
-        openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml")
+        openvino.save_model(ov_tokenizer, models_path / "openvino_tokenizer.xml")
+        openvino.save_model(ov_detokenizer, models_path / "openvino_detokenizer.xml")
 
         # to store tokenizer config jsons with special tokens
-        hf_tokenizer.save_pretrained(path)
+        hf_tokenizer.save_pretrained(models_path)
 
         opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True,
                                                        compile=False, device='CPU', load_in_8bit=False)
-        opt_model.generation_config.save_pretrained(path)
-        opt_model.config.save_pretrained(path)
-        opt_model.save_pretrained(path)
+        opt_model.generation_config.save_pretrained(models_path)
+        opt_model.config.save_pretrained(models_path)
+        opt_model.save_pretrained(models_path)
 
     return (
         model_id,
-        path,
+        models_path,
         hf_tokenizer,
         opt_model,
-        ov_genai.LLMPipeline(path, 'CPU', ENABLE_MMAP=False),
+        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False),
     )
 
 
-# in OpenVINO GenAI this parameter is called stop_criteria,
-# while in HF it's called early_stopping.
-# HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER"
-STOP_CRITERIA_MAP = {
-    ov_genai.StopCriteria.NEVER: "never",
-    ov_genai.StopCriteria.EARLY: True,
-    ov_genai.StopCriteria.HEURISTIC: False
-}
-
-
 @pytest.fixture(scope="module")
 def model_tmp_path(tmpdir_factory):
-    model_id, path, _, _, _ = read_model(get_models_list()[0])
+    model_id, models_path, _, _, _ = read_model(get_models_list()[0])
     temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_'))
 
     # copy openvino converted model and tokenizers
     for pattern in ['*.xml', '*.bin']:
-        for src_file in path.glob(pattern):
+        for src_file in models_path.glob(pattern):
             if src_file.is_file():
                 shutil.copy(src_file, temp_path / src_file.name)
+
     yield model_id, Path(temp_path)
 
 
 @pytest.fixture(scope="module")
 def model_tokenizers_tmp_path(tmpdir_factory):
-    model_id, path, _, _, _ = read_model(get_models_list()[0])
+    model_id, models_path, _, _, _ = read_model(get_models_list()[0])
     temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_'))
 
     # If tokens were not found in IR, it fallback to reading from config.
@@ -148,10 +142,11 @@ def model_tokenizers_tmp_path(tmpdir_factory):
     # and set tokens in configs and to check if they are read and validated correctly.
     import openvino as ov
 
+    core = ov.Core()
+
     # copy openvino converted model and tokenizers
     for pattern in ['*.xml', '*.bin']:
-        for src_file in path.glob(pattern):
-            core = ov.Core()
+        for src_file in models_path.glob(pattern):
 
             # Update files if they are openvino_tokenizer.xml or openvino_detokenizer.xml
             if src_file.name in ['openvino_tokenizer.xml', 'openvino_detokenizer.xml']:
@@ -166,8 +161,10 @@ def model_tokenizers_tmp_path(tmpdir_factory):
 
             if src_file in ['openvino_tokenizer.bin', 'openvino_detokenizer.bin']:
                 continue
+
             if src_file.is_file():
                 shutil.copy(src_file, temp_path / src_file.name)
+
     yield model_id, Path(temp_path)
 
 
diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py
index 01762bf9e3..fabcf06b71 100644
--- a/tests/python_tests/test_continuous_batching.py
+++ b/tests/python_tests/test_continuous_batching.py
@@ -9,8 +9,8 @@
 from pathlib import Path
 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer
 
-from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \
-    get_scheduler_config, get_greedy, run_continuous_batching_pipeline_test, get_beam_search, get_greedy, \
+from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
+    get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
 from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts
@@ -39,19 +39,19 @@ def read_models_list(file_name: str):
 @pytest.mark.precommit
 @pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
 def test_e2e_precommit(tmp_path, model_id):
-    run_continuous_batching_pipeline_test(tmp_path, model_id)
+    run_cb_pipeline_with_ref(tmp_path, model_id)
 
 
 @pytest.mark.nightly
 @pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly")))
 def test_e2e_nightly(tmp_path, model_id):
-    run_continuous_batching_pipeline_test(tmp_path, model_id)
+    run_cb_pipeline_with_ref(tmp_path, model_id)
 
 
 @pytest.mark.real_models
 @pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
 def test_e2e_real_models(tmp_path, model_id):
-    run_continuous_batching_pipeline_test(tmp_path, model_id)
+    run_cb_pipeline_with_ref(tmp_path, model_id)
 
 #
 # Comparison with stateful
@@ -77,8 +77,8 @@ def test_continuous_batching_vs_stateful(prompt, generation_config):
         "facebook/opt-125m",
         Path("opt-125m")
     ))
-    cb = get_continuous_batching(path)
-    generated = cb.generate(prompt, **generation_config)
+    cb_pipe = get_continuous_batching(path)
+    generated = cb_pipe.generate(prompt, **generation_config)
     reference = stateful.generate(prompt, **generation_config)
     assert generated.texts == reference.texts
     if 1 != generation_config.get("num_return_sequences", 1):
@@ -117,8 +117,8 @@ def test_cb_streamer_vs_return_vs_stateful(prompt):
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
 def test_chat_scenario_vs_stateful(model_descr, generation_config_kwargs: Dict):
-    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-    cb_pipe = get_continuous_batching(path)
+    model_id, models_path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    cb_pipe = get_continuous_batching(models_path)
 
     ov_pipe.start_chat()
     cb_pipe.start_chat()
@@ -150,10 +150,10 @@ def test_post_oom_health(tmp_path, sampling_config):
     scheduler_config.num_kv_blocks = 10 # Low cache size to trigger OOM quickly
 
     model_id : str = "facebook/opt-125m"
-    opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
+    opt_model, hf_tokenizer = get_hugging_face_models(model_id)
 
     models_path : Path = tmp_path / model_id
-    save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path)
+    convert_models(opt_model, hf_tokenizer, models_path)
 
     cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")
 
@@ -201,7 +201,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
 @pytest.mark.parametrize("params", scheduler_params_list)
 @pytest.mark.precommit
 def test_preemption(tmp_path, params):
-    run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1])
+    run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1])
 
 
 multinomial_params = RandomSamplingTestStruct(
@@ -249,13 +249,12 @@ def test_preemption(tmp_path, params):
 def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse):
     generation_configs = multinomial_params.generation_config
     for config in generation_configs:
-        config.rng_seed = 0
         config.max_new_tokens = 30
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_models(model_id)
 
     models_path : Path = tmp_path / model_id
-    save_ov_model_from_optimum(model, hf_tokenizer, models_path)
+    convert_models(model, hf_tokenizer, models_path)
 
     scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256})
     generate_and_compare_with_reference_text(models_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config)
@@ -329,15 +328,12 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse):
 @pytest.mark.precommit
 @pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally.")
 def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse):
-    generation_configs = multinomial_params_n_seq.generation_config
-    for config in generation_configs:
-        config.rng_seed = 0
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
+    opt_model, hf_tokenizer = get_hugging_face_models(model_id)
 
     models_path : Path = tmp_path / model_id
-    save_ov_model_from_optimum(model, hf_tokenizer, models_path)
+    convert_models(opt_model, hf_tokenizer, models_path)
 
     # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq )
     scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256})
-    generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config)
+    generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, multinomial_params_n_seq.generation_config, scheduler_config)
diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py
index 6228f53dd1..41281e9cab 100644
--- a/tests/python_tests/test_kv_cache_eviction.py
+++ b/tests/python_tests/test_kv_cache_eviction.py
@@ -15,7 +15,7 @@
 from openvino import serialize
 from transformers import AutoTokenizer
 
-from common import TESTS_ROOT, run_continuous_batching_pipeline_test
+from common import TESTS_ROOT, run_cb_pipeline_with_ref
 
 
 def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -150,6 +150,7 @@ def get_greedy_seq_len_300() -> GenerationConfig:
     generation_config.max_new_tokens = 300
     return generation_config
 
+
 def get_beam_search_seq_len_300() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_beam_groups = 3
@@ -159,6 +160,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
     generation_config.num_return_sequences = generation_config.num_beams
     return generation_config
 
+
 scheduler_params_list = [
                          ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()),
                          ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()),
@@ -168,5 +170,5 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
 @pytest.mark.parametrize("params", scheduler_params_list)
 @pytest.mark.precommit
 def test_dynamic_memory_allocation(tmp_path, params):
-    run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", params[0], params[1])
+    run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1])
 
diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py
index 6e3cce06d0..986b342c59 100644
--- a/tests/python_tests/test_llm_pipeline.py
+++ b/tests/python_tests/test_llm_pipeline.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import openvino_genai as ov_genai
-from openvino_genai import StopCriteria, GenerationConfig
+from openvino_genai import GenerationConfig
 import pytest
 from typing import Union, List, Dict, Optional
 import numpy as np
@@ -10,152 +10,30 @@
 import sys
 from pathlib import Path
 import torch
-import math
+
+from common import run_llm_pipeline_with_ref, convert_to_hf
 from ov_genai_test_utils import (
     get_models_list,
     read_model,
     load_genai_pipe_with_configs,
     get_chat_models_list,
     model_tmp_path,
-    STOP_CRITERIA_MAP,
 )
 
-
-def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]):
-    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
-    config = generation_config.copy()  # to avoid side effects
-    num_beams = config['num_beams'] if 'num_beams' in config else 1
-    config['num_return_sequences'] = num_beams
-
-    if not isinstance(prompts, list):
-        prompts = [prompts]
-
-    if 'do_sample' not in config:
-        # Some HF models have default do_sample = True, and if we set beam search generation config
-        # it conflicts with `diversity_penalty` and/or `num_beam_groups`.
-        # Need to set explicitly to False, but only if test arguments omitted this arg.
-        # Do not apply 'repetition_penalty' if sampling is not used.
-        config['do_sample'] = False
-        config['repetition_penalty'] = 1.0 # 1.0 means no penalty
-
-    generation_config_hf = config.copy()
-    if generation_config_hf.get('stop_criteria'):
-        generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')]
-    generation_config_hf.pop('ignore_eos', None)
-
-    # Encode the batch of prompts
-    hf_tokenizer.padding_side = "left"
-    encoded_prompts = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True)
-    prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask']
-
-    hf_encoded_outputs = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf)
-
-    hf_outputs = []
-    for idx, hf_encoded_out in enumerate(hf_encoded_outputs):
-        prompt_count = idx // num_beams
-        hf_outputs.append(hf_tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True))
-
-    ov_outputs = ov_pipe.generate(prompts, **config).texts
-
-    hf_outputs.sort()
-    ov_outputs.sort()
-    for i, (hf_output, ov_output) in enumerate(zip(hf_outputs, ov_outputs)):
-        if hf_output != ov_output:
-            print(f'hf_output: {hf_output}')
-            print(f'ov_output: {ov_output}')
-        assert hf_output == ov_output
-
-
-def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict, prompt: str):
-    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
-
-    config = generation_config.copy()  # to avoid side effects
-
-    if 'do_sample' not in config:
-        # Some HF models have default do_sample = True, and if we set beam search generation config
-        # it conflicts with `diversity_penalty` and/or `num_beam_groups`.
-        # Need to set explicitly to False, but only if test arguments omitted this arg.
-        # Do not apply 'repetition_penalty' if sampling is not used.
-        config['do_sample'] = False
-        config['repetition_penalty'] = 1.0 # 1.0 means no penalty
-
-    generation_config_hf = config.copy()
-    if generation_config_hf.get('stop_criteria'):
-        generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')]
-    generation_config_hf.pop('ignore_eos', None)
-
-    encoded_prompt = hf_tokenizer([prompt], return_tensors='pt', add_special_tokens=True)
-    prompt_ids, attention_mask = encoded_prompt['input_ids'], encoded_prompt['attention_mask']
-    hf_encoded_output = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf)
-    hf_output = hf_tokenizer.decode(hf_encoded_output[0, prompt_ids.shape[1]:], skip_special_tokens=True)
-
-    ov_output = ov_pipe.generate(prompt, **config)
-    if config.get('num_return_sequences', 1) > 1:
-        assert hf_output in ov_output.texts
-    else:
-        if hf_output != ov_output:
-            print(f'hf_output: {hf_output}')
-            print(f'ov_output: {ov_output}')
-
-        assert hf_output == ov_output
-
-
-def run_hf_ov_genai_comparison_encoded_inputs(
-        model_descr,
-        generation_config: Dict,
-        input_ids: np.ndarray,
-        attention_mask: Optional[np.array] = None
-    ):
-    device = 'CPU'
-    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
-
-    config = generation_config.copy()  # to avoid side effects
-
-    if 'do_sample' not in config:
-        # Some HF models have default do_sample = True, and if we set beam search generation config
-        # it conflicts with `diversity_penalty` and/or `num_beam_groups`.
-        # Need to set explicitly to False, but only if test arguments omitted this arg.
-        # Do not apply 'repetition_penalty' if sampling is not used.
-        config['do_sample'] = False
-        config['repetition_penalty'] = 1.0 # 1.0 means no penalty
-
-    generation_config_hf = config.copy()
-    if generation_config_hf.get('stop_criteria'):
-        generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')]
-    generation_config_hf.pop('ignore_eos', None)
-
-    if attention_mask is not None:
-        inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask))
-        inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask))
-    else:
-        inputs_hf = dict(inputs=torch.tensor(input_ids))
-        inputs_ov = ov.Tensor(input_ids)
-
-    hf_output = opt_model.generate(**inputs_hf, **generation_config_hf)
-    ov_output = ov_pipe.generate(inputs_ov, **config)
-
-    hf_res = hf_output[0, input_ids.shape[1]:].numpy()
-    ov_res = np.array(ov_output.tokens, dtype=np.int64)
-    assert np.all(ov_res == hf_res)
-
 #
 # e2e work
 #
 
 test_cases = [
-    (dict(max_new_tokens=20), 'table is made of'),
     (dict(max_new_tokens=20), '你好！ 你好嗎？'),
-    (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'),
-    (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'),
-    (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'),
-    (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'),
+    (dict(max_new_tokens=30, num_beams=15, num_beam_groups=3, num_return_sequences=15, diversity_penalty=1.0), 'Alan Turing was a'),
 ]
-@pytest.mark.parametrize("generation_config,prompt", test_cases)
+@pytest.mark.parametrize("generation_config_dict,prompt", test_cases)
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_decoding(model_descr, generation_config, prompt):
-    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
+def test_string_inputs(model_descr, generation_config_dict, prompt):
+    run_llm_pipeline_with_ref(model_id=model_descr[0], prompts=[prompt], generation_config=generation_config_dict, tmp_path=model_descr[1])
 
 
 input_tensors_list = [
@@ -168,13 +46,32 @@ def test_decoding(model_descr, generation_config, prompt):
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_encoded_inputs(model_descr, inputs):
-    run_hf_ov_genai_comparison_encoded_inputs(read_model(model_descr), dict(max_new_tokens=20), *inputs)
+    device = 'CPU'
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr)
+
+    ov_generation_config = GenerationConfig(max_new_tokens=20)
+    hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config)
+
+    input_ids, attention_mask = inputs
+
+    if attention_mask is not None:
+        inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask))
+        inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask))
+    else:
+        inputs_hf = dict(inputs=torch.tensor(input_ids))
+        inputs_ov = ov.Tensor(input_ids)
+
+    hf_output = opt_model.generate(**inputs_hf, generation_config=hf_generation_config)
+    ov_output = ov_pipe.generate(inputs_ov, ov_generation_config)
+
+    hf_res = hf_output[0, input_ids.shape[1]:].numpy()
+    ov_res = np.array(ov_output.tokens, dtype=np.int64)
+    assert np.all(ov_res == hf_res)
 
 
 test_configs = [
     dict(max_new_tokens=20),
-    dict(max_new_tokens=200, ignore_eos=True),
-    dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0)
+    dict(max_new_tokens=20, num_beam_groups=2, num_beams=6, diversity_penalty=1.0)
 ]
 batched_prompts = [
     ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'],
@@ -182,107 +79,13 @@ def test_encoded_inputs(model_descr, inputs):
     ['Alan Turing was a', 'return 0', '你好！ 你好嗎？'],
     ['table is made', 'table is made [force left pad tokens]']
 ]
-@pytest.mark.parametrize("generation_config", test_configs)
+@pytest.mark.parametrize("generation_config_dict", test_configs)
 @pytest.mark.parametrize("prompts", batched_prompts)
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_batch_text_input(model_descr, generation_config, prompts):
-    run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts)
-
-
-prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of']
-@pytest.mark.parametrize("num_beam_groups", [2, 3, 8])
-@pytest.mark.parametrize("group_size", [5, 3, 10])
-@pytest.mark.parametrize("max_new_tokens", [20, 15])
-@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5])
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_beam_search_decoding(model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt):
-    generation_config = dict(
-        num_beam_groups=num_beam_groups,
-        num_beams=num_beam_groups * group_size,
-        diversity_penalty=diversity_penalty,
-        num_return_sequences=num_beam_groups * group_size,
-        max_new_tokens=max_new_tokens,
-    )
-    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
-
-
-@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC])
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.parametrize("max_new_tokens", [10, 80])
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_beam_search_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
-    # todo: with EARLY stop_criteria looks like HF return invalid out with sentence<eos><unk><unk>
-    # while genai ends sentence with <eos>
-    if (stop_criteria == StopCriteria.EARLY):
-        pytest.skip()
-    generation_config = dict(
-        num_beam_groups=2,
-        num_beams=2 * 3,
-        diversity_penalty=1.0,
-        num_return_sequences=2 * 3,
-        max_new_tokens=max_new_tokens,
-        stop_criteria=stop_criteria,
-    )
-    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
-
-
-# test long sequences
-@pytest.mark.parametrize("num_beam_groups", [2])
-@pytest.mark.parametrize("group_size", [5])
-@pytest.mark.parametrize("max_new_tokens", [800, 2000])
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.nightly
-def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size,
-                                    max_new_tokens, prompt):
-    generation_config = dict(
-        num_beam_groups=num_beam_groups,
-        num_beams=num_beam_groups * group_size,
-        diversity_penalty=1.0,
-        num_return_sequences=num_beam_groups * group_size,
-        max_new_tokens=max_new_tokens,
-    )
-    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
-
-
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_greedy_repetition_penalty(model_descr, prompt):
-    model_id, path, tokenizer, model, pipe = read_model(model_descr)
-
-    generation_config = dict(
-        repetition_penalty=2.0,
-        max_new_tokens=20,
-        do_sample=False
-    )
-    run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt)
-
-    generation_config = dict(
-        repetition_penalty=1.0,
-        max_new_tokens=20,
-        do_sample=False
-    )
-    run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt)
-
-    ov_output = pipe.generate(prompt, **generation_config)
-
-    generation_config = dict(
-        repetition_penalty=0.5,
-        max_new_tokens=20,
-        do_sample=False
-    )
-    ov_output_half_penalty = pipe.generate(prompt, **generation_config)
-
-    assert(len(set(ov_output.split(' '))) > len(set(ov_output_half_penalty.split(' '))))
+def test_batch_string_inputs(model_descr, generation_config_dict, prompts):
+    run_llm_pipeline_with_ref(model_id=model_descr[0], prompts=prompts, generation_config=generation_config_dict, tmp_path=model_descr[1])
 
 
 @pytest.mark.precommit
@@ -313,17 +116,14 @@ def test_batch_size_switch():
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_chat_compare_with_HF(model_descr, generation_config_kwargs: Dict):
+def test_chat_scenario(model_descr, generation_config_kwargs: Dict):
     chat_history_hf = []
     chat_history_ov = []
-    chat_prompt = ''
 
-    # Will set add_special_tokens=False inside pipeline when start_chat() is called.
     model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
 
-    from transformers import GenerationConfig as HFGenerationConfig
-    hf_generation_config = HFGenerationConfig(**generation_config_kwargs)
     ov_generation_config = GenerationConfig(**generation_config_kwargs)
+    hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config)
 
     ov_pipe.start_chat()
     for prompt in questions:
@@ -559,39 +359,27 @@ def test_unicode_pybind_decoding_one_string_streamer():
 # Perf metrics
 #
 
-def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics:
-    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
-
-    config = generation_config.copy()  # to avoid side effects
-
-    if 'do_sample' not in config:
-        # Some HF models have default do_sample = True, and if we set beam search generation config
-        # it conflicts with `diversity_penalty` and/or `num_beam_groups`.
-        # Need to set explicitly to False, but only if test arguments omitted this arg.
-        # Do not apply 'repetition_penalty' if sampling is not used.
-        config['do_sample'] = False
-        config['repetition_penalty'] = 1.0 # 1.0 means no penalty
-
-    return ov_pipe.generate([prompt], **config).perf_metrics
+def run_perf_metrics_collection(model_descr, generation_config_dict: dict, prompt: str) -> ov_genai.PerfMetrics:
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr)
+    return ov_pipe.generate([prompt], **generation_config_dict).perf_metrics
 
 
 test_cases = [
     (dict(max_new_tokens=20), 'table is made of'),
 ]
 @pytest.mark.parametrize("generation_config,prompt", test_cases)
-@pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-@pytest.mark.skip(reason="load_time + mean_gen_duration < total_time fails in https://github.com/openvinotoolkit/openvino.genai/actions/runs/12503590506/job/34884840100?pr=1440.")
-def test_perf_metrics(model_descr, generation_config, prompt):
+def test_perf_metrics(generation_config, prompt):
     import time
     start_time = time.perf_counter()
-    perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt)
+    model_id, path = 'katuni4ka/tiny-random-gemma2', Path('katuni4ka-tiny-random-gemma2')
+    perf_metrics = run_perf_metrics_collection((model_id, path), generation_config, prompt)
     total_time = (time.perf_counter() - start_time) * 1000
 
     # Check that load time is adequate.
     load_time = perf_metrics.get_load_time()
-    assert load_time > 0 and load_time < 1000.0
+    assert load_time > 0 and load_time < 2000.0
 
     # Check that num input and generated tokens are adequate.
     num_generated_tokens = perf_metrics.get_num_generated_tokens()
@@ -657,34 +445,6 @@ def test_perf_metrics(model_descr, generation_config, prompt):
 # Misc
 #
 
-# TODO: move to test_sampling.py
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_stop_token_ids():
-    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    res = ov_pipe.generate(
-        ov.Tensor([(1,)]),
-        max_new_tokens=3,
-        stop_token_ids={9935, ov_pipe.get_tokenizer().get_eos_token_id()},
-        include_stop_str_in_output=False
-    )
-    assert 2 == len(res.tokens[0])
-    assert 9935 in res.tokens[0]
-
-
-# TODO: move to test_sampling.py
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_stop_strings():
-    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    res = ov_pipe.generate(
-        "",
-        max_new_tokens=5,
-        stop_strings={"ignored", "боль"}
-    )
-    assert "боль" not in res
-
-
 # TODO: move this test to test_tokenizer.py
 @pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory")
 @pytest.mark.precommit
@@ -698,7 +458,7 @@ def test_left_pad():
     ]
     models = read_model(("microsoft/phi-1_5", Path("phi-1_5/")))
 
-    config = {
+    generation_config_dict = {
         "max_new_tokens": 20,
         "num_beam_groups": 2,
         "num_beams": 2,
@@ -713,4 +473,5 @@ def test_left_pad():
     }
 
     models[2].pad_token = models[2].eos_token
-    run_hf_ov_genai_comparison_batched(models, config, prompts)
+    
+    run_llm_pipeline_with_ref(model_id=models[0], prompts=prompts, generation_config=generation_config_dict, tmp_path=models[1])
diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py
index c3500d15ac..6ef6162043 100644
--- a/tests/python_tests/test_llm_pipeline_static.py
+++ b/tests/python_tests/test_llm_pipeline_static.py
@@ -2,14 +2,18 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import openvino_genai as ov_genai
-from openvino.runtime import Core
 import pytest
+import platform
 import sys
 from ov_genai_test_utils import (
     get_models_list,
     get_chat_models_list,
+    read_model
 )
+from common import get_default_properties
 
+if sys.platform == 'darwin' or platform.machine() in ["aarch64", "arm64", "ARM64"]:
+    pytest.skip("NPU plugin is available only on Linux and Windows x86_64", allow_module_level=True)
 
 # This test suite is designed specifically to validate the functionality and robustness of the StaticLLMPipeline on NPUW:CPU.
 common_config = {
@@ -24,19 +28,18 @@
 def generate_chat_history(model_path, device, pipeline_config, questions):
     pipe = ov_genai.LLMPipeline(model_path, device, **pipeline_config)
     pipe.start_chat()
-    chat_history = [ pipe.generate(question, max_new_tokens=50) for question in questions ]
+    chat_history = [ pipe.generate(question, max_new_tokens=50, do_sample=False) for question in questions ]
     pipe.finish_chat()
     return chat_history
 
 
-@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI")
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_generation_compare_with_stateful():
     prompt = 'The Sun is yellow because'
-    model_path = get_models_list()[0][1]
+    model_path = read_model(get_models_list()[0])[1]
 
-    stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU")
+    stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **get_default_properties())
     ref_out = stateful_pipe.generate(prompt, max_new_tokens=100)
 
     static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config)
@@ -48,11 +51,10 @@ def test_generation_compare_with_stateful():
     assert ref_out == actual_out
 
 
-@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI")
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_length_properties_set_no_exception():
-    model_path = get_models_list()[0][1]
+    model_path = read_model(get_models_list()[0])[1]
     # NB: Check it doesn't throw any exception
     pipeline_config = { "MAX_PROMPT_LEN": 128, "MIN_RESPONSE_LEN": 64 }
     pipeline_config |= common_config
@@ -65,22 +67,20 @@ def test_length_properties_set_no_exception():
     { "MIN_RESPONSE_LEN": -1  },
     { "MIN_RESPONSE_LEN": "1" }
 ]
-@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI")
 @pytest.mark.parametrize("pipeline_config", pipeline_configs)
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_invalid_length_properties_raise_error(pipeline_config):
-    model_path = get_models_list()[0][1]
+    model_path = read_model(get_models_list()[0])[1]
     pipeline_config |= common_config
     with pytest.raises(RuntimeError):
         pipe = ov_genai.LLMPipeline(model_path, "NPU", **pipeline_config)
 
 
-@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI")
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_batch_one_no_exception():
-    model_path = get_models_list()[0][1]
+    model_path = read_model(get_models_list()[0])[1]
     prompt = 'The Sun is yellow because'
     static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config)
     # Check it doesn't throw any exception when batch of size 1 is provided
@@ -88,11 +88,10 @@ def test_batch_one_no_exception():
 
 
 # TODO: For the further batch support
-@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI")
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_batch_raise_error():
-    model_path = get_models_list()[0][1]
+    model_path = read_model(get_models_list()[0])[1]
     prompt = 'The Sun is yellow because'
     pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config)
     with pytest.raises(RuntimeError):
@@ -101,26 +100,24 @@ def test_batch_raise_error():
 
 # TODO: For the further sampling support
 generation_configs = [
-    dict(num_beam_groups=3),
+    dict(num_beams=3),
     dict(do_sample=True)
 ]
-@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI")
 @pytest.mark.parametrize("generation_config", generation_configs)
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_unsupported_sampling_raise_error(generation_config):
-    model_path = get_models_list()[0][1]
+    model_path = read_model(get_models_list()[0])[1]
     prompt = 'The Sun is yellow because'
     pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config)
     with pytest.raises(RuntimeError):
         pipe.generate(prompt, **generation_config)
 
 
-@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI")
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_max_number_of_tokens():
-    model_path = get_models_list()[0][1]
+    model_path = read_model(get_models_list()[0])[1]
     prompt = 'The Sun is yellow because'
     num_tokens = 128
 
@@ -133,11 +130,10 @@ def test_max_number_of_tokens():
 
 
 # FIXME: Known problem, output differs from stateful pipeline starting from 3rd prompt!
-@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI")
 @pytest.mark.skip(reason="JIRA-144780: Output differs from stateful pipeline")
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_chat_generation(model_descr):
+def test_chat_generation():
     questions = [
         '1+1=',
         'What is the previous answer?',
@@ -145,9 +141,9 @@ def test_chat_generation(model_descr):
         'What was my first question?'
     ]
 
-    model_path = get_chat_models_list()[0][1]
+    model_path = read_model(get_chat_models_list()[0])[1]
 
-    chat_history_stateful = generate_chat_history(model_path, "CPU", { }, questions)
+    chat_history_stateful = generate_chat_history(model_path, "CPU", get_default_properties(), questions)
     chat_history_static   = generate_chat_history(model_path, "NPU", common_config, questions)
 
     print('npu chat: \n{chat_history_static}\n')
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index 25ae9d8afa..004d4f9d9d 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -1,84 +1,96 @@
 # Copyright (C) 2018-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-import os
+
 import sys
 import pytest
-import shutil
 import sys
 from dataclasses import dataclass
 from pathlib import Path
-from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer
+from openvino_genai import GenerationConfig, StopCriteria
 from typing import List, TypedDict
 
-from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \
-    get_greedy, get_beam_search, get_multinomial_temperature, \
-    get_greedy_with_penalties, get_multinomial_temperature, \
-    get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \
-    get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, \
-    get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
-    get_greedy, get_greedy_with_min_and_max_tokens, \
-    get_greedy_with_single_stop_string, get_greedy_with_multiple_stop_strings, get_greedy_with_multiple_stop_strings_no_match, \
-    get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \
-    get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \
-    get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \
-    get_greedy_stop_strings_exclude_from_output, get_greedy_stop_strings_include_to_output, \
-    get_greedy_n_stop_strings_exclude_from_output, get_greedy_n_stop_strings_include_to_output, \
-    generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config, \
-    run_continuous_batching
+from common import get_hugging_face_models, convert_models, run_llm_pipeline_with_ref, run_llm_pipeline
 
 
-# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests
 @pytest.mark.precommit
-def test_beam_search_has_eos_token_at_end(tmp_path):
-    '''
-    Current test checks that in case of beam search, some generation results
-    explicitly have EOS token at the end, which is aligned with HF
+@pytest.mark.parametrize("generation_config,prompt",
+                         [(dict(max_new_tokens=30), 'table is made of'),
+                          (dict(max_new_tokens=30, min_new_tokens=30), '你好！ 你好嗎？'),
+                          (dict(max_new_tokens=30, ignore_eos=True), 'Alan Turing was a'),
+                        #   (dict(max_length=40), 'table is made of'),
+                          (dict(stop_token_ids={28998}), 'The Sun is yellow because'), # since a test does not hang, it means stop token is met
+                        #   (dict(max_new_tokens=1, min_new_tokens=0, echo=True), 'What is OpenVINO?')
+                          ],
+                         ids=["max_new_tokens",
+                              "min_and_max_new_tokens",
+                              "max_new_tokens_and_ignore_eos_true",
+                            #   "max_length",
+                              "stop_token_ids",
+                            #   "echo_with_generation",
+                              ])
+def test_basic_stop_criteria(tmp_path, generation_config, prompt):
+    model_id : str = "katuni4ka/tiny-random-phi3"
+    run_llm_pipeline_with_ref(model_id, [prompt], generation_config, tmp_path)
 
-    Example of current output:
-    { -1.23264,  that I don't know about.
-    I don't know what you're talking about, but I'm pretty sure it's a Canadian thing.</s> }
-    '''
-    model_id = "facebook/opt-125m"
-    prompts = ["Tell me something about Canada"]
-    generation_configs = [get_beam_search()]
-    scheduler_config = get_scheduler_config()
-    generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path)
 
-
-# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests
 @pytest.mark.precommit
-def test_greedy_has_eos_token_at_end(tmp_path):
-    '''
-    Current test checks that in case of gready, some generation results
-    explicitly have EOS token at the end, which is aligned with HF:
+@pytest.mark.parametrize("generation_config",
+                         [dict(max_new_tokens=50, min_new_tokens=15, stop_strings={"anag"}, include_stop_str_in_output=True), # expected match on "manage"
+                          dict(max_new_tokens=50, min_new_tokens=1, stop_strings={".", "software", "Intel"}, include_stop_str_in_output=True),
+                          dict(max_new_tokens=50, min_new_tokens=1, stop_strings={"Einstein", "sunny", "geothermal"}, include_stop_str_in_output=True), # expected no match
+                          dict(max_new_tokens=30, stop_strings={ "machines" }, include_stop_str_in_output=False),
+                          dict(max_new_tokens=30, stop_strings={ "machines" }, include_stop_str_in_output=True),
+                          dict(max_new_tokens=30, stop_strings={ "machines", "manage" }, include_stop_str_in_output=False),
+                          dict(max_new_tokens=30, stop_strings={ "machines", "manage" }, include_stop_str_in_output=True),],
+                         ids=["single_stop_string",
+                              "multiple_stop_strings_match",
+                              "multiple_stop_strings_no_match",
+                              "single_stop_string_exclude_from_output",
+                              "single_stop_string_include_to_output",
+                              "multiple_stop_strings_exclude_from_output",
+                              "multiple_stop_strings_include_to_output"])
+def test_stop_strings(tmp_path, generation_config):
+    prompts = [ "What is OpenVINO?" ]
+    model_id : str = "facebook/opt-125m"
+    run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path)
 
-    Example of current output:
-    {  a software program</s> }
-    '''
-    model_id = "bigscience/bloomz-560m"
-    prompts = ["What is OpenVINO?"]
-    generation_configs = [get_greedy()]
-    scheduler_config = get_scheduler_config()
-    generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path)
+
+@pytest.mark.precommit
+@pytest.mark.parametrize("generation_config",
+                         [dict(max_new_tokens=30),
+                          dict(max_new_tokens=30, repetition_penalty=2.0),],
+                         ids=["basic",
+                              "repetition_penalty",])
+def test_greedy(tmp_path, generation_config):
+    prompts = [ "What is OpenVINO?" ]
+    model_id : str = "katuni4ka/tiny-random-phi3"
+    run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path)
 
 
-# TODO: consider removing all these functions with generation configs and use Dict with properties, which can be converted to generation config
 @pytest.mark.precommit
 @pytest.mark.parametrize("generation_config",
-                         [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(),
-                          get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(),
-                          get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(),
-                          get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(),
-                          get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output()],
-                         ids=["greedy", "greedy_with_min_and_max_tokens", "greedy_with_repetition_penalty", "greedy_with_single_stop_string",
-                              "greedy_with_multiple_stop_strings", "greedy_with_multiple_stop_strings_no_match", "beam_search", "beam_search_min_and_max_tokens",
-                              "beam_search_with_multiple_stop_strings_no_match", "greedy_stop_strings_exclude_from_output", "greedy_stop_strings_include_to_output",
-                              "greedy_n_stop_strings_exclude_from_output", "greedy_n_stop_strings_include_to_output"])
-def test_sampling_against_optimum(tmp_path, generation_config):
+                         [dict(max_new_tokens=30, num_beams=2),
+                          dict(max_new_tokens=30, num_beams=2, stop_criteria=StopCriteria.NEVER),
+                          dict(max_new_tokens=30, num_beams=2, stop_criteria=StopCriteria.EARLY),
+                        #   dict(max_new_tokens=30, num_beams=2, echo=True),
+                          dict(max_new_tokens=30, num_beams=2, length_penalty=1.0),
+                          dict(max_new_tokens=30, num_beams=2, no_repeat_ngram_size=2),
+                          dict(max_new_tokens=30, num_beams=6, num_beam_groups=3, diversity_penalty=1.2, num_return_sequences=3),
+                          dict(max_new_tokens=30, min_new_tokens=15, num_beams=2, num_return_sequences=1),
+                          dict(max_new_tokens=30, num_beams=2, stop_strings={"Einstein", "sunny", "geothermal"}, include_stop_str_in_output=True),],
+                         ids=["single_group_stop_criteria_heuristic",
+                              "single_group_stop_criteria_never",
+                              "single_group_stop_criteria_early",
+                            #   "single_group_with_echo",
+                              "single_group_lenght_penalty",
+                              "single_group_no_repeat_ngram_size",
+                              "multiple_groups",
+                              "single_group_min_new_tokens",
+                              "single_group_with_multiple_stop_strings_no_match",])
+def test_beam_search(tmp_path, generation_config):
     prompts = [ "What is OpenVINO?" ]
-    generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path)
+    run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path)
 
 
 @pytest.mark.precommit
@@ -87,13 +99,28 @@ def test_sampling_against_optimum(tmp_path, generation_config):
     reason="Stop strings do not seem to work as expected with beam search in HF, so comparison will fail. If it changes, these cases shall be merged to the test above.",
     strict=True,
 )
-@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings()],
-                         ids=["beam_search_with_single_stop_string", "beam_search_with_multiple_stop_strings"])
+@pytest.mark.parametrize("generation_config",
+                         [dict(max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, num_return_sequences=6, stop_strings={"open sour"}, include_stop_str_in_output=True),
+                          dict(max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, num_return_sequences=6, stop_strings={".", "software", "Intel"}, include_stop_str_in_output=True),],
+                         ids=["single_stop_string_match", "multiple_stop_strings_match"])
 def test_beam_search_with_stop_string(tmp_path, generation_config):
     prompts = [ "What is OpenVINO?" ]
-    generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path)
+    run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path)
+
+
+@pytest.mark.precommit
+@pytest.mark.parametrize("generation_config",
+                         [dict(max_new_tokens=1, min_new_tokens=0, echo=True),
+                          dict(max_new_tokens=30, num_beams=2, echo=True),],
+                         ids=["echo_with_generation",
+                              "single_group_with_echo",])
+def test_echo(tmp_path, generation_config):
+    prompts = [ "What is OpenVINO?" ]
+    model_id : str = "facebook/opt-125m"
+    # TODO: support in stateful mode and remove 'use_cb=True' and this test at all
+    # as we can enable new parameters set in other tests
+    run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path, use_cb=True)
 
 
 # TODO: remove platform specific reference texts once CVS-159912 is done and use comparison with HF
@@ -123,6 +150,12 @@ class RandomSamplingTestStruct:
     prompts: List[str]
     ref_texts: List[List[str]]
 
+from common import get_multinomial_temperature, get_greedy_with_penalties, \
+    get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \
+    get_multinomial_temperature_top_p_and_top_k, get_multinomial_all_parameters, \
+    get_multinomial_temperature_and_num_return_sequence, get_multinomial_max_and_min_token, \
+    get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \
+    get_multinomial_temperature_and_repetition_penalty
 
 RANDOM_SAMPLING_TEST_CASES = [
     RandomSamplingTestStruct(
@@ -285,72 +318,15 @@ def test_multinomial_sampling_against_reference(tmp_path, test_struct: RandomSam
 
     prompts = test_struct.prompts
     generation_config.rng_seed = 0
-    generation_configs = [generation_config]
+    generation_configs = generation_config
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_models(model_id)
 
     models_path : Path = tmp_path / model_id
-    save_ov_model_from_optimum(model, hf_tokenizer, models_path)
+    convert_models(model, hf_tokenizer, models_path)
 
     # run multinomial without comparison with reference
-    _ = run_continuous_batching(models_path, DEFAULT_SCHEDULER_CONFIG, prompts, generation_configs)
+    _ = run_llm_pipeline(models_path, prompts, generation_configs)
 
     # Reference comparison is not performed as sampling results are non-deterministic.
     # Discrete_distribution impl depends on platform, model inference results may depend on CPU.
-
-
-@pytest.mark.precommit
-@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters],
-                         ids=["greedy", "beam_search", "multinomial_all_parameters"])
-@pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256])
-def test_echo_prompt_phase_only(tmp_path, get_generation_config, max_num_batched_tokens):
-    generation_config = get_generation_config()
-    generation_config.max_new_tokens = 0
-    generation_config.echo = True
-
-    scheduler_config = get_scheduler_config()
-    scheduler_config.max_num_batched_tokens = max_num_batched_tokens
-    generation_configs = [generation_config]
-    model_id : str = "facebook/opt-125m"
-    opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
-
-    model_path : Path = tmp_path / model_id
-    save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path)
-
-    cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
-
-    outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs)
-    assert(len(outputs))
-    for output in outputs:
-        assert(len(output.m_generation_ids))
-        for sequence in output.m_generation_ids:
-            assert(sequence == "What is OpenVINO?")
-
-
-@pytest.mark.precommit
-@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters],
-                         ids=["greedy", "beam_search", "multinomial_all_parameters"])
-@pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256])
-def test_echo_with_generation_phase(tmp_path, get_generation_config, max_num_batched_tokens):
-    generation_config = get_generation_config()
-    generation_config.max_new_tokens = 10
-    generation_config.echo = True
-
-    scheduler_config = get_scheduler_config()
-    scheduler_config.max_num_batched_tokens = max_num_batched_tokens
-    generation_configs = [generation_config]
-    model_id : str = "facebook/opt-125m"
-    opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
-
-    model_path : Path = tmp_path / model_id
-    save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path)
-
-    cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
-    outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs)
-    assert(len(outputs))
-
-    for output in outputs:
-        assert(len(output.m_generation_ids))
-        for sequence in output.m_generation_ids:
-            assert(sequence.startswith("What is OpenVINO?"))
-            assert(len(sequence) > len("What is OpenVINO?"))
diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py
index b4df6492bb..81c181bc54 100644
--- a/tests/python_tests/test_vlm_pipeline.py
+++ b/tests/python_tests/test_vlm_pipeline.py
@@ -6,8 +6,8 @@
 import pytest
 import transformers
 from optimum.intel.openvino import OVModelForVisualCausalLM
-from openvino_genai import VLMPipeline
-from common import get_greedy, get_image_by_link, get_beam_search, get_greedy, get_multinomial_all_parameters
+from openvino_genai import VLMPipeline, GenerationConfig
+from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters
 
 def get_ov_model(cache):
     model_dir = cache.mkdir("tiny-random-minicpmv-2_6")
@@ -49,21 +49,22 @@ def streamer(word: str) -> bool:
         return False
 
     models_path = get_ov_model(cache)
+    generation_config = GenerationConfig(max_new_tokens=30)
 
     for links in image_links_for_testing:
         images = []
         for link in links:
             images.append(get_image_by_link(link))
 
-        pipe = VLMPipeline(models_path, "CPU")
-        pipe.start_chat()
+        ov_pipe = VLMPipeline(models_path, "CPU")
+        ov_pipe.start_chat()
 
-        pipe.generate(prompts[0], images=images, generation_config=get_greedy(), streamer=streamer)
+        ov_pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer)
 
         for prompt in prompts[1:]:
-            pipe.generate(prompt, generation_config=get_greedy(), streamer=streamer)
+            ov_pipe.generate(prompt, generation_config=generation_config, streamer=streamer)
 
-        pipe.finish_chat()
+        ov_pipe.finish_chat()
 
 
 @pytest.mark.precommit
@@ -95,7 +96,7 @@ def test_perf_metrics(cache):
     images = [get_image_by_link(image_links[0])]
 
     pipe = VLMPipeline(models_path, "CPU")
-    result = pipe.generate(prompts[0], images=images, generation_config=get_greedy())
+    result = pipe.generate(prompts[0], images=images, generation_config=GenerationConfig(max_new_tokens=30))
 
     perf_metrics = result.perf_metrics
 
diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
index 7b966f049e..fec9e96f4c 100644
--- a/tools/who_what_benchmark/tests/test_cli_image.py
+++ b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -42,8 +42,8 @@ def teardown_module():
         ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"),
         ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"),
         ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"),
-        ("hf-internal-testing/tiny-stable-diffusion-torch", "image-inpainting", "hf"),
-        ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-inpainting", "hf"),
+        # ("hf-internal-testing/tiny-stable-diffusion-torch", "image-inpainting", "hf"),
+        # ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-inpainting", "hf"),
     ],
 )
 def test_image_model_types(model_id, model_type, backend):
@@ -88,7 +88,10 @@ def test_image_model_types(model_id, model_type, backend):
 @pytest.mark.parametrize(
     ("model_id", "model_type"),
     list(itertools.product(OV_IMAGE_MODELS,
-                           ["image-to-image", "text-to-image", "image-inpainting"])),
+                           ["image-to-image",
+                            "text-to-image",
+                            # "image-inpainting"
+                            ])),
 )
 def test_image_model_genai(model_id, model_type):
     with tempfile.TemporaryDirectory() as temp_dir:

From 002f84fecf311ac453c5c298b619cafabfdadd80 Mon Sep 17 00:00:00 2001
From: Oleg Pipikin <oleg.pipikin@intel.com>
Date: Sat, 4 Jan 2025 01:15:17 +0100
Subject: [PATCH 079/110] Add slice before matmut transformation for CB
 scenario (#1261)

CVS-154930
CVS-155533

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 src/cpp/src/continuous_batching_impl.cpp      | 14 ++---
 src/cpp/src/llm_pipeline_stateful.cpp         |  2 +-
 src/cpp/src/model_runner.hpp                  | 51 ++++++++++++++----
 src/cpp/src/sampler.cpp                       | 13 +++--
 src/cpp/src/sequence_group.hpp                | 13 +++++
 .../speculative_decoding_impl.cpp             |  2 +
 src/cpp/src/utils.cpp                         | 53 ++++++++++++++-----
 src/cpp/src/utils.hpp                         |  4 +-
 .../utils/paged_attention_transformations.hpp |  2 +
 9 files changed, 115 insertions(+), 39 deletions(-)

diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 7b076504d0..44bfaf7f21 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -28,6 +28,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
 
     bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
     utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control);
+    utils::apply_gather_before_matmul_transformation(model);
 
     initialize_pipeline(model, scheduler_config, properties, device_config, core);
 }
@@ -444,7 +445,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
     const float * logits_data = logits.data<float>();
     ov::Shape logits_shape = logits.get_shape();
     OPENVINO_ASSERT(logits_shape.size() == 3);
-    size_t batch_seq_len = logits_shape[1], vocab_size = logits_shape[2];
+    size_t vocab_size = logits_shape[2];
     for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
         SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id];
         // requests not scheduled, in decoding phase or not echoing are not processed
@@ -454,18 +455,17 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
 
         size_t num_running_sequences = sequence_group->num_running_seqs();
         OPENVINO_ASSERT(num_running_sequences == 1);
-        size_t actual_seq_len = sequence_group->get_num_scheduled_tokens();
-        size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len);
+        size_t output_seq_len = sequence_group->get_output_seq_len();
 
         const float * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens;
 
         size_t num_prompt_tokens_processed = sequence_group->get_num_processed_tokens();
-        OPENVINO_ASSERT(num_prompt_tokens_processed + actual_seq_len <= sequence_group->get_prompt_len());
+        OPENVINO_ASSERT(num_prompt_tokens_processed + output_seq_len <= sequence_group->get_prompt_len());
 
         // if we processed the whole prompt we don't include last logprob as it will be processed by the sampler (it's already completion)
         // otherwise we include it as it will be used in the next part of the prompt
         int exclude_last_logprob = 1;
-        if (num_prompt_tokens_processed + actual_seq_len < sequence_group->get_prompt_len())
+        if (num_prompt_tokens_processed + output_seq_len < sequence_group->get_prompt_len())
             exclude_last_logprob = 0;
 
         // if we start processing the prompt we add "fake" log prob for the first position (begin of sequence)
@@ -473,7 +473,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
             sequence_group->append_prompt_log_prob(1.0);
 
         for (int token_logits_offset = 0, token_id_offset = num_prompt_tokens_processed + 1;
-             token_logits_offset < actual_seq_len - exclude_last_logprob;
+             token_logits_offset < output_seq_len - exclude_last_logprob;
              token_logits_offset++, token_id_offset++) {
 
             const float* token_logits = (sequence_group_logits_data + token_logits_offset * vocab_size);
@@ -498,7 +498,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
 
             sequence_group->append_prompt_log_prob(token_logit - max_value - log_sum);
         }
-        currently_processed_tokens += padded_amount_of_processed_tokens * num_running_sequences;
+        currently_processed_tokens += output_seq_len * num_running_sequences;
         // For max_new_tokens == 0, we don't reach sampling so need to notify handle separately
         if(sequence_group->get_sampling_parameters().max_new_tokens == 0) {
             sequence_group->notify_handle_echo_only();
diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
index 890afe2ab9..153fcc6fce 100644
--- a/src/cpp/src/llm_pipeline_stateful.cpp
+++ b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -38,7 +38,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     const ov::AnyMap& properties,
     const ov::genai::GenerationConfig& generation_config)
     : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
-    utils::slice_matmul_stateful_model(model);
+    utils::apply_slice_before_matmul_transformation(model);
     m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model);
 
     ov::CompiledModel compiled_model;
diff --git a/src/cpp/src/model_runner.hpp b/src/cpp/src/model_runner.hpp
index abc96ac423..27eee9e27d 100644
--- a/src/cpp/src/model_runner.hpp
+++ b/src/cpp/src/model_runner.hpp
@@ -114,28 +114,54 @@ class ModelRunner {
         subsequence_begins_data[0] = 0;
         block_indices_begins_data[0] = 0;
 
+        bool matmul_gathering_is_available = false;
+        size_t gathering_current_index = 0;
+        std::vector<int64_t> gather_indices_values;
+        try {
+            std::ignore = m_request.get_tensor("sampled_tokens_indices");
+            matmul_gathering_is_available = true;
+        } catch (const ov::Exception&) {}
+
+
         for (size_t i = 0; i < num_sequence_groups; ++i) {
             size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i];
-            SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id];
-            std::vector<Sequence::CPtr> running_sequences = sequence_group->get_running_sequences();
+            SequenceGroup::Ptr sequence_group = sequence_groups[seq_group_id];
+            std::vector<Sequence::Ptr> running_sequences = sequence_group->get_running_sequences();
             size_t num_running_sequences = running_sequences.size();
             size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens();
             size_t group_position_id = sequence_group->get_num_processed_tokens();
+            size_t prompt_len = sequence_group->get_prompt_len();
 
-            // spec: In case of multiple input tokens for current sequence (prompt_len > 1),
-            // context_len corresponds to first token within subgroup of scheduled tokens
-            size_t group_context_len = group_position_id;
+            // Next variables are only for sliced matmul case
+            size_t output_seq_len = 0;
+            const bool echo_output = sequence_group->get_sampling_parameters().echo;
+            const bool sampling_is_required = sequence_group->requires_sampling();
+            const size_t tokens_to_sample_per_sequence = 1 + sequence_group->get_num_tokens_to_validate();
 
             for (size_t seq_id = 0; seq_id < num_running_sequences; ++seq_id) {
+                output_seq_len = 0;
                 Sequence::CPtr sequence = running_sequences[seq_id];
-
-                for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id) {
+                for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id, ++gathering_current_index) {
                     // compute token for current sequence
-                    input_ids_data[token_id] = position_id < sequence_group->get_prompt_len() ?
+                    input_ids_data[token_id] = position_id < prompt_len ?
                         sequence_group->get_prompt_ids()[position_id] :
-                        sequence->get_generated_ids()[position_id - sequence_group->get_prompt_len()];
+                        sequence->get_generated_ids()[position_id - prompt_len];
 
                     position_ids_data[token_id] = position_id;
+
+                    // Check if token gathering is required for the entire sequence group
+                    if (matmul_gathering_is_available && (sampling_is_required || echo_output)) {
+                        // Determine if the current token should be gathered
+                        if (echo_output ||
+                            // Skip gathering for prompt tokens
+                            group_position_id + token_id >= prompt_len - 1 &&
+                            // Gather only the last scheduled token or 1 + num_tokens_to_validate tokens for SD
+                            // In SD, tokens_to_sample_per_sequence may exceed num_scheduled_tokens
+                            token_id + tokens_to_sample_per_sequence >= num_scheduled_tokens) {
+                            gather_indices_values.push_back(gathering_current_index);
+                            output_seq_len++;
+                        }
+                    }
                 }
 
                 size_t expected_kv_cache_size = sequence_group->get_num_processed_tokens() - sequence_group->get_num_evicted_tokens();
@@ -153,6 +179,7 @@ class ModelRunner {
                 subsequence_begins_data += 1;
                 block_indices_begins_data += 1;
             }
+            sequence_group->set_output_seq_len(matmul_gathering_is_available ? output_seq_len : num_scheduled_tokens);
         }
 
         // typical LLM parameters
@@ -168,6 +195,12 @@ class ModelRunner {
         m_request.set_tensor("block_indices_begins", block_indices_begins);
         m_request.set_tensor("max_context_len", max_context_len);
 
+        if (matmul_gathering_is_available) {
+            ov::Tensor gather_indices(ov::element::i64, {gather_indices_values.size()});
+            std::memcpy(gather_indices.data(), gather_indices_values.data(), gather_indices_values.size() * sizeof(int64_t));
+            m_request.set_tensor("sampled_tokens_indices", gather_indices);
+        }
+
         // print_tensor("input_ids", input_ids);
         // print_tensor("position_ids", position_ids);
 
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
index 9c18dc7721..b2e8add403 100644
--- a/src/cpp/src/sampler.cpp
+++ b/src/cpp/src/sampler.cpp
@@ -749,7 +749,7 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
     const float * logits_data = logits.data<float>();
     ov::Shape logits_shape = logits.get_shape();
     OPENVINO_ASSERT(logits_shape.size() == 3);
-    size_t batch_seq_len = logits_shape[1], vocab_size = logits_shape[2];
+    size_t vocab_size = logits_shape[2];
 
     SamplerOutput sampler_output;
     for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
@@ -758,8 +758,7 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
             continue;
 
         size_t num_running_sequences = sequence_group->num_running_seqs();
-        size_t actual_seq_len = sequence_group->get_num_scheduled_tokens(); // points to a token which needs to be sampled
-        size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len);
+        size_t output_seq_len = sequence_group->get_output_seq_len();
         const ov::genai::GenerationConfig& sampling_params = sequence_group->get_sampling_parameters();
 
         const auto request_id = sequence_group->get_request_id();
@@ -774,13 +773,13 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
         auto& stop_strings = m_stop_strings.at(request_id);
         auto& logit_processor = m_logit_processors.at(request_id);
         const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens;
-        ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data);
+        ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, output_seq_len, vocab_size}, (void *)sequence_group_logits_data);
         size_t max_removed_tokens_per_request = 0, min_generated_len = std::numeric_limits<size_t>::max(), updated_validation_len = 0;
         if (sequence_group->requires_sampling()) {
             // get number of token to be validated
             auto num_tokens_to_process = sequence_group->get_num_tokens_to_validate();
-            if (num_tokens_to_process > actual_seq_len - 1) {
-                auto delta = num_tokens_to_process - (actual_seq_len - 1);
+            if (num_tokens_to_process > output_seq_len - 1) {
+                auto delta = num_tokens_to_process - (output_seq_len - 1);
                 updated_validation_len = std::max(updated_validation_len, delta);
                 num_tokens_to_process -= delta;
             }
@@ -914,7 +913,7 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
         }
 
         // accumulate a number of processed tokens
-        currently_processed_tokens += padded_amount_of_processed_tokens * num_running_sequences;
+        currently_processed_tokens += output_seq_len * num_running_sequences;
     }
 
     return sampler_output;
diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp
index 8f8d5f899e..14ce87c6f1 100644
--- a/src/cpp/src/sequence_group.hpp
+++ b/src/cpp/src/sequence_group.hpp
@@ -222,6 +222,8 @@ class SequenceGroup  : public std::enable_shared_from_this<SequenceGroup> {
     size_t m_num_validation_tokens = 0;
     // flag to enable/disable token generation, e.g. in speculative decoding scenario
     bool m_is_gen_paused = false;
+    // output seq len at current iteration
+    size_t m_output_seq_len = 0;
 
     size_t m_num_streamed_tokens = 0, m_stream_window_size = 0;
 
@@ -394,6 +396,14 @@ class SequenceGroup  : public std::enable_shared_from_this<SequenceGroup> {
         return m_num_processed_tokens;
     }
 
+    size_t get_output_seq_len() const {
+        return m_output_seq_len;
+    }
+
+    void set_output_seq_len(size_t len) {
+        m_output_seq_len = len;
+    }
+
     /**
      * Registers within the sequence group that a given amount of tokens
      * has been evicted from the underlying KV cache.
@@ -436,11 +446,14 @@ class SequenceGroup  : public std::enable_shared_from_this<SequenceGroup> {
 
     void schedule_tokens(size_t num_tokens) {
         m_num_scheduled_tokens = num_tokens;
+        // Unless otherwise specified, the sampler will process all scheduled tokens.
+        m_output_seq_len = num_tokens;
     }
 
     void clear_scheduled_tokens() {
         m_num_scheduled_tokens = 0;
         m_num_validation_tokens = 0;
+        m_output_seq_len = 0;
     }
 
     bool is_scheduled() const {
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
index f749ac4e81..526c5df2d4 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -33,6 +33,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
 
     utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction);
     utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction);
+    utils::apply_gather_before_matmul_transformation(main_model);
+    utils::apply_gather_before_matmul_transformation(draft_model);
 
     std::string draft_device = draft_model_desc.device.empty() ? main_model_desc.device : draft_model_desc.device;
     bool is_draft_scheduler_undefined = draft_model_desc.scheduler_config == SchedulerConfig();
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index 52faae02e9..9261aa7a4a 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -4,9 +4,11 @@
 #include "utils.hpp"
 
 #include <fstream>
+#include <memory>
 
 #include "openvino/op/add.hpp"
 #include "openvino/op/divide.hpp"
+#include "openvino/op/gather.hpp"
 #include "openvino/op/multiply.hpp"
 #include "openvino/op/matmul.hpp"
 #include "openvino/op/slice.hpp"
@@ -230,23 +232,34 @@ ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::Token
     return {new_input_ids, new_attention_mask};
 }
 
-void slice_matmul_stateful_model(std::shared_ptr<ov::Model> model) {
-    auto last_node = model->output(0).get_node()->input_value(0).get_node();
-    ov::Node* matmul = dynamic_cast<ov::op::v0::MatMul*>(last_node);
-    if (matmul) {
-        // we have found matmul, do nothing
-    } else if(auto add = dynamic_cast<ov::op::v1::Add*>(last_node)) {
-        matmul = dynamic_cast<ov::op::v0::MatMul*>(add->input_value(0).get_node());
-    } else if (auto transpose = dynamic_cast<ov::op::v1::Transpose*>(last_node)) {
-        matmul = dynamic_cast<ov::op::v0::MatMul*>(transpose->input_value(0).get_node());
-    } else if (auto multiply = dynamic_cast<ov::op::v1::Multiply*>(last_node)) {
-        if (auto tanh = dynamic_cast<ov::op::v0::Tanh*>(multiply->input_value(0).get_node())) {
-            if (auto divide = dynamic_cast<ov::op::v1::Divide*>(tanh->input_value(0).get_node())) {
-                matmul = dynamic_cast<ov::op::v0::MatMul*>(divide->input_value(0).get_node());
+namespace {
+std::shared_ptr<ov::Node> find_llm_matmul(const std::shared_ptr<ov::Model>& model) {
+    auto last_node = model->output(0).get_node()->input_value(0).get_node_shared_ptr();
+    std::shared_ptr<ov::Node> matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(last_node);
+    // There are several patterns for matmul we are looking for:
+    // Matmul -> Result
+    // Matmul -> Add -> Result
+    // Matmul -> Transpose -> Result
+    // MatMul -> Divide -> Tanh -> Multiply -> Result
+    if (!matmul) {
+        if(auto add = std::dynamic_pointer_cast<ov::op::v1::Add>(last_node)) {
+            matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->input_value(0).get_node_shared_ptr());
+        } else if (auto transpose = std::dynamic_pointer_cast<ov::op::v1::Transpose>(last_node)) {
+            matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(transpose->input_value(0).get_node_shared_ptr());
+        } else if (auto multiply = std::dynamic_pointer_cast<ov::op::v1::Multiply>(last_node)) {
+            if (auto tanh = std::dynamic_pointer_cast<ov::op::v0::Tanh>(multiply->input_value(0).get_node_shared_ptr())) {
+                if (auto divide = std::dynamic_pointer_cast<ov::op::v1::Divide>(tanh->input_value(0).get_node_shared_ptr())) {
+                    matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(divide->input_value(0).get_node_shared_ptr());
+                }
             }
         }
     }
+    return matmul;
+}
+} // namespace
 
+void apply_slice_before_matmul_transformation(std::shared_ptr<ov::Model> model) {
+    auto matmul = find_llm_matmul(model);
     if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) {
         auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
         auto stop = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-2});
@@ -257,6 +270,19 @@ void slice_matmul_stateful_model(std::shared_ptr<ov::Model> model) {
     }
 }
 
+void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model) {
+    auto matmul =  ov::genai::utils::find_llm_matmul(model);
+    if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) {
+        auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::PartialShape{-1});
+        indices->set_friendly_name("sampled_tokens_indices");
+        indices->output(0).get_tensor().set_names({"sampled_tokens_indices"});
+        auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
+        auto gather = std::make_shared<ov::op::v8::Gather>(matmul->input_value(0), indices, axis);
+        matmul->input(0).replace_source_output(gather);
+        model->add_parameters({indices});
+    }
+}
+
 template <typename T>
 void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value) {
     if (!model)
@@ -396,7 +422,6 @@ void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const ch
         }
     }
 }
-
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index af9d889115..ad0e1a05d4 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -99,7 +99,9 @@ std::pair<ov::AnyMap, SchedulerConfig> split_scheduler_config(const ov::AnyMap&
 
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
 
-void slice_matmul_stateful_model(std::shared_ptr<ov::Model> model);
+void apply_slice_before_matmul_transformation(std::shared_ptr<ov::Model> model);
+
+void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model);
 
 ov::Core singleton_core();
 
diff --git a/src/cpp/src/utils/paged_attention_transformations.hpp b/src/cpp/src/utils/paged_attention_transformations.hpp
index 88ac0876c5..2cb32adcdc 100644
--- a/src/cpp/src/utils/paged_attention_transformations.hpp
+++ b/src/cpp/src/utils/paged_attention_transformations.hpp
@@ -27,6 +27,8 @@ size_t get_hidden_size(const std::shared_ptr<ov::Model> model);
 
 void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig& device_config);
 
+void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model);
+
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov

From 31d632b5a6c2a3b93680feaf5d1894af8fd91afa Mon Sep 17 00:00:00 2001
From: Anatoliy Talamanov <anatoliy.talamanov@intel.com>
Date: Sat, 4 Jan 2025 11:20:14 +0000
Subject: [PATCH 080/110] StaticLLMPipeline: Support more generation options
 (#1431)

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 src/cpp/src/llm_pipeline_static.cpp           |  96 ++++++++++++-----
 src/cpp/src/llm_pipeline_static.hpp           |   5 +-
 src/cpp/src/sampler.cpp                       |   4 +-
 src/cpp/src/sampler.hpp                       |   4 +-
 src/cpp/src/sequence_group.hpp                |   6 +-
 .../python_tests/test_llm_pipeline_static.py  | 101 ++++++++++++++++--
 tools/llm_bench/task/text_generation.py       |   4 +-
 7 files changed, 170 insertions(+), 50 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index e163dce2df..de1038a716 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -1,8 +1,10 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
 #include "llm_pipeline_static.hpp"
 
+#include "sampler.hpp"
+
 #include <fstream>
 #include <regex>
 
@@ -235,12 +237,12 @@ enum class GenerateHint {
 
 std::string to_string(GenerateHint h) {
     switch(h) {
-        case GenerateHint::FAST_COMPILE : 
+        case GenerateHint::FAST_COMPILE :
             return "FAST_COMPILE";
-        case GenerateHint::BEST_PERF : 
+        case GenerateHint::BEST_PERF :
             return "BEST_PERF";
         default:
-            OPENVINO_THROW("Unsupported value for type GenerateHint provided");        
+            OPENVINO_THROW("Unsupported value for type GenerateHint provided");
     }
 }
 
@@ -632,6 +634,19 @@ void copy_columns_by_row_chunks(const ov::Tensor& src, ov::Tensor& dst) {
     }
 }
 
+void stream_generated_tokens(std::shared_ptr<ov::genai::StreamerBase> streamer_ptr,
+                             ov::genai::GenerationHandle& handle) {
+    if (streamer_ptr && handle->can_read()) {
+        std::unordered_map<uint64_t, ov::genai::GenerationOutput> token = handle->back();
+        for (const auto& gen_token : token.begin()->second.generated_ids) {
+            if (streamer_ptr->put(gen_token)) {
+                handle->drop();
+                break;
+            }
+        }
+    }
+}
+
 } // anonymous namespace
 
 namespace ov {
@@ -643,7 +658,8 @@ StaticLLMPipeline::StaticLLMPipeline(
     const std::string& device,
     const ov::AnyMap& config
 ) : LLMPipelineImplBase(tokenizer,
-                        utils::from_config_json_if_exists(models_path)) {
+                        utils::from_config_json_if_exists(models_path)),
+    m_sampler(m_tokenizer) {
     auto properties = config;
     /* NB: Static LLM pipeline consists of two models,
        first to process the input prompt (prefill),
@@ -672,6 +688,8 @@ StaticLLMPipeline::StaticLLMPipeline(
     if (m_generation_config.eos_token_id == -1) {
         m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
     }
+
+    m_sampler.set_seed(m_generation_config.rng_seed);
 };
 
 StaticLLMPipeline::StaticLLMPipeline(
@@ -688,8 +706,7 @@ StaticLLMPipeline::StaticLLMPipeline(
     const std::string& device,
     const ov::AnyMap& properties,
     const ov::genai::GenerationConfig& generation_config
-) : LLMPipelineImplBase(tokenizer, generation_config) {
-    
+) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
     bool use_blobs = false;
     auto anyopt = get_option<bool>(properties, "USE_BLOBS");
     if (anyopt.has_value()) {
@@ -708,6 +725,8 @@ StaticLLMPipeline::StaticLLMPipeline(
     if (m_generation_config.eos_token_id == -1) {
         m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
     }
+
+    m_sampler.set_seed(m_generation_config.rng_seed);
 }
 
 void StaticLLMPipeline::setupAndCompileModels(
@@ -955,7 +974,10 @@ EncodedResults StaticLLMPipeline::generate(
         attention_mask = data->attention_mask;
     }
 
-    if (input_ids.get_shape().at(0) > 1u) {
+    ov::Shape prompts_shape = input_ids.get_shape();
+    const size_t batch_size = prompts_shape[0];
+
+    if (batch_size > 1u) {
         OPENVINO_THROW("Currently only batch size=1 is supported");
     }
 
@@ -974,12 +996,14 @@ EncodedResults StaticLLMPipeline::generate(
         streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
     }
 
-    if (!config.is_greedy_decoding()) {
-        OPENVINO_THROW("Currently only greedy decoding is supported");
+    if (!config.is_greedy_decoding() && !config.is_multinomial()) {
+        OPENVINO_THROW("Currently only greedy and multinomial decoding are supported");
+    }
+
+    if (config.num_return_sequences != 1u) {
+        OPENVINO_THROW("Currently only \"num_return_sequences\" equal to 1 is supported!");
     }
 
-    ov::Shape prompts_shape = input_ids.get_shape();
-    const size_t batch_size = prompts_shape[0];
     ov::genai::EncodedResults results;
     auto& raw_perf_counters = results.perf_metrics.raw_metrics;
     // NB: Only batch=1 is supported now
@@ -1016,11 +1040,21 @@ EncodedResults StaticLLMPipeline::generate(
 
     // NB: Now there are prompt_len tokens in KV-cache
     m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(prompt_len);
-    int64_t last_token = utils::argmax(m_prefill_request.get_tensor("logits"), 0);
-    results.tokens[0].push_back(last_token);
-    if (streamer_ptr && streamer_ptr->put(last_token)) {
-        return results;
-    }
+
+    auto logits = m_prefill_request.get_tensor("logits");
+    int64_t output_sequence_len = logits.get_shape().at(1);
+
+    auto sequence_group = std::make_shared<SequenceGroup>(
+        0 /* request_id */, padded_input_ids, config, 1 /* block_size */);
+    sequence_group->update_processed_tokens_num(m_kvcache_desc.max_prompt_size - output_sequence_len);
+    sequence_group->schedule_tokens(output_sequence_len);
+
+    // NB: Controls what tokens are ready to be pushed into the streamer
+    GenerationHandle handle = std::make_shared<GenerationHandleImpl>(
+        sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters());
+
+    SamplerOutput sampler_output = m_sampler.sample({sequence_group}, logits);
+    stream_generated_tokens(streamer_ptr, handle);
 
     // Outputs: logits, ...
     const auto kStartOutputKVCacheLayers = 1u;
@@ -1061,30 +1095,28 @@ EncodedResults StaticLLMPipeline::generate(
     std::fill(attention_mask_data, attention_mask_data + m_kvcache_desc.num_stored_tokens - 1u, 1u);
     attention_mask_data[m_kvcache_desc.total_size - 1] = 1u;
 
-    const size_t max_tokens = config.get_max_new_tokens(prompt_len);
-    for (int i = 0; i < max_tokens - 1; ++i) {
-        input_ids_data[0] = last_token;
+    while (sequence_group->is_running()) {
+        sequence_group->schedule_tokens(1);
+        const auto running_sequences = sequence_group->get_running_sequences();
+        OPENVINO_ASSERT(running_sequences.size() == 1u);
+
+        input_ids_data[0] = running_sequences.front()->get_generated_ids().back();
         position_ids_data[0] = m_kvcache_desc.num_stored_tokens;
         attention_mask_data[m_kvcache_desc.num_stored_tokens - 1] = 1u;
 
         m_kvcache_request.infer();
         m_kvcache_desc.num_stored_tokens += 1;
 
-        last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0);
-        results.tokens[0].push_back(last_token);
-
         raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
         raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
-        if (streamer_ptr && streamer_ptr->put(last_token)) {
-            break;
-        }
 
-        if (last_token == config.eos_token_id && !config.ignore_eos) {
-            break;
-        }
+        SamplerOutput sampler_output = m_sampler.sample(
+            {sequence_group}, m_kvcache_request.get_tensor("logits"));
+        stream_generated_tokens(streamer_ptr, handle);
 
         // NB: KV-cache is full, further generation is impossible
         if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) {
+            sequence_group->set_out_of_memory();
             break;
         }
 
@@ -1108,6 +1140,12 @@ EncodedResults StaticLLMPipeline::generate(
         streamer_ptr->end();
     }
 
+    OPENVINO_ASSERT(sequence_group->get_finished_sequences().size() == 1u);
+    auto sequence = sequence_group->get_finished_sequences().front();
+    results.tokens[0] = sequence->get_generated_ids();
+    results.scores[0] = sequence->get_cumulative_log_prob();
+    m_sampler.clear_request_info(sequence_group->get_request_id());
+
     auto stop_time = std::chrono::steady_clock::now();
     // If is called without tokenization then that stat will not be reported.
     auto& metrics = results.perf_metrics;
diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp
index 7acc28c684..8dc7ef49a1 100644
--- a/src/cpp/src/llm_pipeline_static.hpp
+++ b/src/cpp/src/llm_pipeline_static.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
@@ -6,6 +6,7 @@
 #include <filesystem>
 
 #include "llm_pipeline_base.hpp"
+#include "sampler.hpp"
 
 namespace ov {
 namespace genai {
@@ -77,6 +78,8 @@ class StaticLLMPipeline final : public LLMPipelineImplBase {
         bool v_tensors_transposed;
     };
 
+    Sampler m_sampler;
+
     KVCacheDesc m_kvcache_desc;
     ov::InferRequest m_kvcache_request;
     ov::InferRequest m_prefill_request;
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
index b2e8add403..54850f657b 100644
--- a/src/cpp/src/sampler.cpp
+++ b/src/cpp/src/sampler.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2023-2024 Intel Corporation
+// Copyright (C) 2023-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
 #include "sampler.hpp"
@@ -743,7 +743,7 @@ process_stop_strings(const std::set<std::string>& stop_strings, Tokenizer& token
     return result;
 }
 
-SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
+SamplerOutput Sampler::sample(const std::vector<SequenceGroup::Ptr> & sequence_groups,
                               ov::Tensor logits,
                               bool is_validation_mode_enabled) {
     const float * logits_data = logits.data<float>();
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index 981e11560f..7796f93d1e 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -1,5 +1,5 @@
 
-// Copyright (C) 2023-2024 Intel Corporation
+// Copyright (C) 2023-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
@@ -67,7 +67,7 @@ class Sampler {
     Sampler() = default;
     Sampler(Tokenizer & tokenizer) : m_tokenizer(tokenizer) {};
 
-    SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false);
+    SamplerOutput sample(const std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false);
     void set_seed(size_t new_seed) {
         rng_engine.seed(new_seed);
         seed = new_seed;
diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp
index 14ce87c6f1..b6bcc83530 100644
--- a/src/cpp/src/sequence_group.hpp
+++ b/src/cpp/src/sequence_group.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2023-2024 Intel Corporation
+// Copyright (C) 2023-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
@@ -292,8 +292,8 @@ class SequenceGroup  : public std::enable_shared_from_this<SequenceGroup> {
     }
 
     size_t num_finished_seqs() const {
-        return std::count_if(m_sequences.begin(), m_sequences.end(), [] (Sequence::CPtr seq) {
-            return seq->has_finished();
+        return std::count_if(m_sequences.begin(), m_sequences.end(), [this] (Sequence::CPtr seq) {
+            return seq->has_finished() || seq->out_of_memory() || handle_dropped();
         });
     }
 
diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py
index 6ef6162043..d2d3673356 100644
--- a/tests/python_tests/test_llm_pipeline_static.py
+++ b/tests/python_tests/test_llm_pipeline_static.py
@@ -1,7 +1,9 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2024-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 import openvino_genai as ov_genai
+from openvino_genai import GenerationConfig
+
 import pytest
 import platform
 import sys
@@ -12,9 +14,19 @@
 )
 from common import get_default_properties
 
+from common import                                      \
+    get_greedy,                                         \
+    get_greedy_with_penalties,                          \
+    get_multinomial_temperature,                        \
+    get_multinomial_all_parameters,                     \
+    get_multinomial_temperature_and_presence_penalty,   \
+    get_beam_search
+
+
 if sys.platform == 'darwin' or platform.machine() in ["aarch64", "arm64", "ARM64"]:
     pytest.skip("NPU plugin is available only on Linux and Windows x86_64", allow_module_level=True)
 
+
 # This test suite is designed specifically to validate the functionality and robustness of the StaticLLMPipeline on NPUW:CPU.
 common_config = {
                       'NPU_USE_NPUW': 'YES',
@@ -33,17 +45,22 @@ def generate_chat_history(model_path, device, pipeline_config, questions):
     return chat_history
 
 
+generation_configs = [
+    get_greedy(),
+    get_greedy_with_penalties()
+]
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_generation_compare_with_stateful():
-    prompt = 'The Sun is yellow because'
+@pytest.mark.parametrize("generation_config", generation_configs)
+def test_generation_compare_with_stateful(generation_config):
+    prompt = 'What is OpenVINO?'
     model_path = read_model(get_models_list()[0])[1]
 
-    stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **get_default_properties())
-    ref_out = stateful_pipe.generate(prompt, max_new_tokens=100)
+    stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU")
+    ref_out = stateful_pipe.generate(prompt, generation_config)
 
     static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config)
-    actual_out = static_pipe.generate(prompt, max_new_tokens=100)
+    actual_out = static_pipe.generate(prompt, generation_config)
 
     if ref_out != actual_out:
         print(f'ref_out: {ref_out}\n')
@@ -51,6 +68,25 @@ def test_generation_compare_with_stateful():
     assert ref_out == actual_out
 
 
+generation_configs = [
+    get_multinomial_temperature_and_presence_penalty()
+]
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("generation_config", generation_configs)
+def test_multinomial_sampling(generation_config):
+    # Multinomial sampling is highly sensitive to raw logits values. For fair comparison,
+    # a reference implementation producing identical logits (e.g., from StaticLLMPipeline)
+    # would be necessary. However, the CPU in StatefulPipeline and StaticLLMPipeline may apply
+    # different optimizations due to differences in provided topologies, leading to slight
+    # variations in raw logits. Therefore, there is no reliable reference for validation,
+    # so only ensure that no exceptions are raised.
+    prompt = 'What is OpenVINO?'
+    model_path = read_model(get_models_list()[0])[1]
+    static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config)
+    actual_out = static_pipe.generate(prompt, generation_config)
+
+
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_length_properties_set_no_exception():
@@ -100,23 +136,25 @@ def test_batch_raise_error():
 
 # TODO: For the further sampling support
 generation_configs = [
-    dict(num_beams=3),
-    dict(do_sample=True)
+    get_beam_search(),
+    # NB: Only num_return_sequences=1 is supported!
+    get_multinomial_all_parameters()
 ]
 @pytest.mark.parametrize("generation_config", generation_configs)
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_unsupported_sampling_raise_error(generation_config):
     model_path = read_model(get_models_list()[0])[1]
-    prompt = 'The Sun is yellow because'
+    prompt = 'What is OpenVINO?'
+
     pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config)
     with pytest.raises(RuntimeError):
-        pipe.generate(prompt, **generation_config)
+        pipe.generate(prompt, generation_config)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_max_number_of_tokens():
+def test_terminate_by_max_number_of_tokens():
     model_path = read_model(get_models_list()[0])[1]
     prompt = 'The Sun is yellow because'
     num_tokens = 128
@@ -129,6 +167,47 @@ def test_max_number_of_tokens():
     assert len(encoded_results.tokens[0]) == num_tokens
 
 
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_terminate_by_out_of_memory():
+    model_path = read_model(get_models_list()[0])[1]
+    prompt = 'The Sun is yellow because'
+    pipeline_config = { "MAX_PROMPT_LEN": 64, "MIN_RESPONSE_LEN": 64 }
+    pipeline_config |= common_config
+    kv_cache_size = pipeline_config['MAX_PROMPT_LEN'] + pipeline_config['MIN_RESPONSE_LEN']
+
+    tokenizer = ov_genai.Tokenizer(model_path)
+    tokenized_input = tokenizer.encode(prompt)
+    input_len = tokenized_input.input_ids.get_shape()[1]
+
+    pipe = ov_genai.LLMPipeline(model_path, "NPU", **pipeline_config)
+    encoded_results = pipe.generate(tokenized_input, max_new_tokens=1000, ignore_eos=True)
+
+    assert len(encoded_results.tokens[0]) == (kv_cache_size - input_len + 1)
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_terminate_by_sampler():
+    model_path = read_model(get_models_list()[0])[1]
+    prompt = 'The Sun is yellow because'
+
+    current_iter = 0
+    num_iters = 10
+    def callback(subword):
+        nonlocal current_iter
+        current_iter += 1
+        return current_iter == num_iters
+
+    tokenizer = ov_genai.Tokenizer(model_path)
+    tokenized_input = tokenizer.encode(prompt)
+
+    pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config)
+    encoded_results = pipe.generate(tokenized_input, max_new_tokens=1000, ignore_eos=True, streamer=callback)
+
+    assert len(encoded_results.tokens[0]) == num_iters
+
+
 # FIXME: Known problem, output differs from stateful pipeline starting from 3rd prompt!
 @pytest.mark.skip(reason="JIRA-144780: Output differs from stateful pipeline")
 @pytest.mark.precommit
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 4822b228ca..c768d427e7 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -198,7 +198,6 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
 
 def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index,
                               streamer, tokens_len, streaming, model_precision, proc_id, mem_consumption):
-    set_seed(args['seed'])
     input_text_list = [input_text] * args['batch_size']
     if args["output_dir"] is not None and num == 0:
         for bs_index, in_text in enumerate(input_text_list):
@@ -226,6 +225,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
         log.info(out_str)
     gen_config = model.get_generation_config()
     gen_config.max_new_tokens = max_gen_tokens
+    gen_config.rng_seed = args["seed"]
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
     if args.get('draft_model', ''):
@@ -353,7 +353,6 @@ def token_printer():
 
 def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, args, iter_data_list, md5_list,
                                           prompt_index, streamer, tokens_len, streaming, model_precision, proc_id, mem_consumption):
-    set_seed(args['seed'])
     input_text_list = [input_text] * args['batch_size']
     if args["output_dir"] is not None and num == 0:
         for bs_index, in_text in enumerate(input_text_list):
@@ -379,6 +378,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
     streamer.reset()
     gen_config = model.get_generation_config()
+    gen_config.rng_seed = args["seed"]
     gen_config.max_new_tokens = max_gen_tokens
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False

From b4d0d3c3220e5cf0547e114b2b3da009b5f2e057 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 4 Jan 2025 21:14:48 +0400
Subject: [PATCH 081/110] Bump pillow from 11.0.0 to 11.1.0 (#1463)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps [pillow](https://github.com/python-pillow/Pillow) from 11.0.0 to
11.1.0.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/python-pillow/Pillow/releases">pillow's
releases</a>.</em></p>
<blockquote>
<h2>11.1.0</h2>
<p><a
href="https://pillow.readthedocs.io/en/stable/releasenotes/11.1.0.html">https://pillow.readthedocs.io/en/stable/releasenotes/11.1.0.html</a></p>
<h2>Documentation</h2>
<ul>
<li>Added release notes for writing XMP bytes to JPEG and MPO <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8627">#8627</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Added release notes for using zlib-ng instead of zlib <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8599">#8599</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Replace python-pillow.org with python-pillow.github.io <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8586">#8586</a>
[<a href="https://github.com/hugovk"><code>@​hugovk</code></a>]</li>
<li>ImageFile tile is never None <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8582">#8582</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Only use start year in copyright, remove end years <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8577">#8577</a>
[<a href="https://github.com/hugovk"><code>@​hugovk</code></a>]</li>
<li>Python 3.12 is tested on MinGW <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8575">#8575</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Use brew formula to install libraqm <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8574">#8574</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Added link to GitHub releases in CHANGES <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8571">#8571</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Release drafter: move removals, deprecations, documentation up, and
uncategorised changes last <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8570">#8570</a>
[<a href="https://github.com/hugovk"><code>@​hugovk</code></a>]</li>
<li>Updated macOS tested Pillow versions <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8538">#8538</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Use test image filename <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8534">#8534</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Update Windows 11 Arm64 tested versions <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8523">#8523</a>
[<a href="https://github.com/nulano"><code>@​nulano</code></a>]</li>
<li>Move MPO into &quot;Fully supported formats&quot; <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8504">#8504</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Update license to MIT-CMU <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8490">#8490</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
</ul>
<h2>Dependencies</h2>
<ul>
<li>Update dependency mypy to v1.14.1 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8643">#8643</a>
[<a href="https://github.com/renovate"><code>@​renovate</code></a>]</li>
<li>Update dependency mypy to v1.14.0 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8613">#8613</a>
[<a href="https://github.com/renovate"><code>@​renovate</code></a>]</li>
<li>Updated libwebp to 1.5.0 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8612">#8612</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Updated libXau to 1.0.12 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8598">#8598</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Updated libjpeg-turbo to 3.1.0 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8595">#8595</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Updated harfbuzz to 10.1.0 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8533">#8533</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Updated openjpeg to 2.5.3 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8591">#8591</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Update dependency cibuildwheel to v2.22.0 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8580">#8580</a>
[<a href="https://github.com/renovate"><code>@​renovate</code></a>]</li>
<li>Update codecov/codecov-action action to v5 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8557">#8557</a>
[<a href="https://github.com/renovate"><code>@​renovate</code></a>]</li>
<li>Migrate renovate config <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8527">#8527</a>
[<a href="https://github.com/renovate"><code>@​renovate</code></a>]</li>
<li>Update dependency mypy to v1.13.0 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8491">#8491</a>
[<a href="https://github.com/renovate"><code>@​renovate</code></a>]</li>
<li>Update dependency mypy to v1.12.1 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8487">#8487</a>
[<a href="https://github.com/renovate"><code>@​renovate</code></a>]</li>
</ul>
<h2>Testing</h2>
<ul>
<li>Added CentOS Stream 10 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8646">#8646</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Use monkeypatch <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8628">#8628</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Pass file handle to ContainerIO <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8625">#8625</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Use register_handler <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8499">#8499</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Use monkeypatch <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8626">#8626</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Test libjpeg-turbo on macOS <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8596">#8596</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Test 3.13t (free-threaded) from Quansight-Labs/setup-python on Linux
and macOS <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8565">#8565</a>
[<a href="https://github.com/hugovk"><code>@​hugovk</code></a>]</li>
<li>Run gcc problem matcher on Python 3.13 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8541">#8541</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Add trove-classifiers&gt;=2024.10.12 to 'tests' extra and use for
Windows CI <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8514">#8514</a>
[<a href="https://github.com/hugovk"><code>@​hugovk</code></a>]</li>
<li>Apply security fixes to GitHub Actions <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8526">#8526</a>
[<a href="https://github.com/hugovk"><code>@​hugovk</code></a>]</li>
<li>Remove unused <code>gcov: true</code> for codecov-action@v4 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8521">#8521</a>
[<a href="https://github.com/hugovk"><code>@​hugovk</code></a>]</li>
<li>Added Fedora 41 <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8520">#8520</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
<li>Do not repeatedly save to the same path <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8512">#8512</a>
[<a
href="https://github.com/radarhere"><code>@​radarhere</code></a>]</li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a
href="https://github.com/python-pillow/Pillow/blob/main/CHANGES.rst">pillow's
changelog</a>.</em></p>
<blockquote>
<h2>11.1.0 and newer</h2>
<p>See GitHub Releases:</p>
<ul>
<li><a
href="https://github.com/python-pillow/Pillow/releases">https://github.com/python-pillow/Pillow/releases</a></li>
</ul>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/python-pillow/Pillow/commit/4c1aed801e43c6b307e7135279ca1dbc02bbf052"><code>4c1aed8</code></a>
11.1.0 version bump</li>
<li><a
href="https://github.com/python-pillow/Pillow/commit/dfb368a7e904a7ed92012cf89a4e4804d7f410c8"><code>dfb368a</code></a>
Merge pull request <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8651">#8651</a>
from radarhere/blp</li>
<li><a
href="https://github.com/python-pillow/Pillow/commit/5d998d3fedb06666ae680e3ebe3f3547a9059727"><code>5d998d3</code></a>
Improved coverage</li>
<li><a
href="https://github.com/python-pillow/Pillow/commit/6b75e0687513fcdca8683699b5ce880a935ad8ce"><code>6b75e06</code></a>
Do not reread start of header in decoder</li>
<li><a
href="https://github.com/python-pillow/Pillow/commit/b89cc09944b4add584967bf1fa21208e92442def"><code>b89cc09</code></a>
Corrected BLP1 alpha depth handling</li>
<li><a
href="https://github.com/python-pillow/Pillow/commit/aa0f4127b8040b82822808fa2037fea1a162d3c5"><code>aa0f412</code></a>
Merge pull request <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8646">#8646</a>
from radarhere/centos</li>
<li><a
href="https://github.com/python-pillow/Pillow/commit/e34427167ddbaeece43490c4054c1e17fa21d77b"><code>e344271</code></a>
Added CentOS Stream 10</li>
<li><a
href="https://github.com/python-pillow/Pillow/commit/17f09f39936ddc60bb8348e74359b5c5b3de25b9"><code>17f09f3</code></a>
Merge pull request <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8644">#8644</a>
from radarhere/c99</li>
<li><a
href="https://github.com/python-pillow/Pillow/commit/d42f22baafca30050f4fc8b6bafcc39ef624d685"><code>d42f22b</code></a>
Added release notes</li>
<li><a
href="https://github.com/python-pillow/Pillow/commit/c7026d9bc84dc80e2fb3bbde901f77a40ef372b9"><code>c7026d9</code></a>
Merge pull request <a
href="https://redirect.github.com/python-pillow/Pillow/issues/8642">#8642</a>
from radarhere/bigtiff</li>
<li>Additional commits viewable in <a
href="https://github.com/python-pillow/Pillow/compare/11.0.0...11.1.0">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=pillow&package-manager=pip&previous-version=11.0.0&new-version=11.1.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 samples/deployment-requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/deployment-requirements.txt b/samples/deployment-requirements.txt
index 428e0955a5..c6ad9eaaa8 100644
--- a/samples/deployment-requirements.txt
+++ b/samples/deployment-requirements.txt
@@ -2,4 +2,4 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino_genai~=2025.0.0.0.dev
 librosa==0.10.2.post1  # For Whisper
-pillow==11.0.0  # Image processing for VLMs
+pillow==11.1.0  # Image processing for VLMs

From cb6b68e25027de0d39116de998391e436681ec1e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Sat, 4 Jan 2025 23:08:51 +0400
Subject: [PATCH 082/110] [Python] Use texts instead of cast operator (#1477)

---
 src/python/py_vlm_pipeline.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp
index b0cfa0a42a..570018f341 100644
--- a/src/python/py_vlm_pipeline.cpp
+++ b/src/python/py_vlm_pipeline.cpp
@@ -114,11 +114,11 @@ void init_vlm_pipeline(py::module_& m) {
 
     py::class_<ov::genai::VLMDecodedResults>(m, "VLMDecodedResults", decoded_results_docstring)
         .def(py::init<>())
-        .def_property_readonly("texts", [](const ov::genai::VLMDecodedResults &dr) -> py::typing::List<py::str> { return pyutils::handle_utf8((std::vector<std::string>)dr); })
+        .def_property_readonly("texts", [](const ov::genai::VLMDecodedResults &dr) -> py::typing::List<py::str> { return pyutils::handle_utf8(dr.texts); })
         .def_readonly("scores", &ov::genai::VLMDecodedResults::scores)
         .def_readonly("perf_metrics", &ov::genai::VLMDecodedResults::perf_metrics)
         .def("__str__", [](const ov::genai::VLMDecodedResults &dr) -> py::str {
-            auto valid_utf8_strings = pyutils::handle_utf8((std::vector<std::string>)dr);
+            auto valid_utf8_strings = pyutils::handle_utf8(dr.texts);
             py::str res;
             if (valid_utf8_strings.size() == 1)
                 return valid_utf8_strings[0];

From 9a2771562e49c10ba850a3fd0b46de1b7be280c1 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 6 Jan 2025 12:13:13 +0400
Subject: [PATCH 083/110] [Tests] Fixed HF warning (#1478)

```
test_continuous_batching.py: 7 warnings
test_kv_cache_eviction.py: 2 warnings
test_llm_pipeline.py: 6 warnings
test_sampling.py: 11 warnings
  /venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:774: UserWarning: `return_dict_in_generate` is NOT set to `True`, but `output_scores` is. When `return_dict_in_generate` is not `True`, `output_scores` is ignored.
    warnings.warn(
```
---
 tests/python_tests/common.py            |  8 ++++----
 tests/python_tests/test_llm_pipeline.py | 12 +++++++-----
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index dc58d1ad2f..bb34c1dcd4 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -179,6 +179,8 @@ def convert_to_hf(
         return
 
     kwargs = {}
+    kwargs['return_dict_in_generate'] = True
+
     # generic parameters
     kwargs['max_length'] = generation_config.max_length
     # has higher priority than 'max_length'
@@ -253,8 +255,7 @@ def run_hugging_face(
             input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
             prompt_len = 0 if generation_config.echo else input_ids.numel()
 
-            generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config,
-                                                  return_dict_in_generate=True, tokenizer=hf_tokenizer)
+            generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
             all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
 
             generation_result = GenerationResult()
@@ -268,8 +269,7 @@ def run_hugging_face(
         inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left')
         input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
         hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs)
-        hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config,
-                                                return_dict_in_generate=True, tokenizer=hf_tokenizer)
+        hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
 
         generation_ids = []
         scores = []
diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py
index 986b342c59..5278f4424f 100644
--- a/tests/python_tests/test_llm_pipeline.py
+++ b/tests/python_tests/test_llm_pipeline.py
@@ -53,6 +53,7 @@ def test_encoded_inputs(model_descr, inputs):
     hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config)
 
     input_ids, attention_mask = inputs
+    prompt_len = input_ids.shape[1]
 
     if attention_mask is not None:
         inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask))
@@ -61,10 +62,10 @@ def test_encoded_inputs(model_descr, inputs):
         inputs_hf = dict(inputs=torch.tensor(input_ids))
         inputs_ov = ov.Tensor(input_ids)
 
-    hf_output = opt_model.generate(**inputs_hf, generation_config=hf_generation_config)
+    hf_output = opt_model.generate(**inputs_hf, generation_config=hf_generation_config).sequences[0]
     ov_output = ov_pipe.generate(inputs_ov, ov_generation_config)
 
-    hf_res = hf_output[0, input_ids.shape[1]:].numpy()
+    hf_res = hf_output[prompt_len:].numpy()
     ov_res = np.array(ov_output.tokens, dtype=np.int64)
     assert np.all(ov_res == hf_res)
 
@@ -132,9 +133,10 @@ def test_chat_scenario(model_descr, generation_config_kwargs: Dict):
 
         chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
         tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
+        prompt_len = tokenized['input_ids'].numel()
 
-        answer = opt_model.generate(**tokenized, generation_config=hf_generation_config)
-        answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
+        answer = opt_model.generate(**tokenized, generation_config=hf_generation_config).sequences[0]
+        answer_str = tokenizer.decode(answer[prompt_len:], skip_special_tokens=True)
         chat_history_hf.append({'role': 'assistant', 'content': answer_str})
 
         answer_ov = ov_pipe.generate(prompt, generation_config=ov_generation_config)
@@ -379,7 +381,7 @@ def test_perf_metrics(generation_config, prompt):
 
     # Check that load time is adequate.
     load_time = perf_metrics.get_load_time()
-    assert load_time > 0 and load_time < 2000.0
+    assert load_time > 0 and load_time < total_time
 
     # Check that num input and generated tokens are adequate.
     num_generated_tokens = perf_metrics.get_num_generated_tokens()

From db71b362f1ed9747b34d054230461821cb56cf84 Mon Sep 17 00:00:00 2001
From: "Anastasiya(Asya) Pronina" <anastasiya.pronina@intel.com>
Date: Mon, 6 Jan 2025 10:49:33 +0100
Subject: [PATCH 084/110] Static llm pipeline: stateful model (#1240)

Related PRs:
- OpenVINO: *https://github.com/openvinotoolkit/openvino/pull/27651*
- OpenVINO Unroll SPDA:
*https://github.com/openvinotoolkit/openvino/pull/27891*
- OpenVINO Prefill/Generate sections:
*https://github.com/openvinotoolkit/openvino/pull/28154*
- OpenVINO LLMCompiledModel refactoring:
https://github.com/openvinotoolkit/openvino/pull/28267

---------

Co-authored-by: Anatoliy Talamanov <anatoliy.talamanov@intel.com>
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 src/cpp/src/llm_pipeline.cpp        |  12 +-
 src/cpp/src/llm_pipeline_static.cpp | 340 ++++++++++++++++++++++++++--
 src/cpp/src/llm_pipeline_static.hpp |  73 +++++-
 3 files changed, 402 insertions(+), 23 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 0125479f92..11efed8b32 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -20,13 +20,13 @@ namespace {
 
 /* 
 * NPU reads some properties from the config file, but when LLMPipeline is initialized
-* from the model_str and weights_tensor, there are not files. 
+* from the model_str and weights_tensor, there are no files.
 * In the later case ModelDesc is stored in properties.
 * This function pops ModelDescr from the the properties and returns a pair of updated properties and ModelDescr.
 */
-std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
+std::pair<ov::AnyMap, ov::genai::static_llm::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
     ov::AnyMap main_properties = properties;
-    ov::genai::ModelConfigDesc model_descr;
+    ov::genai::static_llm::ModelConfigDesc model_descr;
 
     auto pop_property = [](ov::AnyMap& orig_propertis, const std::string& key, auto& value) {
         if (orig_propertis.find(key) != orig_propertis.end()) {
@@ -105,7 +105,7 @@ ov::genai::LLMPipeline::LLMPipeline(
         auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
     } else if (device == "NPU") {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
+        m_pimpl = static_llm::LLMPipelineFactory::create(models_path, tokenizer, device, properties);
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
@@ -124,7 +124,7 @@ ov::genai::LLMPipeline::LLMPipeline(
         auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
     } else if (device == "NPU") {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, properties);
+        m_pimpl = static_llm::LLMPipelineFactory::create(models_path, device, properties);
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
     }
@@ -162,7 +162,7 @@ ov::genai::LLMPipeline::LLMPipeline(
         // This will convert from AnyMap to ModelDesc.
         auto [filtered_properties, model_descr] = split_model_descr(properties);
 
-        m_pimpl = std::make_unique<StaticLLMPipeline>(
+        m_pimpl = static_llm::LLMPipelineFactory::create(
             utils::singleton_core().read_model(model_str, weights_tensor), 
             model_descr,
             tokenizer,
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index de1038a716..94aa6e19fe 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -398,12 +398,12 @@ KVAxesPosition get_kv_axes(const std::string& model_type) {
     return axes;
 }
 
-ov::genai::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) {
+ov::genai::static_llm::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) {
     std::ifstream file(filepath);
     OPENVINO_ASSERT(file.is_open(), "Could not open file: ", filepath);
     nlohmann::json config_data = nlohmann::json::parse(file);
 
-    ov::genai::ModelConfigDesc desc;
+    ov::genai::static_llm::ModelConfigDesc desc;
     desc.type = config_data["model_type"].get<std::string>();
     // NB: In case _name_or_path field isn't presented in config.json
     if (config_data.contains("_name_or_path")) {
@@ -588,6 +588,19 @@ std::optional<uint32_t> pop_int_and_cast(ov::AnyMap& config, const std::string&
     return std::nullopt;
 }
 
+void update_config(ov::AnyMap& config, const std::pair<std::string, ov::Any>& pair) {
+    if (config.count(pair.first) == 0) {
+        config.insert(pair);
+    }
+}
+
+void rename_key(ov::AnyMap& config, const std::string& old_key, const std::string& new_key) {
+    if (config.count(old_key) != 0) {
+        auto opt_value = pop_option(config, old_key);
+        config[new_key] = opt_value.value();
+    }
+}
+
 ov::Tensor make_tensor_slice(ov::Tensor tensor, size_t dim, size_t start_pos, size_t end_pos) {
     ov::Shape start_shape(std::vector<size_t>(tensor.get_shape().size(), 0u));
     start_shape[dim] = start_pos;
@@ -647,12 +660,269 @@ void stream_generated_tokens(std::shared_ptr<ov::genai::StreamerBase> streamer_p
     }
 }
 
+enum StaticPipelineKind {
+    STATEFUL,
+    STATELESS
+};
+StaticPipelineKind str_to_pipeline(const std::string& str) {
+    if (str == "STATEFUL") {
+        return StaticPipelineKind::STATEFUL;
+    }
+    if (str == "STATELESS") {
+        return StaticPipelineKind::STATELESS;
+    }
+    OPENVINO_THROW("Unsupported \"PIPELINE\" provided: ",
+                   str, ". Please select either \"STATEFUL\" or \"STATELESS\".");
+}
 } // anonymous namespace
 
 namespace ov {
 namespace genai {
+namespace static_llm {
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+    const std::filesystem::path& models_path,
+    const ov::genai::Tokenizer& tokenizer,
+    const std::string&,
+    const ov::AnyMap& config
+) : LLMPipelineImplBase(tokenizer,
+                        utils::from_config_json_if_exists(models_path)) {
+
+    auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
+    ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
+    ov::AnyMap properties = config;
+
+    auto compiled = setupAndCompileModel(model, model_desc, properties);
+    m_request = compiled->create_infer_request();
+}
+
+
+StatefulLLMPipeline::StatefulLLMPipeline(
+    const std::shared_ptr<ov::Model>& model,
+    const ModelConfigDesc& model_desc,
+    const ov::genai::Tokenizer& tokenizer,
+    const std::string&,
+    const ov::AnyMap& properties,
+    const ov::genai::GenerationConfig& generation_config
+) : LLMPipelineImplBase(tokenizer, generation_config) {
+    ov::AnyMap properties_copy = properties;
+    auto compiled = setupAndCompileModel(model, model_desc, properties_copy);
+    m_request = compiled->create_infer_request();
+}
+
+std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
+    const std::shared_ptr<ov::Model>& model,
+    const ModelConfigDesc& model_desc,
+    ov::AnyMap& pipeline_config) {
+
+    const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
+    const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
+    m_kvcache_total = kMaxPromptLen + kMinResponseLen;
+    std::string generate_hint = pop_or_default<std::string>(pipeline_config, "GENERATE_HINT", "FAST_COMPILE");
+
+    update_config(pipeline_config, {"NPU_USE_NPUW", "YES"});
+    update_config(pipeline_config, {"NPUW_LLM", "YES"});
+
+    KVAxesPosition axes = get_kv_axes(model_desc.type);
+    update_config(pipeline_config, {"NPUW_LLM_BATCH_DIM", axes.batch});
+    update_config(pipeline_config, {"NPUW_LLM_SEQ_LEN_DIM", axes.seq_len});
+
+    update_config(pipeline_config, {"NPUW_LLM_MAX_PROMPT_LEN", kMaxPromptLen});
+    update_config(pipeline_config, {"NPUW_LLM_MIN_RESPONSE_LEN", kMinResponseLen});
+    update_config(pipeline_config, {"NPUW_LLM_GENERATE_HINT", generate_hint});
+
+    // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
+    if ( model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
+        (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) {
+            update_config(pipeline_config, {"NPUW_LLM_OPTIMIZE_V_TENSORS", true});
+    }
+
+    rename_key(pipeline_config, "PREFILL_CONFIG", "NPUW_LLM_PREFILL_CONFIG");
+    rename_key(pipeline_config, "GENERATE_CONFIG", "NPUW_LLM_GENERATE_CONFIG");
+  
+    return std::make_shared<ov::CompiledModel>(genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config));
+}
+
+DecodedResults StatefulLLMPipeline::generate(
+    StringInputs inputs,
+    OptionalGenerationConfig generation_config,
+    StreamerVariant streamer
+) {
+    auto start_time = std::chrono::steady_clock::now();
+
+    GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
+    std::string prompt;
+    if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
+        OPENVINO_ASSERT(input_vector->size() == 1u, "Currently only batch size=1 is supported");
+        prompt = std::move(input_vector->front());
+    } else {
+        OPENVINO_ASSERT(std::holds_alternative<std::string>(inputs));
+        prompt = std::get<std::string>(inputs);
+    }
+
+    ov::genai::TokenizedInputs tokenized_input;
+    if (m_is_chat_conversation) {
+        m_history.push_back({{"role", "user"}, {"content", prompt}});
+        constexpr bool add_generation_prompt = true;
+        prompt = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+        // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
+        tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
+    } else {
+        tokenized_input = m_tokenizer.encode(prompt);
+    }
+
+    auto encode_stop_time =  std::chrono::steady_clock::now();
+    auto encoded_results = generate(tokenized_input, config, streamer);
+
+    auto decode_start_time =  std::chrono::steady_clock::now();
+    DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
+    auto decode_stop_time =  std::chrono::steady_clock::now();
+
+    if (m_is_chat_conversation) {
+        auto answer = decoded_results.texts[0];
+        m_history.push_back({{"role", "assistant"}, {"content", answer}});
+    }
+
+    // generate_durations
+    decoded_results.perf_metrics = encoded_results.perf_metrics;
+    auto& raw_counters = decoded_results.perf_metrics.raw_metrics;
+    auto stop_time = std::chrono::steady_clock::now();
+    raw_counters.generate_durations = std::vector<MicroSeconds>();
+    raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+    raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time));
+    raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time));
+    decoded_results.perf_metrics.m_evaluated = false;
+    decoded_results.perf_metrics.evaluate_statistics(start_time);
+    return decoded_results;
+}
+
+EncodedResults StatefulLLMPipeline::generate(
+    const EncodedInputs& inputs,
+    OptionalGenerationConfig generation_config,
+    StreamerVariant streamer
+) {
+    auto start_time = std::chrono::steady_clock::now();
+    ov::Tensor input_ids;
+    ov::Tensor attention_mask;
+
+    if (auto data = std::get_if<ov::Tensor>(&inputs)) {
+        input_ids = *data;
+        attention_mask = ov::genai::utils::init_attention_mask(input_ids);
+    } else if (auto data = std::get_if<TokenizedInputs>(&inputs)) {
+        input_ids = data->input_ids;
+        attention_mask = data->attention_mask;
+    }
+
+    OPENVINO_ASSERT(input_ids.get_shape().at(0) == 1u, "Currently only batch size=1 is supported");
+
+    GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
+    // If eos_token_id was not provided, take value from default m_generation_config
+    if (config.eos_token_id == -1)
+        config.set_eos_token_id(m_generation_config.eos_token_id);
+    config.validate();
+
+    std::shared_ptr<StreamerBase> streamer_ptr;
+    if (auto streamer_obj = std::get_if<std::monostate>(&streamer)) {
+        streamer_ptr = nullptr;
+    } else if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&streamer)) {
+        streamer_ptr = *streamer_obj;
+    } else if (auto callback = std::get_if<std::function<bool(std::string)>>(&streamer)) {
+        streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
+    }
+
+    OPENVINO_ASSERT(config.is_greedy_decoding(), "Currently only greedy decoding is supported");
+
+    ov::Shape prompts_shape = input_ids.get_shape();
+    const size_t batch_size = prompts_shape[0];
+    ov::genai::EncodedResults results;
+    auto& raw_perf_counters = results.perf_metrics.raw_metrics;
+    // NB: Only batch=1 is supported now
+    results.scores.resize(1u);
+    results.scores[0] = 0u;
+    results.tokens.resize(1u);
+
+    // TODO: Check if there is enough space in KV-cache to process input prompt
+    auto prompt_len = input_ids.get_size();
+
+    ov::Tensor position_ids{ov::element::i64, input_ids.get_shape()};
+    utils::initialize_position_ids(position_ids, attention_mask);
+
+    m_request.set_tensor("input_ids", input_ids);
+    m_request.set_tensor("attention_mask", attention_mask);
+    m_request.set_tensor("position_ids", position_ids);
+
+    m_request.infer();
+
+    int64_t last_token = utils::argmax(m_request.get_tensor("logits"), 0);
+
+    results.tokens[0].push_back(last_token);
+    if (streamer_ptr && streamer_ptr->put(last_token)) {
+        return results;
+    }
+
+    int64_t input_ids_data = -1;
+    int64_t position_ids_data = prompt_len - 1;
+    std::vector<int64_t> attention_mask_data(prompt_len - 1, 1);
+    m_request.set_tensor("input_ids", ov::Tensor(ov::element::i64, ov::Shape{1,1},  reinterpret_cast<void*>(&input_ids_data)));
+    m_request.set_tensor("position_ids", ov::Tensor(ov::element::i64, ov::Shape{1,1}, reinterpret_cast<void*>(&position_ids_data)));
 
-StaticLLMPipeline::StaticLLMPipeline(
+    const size_t max_tokens = config.get_max_new_tokens(prompt_len);
+    for (int i = 0; i < max_tokens - 1; ++i) {
+        // KV Cache is full, no further generation is possible
+        if (position_ids_data + 1 == m_kvcache_total) {
+            break;
+        }
+
+        // Just change the variables here, as pointers to them are already set to corresponding tensors
+        input_ids_data = last_token;
+        ++position_ids_data;
+        // However, attention_mask changes its shape on each iteration, it should be re-set explicitly
+        attention_mask_data.push_back(1);
+        m_request.set_tensor("attention_mask", ov::Tensor(ov::element::i64, ov::Shape{1,attention_mask_data.size()}, (void*)&attention_mask_data[0]));
+
+        m_request.infer();
+
+        last_token = utils::argmax(m_request.get_tensor("logits"), 0);
+        results.tokens[0].push_back(last_token);
+
+        raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
+        raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
+        if (streamer_ptr && streamer_ptr->put(last_token)) {
+            break;
+        }
+
+        if (last_token == config.eos_token_id && !config.ignore_eos) {
+            break;
+        }
+    }
+
+    if (streamer_ptr) {
+        streamer_ptr->end();
+    }
+
+    auto stop_time = std::chrono::steady_clock::now();
+    // If is called without tokenization then that stat will not be reported.
+    auto& metrics = results.perf_metrics;
+    metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
+    metrics.load_time = this->m_load_time_ms;
+    metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+    metrics.evaluate_statistics(start_time);
+    return results;
+}
+
+void StatefulLLMPipeline::start_chat(const std::string& system_message) {
+    if (!system_message.empty()) {
+        m_history.push_back({{"role", "system"}, {"content", system_message}});
+    }
+    m_is_chat_conversation = true;
+};
+
+void StatefulLLMPipeline::finish_chat() {
+    m_is_chat_conversation = false;
+    m_history.clear();
+};
+
+StatelessLLMPipeline::StatelessLLMPipeline(
     const std::filesystem::path& models_path,
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
@@ -692,14 +962,14 @@ StaticLLMPipeline::StaticLLMPipeline(
     m_sampler.set_seed(m_generation_config.rng_seed);
 };
 
-StaticLLMPipeline::StaticLLMPipeline(
+StatelessLLMPipeline::StatelessLLMPipeline(
     const std::filesystem::path& models_path,
     const std::string& device,
     const ov::AnyMap& properties
-) : StaticLLMPipeline(models_path, Tokenizer(models_path), device, properties) {
+) : StatelessLLMPipeline(models_path, Tokenizer(models_path), device, properties) {
 }
 
-StaticLLMPipeline::StaticLLMPipeline(
+StatelessLLMPipeline::StatelessLLMPipeline(
     const std::shared_ptr<ov::Model>& model,
     const ModelConfigDesc& model_desc,
     const ov::genai::Tokenizer& tokenizer,
@@ -729,7 +999,7 @@ StaticLLMPipeline::StaticLLMPipeline(
     m_sampler.set_seed(m_generation_config.rng_seed);
 }
 
-void StaticLLMPipeline::setupAndCompileModels(
+void StatelessLLMPipeline::setupAndCompileModels(
     const std::shared_ptr<ov::Model>& model,
     const std::string& device,
     const ModelConfigDesc& model_desc,
@@ -808,7 +1078,7 @@ void StaticLLMPipeline::setupAndCompileModels(
     ov::genai::utils::print_compiled_model_properties(prefill_compiled_model, "Static LLM prefill compiled model");
 }
 
-void StaticLLMPipeline::setupAndImportModels(
+void StatelessLLMPipeline::setupAndImportModels(
     const std::filesystem::path& models_path,
     const std::string& device,
     ov::AnyMap& properties) {
@@ -882,19 +1152,19 @@ void StaticLLMPipeline::setupAndImportModels(
     m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, 2u };
 }
 
-void StaticLLMPipeline::start_chat(const std::string& system_message) {
+void StatelessLLMPipeline::start_chat(const std::string& system_message) {
     if (!system_message.empty()) {
         m_history.push_back({{"role", "system"}, {"content", system_message}});
     }
     m_is_chat_conversation = true;
 };
 
-void StaticLLMPipeline::finish_chat() {
+void StatelessLLMPipeline::finish_chat() {
     m_is_chat_conversation = false;
     m_history.clear();
 };
 
-void StaticLLMPipeline::prepare_for_new_conversation() {
+void StatelessLLMPipeline::prepare_for_new_conversation() {
     fill_tensor<int64_t>(m_prefill_request.get_tensor("input_ids"), m_tokenizer.get_pad_token_id());
     fill_tensor<int64_t>(m_prefill_request.get_tensor("position_ids"), 0u);
     fill_tensor<int64_t>(m_prefill_request.get_tensor("attention_mask"), 0u);
@@ -902,7 +1172,7 @@ void StaticLLMPipeline::prepare_for_new_conversation() {
     m_kvcache_desc.num_stored_tokens = 0u;
 }
 
-DecodedResults StaticLLMPipeline::generate(
+DecodedResults StatelessLLMPipeline::generate(
     StringInputs inputs,
     OptionalGenerationConfig generation_config,
     StreamerVariant streamer
@@ -957,7 +1227,7 @@ DecodedResults StaticLLMPipeline::generate(
     return decoded_results;
 }
 
-EncodedResults StaticLLMPipeline::generate(
+EncodedResults StatelessLLMPipeline::generate(
     const EncodedInputs& inputs,
     OptionalGenerationConfig generation_config,
     StreamerVariant streamer
@@ -1156,5 +1426,49 @@ EncodedResults StaticLLMPipeline::generate(
     return results;
 }
 
+std::unique_ptr<LLMPipelineImplBase>
+LLMPipelineFactory::create(const std::filesystem::path& models_path,
+                           const ov::genai::Tokenizer& tokenizer,
+                           const std::string& device,
+                           const ov::AnyMap& config) {
+    auto properties = config;
+    const auto pipeline_mode = str_to_pipeline(pop_or_default(properties, "STATIC_PIPELINE", std::string("STATELESS")));
+    if (pipeline_mode == StaticPipelineKind::STATEFUL) {
+        return std::make_unique<ov::genai::static_llm::StatefulLLMPipeline>(models_path, tokenizer, device, properties);
+    }
+    return std::make_unique<ov::genai::static_llm::StatelessLLMPipeline>(models_path, tokenizer, device, properties);
+}
+
+std::unique_ptr<LLMPipelineImplBase>
+LLMPipelineFactory::create(const std::filesystem::path& models_path,
+                                 const std::string& device,
+                                 const ov::AnyMap& config) {
+    return create(models_path, Tokenizer(models_path), device, config);
+}
+
+std::unique_ptr<LLMPipelineImplBase> LLMPipelineFactory::create(const std::shared_ptr<ov::Model>& model,
+                                                                const ModelConfigDesc& model_desc,
+                                                                const ov::genai::Tokenizer& tokenizer,
+                                                                const std::string& device,
+                                                                const ov::AnyMap& properties,
+                                                                const ov::genai::GenerationConfig& generation_config) {
+    auto properties_copy = properties;
+    const auto pipeline_mode = str_to_pipeline(pop_or_default(properties_copy, "STATIC_PIPELINE", std::string("STATELESS")));
+    if (pipeline_mode == StaticPipelineKind::STATEFUL) {
+        return std::make_unique<ov::genai::static_llm::StatefulLLMPipeline>(model,
+                                                                            model_desc,
+                                                                            tokenizer,
+                                                                            device,
+                                                                            properties_copy,
+                                                                            generation_config);
+    }
+    return std::make_unique<ov::genai::static_llm::StatelessLLMPipeline>(model,
+                                                                         model_desc,
+                                                                         tokenizer,
+                                                                         device,
+                                                                         properties_copy,
+                                                                         generation_config);
+}
+}  // namespace static_llm
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp
index 8dc7ef49a1..dd51c31b29 100644
--- a/src/cpp/src/llm_pipeline_static.hpp
+++ b/src/cpp/src/llm_pipeline_static.hpp
@@ -10,6 +10,7 @@
 
 namespace ov {
 namespace genai {
+namespace static_llm {
 
 struct ModelConfigDesc {
     std::string type;
@@ -17,16 +18,34 @@ struct ModelConfigDesc {
     int num_key_value_heads;
 };
 
-class StaticLLMPipeline final : public LLMPipelineImplBase {
+struct LLMPipelineFactory {
+    static std::unique_ptr<LLMPipelineImplBase> create(const std::filesystem::path& path,
+                                                       const ov::genai::Tokenizer& tokenizer,
+                                                       const std::string& device,
+                                                       const ov::AnyMap& config);
+
+    static std::unique_ptr<LLMPipelineImplBase> create(const std::filesystem::path& path,
+                                                       const std::string& device,
+                                                       const ov::AnyMap& config);
+
+    static std::unique_ptr<LLMPipelineImplBase> create(const std::shared_ptr<ov::Model>& model,
+                                                       const ModelConfigDesc& model_desc,
+                                                       const ov::genai::Tokenizer& tokenizer,
+                                                       const std::string& device,
+                                                       const ov::AnyMap& properties,
+                                                       const ov::genai::GenerationConfig& generation_config = {});
+};
+
+class StatefulLLMPipeline : public LLMPipelineImplBase {
 public:
-    StaticLLMPipeline(
+    StatefulLLMPipeline(
         const std::filesystem::path& path,
         const ov::genai::Tokenizer& tokenizer,
         const std::string& device,
         const ov::AnyMap& config
     );
 
-    StaticLLMPipeline(
+    StatefulLLMPipeline(
         const std::shared_ptr<ov::Model>& model,
         const ModelConfigDesc& model_desc,
         const ov::genai::Tokenizer& tokenizer,
@@ -35,12 +54,57 @@ class StaticLLMPipeline final : public LLMPipelineImplBase {
         const ov::genai::GenerationConfig& generation_config = {}
     );
 
-    StaticLLMPipeline(
+    std::shared_ptr<ov::CompiledModel> setupAndCompileModel(
+        const std::shared_ptr<ov::Model>& model,
+        const ModelConfigDesc& model_desc,
+        ov::AnyMap& pipeline_config);
+
+    DecodedResults generate(
+        StringInputs inputs,
+        OptionalGenerationConfig generation_config,
+        StreamerVariant streamer
+    ) override;
+
+    EncodedResults generate(
+        const EncodedInputs& inputs,
+        OptionalGenerationConfig generation_config,
+        StreamerVariant streamer
+    ) override;
+
+    void start_chat(const std::string& system_message) override;
+    void finish_chat() override;
+
+private:
+    uint32_t m_kvcache_total = 0u;
+    ov::InferRequest m_request;
+    bool m_is_chat_conversation = false;
+    ChatHistory m_history;
+};
+
+class StatelessLLMPipeline final : public LLMPipelineImplBase {
+public:
+    StatelessLLMPipeline(
+        const std::filesystem::path& path,
+        const ov::genai::Tokenizer& tokenizer,
+        const std::string& device,
+        const ov::AnyMap& config
+    );
+
+    StatelessLLMPipeline(
         const std::filesystem::path& path,
         const std::string& device,
         const ov::AnyMap& config
     );
 
+    StatelessLLMPipeline(
+        const std::shared_ptr<ov::Model>& model,
+        const ModelConfigDesc& model_desc,
+        const ov::genai::Tokenizer& tokenizer,
+        const std::string& device,
+        const ov::AnyMap& properties,
+        const ov::genai::GenerationConfig& generation_config = {}
+    );
+
     void setupAndCompileModels(
         const std::shared_ptr<ov::Model>& model,
         const std::string& device,
@@ -88,5 +152,6 @@ class StaticLLMPipeline final : public LLMPipelineImplBase {
     ChatHistory m_history;
 };
 
+}  // namespace static_llm
 }  // namespace genai
 }  // namespace ov

From b7051554de9df0441835dad3a43c8e5b527b9621 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 6 Jan 2025 16:08:11 +0400
Subject: [PATCH 085/110] [LLM Bench] Added support of OpenVINO pre-releases
 install and test (#1480)

---
 .../{llm_bench-python.yml => genai-tools.yml} | 85 +++++++++----------
 tests/python_tests/common.py                  |  2 +-
 tests/python_tests/test_kv_cache_eviction.py  |  2 +-
 tests/python_tests/test_vlm_pipeline.py       |  2 +-
 tools/llm_bench/requirements.txt              |  1 +
 tools/who_what_benchmark/requirements.txt     |  6 +-
 6 files changed, 45 insertions(+), 53 deletions(-)
 rename .github/workflows/{llm_bench-python.yml => genai-tools.yml} (78%)

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/genai-tools.yml
similarity index 78%
rename from .github/workflows/llm_bench-python.yml
rename to .github/workflows/genai-tools.yml
index 56145c080c..333bee3e11 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/genai-tools.yml
@@ -1,7 +1,7 @@
 # This workflow will install Python dependencies, run tests and lint with a single version of Python
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 
-name: llm_bench Python Test
+name: GenAI tools
 
 on:
   workflow_dispatch:
@@ -46,7 +46,8 @@ jobs:
         commit_packages_to_provide: wheels
         revision: latest_available_commit
 
-  build:
+  llm_bench:
+    name: 'LLM bench tests'
     defaults:
       run:
         shell: bash
@@ -60,7 +61,6 @@ jobs:
       OV_INSTALL_DIR: ${{ github.workspace }}/ov
       SRC_DIR: ${{ github.workspace }}
       LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
-      WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
 
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -70,6 +70,12 @@ jobs:
         uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
           python-version: ${{ matrix.python-version }}
+      - name: Lint with flake8
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install flake8 pytest black
+          # stop the build if there are Python syntax errors or undefined names
+          python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg
       - name: Download OpenVINO package
         uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
         with:
@@ -78,59 +84,42 @@ jobs:
           merge-multiple: true
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          python -m pip install flake8 pytest black
           python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
           python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
-          GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
+          python -m pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
         working-directory: ${{ env.OV_INSTALL_DIR }}
-      - name: Lint with flake8
-        run: |
-          # stop the build if there are Python syntax errors or undefined names
-          python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg
-          python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg
-      - name: Create code style diff for samples
-        if: failure()
-        run: |
-          python -m black -l 160 -S ${{ env.LLM_BENCH_PYPATH }}/
-          git diff > llm.bench_diff.diff
-      - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
-        if: failure()
-        with:
-          name: llm.bench_diff
-          path: llm.bench_diff.diff
-      - name: Test native pytorch model on Linux
+      - name: Test native pytorch model
         run: |
           git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen
           python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt -ic 20
           rm -rf tiny-random-qwen
         env:
           GIT_LFS_SKIP_SMUDGE: 0
-      - name: Test tiny-random-baichuan2 on Linux Optimum Intel
+      - name: Test tiny-random-baichuan2 Optimum Intel
         run: |
           optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16
           python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum -ic 10
           rm -rf ./ov_models/tiny-random-baichuan2
-      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel
+      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov Optimum Intel
         run: |
           huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7
           python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum --num_steps 4
-      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI
+      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov with GenAI
         run: |
           python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --num_steps 4
-      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA
+      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov with GenAI and LoRA
         run: |
           wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
           python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 --num_steps 4
           rm -rf ./ov_models/lcm_dreamshaper_v7/
-      - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
+      - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Decoding via GenAI
         run: |
           optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
           optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
           python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4 -ic 20
           python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5 -ic 20
           rm -rf ov_models/TinyLlama-1.1B-Chat-v1.0
-      - name: Test whisper-tiny on Linux
+      - name: Test whisper-tiny via GenAI
         run: |
           GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
           cd multilingual_librispeech
@@ -143,60 +132,64 @@ jobs:
           python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1
           rm -rf ./ov_models/whisper-tiny
           rm -rf multilingual_librispeech
-      - name: Text InternVL2-1B on Linux
+      - name: Text InternVL2-1B via GenAI
         run: |
           optimum-cli export openvino --model OpenGVLab/InternVL2-1B ./ov_models/internvl2-1B --task image-text-to-text --trust-remote-code
           python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20
           python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20 --optimum
           rm -rf ./ov_models/internvl2-1B
-      - name: WWB Tests
-        run: |
-          pip install git+https://github.com/huggingface/optimum-intel.git
-          GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
-          python -m pytest -v ${{ env.WWB_PATH }}/tests
-  stateful:
+
+  wwb:
+    name: 'WWB tests'
     defaults:
       run:
         shell: bash
     runs-on: ubuntu-22.04
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.11"]
     needs: [ openvino_download ]
     env:
       OV_INSTALL_DIR: ${{ github.workspace }}/ov
       SRC_DIR: ${{ github.workspace }}
-      LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
       WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
 
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           submodules: recursive
-      - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
-          python-version: "3.11"
+          python-version: ${{ matrix.python-version }}
+      - name: Lint with flake8
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install flake8 pytest black
+          # stop the build if there are Python syntax errors or undefined names
+          python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg
       - name: Download OpenVINO package
         uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
         with:
           name: ${{ needs.openvino_download.outputs.ov_artifact_name }}
           path: ${{ env.OV_INSTALL_DIR }}
           merge-multiple: true
-      - name: Test stateful
+      - name: Install dependencies
         run: |
           python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
           python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
-          GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
-          python ${{ env.LLM_BENCH_PYPATH }}/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ${{ env.SRC_DIR }} --stateful
-          grep beam_idx ${{ env.SRC_DIR }}/pytorch/dldt/FP32/openvino_model.xml
+          python -m pip install -r ${{ env.WWB_PATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
+          python -m pip install git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
         working-directory: ${{ env.OV_INSTALL_DIR }}
       - name: WWB Tests
         run: |
-          pip install pytest
-          pip install git+https://github.com/huggingface/optimum-intel.git
-          GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
+          python -m pip install -v ${{ env.WWB_PATH }}
           python -m pytest -v ${{ env.WWB_PATH }}/tests
 
   Overall_Status:
     name: ci/gha_overall_status_llm_bench
-    needs: [openvino_download, build, stateful]
+    needs: [openvino_download, llm_bench, wwb]
     if: ${{ always() }}
     runs-on: ubuntu-latest
     steps:
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index bb34c1dcd4..aa4c537dd6 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -390,7 +390,7 @@ def compare_generation_results(prompts: List[str], hf_results: List[GenerationRe
 
 def get_hugging_face_models(model_id: str):
     hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, ov_config=get_default_properties())
+    opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_properties())
     return opt_model, hf_tokenizer
 
 
diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py
index 41281e9cab..3dbf9297ee 100644
--- a/tests/python_tests/test_kv_cache_eviction.py
+++ b/tests/python_tests/test_kv_cache_eviction.py
@@ -42,7 +42,7 @@ class ConvertedModel:
 @pytest.fixture(scope='module')
 def converted_model(tmp_path_factory):
     model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
+    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False)
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id
     model.save_pretrained(models_path)
diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py
index 81c181bc54..62c1c27e3b 100644
--- a/tests/python_tests/test_vlm_pipeline.py
+++ b/tests/python_tests/test_vlm_pipeline.py
@@ -19,7 +19,7 @@ def get_ov_model(cache):
     ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True)
     openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
     openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")
-    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, trust_remote_code=True)
+    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True)
     processor.save_pretrained(model_dir)
     model.save_pretrained(model_dir)
     return model_dir
diff --git a/tools/llm_bench/requirements.txt b/tools/llm_bench/requirements.txt
index f5f4a3fdeb..6bf8d8cddf 100644
--- a/tools/llm_bench/requirements.txt
+++ b/tools/llm_bench/requirements.txt
@@ -1,5 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 numpy
+--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino
 openvino-tokenizers
diff --git a/tools/who_what_benchmark/requirements.txt b/tools/who_what_benchmark/requirements.txt
index 9d151abbf3..82a28accee 100644
--- a/tools/who_what_benchmark/requirements.txt
+++ b/tools/who_what_benchmark/requirements.txt
@@ -1,10 +1,8 @@
+accelerate>=0.26.0
 transformers>=4.35.2
 sentence-transformers>=2.2.2
-openvino
-openvino-tokenizers
 openvino-genai
-openvino-telemetry
-optimum-intel>=1.19.0
+optimum-intel[nncf]>=1.19.0
 pandas>=2.0.3
 numpy>=1.23.5
 tqdm>=4.66.1

From 91ebc2548fb6eef80247c6bd4d41cf6aba74592e Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova <ekaterina.aidova@intel.com>
Date: Mon, 6 Jan 2025 17:18:40 +0400
Subject: [PATCH 086/110] [llm bench] remove outdated code from pt models
 loading (#1481)

CVS-150917
fix loading gptq models and some mismatches in files when model exported
without trust_remote_code and executed within
---
 tools/llm_bench/llm_bench_utils/pt_utils.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/llm_bench/llm_bench_utils/pt_utils.py b/tools/llm_bench/llm_bench_utils/pt_utils.py
index dc2c6d05f5..877c135a3c 100644
--- a/tools/llm_bench/llm_bench_utils/pt_utils.py
+++ b/tools/llm_bench/llm_bench_utils/pt_utils.py
@@ -62,11 +62,14 @@ def create_text_gen_model(model_path, device, **kwargs):
             model_class = PT_MODEL_CLASSES_MAPPING.get(model_type, PT_MODEL_CLASSES_MAPPING[default_model_type])
             token_class = TOKENIZE_CLASSES_MAPPING.get(model_type, TOKENIZE_CLASSES_MAPPING[default_model_type])
             start = time.perf_counter()
-            if model_type == 'chatglm':
-                model = model_class.from_pretrained(model_path, trust_remote_code=True).to('cpu', dtype=float)
-            else:
-                model = model_class.from_pretrained(model_path, trust_remote_code=True)
-            tokenizer = token_class.from_pretrained(model_path, trust_remote_code=True)
+            trust_remote_code = False
+            try:
+                model = model_class.from_pretrained(model_path, trust_remote_code=trust_remote_code)
+            except Exception:
+                start = time.perf_counter()
+                trust_remote_code = True
+                model = model_class.from_pretrained(model_path, trust_remote_code=trust_remote_code)
+            tokenizer = token_class.from_pretrained(model_path, trust_remote_code=trust_remote_code)
             end = time.perf_counter()
             from_pretrain_time = end - start
         else:

From d3d628a9080fba3d61df2a212bf0a0bdf0d9bd33 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov <kozzzloff@list.ru>
Date: Mon, 6 Jan 2025 16:18:59 +0300
Subject: [PATCH 087/110] Fixed WWB im2im tests (#1482)

Using `datasets==3.2.0` breaks the Python process with some GIL issue:

```
Fatal Python error: PyGILState_Release: thread state 0x7fa56809ade0 must be current when releasing
Python runtime state: finalizing (tstate=0x0000000000ad2958)

Thread 0x00007fa6ae07c740 (most recent call first):
  <no Python frame>

Extension modules: numpy.core._multiarray_umath, numpy.core._multiarray_tests, numpy.linalg._umath_linalg, numpy.fft._pocketfft_internal, numpy.random._common, numpy.random.bit_generator, numpy.random._boun
ded_integers, numpy.random._mt19937, numpy.random.mtrand, numpy.random._philox,
```

---------

Co-authored-by: Ekaterina Aidova <ekaterina.aidova@intel.com>
---
 .github/workflows/genai-tools.yml                | 1 +
 tools/who_what_benchmark/requirements.txt        | 1 +
 tools/who_what_benchmark/tests/test_cli_image.py | 6 +++---
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/genai-tools.yml b/.github/workflows/genai-tools.yml
index 333bee3e11..d5ce156615 100644
--- a/.github/workflows/genai-tools.yml
+++ b/.github/workflows/genai-tools.yml
@@ -181,6 +181,7 @@ jobs:
           python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
           python -m pip install -r ${{ env.WWB_PATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
           python -m pip install git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
+          pip install -U --force-reinstall datasets==3.1.0
         working-directory: ${{ env.OV_INSTALL_DIR }}
       - name: WWB Tests
         run: |
diff --git a/tools/who_what_benchmark/requirements.txt b/tools/who_what_benchmark/requirements.txt
index 82a28accee..d4b702de78 100644
--- a/tools/who_what_benchmark/requirements.txt
+++ b/tools/who_what_benchmark/requirements.txt
@@ -7,3 +7,4 @@ pandas>=2.0.3
 numpy>=1.23.5
 tqdm>=4.66.1
 diffusers
+datasets<3.2.0
diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
index fec9e96f4c..ccd6ee1cec 100644
--- a/tools/who_what_benchmark/tests/test_cli_image.py
+++ b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -42,8 +42,8 @@ def teardown_module():
         ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"),
         ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"),
         ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"),
-        # ("hf-internal-testing/tiny-stable-diffusion-torch", "image-inpainting", "hf"),
-        # ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-inpainting", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-torch", "image-inpainting", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-inpainting", "hf"),
     ],
 )
 def test_image_model_types(model_id, model_type, backend):
@@ -90,7 +90,7 @@ def test_image_model_types(model_id, model_type, backend):
     list(itertools.product(OV_IMAGE_MODELS,
                            ["image-to-image",
                             "text-to-image",
-                            # "image-inpainting"
+                            "image-inpainting"
                             ])),
 )
 def test_image_model_genai(model_id, model_type):

From b04b28b97b5356635444552cd2591aa661fa2888 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 6 Jan 2025 18:42:17 +0400
Subject: [PATCH 088/110] Update genai-tools.yml (#1484)

Since WWB has its own dedicated pipeline, where only WWB requirements
are installed, datasets are supposed of correct version after
tools/wwb/requirements.txt installation.

Let's check it on CI
---
 .github/workflows/genai-tools.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/genai-tools.yml b/.github/workflows/genai-tools.yml
index d5ce156615..333bee3e11 100644
--- a/.github/workflows/genai-tools.yml
+++ b/.github/workflows/genai-tools.yml
@@ -181,7 +181,6 @@ jobs:
           python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
           python -m pip install -r ${{ env.WWB_PATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
           python -m pip install git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
-          pip install -U --force-reinstall datasets==3.1.0
         working-directory: ${{ env.OV_INSTALL_DIR }}
       - name: WWB Tests
         run: |

From 48dfd1619e42020ef92868051ed342910454af33 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 7 Jan 2025 00:39:11 +0400
Subject: [PATCH 089/110] [TESTS] Use FP32 inference precision, FP16 KV cache
 precision for pipelines (#1485)

OpenVINO plugins enable different kind of optimizations by default like
KV cache compression to int8, fp16 inference precision, while in GenAI
tests we want to test pipelines and how they are compared against HF /
optimum w/o extra optimizations:


https://github.com/openvinotoolkit/openvino.genai/blob/4db67aecac78885c6d1e302f348c9489e2154388/tests/python_tests/common.py#L318-L325

Hopefully, we can merge int8 KV cache by default for CB then
https://github.com/openvinotoolkit/openvino.genai/pull/1206, because in
tests we will still compare FP16 KV cache, while official Validation
should be responsible for validation against reference via WWB metrics.
---
 samples/export-requirements.txt                 |  2 +-
 src/python/openvino_genai/py_openvino_genai.pyi |  2 +-
 src/python/py_continuous_batching_pipeline.cpp  |  7 +++----
 tests/python_tests/common.py                    |  2 +-
 tests/python_tests/ov_genai_test_utils.py       | 12 ++++++------
 tests/python_tests/requirements.txt             |  2 +-
 tests/python_tests/test_continuous_batching.py  |  4 ++--
 tests/python_tests/test_kv_cache_eviction.py    |  8 ++++----
 tests/python_tests/test_vlm_pipeline.py         |  4 ++--
 9 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index af38558656..2f71891b7b 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0  # For Qwen
 transformers_stream_generator==0.0.5  # For Qwen
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 9ff28859b9..d405cd9bbf 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -364,7 +364,7 @@ class ContinuousBatchingPipeline:
     def __init__(self, models_path: os.PathLike, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None:
         ...
     @typing.overload
-    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, **kwargs) -> None:
         ...
     @typing.overload
     def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, generation_config: GenerationConfig) -> GenerationHandle:
diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp
index 48eb124255..975100cb11 100644
--- a/src/python/py_continuous_batching_pipeline.cpp
+++ b/src/python/py_continuous_batching_pipeline.cpp
@@ -223,15 +223,14 @@ void init_continuous_batching_pipeline(py::module_& m) {
         py::arg("properties") = ov::AnyMap({}),
         py::arg("tokenizer_properties") = ov::AnyMap({}))
 
-        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
+        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const py::kwargs& kwargs) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::properties_to_any_map(plugin_config));
+            return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::kwargs_to_any_map(kwargs));
         }),
         py::arg("models_path"),
         py::arg("tokenizer"),
         py::arg("scheduler_config"),
-        py::arg("device"),
-        py::arg("properties") = ov::AnyMap({}))
+        py::arg("device"))
 
         .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer)
         .def("get_config", &ContinuousBatchingPipeline::get_config)
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index aa4c537dd6..2fca58a959 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -306,7 +306,7 @@ def run_continuous_batching(
     if type(generation_configs) is not list:
         generation_configs = [generation_configs] * len(prompts)
  
-    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU')
+    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_properties())
     output = cb_pipe.generate(prompts, generation_configs)
 
     del cb_pipe
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 00c74f6628..66fb58f46d 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -14,7 +14,7 @@
 import json
 
 import openvino_genai as ov_genai
-
+from common import get_default_properties
 
 def get_models_list():
     precommit_models = [
@@ -92,7 +92,7 @@ def read_model(params, **tokenizer_kwargs):
 
     if (models_path / "openvino_model.xml").exists():
         opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True,
-                                                       compile=False, device='CPU')
+                                                       compile=False, device='CPU', ov_config=get_default_properties())
     else:
         ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer,
                                                                              with_detokenizer=True,
@@ -104,7 +104,7 @@ def read_model(params, **tokenizer_kwargs):
         hf_tokenizer.save_pretrained(models_path)
 
         opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True,
-                                                       compile=False, device='CPU', load_in_8bit=False)
+                                                       compile=False, device='CPU', load_in_8bit=False, ov_config=get_default_properties())
         opt_model.generation_config.save_pretrained(models_path)
         opt_model.config.save_pretrained(models_path)
         opt_model.save_pretrained(models_path)
@@ -114,7 +114,7 @@ def read_model(params, **tokenizer_kwargs):
         models_path,
         hf_tokenizer,
         opt_model,
-        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False),
+        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **get_default_properties()),
     )
 
 
@@ -178,7 +178,7 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):
         with (temp_path / config_name).open('w') as f:
             json.dump(config_json, f)
 
-    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU')
+    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **get_default_properties())
 
     for _, config_name in configs:
         os.remove(temp_path / config_name)
@@ -188,4 +188,4 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):
 
 @functools.lru_cache(1)
 def get_continuous_batching(path):
-    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig())
+    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **get_default_properties())
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index c851c71ee5..e23eaacc21 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.32.1
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest
diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py
index fabcf06b71..d7ce0b1ece 100644
--- a/tests/python_tests/test_continuous_batching.py
+++ b/tests/python_tests/test_continuous_batching.py
@@ -9,7 +9,7 @@
 from pathlib import Path
 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer
 
-from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
+from common import get_default_properties, get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
     get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
@@ -155,7 +155,7 @@ def test_post_oom_health(tmp_path, sampling_config):
     models_path : Path = tmp_path / model_id
     convert_models(opt_model, hf_tokenizer, models_path)
 
-    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")
+    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **get_default_properties())
 
     # First run should return incomplete response
     output = cb_pipe.generate(["What is OpenVINO?"], [generation_config])
diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py
index 3dbf9297ee..428047ea28 100644
--- a/tests/python_tests/test_kv_cache_eviction.py
+++ b/tests/python_tests/test_kv_cache_eviction.py
@@ -15,7 +15,7 @@
 from openvino import serialize
 from transformers import AutoTokenizer
 
-from common import TESTS_ROOT, run_cb_pipeline_with_ref
+from common import TESTS_ROOT, run_cb_pipeline_with_ref, get_default_properties
 
 
 def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -42,7 +42,7 @@ class ConvertedModel:
 @pytest.fixture(scope='module')
 def converted_model(tmp_path_factory):
     model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False)
+    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=get_default_properties())
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id
     model.save_pretrained(models_path)
@@ -112,8 +112,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     scheduler_config_opt.enable_prefix_caching = enable_prefix_caching
 
     models_path = converted_model.models_path
-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU")
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_properties())
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_properties())
 
     tokenizer = converted_model.tokenizer
 
diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py
index 62c1c27e3b..e6f897bcef 100644
--- a/tests/python_tests/test_vlm_pipeline.py
+++ b/tests/python_tests/test_vlm_pipeline.py
@@ -7,7 +7,7 @@
 import transformers
 from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import VLMPipeline, GenerationConfig
-from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters
+from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters, get_default_properties
 
 def get_ov_model(cache):
     model_dir = cache.mkdir("tiny-random-minicpmv-2_6")
@@ -19,7 +19,7 @@ def get_ov_model(cache):
     ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True)
     openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
     openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")
-    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True)
+    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_properties())
     processor.save_pretrained(model_dir)
     model.save_pretrained(model_dir)
     return model_dir

From 09a542608b560959edb96e628915a1d6bd780c26 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova <ekaterina.aidova@intel.com>
Date: Tue, 7 Jan 2025 11:13:35 +0400
Subject: [PATCH 090/110] [llm_bench] add support granite and granitemoe models
 (#1486)

related to https://github.com/huggingface/optimum-intel/pull/1099
added opportunity to test these models via llm_bench

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 tools/llm_bench/llm_bench_utils/config_class.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/llm_bench/llm_bench_utils/config_class.py b/tools/llm_bench/llm_bench_utils/config_class.py
index 7dd27b198b..9c149c98b6 100644
--- a/tools/llm_bench/llm_bench_utils/config_class.py
+++ b/tools/llm_bench/llm_bench_utils/config_class.py
@@ -102,7 +102,9 @@
         "olmo",
         "phi3",
         "starcoder",
-        "instruct-gpt"
+        "instruct-gpt",
+        "granite",
+        "granitemoe",
     ],
     'ldm_super_resolution': ['ldm-super-resolution'],
 }

From 9ac38f0d5c79c0864dafd8484b5a696261e3bfda Mon Sep 17 00:00:00 2001
From: Helena Kloosterman <helena.kloosterman@intel.com>
Date: Tue, 7 Jan 2025 10:01:12 +0100
Subject: [PATCH 091/110] Update VLM example code in README (#1466)

Add `pipe.start_chat()` to VLM example. Without this, inference with
several models results in empty outputs.

This can be removed if this will be the default for VLM models, but at
the moment, the most basic example should work with supported models.

Also changed printing the VLMDecodedResults to getting the generated
text and printing that (see comment from Ilya).
---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index be3de5e8ce..9d4543bed4 100644
--- a/README.md
+++ b/README.md
@@ -133,13 +133,15 @@ from PIL import Image
 
 # Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
 pipe = ov_genai.VLMPipeline("./InternVL2-1B", "CPU")
+pipe.start_chat()
 
 image = Image.open("dog.jpg")
 image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
 image_data = ov.Tensor(image_data)  
 
 prompt = "Can you describe the image?"
-print(pipe.generate(prompt, image=image_data, max_new_tokens=100))
+result = pipe.generate(prompt, image=image_data, max_new_tokens=100)
+print(result.texts[0])
 ```
 
 ### Run generation using VLMPipeline in C++

From d7d117a4a6a47f024a07fb914d1ea3a1dd829c58 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@gmail.com>
Date: Tue, 7 Jan 2025 10:01:25 +0100
Subject: [PATCH 092/110] Fix text streaming in samples (#1487)

Fix issue https://github.com/openvinotoolkit/openvino.genai/issues/1381

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 samples/python/multinomial_causal_lm/multinomial_causal_lm.py | 2 +-
 tools/llm_bench/llm_bench_utils/ov_utils.py                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py
index 953388ed6a..5ec9d54601 100755
--- a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py
+++ b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py
@@ -90,7 +90,7 @@ def put(self, token_id: int) -> bool:
             word = text[self.print_len:]            
             self.tokens_cache = []
             self.print_len = 0
-        elif len(text) >= 3 and text[-3:] == chr(65533):
+        elif len(text) >= 3 and text[-1] == chr(65533):
             # Don't print incomplete text.
             pass
         elif len(text) > self.print_len:
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index 316c9d0b89..596da8cb3a 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -701,7 +701,7 @@ def put(self, token_id: int) -> bool:
                 word = text[self.print_len:]
                 self.tokens_cache = []
                 self.print_len = 0
-            elif len(text) >= 3 and text[-3:] == chr(65533):
+            elif len(text) >= 3 and text[-1] == chr(65533):
                 # Don't print incomplete text.
                 pass
             elif len(text) > self.print_len:

From 65e8362e85a887af22e105d97d2333db921a1766 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov <kozzzloff@list.ru>
Date: Tue, 7 Jan 2025 12:01:45 +0300
Subject: [PATCH 093/110] Added ability to compare results vs. llama.cpp
 (#1461)

Example:
```bash
rm -rf results/smollm2_N_FP16/gt.csv
mkdir -p results/smollm2_N_FP16

# References from PyTorch FP16
wwb --base-model HuggingFaceTB/SmolLM2-360M-Instruct --gt-data results/smollm2_N_FP16/gt.csv --hf --num-samples 4


#huggingface-cli download "bartowski/SmolLM2-360M-Instruct-GGUF" "SmolLM2-360M-Instruct-f16.gguf"
wwb --target-model models/SmolLM2-360M-Instruct-f16.gguf --gt-data results/smollm2_N_FP16/gt.csv --llamacpp --output results/smollm2_N_L_FP16 --num-samples
```
---
 .../whowhatbench/model_loaders.py             | 20 +++++-
 .../whowhatbench/text_evaluator.py            | 27 +++++---
 tools/who_what_benchmark/whowhatbench/wwb.py  | 61 ++++++++++++++++---
 3 files changed, 88 insertions(+), 20 deletions(-)

diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
index 8a00c70852..c792a3c0b2 100644
--- a/tools/who_what_benchmark/whowhatbench/model_loaders.py
+++ b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -41,8 +41,19 @@ def load_text_genai_pipeline(model_dir, device="CPU", ov_config=None):
     return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device=device, **ov_config), model_dir, "text")
 
 
+def load_text_llamacpp_pipeline(model_dir):
+    try:
+        from llama_cpp import Llama
+    except ImportError:
+        logger.error(
+            "Failed to import llama_cpp package. Please install llama-cpp-python.")
+        exit(-1)
+    model = Llama(model_dir)
+    return model
+
+
 def load_text_model(
-    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, use_llamacpp=False,
 ):
     if use_hf:
         logger.info("Using HF Transformers API")
@@ -53,6 +64,9 @@ def load_text_model(
     elif use_genai:
         logger.info("Using OpenVINO GenAI API")
         model = load_text_genai_pipeline(model_id, device, ov_config)
+    elif use_llamacpp:
+        logger.info("Using llama.cpp API")
+        model = load_text_llamacpp_pipeline(model_id)
     else:
         logger.info("Using Optimum API")
         from optimum.intel.openvino import OVModelForCausalLM
@@ -276,7 +290,7 @@ def load_inpainting_model(
 
 
 def load_model(
-    model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+    model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, use_llamacpp=False
 ):
     if model_id is None:
         return None
@@ -288,7 +302,7 @@ def load_model(
         ov_options = {}
 
     if model_type == "text":
-        return load_text_model(model_id, device, ov_options, use_hf, use_genai)
+        return load_text_model(model_id, device, ov_options, use_hf, use_genai, use_llamacpp)
     elif model_type == "text-to-image":
         return load_text2image_model(
             model_id, device, ov_options, use_hf, use_genai
diff --git a/tools/who_what_benchmark/whowhatbench/text_evaluator.py b/tools/who_what_benchmark/whowhatbench/text_evaluator.py
index 50ce224def..433521a186 100644
--- a/tools/who_what_benchmark/whowhatbench/text_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/text_evaluator.py
@@ -108,6 +108,7 @@ def __init__(
         generation_config=None,
         generation_config_base=None,
         seqs_per_request=None,
+        use_chat_template=None,
     ) -> None:
         assert (
             base_model is not None or gt_data is not None
@@ -123,6 +124,7 @@ def __init__(
         self.generation_config_base = generation_config
         self.seqs_per_request = seqs_per_request
         self.generation_fn = gen_answer_fn
+        self.use_chat_template = use_chat_template
         if self.generation_config is not None:
             assert self.seqs_per_request is not None
 
@@ -202,15 +204,21 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
         return res
 
     def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
-        def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question):
-            inputs = self.tokenizer(prompt, return_tensors="pt")
-
-            tokens = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
-
-            if crop_question:
-                tokens = tokens[:, inputs["input_ids"].shape[-1] :]
-
-            return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
+        def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question, use_chat_template=False):
+            if use_chat_template:
+                message = [{"role": "user", "content": prompt}]
+                inputs = tokenizer.apply_chat_template(message, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+                tokens = model.generate(inputs, do_sample=False, max_new_tokens=max_new_tokens)
+                if crop_question:
+                    tokens = tokens[:, inputs.shape[-1]:]
+                res = self.tokenizer.decode(tokens[0], skip_special_tokens=True)
+                return res
+            else:
+                inputs = self.tokenizer(prompt, return_tensors="pt")
+                tokens = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
+                if crop_question:
+                    tokens = tokens[:, inputs["input_ids"].shape[-1] :]
+                return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
 
         gen_answer_fn = gen_answer_fn or default_gen_answer
 
@@ -250,6 +258,7 @@ def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question):
                         p,
                         self.max_new_tokens,
                         self._crop_question,
+                        self.use_chat_template
                     )
                 )
         else:
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 7acf3cf5aa..7d4354f846 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -40,6 +40,11 @@ def parse_args():
         default=None,
         help="Tokenizer for divergency metric. If not provided, it will be load from base_model or target_model.",
     )
+    parser.add_argument(
+        "--chat-template",
+        action="store_true",
+        help="Whether apply the default chat template.",
+    )
     parser.add_argument(
         "--gt-data",
         default=None,
@@ -137,6 +142,11 @@ def parse_args():
         action="store_true",
         help="Use LLMPipeline from transformers library to instantiate the model.",
     )
+    parser.add_argument(
+        "--llamacpp",
+        action="store_true",
+        help="Use llama-cpp-python to instantiate the model.",
+    )
     parser.add_argument(
         "--image-size",
         type=int,
@@ -190,9 +200,13 @@ def load_prompts(args):
 def load_tokenizer(args):
     tokenizer = None
     if args.tokenizer is not None:
-        tokenizer = AutoTokenizer.from_pretrained(
-            args.tokenizer, trust_remote_code=True
-        )
+        if args.llamacpp:
+            from llama_cpp.llama_tokenizer import LlamaHFTokenizer
+            tokenizer = LlamaHFTokenizer.from_pretrained(args.tokenizer)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(
+                args.tokenizer, trust_remote_code=True
+            )
     elif args.base_model is not None:
         tokenizer = AutoTokenizer.from_pretrained(
             args.base_model, trust_remote_code=True
@@ -246,8 +260,29 @@ def diff_strings(a: str, b: str, *, use_loguru_colors: bool = False) -> str:
     return "".join(output)
 
 
-def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question):
-    return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens)
+def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False):
+    if use_chat_template:
+        model.start_chat()
+        result = model.generate(question, do_sample=False, max_new_tokens=max_new_tokens)
+        model.finish_chat()
+        return result
+    else:
+        return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens)
+
+
+def llamacpp_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False):
+    if use_chat_template:
+        output = model.create_chat_completion(messages=[{"role": "user", "content": question}], max_tokens=max_new_tokens, temperature=0.0)
+        text = output["choices"][0]["message"]["content"]
+        if skip_question:
+            text = text[len(question):]
+        return text
+    else:
+        output = model(question, max_tokens=max_new_tokens, echo=True, temperature=0.0)
+        text = output["choices"][0]["text"]
+        if skip_question:
+            text = text[len(question):]
+        return text
 
 
 def genai_gen_image(model, prompt, num_inference_steps, generator=None):
@@ -322,7 +357,15 @@ def create_evaluator(base_model, args):
         prompts = load_prompts(args)
 
         if task == "text":
-            tokenizer = load_tokenizer(args)
+            tokenizer = load_tokenizer(args) if not args.llamacpp else None
+
+            if args.genai:
+                gen_answer_fn = genai_gen_text
+            elif args.llamacpp:
+                gen_answer_fn = llamacpp_gen_text
+            else:
+                gen_answer_fn = None
+
             return EvaluatorCLS(
                 base_model=base_model,
                 gt_data=args.gt_data,
@@ -331,7 +374,8 @@ def create_evaluator(base_model, args):
                 similarity_model_id=args.data_encoder,
                 num_samples=args.num_samples,
                 language=args.language,
-                gen_answer_fn=genai_gen_text if args.genai else None,
+                gen_answer_fn=gen_answer_fn,
+                use_chat_template=args.chat_template,
             )
         elif task == "text-to-image":
             return EvaluatorCLS(
@@ -467,10 +511,11 @@ def main():
                 args.ov_config,
                 args.hf,
                 args.genai,
+                args.llamacpp
             )
             all_metrics_per_question, all_metrics = evaluator.score(
                 target_model,
-                evaluator.get_generation_fn() if args.genai else None,
+                evaluator.get_generation_fn() if args.genai or args.llamacpp else None,
                 output_dir=args.output
             )
         logger.info("Metrics for model: %s", args.target_model)

From db0fb9a27a18d1080bdb152c5c845e1a0a9b5941 Mon Sep 17 00:00:00 2001
From: "Anastasiya(Asya) Pronina" <anastasiya.pronina@intel.com>
Date: Tue, 7 Jan 2025 10:02:01 +0100
Subject: [PATCH 094/110] Replace 'CACHE_DIR' with 'NPUW_CACHE_DIR' in
 StatefulLLMPipeline (#1489)

Handle `CACHE_DIR` in `StatefulLLMPipeline` the same way as in
`StatelessLLMPipeline`
---
 src/cpp/src/llm_pipeline_static.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 94aa6e19fe..c98b571179 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -739,7 +739,10 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
 
     rename_key(pipeline_config, "PREFILL_CONFIG", "NPUW_LLM_PREFILL_CONFIG");
     rename_key(pipeline_config, "GENERATE_CONFIG", "NPUW_LLM_GENERATE_CONFIG");
-  
+
+    // Replace CACHE_DIR option if NPUW is enabled
+    set_npuw_cache_dir(pipeline_config);
+
     return std::make_shared<ov::CompiledModel>(genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config));
 }
 

From 3e12db7a6f7e461d928abd0c8fcaca9a67db06bf Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 7 Jan 2025 18:43:59 +0400
Subject: [PATCH 095/110] Update datasets requirement from <3.2.0 to <3.3.0 in
 /tools/who_what_benchmark (#1491)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updates the requirements on
[datasets](https://github.com/huggingface/datasets) to permit the latest
version.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/huggingface/datasets/releases">datasets's
releases</a>.</em></p>
<blockquote>
<h2>3.2.0</h2>
<h2>Dataset Features</h2>
<ul>
<li>Faster parquet streaming + filters with predicate pushdown by <a
href="https://github.com/lhoestq"><code>@​lhoestq</code></a> in <a
href="https://redirect.github.com/huggingface/datasets/pull/7309">huggingface/datasets#7309</a>
<ul>
<li>Up to +100% streaming speed</li>
<li>Fast filtering via predicate pushdown (skip files/row groups based
on predicate instead of downloading the full data), e.g.
<pre lang="python"><code>from datasets import load_dataset
filters = [('date', '&gt;=', '2023')]
ds = load_dataset(&quot;HuggingFaceFW/fineweb-2&quot;,
&quot;fra_Latn&quot;, streaming=True, filters=filters)
</code></pre>
</li>
</ul>
</li>
</ul>
<h2>Other improvements and bug fixes</h2>
<ul>
<li>fix conda release worlflow by <a
href="https://github.com/lhoestq"><code>@​lhoestq</code></a> in <a
href="https://redirect.github.com/huggingface/datasets/pull/7272">huggingface/datasets#7272</a></li>
<li>Add link to video dataset by <a
href="https://github.com/NielsRogge"><code>@​NielsRogge</code></a> in <a
href="https://redirect.github.com/huggingface/datasets/pull/7277">huggingface/datasets#7277</a></li>
<li>Raise error for incorrect JSON serialization by <a
href="https://github.com/varadhbhatnagar"><code>@​varadhbhatnagar</code></a>
in <a
href="https://redirect.github.com/huggingface/datasets/pull/7273">huggingface/datasets#7273</a></li>
<li>support for custom feature encoding/decoding by <a
href="https://github.com/alex-hh"><code>@​alex-hh</code></a> in <a
href="https://redirect.github.com/huggingface/datasets/pull/7284">huggingface/datasets#7284</a></li>
<li>update load_dataset doctring by <a
href="https://github.com/lhoestq"><code>@​lhoestq</code></a> in <a
href="https://redirect.github.com/huggingface/datasets/pull/7301">huggingface/datasets#7301</a></li>
<li>Let server decide default repo visibility by <a
href="https://github.com/Wauplin"><code>@​Wauplin</code></a> in <a
href="https://redirect.github.com/huggingface/datasets/pull/7302">huggingface/datasets#7302</a></li>
<li>fix: update elasticsearch version by <a
href="https://github.com/ruidazeng"><code>@​ruidazeng</code></a> in <a
href="https://redirect.github.com/huggingface/datasets/pull/7300">huggingface/datasets#7300</a></li>
<li>Fix typing in iterable_dataset.py by <a
href="https://github.com/lhoestq"><code>@​lhoestq</code></a> in <a
href="https://redirect.github.com/huggingface/datasets/pull/7304">huggingface/datasets#7304</a></li>
<li>Updated inconsistent output in documentation examples for
<code>ClassLabel</code> by <a
href="https://github.com/sergiopaniego"><code>@​sergiopaniego</code></a>
in <a
href="https://redirect.github.com/huggingface/datasets/pull/7293">huggingface/datasets#7293</a></li>
<li>More docs to from_dict to mention that the result lives in RAM by <a
href="https://github.com/lhoestq"><code>@​lhoestq</code></a> in <a
href="https://redirect.github.com/huggingface/datasets/pull/7316">huggingface/datasets#7316</a></li>
<li>Release: 3.2.0 by <a
href="https://github.com/lhoestq"><code>@​lhoestq</code></a> in <a
href="https://redirect.github.com/huggingface/datasets/pull/7317">huggingface/datasets#7317</a></li>
</ul>
<h2>New Contributors</h2>
<ul>
<li><a href="https://github.com/ruidazeng"><code>@​ruidazeng</code></a>
made their first contribution in <a
href="https://redirect.github.com/huggingface/datasets/pull/7300">huggingface/datasets#7300</a></li>
<li><a
href="https://github.com/sergiopaniego"><code>@​sergiopaniego</code></a>
made their first contribution in <a
href="https://redirect.github.com/huggingface/datasets/pull/7293">huggingface/datasets#7293</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/huggingface/datasets/compare/3.1.0...3.2.0">https://github.com/huggingface/datasets/compare/3.1.0...3.2.0</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/huggingface/datasets/commit/fba47587a4f1771ad9f934a5e66f0860231405cd"><code>fba4758</code></a>
Release: 3.2.0 (<a
href="https://redirect.github.com/huggingface/datasets/issues/7317">#7317</a>)</li>
<li><a
href="https://github.com/huggingface/datasets/commit/8983782529101c1d9b526716dc68fbf3103a4f0e"><code>8983782</code></a>
More docs to from_dict to mention that the result lives in RAM (<a
href="https://redirect.github.com/huggingface/datasets/issues/7316">#7316</a>)</li>
<li><a
href="https://github.com/huggingface/datasets/commit/661d7bac29689e2d9eb74fba3d243939d6e9f25b"><code>661d7ba</code></a>
Faster parquet streaming + filters with predicate pushdown (<a
href="https://redirect.github.com/huggingface/datasets/issues/7309">#7309</a>)</li>
<li><a
href="https://github.com/huggingface/datasets/commit/b60ebb83cd668decaa21df66148beb44bce57739"><code>b60ebb8</code></a>
Updated inconsistent output in documentation examples for
<code>ClassLabel</code> (<a
href="https://redirect.github.com/huggingface/datasets/issues/7293">#7293</a>)</li>
<li><a
href="https://github.com/huggingface/datasets/commit/c9d3450d4a87de629f4b535650dceed74b499736"><code>c9d3450</code></a>
Update iterable_dataset.py (<a
href="https://redirect.github.com/huggingface/datasets/issues/7304">#7304</a>)</li>
<li><a
href="https://github.com/huggingface/datasets/commit/38d648e23cceb6b898813828b4b5370afea64119"><code>38d648e</code></a>
fix: update elasticsearch version (<a
href="https://redirect.github.com/huggingface/datasets/issues/7300">#7300</a>)</li>
<li><a
href="https://github.com/huggingface/datasets/commit/c8252f279934675c5ad624774c0d9c7af49225dc"><code>c8252f2</code></a>
Let server decide default repo visibility (<a
href="https://redirect.github.com/huggingface/datasets/issues/7302">#7302</a>)</li>
<li><a
href="https://github.com/huggingface/datasets/commit/06c3235a640d00bf59223ebabf3cb489a2891767"><code>06c3235</code></a>
update load_dataset doctring (<a
href="https://redirect.github.com/huggingface/datasets/issues/7301">#7301</a>)</li>
<li><a
href="https://github.com/huggingface/datasets/commit/17f17b3fe7f276e1b019cca8aa651bf7c818a928"><code>17f17b3</code></a>
support for custom feature encoding/decoding (<a
href="https://redirect.github.com/huggingface/datasets/issues/7284">#7284</a>)</li>
<li><a
href="https://github.com/huggingface/datasets/commit/2049c00921c59cdeb835137a1c49639cf175af07"><code>2049c00</code></a>
Raise error for incorrect JSON serialization (<a
href="https://redirect.github.com/huggingface/datasets/issues/7273">#7273</a>)</li>
<li>Additional commits viewable in <a
href="https://github.com/huggingface/datasets/compare/0.0.2...3.2.0">compare
view</a></li>
</ul>
</details>
<br />


Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 tools/who_what_benchmark/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/who_what_benchmark/requirements.txt b/tools/who_what_benchmark/requirements.txt
index d4b702de78..ab4192d56c 100644
--- a/tools/who_what_benchmark/requirements.txt
+++ b/tools/who_what_benchmark/requirements.txt
@@ -7,4 +7,4 @@ pandas>=2.0.3
 numpy>=1.23.5
 tqdm>=4.66.1
 diffusers
-datasets<3.2.0
+datasets<3.3.0

From 74fd08fa19e2cf7ffe0eaecb3f539f3f737ee002 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 7 Jan 2025 22:16:50 +0400
Subject: [PATCH 096/110] Revert "Update datasets requirement from <3.2.0 to
 <3.3.0 in /tools/who_what_benchmark" (#1495)

Reverts openvinotoolkit/openvino.genai#1491
---
 tools/who_what_benchmark/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/who_what_benchmark/requirements.txt b/tools/who_what_benchmark/requirements.txt
index ab4192d56c..d4b702de78 100644
--- a/tools/who_what_benchmark/requirements.txt
+++ b/tools/who_what_benchmark/requirements.txt
@@ -7,4 +7,4 @@ pandas>=2.0.3
 numpy>=1.23.5
 tqdm>=4.66.1
 diffusers
-datasets<3.3.0
+datasets<3.2.0

From d48326b0ecdefb5dd2a758a3536c4e7011c82934 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 8 Jan 2025 02:04:44 +0400
Subject: [PATCH 097/110] Enable ov_add_api_validator_post_build_step (#1402)

---
 src/cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index 24367c17ce..ff804cd85a 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -101,7 +101,7 @@ endif()
 
 if(OpenVINODeveloperPackage_FOUND)
     # must be called after all target_link_libraries
-    # ov_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+    ov_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
 
     ov_ncc_naming_style(FOR_TARGET ${TARGET_NAME}
                         SOURCE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/include")

From cdf8118377b6654daeedf1634d6d157ac7668767 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 8 Jan 2025 08:06:57 +0400
Subject: [PATCH 098/110] [CB] Fix key cache shape for GPU (#1497)

Regression after
https://github.com/openvinotoolkit/openvino.genai/pull/1416

CVS-160158
---
 src/cpp/src/device_config.hpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index cc2e21b9a1..fee6c7abd1 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -117,22 +117,22 @@ class DeviceConfig {
         }
 
         for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
-            m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                         ov::Dimension(m_num_kv_heads[layer_id]),
-                                                         ov::Dimension(m_block_size),
-                                                         ov::Dimension(m_head_size)});
-
             m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
                                                            ov::Dimension(m_num_kv_heads[layer_id]),
                                                            ov::Dimension(m_block_size),
                                                            ov::Dimension(m_head_size)});
 
-            if (m_device.find("GPU") != std::string::npos) {
+            if (m_device.find("GPU") == std::string::npos) {
+                m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
+                                                             ov::Dimension(m_num_kv_heads[layer_id]),
+                                                             ov::Dimension(m_block_size),
+                                                             ov::Dimension(m_head_size)});
+            } else  if (m_device.find("GPU") != std::string::npos) {
                 // Update key shape, as the key's shape is different from the value's shape
                 m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                     ov::Dimension(m_num_kv_heads[layer_id]),
-                                                     ov::Dimension(m_head_size),
-                                                     ov::Dimension(m_block_size)});
+                                                             ov::Dimension(m_num_kv_heads[layer_id]),
+                                                             ov::Dimension(m_head_size),
+                                                             ov::Dimension(m_block_size)});
             }
         }
     }

From fb16a71b3c5d8736d75f4201e33d398e967fa152 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 8 Jan 2025 14:47:39 +0400
Subject: [PATCH 099/110] Finally drop old LLM bench folder (#1498)

---
 llm_bench/python/README.md                    | 4 ----
 llm_bench/python/who_what_benchmark/README.md | 4 ----
 2 files changed, 8 deletions(-)
 delete mode 100644 llm_bench/python/README.md
 delete mode 100644 llm_bench/python/who_what_benchmark/README.md

diff --git a/llm_bench/python/README.md b/llm_bench/python/README.md
deleted file mode 100644
index 272ed11d1b..0000000000
--- a/llm_bench/python/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Benchmarking Script for Large Language Models
-
-> [!IMPORTANT]  
-> LLM bench code was moved to [tools](../../tools/llm_bench/) directory. Please navigate to the new directory for continue of tool usage.
\ No newline at end of file
diff --git a/llm_bench/python/who_what_benchmark/README.md b/llm_bench/python/who_what_benchmark/README.md
deleted file mode 100644
index 414b4d9342..0000000000
--- a/llm_bench/python/who_what_benchmark/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Simple Accuracy Benchmark for Generative AI models
-
-> [!IMPORTANT]  
-> Who What Benchmark code was moved to [tools](../../../tools/who_what_benchmark/) directory. Please navigate to the new directory for continue of tool usage.
\ No newline at end of file

From 5ab58ca70dd2774595ad82768074c7a497aa9377 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 8 Jan 2025 14:49:25 +0400
Subject: [PATCH 100/110] Add complete version information (#1500)

CVS-160212
---
 .github/workflows/genai-tools.yml             |  2 +-
 .github/workflows/linux.yml                   |  2 +-
 .../workflows/stable_diffusion_1_5_cpp.yml    |  4 +-
 CMakeLists.txt                                |  1 +
 cmake/templates/__version__.py.in             |  5 --
 cmake/templates/version.cpp.in                | 19 +++++
 cmake/templates/version.hpp.in                | 34 +++++++++
 cmake/version.cmake                           | 72 +++++++++++++++++++
 src/cpp/CMakeLists.txt                        | 16 ++++-
 src/python/CMakeLists.txt                     | 16 ++---
 src/python/clean_version.cmake                | 21 ++++++
 src/python/openvino_genai/__init__.py         |  5 +-
 src/python/openvino_genai/__init__.pyi        |  5 +-
 .../openvino_genai/py_openvino_genai.pyi      |  6 +-
 src/python/py_openvino_genai.cpp              |  7 ++
 15 files changed, 190 insertions(+), 25 deletions(-)
 delete mode 100644 cmake/templates/__version__.py.in
 create mode 100644 cmake/templates/version.cpp.in
 create mode 100644 cmake/templates/version.hpp.in
 create mode 100644 cmake/version.cmake
 create mode 100644 src/python/clean_version.cmake

diff --git a/.github/workflows/genai-tools.yml b/.github/workflows/genai-tools.yml
index 333bee3e11..bd6cb46362 100644
--- a/.github/workflows/genai-tools.yml
+++ b/.github/workflows/genai-tools.yml
@@ -44,7 +44,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: latest_available_commit
+        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
 
   llm_bench:
     name: 'LLM bench tests'
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 0a991e2a54..0d7a5b7bae 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -52,7 +52,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: latest_available_commit
+        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
 
     - name: Clone docker tag from OpenVINO repo
       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml
index e0bf5371b3..3b01697f26 100644
--- a/.github/workflows/stable_diffusion_1_5_cpp.yml
+++ b/.github/workflows/stable_diffusion_1_5_cpp.yml
@@ -45,7 +45,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: latest_available_commit
+        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
 
   openvino_download_windows:
     name: Download OpenVINO for Windows
@@ -71,7 +71,7 @@ jobs:
       with:
         platform: windows
         commit_packages_to_provide: wheels
-        revision: latest_available_commit
+        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
 
   stable_diffusion_1_5_cpp-linux:
     runs-on: ubuntu-22.04-8-cores
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 181132e210..3a67a24bab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,7 @@ if(NOT OpenVINODeveloperPackage_FOUND)
 endif()
 
 include(cmake/features.cmake)
+include(cmake/version.cmake)
 
 if(ENABLE_PYTHON)
     # the following two calls are required for cross-compilation
diff --git a/cmake/templates/__version__.py.in b/cmake/templates/__version__.py.in
deleted file mode 100644
index ce8e01a246..0000000000
--- a/cmake/templates/__version__.py.in
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-# Will be overwritten by cmake.
-__version__ = "@OpenVINOGenAI_VERSION@"
diff --git a/cmake/templates/version.cpp.in b/cmake/templates/version.cpp.in
new file mode 100644
index 0000000000..f6015832f9
--- /dev/null
+++ b/cmake/templates/version.cpp.in
@@ -0,0 +1,19 @@
+// Copyright (C) 2023-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "openvino/genai/version.hpp"
+
+namespace ov {
+namespace genai {
+
+const Version get_version() {
+    const static Version version = {
+        "@OpenVINOGenAI_FULL_VERSION@",
+        "OpenVINO GenAI version",
+    };
+
+    return version;
+}
+
+} // namespace genai
+} // namespace ov
diff --git a/cmake/templates/version.hpp.in b/cmake/templates/version.hpp.in
new file mode 100644
index 0000000000..34120ef632
--- /dev/null
+++ b/cmake/templates/version.hpp.in
@@ -0,0 +1,34 @@
+// Copyright (C) 2023-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "openvino/core/version.hpp"
+#include "openvino/genai/visibility.hpp"
+
+/**
+ * OpenVINO GenAI major version
+ */
+#define OPENVINO_GENAI_VERSION_MAJOR @OpenVINOGenAI_VERSION_MAJOR@
+
+/**
+ * OpenVINO GenAI minor version
+ */
+#define OPENVINO_GENAI_VERSION_MINOR @OpenVINOGenAI_VERSION_MINOR@
+
+/**
+ * OpenVINO GenAI patch version
+ */
+#define OPENVINO_GENAI_VERSION_PATCH @OpenVINOGenAI_VERSION_PATCH@
+
+namespace ov {
+namespace genai {
+
+/**
+ * Returns OpenVINO GenAI full version including git commit and hash information in form of:
+ *   <MAJOR>.<MINOR>.<PATCH>.<REVISION>-<COMMIT NUMBER>-<COMMIT HASH>[-<BRANCH SUFFIX>]
+ */
+OPENVINO_EXTERN_C OPENVINO_GENAI_EXPORTS const ov::Version OPENVINO_CDECL get_version();
+
+} // namespace genai
+} // namespace ov
diff --git a/cmake/version.cmake b/cmake/version.cmake
new file mode 100644
index 0000000000..b9b51e8fe2
--- /dev/null
+++ b/cmake/version.cmake
@@ -0,0 +1,72 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+find_package(Git QUIET)
+
+function(ov_genai_branch_name VAR)
+    if(GIT_FOUND)
+        execute_process(
+                COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
+                WORKING_DIRECTORY ${OpenVINOGenAI_SOURCE_DIR}
+                OUTPUT_VARIABLE GIT_BRANCH
+                RESULT_VARIABLE EXIT_CODE
+                OUTPUT_STRIP_TRAILING_WHITESPACE)
+        if(EXIT_CODE EQUAL 0)
+            set(${VAR} ${GIT_BRANCH} PARENT_SCOPE)
+        endif()
+    endif()
+endfunction()
+
+function(ov_genai_commit_hash VAR)
+    if(GIT_FOUND)
+        execute_process(
+                COMMAND ${GIT_EXECUTABLE} rev-parse --short=11 HEAD
+                WORKING_DIRECTORY ${OpenVINOGenAI_SOURCE_DIR}
+                OUTPUT_VARIABLE GIT_COMMIT_HASH
+                RESULT_VARIABLE EXIT_CODE
+                OUTPUT_STRIP_TRAILING_WHITESPACE)
+        if(EXIT_CODE EQUAL 0)
+            set(${VAR} ${GIT_COMMIT_HASH} PARENT_SCOPE)
+        endif()
+    endif()
+endfunction()
+
+function(ov_genai_commit_number VAR)
+    set(GIT_COMMIT_NUMBER_FOUND OFF)
+    if(GIT_FOUND)
+        execute_process(
+                COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
+                WORKING_DIRECTORY ${OpenVINOGenAI_SOURCE_DIR}
+                OUTPUT_VARIABLE GIT_COMMIT_NUMBER
+                RESULT_VARIABLE EXIT_CODE
+                OUTPUT_STRIP_TRAILING_WHITESPACE)
+        if(EXIT_CODE EQUAL 0)
+            set(GIT_COMMIT_NUMBER_FOUND ON)
+            set(${VAR} ${GIT_COMMIT_NUMBER} PARENT_SCOPE)
+        endif()
+    endif()
+    if(NOT GIT_COMMIT_NUMBER_FOUND)
+        # set zeros since git is not available
+        set(${VAR} "000" PARENT_SCOPE)
+    endif()
+endfunction()
+
+function(ov_genai_full_version full_version)
+    if(GIT_FOUND)
+        ov_genai_branch_name(GIT_BRANCH)
+        ov_genai_commit_hash(GIT_COMMIT_HASH)
+        ov_genai_commit_number(GIT_COMMIT_NUMBER)
+
+        if(NOT GIT_BRANCH MATCHES "^(master|HEAD)$")
+            set(GIT_BRANCH_POSTFIX "-${GIT_BRANCH}")
+        endif()
+
+        set(${full_version} "${OpenVINOGenAI_VERSION}-${GIT_COMMIT_NUMBER}-${GIT_COMMIT_HASH}${GIT_BRANCH_POSTFIX}" PARENT_SCOPE)
+    else()
+        set(${full_version} "${OpenVINOGenAI_VERSION}" PARENT_SCOPE)
+    endif()
+endfunction()
+
+ov_genai_full_version(OpenVINOGenAI_FULL_VERSION)
+message(STATUS "OpenVINO GenAI full version: ${OpenVINOGenAI_FULL_VERSION}")
diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index ff804cd85a..e954037daf 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -54,9 +54,18 @@ FetchContent_MakeAvailable(safetensors.h)
 
 ov_genai_build_jinja2cpp()
 
+# generate version files
+
+configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/version.hpp.in"
+               "${CMAKE_CURRENT_BINARY_DIR}/openvino/genai/version.hpp" @ONLY)
+
+configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/version.cpp.in"
+               "${CMAKE_CURRENT_BINARY_DIR}/version.cpp" @ONLY)
+
 # Library
 
 file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")
+list(APPEND SOURCE_FILES "${CMAKE_CURRENT_BINARY_DIR}/version.cpp")
 
 set(TARGET_NAME openvino_genai)
 
@@ -68,7 +77,9 @@ if(TARGET openvino_tokenizers)
 endif()
 
 target_include_directories(${TARGET_NAME}
-    PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>"
+    PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+           "$<INSTALL_INTERFACE:runtime/include>"
+           "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>"
     PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src")
 
 target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${safetensors.h_SOURCE_DIR}")
@@ -145,6 +156,9 @@ install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets
 
 install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
         DESTINATION runtime/include COMPONENT core_genai_dev)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/openvino/genai/version.hpp
+        DESTINATION runtime/include/openvino/genai COMPONENT core_genai_dev)
+
 install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake
         NAMESPACE openvino:: DESTINATION runtime/cmake
         COMPONENT core_genai_dev)
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index 75a2fd59a7..1293246260 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -34,9 +34,6 @@ file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py"
           "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/py_openvino_genai.pyi"
      DESTINATION "${CMAKE_BINARY_DIR}/openvino_genai/")
 
-configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/__version__.py.in"
-               "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" @ONLY)
-
 if(OpenVINODeveloperPackage_FOUND)
     # TODO: commit changes separately
     # ov_add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
@@ -69,18 +66,12 @@ endif()
 install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py"
               "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.pyi"
               "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/py_openvino_genai.pyi"
-              "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py"
         DESTINATION python/openvino_genai
         COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR})
 install(TARGETS ${TARGET_NAME}
         LIBRARY DESTINATION python/openvino_genai
         COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR})
 
-install(FILES "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py"
-        DESTINATION openvino_genai
-        COMPONENT wheel_genai
-        EXCLUDE_FROM_ALL)
-
 install(FILES "${OpenVINOGenAI_SOURCE_DIR}/LICENSE"
               "${OpenVINOGenAI_SOURCE_DIR}/third-party-programs.txt"
               "${OpenVINOGenAI_SOURCE_DIR}/SECURITY.md"
@@ -154,7 +145,8 @@ if(pybind11_stubgen_AVAILABLE)
     endif()
 
     set(stub_files_location "${OpenVINOGenAI_BINARY_DIR}/src/python")
-    set(generated_files ${stub_files_location}/openvino_genai/__init__.pyi
+    set(init_pyi_file "${stub_files_location}/openvino_genai/__init__.pyi")
+    set(generated_files ${init_pyi_file}
                         ${stub_files_location}/openvino_genai/py_openvino_genai.pyi)
     set_source_files_properties(${generated_files} PROPERTIES GENERATED ON)
 
@@ -184,6 +176,9 @@ if(pybind11_stubgen_AVAILABLE)
                                             "${CMAKE_BINARY_DIR}/openvino_genai/py_openvino_genai.pyi"
         COMMAND "${CMAKE_COMMAND}" -E env PYTHONPATH=${CMAKE_BINARY_DIR}:${openvino_pythonpath}:$ENV{PYTHONPATH}
                 ${pybind11_stubgen} --output-dir ${stub_files_location} openvino_genai
+        COMMAND "${CMAKE_COMMAND}"
+                -D init_pyi_file=${init_pyi_file}
+                -P "${CMAKE_CURRENT_SOURCE_DIR}/clean_version.cmake"
         ${validation_command}
         ${copy_to_source_command}
         COMMAND "${CMAKE_COMMAND}" -E copy ${generated_files} "${CMAKE_BINARY_DIR}/openvino_genai/"
@@ -192,6 +187,7 @@ if(pybind11_stubgen_AVAILABLE)
             ${python_sources}
             ${validation_dependencies}
             "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py"
+            "${CMAKE_CURRENT_SOURCE_DIR}/clean_version.cmake"
             "${CMAKE_CURRENT_SOURCE_DIR}/compare_pyi.cmake"
         COMMENT "[${pybind11_stubgen_dep}] Generate .pyi files"
         VERBATIM)
diff --git a/src/python/clean_version.cmake b/src/python/clean_version.cmake
new file mode 100644
index 0000000000..f02e293493
--- /dev/null
+++ b/src/python/clean_version.cmake
@@ -0,0 +1,21 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+foreach(var IN ITEMS init_pyi_file)
+    if(NOT DEFINED ${var})
+        message(FATAL_ERROR "Variable ${var} is not defined")
+    endif()
+endforeach()
+
+file(STRINGS ${init_pyi_file} file_lines)
+
+foreach(file_line IN LISTS file_lines)
+    if(file_line MATCHES "^__version__.*")
+        set(file_line "__version__: str")
+    endif()
+
+    set(file_content "${file_content}${file_line}\n")
+endforeach()
+
+file(WRITE ${init_pyi_file} ${file_content})
diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py
index a0b0faf58c..0ad7ba3f12 100644
--- a/src/python/openvino_genai/__init__.py
+++ b/src/python/openvino_genai/__init__.py
@@ -5,8 +5,6 @@
 
 import openvino  # add_dll_directory for openvino lib
 import os
-from .__version__ import __version__
-
 
 if hasattr(os, "add_dll_directory"):
     os.add_dll_directory(os.path.dirname(__file__))
@@ -17,8 +15,11 @@
     RawPerfMetrics,
     PerfMetrics,
     StreamerBase,
+    get_version,
 )
 
+__version__ = get_version()
+
 # VLM pipeline
 
 from .py_openvino_genai import (
diff --git a/src/python/openvino_genai/__init__.pyi b/src/python/openvino_genai/__init__.pyi
index 187e0a0a06..0a401ae958 100644
--- a/src/python/openvino_genai/__init__.pyi
+++ b/src/python/openvino_genai/__init__.pyi
@@ -42,7 +42,8 @@ from openvino_genai.py_openvino_genai import WhisperPerfMetrics
 from openvino_genai.py_openvino_genai import WhisperPipeline
 from openvino_genai.py_openvino_genai import WhisperRawPerfMetrics
 from openvino_genai.py_openvino_genai import draft_model
+from openvino_genai.py_openvino_genai import get_version
 import os as os
 from . import py_openvino_genai
-__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'openvino', 'os', 'py_openvino_genai']
-__version__: str = '2025.0.0.0'
+__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version', 'openvino', 'os', 'py_openvino_genai']
+__version__: str
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index d405cd9bbf..5adde32db4 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -5,7 +5,7 @@ from __future__ import annotations
 import openvino._pyopenvino
 import os
 import typing
-__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model']
+__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version']
 class Adapter:
     """
     Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier.
@@ -2204,3 +2204,7 @@ def draft_model(models_path: os.PathLike, device: str = '', **kwargs) -> openvin
     """
     device on which inference will be performed
     """
+def get_version() -> str:
+    """
+    OpenVINO GenAI version
+    """
diff --git a/src/python/py_openvino_genai.cpp b/src/python/py_openvino_genai.cpp
index 429f48f30d..f8e577d5c8 100644
--- a/src/python/py_openvino_genai.cpp
+++ b/src/python/py_openvino_genai.cpp
@@ -11,6 +11,7 @@
 #include <pybind11/typing.h>
 
 #include "openvino/genai/llm_pipeline.hpp"
+#include "openvino/genai/version.hpp"
 
 #include "py_utils.hpp"
 
@@ -21,6 +22,7 @@ using ov::genai::DecodedResults;
 using ov::genai::EncodedResults;
 using ov::genai::StreamerBase;
 using ov::genai::StringInputs;
+using ov::genai::get_version;
 
 void init_lora_adapter(py::module_& m);
 void init_perf_metrics(py::module_& m);
@@ -82,7 +84,12 @@ class ConstructableStreamer: public StreamerBase {
 PYBIND11_MODULE(py_openvino_genai, m) {
     m.doc() = "Pybind11 binding for OpenVINO GenAI library";
 
+    m.def("get_version", [] () -> py::str {
+        return get_version().buildNumber;
+    }, get_version().description);
+
     init_perf_metrics(m);
+
     py::class_<DecodedResults>(m, "DecodedResults", decoded_results_docstring)
         .def(py::init<>())
         .def_property_readonly("texts", [](const DecodedResults &dr) -> py::typing::List<py::str> { return pyutils::handle_utf8((std::vector<std::string>)dr); })

From 3e5c8895650c64d73a9b15f5597c09f1a6b78fd3 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 8 Jan 2025 18:09:14 +0400
Subject: [PATCH 101/110] Added information about LoRA support (#1504)

---
 .github/workflows/mac.yml                     |  2 +-
 .github/workflows/windows.yml                 |  2 +-
 README.md                                     |  2 +-
 ...SUPPORTED_MODELS.md => SUPPORTED_MODELS.md | 29 ++++++++++++++++++-
 samples/cpp/visual_language_chat/README.md    |  2 +-
 .../cpp/whisper_speech_recognition/README.md  |  2 +-
 .../whisper_speech_recognition/README.md      |  2 +-
 7 files changed, 34 insertions(+), 7 deletions(-)
 rename src/docs/SUPPORTED_MODELS.md => SUPPORTED_MODELS.md (95%)

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 5402b79e70..062b83fc27 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -17,7 +17,7 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.10'
-  OV_BRANCH: master
+  OV_BRANCH: 345163f87953fb0dd8dd590257eb7fc84378da8e
   OV_TARBALL: ''
 
 jobs:
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index e396671b2c..95a713d7a1 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -17,7 +17,7 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.11'
-  OV_BRANCH: master
+  OV_BRANCH: 345163f87953fb0dd8dd590257eb7fc84378da8e
   OV_TARBALL: ''
 
 jobs:
diff --git a/README.md b/README.md
index 9d4543bed4..c5cf799973 100644
--- a/README.md
+++ b/README.md
@@ -394,7 +394,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Automati
 
 ## Additional materials
 
-- [List of supported models](https://github.com/openvinotoolkit/openvino.genai/blob/master/src/docs/SUPPORTED_MODELS.md) (NOTE: models can work, but were not tried yet)
+- [List of supported models](https://github.com/openvinotoolkit/openvino.genai/blob/master/SUPPORTED_MODELS.md) (NOTE: models can work, but were not tried yet)
 - [OpenVINO Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html)
 - [Optimum-intel and OpenVINO](https://huggingface.co/docs/optimum/intel/openvino/export)
 
diff --git a/src/docs/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md
similarity index 95%
rename from src/docs/SUPPORTED_MODELS.md
rename to SUPPORTED_MODELS.md
index 44da29ced4..6b45f47890 100644
--- a/src/docs/SUPPORTED_MODELS.md
+++ b/SUPPORTED_MODELS.md
@@ -147,6 +147,8 @@
   </tbody>
 </table>
 
+> [!NOTE]
+> LoRA adapters are supported.
 
 The pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. The model is required to have the following inputs after the conversion:
 1. `input_ids` contains the tokens.
@@ -165,12 +167,14 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
       <th>Architecture</th>
       <th>Text 2 image</th>
       <th>Image 2 image</th>
+      <th>LoRA support</th>
       <th>Example HuggingFace Models</th>
     </tr>
     <tr>
       <td><code>Latent Consistency Model</code></td>
       <td>Supported</td>
       <td>Supported</td>
+      <td>Supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7"><code>SimianLuo/LCM_Dreamshaper_v7</code></a></li>
@@ -181,6 +185,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
       <td><code>Stable Diffusion</code></td>
       <td>Supported</td>
       <td>Supported</td>
+      <td>Supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/CompVis/stable-diffusion-v1-1"><code>CompVis/stable-diffusion-v1-1</code></a></li>
@@ -213,6 +218,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
       <td><code>Stable Diffusion XL</code></td>
       <td>Supported</td>
       <td>Supported</td>
+      <td>Supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9"><code>stabilityai/stable-diffusion-xl-base-0.9</code></a></li>
@@ -225,6 +231,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
       <td><code>Stable Diffusion 3</code></td>
       <td>Supported</td>
       <td>Not supported</td>
+      <td>Not supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers"><code>stabilityai/stable-diffusion-3-medium-diffusers</code></a></li>
@@ -237,6 +244,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
       <td><code>Flux</code></td>
       <td>Supported</td>
       <td>Not supported</td>
+      <td>Not supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/black-forest-labs/FLUX.1-schnell"><code>black-forest-labs/FLUX.1-schnell</code></a></li>
@@ -260,10 +268,12 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
   <tbody style="vertical-align: top;">
     <tr>
       <th>Architecture</th>
+      <th>LoRA support</th>
       <th>Example HuggingFace Models</th>
     </tr>
     <tr>
       <td><code>Stable Diffusion</code></td>
+      <td>Supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/stabilityai/stable-diffusion-2-inpainting"><code>stabilityai/stable-diffusion-2-inpainting</code></a></li>
@@ -275,13 +285,22 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
     </tr>
     <tr>
       <td><code>Stable Diffusion XL</code></td>
+      <td>Supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/diffusers/stable-diffusion-xl-1.0-inpainting-0.1"><code>diffusers/stable-diffusion-xl-1.0-inpainting-0.1</code></a></li>
         </ul>
       </td>
     </tr>
-    </tr>
+    <!-- <tr>
+      <td><code>FLUX</code></td>
+      <td>Not supported</td>
+      <td>
+        <ul>
+          <li><a href="https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev"><code>black-forest-labs/FLUX.1-Fill-dev</code></a></li>
+        </ul>
+      </td>
+    </tr> -->
   </tbody>
 </table>
 
@@ -292,11 +311,13 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
     <tr>
       <th>Architecture</th>
       <th>Models</th>
+      <th>LoRA support</th>
       <th>Example HuggingFace Models</th>
     </tr>
     <tr>
       <td><code>InternVL2</code></td>
       <td>InternVL2</td>
+      <td>Not supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/OpenGVLab/InternVL2-1B"><code>OpenGVLab/InternVL2-1B</code></a></li>
@@ -309,6 +330,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
     <tr>
       <td><code>LLaVA</code></td>
       <td>LLaVA-v1.5</td>
+      <td>Not supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/llava-hf/llava-1.5-7b-hf"><code>llava-hf/llava-1.5-7b-hf</code></a></li>
@@ -318,6 +340,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
     <tr>
       <td><code>LLaVA-NeXT</code></td>
       <td>LLaVa-v1.6</td>
+      <td>Not supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf"><code>llava-hf/llava-v1.6-mistral-7b-hf</code></a></li>
@@ -329,6 +352,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
     <tr>
       <td><code>MiniCPMV</code></td>
       <td>MiniCPM-V-2_6</td>
+      <td>Not supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/openbmb/MiniCPM-V-2_6"><code>openbmb/MiniCPM-V-2_6</code></a></li>
@@ -345,11 +369,13 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
     <tr>
       <th>Architecture</th>
       <th>Models</th>
+      <th>LoRA support</th>
       <th>Example HuggingFace Models</th>
     </tr>
     <tr>
       <td rowspan=2><code>WhisperForConditionalGeneration</code></td>
       <td>Whisper</td>
+      <td>Not supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/openai/whisper-tiny"><code>openai/whisper-tiny</code></a></li>
@@ -366,6 +392,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
     </tr>
     <tr>
       <td>Distil-Whisper</td>
+      <td>Not supported</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/distil-whisper/distil-small.en"><code>distil-whisper/distil-small.en</code></a></li>
diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md
index 39364d51ee..73baf0088a 100644
--- a/samples/cpp/visual_language_chat/README.md
+++ b/samples/cpp/visual_language_chat/README.md
@@ -29,7 +29,7 @@ Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/o
 
 Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model `llava-hf/llava-v1.6-mistral-7b-hf` can benefit from being run on a dGPU. Modify the source code to change the device for inference to the `GPU`.
 
-See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#visual-language-models) for the list of supported models.
+See [SUPPORTED_MODELS.md](../../../SUPPORTED_MODELS.md#visual-language-models) for the list of supported models.
 
 ## Run benchmark:
 
diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md
index d649266613..2ea3322dee 100644
--- a/samples/cpp/whisper_speech_recognition/README.md
+++ b/samples/cpp/whisper_speech_recognition/README.md
@@ -31,7 +31,7 @@ Output:
 timestamps: [0, 2] text:  How are you doing today?
 ```
 
-See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
+See [SUPPORTED_MODELS.md](../../../SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
 
 # Whisper pipeline usage
 
diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md
index aeb46444bf..5f373df2b7 100644
--- a/samples/python/whisper_speech_recognition/README.md
+++ b/samples/python/whisper_speech_recognition/README.md
@@ -38,7 +38,7 @@ Output:
 timestamps: [0, 2] text:  How are you doing today?
 ```
 
-See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
+See [SUPPORTED_MODELS.md](../../../SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
 
 # Whisper pipeline usage
 

From b353929fabdd4620f2c44b664ffbc0a474a88923 Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@intel.com>
Date: Thu, 9 Jan 2025 13:20:07 +0800
Subject: [PATCH 102/110] Update openvino tokenizers (#1506)

To test Llama3 fix:
https://github.com/openvinotoolkit/openvino_tokenizers/pull/357
---
 thirdparty/openvino_tokenizers | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers
index bcfd3eda25..d5f0abf827 160000
--- a/thirdparty/openvino_tokenizers
+++ b/thirdparty/openvino_tokenizers
@@ -1 +1 @@
-Subproject commit bcfd3eda25ae3ec423502a4074e35c774506c732
+Subproject commit d5f0abf8271f3cd8fc98d747b3e569fbeacca532

From ca0babefd952ac78bcb0008ced94beb380a73496 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 9 Jan 2025 09:21:27 +0400
Subject: [PATCH 103/110] Replaced chatglm2-6b with chatglm3-6b (#1505)

CVS-159975
---
 SUPPORTED_MODELS.md                       | 1 -
 tests/python_tests/models/real_models     | 1 -
 tests/python_tests/ov_genai_test_utils.py | 2 +-
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md
index 6b45f47890..79333fa45c 100644
--- a/SUPPORTED_MODELS.md
+++ b/SUPPORTED_MODELS.md
@@ -14,7 +14,6 @@
       <td>ChatGLM</td>
       <td>
         <ul>
-          <li><a href="https://huggingface.co/THUDM/chatglm2-6b"><code>THUDM/chatglm2-6b</code></a></li>
           <li><a href="https://huggingface.co/THUDM/chatglm3-6b"><code>THUDM/chatglm3-6b</code></a></li>
         </ul>
       </td>
diff --git a/tests/python_tests/models/real_models b/tests/python_tests/models/real_models
index 420f8f53b6..5fd8fe0500 100644
--- a/tests/python_tests/models/real_models
+++ b/tests/python_tests/models/real_models
@@ -27,7 +27,6 @@ Salesforce/codegen-350M-multi
 Salesforce/codegen-350M-nl
 Salesforce/codegen2-1b
 # Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable (https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32)
-THUDM/chatglm2-6b
 THUDM/chatglm3-6b
 TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ
 TinyLlama/TinyLlama-1.1B-Chat-v0.6
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 66fb58f46d..ff55c3c378 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -26,7 +26,7 @@ def get_models_list():
         "facebook/opt-125m",
         "microsoft/phi-1_5",
         "microsoft/phi-2",
-        "THUDM/chatglm2-6b",
+        "THUDM/chatglm3-6b",
         "Qwen/Qwen2-0.5B-Instruct",
         "Qwen/Qwen-7B-Chat",
         "Qwen/Qwen1.5-7B-Chat",

From 2c6d67e039a22e32bc43b53533c3f5b27929eea6 Mon Sep 17 00:00:00 2001
From: Alexander Suvorov <alexander.suvorov@intel.com>
Date: Thu, 9 Jan 2025 06:21:51 +0100
Subject: [PATCH 104/110] Whisper pipeline: refactor tests, disable
 `return_timestamps` check (#1496)

Ticket: 160055

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .github/workflows/windows.yml               |   6 +
 samples/export-requirements.txt             |   2 +-
 tests/python_tests/requirements.txt         |   2 +-
 tests/python_tests/test_whisper_pipeline.py | 434 ++++++++------------
 4 files changed, 169 insertions(+), 275 deletions(-)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 95a713d7a1..8f43af44ae 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -310,6 +310,12 @@ jobs:
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
           python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels
+          
+          # will install transformers 4.46.3 version
+          # transformers 4.46.3 will enable return_timestamps tests
+          # this check enabled for windows only. Ticket: 160205.
+          python -m pip install git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+          
           python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke"
 
   genai_python_lib_vlm:
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index 2f71891b7b..af38558656 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0  # For Qwen
 transformers_stream_generator==0.0.5  # For Qwen
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index e23eaacc21..c851c71ee5 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.32.1
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest
diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py
index aa78666e32..c046d1ae2c 100644
--- a/tests/python_tests/test_whisper_pipeline.py
+++ b/tests/python_tests/test_whisper_pipeline.py
@@ -11,11 +11,13 @@
 from optimum.intel.openvino import OVModelForSpeechSeq2Seq
 import gc
 import json
-import time
 import typing
 import numpy as np
 import os
 import pathlib
+import importlib.metadata as metadata
+from packaging.version import parse
+
 
 @pytest.fixture(scope="class", autouse=True)
 def run_gc_after_test():
@@ -27,36 +29,29 @@ def run_gc_after_test():
     gc.collect()
 
 
-def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False):
-    precommit_models = [
+def get_whisper_models_list(tiny_only=False):
+    model_ids = [
         "openai/whisper-tiny",
-        "openai/whisper-tiny.en",
         "distil-whisper/distil-small.en",
     ]
-    if multilingual:
-        precommit_models = ["openai/whisper-tiny"]
-    if en_only:
-        precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"]
-    if tiny_only:
-        precommit_models = ["openai/whisper-tiny"]
-
-    nightly_models = []
 
-    if pytest.run_marker == "precommit":
-        model_ids = precommit_models
-    else:
-        model_ids = nightly_models
+    if tiny_only:
+        model_ids = ["openai/whisper-tiny"]
 
     if pytest.selected_model_ids:
-        model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
+        model_ids = [
+            model_id
+            for model_id in model_ids
+            if model_id in pytest.selected_model_ids.split(" ")
+        ]
 
-    prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
-    return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
+    prefix = pathlib.Path(os.getenv("GENAI_MODELS_PATH_PREFIX", ""))
+    return [(model_id, prefix / model_id.split("/")[1]) for model_id in model_ids]
 
 
 # used whisper models are relatively small
 # cache them in memory to speedup tests
-@functools.lru_cache(3)
+@functools.lru_cache()
 def read_whisper_model(params, **tokenizer_kwargs):
     model_id, path = params
 
@@ -90,6 +85,7 @@ def read_whisper_model(params, **tokenizer_kwargs):
             model_id,
             export=True,
             trust_remote_code=True,
+            stateful=False,
             compile=False,
             device="CPU",
             load_in_8bit=False,
@@ -114,30 +110,39 @@ def read_whisper_model(params, **tokenizer_kwargs):
     )
 
 
-def compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id):
-    ds = datasets.load_dataset(dataset_id, "clean", split="validation")
-    opt_infer_time = 0
-    genai_infer_time = 0
-
-    for ds_row in ds:
-        audio_sample = ds_row["audio"]
+def run_huggingface(
+    pipeline,
+    sample,
+    config: ov_genai.WhisperGenerationConfig | None = None,
+):
+    if not config:
+        config = ov_genai.WhisperGenerationConfig()
+
+    return pipeline(
+        sample,
+        max_new_tokens=min(config.max_new_tokens, 444),
+        return_timestamps=config.return_timestamps,
+        generate_kwargs={"language": config.language, "task": config.task},
+    )
 
-        streamer_result = []
 
-        start = time.time()
-        genai_result = genai_pipe.generate(
-            audio_sample["array"].tolist(), streamer=lambda x: streamer_result.append(x)
-        )
-        genai_infer_time += time.time() - start
+def run_genai(
+    pipeline: ov_genai.WhisperPipeline,
+    sample,
+    config: ov_genai.WhisperGenerationConfig | None = None,
+    streamer: typing.Callable[[str], bool] | None = None,
+):
+    if not config:
+        config = ov_genai.WhisperGenerationConfig()
 
-        start = time.time()
-        result = opt_pipe(audio_sample)
-        opt_infer_time += time.time() - start
+    genai_config = pipeline.get_generation_config()
 
-        assert genai_result.texts[0] == result["text"]
-        assert "".join(streamer_result) == result["text"]
+    genai_config.max_new_tokens = config.max_new_tokens
+    genai_config.return_timestamps = config.return_timestamps
+    genai_config.task = config.task
+    genai_config.language = f"<|{config.language}|>" if config.language else None
 
-    print(f"Inference time\nOpt: {opt_infer_time}\nGenAI: {genai_infer_time}")
+    return pipeline.generate(sample, genai_config, streamer=streamer)
 
 
 def get_samples_from_dataset(
@@ -166,13 +171,50 @@ def get_samples_from_dataset(
     return [x["audio"]["array"] for x in ds]
 
 
-@pytest.mark.parametrize("model_descr", get_whisper_models_list())
-@pytest.mark.parametrize("dataset_id", ["hf-internal-testing/librispeech_asr_dummy"])
-@pytest.mark.precommit
-def test_whisper_on_hf_dataset(model_descr, dataset_id):
-    model_id, path, opt_pipe, genai_pipe = read_whisper_model(model_descr)
+def run_pipeline_with_ref(
+    model_id: str,
+    tmp_path: str,
+    sample: np.ndarray | list[np.ndarray],
+    generation_config: ov_genai.WhisperGenerationConfig | None = None,
+    streamer: typing.Callable[[str], bool] | None = None,
+):
+    _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path))
+
+    if type(sample) is np.ndarray and len(sample.shape) == 1:
+        sample = np.expand_dims(sample, 0)
+
+    for _sample in sample:
+        genai_result = run_genai(genai_pipe, _sample, generation_config, streamer)
+        hf_result = run_huggingface(hf_pipe, _sample, generation_config)
+
+        compare_results(hf_result, genai_result)
+
 
-    compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id)
+def compare_results(hf_result, genai_result):
+    assert genai_result.texts[0] == hf_result["text"]
+
+    # transformers 4.47 updated return_timestamps implementation
+    # remove once genai implementation aligned with transformers. Ticket 160205.
+    transformers_version_greater_4_47 = parse(
+        metadata.version("transformers")
+    ) >= parse("4.47.0")
+
+    if transformers_version_greater_4_47:
+        return
+
+    if "chunks" not in hf_result and genai_result.chunks is None:
+        return
+
+    assert len(genai_result.chunks) == len(hf_result["chunks"])
+
+    for opt_chunk, genai_chunk in zip(hf_result["chunks"], genai_result.chunks):
+        assert opt_chunk["text"] == genai_chunk.text
+        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
+        if opt_chunk["timestamp"][1]:
+            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
+        else:
+            assert opt_chunk["timestamp"][1] == None
+            assert round(genai_chunk.end_ts, 2) == -1.0
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@@ -182,16 +224,11 @@ def test_whisper_on_hf_dataset(model_descr, dataset_id):
 )
 @pytest.mark.precommit
 def test_smoke(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(test_sample)
-
-    genai_result = pipe.generate(test_sample)
-
-    assert genai_result.texts[0] == expected["text"]
-
-    assert "chunks" not in expected
-    assert genai_result.chunks == None
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=test_sample,
+    )
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@@ -259,79 +296,55 @@ def test_whisper_constructors(model_descr, test_sample):
 def test_max_new_tokens(model_descr, test_sample):
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
 
-    expected = opt_pipe(test_sample, max_new_tokens=10)["text"]
+    expected = opt_pipe(test_sample, max_new_tokens=10)
 
     genai_result = pipe.generate(test_sample, max_new_tokens=10)
 
-    assert genai_result.texts[0] == expected
-
-    genai_result = pipe.generate(test_sample)
-
-    assert genai_result.texts[0] != expected
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 10
     genai_result = pipe.generate(test_sample, config)
-    assert genai_result.texts[0] == expected
+    compare_results(expected, genai_result)
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="fr", length=3)
+    "test_samples",
+    [
+        (get_samples_from_dataset(language="fr", length=1), "fr"),
+        (get_samples_from_dataset(language="de", length=1), "de"),
+    ],
 )
 @pytest.mark.precommit
-def test_language_mode_fr(model_descr, test_sample):
-    model_id, path = model_descr
+def test_language_mode(model_descr, test_samples):
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
+    samples, language = test_samples
 
     expected = opt_pipe(
-        test_sample, max_new_tokens=30, generate_kwargs={"language": "fr"}
+        samples[0], max_new_tokens=30, generate_kwargs={"language": language}
     )
 
-    genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|fr|>")
-
-    assert genai_result.texts[0] == expected["text"]
-
-    config = pipe.get_generation_config()
-    config.max_new_tokens = 30
-    config.language = "<|fr|>"
-    genai_result = pipe.generate(test_sample, config)
-
-    assert genai_result.texts[0] == expected["text"]
-
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="de", length=3)
-)
-@pytest.mark.precommit
-def test_language_mode_de(model_descr, test_sample):
-    model_id, path = model_descr
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample, max_new_tokens=30, generate_kwargs={"language": "de"}
+    genai_result = pipe.generate(
+        samples[0], max_new_tokens=30, language=f"<|{language}|>"
     )
 
-    genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|de|>")
-
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 30
-    config.language = "<|de|>"
-    genai_result = pipe.generate(test_sample, config)
+    config.language = f"<|{language}|>"
+    genai_result = pipe.generate(samples[0], config)
 
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="fr", length=3)
+    "test_sample", get_samples_from_dataset(language="fr", length=1)
 )
 @pytest.mark.precommit
 def test_task_mode(model_descr, test_sample):
-    model_id, path = model_descr
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
 
     expected = opt_pipe(
@@ -344,7 +357,7 @@ def test_task_mode(model_descr, test_sample):
         test_sample, max_new_tokens=30, language="<|fr|>", task="translate"
     )
 
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 30
@@ -352,27 +365,7 @@ def test_task_mode(model_descr, test_sample):
     config.task = "translate"
     genai_result = pipe.generate(test_sample, config)
 
-    assert genai_result.texts[0] == expected["text"]
-
-    expected = opt_pipe(
-        test_sample,
-        max_new_tokens=30,
-        generate_kwargs={"language": "ru", "task": "translate"},
-    )
-
-    genai_result = pipe.generate(
-        test_sample, max_new_tokens=30, language="<|ru|>", task="translate"
-    )
-
-    assert genai_result.texts[0] == expected["text"]
-
-    config = pipe.get_generation_config()
-    config.max_new_tokens = 30
-    config.language = "<|ru|>"
-    config.task = "translate"
-    genai_result = pipe.generate(test_sample, config)
-
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     # seems to be equivalent to translate task
     expected = opt_pipe(
@@ -385,7 +378,7 @@ def test_task_mode(model_descr, test_sample):
         test_sample, max_new_tokens=30, language="<|en|>", task="transcribe"
     )
 
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 30
@@ -393,21 +386,20 @@ def test_task_mode(model_descr, test_sample):
     config.task = "transcribe"
     genai_result = pipe.generate(test_sample, config)
 
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
     "test_sample",
     [
-        *get_samples_from_dataset(language="fr", length=2),
-        *get_samples_from_dataset(language="de", length=2),
-        *get_samples_from_dataset(language="es", length=2),
+        *get_samples_from_dataset(language="fr", length=1),
+        *get_samples_from_dataset(language="de", length=1),
+        *get_samples_from_dataset(language="es", length=1),
     ],
 )
 @pytest.mark.precommit
 def test_language_autodetect(model_descr, test_sample):
-    model_id, path = model_descr
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
 
     input_features = opt_pipe.feature_extractor(test_sample)
@@ -415,188 +407,84 @@ def test_language_autodetect(model_descr, test_sample):
     # ensure detected language us not english
     assert language_id != pipe.get_generation_config().lang_to_id["<|en|>"]
 
-    expected = opt_pipe(
-        test_sample,
-        max_new_tokens=30,
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=test_sample,
+        generation_config=ov_genai.WhisperGenerationConfig(max_new_tokens=30),
     )
 
-    genai_result = pipe.generate(test_sample, max_new_tokens=30)
-
-    assert genai_result.texts[0] == expected["text"]
-
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-    ],
-)
+@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1))
 @pytest.mark.precommit
 def test_return_timestamps_short_form(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-    # long form audio not supported yet
-    test_sample = test_sample[: 16000 * 30]
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-    )
-
-    genai_result = pipe.generate(
-        test_sample.tolist(),
-        return_timestamps=True,
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=test_sample,
+        generation_config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
     )
 
-    assert genai_result.texts[0] == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
-
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-    ],
-)
+@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1))
 @pytest.mark.precommit
 def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-    # long form audio not supported yet
-    test_sample = test_sample[: 16000 * 30]
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-        max_new_tokens=15,
-        generate_kwargs={"language": "en"},
-    )
-
-    genai_result = pipe.generate(
-        test_sample.tolist(),
-        max_new_tokens=15,
-        return_timestamps=True,
-        language="<|en|>",
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=test_sample,
+        generation_config=ov_genai.WhisperGenerationConfig(
+            return_timestamps=True, language="en", max_new_tokens=30
+        ),
     )
 
-    assert genai_result.texts[0] == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
 
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
-
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(multilingual=True))
+@pytest.mark.parametrize("model_descr", get_whisper_models_list())
 @pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-        *get_samples_from_dataset(language="fr", length=10, long_form=True),
-    ],
+    "test_sample", get_samples_from_dataset(length=10, long_form=True)
 )
 @pytest.mark.precommit
-def test_longform_audio_return_timestamps_multilingual(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-    )
+def test_longform_audio(model_descr, test_sample):
+    _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr)
 
     streamer_result = []
 
-    genai_result = pipe.generate(
+    genai_result = run_genai(
+        genai_pipe,
         test_sample,
-        return_timestamps=True,
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
         streamer=lambda x: streamer_result.append(x),
     )
 
-    assert genai_result.texts[0] == expected["text"]
-    assert "".join(streamer_result) == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
-
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
-
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(en_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-    ],
-)
-@pytest.mark.precommit
-def test_longform_audio_return_timestamps_en(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-    )
-
-    streamer_result = []
-
-    genai_result = pipe.generate(
+    hf_result = run_huggingface(
+        hf_pipe,
         test_sample,
-        return_timestamps=True,
-        streamer=lambda x: streamer_result.append(x),
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
     )
 
-    assert genai_result.texts[0] == expected["text"]
-    assert "".join(streamer_result) == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
+    compare_results(hf_result, genai_result)
 
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
+    assert "".join(streamer_result) == hf_result["text"]
 
 
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=3, long_form=True),
-        *get_samples_from_dataset(language="sp", length=3, long_form=True),
-    ],
-)
+@pytest.mark.parametrize("model_descr", get_whisper_models_list())
 @pytest.mark.precommit
-def test_longform_audio(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(test_sample, return_timestamps=True)
-
-    genai_result = pipe.generate(test_sample)
+def test_shortform(model_descr):
+    samples = []
+    ds = datasets.load_dataset(
+        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+    )
 
-    assert genai_result.texts[0] == expected["text"]
+    for ds_row in ds:
+        samples.append(ds_row["audio"]["array"])
 
-    assert genai_result.chunks == None
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=samples,
+    )
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))

From 5a82b84a643578c3b534e76088aa0f3125cad31e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 9 Jan 2025 12:32:22 +0400
Subject: [PATCH 105/110] DOCS: unify package name usage across snippets in
 README.md (#1509)

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index c5cf799973..cea1e358bc 100644
--- a/README.md
+++ b/README.md
@@ -73,9 +73,9 @@ optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weigh
 ### Run generation using LLMPipeline API in Python
 
 ```python
-import openvino_genai as ov_genai
+import openvino_genai
 #Will run model on CPU, GPU or NPU are possible options
-pipe = ov_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0/", "CPU")
+pipe = openvino_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0/", "CPU")
 print(pipe.generate("The Sun is yellow because", max_new_tokens=100))
 ```
 
@@ -128,11 +128,11 @@ curl -O "https://storage.openvinotoolkit.org/test_data/images/dog.jpg"
 ```python
 import numpy as np
 import openvino as ov
-import openvino_genai as ov_genai
+import openvino_genai
 from PIL import Image
 
 # Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
-pipe = ov_genai.VLMPipeline("./InternVL2-1B", "CPU")
+pipe = openvino_genai.VLMPipeline("./InternVL2-1B", "CPU")
 pipe.start_chat()
 
 image = Image.open("dog.jpg")

From 2d5911b13b2bfab8a0433eaa38394bb9d064680f Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 9 Jan 2025 12:33:27 +0400
Subject: [PATCH 106/110] GHA: use latest OpenVINO master (#1511)

Fix to PA has been merged
https://github.com/openvinotoolkit/openvino/pull/28332
---
 .github/workflows/genai-tools.yml              | 2 +-
 .github/workflows/linux.yml                    | 2 +-
 .github/workflows/mac.yml                      | 2 +-
 .github/workflows/stable_diffusion_1_5_cpp.yml | 4 ++--
 .github/workflows/windows.yml                  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/genai-tools.yml b/.github/workflows/genai-tools.yml
index bd6cb46362..333bee3e11 100644
--- a/.github/workflows/genai-tools.yml
+++ b/.github/workflows/genai-tools.yml
@@ -44,7 +44,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
+        revision: latest_available_commit
 
   llm_bench:
     name: 'LLM bench tests'
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 0d7a5b7bae..0a991e2a54 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -52,7 +52,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
+        revision: latest_available_commit
 
     - name: Clone docker tag from OpenVINO repo
       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 062b83fc27..7cb0ff98d3 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -17,7 +17,7 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.10'
-  OV_BRANCH: 345163f87953fb0dd8dd590257eb7fc84378da8e
+  OV_BRANCH: 'master'
   OV_TARBALL: ''
 
 jobs:
diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml
index 3b01697f26..e0bf5371b3 100644
--- a/.github/workflows/stable_diffusion_1_5_cpp.yml
+++ b/.github/workflows/stable_diffusion_1_5_cpp.yml
@@ -45,7 +45,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
+        revision: latest_available_commit
 
   openvino_download_windows:
     name: Download OpenVINO for Windows
@@ -71,7 +71,7 @@ jobs:
       with:
         platform: windows
         commit_packages_to_provide: wheels
-        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
+        revision: latest_available_commit
 
   stable_diffusion_1_5_cpp-linux:
     runs-on: ubuntu-22.04-8-cores
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 8f43af44ae..e65972110b 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -17,7 +17,7 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.11'
-  OV_BRANCH: 345163f87953fb0dd8dd590257eb7fc84378da8e
+  OV_BRANCH: 'master'
   OV_TARBALL: ''
 
 jobs:

From 7ef754c88e13f2970272628d59c9202e773ce5f1 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 9 Jan 2025 12:41:05 +0400
Subject: [PATCH 107/110] [GHA] Increase timeout for
 cpp-multinomial-greedy_causal_lm-ubuntu (#1510)

See
https://github.com/openvinotoolkit/openvino.genai/actions/runs/12676190622/job/35328859923?pr=1507

It fails from time to time by timeout.
Let's increase it a bit to check whether it will make GHA CI more stable

---------

Co-authored-by: Vladimir Zlobin <vladimir.zlobin@intel.com>
---
 .github/workflows/causal_lm_cpp.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index fb0c9c4b0b..b6abbefac0 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -53,17 +53,17 @@ jobs:
           wget https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true -O adapter_model.safetensors
       - run: >
           . ./ov/setupvars.sh
-          && timeout 25s ./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./open_llama_3b_v2/ a
+          && timeout 35s ./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./open_llama_3b_v2/ a
         env:
           PYTHONPATH: "./build"
       - run: >
           . ./ov/setupvars.sh
-          && timeout 25s ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b
+          && timeout 35s ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b
         env:
           PYTHONPATH: "./build"
       - run: >
           . ./ov/setupvars.sh
-          && timeout 25s ./build/samples/cpp/text_generation/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
+          && timeout 35s ./build/samples/cpp/text_generation/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
           | diff <(timeout 25s samples/python/text_generation/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") -
         env:
           PYTHONPATH: "./build"

From 0f543f453155c5911e6e82e07eb03e7678ce8aab Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 9 Jan 2025 15:01:57 +0400
Subject: [PATCH 108/110] [Image generation] added progress bar to C++ samples
 (#1512)

CVS-160052

Example:

![{EEC8BB42-3AD2-4362-B82D-8CFEF0BC11A0}](https://github.com/user-attachments/assets/1302f63f-2d65-475f-a31f-5d7ae427273e)
---
 samples/cpp/image_generation/CMakeLists.txt   | 25 +++++++++++++------
 .../heterogeneous_stable_diffusion.cpp        |  7 ++++--
 samples/cpp/image_generation/image2image.cpp  |  6 +++--
 samples/cpp/image_generation/inpainting.cpp   |  5 ++--
 .../cpp/image_generation/lora_text2image.cpp  |  7 ++++--
 samples/cpp/image_generation/progress_bar.hpp | 24 ++++++++++++++++++
 samples/cpp/image_generation/text2image.cpp   |  4 ++-
 7 files changed, 62 insertions(+), 16 deletions(-)
 create mode 100644 samples/cpp/image_generation/progress_bar.hpp

diff --git a/samples/cpp/image_generation/CMakeLists.txt b/samples/cpp/image_generation/CMakeLists.txt
index 004b305088..f3e4860ce0 100644
--- a/samples/cpp/image_generation/CMakeLists.txt
+++ b/samples/cpp/image_generation/CMakeLists.txt
@@ -11,12 +11,23 @@ find_package(OpenVINOGenAI REQUIRED
 file(DOWNLOAD https://raw.githubusercontent.com/nothings/stb/f75e8d1cad7d90d72ef7a4661f1b994ef78b4e31/stb_image.h ${CMAKE_BINARY_DIR}/stb_image.h
      EXPECTED_HASH MD5=27932e6fb3a2f26aee2fc33f2cb4e696)
 
+include(FetchContent)
+
+if(POLICY CMP0135)
+    cmake_policy(SET CMP0135 NEW)
+endif()
+
+FetchContent_Declare(indicators
+    URL https://github.com/p-ranav/indicators/archive/refs/tags/v2.3.tar.gz
+    URL_HASH SHA256=70da7a693ff7a6a283850ab6d62acf628eea17d386488af8918576d0760aef7b)
+FetchContent_MakeAvailable(indicators)
+
 # create main sample executable
 
 add_executable(text2image text2image.cpp imwrite.cpp)
 
 target_include_directories(text2image PRIVATE ${CMAKE_BINARY_DIR} "${CMAKE_CURRENT_SOURCE_DIR}")
-target_link_libraries(text2image PRIVATE openvino::genai)
+target_link_libraries(text2image PRIVATE openvino::genai indicators::indicators)
 
 set_target_properties(text2image PROPERTIES
     COMPILE_PDB_NAME text2image
@@ -33,7 +44,7 @@ install(TARGETS text2image
 add_executable(lora_text2image lora_text2image.cpp imwrite.cpp)
 
 target_include_directories(lora_text2image PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
-target_link_libraries(lora_text2image PRIVATE openvino::genai)
+target_link_libraries(lora_text2image PRIVATE openvino::genai indicators::indicators)
 
 set_target_properties(lora_text2image PROPERTIES
     COMPILE_PDB_NAME lora_text2image
@@ -52,7 +63,7 @@ add_executable(heterogeneous_stable_diffusion
     imwrite.cpp)
 
 target_include_directories(heterogeneous_stable_diffusion PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
-target_link_libraries(heterogeneous_stable_diffusion PRIVATE openvino::genai)
+target_link_libraries(heterogeneous_stable_diffusion PRIVATE openvino::genai indicators::indicators)
 
 set_target_properties(heterogeneous_stable_diffusion PROPERTIES
     COMPILE_PDB_NAME heterogeneous_stable_diffusion
@@ -69,7 +80,7 @@ install(TARGETS heterogeneous_stable_diffusion
 add_executable(image2image image2image.cpp load_image.cpp imwrite.cpp)
 
 target_include_directories(image2image PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}")
-target_link_libraries(image2image PRIVATE openvino::genai)
+target_link_libraries(image2image PRIVATE openvino::genai indicators::indicators)
 
 set_target_properties(image2image PROPERTIES
     COMPILE_PDB_NAME image2image
@@ -80,13 +91,13 @@ install(TARGETS image2image
         RUNTIME DESTINATION samples_bin/
         COMPONENT samples_bin
         EXCLUDE_FROM_ALL)
-        
-# create LoRA sample executable
+
+# create inpainting executable
 
 add_executable(inpainting inpainting.cpp load_image.cpp imwrite.cpp)
 
 target_include_directories(inpainting PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}")
-target_link_libraries(inpainting PRIVATE openvino::genai)
+target_link_libraries(inpainting PRIVATE openvino::genai indicators::indicators)
 
 set_target_properties(inpainting PROPERTIES
     COMPILE_PDB_NAME inpainting
diff --git a/samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp b/samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp
index 8203c37345..1bba41ffc5 100644
--- a/samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp
+++ b/samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp
@@ -1,9 +1,11 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#include "imwrite.hpp"
 #include "openvino/genai/image_generation/text2image_pipeline.hpp"
 
+#include "imwrite.hpp"
+#include "progress_bar.hpp"
+
 int32_t main(int32_t argc, char* argv[]) try {
     OPENVINO_ASSERT(argc >= 3 && argc <= 6,
                     "Usage: ",
@@ -102,7 +104,8 @@ int32_t main(int32_t argc, char* argv[]) try {
                                          ov::genai::width(width),
                                          ov::genai::height(height),
                                          ov::genai::guidance_scale(guidance_scale),
-                                         ov::genai::num_inference_steps(number_of_inference_steps_per_image));
+                                         ov::genai::num_inference_steps(number_of_inference_steps_per_image),
+                                         ov::genai::callback(progress_bar));
 
         imwrite("image_" + std::to_string(imagei) + ".bmp", image, true);
     }
diff --git a/samples/cpp/image_generation/image2image.cpp b/samples/cpp/image_generation/image2image.cpp
index c071b88362..2e1e5f57ba 100644
--- a/samples/cpp/image_generation/image2image.cpp
+++ b/samples/cpp/image_generation/image2image.cpp
@@ -3,8 +3,9 @@
 
 #include "openvino/genai/image_generation/image2image_pipeline.hpp"
 
-#include "load_image.hpp"
 #include "imwrite.hpp"
+#include "load_image.hpp"
+#include "progress_bar.hpp"
 
 int32_t main(int32_t argc, char* argv[]) try {
     OPENVINO_ASSERT(argc == 4, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>' <IMAGE>");
@@ -17,7 +18,8 @@ int32_t main(int32_t argc, char* argv[]) try {
     ov::genai::Image2ImagePipeline pipe(models_path, device);
     ov::Tensor generated_image = pipe.generate(prompt, image,
         // controls how initial image is noised after being converted to latent space. `1` means initial image is fully noised
-        ov::genai::strength(0.8f));
+        ov::genai::strength(0.8f),
+        ov::genai::callback(progress_bar));
 
     // writes `num_images_per_prompt` images by pattern name
     imwrite("image_%d.bmp", generated_image, true);
diff --git a/samples/cpp/image_generation/inpainting.cpp b/samples/cpp/image_generation/inpainting.cpp
index 4c7a758450..a446035e0f 100644
--- a/samples/cpp/image_generation/inpainting.cpp
+++ b/samples/cpp/image_generation/inpainting.cpp
@@ -3,8 +3,9 @@
 
 #include "openvino/genai/image_generation/inpainting_pipeline.hpp"
 
-#include "load_image.hpp"
 #include "imwrite.hpp"
+#include "load_image.hpp"
+#include "progress_bar.hpp"
 
 int32_t main(int32_t argc, char* argv[]) try {
     OPENVINO_ASSERT(argc == 5, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>' <IMAGE> <MASK_IMAGE>");
@@ -16,7 +17,7 @@ int32_t main(int32_t argc, char* argv[]) try {
     ov::Tensor mask_image = utils::load_image(mask_image_path);
 
     ov::genai::InpaintingPipeline pipe(models_path, device);
-    ov::Tensor generated_image = pipe.generate(prompt, image, mask_image);
+    ov::Tensor generated_image = pipe.generate(prompt, image, mask_image, ov::genai::callback(progress_bar));
 
     // writes `num_images_per_prompt` images by pattern name
     imwrite("image_%d.bmp", generated_image, true);
diff --git a/samples/cpp/image_generation/lora_text2image.cpp b/samples/cpp/image_generation/lora_text2image.cpp
index c1e6461db9..af042a2c89 100644
--- a/samples/cpp/image_generation/lora_text2image.cpp
+++ b/samples/cpp/image_generation/lora_text2image.cpp
@@ -4,6 +4,7 @@
 #include "openvino/genai/image_generation/text2image_pipeline.hpp"
 
 #include "imwrite.hpp"
+#include "progress_bar.hpp"
 
 int32_t main(int32_t argc, char* argv[]) try {
     OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>' [<LORA_SAFETENSORS> <ALPHA> ...]]");
@@ -27,7 +28,8 @@ int32_t main(int32_t argc, char* argv[]) try {
         ov::genai::width(512),
         ov::genai::height(896),
         ov::genai::num_inference_steps(20),
-        ov::genai::rng_seed(42));
+        ov::genai::rng_seed(42),
+        ov::genai::callback(progress_bar));
     imwrite("lora.bmp", image, true);
 
     std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n";
@@ -36,7 +38,8 @@ int32_t main(int32_t argc, char* argv[]) try {
         ov::genai::width(512),
         ov::genai::height(896),
         ov::genai::num_inference_steps(20),
-        ov::genai::rng_seed(42));
+        ov::genai::rng_seed(42),
+        ov::genai::callback(progress_bar));
     imwrite("baseline.bmp", image, true);
 
     return EXIT_SUCCESS;
diff --git a/samples/cpp/image_generation/progress_bar.hpp b/samples/cpp/image_generation/progress_bar.hpp
new file mode 100644
index 0000000000..cc0bf6ffaf
--- /dev/null
+++ b/samples/cpp/image_generation/progress_bar.hpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2023-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "indicators/progress_bar.hpp"
+
+bool progress_bar(size_t step, size_t num_steps, ov::Tensor& /* latent */) {
+    using namespace indicators;
+
+    static ProgressBar bar{
+        option::BarWidth{50},
+        option::ForegroundColor{Color::green},
+        option::FontStyles{std::vector<FontStyle>{FontStyle::bold}},
+        option::ShowElapsedTime{true},
+        option::ShowRemainingTime{true},
+    };
+
+    std::stringstream stream;
+    stream << "Image generation step " << (step + 1) << " / " << num_steps;
+
+    bar.set_option(option::PostfixText{stream.str()});
+    bar.set_progress((100 * (step + 1)) / num_steps);
+
+    return false;
+}
diff --git a/samples/cpp/image_generation/text2image.cpp b/samples/cpp/image_generation/text2image.cpp
index 6a97b3a074..5668259f90 100644
--- a/samples/cpp/image_generation/text2image.cpp
+++ b/samples/cpp/image_generation/text2image.cpp
@@ -4,6 +4,7 @@
 #include "openvino/genai/image_generation/text2image_pipeline.hpp"
 
 #include "imwrite.hpp"
+#include "progress_bar.hpp"
 
 int32_t main(int32_t argc, char* argv[]) try {
     OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>'");
@@ -16,7 +17,8 @@ int32_t main(int32_t argc, char* argv[]) try {
         ov::genai::width(512),
         ov::genai::height(512),
         ov::genai::num_inference_steps(20),
-        ov::genai::num_images_per_prompt(1));
+        ov::genai::num_images_per_prompt(1),
+        ov::genai::callback(progress_bar));
 
     // writes `num_images_per_prompt` images by pattern name
     imwrite("image_%d.bmp", image, true);

From 3b0ecb41d31ead159f23120e5553b277ae6bee8f Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 9 Jan 2025 16:34:57 +0400
Subject: [PATCH 109/110] Fixed copy-pasted error message (#1514)

---
 src/cpp/src/image_generation/image2image_pipeline.cpp | 4 ++--
 src/cpp/src/image_generation/inpainting_pipeline.cpp  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/cpp/src/image_generation/image2image_pipeline.cpp b/src/cpp/src/image_generation/image2image_pipeline.cpp
index 38ff5a0a4c..90c6e9fae4 100644
--- a/src/cpp/src/image_generation/image2image_pipeline.cpp
+++ b/src/cpp/src/image_generation/image2image_pipeline.cpp
@@ -23,7 +23,7 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir)
     } else if (class_name == "StableDiffusionXLPipeline") {
         m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir);
     } else {
-        OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'");
+        OPENVINO_THROW("Unsupported image to image generation pipeline '", class_name, "'");
     }
 }
 
@@ -35,7 +35,7 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir,
     } else if (class_name == "StableDiffusionXLPipeline") {
         m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties);
     } else {
-        OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'");
+        OPENVINO_THROW("Unsupported image to image generation pipeline '", class_name, "'");
     }
 }
 
diff --git a/src/cpp/src/image_generation/inpainting_pipeline.cpp b/src/cpp/src/image_generation/inpainting_pipeline.cpp
index a510be0a57..a9179f5fd0 100644
--- a/src/cpp/src/image_generation/inpainting_pipeline.cpp
+++ b/src/cpp/src/image_generation/inpainting_pipeline.cpp
@@ -26,7 +26,7 @@ InpaintingPipeline::InpaintingPipeline(const std::filesystem::path& root_dir) {
     } else if (class_name == "StableDiffusionXLPipeline" || class_name == "StableDiffusionXLInpaintPipeline") {
         m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::INPAINTING, root_dir);
     } else {
-        OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'");
+        OPENVINO_THROW("Unsupported inpainting pipeline '", class_name, "'");
     }
 }
 
@@ -40,7 +40,7 @@ InpaintingPipeline::InpaintingPipeline(const std::filesystem::path& root_dir, co
     } else if (class_name == "StableDiffusionXLPipeline" || class_name == "StableDiffusionXLInpaintPipeline") {
         m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::INPAINTING, root_dir, device, properties);
     } else {
-        OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'");
+        OPENVINO_THROW("Unsupported inpainting pipeline '", class_name, "'");
     }
 }
 

From f1802f58c02545c7891d7c5ea239a451d4d53065 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 9 Jan 2025 21:13:41 +0400
Subject: [PATCH 110/110] Fixed SEGFAULT when empty encoded inputs are passed
 to LLM (#1513)

---
 src/cpp/src/sequence_group.hpp          | 9 ++++++---
 tests/cpp/block_manager.cpp             | 2 +-
 tests/python_tests/test_llm_pipeline.py | 8 ++++++++
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp
index b6bcc83530..c423675e64 100644
--- a/src/cpp/src/sequence_group.hpp
+++ b/src/cpp/src/sequence_group.hpp
@@ -243,9 +243,12 @@ class SequenceGroup  : public std::enable_shared_from_this<SequenceGroup> {
 
     SequenceGroup(uint64_t request_id, const ov::Tensor input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size)
         : SequenceGroup(request_id, sampling_params, block_size) {
-        m_prompt_ids.resize(input_ids.get_size());
-        std::copy_n(input_ids.data<int64_t>(), input_ids.get_size(), m_prompt_ids.begin());
-        m_prompt_log_probs.reserve(m_prompt_ids.size());
+        size_t prompt_len = input_ids.get_size();
+        OPENVINO_ASSERT(prompt_len > 0, "Prompt length cannot be 0");
+
+        m_prompt_ids.resize(prompt_len);
+        std::copy_n(input_ids.data<int64_t>(), prompt_len, m_prompt_ids.begin());
+        m_prompt_log_probs.reserve(prompt_len);
 
         // create a single sequence
         add_sequence(Sequence::create(m_next_sequence_id++));
diff --git a/tests/cpp/block_manager.cpp b/tests/cpp/block_manager.cpp
index 46c2fdddd7..670a0dffe7 100644
--- a/tests/cpp/block_manager.cpp
+++ b/tests/cpp/block_manager.cpp
@@ -10,7 +10,7 @@
 
 TEST(TestBlockManager, general_test) {
     ov::genai::BlockManager bm = ov::genai::BlockManager(6, false, 4);
-    ov::genai::TokenIds prompt_ids;
+    ov::genai::TokenIds prompt_ids = {10, 0};
 
     ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared<ov::genai::SequenceGroup>(
         0,
diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py
index 5278f4424f..031c42a1dc 100644
--- a/tests/python_tests/test_llm_pipeline.py
+++ b/tests/python_tests/test_llm_pipeline.py
@@ -97,6 +97,14 @@ def test_batch_size_switch():
     ov_pipe.generate(["1", "2"], max_new_tokens=2)
     ov_pipe.generate(["a"], max_new_tokens=2)
 
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_empty_encoded_inputs_throw():
+    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
+    with pytest.raises(RuntimeError):
+        ov_pipe.generate(ov.Tensor(np.array([[]], dtype=np.int64)), max_new_tokens=2)
+
 #
 # Chat scenario
 #