diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 3d5697e4e3..962147945c 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -121,6 +121,7 @@ ov::genai::LLMPipeline::LLMPipeline(
 #ifdef OPENVINO_ARCH_X86_64
     SchedulerConfig default_config;
     default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+    default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
     m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
 #endif
@@ -162,6 +163,7 @@ ov::genai::LLMPipeline::LLMPipeline(
 #ifdef OPENVINO_ARCH_X86_64
     SchedulerConfig default_config;
     default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+    default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
     m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
 #endif
@@ -228,6 +230,7 @@ ov::genai::LLMPipeline::LLMPipeline(
 #ifdef OPENVINO_ARCH_X86_64
     SchedulerConfig default_config;
     default_config.max_num_batched_tokens = std::numeric_limits<std::size_t>::max(); // don't limit total batch size
+    default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
     m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer, default_config, device, properties, generation_config);
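
For context on the flag this patch flips: `enable_prefix_caching` lets the continuous-batching scheduler retain KV-cache blocks from earlier requests and reuse them when a new prompt shares a prefix with one already processed. In a chat scenario each turn's prompt repeats the full accumulated history, so reusing those blocks means only the new tokens need prefill, which is where the TTFT improvement comes from. Below is a minimal sketch of a caller setting the same knob explicitly rather than relying on these new defaults, using the public `ContinuousBatchingPipeline` API as I understand it; the `"model_dir"` path and the `cache_size` value are placeholder assumptions, not part of this patch:

```cpp
#include <iostream>
#include <string>
#include <vector>

#include "openvino/genai/continuous_batching_pipeline.hpp"

int main() {
    // The same knobs this patch sets as defaults, spelled out by the caller.
    ov::genai::SchedulerConfig scheduler_config;
    scheduler_config.enable_prefix_caching = true; // reuse KV-cache blocks across requests sharing a prompt prefix
    scheduler_config.cache_size = 8;               // KV-cache budget in GB; 8 is an arbitrary example value

    // "model_dir" is a placeholder path to an OpenVINO-exported model.
    ov::genai::ContinuousBatchingPipeline pipe("model_dir", scheduler_config, "CPU");

    // On a second turn of a chat, the history portion of this prompt would
    // hit the prefix cache and skip recomputation.
    auto results = pipe.generate(
        std::vector<std::string>{"Hello, how are you?"},
        std::vector<ov::genai::GenerationConfig>{ov::genai::greedy()});

    for (const auto& res : results)
        for (const auto& text : res.m_generation_ids)
            std::cout << text << '\n';
    return 0;
}
```

Note the trade-off: prefix caching spends KV-cache memory (bounded by `cache_size`) to save prompt-processing time, which is why it pairs naturally with the uncapped `max_num_batched_tokens` default already set in these constructors.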