Automatically apply chat template in non-chat scenarios
sbalandi committed Jan 17, 2025

1 parent 1b3c68d · commit cd59e5b
Showing 6 changed files with 66 additions and 7 deletions.
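The C++ changes below share one pattern: wrap the raw prompt as a single-turn user message, apply the tokenizer's chat template, and encode the templated text without adding special tokens again; if no chat template is available, fall back to encoding the raw prompt as before (the VLM inputs embedder falls back to a predefined fallback template instead). A condensed sketch of the common pattern; the helper function and its name are illustrative, not part of the commit:

```cpp
#include <string>

#include "openvino/genai/tokenizer.hpp"

// Illustrative helper (not part of the commit): encode a non-chat prompt the way
// the updated pipelines below do.
ov::Tensor encode_non_chat_prompt(ov::genai::Tokenizer& tokenizer, const std::string& prompt) {
    try {
        // Wrap the raw prompt as a single-turn user message and apply the model's chat template.
        ov::genai::ChatHistory history({{{"role", "user"}, {"content", prompt}}});
        constexpr bool add_generation_prompt = true;
        auto templated_prompt = tokenizer.apply_chat_template(history, add_generation_prompt);
        // The template already inserts the special tokens, so do not add them again here.
        return tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
    } catch (const std::exception&) {
        // No chat_template in tokenizer_config.json (or none set): encode the raw prompt as before.
        return tokenizer.encode(prompt).input_ids;
    }
}
```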
README.md (1 change: 0 additions & 1 deletion)
@@ -133,7 +133,6 @@ from PIL import Image

# Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
pipe = openvino_genai.VLMPipeline("./InternVL2-1B", "CPU")
-pipe.start_chat()

image = Image.open("dog.jpg")
image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
src/README.md (2 changes: 2 additions & 0 deletions)
@@ -73,6 +73,8 @@ output:
'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in'
```
+>**Note**: The `chat_template` from `tokenizer_config.json` is applied to the prompt automatically at the generation stage. To disable it, call `pipe.get_tokenizer().set_chat_template("")`.
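As an illustration of the note above, a minimal C++ sketch; the model directory is hypothetical, and the set_chat_template() call is assumed to be the C++ counterpart of the Python call in the note:

```cpp
#include <iostream>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Hypothetical model directory exported for OpenVINO GenAI.
    ov::genai::LLMPipeline pipe("./TinyLlama-1.1B-Chat-v1.0", "CPU");

    // The chat_template from tokenizer_config.json is applied automatically,
    // even without start_chat()/finish_chat().
    std::cout << pipe.generate("What is OpenVINO?", ov::genai::max_new_tokens(64)) << '\n';

    // To send the raw prompt instead, clear the template (assumed C++ counterpart
    // of pipe.get_tokenizer().set_chat_template("") in Python).
    pipe.get_tokenizer().set_chat_template("");
    std::cout << pipe.generate("What is OpenVINO?", ov::genai::max_new_tokens(64)) << '\n';
    return 0;
}
```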
A simple chat in Python:
```python
import openvino_genai as ov_genai
src/cpp/src/icontinuous_batching.cpp (12 changes: 11 additions & 1 deletion)
@@ -55,7 +55,17 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
    timer.start();
    for (const std::string& prompt : prompts) {
        const auto encode_start = std::chrono::steady_clock::now();
-       input_ids.push_back(m_tokenizer.encode(prompt).input_ids);
+       ov::Tensor encoded_inputs;
+       try {
+           ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+           constexpr bool add_generation_prompt = true;
+           auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+           encoded_inputs = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
+       } catch (const std::exception& error) {
+           // in case the chat_template was not found in tokenizer_config.json or was not set
+           encoded_inputs = m_tokenizer.encode(prompt).input_ids;
+       }
+       input_ids.push_back(encoded_inputs);
        tokenization_durations.emplace_back(PerfMetrics::get_microsec(std::chrono::steady_clock::now() - encode_start));
    }
    timer.end();
src/cpp/src/llm_pipeline_stateful.cpp (25 changes: 23 additions & 2 deletions)
@@ -88,7 +88,19 @@ DecodedResults StatefulLLMPipeline::generate(

    if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
        OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
-       encoded_input = m_tokenizer.encode(*input_vector);
+       std::vector<std::string> templated_input_vector;
+       for (auto& input : *input_vector) {
+           try {
+               ChatHistory history({{{"role", "user"}, {"content", input}}});
+               constexpr bool add_generation_prompt = true;
+               auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+               templated_input_vector.push_back(templated_prompt);
+           } catch (const std::exception& error) {
+               // in case the chat_template was not found in tokenizer_config.json or was not set
+               templated_input_vector.push_back(input);
+           }
+       }
+       encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false));
    } else if (auto input_prompt = std::get_if<std::string>(&inputs)) {
        std::string& prompt = *input_prompt;

@@ -157,7 +169,16 @@ DecodedResults StatefulLLMPipeline::generate(

            // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
        } else {
-           encoded_input = m_tokenizer.encode(prompt);
+           std::string& prompt = *input_prompt;
+           try {
+               ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+               constexpr bool add_generation_prompt = true;
+               auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+               encoded_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
+           } catch (const std::exception& error) {
+               // in case the chat_template was not found in tokenizer_config.json or was not set
+               encoded_input = m_tokenizer.encode(prompt);
+           }
        }
    }

src/cpp/src/llm_pipeline_static.cpp (20 changes: 18 additions & 2 deletions)
@@ -805,7 +805,15 @@ DecodedResults StatefulLLMPipeline::generate(
        // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
        tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
    } else {
-       tokenized_input = m_tokenizer.encode(prompt);
+       try {
+           ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+           constexpr bool add_generation_prompt = true;
+           auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+           tokenized_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
+       } catch (const std::exception& error) {
+           // in case the chat_template was not found in tokenizer_config.json or was not set
+           tokenized_input = m_tokenizer.encode(prompt);
+       }
    }

auto encode_stop_time = std::chrono::steady_clock::now();
@@ -1273,7 +1281,15 @@ DecodedResults StatelessLLMPipeline::generate(
        // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
        tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
    } else {
-       tokenized_input = m_tokenizer.encode(prompt);
+       try {
+           ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+           constexpr bool add_generation_prompt = true;
+           auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+           tokenized_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
+       } catch (const std::exception& error) {
+           // in case the chat_template was not found in tokenizer_config.json or was not set
+           tokenized_input = m_tokenizer.encode(prompt);
+       }
    }

auto encode_stop_time = std::chrono::steady_clock::now();
src/cpp/src/visual_language/inputs_embedder.cpp (13 changes: 12 additions & 1 deletion)
@@ -223,8 +223,19 @@ class InputsEmbedder::IInputsEmbedder {
            m_tokenized_history.clear();
            std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
        } else {
+           std::string templated_prompt;
+           ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+           constexpr bool add_generation_prompt = true;
+
+           try {
+               templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+           } catch (const std::exception& error) {
+               // Use the fallback chat template if none was found in tokenizer_config.json
+               templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback);
+           }
+
            auto start_tokenizer_time = std::chrono::steady_clock::now();
-           encoded_input_ids = m_tokenizer.encode(prompt).input_ids;
+           encoded_input_ids = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
            auto end_tokenizer_time = std::chrono::steady_clock::now();
            metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
            m_tokenized_history.clear();
