diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index a1a08710a4..5a97982411 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -176,10 +176,16 @@ class Tokenizer::TokenizerImpl {
     void setupTokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
         auto [ov_tokenizer, ov_detokenizer] = models;
+        OPENVINO_ASSERT(ov_tokenizer || ov_detokenizer, "Neither tokenizer nor detokenizer models were provided");
 
-        m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1;
 
         auto core = get_core_singleton();
         std::string device = "CPU"; // only CPU is supported for now
+
+        std::string version_str;
+        utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer : ov_detokenizer, "openvino_tokenizers_version", version_str);
+        // Saving the IR version was added only in 24.5, so if it's empty, the IR is older than 24.5.
+        m_older_than_24_5 = version_str.empty();
+
         if (ov_tokenizer) {
             ov::pass::Manager manager;
             manager.register_pass<MakeCombineSegmentsStateful>();
@@ -207,7 +213,8 @@ class Tokenizer::TokenizerImpl {
         if (m_tokenizer) {
             // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
             encode("non empty string").input_ids;
-            if (m_detokenizer)
+        }
+        if (m_detokenizer) {
             decode({1, 33, 199, 42, 42});
         }
 
@@ -354,6 +361,9 @@ class Tokenizer::TokenizerImpl {
     }
 
     TokenizedInputs encode(std::string prompt, const ov::AnyMap& tokenization_params = {}) {
+        OPENVINO_ASSERT(m_ireq_queue_tokenizer, "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. "
+                                                "Tokenizer::encode is not available");
+
         bool add_special_tokens_flag = true;
         ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag);
 
@@ -370,6 +380,8 @@ class Tokenizer::TokenizerImpl {
     }
 
     TokenizedInputs encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params = {}) {
+        OPENVINO_ASSERT(m_ireq_queue_tokenizer, "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. "
+                                                "Tokenizer::encode is not available");
         TokenizedInputs unpadded;
         {
             bool add_special_tokens_flag = true;
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index 715f9b97a3..859e2a7586 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -779,7 +779,8 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st
 def test_perf_metrics(model_descr, generation_config, prompt):
     import time
     start_time = time.perf_counter()
-    perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt)
+    # To check prefill exclusion we need a long initial prompt.
+    perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt * 1000)
     total_time = (time.perf_counter() - start_time) * 1000
 
     # Check that load time is adequate.
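
For context, here is a minimal sketch (not part of the patch) of what the new guards mean for API users: with a model folder that contains only openvino_detokenizer.xml, decode keeps working while encode now fails fast with the new OPENVINO_ASSERT message instead of dereferencing an empty request queue. The folder name "detok_only_dir" is hypothetical; the Tokenizer constructor, encode, and decode calls are the public ov::genai API.

    // Sketch assuming "detok_only_dir" contains only openvino_detokenizer.xml.
    #include <cstdint>
    #include <iostream>
    #include <vector>
    #include "openvino/genai/tokenizer.hpp"

    int main() {
        ov::genai::Tokenizer tokenizer("detok_only_dir");

        // Works: the detokenizer model was found and compiled.
        std::cout << tokenizer.decode(std::vector<int64_t>{1, 33, 199, 42, 42}) << '\n';

        try {
            tokenizer.encode("hello");  // Now hits the new assert instead of crashing.
        } catch (const std::exception& e) {
            std::cout << e.what() << '\n';  // "Either openvino_tokenizer.xml was not provided ..."
        }
    }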