
Commit d8afcdf

Merge remote-tracking branch 'upstream/master' into 2025.1.0-nightly

2 parents: 6a399c8 + a3031f3

6 files changed: +19 -36 lines

CMakeLists.txt

Lines changed: 1 addition & 1 deletion

```diff
@@ -30,7 +30,7 @@ if(UNIX AND NOT (APPLE OR ANDROID OR CYGWIN))
 endif()
 
 project(OpenVINOGenAI
-        VERSION 2025.0.0.0
+        VERSION 2025.1.0.0
         DESCRIPTION "OpenVINO GenAI"
         HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai"
         LANGUAGES CXX C)
```

pyproject.toml

Lines changed: 3 additions & 3 deletions

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "openvino-genai"
-version = "2025.0.0.0"
+version = "2025.1.0.0"
 description = "Library of the most popular Generative AI model pipelines, optimized execution methods, and samples"
 requires-python = ">=3.9"
 readme = { file = "src/README.md", content-type="text/markdown" }
@@ -30,7 +30,7 @@ classifiers = [
     "Programming Language :: Python :: Implementation :: CPython"
 ]
 dependencies = [
-    "openvino_tokenizers~=2025.0.0.0.dev"
+    "openvino_tokenizers~=2025.1.0.0.dev"
 ]
 
 [tool.py-build-cmake.module]
@@ -52,7 +52,7 @@ options = {"BUILD_TOKENIZERS" = "OFF"}
 [build-system]
 requires = [
     "py-build-cmake==0.3.4",
-    "openvino~=2025.0.0.0.dev",
+    "openvino~=2025.1.0.0.dev",
     "pybind11-stubgen==2.5.1",
     "cmake~=3.23.0"
 ]
```
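
A note on the pin style: `~=2025.1.0.0.dev` is a PEP 440 compatible-release specifier, so pip resolves any 2025.1.0.x build, including dev pre-releases, while excluding the 2025.0 line. A minimal sketch of that behavior, assuming the third-party `packaging` library is available:

```python
# Sketch: how the compatible-release operator in "openvino~=2025.1.0.0.dev"
# matches versions. Requires the `packaging` library (pip install packaging).
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=2025.1.0.0.dev")  # roughly >=2025.1.0.0.dev0 and ==2025.1.0.*

print(spec.contains("2025.1.0.0", prereleases=True))              # True
print(spec.contains("2025.1.0.0.dev20250101", prereleases=True))  # True
print(spec.contains("2025.0.0.0", prereleases=True))              # False
```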

tests/python_tests/test_sampling.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,15 @@ def test_stop_strings(tmp_path, generation_config):
6565
'What is OpenVINO?',
6666
'table is made of',
6767
'The Sun is yellow because',
68-
'你好! 你好嗎?'
68+
'你好! 你好嗎?',
6969
'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature'
7070
])
7171
@pytest.mark.parametrize("use_cb", [True, False])
7272
def test_greedy(tmp_path, generation_config, prompt, use_cb):
7373
model_id : str = "katuni4ka/tiny-random-phi3"
74+
if sys.platform.startswith('win') and prompt.startswith('你'):
75+
pytest.skip("For unknown reason this prompt fails on Win")
76+
7477
run_llm_pipeline_with_ref(model_id=model_id,
7578
prompts=[prompt],
7679
generation_config=generation_config,
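
Two fixes here: the added trailing comma makes the CJK prompt and the interview prompt separate parametrize cases (without it, the adjacent string literals were silently concatenated into one prompt), and a runtime skip avoids the Windows failure. For reference, a declarative variant of the same skip, written as a self-contained sketch rather than the repository's actual test:

```python
# Self-contained sketch: mark a single parametrized case as skipped on
# Windows via pytest.param, instead of calling pytest.skip() inside the test.
import sys
import pytest

@pytest.mark.parametrize("prompt", [
    "What is OpenVINO?",
    pytest.param(
        "你好! 你好嗎?",
        marks=pytest.mark.skipif(sys.platform.startswith("win"),
                                 reason="For unknown reason this prompt fails on Win"),
    ),
])
def test_prompt_is_nonempty(prompt):
    assert prompt  # stand-in for the real pipeline comparison
```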

tools/llm_bench/benchmark.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -147,9 +147,9 @@ def get_argprser():
     parser.add_argument("--draft_device", required=False, default=None, help="Inference device for Speculative decoding of draft model")
     parser.add_argument("--draft_cb_config", required=False, default=None,
                         help="Path to file with Continuous Batching Scheduler settings or dict for Speculative decoding of draft model")
-    parser.add_argument("--num_assistant_tokens", required=False, default=None, help="Config option num_assistant_tokens for Speculative decoding")
+    parser.add_argument("--num_assistant_tokens", required=False, default=None, help="Config option num_assistant_tokens for Speculative decoding", type=int)
     parser.add_argument("--assistant_confidence_threshold", required=False, default=None,
-                        help="Config option assistant_confidence_threshold for Speculative decoding")
+                        help="Config option assistant_confidence_threshold for Speculative decoding", type=float)
     parser.add_argument(
         '--end_token_stopping',
        action='store_true',
```
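
Adding `type=int` / `type=float` makes argparse convert the flags at parse time; without them every value arrives as a string and downstream consumers must cast it themselves. A minimal sketch (the parser here is illustrative, not the benchmark's full CLI):

```python
# Sketch: argparse performs the conversion when type= is given.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--num_assistant_tokens", default=None, type=int)
parser.add_argument("--assistant_confidence_threshold", default=None, type=float)

args = parser.parse_args(["--num_assistant_tokens", "5",
                          "--assistant_confidence_threshold", "0.4"])
print(type(args.num_assistant_tokens).__name__)            # int
print(type(args.assistant_confidence_threshold).__name__)  # float
```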

tools/llm_bench/llm_bench_utils/ov_utils.py

Lines changed: 7 additions & 3 deletions

```diff
@@ -243,9 +243,13 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs):
 
     draft_model_path = kwargs.get("draft_model", '')
     cb = kwargs.get("use_cb", False)
-    if cb or draft_model_path:
+    cb_config = kwargs.get("cb_config")
+    use_streamer_metrics = False
+    if cb or cb_config is not None or draft_model_path:
         log.info("Continuous Batching mode activated")
-        ov_config["scheduler_config"] = get_scheduler_config_genai(kwargs.get("cb_config"))
+        ov_config["scheduler_config"] = get_scheduler_config_genai(cb_config)
+
+        use_streamer_metrics = not openvino_genai.get_version().startswith("2025.") or draft_model_path
 
     if draft_model_path:
         if not Path(draft_model_path).exists():
@@ -292,7 +296,7 @@ def get_tokens(self):
 
     def get_time_list(self):
         return self.token_generation_time
-    streamer = TokenStreamer(llm_pipe.get_tokenizer()) if cb or draft_model_path else None
+    streamer = TokenStreamer(llm_pipe.get_tokenizer()) if use_streamer_metrics else None
 
     return llm_pipe, tokenizer, end - start, streamer, True
```
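
The new `use_streamer_metrics` flag decouples streamer creation from the Continuous Batching switch: per the condition above, the manual token streamer is only needed when the installed openvino_genai predates the 2025 releases or when a draft model is in play. Distilled into a standalone sketch (`needs_streamer_metrics` is an illustrative name, not a function in the repository):

```python
# Sketch of the gating condition introduced above.
def needs_streamer_metrics(genai_version: str, draft_model_path: str) -> bool:
    # Pre-2025 builds lack pipeline-reported metrics, and speculative
    # decoding (a draft model) still requires the manual token streamer.
    return not genai_version.startswith("2025.") or bool(draft_model_path)

assert needs_streamer_metrics("2024.6.0.0", "") is True
assert needs_streamer_metrics("2025.1.0.0", "") is False
assert needs_streamer_metrics("2025.1.0.0", "/path/to/draft") is True
```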

tools/llm_bench/task/text_generation.py

Lines changed: 2 additions & 26 deletions

```diff
@@ -181,14 +181,6 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
                 log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
                             f"is different from md5 of the {num - 1} iteration {prev_md5}")
         metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
-        if not args.get("use_cb", False):
-            if num == 1:
-                # if the device is CPU, throw exception
-                if args['devices'].lower().startswith('cpu') is True:
-                    assert (result_md5_list == prev_md5)
-            else:
-                # throw exception
-                assert (result_md5_list == prev_md5)
     else:
         metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
     if bench_hook is not None:
@@ -231,10 +223,10 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     if args.get('draft_model', ''):
         config_info = "Speculative decoding config: "
         if args.get('num_assistant_tokens', None):
-            gen_config.num_assistant_tokens = args['num_assistant_tokens']
+            gen_config.num_assistant_tokens = int(args['num_assistant_tokens'])
             config_info += f" num_assistant_tokens {gen_config.num_assistant_tokens}"
         if args.get('assistant_confidence_threshold', None):
-            gen_config.assistant_confidence_threshold = args['assistant_confidence_threshold']
+            gen_config.assistant_confidence_threshold = float(args['assistant_confidence_threshold'])
             config_info += f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}"
         log.info(config_info)
     start = time.perf_counter()
@@ -339,14 +331,6 @@ def token_printer():
                 log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
                             f"is different from md5 of the {num - 1} iteration {prev_md5}")
         metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
-        if not args.get("use_cb", False):
-            if num == 1:
-                # if the device is CPU, throw exception
-                if args['devices'].lower().startswith('cpu') is True:
-                    assert (result_md5_list == prev_md5)
-            else:
-                # throw exception
-                assert (result_md5_list == prev_md5)
     else:
         metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
 
@@ -461,14 +445,6 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
                 log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
                             f"is different from md5 of the {num - 1} iteration {prev_md5}")
         metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
-        if not args.get("use_cb", False):
-            if num == 1:
-                # if the device is CPU, throw exception
-                if args['devices'].lower().startswith('cpu') is True:
-                    assert (result_md5_list == prev_md5)
-            else:
-                # throw exception
-                assert (result_md5_list == prev_md5)
     else:
         metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index)
     streamer.reset()
```
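
With the hard asserts removed, an md5 mismatch between iterations is now reported as a warning rather than aborting the benchmark run. An illustrative distillation of the warn-only check that remains (not the benchmark's actual helper):

```python
# Sketch: hash each iteration's output and warn on drift instead of asserting.
import hashlib
import logging
from typing import Optional

log = logging.getLogger(__name__)

def check_md5(num: int, text: str, prev_md5: Optional[str]) -> str:
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()
    if prev_md5 is not None and digest != prev_md5:
        log.warning(f"[{num}] md5 {digest} differs from iteration {num - 1} md5 {prev_md5}")
    return digest
```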
