From 1e8757a9340fe16410040232bcf1065cc8636158 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 6 Jan 2025 14:34:58 +0100
Subject: [PATCH] [TESTS] Use FP32 inference precision, FP16 KV cache
 precision for all pipelines

---
 samples/export-requirements.txt                 |  2 +-
 src/python/openvino_genai/py_openvino_genai.pyi |  2 +-
 src/python/py_continuous_batching_pipeline.cpp  |  7 +++----
 tests/python_tests/common.py                    |  2 +-
 tests/python_tests/ov_genai_test_utils.py       | 12 ++++++------
 tests/python_tests/requirements.txt             |  2 +-
 tests/python_tests/test_continuous_batching.py  |  4 ++--
 tests/python_tests/test_kv_cache_eviction.py    |  8 ++++----
 tests/python_tests/test_vlm_pipeline.py         |  4 ++--
 9 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index af38558656..2f71891b7b 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 9ff28859b9..d405cd9bbf 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -364,7 +364,7 @@ class ContinuousBatchingPipeline:
     def __init__(self, models_path: os.PathLike, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None:
         ...
     @typing.overload
-    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, **kwargs) -> None:
         ...
     @typing.overload
     def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, generation_config: GenerationConfig) -> GenerationHandle:
diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp
index 48eb124255..975100cb11 100644
--- a/src/python/py_continuous_batching_pipeline.cpp
+++ b/src/python/py_continuous_batching_pipeline.cpp
@@ -223,15 +223,14 @@ void init_continuous_batching_pipeline(py::module_& m) {
             py::arg("properties") = ov::AnyMap({}),
             py::arg("tokenizer_properties") = ov::AnyMap({}))

-        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
+        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const py::kwargs& kwargs) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::properties_to_any_map(plugin_config));
+            return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::kwargs_to_any_map(kwargs));
         }),
         py::arg("models_path"),
         py::arg("tokenizer"),
         py::arg("scheduler_config"),
-        py::arg("device"),
-        py::arg("properties") = ov::AnyMap({}))
+        py::arg("device"))

         .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer)
         .def("get_config", &ContinuousBatchingPipeline::get_config)
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index aa4c537dd6..2fca58a959 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -306,7 +306,7 @@ def run_continuous_batching(
     if type(generation_configs) is not list:
         generation_configs = [generation_configs] * len(prompts)

-    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU')
+    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_properties())
     output = cb_pipe.generate(prompts, generation_configs)
     del cb_pipe

diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 00c74f6628..66fb58f46d 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -14,7 +14,7 @@
 import json

 import openvino_genai as ov_genai
-
+from common import get_default_properties

 def get_models_list():
     precommit_models = [
@@ -92,7 +92,7 @@ def read_model(params, **tokenizer_kwargs):

     if (models_path / "openvino_model.xml").exists():
         opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True,
-                                                       compile=False, device='CPU')
+                                                       compile=False, device='CPU', ov_config=get_default_properties())
     else:
         ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer,
                                                                              with_detokenizer=True,
@@ -104,7 +104,7 @@ def read_model(params, **tokenizer_kwargs):
         hf_tokenizer.save_pretrained(models_path)

         opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True,
-                                                       compile=False, device='CPU', load_in_8bit=False)
+                                                       compile=False, device='CPU', load_in_8bit=False, ov_config=get_default_properties())
         opt_model.generation_config.save_pretrained(models_path)
         opt_model.config.save_pretrained(models_path)
         opt_model.save_pretrained(models_path)
@@ -114,7 +114,7 @@ def read_model(params, **tokenizer_kwargs):
         models_path,
         hf_tokenizer,
         opt_model,
-        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False),
+        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **get_default_properties()),
     )


@@ -178,7 +178,7 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):
         with (temp_path / config_name).open('w') as f:
             json.dump(config_json, f)

-    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU')
+    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **get_default_properties())

     for _, config_name in configs:
         os.remove(temp_path / config_name)
@@ -188,4 +188,4 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):

 @functools.lru_cache(1)
 def get_continuous_batching(path):
-    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig())
+    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **get_default_properties())
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index c851c71ee5..e23eaacc21 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.32.1
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest
diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py
index fabcf06b71..d7ce0b1ece 100644
--- a/tests/python_tests/test_continuous_batching.py
+++ b/tests/python_tests/test_continuous_batching.py
@@ -9,7 +9,7 @@
 from pathlib import Path

 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer
-from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
+from common import get_default_properties, get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
     get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
@@ -155,7 +155,7 @@ def test_post_oom_health(tmp_path, sampling_config):
     models_path : Path = tmp_path / model_id
     convert_models(opt_model, hf_tokenizer, models_path)

-    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")
+    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **get_default_properties())

     # First run should return incomplete response
     output = cb_pipe.generate(["What is OpenVINO?"], [generation_config])
diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py
index 3dbf9297ee..428047ea28 100644
--- a/tests/python_tests/test_kv_cache_eviction.py
+++ b/tests/python_tests/test_kv_cache_eviction.py
@@ -15,7 +15,7 @@
 from openvino import serialize
 from transformers import AutoTokenizer

-from common import TESTS_ROOT, run_cb_pipeline_with_ref
+from common import TESTS_ROOT, run_cb_pipeline_with_ref, get_default_properties


 def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -42,7 +42,7 @@ class ConvertedModel:

 @pytest.fixture(scope='module')
 def converted_model(tmp_path_factory):
     model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False)
+    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=get_default_properties())
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id
     model.save_pretrained(models_path)
@@ -112,8 +112,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     scheduler_config_opt.enable_prefix_caching = enable_prefix_caching

     models_path = converted_model.models_path
-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU")
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_properties())
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_properties())

     tokenizer = converted_model.tokenizer

diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py
index 62c1c27e3b..e6f897bcef 100644
--- a/tests/python_tests/test_vlm_pipeline.py
+++ b/tests/python_tests/test_vlm_pipeline.py
@@ -7,7 +7,7 @@
 import transformers
 from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import VLMPipeline, GenerationConfig
-from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters
+from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters, get_default_properties

 def get_ov_model(cache):
     model_dir = cache.mkdir("tiny-random-minicpmv-2_6")
@@ -19,7 +19,7 @@ def get_ov_model(cache):
     ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True)
     openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
     openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")
-    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True)
+    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_properties())
     processor.save_pretrained(model_dir)
     model.save_pretrained(model_dir)
     return model_dir