From 66d81532f090198773295ea070b8488c8f455141 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Tue, 7 Jan 2025 19:31:01 +0100 Subject: [PATCH 1/7] Refactor tests --- tests/python_tests/test_whisper_pipeline.py | 452 ++++++++------------ 1 file changed, 177 insertions(+), 275 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index aa78666e32..1c0871768c 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -16,6 +16,8 @@ import numpy as np import os import pathlib +from dataclasses import dataclass + @pytest.fixture(scope="class", autouse=True) def run_gc_after_test(): @@ -27,36 +29,31 @@ def run_gc_after_test(): gc.collect() -def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False): - precommit_models = [ +def get_whisper_models_list(tiny_only=False): + model_ids = [ "openai/whisper-tiny", - "openai/whisper-tiny.en", "distil-whisper/distil-small.en", ] - if multilingual: - precommit_models = ["openai/whisper-tiny"] - if en_only: - precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"] - if tiny_only: - precommit_models = ["openai/whisper-tiny"] - nightly_models = [] + if tiny_only: + model_ids = ["openai/whisper-tiny"] - if pytest.run_marker == "precommit": - model_ids = precommit_models - else: - model_ids = nightly_models + pytest.selected_model_ids = None if pytest.selected_model_ids: - model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + model_ids = [ + model_id + for model_id in model_ids + if model_id in pytest.selected_model_ids.split(" ") + ] - prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) - return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + prefix = pathlib.Path(os.getenv("GENAI_MODELS_PATH_PREFIX", "")) + return [(model_id, prefix / model_id.split("/")[1]) for model_id in model_ids] # used whisper models are relatively small # cache them in memory to speedup tests -@functools.lru_cache(3) +@functools.lru_cache() def read_whisper_model(params, **tokenizer_kwargs): model_id, path = params @@ -114,30 +111,48 @@ def read_whisper_model(params, **tokenizer_kwargs): ) -def compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id): - ds = datasets.load_dataset(dataset_id, "clean", split="validation") - opt_infer_time = 0 - genai_infer_time = 0 +@dataclass +class GenerationConfig: + task: str | None = None + language: str | None = None + return_timestamps: bool = False + max_new_tokens: int | None = None + streamer: typing.Callable[[str], bool] | None = None - for ds_row in ds: - audio_sample = ds_row["audio"] - streamer_result = [] +def run_huggingface( + pipeline, + sample, + config: GenerationConfig | None = None, +): + if not config: + config = GenerationConfig() + + return pipeline( + sample, + max_new_tokens=config.max_new_tokens, + return_timestamps=config.return_timestamps, + generate_kwargs={"language": config.language, "task": config.task}, + ) - start = time.time() - genai_result = genai_pipe.generate( - audio_sample["array"].tolist(), streamer=lambda x: streamer_result.append(x) - ) - genai_infer_time += time.time() - start - start = time.time() - result = opt_pipe(audio_sample) - opt_infer_time += time.time() - start +def run_genai( + pipeline: ov_genai.WhisperPipeline, + sample, + config: GenerationConfig | None = None, +): + if not config: + config = GenerationConfig() + + genai_config = 
pipeline.get_generation_config()
 
-    assert genai_result.texts[0] == result["text"]
-    assert "".join(streamer_result) == result["text"]
+    if config.max_new_tokens:
+        genai_config.max_new_tokens = config.max_new_tokens
 
+    genai_config.return_timestamps = config.return_timestamps
+    genai_config.task = config.task
+    genai_config.language = f"<|{config.language}|>" if config.language else None
 
-    print(f"Inference time\nOpt: {opt_infer_time}\nGenAI: {genai_infer_time}")
+    return pipeline.generate(sample, genai_config, streamer=config.streamer)
 
 
 def get_samples_from_dataset(
@@ -166,13 +181,60 @@ def get_samples_from_dataset(
     return [x["audio"]["array"] for x in ds]
 
 
-@pytest.mark.parametrize("model_descr", get_whisper_models_list())
-@pytest.mark.parametrize("dataset_id", ["hf-internal-testing/librispeech_asr_dummy"])
-@pytest.mark.precommit
-def test_whisper_on_hf_dataset(model_descr, dataset_id):
-    model_id, path, opt_pipe, genai_pipe = read_whisper_model(model_descr)
+def run_pipeline_with_ref(
+    model_id: str,
+    tmp_path: str,
+    sample: np.ndarray | list[np.ndarray],
+    generation_config: GenerationConfig | None = None,
+    print_infer_time=False,
+):
+    _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path))
+
+    if type(sample) is np.ndarray and len(sample.shape) == 1:
+        sample = np.expand_dims(sample, 0)
+
+    hf_infer_time, genai_infer_time = 0, 0
+    hf_result, genai_result = None, None
+    for _sample in sample:
+        start = time.time()
+        genai_result = run_genai(genai_pipe, _sample, generation_config)
+        genai_infer_time += time.time() - start
+
+        start = time.time()
+        hf_result = run_huggingface(hf_pipe, _sample, generation_config)
+        hf_infer_time += time.time() - start
+
+        compare_results(hf_result, genai_result)
+
+    if print_infer_time:
+        print(f"\nInference time HF: {hf_infer_time:.2f} GenAI: {genai_infer_time:.2f}")
+
+    assert hf_result is not None
+    assert genai_result is not None
 
-    compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id)
+    return hf_result, genai_result
+
+
+def compare_results(hf_result, genai_result):
+    assert genai_result.texts[0] == hf_result["text"]
+
+    # transformers 4.47 updated return_timestamps implementation
+    # enable once genai implementation aligned with transformers. Ticket 160205. 
+ return + + if "chunks" not in hf_result and genai_result.chunks is None: + return + + assert len(genai_result.chunks) == len(hf_result["chunks"]) + + for opt_chunk, genai_chunk in zip(hf_result["chunks"], genai_result.chunks): + assert opt_chunk["text"] == genai_chunk.text + assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) + if opt_chunk["timestamp"][1]: + assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) + else: + assert opt_chunk["timestamp"][1] == None + assert round(genai_chunk.end_ts, 2) == -1.0 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @@ -182,16 +244,11 @@ def test_whisper_on_hf_dataset(model_descr, dataset_id): ) @pytest.mark.precommit def test_smoke(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe(test_sample) - - genai_result = pipe.generate(test_sample) - - assert genai_result.texts[0] == expected["text"] - - assert "chunks" not in expected - assert genai_result.chunks == None + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + ) @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @@ -259,79 +316,55 @@ def test_whisper_constructors(model_descr, test_sample): def test_max_new_tokens(model_descr, test_sample): model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - expected = opt_pipe(test_sample, max_new_tokens=10)["text"] + expected = opt_pipe(test_sample, max_new_tokens=10) genai_result = pipe.generate(test_sample, max_new_tokens=10) - assert genai_result.texts[0] == expected - - genai_result = pipe.generate(test_sample) - - assert genai_result.texts[0] != expected + compare_results(expected, genai_result) config = pipe.get_generation_config() config.max_new_tokens = 10 genai_result = pipe.generate(test_sample, config) - assert genai_result.texts[0] == expected + compare_results(expected, genai_result) @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( - "test_sample", get_samples_from_dataset(language="fr", length=3) + "test_samples", + [ + (get_samples_from_dataset(language="fr", length=1), "fr"), + (get_samples_from_dataset(language="de", length=1), "de"), + ], ) @pytest.mark.precommit -def test_language_mode_fr(model_descr, test_sample): - model_id, path = model_descr +def test_language_mode(model_descr, test_samples): model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + samples, language = test_samples expected = opt_pipe( - test_sample, max_new_tokens=30, generate_kwargs={"language": "fr"} + samples[0], max_new_tokens=30, generate_kwargs={"language": language} ) - genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|fr|>") - - assert genai_result.texts[0] == expected["text"] - - config = pipe.get_generation_config() - config.max_new_tokens = 30 - config.language = "<|fr|>" - genai_result = pipe.generate(test_sample, config) - - assert genai_result.texts[0] == expected["text"] - - -@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", get_samples_from_dataset(language="de", length=3) -) -@pytest.mark.precommit -def test_language_mode_de(model_descr, test_sample): - model_id, path = model_descr - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe( - test_sample, max_new_tokens=30, generate_kwargs={"language": "de"} + genai_result = pipe.generate( + 
samples[0], max_new_tokens=30, language=f"<|{language}|>" ) - genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|de|>") - - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) config = pipe.get_generation_config() config.max_new_tokens = 30 - config.language = "<|de|>" - genai_result = pipe.generate(test_sample, config) + config.language = f"<|{language}|>" + genai_result = pipe.generate(samples[0], config) - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( - "test_sample", get_samples_from_dataset(language="fr", length=3) + "test_sample", get_samples_from_dataset(language="fr", length=1) ) @pytest.mark.precommit def test_task_mode(model_descr, test_sample): - model_id, path = model_descr model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) expected = opt_pipe( @@ -344,7 +377,7 @@ def test_task_mode(model_descr, test_sample): test_sample, max_new_tokens=30, language="<|fr|>", task="translate" ) - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) config = pipe.get_generation_config() config.max_new_tokens = 30 @@ -352,27 +385,7 @@ def test_task_mode(model_descr, test_sample): config.task = "translate" genai_result = pipe.generate(test_sample, config) - assert genai_result.texts[0] == expected["text"] - - expected = opt_pipe( - test_sample, - max_new_tokens=30, - generate_kwargs={"language": "ru", "task": "translate"}, - ) - - genai_result = pipe.generate( - test_sample, max_new_tokens=30, language="<|ru|>", task="translate" - ) - - assert genai_result.texts[0] == expected["text"] - - config = pipe.get_generation_config() - config.max_new_tokens = 30 - config.language = "<|ru|>" - config.task = "translate" - genai_result = pipe.generate(test_sample, config) - - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) # seems to be equivalent to translate task expected = opt_pipe( @@ -385,7 +398,7 @@ def test_task_mode(model_descr, test_sample): test_sample, max_new_tokens=30, language="<|en|>", task="transcribe" ) - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) config = pipe.get_generation_config() config.max_new_tokens = 30 @@ -393,21 +406,20 @@ def test_task_mode(model_descr, test_sample): config.task = "transcribe" genai_result = pipe.generate(test_sample, config) - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( "test_sample", [ - *get_samples_from_dataset(language="fr", length=2), - *get_samples_from_dataset(language="de", length=2), - *get_samples_from_dataset(language="es", length=2), + *get_samples_from_dataset(language="fr", length=1), + *get_samples_from_dataset(language="de", length=1), + *get_samples_from_dataset(language="es", length=1), ], ) @pytest.mark.precommit def test_language_autodetect(model_descr, test_sample): - model_id, path = model_descr model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) input_features = opt_pipe.feature_extractor(test_sample) @@ -415,189 +427,79 @@ def test_language_autodetect(model_descr, test_sample): # ensure detected language us not english assert language_id != pipe.get_generation_config().lang_to_id["<|en|>"] - expected = opt_pipe( - test_sample, 
- max_new_tokens=30, + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=GenerationConfig(max_new_tokens=30), ) - genai_result = pipe.generate(test_sample, max_new_tokens=30) - - assert genai_result.texts[0] == expected["text"] - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - ], -) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_return_timestamps_short_form(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - # long form audio not supported yet - test_sample = test_sample[: 16000 * 30] - - expected = opt_pipe( - test_sample, - return_timestamps=True, + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=GenerationConfig(return_timestamps=True), ) - genai_result = pipe.generate( - test_sample.tolist(), - return_timestamps=True, - ) - - assert genai_result.texts[0] == expected["text"] - - assert len(genai_result.chunks) == len(expected["chunks"]) - - for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): - assert opt_chunk["text"] == genai_chunk.text - assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) - assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - ], -) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - # long form audio not supported yet - test_sample = test_sample[: 16000 * 30] - - expected = opt_pipe( - test_sample, - return_timestamps=True, - max_new_tokens=15, - generate_kwargs={"language": "en"}, + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=GenerationConfig( + return_timestamps=True, language="en", max_new_tokens=30 + ), ) - genai_result = pipe.generate( - test_sample.tolist(), - max_new_tokens=15, - return_timestamps=True, - language="<|en|>", - ) - assert genai_result.texts[0] == expected["text"] - - assert len(genai_result.chunks) == len(expected["chunks"]) - - for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): - assert opt_chunk["text"] == genai_chunk.text - assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) - if opt_chunk["timestamp"][1]: - assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) - else: - assert opt_chunk["timestamp"][1] == None - assert round(genai_chunk.end_ts, 2) == -1.0 - - -@pytest.mark.parametrize("model_descr", get_whisper_models_list(multilingual=True)) +@pytest.mark.parametrize("model_descr", get_whisper_models_list()) @pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - *get_samples_from_dataset(language="fr", length=10, long_form=True), - ], + "test_sample", get_samples_from_dataset(length=10, long_form=True) ) @pytest.mark.precommit -def test_longform_audio_return_timestamps_multilingual(model_descr, test_sample): - 
model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe( - test_sample, - return_timestamps=True, - ) - +def test_longform_audio(model_descr, test_sample): streamer_result = [] - genai_result = pipe.generate( - test_sample, - return_timestamps=True, - streamer=lambda x: streamer_result.append(x), + hf_result, genai_result = run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=GenerationConfig( + return_timestamps=True, + streamer=lambda x: streamer_result.append(x), + ), ) - assert genai_result.texts[0] == expected["text"] - assert "".join(streamer_result) == expected["text"] + assert "".join(streamer_result) == hf_result["text"] - assert len(genai_result.chunks) == len(expected["chunks"]) - for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): - assert opt_chunk["text"] == genai_chunk.text - assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) - if opt_chunk["timestamp"][1]: - assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) - else: - assert opt_chunk["timestamp"][1] == None - assert round(genai_chunk.end_ts, 2) == -1.0 - - -@pytest.mark.parametrize("model_descr", get_whisper_models_list(en_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - ], -) +@pytest.mark.parametrize("model_descr", get_whisper_models_list()) @pytest.mark.precommit -def test_longform_audio_return_timestamps_en(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe( - test_sample, - return_timestamps=True, +def test_shortform(model_descr): + samples = [] + ds = datasets.load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation" ) - streamer_result = [] + for ds_row in ds: + samples.append(ds_row["audio"]["array"]) - genai_result = pipe.generate( - test_sample, - return_timestamps=True, - streamer=lambda x: streamer_result.append(x), + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=samples, + print_infer_time=True, ) - assert genai_result.texts[0] == expected["text"] - assert "".join(streamer_result) == expected["text"] - - assert len(genai_result.chunks) == len(expected["chunks"]) - - for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): - assert opt_chunk["text"] == genai_chunk.text - assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) - if opt_chunk["timestamp"][1]: - assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) - else: - assert opt_chunk["timestamp"][1] == None - assert round(genai_chunk.end_ts, 2) == -1.0 - - -@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=3, long_form=True), - *get_samples_from_dataset(language="sp", length=3, long_form=True), - ], -) -@pytest.mark.precommit -def test_longform_audio(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe(test_sample, return_timestamps=True) - - genai_result = pipe.generate(test_sample) - - assert genai_result.texts[0] == expected["text"] - - assert genai_result.chunks == None - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( From 1900c76488af5bde8561721b28dda52fe374df66 Mon Sep 17 00:00:00 2001 From: 
Alexander Suvorov Date: Wed, 8 Jan 2025 09:25:43 +0100 Subject: [PATCH 2/7] remove debug --- tests/python_tests/test_whisper_pipeline.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index 1c0871768c..17e839c989 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -38,8 +38,6 @@ def get_whisper_models_list(tiny_only=False): if tiny_only: model_ids = ["openai/whisper-tiny"] - pytest.selected_model_ids = None - if pytest.selected_model_ids: model_ids = [ model_id From 09fe6c427f4e49fe95aa70cb37961396ce4b671c Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 8 Jan 2025 09:30:52 +0100 Subject: [PATCH 3/7] Unfix optimum intel --- samples/export-requirements.txt | 2 +- tests/python_tests/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index 2f71891b7b..af38558656 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -2,7 +2,7 @@ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly openvino-tokenizers~=2025.0.0.0.dev -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index e23eaacc21..c851c71ee5 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu diffusers==0.32.1 -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64" onnx==1.17.0 pytest From 1b98cc0438e643f41a61387ac3f0db292ac17f90 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 8 Jan 2025 10:02:27 +0100 Subject: [PATCH 4/7] Remove debug --- tests/python_tests/test_whisper_pipeline.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index 17e839c989..c89a74f3d7 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -184,29 +184,19 @@ def run_pipeline_with_ref( tmp_path: str, sample: np.ndarray | list[np.ndarray], generation_config: GenerationConfig | None = None, - print_infer_time=False, ): _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path)) if type(sample) is np.ndarray and len(sample.shape) == 1: sample = np.expand_dims(sample, 0) - hf_infer_time, genai_infer_time = 0, 0 hf_result, genai_result = None, None for _sample in sample: - start = time.time() genai_result = run_genai(genai_pipe, _sample, generation_config) - genai_infer_time += time.time() - start - - start = time.time() hf_result = run_huggingface(hf_pipe, _sample, generation_config) - hf_infer_time += time.time() - start compare_results(hf_result, genai_result) - if print_infer_time: - print(f"\nInference time HF: {hf_infer_time:.2f} GenAI: {genai_infer_time:.2f}") - assert 
hf_result is not None assert genai_result is not None @@ -495,7 +485,6 @@ def test_shortform(model_descr): model_id=model_descr[0], tmp_path=model_descr[1], sample=samples, - print_infer_time=True, ) From 125e7714dc7cde3d8faffa61f0cc9f14098067ea Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 8 Jan 2025 10:16:13 +0100 Subject: [PATCH 5/7] Do not return results from run_pipeline --- tests/python_tests/test_whisper_pipeline.py | 28 ++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index c89a74f3d7..52787108da 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -190,18 +190,12 @@ def run_pipeline_with_ref( if type(sample) is np.ndarray and len(sample.shape) == 1: sample = np.expand_dims(sample, 0) - hf_result, genai_result = None, None for _sample in sample: genai_result = run_genai(genai_pipe, _sample, generation_config) hf_result = run_huggingface(hf_pipe, _sample, generation_config) compare_results(hf_result, genai_result) - assert hf_result is not None - assert genai_result is not None - - return hf_result, genai_result - def compare_results(hf_result, genai_result): assert genai_result.texts[0] == hf_result["text"] @@ -455,18 +449,24 @@ def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): ) @pytest.mark.precommit def test_longform_audio(model_descr, test_sample): + _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr) + streamer_result = [] - hf_result, genai_result = run_pipeline_with_ref( - model_id=model_descr[0], - tmp_path=model_descr[1], - sample=test_sample, - generation_config=GenerationConfig( - return_timestamps=True, - streamer=lambda x: streamer_result.append(x), - ), + genai_result = run_genai( + genai_pipe, + test_sample, + config=GenerationConfig(streamer=lambda x: streamer_result.append(x)), + ) + + hf_result = run_huggingface( + hf_pipe, + test_sample, + config=GenerationConfig(return_timestamps=True), ) + compare_results(hf_result, genai_result) + assert "".join(streamer_result) == hf_result["text"] From bff7f909872f78ccdd4d54f9cafc340c97c166d0 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 8 Jan 2025 11:53:36 +0100 Subject: [PATCH 6/7] Use genai config --- tests/python_tests/test_whisper_pipeline.py | 46 +++++++++------------ 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index 52787108da..bce95d8a62 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -11,12 +11,10 @@ from optimum.intel.openvino import OVModelForSpeechSeq2Seq import gc import json -import time import typing import numpy as np import os import pathlib -from dataclasses import dataclass @pytest.fixture(scope="class", autouse=True) @@ -85,6 +83,7 @@ def read_whisper_model(params, **tokenizer_kwargs): model_id, export=True, trust_remote_code=True, + stateful=False, compile=False, device="CPU", load_in_8bit=False, @@ -109,26 +108,17 @@ def read_whisper_model(params, **tokenizer_kwargs): ) -@dataclass -class GenerationConfig: - task: str | None = None - language: str | None = None - return_timestamps: bool = False - max_new_tokens: int | None = None - streamer: typing.Callable[[str], bool] | None = None - - def run_huggingface( pipeline, sample, - config: GenerationConfig | None = None, + config: 
ov_genai.WhisperGenerationConfig | None = None, ): if not config: - config = GenerationConfig() + config = ov_genai.WhisperGenerationConfig() return pipeline( sample, - max_new_tokens=config.max_new_tokens, + max_new_tokens=min(config.max_new_tokens, 444), return_timestamps=config.return_timestamps, generate_kwargs={"language": config.language, "task": config.task}, ) @@ -137,20 +127,20 @@ def run_huggingface( def run_genai( pipeline: ov_genai.WhisperPipeline, sample, - config: GenerationConfig | None = None, + config: ov_genai.WhisperGenerationConfig | None = None, + streamer: typing.Callable[[str], bool] | None = None, ): if not config: - config = GenerationConfig() + config = ov_genai.WhisperGenerationConfig() genai_config = pipeline.get_generation_config() - if config.max_new_tokens: - genai_config.max_new_tokens = config.max_new_tokens + genai_config.max_new_tokens = config.max_new_tokens genai_config.return_timestamps = config.return_timestamps genai_config.task = config.task genai_config.language = f"<|{config.language}|>" if config.language else None - return pipeline.generate(sample, genai_config, streamer=config.streamer) + return pipeline.generate(sample, genai_config, streamer=streamer) def get_samples_from_dataset( @@ -174,7 +164,8 @@ def get_samples_from_dataset( ds = typing.cast(datasets.IterableDataset, ds) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16000)) - ds = ds.take(length) + ds = ds.skip(8) + ds = ds.take(1) return [x["audio"]["array"] for x in ds] @@ -183,7 +174,8 @@ def run_pipeline_with_ref( model_id: str, tmp_path: str, sample: np.ndarray | list[np.ndarray], - generation_config: GenerationConfig | None = None, + generation_config: ov_genai.WhisperGenerationConfig | None = None, + streamer: typing.Callable[[str], bool] | None = None, ): _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path)) @@ -191,7 +183,7 @@ def run_pipeline_with_ref( sample = np.expand_dims(sample, 0) for _sample in sample: - genai_result = run_genai(genai_pipe, _sample, generation_config) + genai_result = run_genai(genai_pipe, _sample, generation_config, streamer) hf_result = run_huggingface(hf_pipe, _sample, generation_config) compare_results(hf_result, genai_result) @@ -413,7 +405,7 @@ def test_language_autodetect(model_descr, test_sample): model_id=model_descr[0], tmp_path=model_descr[1], sample=test_sample, - generation_config=GenerationConfig(max_new_tokens=30), + generation_config=ov_genai.WhisperGenerationConfig(max_new_tokens=30), ) @@ -425,7 +417,7 @@ def test_return_timestamps_short_form(model_descr, test_sample): model_id=model_descr[0], tmp_path=model_descr[1], sample=test_sample, - generation_config=GenerationConfig(return_timestamps=True), + generation_config=ov_genai.WhisperGenerationConfig(return_timestamps=True), ) @@ -437,7 +429,7 @@ def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): model_id=model_descr[0], tmp_path=model_descr[1], sample=test_sample, - generation_config=GenerationConfig( + generation_config=ov_genai.WhisperGenerationConfig( return_timestamps=True, language="en", max_new_tokens=30 ), ) @@ -456,13 +448,13 @@ def test_longform_audio(model_descr, test_sample): genai_result = run_genai( genai_pipe, test_sample, - config=GenerationConfig(streamer=lambda x: streamer_result.append(x)), + streamer=lambda x: streamer_result.append(x), ) hf_result = run_huggingface( hf_pipe, test_sample, - config=GenerationConfig(return_timestamps=True), + config=ov_genai.WhisperGenerationConfig(return_timestamps=True), ) 
compare_results(hf_result, genai_result) From 6b2ee9450434b92bcd4230517df8e8415b264509 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 8 Jan 2025 12:28:52 +0100 Subject: [PATCH 7/7] Install transformers 4.46.3 on Win --- .github/workflows/windows.yml | 6 ++++++ tests/python_tests/test_whisper_pipeline.py | 15 +++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e396671b2c..bfe7432e41 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -310,6 +310,12 @@ jobs: . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels + + # will install transformers 4.46.3 version + # transformers 4.46.3 will enable return_timestamps tests + # this check enabled for windows only. Ticket: 160205. + python -m pip install git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631 + python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke" genai_python_lib_vlm: diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index bce95d8a62..c046d1ae2c 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -15,6 +15,8 @@ import numpy as np import os import pathlib +import importlib.metadata as metadata +from packaging.version import parse @pytest.fixture(scope="class", autouse=True) @@ -164,8 +166,7 @@ def get_samples_from_dataset( ds = typing.cast(datasets.IterableDataset, ds) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16000)) - ds = ds.skip(8) - ds = ds.take(1) + ds = ds.take(length) return [x["audio"]["array"] for x in ds] @@ -193,8 +194,13 @@ def compare_results(hf_result, genai_result): assert genai_result.texts[0] == hf_result["text"] # transformers 4.47 updated return_timestamps implementation - # enable once genai implementation aligned with trasformets. Ticket 160205. - return + # remove once genai implementation aligned with transformers. Ticket 160205. + transformers_version_greater_4_47 = parse( + metadata.version("transformers") + ) >= parse("4.47.0") + + if transformers_version_greater_4_47: + return if "chunks" not in hf_result and genai_result.chunks is None: return @@ -448,6 +454,7 @@ def test_longform_audio(model_descr, test_sample): genai_result = run_genai( genai_pipe, test_sample, + config=ov_genai.WhisperGenerationConfig(return_timestamps=True), streamer=lambda x: streamer_result.append(x), )