From 66d81532f090198773295ea070b8488c8f455141 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Tue, 7 Jan 2025 19:31:01 +0100 Subject: [PATCH 1/7] Refactor tests --- tests/python_tests/test_whisper_pipeline.py | 452 ++++++++------------ 1 file changed, 177 insertions(+), 275 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index aa78666e32..1c0871768c 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -16,6 +16,8 @@ import numpy as np import os import pathlib +from dataclasses import dataclass + @pytest.fixture(scope="class", autouse=True) def run_gc_after_test(): @@ -27,36 +29,31 @@ def run_gc_after_test(): gc.collect() -def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False): - precommit_models = [ +def get_whisper_models_list(tiny_only=False): + model_ids = [ "openai/whisper-tiny", - "openai/whisper-tiny.en", "distil-whisper/distil-small.en", ] - if multilingual: - precommit_models = ["openai/whisper-tiny"] - if en_only: - precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"] - if tiny_only: - precommit_models = ["openai/whisper-tiny"] - nightly_models = [] + if tiny_only: + model_ids = ["openai/whisper-tiny"] - if pytest.run_marker == "precommit": - model_ids = precommit_models - else: - model_ids = nightly_models + pytest.selected_model_ids = None if pytest.selected_model_ids: - model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + model_ids = [ + model_id + for model_id in model_ids + if model_id in pytest.selected_model_ids.split(" ") + ] - prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) - return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + prefix = pathlib.Path(os.getenv("GENAI_MODELS_PATH_PREFIX", "")) + return [(model_id, prefix / model_id.split("/")[1]) for model_id in model_ids] # used whisper models are relatively small # cache them in memory to speedup tests -@functools.lru_cache(3) +@functools.lru_cache() def read_whisper_model(params, **tokenizer_kwargs): model_id, path = params @@ -114,30 +111,48 @@ def read_whisper_model(params, **tokenizer_kwargs): ) -def compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id): - ds = datasets.load_dataset(dataset_id, "clean", split="validation") - opt_infer_time = 0 - genai_infer_time = 0 +@dataclass +class GenerationConfig: + task: str | None = None + language: str | None = None + return_timestamps: bool = False + max_new_tokens: int | None = None + streamer: typing.Callable[[str], bool] | None = None - for ds_row in ds: - audio_sample = ds_row["audio"] - streamer_result = [] +def run_huggingface( + pipeline, + sample, + config: GenerationConfig | None = None, +): + if not config: + config = GenerationConfig() + + return pipeline( + sample, + max_new_tokens=config.max_new_tokens, + return_timestamps=config.return_timestamps, + generate_kwargs={"language": config.language, "task": config.task}, + ) - start = time.time() - genai_result = genai_pipe.generate( - audio_sample["array"].tolist(), streamer=lambda x: streamer_result.append(x) - ) - genai_infer_time += time.time() - start - start = time.time() - result = opt_pipe(audio_sample) - opt_infer_time += time.time() - start +def run_genai( + pipeline: ov_genai.WhisperPipeline, + sample, + config: GenerationConfig | None = None, +): + if not config: + config = GenerationConfig() + + genai_config = 
pipeline.get_generation_config()
 
-    assert genai_result.texts[0] == result["text"]
-    assert "".join(streamer_result) == result["text"]
+    if config.max_new_tokens:
+        genai_config.max_new_tokens = config.max_new_tokens
 
+    genai_config.return_timestamps = config.return_timestamps
+    genai_config.task = config.task
+    genai_config.language = f"<|{config.language}|>" if config.language else None
 
-    print(f"Inference time\nOpt: {opt_infer_time}\nGenAI: {genai_infer_time}")
+    return pipeline.generate(sample, genai_config, streamer=config.streamer)
 
 
 def get_samples_from_dataset(
@@ -166,13 +181,60 @@ def get_samples_from_dataset(
     return [x["audio"]["array"] for x in ds]
 
 
-@pytest.mark.parametrize("model_descr", get_whisper_models_list())
-@pytest.mark.parametrize("dataset_id", ["hf-internal-testing/librispeech_asr_dummy"])
-@pytest.mark.precommit
-def test_whisper_on_hf_dataset(model_descr, dataset_id):
-    model_id, path, opt_pipe, genai_pipe = read_whisper_model(model_descr)
+def run_pipeline_with_ref(
+    model_id: str,
+    tmp_path: str,
+    sample: np.ndarray | list[np.ndarray],
+    generation_config: GenerationConfig | None = None,
+    print_infer_time=False,
+):
+    _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path))
+
+    if type(sample) is np.ndarray and len(sample.shape) == 1:
+        sample = np.expand_dims(sample, 0)
+
+    hf_infer_time, genai_infer_time = 0, 0
+    hf_result, genai_result = None, None
+    for _sample in sample:
+        start = time.time()
+        genai_result = run_genai(genai_pipe, _sample, generation_config)
+        genai_infer_time += time.time() - start
+
+        start = time.time()
+        hf_result = run_huggingface(hf_pipe, _sample, generation_config)
+        hf_infer_time += time.time() - start
+
+        compare_results(hf_result, genai_result)
+
+    if print_infer_time:
+        print(f"\nInference time HF: {hf_infer_time:.2f} GenAI: {genai_infer_time:.2f}")
+
+    assert hf_result is not None
+    assert genai_result is not None
 
-    compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id)
+    return hf_result, genai_result
+
+
+def compare_results(hf_result, genai_result):
+    assert genai_result.texts[0] == hf_result["text"]
+
+    # transformers 4.47 updated return_timestamps implementation
+    # enable once genai implementation aligned with transformers. Ticket 160205. 
+ return + + if "chunks" not in hf_result and genai_result.chunks is None: + return + + assert len(genai_result.chunks) == len(hf_result["chunks"]) + + for opt_chunk, genai_chunk in zip(hf_result["chunks"], genai_result.chunks): + assert opt_chunk["text"] == genai_chunk.text + assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) + if opt_chunk["timestamp"][1]: + assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) + else: + assert opt_chunk["timestamp"][1] == None + assert round(genai_chunk.end_ts, 2) == -1.0 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @@ -182,16 +244,11 @@ def test_whisper_on_hf_dataset(model_descr, dataset_id): ) @pytest.mark.precommit def test_smoke(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe(test_sample) - - genai_result = pipe.generate(test_sample) - - assert genai_result.texts[0] == expected["text"] - - assert "chunks" not in expected - assert genai_result.chunks == None + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + ) @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @@ -259,79 +316,55 @@ def test_whisper_constructors(model_descr, test_sample): def test_max_new_tokens(model_descr, test_sample): model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - expected = opt_pipe(test_sample, max_new_tokens=10)["text"] + expected = opt_pipe(test_sample, max_new_tokens=10) genai_result = pipe.generate(test_sample, max_new_tokens=10) - assert genai_result.texts[0] == expected - - genai_result = pipe.generate(test_sample) - - assert genai_result.texts[0] != expected + compare_results(expected, genai_result) config = pipe.get_generation_config() config.max_new_tokens = 10 genai_result = pipe.generate(test_sample, config) - assert genai_result.texts[0] == expected + compare_results(expected, genai_result) @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( - "test_sample", get_samples_from_dataset(language="fr", length=3) + "test_samples", + [ + (get_samples_from_dataset(language="fr", length=1), "fr"), + (get_samples_from_dataset(language="de", length=1), "de"), + ], ) @pytest.mark.precommit -def test_language_mode_fr(model_descr, test_sample): - model_id, path = model_descr +def test_language_mode(model_descr, test_samples): model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + samples, language = test_samples expected = opt_pipe( - test_sample, max_new_tokens=30, generate_kwargs={"language": "fr"} + samples[0], max_new_tokens=30, generate_kwargs={"language": language} ) - genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|fr|>") - - assert genai_result.texts[0] == expected["text"] - - config = pipe.get_generation_config() - config.max_new_tokens = 30 - config.language = "<|fr|>" - genai_result = pipe.generate(test_sample, config) - - assert genai_result.texts[0] == expected["text"] - - -@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", get_samples_from_dataset(language="de", length=3) -) -@pytest.mark.precommit -def test_language_mode_de(model_descr, test_sample): - model_id, path = model_descr - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe( - test_sample, max_new_tokens=30, generate_kwargs={"language": "de"} + genai_result = pipe.generate( + 
samples[0], max_new_tokens=30, language=f"<|{language}|>" ) - genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|de|>") - - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) config = pipe.get_generation_config() config.max_new_tokens = 30 - config.language = "<|de|>" - genai_result = pipe.generate(test_sample, config) + config.language = f"<|{language}|>" + genai_result = pipe.generate(samples[0], config) - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( - "test_sample", get_samples_from_dataset(language="fr", length=3) + "test_sample", get_samples_from_dataset(language="fr", length=1) ) @pytest.mark.precommit def test_task_mode(model_descr, test_sample): - model_id, path = model_descr model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) expected = opt_pipe( @@ -344,7 +377,7 @@ def test_task_mode(model_descr, test_sample): test_sample, max_new_tokens=30, language="<|fr|>", task="translate" ) - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) config = pipe.get_generation_config() config.max_new_tokens = 30 @@ -352,27 +385,7 @@ def test_task_mode(model_descr, test_sample): config.task = "translate" genai_result = pipe.generate(test_sample, config) - assert genai_result.texts[0] == expected["text"] - - expected = opt_pipe( - test_sample, - max_new_tokens=30, - generate_kwargs={"language": "ru", "task": "translate"}, - ) - - genai_result = pipe.generate( - test_sample, max_new_tokens=30, language="<|ru|>", task="translate" - ) - - assert genai_result.texts[0] == expected["text"] - - config = pipe.get_generation_config() - config.max_new_tokens = 30 - config.language = "<|ru|>" - config.task = "translate" - genai_result = pipe.generate(test_sample, config) - - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) # seems to be equivalent to translate task expected = opt_pipe( @@ -385,7 +398,7 @@ def test_task_mode(model_descr, test_sample): test_sample, max_new_tokens=30, language="<|en|>", task="transcribe" ) - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) config = pipe.get_generation_config() config.max_new_tokens = 30 @@ -393,21 +406,20 @@ def test_task_mode(model_descr, test_sample): config.task = "transcribe" genai_result = pipe.generate(test_sample, config) - assert genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( "test_sample", [ - *get_samples_from_dataset(language="fr", length=2), - *get_samples_from_dataset(language="de", length=2), - *get_samples_from_dataset(language="es", length=2), + *get_samples_from_dataset(language="fr", length=1), + *get_samples_from_dataset(language="de", length=1), + *get_samples_from_dataset(language="es", length=1), ], ) @pytest.mark.precommit def test_language_autodetect(model_descr, test_sample): - model_id, path = model_descr model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) input_features = opt_pipe.feature_extractor(test_sample) @@ -415,189 +427,79 @@ def test_language_autodetect(model_descr, test_sample): # ensure detected language us not english assert language_id != pipe.get_generation_config().lang_to_id["<|en|>"] - expected = opt_pipe( - test_sample, 
- max_new_tokens=30, + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=GenerationConfig(max_new_tokens=30), ) - genai_result = pipe.generate(test_sample, max_new_tokens=30) - - assert genai_result.texts[0] == expected["text"] - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - ], -) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_return_timestamps_short_form(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - # long form audio not supported yet - test_sample = test_sample[: 16000 * 30] - - expected = opt_pipe( - test_sample, - return_timestamps=True, + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=GenerationConfig(return_timestamps=True), ) - genai_result = pipe.generate( - test_sample.tolist(), - return_timestamps=True, - ) - - assert genai_result.texts[0] == expected["text"] - - assert len(genai_result.chunks) == len(expected["chunks"]) - - for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): - assert opt_chunk["text"] == genai_chunk.text - assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) - assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - ], -) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - # long form audio not supported yet - test_sample = test_sample[: 16000 * 30] - - expected = opt_pipe( - test_sample, - return_timestamps=True, - max_new_tokens=15, - generate_kwargs={"language": "en"}, + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=GenerationConfig( + return_timestamps=True, language="en", max_new_tokens=30 + ), ) - genai_result = pipe.generate( - test_sample.tolist(), - max_new_tokens=15, - return_timestamps=True, - language="<|en|>", - ) - assert genai_result.texts[0] == expected["text"] - - assert len(genai_result.chunks) == len(expected["chunks"]) - - for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): - assert opt_chunk["text"] == genai_chunk.text - assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) - if opt_chunk["timestamp"][1]: - assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) - else: - assert opt_chunk["timestamp"][1] == None - assert round(genai_chunk.end_ts, 2) == -1.0 - - -@pytest.mark.parametrize("model_descr", get_whisper_models_list(multilingual=True)) +@pytest.mark.parametrize("model_descr", get_whisper_models_list()) @pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - *get_samples_from_dataset(language="fr", length=10, long_form=True), - ], + "test_sample", get_samples_from_dataset(length=10, long_form=True) ) @pytest.mark.precommit -def test_longform_audio_return_timestamps_multilingual(model_descr, test_sample): - 
model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe( - test_sample, - return_timestamps=True, - ) - +def test_longform_audio(model_descr, test_sample): streamer_result = [] - genai_result = pipe.generate( - test_sample, - return_timestamps=True, - streamer=lambda x: streamer_result.append(x), + hf_result, genai_result = run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=GenerationConfig( + return_timestamps=True, + streamer=lambda x: streamer_result.append(x), + ), ) - assert genai_result.texts[0] == expected["text"] - assert "".join(streamer_result) == expected["text"] + assert "".join(streamer_result) == hf_result["text"] - assert len(genai_result.chunks) == len(expected["chunks"]) - for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): - assert opt_chunk["text"] == genai_chunk.text - assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) - if opt_chunk["timestamp"][1]: - assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) - else: - assert opt_chunk["timestamp"][1] == None - assert round(genai_chunk.end_ts, 2) == -1.0 - - -@pytest.mark.parametrize("model_descr", get_whisper_models_list(en_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - ], -) +@pytest.mark.parametrize("model_descr", get_whisper_models_list()) @pytest.mark.precommit -def test_longform_audio_return_timestamps_en(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe( - test_sample, - return_timestamps=True, +def test_shortform(model_descr): + samples = [] + ds = datasets.load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation" ) - streamer_result = [] + for ds_row in ds: + samples.append(ds_row["audio"]["array"]) - genai_result = pipe.generate( - test_sample, - return_timestamps=True, - streamer=lambda x: streamer_result.append(x), + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=samples, + print_infer_time=True, ) - assert genai_result.texts[0] == expected["text"] - assert "".join(streamer_result) == expected["text"] - - assert len(genai_result.chunks) == len(expected["chunks"]) - - for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): - assert opt_chunk["text"] == genai_chunk.text - assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) - if opt_chunk["timestamp"][1]: - assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) - else: - assert opt_chunk["timestamp"][1] == None - assert round(genai_chunk.end_ts, 2) == -1.0 - - -@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=3, long_form=True), - *get_samples_from_dataset(language="sp", length=3, long_form=True), - ], -) -@pytest.mark.precommit -def test_longform_audio(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe(test_sample, return_timestamps=True) - - genai_result = pipe.generate(test_sample) - - assert genai_result.texts[0] == expected["text"] - - assert genai_result.chunks == None - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( From 1900c76488af5bde8561721b28dda52fe374df66 Mon Sep 17 00:00:00 2001 From: 
Alexander Suvorov Date: Wed, 8 Jan 2025 09:25:43 +0100 Subject: [PATCH 2/7] remove debug --- tests/python_tests/test_whisper_pipeline.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index 1c0871768c..17e839c989 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -38,8 +38,6 @@ def get_whisper_models_list(tiny_only=False): if tiny_only: model_ids = ["openai/whisper-tiny"] - pytest.selected_model_ids = None - if pytest.selected_model_ids: model_ids = [ model_id From 09fe6c427f4e49fe95aa70cb37961396ce4b671c Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 8 Jan 2025 09:30:52 +0100 Subject: [PATCH 3/7] Unfix optimum intel --- samples/export-requirements.txt | 2 +- tests/python_tests/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index 2f71891b7b..af38558656 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -2,7 +2,7 @@ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly openvino-tokenizers~=2025.0.0.0.dev -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index e23eaacc21..c851c71ee5 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu diffusers==0.32.1 -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64" onnx==1.17.0 pytest From 1b98cc0438e643f41a61387ac3f0db292ac17f90 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 8 Jan 2025 10:02:27 +0100 Subject: [PATCH 4/7] Remove debug --- tests/python_tests/test_whisper_pipeline.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index 17e839c989..c89a74f3d7 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -184,29 +184,19 @@ def run_pipeline_with_ref( tmp_path: str, sample: np.ndarray | list[np.ndarray], generation_config: GenerationConfig | None = None, - print_infer_time=False, ): _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path)) if type(sample) is np.ndarray and len(sample.shape) == 1: sample = np.expand_dims(sample, 0) - hf_infer_time, genai_infer_time = 0, 0 hf_result, genai_result = None, None for _sample in sample: - start = time.time() genai_result = run_genai(genai_pipe, _sample, generation_config) - genai_infer_time += time.time() - start - - start = time.time() hf_result = run_huggingface(hf_pipe, _sample, generation_config) - hf_infer_time += time.time() - start compare_results(hf_result, genai_result) - if print_infer_time: - print(f"\nInference time HF: {hf_infer_time:.2f} GenAI: {genai_infer_time:.2f}") - assert 
hf_result is not None assert genai_result is not None @@ -495,7 +485,6 @@ def test_shortform(model_descr): model_id=model_descr[0], tmp_path=model_descr[1], sample=samples, - print_infer_time=True, ) From 125e7714dc7cde3d8faffa61f0cc9f14098067ea Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 8 Jan 2025 10:16:13 +0100 Subject: [PATCH 5/7] Do not return results from run_pipeline --- tests/python_tests/test_whisper_pipeline.py | 28 ++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index c89a74f3d7..52787108da 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -190,18 +190,12 @@ def run_pipeline_with_ref( if type(sample) is np.ndarray and len(sample.shape) == 1: sample = np.expand_dims(sample, 0) - hf_result, genai_result = None, None for _sample in sample: genai_result = run_genai(genai_pipe, _sample, generation_config) hf_result = run_huggingface(hf_pipe, _sample, generation_config) compare_results(hf_result, genai_result) - assert hf_result is not None - assert genai_result is not None - - return hf_result, genai_result - def compare_results(hf_result, genai_result): assert genai_result.texts[0] == hf_result["text"] @@ -455,18 +449,24 @@ def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): ) @pytest.mark.precommit def test_longform_audio(model_descr, test_sample): + _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr) + streamer_result = [] - hf_result, genai_result = run_pipeline_with_ref( - model_id=model_descr[0], - tmp_path=model_descr[1], - sample=test_sample, - generation_config=GenerationConfig( - return_timestamps=True, - streamer=lambda x: streamer_result.append(x), - ), + genai_result = run_genai( + genai_pipe, + test_sample, + config=GenerationConfig(streamer=lambda x: streamer_result.append(x)), + ) + + hf_result = run_huggingface( + hf_pipe, + test_sample, + config=GenerationConfig(return_timestamps=True), ) + compare_results(hf_result, genai_result) + assert "".join(streamer_result) == hf_result["text"] From bff7f909872f78ccdd4d54f9cafc340c97c166d0 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 8 Jan 2025 11:53:36 +0100 Subject: [PATCH 6/7] Use genai config --- tests/python_tests/test_whisper_pipeline.py | 46 +++++++++------------ 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index 52787108da..bce95d8a62 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -11,12 +11,10 @@ from optimum.intel.openvino import OVModelForSpeechSeq2Seq import gc import json -import time import typing import numpy as np import os import pathlib -from dataclasses import dataclass @pytest.fixture(scope="class", autouse=True) @@ -85,6 +83,7 @@ def read_whisper_model(params, **tokenizer_kwargs): model_id, export=True, trust_remote_code=True, + stateful=False, compile=False, device="CPU", load_in_8bit=False, @@ -109,26 +108,17 @@ def read_whisper_model(params, **tokenizer_kwargs): ) -@dataclass -class GenerationConfig: - task: str | None = None - language: str | None = None - return_timestamps: bool = False - max_new_tokens: int | None = None - streamer: typing.Callable[[str], bool] | None = None - - def run_huggingface( pipeline, sample, - config: GenerationConfig | None = None, + config: 
ov_genai.WhisperGenerationConfig | None = None, ): if not config: - config = GenerationConfig() + config = ov_genai.WhisperGenerationConfig() return pipeline( sample, - max_new_tokens=config.max_new_tokens, + max_new_tokens=min(config.max_new_tokens, 444), return_timestamps=config.return_timestamps, generate_kwargs={"language": config.language, "task": config.task}, ) @@ -137,20 +127,20 @@ def run_huggingface( def run_genai( pipeline: ov_genai.WhisperPipeline, sample, - config: GenerationConfig | None = None, + config: ov_genai.WhisperGenerationConfig | None = None, + streamer: typing.Callable[[str], bool] | None = None, ): if not config: - config = GenerationConfig() + config = ov_genai.WhisperGenerationConfig() genai_config = pipeline.get_generation_config() - if config.max_new_tokens: - genai_config.max_new_tokens = config.max_new_tokens + genai_config.max_new_tokens = config.max_new_tokens genai_config.return_timestamps = config.return_timestamps genai_config.task = config.task genai_config.language = f"<|{config.language}|>" if config.language else None - return pipeline.generate(sample, genai_config, streamer=config.streamer) + return pipeline.generate(sample, genai_config, streamer=streamer) def get_samples_from_dataset( @@ -174,7 +164,8 @@ def get_samples_from_dataset( ds = typing.cast(datasets.IterableDataset, ds) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16000)) - ds = ds.take(length) + ds = ds.skip(8) + ds = ds.take(1) return [x["audio"]["array"] for x in ds] @@ -183,7 +174,8 @@ def run_pipeline_with_ref( model_id: str, tmp_path: str, sample: np.ndarray | list[np.ndarray], - generation_config: GenerationConfig | None = None, + generation_config: ov_genai.WhisperGenerationConfig | None = None, + streamer: typing.Callable[[str], bool] | None = None, ): _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path)) @@ -191,7 +183,7 @@ def run_pipeline_with_ref( sample = np.expand_dims(sample, 0) for _sample in sample: - genai_result = run_genai(genai_pipe, _sample, generation_config) + genai_result = run_genai(genai_pipe, _sample, generation_config, streamer) hf_result = run_huggingface(hf_pipe, _sample, generation_config) compare_results(hf_result, genai_result) @@ -413,7 +405,7 @@ def test_language_autodetect(model_descr, test_sample): model_id=model_descr[0], tmp_path=model_descr[1], sample=test_sample, - generation_config=GenerationConfig(max_new_tokens=30), + generation_config=ov_genai.WhisperGenerationConfig(max_new_tokens=30), ) @@ -425,7 +417,7 @@ def test_return_timestamps_short_form(model_descr, test_sample): model_id=model_descr[0], tmp_path=model_descr[1], sample=test_sample, - generation_config=GenerationConfig(return_timestamps=True), + generation_config=ov_genai.WhisperGenerationConfig(return_timestamps=True), ) @@ -437,7 +429,7 @@ def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): model_id=model_descr[0], tmp_path=model_descr[1], sample=test_sample, - generation_config=GenerationConfig( + generation_config=ov_genai.WhisperGenerationConfig( return_timestamps=True, language="en", max_new_tokens=30 ), ) @@ -456,13 +448,13 @@ def test_longform_audio(model_descr, test_sample): genai_result = run_genai( genai_pipe, test_sample, - config=GenerationConfig(streamer=lambda x: streamer_result.append(x)), + streamer=lambda x: streamer_result.append(x), ) hf_result = run_huggingface( hf_pipe, test_sample, - config=GenerationConfig(return_timestamps=True), + config=ov_genai.WhisperGenerationConfig(return_timestamps=True), ) 
compare_results(hf_result, genai_result) From 6b2ee9450434b92bcd4230517df8e8415b264509 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 8 Jan 2025 12:28:52 +0100 Subject: [PATCH 7/7] Install transformers 4.46.3 on Win --- .github/workflows/windows.yml | 6 ++++++ tests/python_tests/test_whisper_pipeline.py | 15 +++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e396671b2c..bfe7432e41 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -310,6 +310,12 @@ jobs: . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels + + # will install transformers 4.46.3 version + # transformers 4.46.3 will enable return_timestamps tests + # this check enabled for windows only. Ticket: 160205. + python -m pip install git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631 + python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke" genai_python_lib_vlm: diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index bce95d8a62..c046d1ae2c 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -15,6 +15,8 @@ import numpy as np import os import pathlib +import importlib.metadata as metadata +from packaging.version import parse @pytest.fixture(scope="class", autouse=True) @@ -164,8 +166,7 @@ def get_samples_from_dataset( ds = typing.cast(datasets.IterableDataset, ds) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16000)) - ds = ds.skip(8) - ds = ds.take(1) + ds = ds.take(length) return [x["audio"]["array"] for x in ds] @@ -193,8 +194,13 @@ def compare_results(hf_result, genai_result): assert genai_result.texts[0] == hf_result["text"] # transformers 4.47 updated return_timestamps implementation - # enable once genai implementation aligned with trasformets. Ticket 160205. - return + # remove once genai implementation aligned with transformers. Ticket 160205. + transformers_version_greater_4_47 = parse( + metadata.version("transformers") + ) >= parse("4.47.0") + + if transformers_version_greater_4_47: + return if "chunks" not in hf_result and genai_result.chunks is None: return @@ -448,6 +454,7 @@ def test_longform_audio(model_descr, test_sample): genai_result = run_genai( genai_pipe, test_sample, + config=ov_genai.WhisperGenerationConfig(return_timestamps=True), streamer=lambda x: streamer_result.append(x), )