Merge remote-tracking branch 'upstream/master' into cb-by-default

openvinotoolkit · Jan 7, 2025 · 6038663 · 6038663
2 parents 2f99472 + 48dfd16
commit 6038663
Show file tree

Hide file tree

Showing 37 changed files with 1,180 additions and 866 deletions.
diff --git a/.github/workflows/llm_bench-python.yml → .github/workflows/genai-tools.yml b/.github/workflows/llm_bench-python.yml → .github/workflows/genai-tools.yml
@@ -1,7 +1,7 @@
 # This workflow will install Python dependencies, run tests and lint with a single version of Python
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 
-name: llm_bench Python Test
+name: GenAI tools
 
 on:
   workflow_dispatch:
@@ -46,7 +46,8 @@ jobs:
         commit_packages_to_provide: wheels
         revision: latest_available_commit
 
-  build:
+  llm_bench:
+    name: 'LLM bench tests'
     defaults:
       run:
         shell: bash
@@ -60,7 +61,6 @@ jobs:
       OV_INSTALL_DIR: ${{ github.workspace }}/ov
       SRC_DIR: ${{ github.workspace }}
       LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
-      WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
 
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -70,6 +70,12 @@ jobs:
         uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
           python-version: ${{ matrix.python-version }}
+      - name: Lint with flake8
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install flake8 pytest black
+          # stop the build if there are Python syntax errors or undefined names
+          python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg
       - name: Download OpenVINO package
         uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
         with:
@@ -78,59 +84,42 @@ jobs:
           merge-multiple: true
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          python -m pip install flake8 pytest black
           python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
           python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
-          GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
+          python -m pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
         working-directory: ${{ env.OV_INSTALL_DIR }}
-      - name: Lint with flake8
-        run: |
-          # stop the build if there are Python syntax errors or undefined names
-          python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg
-          python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg
-      - name: Create code style diff for samples
-        if: failure()
-        run: |
-          python -m black -l 160 -S ${{ env.LLM_BENCH_PYPATH }}/
-          git diff > llm.bench_diff.diff
-      - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
-        if: failure()
-        with:
-          name: llm.bench_diff
-          path: llm.bench_diff.diff
-      - name: Test native pytorch model on Linux
+      - name: Test native pytorch model
         run: |
           git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen
           python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt -ic 20
           rm -rf tiny-random-qwen
         env:
           GIT_LFS_SKIP_SMUDGE: 0
-      - name: Test tiny-random-baichuan2 on Linux Optimum Intel
+      - name: Test tiny-random-baichuan2 Optimum Intel
         run: |
           optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16
           python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum -ic 10
           rm -rf ./ov_models/tiny-random-baichuan2
-      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel
+      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov Optimum Intel
         run: |
           huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7
           python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum --num_steps 4
-      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI
+      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov with GenAI
         run: |
           python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --num_steps 4
-      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA
+      - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov with GenAI and LoRA
         run: |
           wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
           python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 --num_steps 4
           rm -rf ./ov_models/lcm_dreamshaper_v7/
-      - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
+      - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Decoding via GenAI
         run: |
           optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
           optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
           python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4 -ic 20
           python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5 -ic 20
           rm -rf ov_models/TinyLlama-1.1B-Chat-v1.0
-      - name: Test whisper-tiny on Linux
+      - name: Test whisper-tiny via GenAI
         run: |
           GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
           cd multilingual_librispeech
@@ -143,60 +132,64 @@ jobs:
           python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1
           rm -rf ./ov_models/whisper-tiny
           rm -rf multilingual_librispeech
-      - name: Text InternVL2-1B on Linux
+      - name: Text InternVL2-1B via GenAI
         run: |
           optimum-cli export openvino --model OpenGVLab/InternVL2-1B ./ov_models/internvl2-1B --task image-text-to-text --trust-remote-code
           python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20
           python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20 --optimum
           rm -rf ./ov_models/internvl2-1B
-      - name: WWB Tests
-        run: |
-          pip install git+https://github.com/huggingface/optimum-intel.git
-          GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
-          python -m pytest -v ${{ env.WWB_PATH }}/tests
-  stateful:
+
+  wwb:
+    name: 'WWB tests'
     defaults:
       run:
         shell: bash
     runs-on: ubuntu-22.04
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.11"]
     needs: [ openvino_download ]
     env:
       OV_INSTALL_DIR: ${{ github.workspace }}/ov
       SRC_DIR: ${{ github.workspace }}
-      LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench
       WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark
 
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           submodules: recursive
-      - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
-          python-version: "3.11"
+          python-version: ${{ matrix.python-version }}
+      - name: Lint with flake8
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install flake8 pytest black
+          # stop the build if there are Python syntax errors or undefined names
+          python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg
       - name: Download OpenVINO package
         uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
         with:
           name: ${{ needs.openvino_download.outputs.ov_artifact_name }}
           path: ${{ env.OV_INSTALL_DIR }}
           merge-multiple: true
-      - name: Test stateful
+      - name: Install dependencies
         run: |
           python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
           python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }}
-          GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
-          python ${{ env.LLM_BENCH_PYPATH }}/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ${{ env.SRC_DIR }} --stateful
-          grep beam_idx ${{ env.SRC_DIR }}/pytorch/dldt/FP32/openvino_model.xml
+          python -m pip install -r ${{ env.WWB_PATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }}
+          python -m pip install git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
         working-directory: ${{ env.OV_INSTALL_DIR }}
       - name: WWB Tests
         run: |
-          pip install pytest
-          pip install git+https://github.com/huggingface/optimum-intel.git
-          GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }}
+          python -m pip install -v ${{ env.WWB_PATH }}
           python -m pytest -v ${{ env.WWB_PATH }}/tests
 
   Overall_Status:
     name: ci/gha_overall_status_llm_bench
-    needs: [openvino_download, build, stateful]
+    needs: [openvino_download, llm_bench, wwb]
     if: ${{ always() }}
     runs-on: ubuntu-latest
     steps:

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
@@ -1,4 +1,4 @@
-name: macOS (12, Python 3.9)
+name: macOS (12, Python 3.10)
 on:
   workflow_dispatch:
   pull_request:
@@ -16,7 +16,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  PYTHON_VERSION: '3.9'
+  PYTHON_VERSION: '3.10'
   OV_BRANCH: master
   OV_TARBALL: ''
 

diff --git a/samples/deployment-requirements.txt b/samples/deployment-requirements.txt
@@ -2,4 +2,4 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino_genai~=2025.0.0.0.dev
 librosa==0.10.2.post1  # For Whisper
-pillow==11.0.0  # Image processing for VLMs
+pillow==11.1.0  # Image processing for VLMs
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0  # For Qwen
 transformers_stream_generator==0.0.5  # For Qwen

diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
@@ -28,6 +28,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
 
     bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
     utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control);
+    utils::apply_gather_before_matmul_transformation(model);
 
     initialize_pipeline(model, scheduler_config, properties, device_config, core);
 }
@@ -444,7 +445,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
     const float * logits_data = logits.data<float>();
     ov::Shape logits_shape = logits.get_shape();
     OPENVINO_ASSERT(logits_shape.size() == 3);
-    size_t batch_seq_len = logits_shape[1], vocab_size = logits_shape[2];
+    size_t vocab_size = logits_shape[2];
     for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
         SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id];
         // requests not scheduled, in decoding phase or not echoing are not processed
@@ -454,26 +455,25 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
 
         size_t num_running_sequences = sequence_group->num_running_seqs();
         OPENVINO_ASSERT(num_running_sequences == 1);
-        size_t actual_seq_len = sequence_group->get_num_scheduled_tokens();
-        size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len);
+        size_t output_seq_len = sequence_group->get_output_seq_len();
 
         const float * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens;
 
         size_t num_prompt_tokens_processed = sequence_group->get_num_processed_tokens();
-        OPENVINO_ASSERT(num_prompt_tokens_processed + actual_seq_len <= sequence_group->get_prompt_len());
+        OPENVINO_ASSERT(num_prompt_tokens_processed + output_seq_len <= sequence_group->get_prompt_len());
 
         // if we processed the whole prompt we don't include last logprob as it will be processed by the sampler (it's already completion)
         // otherwise we include it as it will be used in the next part of the prompt
         int exclude_last_logprob = 1;
-        if (num_prompt_tokens_processed + actual_seq_len < sequence_group->get_prompt_len())
+        if (num_prompt_tokens_processed + output_seq_len < sequence_group->get_prompt_len())
             exclude_last_logprob = 0;
 
         // if we start processing the prompt we add "fake" log prob for the first position (begin of sequence)
         if (num_prompt_tokens_processed == 0)
             sequence_group->append_prompt_log_prob(1.0);
 
         for (int token_logits_offset = 0, token_id_offset = num_prompt_tokens_processed + 1;
-             token_logits_offset < actual_seq_len - exclude_last_logprob;
+             token_logits_offset < output_seq_len - exclude_last_logprob;
              token_logits_offset++, token_id_offset++) {
 
             const float* token_logits = (sequence_group_logits_data + token_logits_offset * vocab_size);
@@ -498,7 +498,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
 
             sequence_group->append_prompt_log_prob(token_logit - max_value - log_sum);
         }
-        currently_processed_tokens += padded_amount_of_processed_tokens * num_running_sequences;
+        currently_processed_tokens += output_seq_len * num_running_sequences;
         // For max_new_tokens == 0, we don't reach sampling so need to notify handle separately
         if(sequence_group->get_sampling_parameters().max_new_tokens == 0) {
             sequence_group->notify_handle_echo_only();

diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
@@ -230,9 +230,9 @@ void GenerationConfig::validate() const {
         OPENVINO_ASSERT(temperature > 0, "When 'do_sample' is true, temperature must be a strictly positive float, but got ", temperature);
     } else {
         // parameters requiring multinomial
-        OPENVINO_ASSERT(top_k == std::numeric_limits<size_t>::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k);
-        OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p);
-        OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature);
+        // OPENVINO_ASSERT(top_k == std::numeric_limits<size_t>::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k);
+        // OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p);
+        // OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature);
     }
 
     if (is_beam_search()) {
@@ -252,10 +252,10 @@ void GenerationConfig::validate() const {
         }
     } else {
         // parameters requiring beam search
-        OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups);
-        OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits<size_t>::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size);
-        OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling");
-        OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling");
+        // OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups);
+        // OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits<size_t>::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size);
+        // OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling");
+        // OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling");
     }
 
     // assistant generation