openvinotoolkit · ilya-lavrenov · Apr 9, 2024
diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
@@ -38,20 +38,20 @@ jobs:
           source ./ov/setupvars.sh
           convert_tokenizer ./Mistral-7B-v0.1/pytorch/dldt/FP16/ --output ./Mistral-7B-v0.1/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
           timeout 50s ./build/beam_search_causal_lm ./Mistral-7B-v0.1/pytorch/dldt/FP16/ 69 > ./pred.txt
-          python -c " 
-          import transformers 
-          with open('pred.txt', 'r') as file: 
-              predictions = file.read() 
-          tokenizer = transformers.LlamaTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1') 
-          tokenized = tokenizer('69', return_tensors='pt') 
-          for beam in transformers.LlamaForCausalLM.from_pretrained('mistralai/Mistral-7B-v0.1').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): 
-              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' 
-              idx = predictions.find(ref) 
-              if -1 == idx: 
-                  raise RuntimeError(f'Missing "{ref=}" from predictions') 
-              predictions = predictions[:idx] + predictions[idx + len(ref):] 
-          " 
-          echo "69" passed 
+          python -c "
+          from transformers import AutoModelForCausalLM, AutoTokenizer  # transformers supplies the reference beams
+          with open('pred.txt', 'r') as file:
+              predictions = file.read()
+          tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1')
+          tokenized = tokenizer('69', return_tensors='pt')
+          for beam in AutoModelForCausalLM.from_pretrained('mistralai/Mistral-7B-v0.1').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'  # skip the prompt tokens, decode only the continuation
+              idx = predictions.find(ref)
+              if -1 == idx:
+                  raise RuntimeError(f'Missing {ref=} from predictions')  # no inner double quotes here: the shell would strip them
+              predictions = predictions[:idx] + predictions[idx + len(ref):]  # consume the match so a repeated beam needs its own line
+          "
+          echo "69" passed
 
   cpp-greedy_causal_lm-ubuntu:
     runs-on: ubuntu-20.04-8-cores
@@ -237,6 +237,20 @@ jobs:
           source ./ov/setupvars.sh
           convert_tokenizer ./Qwen-7B-Chat/pytorch/dldt/FP16/ --output ./Qwen-7B-Chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
           timeout 50s ./build/beam_search_causal_lm ./Qwen-7B-Chat/pytorch/dldt/FP16/ 69 > ./pred.txt
+          python -c "
+          from transformers import AutoModelForCausalLM, AutoTokenizer  # transformers supplies the reference beams
+          with open('pred.txt', 'r') as file:
+              predictions = file.read()
+          tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B-Chat', trust_remote_code=True)
+          tokenized = tokenizer('69', return_tensors='pt')
+          for beam in AutoModelForCausalLM.from_pretrained('Qwen/Qwen-7B-Chat', trust_remote_code=True).generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'  # skip the prompt tokens, decode only the continuation
+              idx = predictions.find(ref)
+              if -1 == idx:
+                  raise RuntimeError(f'Missing {ref=} from predictions')  # no inner double quotes here: the shell would strip them
+              predictions = predictions[:idx] + predictions[idx + len(ref):]  # consume the match so a repeated beam needs its own line
+          "
+          echo "69" passed
 
   cpp-beam_search_causal_lm-Qwen1_5-7B-Chat:
     runs-on: ubuntu-20.04-16-cores
@@ -264,6 +278,22 @@ jobs:
           source ./ov/setupvars.sh
           convert_tokenizer ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ --output ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
           timeout 50s ./build/beam_search_causal_lm ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ "你好！" > ./pred_qwen15.txt
+          python -c "
+          from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig  # transformers supplies the reference beams
+          with open('pred_qwen15.txt', 'r') as file:
+              predictions = file.read()
+          tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen1.5-7B-Chat', trust_remote_code=True)
+          tokenized = tokenizer('你好！', return_tensors='pt')
+          model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen1.5-7B-Chat', trust_remote_code=True)
+          model.generation_config = GenerationConfig(num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False)
+          for beam in model.generate(**tokenized):
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'  # skip the prompt tokens, decode only the continuation
+              idx = predictions.find(ref)
+              if -1 == idx:
+                  raise RuntimeError(f'Missing {ref=} from predictions')  # no inner double quotes here: the shell would strip them
+              predictions = predictions[:idx] + predictions[idx + len(ref):]  # consume the match so a repeated beam needs its own line
+          "
+          echo "你好！" passed
 
   cpp-beam_search_causal_lm-Phi-2:
     runs-on: ubuntu-20.04-16-cores
@@ -291,6 +321,20 @@ jobs:
           source ./ov/setupvars.sh
           convert_tokenizer ./Phi-2/pytorch/dldt/FP16/ --output ./Phi-2/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
           timeout 50s ./build/beam_search_causal_lm ./Phi-2/pytorch/dldt/FP16/ 69 > ./pred.txt
+          python -c "
+          from transformers import AutoModelForCausalLM, AutoTokenizer  # transformers supplies the reference beams
+          with open('pred.txt', 'r') as file:
+              predictions = file.read()
+          tokenizer = AutoTokenizer.from_pretrained('microsoft/phi-2')
+          tokenized = tokenizer('69', return_tensors='pt')
+          for beam in AutoModelForCausalLM.from_pretrained('microsoft/phi-2').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'  # skip the prompt tokens, decode only the continuation
+              idx = predictions.find(ref)
+              if -1 == idx:
+                  raise RuntimeError(f'Missing {ref=} from predictions')  # no inner double quotes here: the shell would strip them
+              predictions = predictions[:idx] + predictions[idx + len(ref):]  # consume the match so a repeated beam needs its own line
+          "
+          echo "69" passed
 
   cpp-beam_search_causal_lm-notus-7b-v1:
     runs-on: ubuntu-20.04-16-cores
@@ -318,6 +362,20 @@ jobs:
           source ./ov/setupvars.sh
           convert_tokenizer ./notus-7b-v1/pytorch/dldt/FP16/ --output ./notus-7b-v1/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
           timeout 50s ./build/beam_search_causal_lm ./notus-7b-v1/pytorch/dldt/FP16/ 69 > ./pred.txt
+          python -c "
+          from transformers import AutoModelForCausalLM, AutoTokenizer  # transformers supplies the reference beams
+          with open('pred.txt', 'r') as file:
+              predictions = file.read()
+          tokenizer = AutoTokenizer.from_pretrained('argilla/notus-7b-v1')
+          tokenized = tokenizer('69', return_tensors='pt')
+          for beam in AutoModelForCausalLM.from_pretrained('argilla/notus-7b-v1').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
+              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'  # skip the prompt tokens, decode only the continuation
+              idx = predictions.find(ref)
+              if -1 == idx:
+                  raise RuntimeError(f'Missing {ref=} from predictions')  # no inner double quotes here: the shell would strip them
+              predictions = predictions[:idx] + predictions[idx + len(ref):]  # consume the match so a repeated beam needs its own line
+          "
+          echo "69" passed
 
   cpp-speculative_decoding_lm-ubuntu:
     runs-on: ubuntu-20.04-16-cores
@@ -331,7 +389,7 @@ jobs:
       - name: Install OpenVINO
         run: |
           mkdir ./ov/
-          curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2023.3/linux/l_openvino_toolkit_ubuntu20_2023.3.0.13775.ceeafaf64f3_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz
+          curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz
           sudo ./ov/install_dependencies/install_openvino_dependencies.sh
       - name: Download, convert and build
         run: |
@@ -388,12 +446,12 @@ jobs:
       - name: Compare
         run: |
           python -c "
-          import transformers
+          from transformers import AutoModelForCausalLM, AutoTokenizer
           with open('pred_greedy.txt', 'r') as file:
               predictions = file.read()
-          tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
+          tokenizer = AutoTokenizer.from_pretrained('microsoft/phi-1_5')
           tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
-          for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
+          for output in AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
               ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
               idx = predictions.find(ref)
               if -1 == idx: