From a2379a97e902605a5cf866ca27d350350ba3f7d3 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 26 Feb 2024 14:36:14 +0100 Subject: [PATCH 01/12] initial test version --- optimum/intel/openvino/modeling_decoder.py | 75 +++++++++++++++++----- tests/openvino/test_modeling.py | 13 ++-- 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 8bcf877bff..3204e084ab 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -17,6 +17,7 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict, Optional, Tuple, Union +from datetime import datetime import numpy as np import openvino @@ -210,6 +211,7 @@ def update_pkv_precision(self, force_fp32=False): if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) self.request = None + self.compiled_model = None def _save_pretrained(self, save_directory: Union[str, Path]): """ @@ -335,6 +337,7 @@ def normalized_config(self): def compile(self): if self.request is None: super().compile() + self.compiled_model =self.request self.request = self.request.create_infer_request() def _make_stateful(self): @@ -353,6 +356,18 @@ class OVModelForCausalLM(OVBaseDecoderModel, GenerationMixin): export_feature = "text-generation" auto_model_class = AutoModelForCausalLM + def generate(self, *args, **kwargs): + self.compile() + infer_context = [self.compiled_model.create_infer_request()] + kwargs["infer_context"] = infer_context + return super().generate(*args, **kwargs) + + def __call__(self, *args, **kwargs): + self.compile() + infer_context = [self.compiled_model.create_infer_request()] + kwargs["infer_context"] = infer_context + return super().__call__(*args, **kwargs) + @add_start_docstrings_to_model_forward( INPUTS_DOCSTRING.format("batch_size, sequence_length") + TEXT_GENERATION_EXAMPLE.format( @@ -375,10 +390,13 @@ def prepare_inputs( batch_size = input_ids.shape[0] if self.config.model_type == "bloom": batch_size *= self.config.num_attention_heads - + #print("prepare inputs - input_ids:",input_ids) inputs = {} past_len = 0 + #print("model stateful", self.stateful) + #print("use cache", self.use_cache) if not self.stateful: + #print("prepare inputs - past_key_values:",past_key_values) if past_key_values is not None: if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: past_len = past_key_values[0][1].shape[-2] @@ -417,13 +435,16 @@ def prepare_inputs( inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) else: # past_key_values are not used explicitly, instead they are handled inside the model - if past_key_values is None: + #print("past_values", past_key_values) + #if past_key_values is None: # This is the first iteration in a sequence, reset all states - if self.request is not None: - self.request.reset_state() + #if infer_request is not None: + # infer_request.reset_state() + # print("reseting state") # Set initial value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used - self.next_beam_idx = np.arange(batch_size, dtype=int) + #past_key_values = [np.arange(batch_size, dtype=int)] + ... 
inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed @@ -451,8 +472,10 @@ def prepare_inputs( if "beam_idx" in self.input_names: inputs["beam_idx"] = ( - self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) + past_key_values[0] if past_key_values is not None else np.arange(batch_size, dtype=int) ) + #if past_key_values is not None: + # print("type",type(past_key_values[0])) return inputs @@ -462,10 +485,10 @@ def forward( attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, position_ids: Optional[torch.LongTensor] = None, + infer_context: Optional[list[openvino.runtime.InferRequest]] = None, **kwargs, ) -> CausalLMOutputWithPast: self.compile() - inputs = self.prepare_inputs( input_ids=input_ids, attention_mask=attention_mask, @@ -473,21 +496,32 @@ def forward( position_ids=position_ids, **kwargs, ) - # Run inference - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + if self.stateful and past_key_values is not None: + infer_request = past_key_values[1] + else: + infer_request = infer_context[0] + #print("infer request", infer_context[0]) + #print("Inputs", inputs) + #print("past_values", past_key_values) + start = datetime.now() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + end = datetime.now() + print(start) + print(end) + print("Infernece time [s]", ((end - start).total_seconds())) + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if self.stateful: # Need a marker to differentiate the first generate iteration from the others in # the first condition at the function beginning above. # It should be something that is not None and it should be True when converted to Boolean. 
- past_key_values = ((),) + past_key_values = ((inputs["beam_idx"]),infer_request) if not self.stateful: if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) - past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + past_key_values = tuple(infer_context[0].get_tensor(key).data for key in self.key_value_output_names) if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( @@ -496,6 +530,7 @@ def forward( else: past_key_values = None + #print("logits", logits) return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation @@ -503,7 +538,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly attention_mask = kwargs.get("attention_mask", None) use_cache = kwargs.get("use_cache", None) - + infer_context = kwargs.get("infer_context", None) position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation @@ -516,6 +551,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache, + "infer_context": infer_context, "position_ids": position_ids, "attention_mask": attention_mask, } @@ -532,9 +568,12 @@ def _reorder_cache( if self.stateful: # TODO: Apply it differently based on model type # TODO: At least for bloom we need to replicate values for each attention head - self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration + past_key_values = ((np.array(beam_idx)),past_key_values[1]) # save beam_idx to be used as an input in the next iteration return past_key_values else: + #print("_reorder_cache return", tuple( + # tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values + #)) return tuple( tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values ) @@ -650,8 +689,10 @@ def _reorder_cache( batch_size = beam_idx.shape[0] indices = np.array(range(batch_size * self.config.num_attention_heads)) indices = indices.reshape([batch_size, self.config.num_attention_heads]) - self.next_beam_idx = np.take(indices, beam_idx, 0).flatten() - return past_key_values + #self.next_beam_idx = np.take(indices, beam_idx, 0).flatten() + #return past_key_values + #print("_reorder_cache output",np.take(indices, beam_idx, 0).flatten()) + return ((np.take(indices, beam_idx, 0).flatten()),past_key_values[1]) else: standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx)) reordered_past = tuple( diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 2188b7061f..ae1c20becc 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -502,6 +502,7 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + print("model", ov_model.stateful, ov_model.use_cache) 
self.assertIsInstance(ov_model.config, PretrainedConfig) self.assertTrue(ov_model.use_cache) @@ -518,13 +519,13 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) - self.assertTrue("past_key_values" in ov_outputs) - self.assertIsInstance(ov_outputs.past_key_values, tuple) + #self.assertTrue("past_key_values" in ov_outputs) + #self.assertIsInstance(ov_outputs.past_key_values, tuple) - is_stateful = ov_model.config.model_type not in {"gpt_bigcode", "llama"} and self.IS_SUPPORT_STATEFUL + is_stateful = self.IS_SUPPORT_STATEFUL self.assertEqual(ov_model.stateful, is_stateful) - if is_stateful: - self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) + #if is_stateful: + # self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) with torch.no_grad(): transformers_outputs = transformers_model(**tokens) @@ -1259,7 +1260,7 @@ def test_compare_with_and_without_past_key_values(self): **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) + #self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) self.assertTrue( From 6798a6602e259994a5edd5a6258642508fe075e5 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 28 Feb 2024 16:26:51 +0100 Subject: [PATCH 02/12] more tests and code cleanup --- optimum/intel/openvino/modeling_base.py | 1 + optimum/intel/openvino/modeling_decoder.py | 64 +++++--------- tests/openvino/test_modeling.py | 98 ++++++++++++++++++++-- tests/openvino/utils_tests.py | 36 ++++++++ 4 files changed, 150 insertions(+), 49 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 094840c297..681d1425a7 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -85,6 +85,7 @@ def __init__( self.model = model self.request = None + self.compiled_model = None if enable_compilation: self.compile() diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 3204e084ab..eac8a1b657 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -17,7 +17,6 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict, Optional, Tuple, Union -from datetime import datetime import numpy as np import openvino @@ -133,7 +132,7 @@ def __init__( self.key_value_output_names = [key for key in self.output_names if "present" in key] self._original_model = self.model.clone() # keep original model for serialization self._pkv_precision = Type.f32 - self.next_beam_idx = None + # self.next_beam_idx = None self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -335,10 +334,10 @@ def normalized_config(self): return NormalizedConfigManager.get_normalized_config_class(self.config.model_type)(self.config) def compile(self): - if self.request is None: + if self.compiled_model is None: super().compile() - self.compiled_model =self.request - self.request = self.request.create_infer_request() + self.compiled_model = self.request + # self.request = self.request.create_infer_request() 
def _make_stateful(self): patch_stateful(self.config, self.model) @@ -390,13 +389,9 @@ def prepare_inputs( batch_size = input_ids.shape[0] if self.config.model_type == "bloom": batch_size *= self.config.num_attention_heads - #print("prepare inputs - input_ids:",input_ids) inputs = {} past_len = 0 - #print("model stateful", self.stateful) - #print("use cache", self.use_cache) if not self.stateful: - #print("prepare inputs - past_key_values:",past_key_values) if past_key_values is not None: if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: past_len = past_key_values[0][1].shape[-2] @@ -433,18 +428,6 @@ def prepare_inputs( else: shape[1] = 0 inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) - else: - # past_key_values are not used explicitly, instead they are handled inside the model - #print("past_values", past_key_values) - #if past_key_values is None: - # This is the first iteration in a sequence, reset all states - #if infer_request is not None: - # infer_request.reset_state() - # print("reseting state") - # Set initial value for the next beam_idx input that will be used at the current iteration - # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used - #past_key_values = [np.arange(batch_size, dtype=int)] - ... inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed @@ -474,8 +457,6 @@ def prepare_inputs( inputs["beam_idx"] = ( past_key_values[0] if past_key_values is not None else np.arange(batch_size, dtype=int) ) - #if past_key_values is not None: - # print("type",type(past_key_values[0])) return inputs @@ -488,7 +469,6 @@ def forward( infer_context: Optional[list[openvino.runtime.InferRequest]] = None, **kwargs, ) -> CausalLMOutputWithPast: - self.compile() inputs = self.prepare_inputs( input_ids=input_ids, attention_mask=attention_mask, @@ -498,25 +478,24 @@ def forward( ) # Run inference if self.stateful and past_key_values is not None: + # for stateful models, infer request is created in generate and __call_ methods and passed in the cycle via past_key_values param infer_request = past_key_values[1] else: - infer_request = infer_context[0] - #print("infer request", infer_context[0]) - #print("Inputs", inputs) - #print("past_values", past_key_values) - start = datetime.now() + if infer_context[0] is not None: + infer_request = infer_context[ + 0 + ] # Use passed inference request if provided in kwargs, create new one overwise + else: + self.compile() + infer_request = self.compiled_model.create_infer_request() infer_request.start_async(inputs, share_inputs=True) infer_request.wait() - end = datetime.now() - print(start) - print(end) - print("Infernece time [s]", ((end - start).total_seconds())) logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) if self.stateful: # Need a marker to differentiate the first generate iteration from the others in # the first condition at the function beginning above. # It should be something that is not None and it should be True when converted to Boolean. 
- past_key_values = ((inputs["beam_idx"]),infer_request) + past_key_values = ((inputs["beam_idx"]), infer_request) if not self.stateful: if self.use_cache: @@ -530,7 +509,6 @@ def forward( else: past_key_values = None - #print("logits", logits) return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation @@ -568,12 +546,12 @@ def _reorder_cache( if self.stateful: # TODO: Apply it differently based on model type # TODO: At least for bloom we need to replicate values for each attention head - past_key_values = ((np.array(beam_idx)),past_key_values[1]) # save beam_idx to be used as an input in the next iteration + past_key_values = ( + (np.array(beam_idx)), + past_key_values[1], + ) # save beam_idx and infer_request to be used as an input in the next iteration return past_key_values else: - #print("_reorder_cache return", tuple( - # tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values - #)) return tuple( tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values ) @@ -689,10 +667,10 @@ def _reorder_cache( batch_size = beam_idx.shape[0] indices = np.array(range(batch_size * self.config.num_attention_heads)) indices = indices.reshape([batch_size, self.config.num_attention_heads]) - #self.next_beam_idx = np.take(indices, beam_idx, 0).flatten() - #return past_key_values - #print("_reorder_cache output",np.take(indices, beam_idx, 0).flatten()) - return ((np.take(indices, beam_idx, 0).flatten()),past_key_values[1]) + # self.next_beam_idx = np.take(indices, beam_idx, 0).flatten() + # return past_key_values + # print("_reorder_cache output",np.take(indices, beam_idx, 0).flatten()) + return ((np.take(indices, beam_idx, 0).flatten()), past_key_values[1]) else: standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx)) reordered_past = tuple( diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index ae1c20becc..df96a7fc5f 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -50,7 +50,7 @@ set_seed, ) from transformers.onnx.utils import get_preprocessor -from utils_tests import MODEL_NAMES +from utils_tests import MODEL_NAMES, run_on_multiple_threads from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( @@ -519,13 +519,9 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) - #self.assertTrue("past_key_values" in ov_outputs) - #self.assertIsInstance(ov_outputs.past_key_values, tuple) is_stateful = self.IS_SUPPORT_STATEFUL self.assertEqual(ov_model.stateful, is_stateful) - #if is_stateful: - # self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) with torch.no_grad(): transformers_outputs = transformers_model(**tokens) @@ -536,6 +532,51 @@ def test_compare_to_transformers(self, model_arch): del ov_model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + if "llama_gptq" in model_arch: + self.skipTest("Not supported without gpu and disable_exllama=True option") + set_seed(SEED) + ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + self.assertIsInstance(ov_model.config, 
PretrainedConfig) + self.assertTrue(ov_model.use_cache) + self.assertEqual(ov_model.stateful, self.IS_SUPPORT_STATEFUL) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs_list = ["This is a sample", "Here is another sample", "That's the thrid one", "This is the last sample"] + tokens_list = [ + tokenizer(inputs, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None) + for inputs in inputs_list + ] + + def run_ov_model(tokens, transformers_model, ov_model): + # global ov_model, transformers_model + position_ids = None + if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + input_shape = tokens["input_ids"].shape + position_ids = ( + torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) + ) + ov_outputs = ov_model(**tokens, position_ids=position_ids) + + self.assertTrue("logits" in ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + # self.assertTrue("past_key_values" in ov_outputs) + # self.assertIsInstance(ov_outputs.past_key_values, tuple) + # if self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode": + # self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens) + # Compare tensor outputs + self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + + run_on_multiple_threads(run_ov_model, tokens_list, (transformers_model, ov_model)) + + del transformers_model + del ov_model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -553,6 +594,30 @@ def test_pipeline(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False, compile=False) + model.config.encoder_no_repeat_ngram_size = 0 + model.to("cpu") + model.half() + model.compile() + + def run_ov_model(input_text, model): + # Tokenizer is not supposed to be shared by multiple threads + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + outputs = pipe(input_text, max_length=10) + self.assertEqual(pipe.device, model.device) + for i in range(len(outputs)): + self.assertTrue(all(input_text[i] in item["generated_text"] for item in outputs[i])) + del pipe + + inputs_list = [["This is a sample"], ["This is a second sample"], ["This is a third sample"]] + run_on_multiple_threads(run_ov_model, inputs_list, [model]) + del model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_multiple_inputs(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -569,6 +634,27 @@ def test_multiple_inputs(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_multiple_inputs_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token + texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"] + tokens = tokenizer(texts, padding=True, return_tensors="pt") + 
generation_config = GenerationConfig(encoder_no_repeat_ngram_size=0, max_new_tokens=20, num_beams=2) + + def run_ov_model(tokens, model): + outputs = model.generate(**tokens, generation_config=generation_config) + self.assertIsInstance(outputs, torch.Tensor) + self.assertEqual(outputs.shape[0], 3) + + tokens_list = [tokens, tokens, tokens, tokens] # running in 4 threads + run_on_multiple_threads(run_ov_model, tokens_list, [model]) + del model + gc.collect() + def test_model_and_decoder_same_device(self): model_id = MODEL_NAMES["gpt2"] model = OVModelForCausalLM.from_pretrained(model_id, export=True) @@ -1260,7 +1346,7 @@ def test_compare_with_and_without_past_key_values(self): **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - #self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) + # self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) self.assertTrue( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 8fabb34e38..5a1f65663a 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import threading + import numpy as np import torch @@ -132,3 +134,37 @@ def get_num_quantized_nodes(ov_model): if "4" in elem.get_output_element_type(i).get_type_name(): num_int4 += 1 return num_fake_quantize, num_int8, num_int4 + + +### Multithreading + + +class OVThread(threading.Thread): + def __init__(self, target, args): + super().__init__() + self.target = target + self.args = args + + def run(self): + self.exception = None + try: + self.target(*self.args) + except Exception as e: + self.exception = e + + def join(self): + super().join() + if self.exception: + raise self.exception + + +# Each set of args is run in a separate thread. +# Amount of such sets define how many threads are spawned. 
+def run_on_multiple_threads(target, list, extra_args): + threads = [] + for input in list: + threads.append(OVThread(target=target, args=(input, *extra_args))) + for thread in threads: + thread.start() + for thread in threads: + thread.join() From be1a32db558137c798de2d77f30b106cea498999 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 29 Feb 2024 16:34:55 +0100 Subject: [PATCH 03/12] fix python3.8 execution --- optimum/intel/openvino/modeling_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 302ccdd540..60214669bd 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -16,7 +16,7 @@ import os from pathlib import Path from tempfile import TemporaryDirectory -from typing import Dict, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import openvino @@ -467,7 +467,7 @@ def forward( attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, position_ids: Optional[torch.LongTensor] = None, - infer_context: Optional[list[openvino.runtime.InferRequest]] = None, + infer_context: Optional[List[openvino.runtime.InferRequest]] = None, **kwargs, ) -> CausalLMOutputWithPast: inputs = self.prepare_inputs( From fe71151a0179a367fa975c8f9637826ceb8ccb0b Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 6 Mar 2024 14:09:12 +0100 Subject: [PATCH 04/12] test fixes for latest transformers and review fixes --- optimum/intel/openvino/modeling_decoder.py | 19 +++++-------------- tests/openvino/test_modeling.py | 9 ++++----- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 60214669bd..679447738e 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -338,7 +338,6 @@ def compile(self): if self.compiled_model is None: super().compile() self.compiled_model = self.request - # self.request = self.request.create_infer_request() def _make_stateful(self): patch_stateful(self.config, self.model) @@ -358,16 +357,11 @@ class OVModelForCausalLM(OVBaseDecoderModel, GenerationMixin): def generate(self, *args, **kwargs): self.compile() - infer_context = [self.compiled_model.create_infer_request()] - kwargs["infer_context"] = infer_context + if kwargs.get("infer_request") is None: + infer_context = [self.compiled_model.create_infer_request()] + kwargs["infer_context"] = infer_context return super().generate(*args, **kwargs) - def __call__(self, *args, **kwargs): - self.compile() - infer_context = [self.compiled_model.create_infer_request()] - kwargs["infer_context"] = infer_context - return super().__call__(*args, **kwargs) - @add_start_docstrings_to_model_forward( INPUTS_DOCSTRING.format("batch_size, sequence_length") + TEXT_GENERATION_EXAMPLE.format( @@ -482,7 +476,7 @@ def forward( # for stateful models, infer request is created in generate and __call_ methods and passed in the cycle via past_key_values param infer_request = past_key_values[1] else: - if infer_context[0] is not None: + if infer_context is not None: infer_request = infer_context[ 0 ] # Use passed inference request if provided in kwargs, create new one overwise @@ -501,7 +495,7 @@ def forward( if not self.stateful: if self.use_cache: # Tuple of length equal to : number of layer * number of 
past_key_value per decoder layer (2 corresponds to the self-attention layer) - past_key_values = tuple(infer_context[0].get_tensor(key).data for key in self.key_value_output_names) + past_key_values = tuple(infer_request.get_tensor(key).data for key in self.key_value_output_names) if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( @@ -690,9 +684,6 @@ def _reorder_cache( batch_size = beam_idx.shape[0] indices = np.array(range(batch_size * self.config.num_attention_heads)) indices = indices.reshape([batch_size, self.config.num_attention_heads]) - # self.next_beam_idx = np.take(indices, beam_idx, 0).flatten() - # return past_key_values - # print("_reorder_cache output",np.take(indices, beam_idx, 0).flatten()) return ((np.take(indices, beam_idx, 0).flatten()), past_key_values[1]) else: standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx)) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index df96a7fc5f..8a8f244b07 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -516,11 +516,9 @@ def test_compare_to_transformers(self, model_arch): input_shape = tokens["input_ids"].shape position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) ov_outputs = ov_model(**tokens, position_ids=position_ids) - - self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) - is_stateful = self.IS_SUPPORT_STATEFUL + is_stateful = ov_model.config.model_type not in {"gpt_bigcode", "llama"} and self.IS_SUPPORT_STATEFUL self.assertEqual(ov_model.stateful, is_stateful) with torch.no_grad(): @@ -541,7 +539,8 @@ def test_compare_to_transformers_multithreading(self, model_arch): ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_model.config, PretrainedConfig) self.assertTrue(ov_model.use_cache) - self.assertEqual(ov_model.stateful, self.IS_SUPPORT_STATEFUL) + is_stateful = ov_model.config.model_type not in {"gpt_bigcode", "llama"} and self.IS_SUPPORT_STATEFUL + self.assertEqual(ov_model.stateful, is_stateful) transformers_model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs_list = ["This is a sample", "Here is another sample", "That's the thrid one", "This is the last sample"] @@ -607,7 +606,7 @@ def run_ov_model(input_text, model): # Tokenizer is not supposed to be shared by multiple threads tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) - outputs = pipe(input_text, max_length=10) + outputs = pipe(input_text, max_length=30) self.assertEqual(pipe.device, model.device) for i in range(len(outputs)): self.assertTrue(all(input_text[i] in item["generated_text"] for item in outputs[i])) From 66676614aa7b6a8792cd32708019162a1d84e2fe Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 18 Mar 2024 15:42:13 +0100 Subject: [PATCH 05/12] review updates --- optimum/intel/openvino/modeling_decoder.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 679447738e..15d6c0e2ed 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -132,7 +132,6 @@ 
def __init__( self.key_value_output_names = [key for key in self.output_names if "present" in key] self._original_model = self.model.clone() # keep original model for serialization self._pkv_precision = Type.f32 - # self.next_beam_idx = None self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -335,9 +334,11 @@ def normalized_config(self): return NormalizedConfigManager.get_normalized_config_class(self.config.model_type)(self.config) def compile(self): - if self.compiled_model is None: + if self.request is None: super().compile() self.compiled_model = self.request + self.request = self.request.create_infer_request() + def _make_stateful(self): patch_stateful(self.config, self.model) @@ -464,6 +465,7 @@ def forward( infer_context: Optional[List[openvino.runtime.InferRequest]] = None, **kwargs, ) -> CausalLMOutputWithPast: + self.compile() inputs = self.prepare_inputs( input_ids=input_ids, attention_mask=attention_mask, @@ -477,9 +479,8 @@ def forward( infer_request = past_key_values[1] else: if infer_context is not None: - infer_request = infer_context[ - 0 - ] # Use passed inference request if provided in kwargs, create new one overwise + # Use passed inference request if provided in kwargs, create new one overwise + infer_request = infer_context[0] else: self.compile() infer_request = self.compiled_model.create_infer_request() @@ -541,10 +542,8 @@ def _reorder_cache( if self.stateful: # TODO: Apply it differently based on model type # TODO: At least for bloom we need to replicate values for each attention head - past_key_values = ( - (np.array(beam_idx)), - past_key_values[1], - ) # save beam_idx and infer_request to be used as an input in the next iteration + # save beam_idx and infer_request to be used as an input in the next iteration + past_key_values = ((np.array(beam_idx)), past_key_values[1]) return past_key_values else: return tuple( From ae47a4ec7b011d8b4ffe99d166ce85f6b6a7976c Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 28 Mar 2024 14:17:14 +0100 Subject: [PATCH 06/12] style fixes --- optimum/intel/openvino/modeling_decoder.py | 5 ++--- tests/openvino/test_modeling.py | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 4a64f867ea..34508959e2 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -346,7 +346,6 @@ def compile(self): self.compiled_model = self.request self.request = self.request.create_infer_request() - def _make_stateful(self): patch_stateful(self.config, self.model) self.stateful = True @@ -487,7 +486,7 @@ def forward( else: if infer_context is not None: # Use passed inference request if provided in kwargs, create new one overwise - infer_request = infer_context[0] + infer_request = infer_context[0] else: self.compile() infer_request = self.compiled_model.create_infer_request() @@ -550,7 +549,7 @@ def _reorder_cache( # TODO: Apply it differently based on model type # TODO: At least for bloom we need to replicate values for each attention head # save beam_idx and infer_request to be used as an input in the next iteration - past_key_values = ((np.array(beam_idx)), past_key_values[1]) + past_key_values = ((np.array(beam_idx)), past_key_values[1]) return past_key_values else: return tuple( diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 2fe0501165..3536ad6865 100644 --- 
a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -544,11 +544,11 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) - #self.assertTrue("past_key_values" in ov_outputs) - #self.assertIsInstance(ov_outputs.past_key_values, tuple) + # self.assertTrue("past_key_values" in ov_outputs) + # self.assertIsInstance(ov_outputs.past_key_values, tuple) is_stateful = ov_model.config.model_type not in not_stateful and self.IS_SUPPORT_STATEFUL self.assertEqual(ov_model.stateful, is_stateful) - #if is_stateful: + # if is_stateful: # self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) with torch.no_grad(): transformers_outputs = transformers_model(**tokens) @@ -571,7 +571,7 @@ def test_compare_to_transformers_multithreading(self, model_arch): if "gptq" in model_arch: self.skipTest("GPTQ model loading unsupported with AutoModelForCausalLM") - if model_arch in ["chatglm","baichuan2"]: + if model_arch in ["chatglm", "baichuan2"]: self.skipTest("Models " + model_id + "doesn't support concurrent execution in AutoModelForCausalLM") set_seed(SEED) @@ -599,15 +599,15 @@ def test_compare_to_transformers_multithreading(self, model_arch): def run_ov_model(tokens, transformers_model, ov_model): # global ov_model, transformers_model - #position_ids = None - #if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + # position_ids = None + # if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: # input_shape = tokens["input_ids"].shape # position_ids = ( # torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) # ) set_seed(SEED) ov_outputs = ov_model(**tokens) - + self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) # self.assertTrue("past_key_values" in ov_outputs) @@ -618,7 +618,7 @@ def run_ov_model(tokens, transformers_model, ov_model): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) - #self.assertTrue(False) + # self.assertTrue(False) run_on_multiple_threads(run_ov_model, tokens_list, (transformers_model, ov_model)) @@ -661,7 +661,7 @@ def test_pipeline_multithreading(self, model_arch): "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), "trust_remote_code": True, } - + model = OVModelForCausalLM.from_pretrained( model_id, export=True, use_cache=False, compile=False, **model_kwargs ) @@ -673,7 +673,9 @@ def test_pipeline_multithreading(self, model_arch): def run_ov_model(input_text, model): # Tokenizer is not supposed to be shared by multiple threads - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + tokenizer = AutoTokenizer.from_pretrained( + model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS + ) pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) outputs = pipe(input_text, max_length=30) self.assertEqual(pipe.device, model.device) From bd106a710a7b1643798cb44e115919d073475987 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 18 Apr 2024 16:47:20 +0200 Subject: [PATCH 07/12] concurrency without overriding past_key_values with infer context --- optimum/intel/openvino/modeling_decoder.py | 86 +++++++++++++++------- tests/openvino/test_modeling.py | 18 ++--- 2 files changed, 70 insertions(+), 34 
deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index a1ed0a148c..95945bed2b 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -28,7 +28,8 @@ from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin from transformers.modeling_outputs import CausalLMOutputWithPast - +from transformers.utils import ModelOutput +from dataclasses import dataclass from optimum.utils.normalized_config import NormalizedConfigManager from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful @@ -44,6 +45,23 @@ core = Core() +@dataclass +class OVCausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + infer_request(`openvino.runtime.InferRequest` to be reused in the generation cycles. + beam_idx (`torch.Tensor` beam search algorimth context for the generation using stateful models + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + infer_request: Optional[openvino.runtime.InferRequest] = None + beam_idx: Optional[torch.Tensor] = None TEXT_GENERATION_EXAMPLE = r""" Example of text generation: @@ -341,12 +359,12 @@ class OVModelForCausalLM(OVBaseDecoderModel, GenerationMixin): export_feature = "text-generation" auto_model_class = AutoModelForCausalLM - def generate(self, *args, **kwargs): - self.compile() - if kwargs.get("infer_request") is None: - infer_context = [self.compiled_model.create_infer_request()] - kwargs["infer_context"] = infer_context - return super().generate(*args, **kwargs) +# def generate(self, *args, **kwargs): +# self.compile() +# if kwargs.get("infer_request") is None: +# infer_context = [self.compiled_model.create_infer_request()] +# kwargs["infer_context"] = infer_context +# return super().generate(*args, **kwargs) @add_start_docstrings_to_model_forward( INPUTS_DOCSTRING.format("batch_size, sequence_length") @@ -362,6 +380,7 @@ def prepare_inputs( attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, position_ids: Optional[torch.LongTensor] = None, + beam_idx: Optional[torch.tensor] = None, **kwargs, ) -> Dict: if self.use_cache and past_key_values is not None: @@ -436,7 +455,7 @@ def prepare_inputs( if "beam_idx" in self.input_names: inputs["beam_idx"] = ( - past_key_values[0] if past_key_values is not None else np.arange(batch_size, dtype=int) + beam_idx if beam_idx is not None else np.arange(batch_size, dtype=int) ) return inputs @@ -447,28 +466,25 @@ def forward( attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, position_ids: Optional[torch.LongTensor] = None, - infer_context: Optional[List[openvino.runtime.InferRequest]] = None, + infer_request: Optional[openvino.runtime.InferRequest] = None, + beam_idx: torch.Tensor = None, **kwargs, - ) -> CausalLMOutputWithPast: + ) -> OVCausalLMOutputWithPast: self.compile() inputs = self.prepare_inputs( input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, position_ids=position_ids, + beam_idx=beam_idx, **kwargs, ) + # Run inference - if self.stateful and 
past_key_values is not None: - # for stateful models, infer request is created in generate and __call_ methods and passed in the cycle via past_key_values param - infer_request = past_key_values[1] - else: - if infer_context is not None: - # Use passed inference request if provided in kwargs, create new one overwise - infer_request = infer_context[0] - else: - self.compile() - infer_request = self.compiled_model.create_infer_request() + if infer_request is None: + self.compile() + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) infer_request.wait() logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) @@ -476,7 +492,7 @@ def forward( # Need a marker to differentiate the first generate iteration from the others in # the first condition at the function beginning above. # It should be something that is not None and it should be True when converted to Boolean. - past_key_values = ((inputs["beam_idx"]), infer_request) + past_key_values = ((),) if not self.stateful: if self.use_cache: @@ -490,14 +506,31 @@ def forward( else: past_key_values = None - return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) + return OVCausalLMOutputWithPast(logits=logits, past_key_values=past_key_values, infer_request=infer_request, beam_idx=beam_idx) + + def _update_model_kwargs_for_generation( + self, outputs: OVCausalLMOutputWithPast, + model_kwargs: dict[str], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> dict[str]: + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, + model_kwargs=model_kwargs, + is_encoder_decoder=is_encoder_decoder, + standardize_cache_format=standardize_cache_format, + ) + if "infer_request" in outputs: model_kwargs["infer_request"] = outputs["infer_request"] + if "beam_idx" in outputs: model_kwargs["beam_idx"] = outputs["beam_idx"] + return model_kwargs # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly attention_mask = kwargs.get("attention_mask", None) use_cache = kwargs.get("use_cache", None) - infer_context = kwargs.get("infer_context", None) + infer_request = kwargs.get("infer_request", None) + beam_idx = kwargs.get("beam_idx", None) position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation @@ -510,7 +543,8 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache, - "infer_context": infer_context, + "infer_request": infer_request, + "beam_idx": beam_idx, "position_ids": position_ids, "attention_mask": attention_mask, } @@ -528,7 +562,7 @@ def _reorder_cache( # TODO: Apply it differently based on model type # TODO: At least for bloom we need to replicate values for each attention head # save beam_idx and infer_request to be used as an input in the next iteration - past_key_values = ((np.array(beam_idx)), past_key_values[1]) + return past_key_values else: return tuple( @@ -760,3 +794,5 @@ def _reorder_cache( return past_key_values else: return tuple(np.take(layer_past, beam_idx, 0) for layer_past in past_key_values) + + diff --git a/tests/openvino/test_modeling.py 
b/tests/openvino/test_modeling.py index 2d68db18f8..958f3987d5 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -563,12 +563,12 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) - # self.assertTrue("past_key_values" in ov_outputs) - # self.assertIsInstance(ov_outputs.past_key_values, tuple) + self.assertTrue("past_key_values" in ov_outputs) + self.assertIsInstance(ov_outputs.past_key_values, tuple) is_stateful = ov_model.config.model_type not in not_stateful and self.IS_SUPPORT_STATEFUL self.assertEqual(ov_model.stateful, is_stateful) - # if is_stateful: - # self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) + if is_stateful: + self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) with torch.no_grad(): transformers_outputs = transformers_model(**tokens) @@ -596,16 +596,15 @@ def test_compare_to_transformers_multithreading(self, model_arch): set_seed(SEED) model_kwargs = {} if model_arch in self.REMOTE_CODE_MODELS: - model_kwargs = { - "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), - "trust_remote_code": True, - } + model_kwargs = {"trust_remote_code": True} + ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs) self.assertIsInstance(ov_model.config, PretrainedConfig) self.assertTrue(ov_model.use_cache) self.assertEqual( ov_model.stateful, self.IS_SUPPORT_STATEFUL and ov_model.config.model_type not in not_stateful ) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) if model_arch == "qwen": @@ -848,7 +847,7 @@ def test_default_filling_attention_mask(self): def test_default_filling_attention_mask_and_position_ids(self): model_id = MODEL_NAMES["llama"] - model_with_cache = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True) + model_with_cache = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, stateful=False) tokenizer = AutoTokenizer.from_pretrained(model_id) tokenizer.pad_token = tokenizer.eos_token texts = ["this is a simple input"] @@ -866,6 +865,7 @@ def test_default_filling_attention_mask_and_position_ids(self): ) outs_without_attn_mask_step2 = model_with_cache(input_ids=input_ids, past_key_values=past_key_values) self.assertTrue(torch.allclose(outs_step2.logits, outs_without_attn_mask_step2.logits)) + print() del model_with_cache gc.collect() From 93e40eeaf88336875adff2328f2c194bbd957b6a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 19 Apr 2024 08:29:10 +0200 Subject: [PATCH 08/12] fix passing beam_idx content for stateless models --- optimum/intel/openvino/modeling_decoder.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 95945bed2b..d4b45e8208 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -61,7 +61,7 @@ class OVCausalLMOutputWithPast(ModelOutput): hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None infer_request: Optional[openvino.runtime.InferRequest] = None - beam_idx: Optional[torch.Tensor] = None + #beam_idx: 
Optional[torch.Tensor] = None TEXT_GENERATION_EXAMPLE = r""" Example of text generation: @@ -359,12 +359,6 @@ class OVModelForCausalLM(OVBaseDecoderModel, GenerationMixin): export_feature = "text-generation" auto_model_class = AutoModelForCausalLM -# def generate(self, *args, **kwargs): -# self.compile() -# if kwargs.get("infer_request") is None: -# infer_context = [self.compiled_model.create_infer_request()] -# kwargs["infer_context"] = infer_context -# return super().generate(*args, **kwargs) @add_start_docstrings_to_model_forward( INPUTS_DOCSTRING.format("batch_size, sequence_length") @@ -380,7 +374,6 @@ def prepare_inputs( attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, position_ids: Optional[torch.LongTensor] = None, - beam_idx: Optional[torch.tensor] = None, **kwargs, ) -> Dict: if self.use_cache and past_key_values is not None: @@ -455,7 +448,7 @@ def prepare_inputs( if "beam_idx" in self.input_names: inputs["beam_idx"] = ( - beam_idx if beam_idx is not None else np.arange(batch_size, dtype=int) + past_key_values[0] if past_key_values is not None else np.arange(batch_size, dtype=int) ) return inputs @@ -467,7 +460,6 @@ def forward( past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, position_ids: Optional[torch.LongTensor] = None, infer_request: Optional[openvino.runtime.InferRequest] = None, - beam_idx: torch.Tensor = None, **kwargs, ) -> OVCausalLMOutputWithPast: self.compile() @@ -476,7 +468,6 @@ def forward( attention_mask=attention_mask, past_key_values=past_key_values, position_ids=position_ids, - beam_idx=beam_idx, **kwargs, ) @@ -506,7 +497,7 @@ def forward( else: past_key_values = None - return OVCausalLMOutputWithPast(logits=logits, past_key_values=past_key_values, infer_request=infer_request, beam_idx=beam_idx) + return OVCausalLMOutputWithPast(logits=logits, past_key_values=past_key_values, infer_request=infer_request) def _update_model_kwargs_for_generation( self, outputs: OVCausalLMOutputWithPast, @@ -521,7 +512,6 @@ def _update_model_kwargs_for_generation( standardize_cache_format=standardize_cache_format, ) if "infer_request" in outputs: model_kwargs["infer_request"] = outputs["infer_request"] - if "beam_idx" in outputs: model_kwargs["beam_idx"] = outputs["beam_idx"] return model_kwargs # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation @@ -530,7 +520,6 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg attention_mask = kwargs.get("attention_mask", None) use_cache = kwargs.get("use_cache", None) infer_request = kwargs.get("infer_request", None) - beam_idx = kwargs.get("beam_idx", None) position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation @@ -544,7 +533,6 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "past_key_values": past_key_values, "use_cache": use_cache, "infer_request": infer_request, - "beam_idx": beam_idx, "position_ids": position_ids, "attention_mask": attention_mask, } @@ -562,8 +550,8 @@ def _reorder_cache( # TODO: Apply it differently based on model type # TODO: At least for bloom we need to replicate values for each attention head # save beam_idx and infer_request to be used as an input in the next iteration - - return past_key_values + # here, beam_idx content is passed inside the past_key_values + return ((beam_idx),) else: return tuple( 
tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values From e5b6075356b1a704b3368ead4ed9fa8f5cd24f67 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 22 Apr 2024 11:37:47 +0200 Subject: [PATCH 09/12] cleanup and style --- optimum/intel/openvino/modeling_decoder.py | 46 +++++++++++----------- tests/openvino/test_modeling.py | 7 ++-- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 0547112011..d5d1cbf908 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -15,9 +15,10 @@ import copy import logging import os +from dataclasses import dataclass from pathlib import Path from tempfile import TemporaryDirectory -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union import numpy as np import openvino @@ -27,9 +28,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin -from transformers.modeling_outputs import CausalLMOutputWithPast from transformers.utils import ModelOutput -from dataclasses import dataclass + from optimum.utils.normalized_config import NormalizedConfigManager from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful @@ -45,6 +45,7 @@ core = Core() + @dataclass class OVCausalLMOutputWithPast(ModelOutput): """ @@ -63,6 +64,7 @@ class OVCausalLMOutputWithPast(ModelOutput): infer_request: Optional[openvino.runtime.InferRequest] = None past_length: Optional[int] = None + TEXT_GENERATION_EXAMPLE = r""" Example of text generation: ```python @@ -137,11 +139,7 @@ def __init__( self.key_value_output_names = [key for key in self.output_names if "present" in key] self._original_model = self.model.clone() # keep original model for serialization self._pkv_precision = Type.f32 -#<<<<<<< HEAD -#======= -# self.next_beam_idx = None -# self._past_length = 0 -#>>>>>>> origin/main + self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -364,7 +362,6 @@ class OVModelForCausalLM(OVBaseDecoderModel, GenerationMixin): export_feature = "text-generation" auto_model_class = AutoModelForCausalLM - @add_start_docstrings_to_model_forward( INPUTS_DOCSTRING.format("batch_size, sequence_length") + TEXT_GENERATION_EXAMPLE.format( @@ -421,7 +418,6 @@ def prepare_inputs( shape[1] = 0 inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) - inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed if "attention_mask" in self.input_names or "position_ids" in self.input_names: @@ -448,10 +444,10 @@ def prepare_inputs( if "beam_idx" in self.input_names: if past_key_values is not None: - if len(past_key_values[0]) > 0: + if len(past_key_values[0]) > 0: inputs["beam_idx"] = past_key_values[0] return inputs - inputs["beam_idx"] = np.arange(batch_size, dtype=int) + inputs["beam_idx"] = np.arange(batch_size, dtype=int) return inputs @@ -502,22 +498,27 @@ def forward( else: past_key_values = None - return OVCausalLMOutputWithPast(logits=logits, past_key_values=past_key_values, infer_request=infer_request, past_length=past_length) + return OVCausalLMOutputWithPast( + logits=logits, past_key_values=past_key_values, infer_request=infer_request, 
past_length=past_length + ) def _update_model_kwargs_for_generation( - self, outputs: OVCausalLMOutputWithPast, + self, + outputs: OVCausalLMOutputWithPast, model_kwargs: dict[str], is_encoder_decoder: bool = False, standardize_cache_format: bool = False, - ) -> dict[str]: + ) -> dict[str]: model_kwargs = super()._update_model_kwargs_for_generation( outputs=outputs, model_kwargs=model_kwargs, is_encoder_decoder=is_encoder_decoder, standardize_cache_format=standardize_cache_format, - ) - if "infer_request" in outputs: model_kwargs["infer_request"] = outputs["infer_request"] - if "past_length" in outputs: model_kwargs["past_length"] = outputs["past_length"] + ) + if "infer_request" in outputs: + model_kwargs["infer_request"] = outputs["infer_request"] + if "past_length" in outputs: + model_kwargs["past_length"] = outputs["past_length"] return model_kwargs # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation @@ -525,10 +526,9 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly attention_mask = kwargs.get("attention_mask", None) use_cache = kwargs.get("use_cache", None) -#<<<<<<< HEAD + infer_request = kwargs.get("infer_request", None) past_length = kwargs.get("past_length", 0) -#======= if past_key_values is not None: past_length = self._get_past_length(past_key_values, past_length=past_length) @@ -543,7 +543,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg elif past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens -#>>>>>>> origin/main + # >>>>>>> origin/main position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: # create position_ids on the fly for batch generation @@ -594,7 +594,7 @@ def _reorder_cache( # TODO: At least for bloom we need to replicate values for each attention head # save beam_idx and infer_request to be used as an input in the next iteration # here, beam_idx content is passed inside the past_key_values - + return ((beam_idx),) else: return tuple( @@ -788,5 +788,3 @@ def _reorder_cache( return past_key_values else: return tuple(np.take(layer_past, beam_idx, 0) for layer_past in past_key_values) - - diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 5caabf7a9f..ade89ba582 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -54,9 +54,8 @@ set_seed, ) from transformers.onnx.utils import get_preprocessor -from utils_tests import MODEL_NAMES, run_on_multiple_threads from transformers.testing_utils import slow -from utils_tests import MODEL_NAMES +from utils_tests import MODEL_NAMES, run_on_multiple_threads from optimum.intel import ( OVModelForAudioClassification, @@ -897,7 +896,7 @@ def test_default_filling_attention_mask_and_position_ids(self): ) outs_without_attn_mask_step2 = model_with_cache(input_ids=input_ids, past_key_values=past_key_values) self.assertTrue(torch.allclose(outs_step2.logits, outs_without_attn_mask_step2.logits)) - + del model_with_cache gc.collect() @@ -1495,7 +1494,7 @@ def test_compare_with_and_without_past_key_values(self): **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - # 
self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) + self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) self.assertTrue( From f7653b25ac03b0d2c259ac401c0236f5681dc6af Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 22 Apr 2024 15:07:01 +0200 Subject: [PATCH 10/12] style --- tests/openvino/test_modeling.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index ade89ba582..7856824f65 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -639,8 +639,6 @@ def test_compare_to_transformers_multithreading(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) self.assertTrue(ov_model.use_cache) self.assertEqual(ov_model.stateful, ov_model.config.model_type not in not_stateful) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) - tokens = tokenizer("This is a sample output", return_tensors="pt") transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) From 0ecaa0fcb07c8115a9f5c4cd89a2e9d131b64ca2 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 22 Apr 2024 18:55:31 +0200 Subject: [PATCH 11/12] change dict to Dict from typing --- optimum/intel/openvino/modeling_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index f9a3861492..9f58c8b232 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -504,10 +504,10 @@ def forward( def _update_model_kwargs_for_generation( self, outputs: OVCausalLMOutputWithPast, - model_kwargs: dict[str], + model_kwargs: Dict[str], is_encoder_decoder: bool = False, standardize_cache_format: bool = False, - ) -> dict[str]: + ) -> Dict[str]: model_kwargs = super()._update_model_kwargs_for_generation( outputs=outputs, model_kwargs=model_kwargs, From aec45a9dbb29aa295fab134503b8642f38bcbf3a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 22 Apr 2024 23:51:08 +0200 Subject: [PATCH 12/12] fix type declaration --- optimum/intel/openvino/modeling_decoder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 9f58c8b232..942c09f68e 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -17,7 +17,7 @@ from dataclasses import dataclass from pathlib import Path from tempfile import TemporaryDirectory -from typing import Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np import openvino @@ -504,10 +504,10 @@ def forward( def _update_model_kwargs_for_generation( self, outputs: OVCausalLMOutputWithPast, - model_kwargs: Dict[str], + model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False, standardize_cache_format: bool = False, - ) -> Dict[str]: + ) -> Dict[str, Any]: model_kwargs = super()._update_model_kwargs_for_generation( outputs=outputs, model_kwargs=model_kwargs,
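
Taken together, these patches aim to let a single OVModelForCausalLM instance serve generate() calls from several threads at once: the compiled model is kept on the object, while the per-call InferRequest (and, for stateful models, the beam_idx / past-length bookkeeping) travels through the generation kwargs and model outputs instead of mutable attributes such as self.request and self.next_beam_idx. Below is a minimal usage sketch of that end state, modeled on the run_on_multiple_threads() helper added in tests/openvino/utils_tests.py. It is illustrative only: the checkpoint name, prompts, and generation settings are placeholder assumptions, and the per-thread InferRequest is managed inside the library rather than by the caller.

# Illustrative sketch only: concurrent generation with one shared model.
import threading

from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

model_id = "gpt2"  # placeholder checkpoint
model = OVModelForCausalLM.from_pretrained(model_id, export=True)
model.compile()  # compile once; threads reuse the compiled model

def generate_in_thread(prompt):
    # Tokenizers are not guaranteed to be thread-safe, so build one per thread.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokens = tokenizer(prompt, return_tensors="pt")
    # With these patches, each generate() call works against its own
    # openvino InferRequest, so concurrent calls do not clobber each
    # other's KV-cache state.
    outputs = model.generate(**tokens, max_new_tokens=20)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

prompts = ["This is a sample", "Here is another sample"]
threads = [threading.Thread(target=generate_in_thread, args=(p,)) for p in prompts]
for t in threads:
    t.start()
for t in threads:
    t.join()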