huggingface · dtrawins · Feb 26, 2024 · Feb 28, 2024 · Feb 29, 2024 · Feb 29, 2024
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
@@ -86,6 +86,7 @@ def __init__(
 
         self.model = model
         self.request = None
+        self.compiled_model = None
         if enable_compilation:
             self.compile()
 

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
@@ -16,7 +16,7 @@
 import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import openvino
@@ -132,7 +132,7 @@ def __init__(
         self.key_value_output_names = [key for key in self.output_names if "present" in key]
         self._original_model = self.model.clone()  # keep original model for serialization
         self._pkv_precision = Type.f32
-        self.next_beam_idx = None
+        # self.next_beam_idx = None
         self.update_pkv_precision()
         if self.is_dynamic:
             self.model = self._reshape(self.model, -1, -1)
@@ -210,6 +210,7 @@ def update_pkv_precision(self, force_fp32=False):
                 if self.is_dynamic:
                     self.model = self._reshape(self.model, -1, -1)
                 self.request = None
+                self.compiled_model = None
 
     def _save_pretrained(self, save_directory: Union[str, Path]):
         """
@@ -334,9 +335,9 @@ def normalized_config(self):
         return NormalizedConfigManager.get_normalized_config_class(self.config.model_type)(self.config)
 
     def compile(self):
-        if self.request is None:
+        if self.compiled_model is None:
             super().compile()
-            self.request = self.request.create_infer_request()
+            self.compiled_model = self.request
 
     def _make_stateful(self):
         patch_stateful(self.config, self.model)
@@ -354,6 +355,13 @@ class OVModelForCausalLM(OVBaseDecoderModel, GenerationMixin):
     export_feature = "text-generation"
     auto_model_class = AutoModelForCausalLM
 
+    def generate(self, *args, **kwargs):
+        self.compile()
+        if kwargs.get("infer_request") is None:
+            infer_context = [self.compiled_model.create_infer_request()]
+            kwargs["infer_context"] = infer_context
+        return super().generate(*args, **kwargs)
+
     @add_start_docstrings_to_model_forward(
         INPUTS_DOCSTRING.format("batch_size, sequence_length")
         + TEXT_GENERATION_EXAMPLE.format(
@@ -376,7 +384,6 @@ def prepare_inputs(
         batch_size = input_ids.shape[0]
         if self.config.model_type == "bloom":
             batch_size *= self.config.num_attention_heads
-
         inputs = {}
         past_len = 0
         if not self.stateful:
@@ -416,15 +423,6 @@ def prepare_inputs(
                         else:
                             shape[1] = 0
                     inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape())
-        else:
-            # past_key_values are not used explicitly, instead they are handled inside the model
-            if past_key_values is None:
-                # This is the first iteration in a sequence, reset all states
-                if self.request is not None:
-                    self.request.reset_state()
-                # Set initial value for the next beam_idx input that will be used at the current iteration
-                # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used
-                self.next_beam_idx = np.arange(batch_size, dtype=int)
 
         inputs["input_ids"] = np.array(input_ids)
         # Add the attention_mask inputs when needed
@@ -452,7 +450,7 @@ def prepare_inputs(
 
         if "beam_idx" in self.input_names:
             inputs["beam_idx"] = (
-                self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int)
+                past_key_values[0] if past_key_values is not None else np.arange(batch_size, dtype=int)
             )
 
         return inputs
@@ -463,32 +461,41 @@ def forward(
         attention_mask: Optional[torch.LongTensor] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         position_ids: Optional[torch.LongTensor] = None,
+        infer_context: Optional[List[openvino.runtime.InferRequest]] = None,
         **kwargs,
     ) -> CausalLMOutputWithPast:
-        self.compile()
-
         inputs = self.prepare_inputs(
             input_ids=input_ids,
             attention_mask=attention_mask,
             past_key_values=past_key_values,
             position_ids=position_ids,
             **kwargs,
         )
-
         # Run inference
-        self.request.start_async(inputs, share_inputs=True)
-        self.request.wait()
-        logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device)
+        if self.stateful and past_key_values is not None:
+            # for stateful models, infer request is created in generate and __call_ methods and passed in the cycle via past_key_values param
+            infer_request = past_key_values[1]
+        else:
+            if infer_context is not None:
+                infer_request = infer_context[
+                    0
+                ]  # Use passed inference request if provided in kwargs, create new one overwise
+            else:
+                self.compile()
+                infer_request = self.compiled_model.create_infer_request()
+        infer_request.start_async(inputs, share_inputs=True)
+        infer_request.wait()
+        logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device)
         if self.stateful:
             # Need a marker to differentiate the first generate iteration from the others in
             # the first condition at the function beginning above.
             # It should be something that is not None and it should be True when converted to Boolean.
-            past_key_values = ((),)
+            past_key_values = ((inputs["beam_idx"]), infer_request)
 
         if not self.stateful:
             if self.use_cache:
                 # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer)
-                past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names)
+                past_key_values = tuple(infer_request.get_tensor(key).data for key in self.key_value_output_names)
                 if self.config.model_type not in MULTI_QUERY_ATTN_MODELS:
                     # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention)
                     past_key_values = tuple(
@@ -504,7 +511,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg
         # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
         attention_mask = kwargs.get("attention_mask", None)
         use_cache = kwargs.get("use_cache", None)
-
+        infer_context = kwargs.get("infer_context", None)
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
             # create position_ids on the fly for batch generation
@@ -517,6 +524,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg
             "input_ids": input_ids,
             "past_key_values": past_key_values,
             "use_cache": use_cache,
+            "infer_context": infer_context,
             "position_ids": position_ids,
             "attention_mask": attention_mask,
         }
@@ -533,7 +541,10 @@ def _reorder_cache(
         if self.stateful:
             # TODO: Apply it differently based on model type
             # TODO: At least for bloom we need to replicate values for each attention head
-            self.next_beam_idx = np.array(beam_idx)  # save beam_idx to be used as an input in the next iteration
+            past_key_values = (
+                (np.array(beam_idx)),
+                past_key_values[1],
+            )  # save beam_idx and infer_request to be used as an input in the next iteration
             return past_key_values
         else:
             return tuple(
@@ -673,8 +684,7 @@ def _reorder_cache(
             batch_size = beam_idx.shape[0]
             indices = np.array(range(batch_size * self.config.num_attention_heads))
             indices = indices.reshape([batch_size, self.config.num_attention_heads])
-            self.next_beam_idx = np.take(indices, beam_idx, 0).flatten()
-            return past_key_values
+            return ((np.take(indices, beam_idx, 0).flatten()), past_key_values[1])
-            return ((np.take(indices, beam_idx, 0).flatten()), past_key_values[1])
+            return past_key_values
-            return ((np.take(indices, beam_idx, 0).flatten()), past_key_values[1])
+            return past_key_values
         else:
             standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx))
             reordered_past = tuple(

diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
@@ -50,7 +50,7 @@
     set_seed,
 )
 from transformers.onnx.utils import get_preprocessor
-from utils_tests import MODEL_NAMES
+from utils_tests import MODEL_NAMES, run_on_multiple_threads
 
 from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS
 from optimum.intel import (
@@ -502,6 +502,7 @@ def test_compare_to_transformers(self, model_arch):
 
         set_seed(SEED)
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
+        print("model", ov_model.stateful, ov_model.use_cache)
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         self.assertTrue(ov_model.use_cache)
 
@@ -515,16 +516,10 @@ def test_compare_to_transformers(self, model_arch):
             input_shape = tokens["input_ids"].shape
             position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1])
         ov_outputs = ov_model(**tokens, position_ids=position_ids)
-
-        self.assertTrue("logits" in ov_outputs)
         self.assertIsInstance(ov_outputs.logits, torch.Tensor)
-        self.assertTrue("past_key_values" in ov_outputs)
-        self.assertIsInstance(ov_outputs.past_key_values, tuple)
 
         is_stateful = ov_model.config.model_type not in {"gpt_bigcode", "llama"} and self.IS_SUPPORT_STATEFUL
         self.assertEqual(ov_model.stateful, is_stateful)
-        if is_stateful:
-            self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0)
 
         with torch.no_grad():
             transformers_outputs = transformers_model(**tokens)
@@ -535,6 +530,52 @@ def test_compare_to_transformers(self, model_arch):
         del ov_model
         gc.collect()
 
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_compare_to_transformers_multithreading(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        if "llama_gptq" in model_arch:
+            self.skipTest("Not supported without gpu and disable_exllama=True option")
+        set_seed(SEED)
+        ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
+        self.assertIsInstance(ov_model.config, PretrainedConfig)
+        self.assertTrue(ov_model.use_cache)
+        is_stateful = ov_model.config.model_type not in {"gpt_bigcode", "llama"} and self.IS_SUPPORT_STATEFUL
+        self.assertEqual(ov_model.stateful, is_stateful)
+        transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        inputs_list = ["This is a sample", "Here is another sample", "That's the thrid one", "This is the last sample"]
+        tokens_list = [
+            tokenizer(inputs, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None)
+            for inputs in inputs_list
+        ]
+
+        def run_ov_model(tokens, transformers_model, ov_model):
+            # global ov_model, transformers_model
+            position_ids = None
+            if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS:
+                input_shape = tokens["input_ids"].shape
+                position_ids = (
+                    torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1])
+                )
+            ov_outputs = ov_model(**tokens, position_ids=position_ids)
+
+            self.assertTrue("logits" in ov_outputs)
+            self.assertIsInstance(ov_outputs.logits, torch.Tensor)
+            # self.assertTrue("past_key_values" in ov_outputs)
+            # self.assertIsInstance(ov_outputs.past_key_values, tuple)
+            # if self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode":
+            #    self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0)
+            with torch.no_grad():
+                transformers_outputs = transformers_model(**tokens)
+            # Compare tensor outputs
+            self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
+
+        run_on_multiple_threads(run_ov_model, tokens_list, (transformers_model, ov_model))
+
+        del transformers_model
+        del ov_model
+        gc.collect()
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_pipeline(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
@@ -552,6 +593,30 @@ def test_pipeline(self, model_arch):
         del model
         gc.collect()
 
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_pipeline_multithreading(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False, compile=False)
+        model.config.encoder_no_repeat_ngram_size = 0
+        model.to("cpu")
+        model.half()
+        model.compile()
+
+        def run_ov_model(input_text, model):
+            # Tokenizer is not supposed to be shared by multiple threads
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+            outputs = pipe(input_text, max_length=30)
+            self.assertEqual(pipe.device, model.device)
+            for i in range(len(outputs)):
+                self.assertTrue(all(input_text[i] in item["generated_text"] for item in outputs[i]))
+            del pipe
+
+        inputs_list = [["This is a sample"], ["This is a second sample"], ["This is a third sample"]]
+        run_on_multiple_threads(run_ov_model, inputs_list, [model])
+        del model
+        gc.collect()
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_multiple_inputs(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
@@ -568,6 +633,27 @@ def test_multiple_inputs(self, model_arch):
         del model
         gc.collect()
 
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_multiple_inputs_multithreading(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        set_seed(SEED)
+        model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.pad_token = tokenizer.eos_token
+        texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"]
+        tokens = tokenizer(texts, padding=True, return_tensors="pt")
+        generation_config = GenerationConfig(encoder_no_repeat_ngram_size=0, max_new_tokens=20, num_beams=2)
+
+        def run_ov_model(tokens, model):
+            outputs = model.generate(**tokens, generation_config=generation_config)
+            self.assertIsInstance(outputs, torch.Tensor)
+            self.assertEqual(outputs.shape[0], 3)
+
+        tokens_list = [tokens, tokens, tokens, tokens]  # running in 4 threads
+        run_on_multiple_threads(run_ov_model, tokens_list, [model])
+        del model
+        gc.collect()
+
     def test_model_and_decoder_same_device(self):
         model_id = MODEL_NAMES["gpt2"]
         model = OVModelForCausalLM.from_pretrained(model_id, export=True)
@@ -1259,7 +1345,7 @@ def test_compare_with_and_without_past_key_values(self):
                 **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
             )
 
-        self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv))
+        # self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv))
         self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH)
         self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH)
         self.assertTrue(

diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
@@ -12,6 +12,8 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+import threading
+
 import numpy as np
 import torch
 
@@ -132,3 +134,37 @@ def get_num_quantized_nodes(ov_model):
             if "4" in elem.get_output_element_type(i).get_type_name():
                 num_int4 += 1
     return num_fake_quantize, num_int8, num_int4
+
+
+### Multithreading
+
+
+class OVThread(threading.Thread):
+    def __init__(self, target, args):
+        super().__init__()
+        self.target = target
+        self.args = args
+
+    def run(self):
+        self.exception = None
+        try:
+            self.target(*self.args)
+        except Exception as e:
+            self.exception = e
+
+    def join(self):
+        super().join()
+        if self.exception:
+            raise self.exception
+
+
+# Each set of args is run in a separate thread.
+# Amount of such sets define how many threads are spawned.
+def run_on_multiple_threads(target, list, extra_args):
+    threads = []
+    for input in list:
+        threads.append(OVThread(target=target, args=(input, *extra_args)))
+    for thread in threads:
+        thread.start()
+    for thread in threads:
+        thread.join()