diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 5e4dbdd193..ba3a039c8c 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -30,9 +30,7 @@ M2M100OnnxConfig, MPTOnnxConfig, PhiOnnxConfig, - Pix2StructOnnxConfig, T5OnnxConfig, - TrOCROnnxConfig, UNetOnnxConfig, VaeDecoderOnnxConfig, VaeEncoderOnnxConfig, @@ -853,7 +851,7 @@ def _create_dummy_input_generator_classes(self, **kwargs) -> List[DummyInputGene forces the other generators to use the same batch size, meaning they will all produce inputs of the same batch size. Override this method for custom behavior. """ - if getattr(self, "stateful"): + if getattr(self, "stateful", False): if "encoder_sequence_length" not in kwargs: sequence_len = kwargs.get("sequence_length", DEFAULT_DUMMY_SHAPES["sequence_length"]) kwargs["encoder_sequence_length"] = sequence_len + 2 @@ -1039,74 +1037,3 @@ class MarianOpenVINOConfig(M2M100OpenVINOConfig): ) class PegasusOpenVINOConfig(M2M100OpenVINOConfig): pass - - -@register_in_tasks_manager( - "pix2struct", - *[ - "image-to-text", - "image-to-text-with-past", - ], - library_name="transformers", -) -class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: - dummy_inputs_generators = [] - dummy_inputs_generators.append(self.DUMMY_INPUT_GENERATOR_CLASSES[0](self.task, self._normalized_config)) - - if self._preprocessors is None or len(self._preprocessors) != 2: - raise ValueError( - f"Preprocessors for pix2struct need to be available for the ONNX export to infer input static shapes. Got: {self._preprocessors}" - ) - - encoder_sequence_length = self._preprocessors[1].image_processor.max_patches - if getattr(self, "stateful", False): - encoder_sequence_length += 2 - # A hack for DummyPix2StructInputGenerator to gain access to the preprocessors. - # TODO: we should probably pass preprocessors to all dummy input generators. - kwargs["preprocessors"] = self._preprocessors - for cls_ in self.DUMMY_INPUT_GENERATOR_CLASSES[1:]: - dummy_inputs_generators.append( - cls_(self.task, self._normalized_config, encoder_sequence_length=encoder_sequence_length, **kwargs) - ) - - return dummy_inputs_generators - - -@register_in_tasks_manager( - "trocr", - *[ - "feature-extraction", - "feature-extraction-with-past", - "image-to-text", - "image-to-text-with-past", - ], - library_name="transformers", -) -class TrOCROpenVINOConfig(TrOCROnnxConfig): - def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: - dummy_text_input_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0]( - self.task, self._normalized_config, **kwargs - ) - dummy_decoder_text_input_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[1]( - self.task, - self._normalized_config, - **kwargs, - ) - encoder_sequence_length = dummy_text_input_generator.sequence_length - - if getattr(self, "stateful", False): - encoder_sequence_length += 2 - dummy_seq2seq_past_key_values_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[2]( - self.task, - self._normalized_config, - encoder_sequence_length=encoder_sequence_length, - **kwargs, - ) - dummy_inputs_generators = [ - dummy_text_input_generator, - dummy_decoder_text_input_generator, - dummy_seq2seq_past_key_values_generator, - ] - - return dummy_inputs_generators diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py index 8003882176..1c76290e78 100644 --- a/optimum/exporters/openvino/stateful.py +++ b/optimum/exporters/openvino/stateful.py @@ -189,10 +189,7 @@ def ensure_export_task_support_stateful(task: str, is_encoder_decoder: bool = Fa _ENCODER_DECODER_TASKS_WITH_PAST = ( "automatic-speech-recognition", - "document-question-answering", - "image-to-text", "text2text-generation", - "visual-question-answering", ) is_stateful = task.endswith("-with-past") and task.replace("-with-past", "") in _ENCODER_DECODER_TASKS_WITH_PAST @@ -223,6 +220,10 @@ def get_read_value_ops(model: ov.Model): return [op for op in model.get_ops() if op.get_type_name() == "ReadValue"] +def get_shape_of_ops(model: ov.Model): + return [op for op in model.get_ops() if op.get_type_name() == "ShapeOf"] + + def get_consumer_nodes(node): consumer_inputs = set().union(*[output.get_target_inputs() for output in node.outputs()]) return set(input.get_node() for input in consumer_inputs) @@ -230,7 +231,7 @@ def get_consumer_nodes(node): def find_output_nodes_of_dependent_subgraph(model: ov.Model, sources: list): # Search for nodes in the model graph that depend on nodes in `starts` list but independent of other model Parameter's/ReadValue's - other_inputs = set(model.get_parameters() + get_read_value_ops(model)) - set(sources) + other_inputs = set(model.get_parameters() + get_read_value_ops(model) + get_shape_of_ops(model)) - set(sources) other_nodes = find_dependent_nodes(model, other_inputs) source_dependent_nodes = find_dependent_nodes(model, sources) # TODO: Use symbols on dimensions to filter out ShapeOf subexpressions that do not bring new symbols in the subgraph diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index b3d368e7fc..378abf733b 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -592,7 +592,7 @@ def forward( if "beam_idx" in self.input_names: batch_size = input_ids.shape[0] inputs["beam_idx"] = ( - self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) + self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=np.int32) ) # Run inference self.request.start_async(inputs, share_inputs=True) @@ -753,7 +753,9 @@ def _reshape(self, model: openvino.runtime.Model, batch_size: int, sequence_leng if is_decoder: if inputs.get_any_name().startswith("past_key_values"): shapes[inputs][2] = -1 - elif not inputs.get_any_name().startswith("encoder"): + elif not inputs.get_any_name().startswith("encoder") and not inputs.get_any_name().startswith( + "beam_idx" + ): shapes[inputs][1] = -1 model.reshape(shapes) return model @@ -836,7 +838,9 @@ def _reshape(self, model: openvino.runtime.Model, batch_size: int, sequence_leng if is_decoder: if inputs.get_any_name().startswith("past_key_values"): shapes[inputs][2] = -1 - elif not inputs.get_any_name().startswith("encoder"): + elif not inputs.get_any_name().startswith("encoder") and not inputs.get_any_name().startswith( + "beam_idx" + ): shapes[inputs][1] = -1 model.reshape(shapes) return model diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 05217b8f1d..1461904860 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1194,7 +1194,7 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.encoder, OVEncoder) self.assertIsInstance(ov_model.decoder, OVDecoder) self.assertTrue(ov_model.decoder.stateful) - self.assertIsInstance(ov_model.decoder_with_past, None) + self.assertTrue(ov_model.decoder_with_past is None) self.assertIsInstance(ov_model.config, PretrainedConfig) transformers_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)