Merge remote-tracking branch 'sl/stateful' into ea/stateful

huggingface · Jan 8, 2024 · 70d086a · 70d086a
2 parents e2194b3 + 9992419
commit 70d086a
Show file tree

Hide file tree

Showing 5 changed files with 11 additions and 15 deletions.
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
@@ -125,7 +125,8 @@ def main_export(
             `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point, `f32` - means no compression.
         compression_ratio (`Optional[float]`, defaults to `None`):
             Compression ratio between primary and backup precision (only relevant to INT4).
-        stateful (`Optional[bool]`)  - Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
+        stateful (`Optional[bool]`, defaults to `True`):
+            Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
         **kwargs_shapes (`Dict`):
             Shapes to use during inference. This argument allows overriding the default shapes used during the ONNX export.
 

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
@@ -30,8 +30,6 @@
 from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx
 from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
 from optimum.utils import is_diffusers_available
-from .stateful import patch_stateful, raise_if_openvino_is_too_old
-from .better_transformer_patch import patch_model_with_bettertransformer
 
 from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
 from .better_transformer_patch import patch_model_with_bettertransformer
@@ -130,7 +128,7 @@ def export(
             Compression ratio between primary and backup precision (only relevant to INT4).
         input_shapes (`Optional[Dict]`, defaults to `None`):
             If specified, allows using specific shapes for the example input provided to the exporter.
-         stateful (`Optional[bool]`):
+        stateful (`Optional[bool]`, defaults to `False`):
             Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
 
     Returns:
@@ -242,8 +240,6 @@ def export_pytorch_via_onnx(
             `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point.
         compression_ratio (`Optional[float]`, defaults to `None`):
             Compression ratio between primary and backup precision (only relevant to INT4).
-        stateful (`Optional[bool]`):
-            Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
 
     Returns:
         `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, and the named inputs from
@@ -307,7 +303,7 @@ def export_pytorch(
             `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point.
         compression_ratio (`Optional[float]`, defaults to `None`):
             Compression ratio between primary and backup precision (only relevant to INT4).
-        stateful (`Optional[bool]`):
+        stateful (`Optional[bool]`, defaults to `False`):
             Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
 
     Returns:
@@ -408,9 +404,11 @@ def ts_patched_forward(*args, **kwargs):
             if patch_model_forward:
                 model.forward = orig_forward
             if stateful:
-                raise ValueError(
-                    "Making stateful models is not supported when exporting to ONNX as an intermediate step. "
-                    "Set stateful=False, or provide a model that can be converted to OpenVINO without fallback to ONNX conversion path."
+                # cannot raise because stateful is enabled by default and it would break backward compatibility for models that couldn't convert to OV directly
+                # TODO: Implement stateful for ONNX path as well, not doing it right now because of lack of validation
+                logger.warn(
+                    "[ WARNING ] Making stateful models is not supported when exporting to ONNX as an intermediate step. Stateless model will be exported instead. "
+                    "Provide a model that can be converted to OpenVINO without fallback to ONNX conversion path."
                 )
             return export_pytorch_via_onnx(
                 model,
@@ -488,7 +486,7 @@ def export_models(
             Compression ratio between primary and backup precision (only relevant to INT4).
         model_kwargs (Optional[Dict[str, Any]], optional):
             Additional kwargs for model export.
-        stateful (`Optional[bool]`)
+        stateful (`Optional[bool]`, defaults to `False`):
             Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs
 
     Raises:

diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py
@@ -18,8 +18,6 @@
 import numpy as np
 from transformers import PretrainedConfig
 
-import numpy as np
-from packaging import version
 import openvino as ov
 from openvino.runtime import opset13
 from optimum.intel.utils.import_utils import _openvino_version, is_openvino_version

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
@@ -35,7 +35,6 @@
 from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS
 from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel
 from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE
-from ...exporters.openvino import patch_stateful, raise_if_openvino_is_too_old
 
 
 if is_transformers_version("<", "4.25.0"):

diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
@@ -671,7 +671,7 @@ def test_stateful(self, model_arch):
     def test_stateful_on_converted_model(self):
         model_id = "vuiseng9/ov-gpt2-fp32-kv-cache"
         # reference without state
-        loaded_model = OVModelForCausalLM.from_pretrained(model_id)
+        loaded_model = OVModelForCausalLM.from_pretrained(model_id, stateful=False)
         self.assertIsInstance(loaded_model.config, PretrainedConfig)
         self.assertFalse(loaded_model.stateful)
         self.assertTrue(loaded_model.use_cache)