
Commit 8405675: fix rotary emb initialization
Parent: 50c417e

2 files changed: +116, -0

optimum/exporters/openvino/model_configs.py
74 additions, 0 deletions
@@ -16,13 +16,15 @@
 from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
 from packaging import version
+from transformers import PreTrainedModel, TFPreTrainedModel
 from transformers.utils import is_tf_available
 
 from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig
 from optimum.exporters.onnx.model_configs import (
     CodeGenOnnxConfig,
     FalconOnnxConfig,
     GemmaOnnxConfig,
+    GPTNeoXOnnxConfig,
     LlamaOnnxConfig,
     MistralOnnxConfig,
     MPTOnnxConfig,
@@ -31,6 +33,7 @@
     VaeDecoderOnnxConfig,
     VaeEncoderOnnxConfig,
 )
+from optimum.exporters.onnx.model_patcher import ModelPatcher
 from optimum.exporters.tasks import TasksManager
 from optimum.utils import DEFAULT_DUMMY_SHAPES
 from optimum.utils.input_generators import (
@@ -50,6 +53,9 @@
     ChatGLMModelPatcher,
     CodeGenModelPatcher,
     DBRXModelPatcher,
+    FalconModelPatcher,
+    GptNeoxJapaneseModelPatcher,
+    GptNeoxModelPatcher,
     InternLM2Patcher,
     InternLMModelPatcher,
     JaisModelPatcher,
@@ -60,6 +66,7 @@
     PersimmonModelPatcher,
     Phi3ModelPatcher,
     QwenModelPatcher,
+    RotaryEmbPatcher,
     XverseModelPatcher,
 )
 
@@ -119,6 +126,11 @@ class Qwen2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
     DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return RotaryEmbPatcher(self, model, model_kwargs=model_kwargs)
+
 
 @register_in_tasks_manager("qwen2-moe", *["text-generation", "text-generation-with-past"], library_name="transformers")
 class Qwen2MoEOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
@@ -128,6 +140,11 @@ class Qwen2MoEOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
     DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return RotaryEmbPatcher(self, model, model_kwargs=model_kwargs)
+
 
 @register_in_tasks_manager("minicpm", *["text-generation", "text-generation-with-past"], library_name="transformers")
 class MiniCPMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
@@ -146,6 +163,11 @@ class StableLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
     DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return RotaryEmbPatcher(self, model, model_kwargs=model_kwargs)
+
 
 class ChatGLM2DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator):
     def __init__(
@@ -469,6 +491,12 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
 
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return RotaryEmbPatcher(self, model, model_kwargs=model_kwargs)
+
+
 @register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers")
 class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
     DEFAULT_ONNX_OPSET = 14
@@ -508,6 +536,24 @@ def patch_model_for_export(
         return MPTModelPatcher(self, model, model_kwargs=model_kwargs)
 
 
+@register_in_tasks_manager(
+    "phi",
+    *[
+        "feature-extraction",
+        "feature-extraction-with-past",
+        "text-generation",
+        "text-generation-with-past",
+        "text-classification",
+    ],
+    library_name="transformers",
+)
+class PhiOpenVINOConfig(PhiOnnxConfig):
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return RotaryEmbPatcher(self, model, model_kwargs=model_kwargs)
+
+
 @register_in_tasks_manager(
     "phi3",
     *[
@@ -578,6 +624,11 @@ class FalconOpenVINOConfig(FalconOnnxConfig):
     ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
     DUMMY_PKV_GENERATOR_CLASS = OVFalconDummyPastKeyValuesGenerator
 
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return FalconModelPatcher(self, model, model_kwargs=model_kwargs)
+
 
 @register_in_tasks_manager("unet", *["semantic-segmentation"], library_name="diffusers")
 class UNetOpenVINOConfig(UNetOnnxConfig):
@@ -671,6 +722,11 @@ class GPTNeoxJapaneseOpenVINOConfig(TextDecoderOnnxConfig):
     DEFAULT_ONNX_OPSET = 13
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return GptNeoxJapaneseModelPatcher(self, model, model_kwargs=model_kwargs)
+
 
 @register_in_tasks_manager(
     "cohere",
@@ -859,3 +915,21 @@ def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> "ModelPatcher":
         return MistralModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
+@register_in_tasks_manager(
+    "gpt-neox",
+    *[
+        "feature-extraction",
+        "feature-extraction-with-past",
+        "text-generation",
+        "text-generation-with-past",
+        "text-classification",
+    ],
+    library_name="transformers",
+)
+class GPTNeoxOpenVINOConfig(GPTNeoXOnnxConfig):
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs)
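
With these registrations in place, any conversion that goes through the "openvino" exporter picks the patchers up automatically. A minimal usage sketch (the model id is illustrative; pythia-160m is a GPT-NeoX checkpoint, so it exercises the new GptNeoxModelPatcher):

    # exporting through optimum-intel; during tracing the patcher re-caches
    # the rotary sin/cos tables in float32 before the graph is captured
    from optimum.intel import OVModelForCausalLM

    ov_model = OVModelForCausalLM.from_pretrained("EleutherAI/pythia-160m", export=True)
    ov_model.save_pretrained("pythia-160m-openvino")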

optimum/exporters/openvino/model_patcher.py
42 additions, 0 deletions
@@ -101,6 +101,15 @@ def patch_model_with_bettertransformer(model):
     return model
 
 
+# initialization of sin/cos cached in bf16/fp16 leads to accuracy loss
+# reinitialize them to save in float32 before export
+def _reinitialize_cos_sin_cached_fp32(rotary_emb):
+    if rotary_emb.cos_cached.dtype != torch.float32:
+        rotary_emb._set_cos_sin_cache(
+            seq_len=rotary_emb.max_position_embeddings, device=rotary_emb.inv_freq.device, dtype=torch.float32
+        )
+
+
 def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
     """ """
     batch_size, sequence_length, hidden_dim = hidden_states.shape
@@ -149,6 +158,7 @@ def __enter__(self):
             layer.block_sparse_moe.forward = types.MethodType(
                 _mixtral_sparse_moe_block_forward, layer.block_sparse_moe
             )
+            _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
@@ -687,6 +697,9 @@ def __enter__(self):
             # mistral has some accuracy issues with bf16 with transformers >= 4.42
             # prefill rotary emb sin/cos for avoid this issue
             register_sin_cos_buffer(self._model)
+        else:
+            for layer in self._model.model.layers:
+                _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
@@ -2094,6 +2107,7 @@ def __enter__(self):
             orig_self_attn_fwd = layer.self_attn.forward
             layer.self_attn.forward = types.MethodType(_persimmon_self_attn_sdpa_forward, layer.self_attn)
             layer.self_attn._orig_forward = orig_self_attn_fwd
+            _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
@@ -2221,3 +2235,31 @@ def __exit__(self, exc_type, exc_value, traceback):
             if hasattr(layer.attn, "_orig_attn"):
                 layer.attn._attn = layer.attn._orig_attn
             layer.attn.forward = layer.attn._orig_forward
+
+
+class RotaryEmbPatcher(DecoderModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        for layer in self._model.model.layers:
+            _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
+
+
+class FalconModelPatcher(DecoderModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        for layer in self._model.transformer.h:
+            _reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)
+
+
+class GptNeoxModelPatcher(DecoderModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        for layer in self._model.gpt_neox.layers:
+            _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
+
+
+class GptNeoxJapaneseModelPatcher(DecoderModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        for layer in self._model.gpt_neox_japanese.layers:
+            _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
