
Enable automatic CACHE_DIR for GPU inference only #520

Merged · 2 commits · Jan 23, 2024
Changes from 1 commit
5 changes: 2 additions & 3 deletions docs/source/inference.mdx
@@ -186,11 +186,10 @@ It is possible to pass an `ov_config` parameter to `from_pretrained()` with cust
model = OVModelForSequenceClassification.from_pretrained(model_id, ov_config={"INFERENCE_PRECISION_HINT":"f32"})
```

Optimum Intel leverages OpenVINO's model caching to speed up model compiling. By default a `model_cache` directory is created in the model's directory in the [Hugging Face Hub cache](https://huggingface.co/docs/huggingface_hub/main/en/guides/manage-cache). To override this, use the ov_config parameter and set `CACHE_DIR` to a different value. To disable model caching, set `CACHE_DIR` to an empty string.

Optimum Intel leverages OpenVINO's model caching to speed up model compilation on GPU. By default, a `model_cache` directory is created in the model's directory in the [Hugging Face Hub cache](https://huggingface.co/docs/huggingface_hub/main/en/guides/manage-cache). To override this, use the `ov_config` parameter and set `CACHE_DIR` to a different value. To disable model caching on GPU, set `CACHE_DIR` to an empty string.

```python
model = OVModelForSequenceClassification.from_pretrained(model_id, ov_config={"CACHE_DIR":""})
model = OVModelForSequenceClassification.from_pretrained(model_id, device="GPU", ov_config={"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR":""})
```
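
For comparison, the same mechanism can point the cache at a custom location. The snippet below is an illustrative sketch rather than part of the documented example: the model ID, the cache path, and the availability of a GPU device are all assumptions.

```python
from optimum.intel import OVModelForSequenceClassification

# Hypothetical model ID and cache path, for illustration only
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = OVModelForSequenceClassification.from_pretrained(
    model_id,
    export=True,
    device="GPU",
    ov_config={"CACHE_DIR": "/tmp/my_openvino_cache"},
)
```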

### Sequence-to-sequence models
8 changes: 6 additions & 2 deletions optimum/intel/openvino/modeling_base.py
@@ -346,8 +346,12 @@ def compile(self):
if self.request is None:
logger.info(f"Compiling the model to {self._device} ...")
ov_config = {**self.ov_config}
if "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()):
# Set default CACHE_DIR only if it is not set, and if the model is not in a temporary directory
if (
"CACHE_DIR" not in self.ov_config.keys()
and not str(self.model_save_dir).startswith(gettempdir())
and self._device.lower() == "gpu"
):
# Set default CACHE_DIR only if it is not set, if the model is not in a temporary directory, and device is GPU
cache_dir = Path(self.model_save_dir).joinpath("model_cache")
ov_config["CACHE_DIR"] = str(cache_dir)
logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}")
9 changes: 7 additions & 2 deletions optimum/intel/openvino/modeling_diffusion.py
@@ -537,11 +537,16 @@ def __init__(
self._model_dir = Path(model_dir or parent_model._model_save_dir)
config_path = self._model_dir / model_name / self.CONFIG_NAME
self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {}
if "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()):
self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache")

def _compile(self):
if self.request is None:
if (
"CACHE_DIR" not in self.ov_config.keys()
and not str(self._model_dir).startswith(gettempdir())
and self.device.lower() == "gpu"
):
self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache")

logger.info(f"Compiling the {self._model_name} to {self.device} ...")
self.request = core.compile_model(self.model, self.device, self.ov_config)
# OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html
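
The condition being moved into `_compile()` can be traced in isolation with plain standard-library code; the directory, component name, and device below are made up for illustration:

```python
import os
from pathlib import Path
from tempfile import gettempdir

# Hypothetical stand-ins for self._model_dir, self._model_name, self.device
# and self.ov_config at _compile() time
model_dir = Path.home() / ".cache" / "huggingface" / "some-diffusion-model"
model_name = "unet"
device = "GPU"
ov_config = {}

# Mirrors the added check: set a default CACHE_DIR only when the user has not
# set one, the model is not in a temporary directory, and the device is a GPU
if (
    "CACHE_DIR" not in ov_config
    and not str(model_dir).startswith(gettempdir())
    and device.lower() == "gpu"
):
    ov_config["CACHE_DIR"] = os.path.join(model_dir, model_name, "model_cache")

print(ov_config)  # {'CACHE_DIR': '.../some-diffusion-model/unet/model_cache'}
```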
61 changes: 28 additions & 33 deletions optimum/intel/openvino/modeling_seq2seq.py
@@ -266,34 +266,11 @@ def __init__(
self.device = torch.device("cpu")
self.decoder_with_past = None
enable_compilation = kwargs.get("compile", True)
encoder_cache_dir = Path(self.model_save_dir).joinpath("encoder_cache")
ov_encoder_config = {**self.ov_config}

if "CACHE_DIR" not in ov_encoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()):
ov_encoder_config["CACHE_DIR"] = str(encoder_cache_dir)

self.encoder = OVEncoder(
self.encoder_model, self._device, ov_encoder_config, main_input_name=self.main_input_name
)

decoder_cache_dir = Path(self.model_save_dir).joinpath("decoder_cache")
ov_decoder_config = {**self.ov_config}

if "CACHE_DIR" not in ov_decoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()):
ov_decoder_config["CACHE_DIR"] = str(decoder_cache_dir)

self.decoder = OVDecoder(self.decoder_model, self._device, ov_decoder_config)
self.encoder = OVEncoder(self.encoder_model, self._device, parent_model=self)
self.decoder = OVDecoder(self.decoder_model, self._device, parent_model=self)

if self.use_cache:
decoder_past_cache_dir = Path(self.model_save_dir).joinpath("decoder_past_cache")
ov_decoder_past_config = {**self.ov_config}

if "CACHE_DIR" not in ov_decoder_past_config.keys() and not str(self.model_save_dir).startswith(
gettempdir()
):
ov_decoder_past_config["CACHE_DIR"] = str(decoder_past_cache_dir)

self.decoder_with_past = OVDecoder(self.decoder_with_past_model, self._device, ov_decoder_past_config)
self.decoder_with_past = OVDecoder(self.decoder_with_past_model, self._device, parent_model=self)
if enable_compilation:
self.compile()

@@ -435,13 +412,13 @@ class OVEncoder:
The OpenVINO inference request associated to the encoder.
"""

def __init__(self, model: openvino.runtime.Model, device: str, ov_config: Dict, main_input_name="input_ids"):
def __init__(self, model: openvino.runtime.Model, device: str, parent_model: OVModelForSeq2SeqLM):
self.model = model
self._device = device
self.parent_model = parent_model
self.device = torch.device("cpu")
self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
self.main_input_name = main_input_name
self.ov_config = ov_config
self.main_input_name = self.parent_model.main_input_name or "input_ids"
self.request = None

@add_start_docstrings_to_model_forward(ENCODER_INPUTS_DOCSTRING)
@@ -471,9 +448,18 @@ def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)

def _compile(self):
ov_config = {**self.parent_model.ov_config}
if (
"CACHE_DIR" not in ov_config.keys()
and not str(self.parent_model.model_save_dir).startswith(gettempdir())
and self._device.lower() == "gpu"
):
cache_dir = Path(self.parent_model.model_save_dir).joinpath("model_cache")
ov_config["CACHE_DIR"] = str(cache_dir)

if self.request is None:
logger.info(f"Compiling the encoder to {self._device} ...")
self.request = core.compile_model(self.model, self._device, self.ov_config)
self.request = core.compile_model(self.model, self._device, ov_config)
# OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html
if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2:
logger.info(f"{self._device} SUPPORTED_PROPERTIES:")
@@ -491,9 +477,10 @@ class OVDecoder:
The device type used by this process.
"""

def __init__(self, model: openvino.runtime.Model, device: str, ov_config: Dict):
def __init__(self, model: openvino.runtime.Model, device: str, parent_model: OVModelForSeq2SeqLM):
self.model = model
self._device = device
self.parent_model = parent_model
self.device = torch.device("cpu")
self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
self.key_value_input_names = [key for key in self.input_names if "key_values" in key]
@@ -508,7 +495,6 @@ def __init__(self, model: openvino.runtime.Model, device: str, ov_config: Dict):
self.use_past = False
self.num_pkv = 4

self.ov_config = ov_config
self.request = None

@add_start_docstrings_to_model_forward(DECODER_INPUTS_DOCSTRING)
@@ -574,9 +560,18 @@ def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)

def _compile(self):
ov_config = {**self.parent_model.ov_config}
if (
"CACHE_DIR" not in ov_config.keys()
and not str(self.parent_model.model_save_dir).startswith(gettempdir())
and self._device.lower() == "gpu"
):
cache_dir = Path(self.parent_model.model_save_dir).joinpath("model_cache")
ov_config["CACHE_DIR"] = str(cache_dir)

if self.request is None:
logger.info(f"Compiling the decoder to {self._device} ...")
compiled_model = core.compile_model(self.model, self._device, self.ov_config)
compiled_model = core.compile_model(self.model, self._device, ov_config)
self.request = compiled_model.create_infer_request()
# OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html
if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2:
15 changes: 7 additions & 8 deletions tests/openvino/test_modeling.py
@@ -90,7 +90,7 @@

SEED = 42

F32_CONFIG = {"CACHE_DIR": "", "INFERENCE_PRECISION_HINT": "f32"}
F32_CONFIG = {"INFERENCE_PRECISION_HINT": "f32"}


class Timer(object):
@@ -117,11 +117,6 @@ def test_load_from_hub_and_save_model(self):
self.assertIsInstance(loaded_model.config, PretrainedConfig)
loaded_model_outputs = loaded_model(**tokens)

# Test that model caching is automatically enabled
openvino_cache_dir = loaded_model.model_save_dir / "model_cache"
self.assertTrue(openvino_cache_dir.is_dir())
self.assertGreaterEqual(len(list(openvino_cache_dir.glob("*.blob"))), 1)

# Test specifying ov_config with throughput hint and manual cache dir
manual_openvino_cache_dir = loaded_model.model_save_dir / "manual_model_cache"
ov_config = {"CACHE_DIR": str(manual_openvino_cache_dir), "PERFORMANCE_HINT": "THROUGHPUT"}
@@ -598,11 +593,15 @@ def test_compare_with_and_without_past_key_values(self):
gc.collect()

def test_auto_device_loading(self):
model_id = MODEL_NAMES["gpt2"]
for device in ("AUTO", "AUTO:CPU"):
model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, device=device)
OV_MODEL_ID = "echarlaix/distilbert-base-uncased-finetuned-sst-2-english-openvino"
Collaborator review comment: can be moved outside loop
model = OVModelForSequenceClassification.from_pretrained(OV_MODEL_ID, device=device)
model.half()
self.assertEqual(model._device, device)
if device == "AUTO:CPU":
model = OVModelForSequenceClassification.from_pretrained(OV_MODEL_ID, device=device)
message = "Model should not be loaded from cache without explicitly setting CACHE_DIR"
self.assertFalse(model.request.get_property("LOADED_FROM_CACHE"), message)
del model
gc.collect()
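
Outside the test, the same OpenVINO property can be used to check whether a compiled model came from the cache; a sketch assuming an explicitly set `CACHE_DIR` and the OpenVINO model ID already used in this test file:

```python
from optimum.intel import OVModelForSequenceClassification

model_id = "echarlaix/distilbert-base-uncased-finetuned-sst-2-english-openvino"
model = OVModelForSequenceClassification.from_pretrained(
    model_id, ov_config={"CACHE_DIR": "/tmp/ov_cache"}
)

# False on the first run (the blob is compiled and written to CACHE_DIR),
# True on subsequent runs when OpenVINO reuses the cached blob
print(model.request.get_property("LOADED_FROM_CACHE"))
```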
