
Enable automatic CACHE_DIR for GPU inference only #520

Merged · 2 commits · Jan 23, 2024
Changes from 1 commit
5 changes: 2 additions & 3 deletions docs/source/inference.mdx
@@ -186,11 +186,10 @@ It is possible to pass an `ov_config` parameter to `from_pretrained()` with cust
model = OVModelForSequenceClassification.from_pretrained(model_id, ov_config={"INFERENCE_PRECISION_HINT":"f32"})
```

Optimum Intel leverages OpenVINO's model caching to speed up model compiling. By default a `model_cache` directory is created in the model's directory in the [Hugging Face Hub cache](https://huggingface.co/docs/huggingface_hub/main/en/guides/manage-cache). To override this, use the ov_config parameter and set `CACHE_DIR` to a different value. To disable model caching, set `CACHE_DIR` to an empty string.

Optimum Intel leverages OpenVINO's model caching to speed up model compilation on GPU. By default, a `model_cache` directory is created in the model's directory in the [Hugging Face Hub cache](https://huggingface.co/docs/huggingface_hub/main/en/guides/manage-cache). To override this, use the `ov_config` parameter and set `CACHE_DIR` to a different value. To disable model caching on GPU, set `CACHE_DIR` to an empty string.

```python
model = OVModelForSequenceClassification.from_pretrained(model_id, ov_config={"CACHE_DIR":""})
model = OVModelForSequenceClassification.from_pretrained(model_id, device="GPU", ov_config={"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR":""})
```
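
For comparison, the same mechanism can point the cache at a custom location. The snippet below is an illustrative sketch rather than part of the documented example: the model ID, the cache path, and the availability of a GPU device are all assumptions.

```python
from optimum.intel import OVModelForSequenceClassification

# Hypothetical model ID and cache path, for illustration only
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = OVModelForSequenceClassification.from_pretrained(
    model_id,
    export=True,
    device="GPU",
    ov_config={"CACHE_DIR": "/tmp/my_openvino_cache"},
)
```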

### Sequence-to-sequence models
8 changes: 6 additions & 2 deletions optimum/intel/openvino/modeling_base.py
@@ -346,8 +346,12 @@ def compile(self):
if self.request is None:
logger.info(f"Compiling the model to {self._device} ...")
ov_config = {**self.ov_config}
if "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()):
# Set default CACHE_DIR only if it is not set, and if the model is not in a temporary directory
if (
"CACHE_DIR" not in self.ov_config.keys()
and not str(self.model_save_dir).startswith(gettempdir())
and self._device.lower() == "gpu"
):
# Set default CACHE_DIR only if it is not set, if the model is not in a temporary directory, and device is GPU
cache_dir = Path(self.model_save_dir).joinpath("model_cache")
ov_config["CACHE_DIR"] = str(cache_dir)
logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}")
9 changes: 7 additions & 2 deletions optimum/intel/openvino/modeling_diffusion.py
@@ -537,11 +537,16 @@ def __init__(
self._model_dir = Path(model_dir or parent_model._model_save_dir)
config_path = self._model_dir / model_name / self.CONFIG_NAME
self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {}
if "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()):
self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache")

def _compile(self):
if self.request is None:
if (
"CACHE_DIR" not in self.ov_config.keys()
and not str(self._model_dir).startswith(gettempdir())
and self.device.lower() == "gpu"
):
self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache")

logger.info(f"Compiling the {self._model_name} to {self.device} ...")
self.request = core.compile_model(self.model, self.device, self.ov_config)
# OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html
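
The condition being moved into `_compile()` can be traced in isolation with plain standard-library code; the directory, component name, and device below are made up for illustration:

```python
import os
from pathlib import Path
from tempfile import gettempdir

# Hypothetical stand-ins for self._model_dir, self._model_name, self.device
# and self.ov_config at _compile() time
model_dir = Path.home() / ".cache" / "huggingface" / "some-diffusion-model"
model_name = "unet"
device = "GPU"
ov_config = {}

# Mirrors the added check: set a default CACHE_DIR only when the user has not
# set one, the model is not in a temporary directory, and the device is a GPU
if (
    "CACHE_DIR" not in ov_config
    and not str(model_dir).startswith(gettempdir())
    and device.lower() == "gpu"
):
    ov_config["CACHE_DIR"] = os.path.join(model_dir, model_name, "model_cache")

print(ov_config)  # {'CACHE_DIR': '.../some-diffusion-model/unet/model_cache'}
```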
61 changes: 28 additions & 33 deletions optimum/intel/openvino/modeling_seq2seq.py
@@ -266,34 +266,11 @@ def __init__(
self.device = torch.device("cpu")
self.decoder_with_past = None
enable_compilation = kwargs.get("compile", True)
encoder_cache_dir = Path(self.model_save_dir).joinpath("encoder_cache")
ov_encoder_config = {**self.ov_config}

if "CACHE_DIR" not in ov_encoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()):
ov_encoder_config["CACHE_DIR"] = str(encoder_cache_dir)

self.encoder = OVEncoder(
self.encoder_model, self._device, ov_encoder_config, main_input_name=self.main_input_name
)

decoder_cache_dir = Path(self.model_save_dir).joinpath("decoder_cache")
ov_decoder_config = {**self.ov_config}

if "CACHE_DIR" not in ov_decoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()):
ov_decoder_config["CACHE_DIR"] = str(decoder_cache_dir)

self.decoder = OVDecoder(self.decoder_model, self._device, ov_decoder_config)
self.encoder = OVEncoder(self.encoder_model, self._device, parent_model=self)
self.decoder = OVDecoder(self.decoder_model, self._device, parent_model=self)

if self.use_cache:
decoder_past_cache_dir = Path(self.model_save_dir).joinpath("decoder_past_cache")
ov_decoder_past_config = {**self.ov_config}

if "CACHE_DIR" not in ov_decoder_past_config.keys() and not str(self.model_save_dir).startswith(
gettempdir()
):
ov_decoder_past_config["CACHE_DIR"] = str(decoder_past_cache_dir)

self.decoder_with_past = OVDecoder(self.decoder_with_past_model, self._device, ov_decoder_past_config)
self.decoder_with_past = OVDecoder(self.decoder_with_past_model, self._device, parent_model=self)
if enable_compilation:
self.compile()

@@ -435,13 +412,13 @@ class OVEncoder:
The OpenVINO inference request associated to the encoder.
"""

def __init__(self, model: openvino.runtime.Model, device: str, ov_config: Dict, main_input_name="input_ids"):
def __init__(self, model: openvino.runtime.Model, device: str, parent_model: OVModelForSeq2SeqLM):
self.model = model
self._device = device
self.parent_model = parent_model
self.device = torch.device("cpu")
self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
self.main_input_name = main_input_name
self.ov_config = ov_config
self.main_input_name = self.parent_model.main_input_name or "input_ids"
self.request = None

@add_start_docstrings_to_model_forward(ENCODER_INPUTS_DOCSTRING)
@@ -471,9 +448,18 @@ def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)

def _compile(self):
ov_config = {**self.parent_model.ov_config}
if (
"CACHE_DIR" not in ov_config.keys()
and not str(self.parent_model.model_save_dir).startswith(gettempdir())
and self._device.lower() == "gpu"
):
cache_dir = Path(self.parent_model.model_save_dir).joinpath("model_cache")
ov_config["CACHE_DIR"] = str(cache_dir)

if self.request is None:
logger.info(f"Compiling the encoder to {self._device} ...")
self.request = core.compile_model(self.model, self._device, self.ov_config)
self.request = core.compile_model(self.model, self._device, ov_config)
# OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html
if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2:
logger.info(f"{self._device} SUPPORTED_PROPERTIES:")
@@ -491,9 +477,10 @@ class OVDecoder:
The device type used by this process.
"""

def __init__(self, model: openvino.runtime.Model, device: str, ov_config: Dict):
def __init__(self, model: openvino.runtime.Model, device: str, parent_model: OVModelForSeq2SeqLM):
self.model = model
self._device = device
self.parent_model = parent_model
self.device = torch.device("cpu")
self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
self.key_value_input_names = [key for key in self.input_names if "key_values" in key]
@@ -508,7 +495,6 @@ def __init__(self, model: openvino.runtime.Model, device: str, ov_config: Dict):
self.use_past = False
self.num_pkv = 4

self.ov_config = ov_config
self.request = None

@add_start_docstrings_to_model_forward(DECODER_INPUTS_DOCSTRING)
@@ -574,9 +560,18 @@ def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)

def _compile(self):
ov_config = {**self.parent_model.ov_config}
if (
"CACHE_DIR" not in ov_config.keys()
and not str(self.parent_model.model_save_dir).startswith(gettempdir())
and self._device.lower() == "gpu"
):
cache_dir = Path(self.parent_model.model_save_dir).joinpath("model_cache")
ov_config["CACHE_DIR"] = str(cache_dir)

if self.request is None:
logger.info(f"Compiling the decoder to {self._device} ...")
compiled_model = core.compile_model(self.model, self._device, self.ov_config)
compiled_model = core.compile_model(self.model, self._device, ov_config)
self.request = compiled_model.create_infer_request()
# OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html
if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2:
15 changes: 7 additions & 8 deletions tests/openvino/test_modeling.py
@@ -90,7 +90,7 @@

SEED = 42

F32_CONFIG = {"CACHE_DIR": "", "INFERENCE_PRECISION_HINT": "f32"}
F32_CONFIG = {"INFERENCE_PRECISION_HINT": "f32"}


class Timer(object):
@@ -117,11 +117,6 @@ def test_load_from_hub_and_save_model(self):
self.assertIsInstance(loaded_model.config, PretrainedConfig)
loaded_model_outputs = loaded_model(**tokens)

# Test that model caching is automatically enabled
openvino_cache_dir = loaded_model.model_save_dir / "model_cache"
self.assertTrue(openvino_cache_dir.is_dir())
self.assertGreaterEqual(len(list(openvino_cache_dir.glob("*.blob"))), 1)

# Test specifying ov_config with throughput hint and manual cache dir
manual_openvino_cache_dir = loaded_model.model_save_dir / "manual_model_cache"
ov_config = {"CACHE_DIR": str(manual_openvino_cache_dir), "PERFORMANCE_HINT": "THROUGHPUT"}
@@ -598,11 +593,15 @@ def test_compare_with_and_without_past_key_values(self):
gc.collect()

def test_auto_device_loading(self):
model_id = MODEL_NAMES["gpt2"]
for device in ("AUTO", "AUTO:CPU"):
model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, device=device)
OV_MODEL_ID = "echarlaix/distilbert-base-uncased-finetuned-sst-2-english-openvino"
Collaborator review comment: can be moved outside loop
model = OVModelForSequenceClassification.from_pretrained(OV_MODEL_ID, device=device)
model.half()
self.assertEqual(model._device, device)
if device == "AUTO:CPU":
model = OVModelForSequenceClassification.from_pretrained(OV_MODEL_ID, device=device)
message = "Model should not be loaded from cache without explicitly setting CACHE_DIR"
self.assertFalse(model.request.get_property("LOADED_FROM_CACHE"), message)
del model
gc.collect()
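
Outside the test, the same OpenVINO property can be used to check whether a compiled model came from the cache; a sketch assuming an explicitly set `CACHE_DIR` and the OpenVINO model ID already used in this test file:

```python
from optimum.intel import OVModelForSequenceClassification

model_id = "echarlaix/distilbert-base-uncased-finetuned-sst-2-english-openvino"
model = OVModelForSequenceClassification.from_pretrained(
    model_id, ov_config={"CACHE_DIR": "/tmp/ov_cache"}
)

# False on the first run (the blob is compiled and written to CACHE_DIR),
# True on subsequent runs when OpenVINO reuses the cached blob
print(model.request.get_property("LOADED_FROM_CACHE"))
```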
