Skip to content

Commit 4c30c16

Browse files
committed
add support for the nanoLLaVA model
1 parent 86598a6 commit 4c30c16

File tree

6 files changed

+593
-33
lines changed

6 files changed

+593
-33
lines changed

optimum/exporters/openvino/model_configs.py

Lines changed: 320 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
1818

1919
from packaging import version
20-
from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel
20+
from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, TFPreTrainedModel
2121
from transformers.utils import is_tf_available
2222

2323
from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig
@@ -75,6 +75,7 @@
7575
JaisModelPatcher,
7676
LlamaModelPatcher,
7777
LlavaImageEmbeddingModelPatcher,
78+
LlavaQwen2ImageEmbeddingsModelPatcher,
7879
MistralModelPatcher,
7980
MixtralModelPatcher,
8081
MPTModelPatcher,
@@ -1425,6 +1426,165 @@ class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
14251426
MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
14261427

14271428

1429+
@register_in_tasks_manager(
1430+
"llava-qwen2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
1431+
)
1432+
class LlavaQwen2OpenVINOConfig(OnnxConfig):
1433+
SUPPORTS_PAST = True
1434+
MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
1435+
SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaConfigBehavior]
1436+
NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
1437+
DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
1438+
1439+
def __init__(
1440+
self,
1441+
config: "PretrainedConfig",
1442+
task: str = "feature-extraction",
1443+
int_dtype: str = "int64",
1444+
float_dtype: str = "fp32",
1445+
behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS,
1446+
preprocessors: Optional[List[Any]] = None,
1447+
use_past: bool = False,
1448+
):
1449+
self._behavior = behavior
1450+
self._orig_config = config
1451+
if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1452+
config = AutoConfig.from_pretrained(config.mm_vision_tower, trust_remote_code=True)
1453+
if hasattr(config, "vision_config"):
1454+
config = config.vision_config
1455+
super().__init__(
1456+
config=config,
1457+
task=task,
1458+
int_dtype=int_dtype,
1459+
float_dtype=float_dtype,
1460+
preprocessors=preprocessors,
1461+
)
1462+
1463+
@property
1464+
def inputs(self) -> Dict[str, Dict[int, str]]:
1465+
if not self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1466+
return {}
1467+
return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}
1468+
1469+
@property
1470+
def outputs(self) -> Dict[str, Dict[int, str]]:
1471+
if not self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1472+
return {}
1473+
return {"last_hidden_state": {0: "batch_size"}}
1474+
1475+
def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]):
1476+
if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
1477+
behavior = LlavaConfigBehavior(behavior)
1478+
1479+
if behavior == LlavaConfigBehavior.LANGUAGE:
1480+
model.forward = super(type(model), model).forward
1481+
return model
1482+
1483+
if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1484+
return model
1485+
1486+
if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
1487+
text_embedding = model.model.embed_tokens
1488+
text_embedding.config = model.model.config
1489+
return text_embedding
1490+
1491+
def with_behavior(
1492+
self,
1493+
behavior: Union[str, LlavaConfigBehavior],
1494+
):
1495+
"""
1496+
Creates a config for different behaviour.
1497+
Args:
1498+
behavior ([`ConfigBehavior`]):
1499+
The behavior to use for the new instance.
1500+
"""
1501+
if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
1502+
behavior = LlavaConfigBehavior(behavior)
1503+
1504+
if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
1505+
model_type = self._orig_config.model_type.replace("llava-", "")
1506+
model_type = model_type.replace("_", "-")
1507+
if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
1508+
raise ValueError(
1509+
f"Unsupported language model type provided `{model_type}`. Please define custom export config"
1510+
)
1511+
1512+
if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
1513+
raise ValueError(
1514+
f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
1515+
)
1516+
internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
1517+
"text-generation-with-past"
1518+
]
1519+
internal_export_config = internal_export_config_class(
1520+
self._orig_config,
1521+
use_past=True,
1522+
use_past_in_inputs=True,
1523+
int_dtype=self.int_dtype,
1524+
float_dtype=self.float_dtype,
1525+
)
1526+
InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
1527+
export_config = InputEmbedOpenvVINOConfig(
1528+
self._orig_config,
1529+
task="feature-extraction",
1530+
int_dtype=self.int_dtype,
1531+
float_dtype=self.float_dtype,
1532+
)
1533+
return export_config
1534+
1535+
if behavior == LlavaConfigBehavior.LANGUAGE:
1536+
model_type = self._orig_config.model_type.replace("llava-", "")
1537+
model_type = model_type.replace("_", "-")
1538+
1539+
if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
1540+
raise ValueError(
1541+
f"Unsupported language model type provided `{model_type}`. Please define custom export config"
1542+
)
1543+
1544+
if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
1545+
raise ValueError(
1546+
f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
1547+
)
1548+
internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
1549+
"text-generation-with-past"
1550+
]
1551+
internal_export_config = internal_export_config_class(
1552+
self._orig_config,
1553+
use_past=True,
1554+
use_past_in_inputs=True,
1555+
int_dtype=self.int_dtype,
1556+
float_dtype=self.float_dtype,
1557+
)
1558+
export_config = LMInputEmbedsConfigHelper(internal_export_config)
1559+
export_config._normalized_config = internal_export_config._normalized_config
1560+
return export_config
1561+
1562+
if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1563+
return self.__class__(
1564+
self._orig_config,
1565+
task=self.task,
1566+
int_dtype=self.int_dtype,
1567+
float_dtype=self.float_dtype,
1568+
behavior=behavior,
1569+
preprocessors=self._preprocessors,
1570+
)
1571+
1572+
def patch_model_for_export(
1573+
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
1574+
):
1575+
model_kwargs = model_kwargs or {}
1576+
if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
1577+
return super().patch_model_for_export(model, model_kwargs)
1578+
return LlavaQwen2ImageEmbeddingsModelPatcher(self, model, model_kwargs)
1579+
1580+
def rename_ambiguous_inputs(self, inputs):
1581+
if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1582+
model_inputs = {}
1583+
model_inputs["images"] = inputs["pixel_values"]
1584+
return model_inputs
1585+
return super().rename_ambiguous_inputs(inputs)
1586+
1587+
14281588
class InternVLChatConfigBehavior(str, enum.Enum):
14291589
LANGUAGE = "language"
14301590
VISION_EMBEDDINGS = "vision_embeddings"
@@ -1577,6 +1737,165 @@ def patch_model_for_export(
15771737
return InternVLChatImageEmbeddingModelPatcher(self, model, model_kwargs)
15781738

15791739

1740+
@register_in_tasks_manager(
1741+
"llava-qwen2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
1742+
)
1743+
class LlavaQwen2OpenVINOConfig(OnnxConfig):
1744+
SUPPORTS_PAST = True
1745+
MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
1746+
SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaConfigBehavior]
1747+
NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
1748+
DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
1749+
1750+
def __init__(
1751+
self,
1752+
config: "PretrainedConfig",
1753+
task: str = "feature-extraction",
1754+
int_dtype: str = "int64",
1755+
float_dtype: str = "fp32",
1756+
behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS,
1757+
preprocessors: Optional[List[Any]] = None,
1758+
use_past: bool = False,
1759+
):
1760+
self._behavior = behavior
1761+
self._orig_config = config
1762+
if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1763+
config = AutoConfig.from_pretrained(config.mm_vision_tower, trust_remote_code=True)
1764+
if hasattr(config, "vision_config"):
1765+
config = config.vision_config
1766+
super().__init__(
1767+
config=config,
1768+
task=task,
1769+
int_dtype=int_dtype,
1770+
float_dtype=float_dtype,
1771+
preprocessors=preprocessors,
1772+
)
1773+
1774+
@property
1775+
def inputs(self) -> Dict[str, Dict[int, str]]:
1776+
if not self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1777+
return {}
1778+
return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}
1779+
1780+
@property
1781+
def outputs(self) -> Dict[str, Dict[int, str]]:
1782+
if not self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1783+
return {}
1784+
return {"last_hidden_state": {0: "batch_size"}}
1785+
1786+
def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]):
1787+
if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
1788+
behavior = LlavaConfigBehavior(behavior)
1789+
1790+
if behavior == LlavaConfigBehavior.LANGUAGE:
1791+
model.forward = super(type(model), model).forward
1792+
return model
1793+
1794+
if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1795+
return model
1796+
1797+
if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
1798+
text_embedding = model.model.embed_tokens
1799+
text_embedding.config = model.model.config
1800+
return text_embedding
1801+
1802+
def with_behavior(
1803+
self,
1804+
behavior: Union[str, LlavaConfigBehavior],
1805+
):
1806+
"""
1807+
Creates a config for different behaviour.
1808+
Args:
1809+
behavior ([`ConfigBehavior`]):
1810+
The behavior to use for the new instance.
1811+
"""
1812+
if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
1813+
behavior = LlavaConfigBehavior(behavior)
1814+
1815+
if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
1816+
model_type = self._orig_config.model_type.replace("llava-", "")
1817+
model_type = model_type.replace("_", "-")
1818+
if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
1819+
raise ValueError(
1820+
f"Unsupported language model type provided `{model_type}`. Please define custom export config"
1821+
)
1822+
1823+
if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
1824+
raise ValueError(
1825+
f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
1826+
)
1827+
internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
1828+
"text-generation-with-past"
1829+
]
1830+
internal_export_config = internal_export_config_class(
1831+
self._orig_config,
1832+
use_past=True,
1833+
use_past_in_inputs=True,
1834+
int_dtype=self.int_dtype,
1835+
float_dtype=self.float_dtype,
1836+
)
1837+
InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
1838+
export_config = InputEmbedOpenvVINOConfig(
1839+
self._orig_config,
1840+
task="feature-extraction",
1841+
int_dtype=self.int_dtype,
1842+
float_dtype=self.float_dtype,
1843+
)
1844+
return export_config
1845+
1846+
if behavior == LlavaConfigBehavior.LANGUAGE:
1847+
model_type = self._orig_config.model_type.replace("llava-", "")
1848+
model_type = model_type.replace("_", "-")
1849+
1850+
if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
1851+
raise ValueError(
1852+
f"Unsupported language model type provided `{model_type}`. Please define custom export config"
1853+
)
1854+
1855+
if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
1856+
raise ValueError(
1857+
f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
1858+
)
1859+
internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
1860+
"text-generation-with-past"
1861+
]
1862+
internal_export_config = internal_export_config_class(
1863+
self._orig_config,
1864+
use_past=True,
1865+
use_past_in_inputs=True,
1866+
int_dtype=self.int_dtype,
1867+
float_dtype=self.float_dtype,
1868+
)
1869+
export_config = LMInputEmbedsConfigHelper(internal_export_config)
1870+
export_config._normalized_config = internal_export_config._normalized_config
1871+
return export_config
1872+
1873+
if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1874+
return self.__class__(
1875+
self._orig_config,
1876+
task=self.task,
1877+
int_dtype=self.int_dtype,
1878+
float_dtype=self.float_dtype,
1879+
behavior=behavior,
1880+
preprocessors=self._preprocessors,
1881+
)
1882+
1883+
def patch_model_for_export(
1884+
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
1885+
):
1886+
model_kwargs = model_kwargs or {}
1887+
if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
1888+
return super().patch_model_for_export(model, model_kwargs)
1889+
return LlavaQwen2ImageEmbeddingsModelPatcher(self, model, model_kwargs)
1890+
1891+
def rename_ambiguous_inputs(self, inputs):
1892+
if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
1893+
model_inputs = {}
1894+
model_inputs["images"] = inputs["pixel_values"]
1895+
return model_inputs
1896+
return super().rename_ambiguous_inputs(inputs)
1897+
1898+
15801899
class PooledProjectionsDummyInputGenerator(DummyInputGenerator):
15811900
SUPPORTED_INPUT_NAMES = ["pooled_projections"]
15821901

optimum/exporters/openvino/model_patcher.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2743,3 +2743,21 @@ def __exit__(self, exc_type, exc_value, traceback):
27432743
super().__exit__(exc_type, exc_value, traceback)
27442744
if hasattr(self._model.pos_embed, "_orig_forward"):
27452745
self._model.pos_embed.forward = self._model.pos_embed._orig_forward
2746+
2747+
2748+
class LlavaQwen2ImageEmbeddingsModelPatcher(ModelPatcher):
2749+
def __init__(
2750+
self,
2751+
config: "OnnxConfig",
2752+
model: Union["PreTrainedModel", "TFPreTrainedModel"],
2753+
model_kwargs: Dict[str, Any],
2754+
):
2755+
model.__orig_forward = model.forward
2756+
model.forward = model.encode_images
2757+
super().__init__(config, model, model_kwargs)
2758+
if not self._model.get_vision_tower().is_loaded:
2759+
self._model.get_vision_tower().load_model()
2760+
2761+
def __exit__(self, exc_type, exc_value, traceback):
2762+
super().__exit__(exc_type, exc_value, traceback)
2763+
self._model.forward = self._model.__orig_forward

optimum/exporters/openvino/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,4 +208,4 @@ def get_submodels(model):
208208
return custom_export, fn_get_submodels
209209

210210

211-
MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "internvl-chat"]
211+
MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat"]

0 commit comments

Comments (0)