|
17 | 17 | from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
18 | 18 |
|
19 | 19 | from packaging import version
|
20 |
| -from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel |
| 20 | +from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, TFPreTrainedModel |
21 | 21 | from transformers.utils import is_tf_available
|
22 | 22 |
|
23 | 23 | from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig
|
|
75 | 75 | JaisModelPatcher,
|
76 | 76 | LlamaModelPatcher,
|
77 | 77 | LlavaImageEmbeddingModelPatcher,
|
| 78 | + LlavaQwen2ImageEmbeddingsModelPatcher, |
78 | 79 | MistralModelPatcher,
|
79 | 80 | MixtralModelPatcher,
|
80 | 81 | MPTModelPatcher,
|
@@ -1425,6 +1426,165 @@ class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig):
|
1425 | 1426 | MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
|
1426 | 1427 |
|
1427 | 1428 |
|
@register_in_tasks_manager(
    "llava-qwen2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
)
class LlavaQwen2OpenVINOConfig(OnnxConfig):
    """OpenVINO export config for llava-qwen2 (nanoLLaVA-style) multimodal models.

    The model is exported as three sub-models selected via ``behavior``:
    the text embeddings, the language model (with KV-cache past), and the
    vision embeddings tower.
    """

    SUPPORTS_PAST = True
    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
    SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaConfigBehavior]
    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)

    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "feature-extraction",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS,
        preprocessors: Optional[List[Any]] = None,
        use_past: bool = False,
    ):
        # `use_past` is accepted for signature compatibility with sibling export
        # configs but is not used directly here; past-key-values handling is
        # delegated to the internal text-generation config built in `with_behavior`.
        self._behavior = behavior
        self._orig_config = config
        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            # The vision tower ships as a separate (remote-code) checkpoint; load
            # its config so dummy inputs reflect the vision settings, not the LM's.
            config = AutoConfig.from_pretrained(config.mm_vision_tower, trust_remote_code=True)
            if hasattr(config, "vision_config"):
                config = config.vision_config
        super().__init__(
            config=config,
            task=task,
            int_dtype=int_dtype,
            float_dtype=float_dtype,
            preprocessors=preprocessors,
        )

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        # Only the vision-embeddings sub-model consumes pixel inputs; the other
        # behaviors get their inputs from the internal text export config.
        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
            return {}
        return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
            return {}
        return {"last_hidden_state": {0: "batch_size"}}

    def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]):
        """Return the sub-module of `model` to export for the requested behavior.

        Raises:
            ValueError: if `behavior` is not a supported `LlavaConfigBehavior`.
        """
        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
            behavior = LlavaConfigBehavior(behavior)

        if behavior == LlavaConfigBehavior.LANGUAGE:
            # Bypass the multimodal wrapper's forward and expose the base LM forward.
            model.forward = super(type(model), model).forward
            return model

        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            return model

        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
            text_embedding = model.model.embed_tokens
            # Attach the LM config so downstream export helpers can introspect it.
            text_embedding.config = model.model.config
            return text_embedding

        # Previously fell through returning None; fail loudly instead.
        raise ValueError(f"Unsupported behavior provided: `{behavior}`")

    def _get_text_model_type(self) -> str:
        """Derive the registered text-model type from the composite model type."""
        model_type = self._orig_config.model_type.replace("llava-", "")
        return model_type.replace("_", "-")

    def _create_internal_text_export_config(self, model_type: str):
        """Build the `text-generation-with-past` export config for the LM part.

        Raises:
            ValueError: if no OpenVINO text-generation export config is
                registered for `model_type`.
        """
        if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
            raise ValueError(
                f"Unsupported language model type provided `{model_type}`. Please define custom export config"
            )

        # `.get` keeps the intended ValueError instead of a bare KeyError when the
        # model type has no "openvino" backend entry at all.
        if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type].get("openvino", {}):
            raise ValueError(
                f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
            )
        internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
            "text-generation-with-past"
        ]
        return internal_export_config_class(
            self._orig_config,
            use_past=True,
            use_past_in_inputs=True,
            int_dtype=self.int_dtype,
            float_dtype=self.float_dtype,
        )

    def with_behavior(
        self,
        behavior: Union[str, LlavaConfigBehavior],
    ):
        """
        Creates a config for different behaviour.

        Args:
            behavior ([`ConfigBehavior`]):
                The behavior to use for the new instance.
        """
        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
            behavior = LlavaConfigBehavior(behavior)

        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
            internal_export_config = self._create_internal_text_export_config(self._get_text_model_type())
            # NOTE: mutates a class attribute shared by all instances; kept for
            # consistency with the other multimodal export configs in this file.
            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
            export_config = InputEmbedOpenvVINOConfig(
                self._orig_config,
                task="feature-extraction",
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
            )
            return export_config

        if behavior == LlavaConfigBehavior.LANGUAGE:
            internal_export_config = self._create_internal_text_export_config(self._get_text_model_type())
            export_config = LMInputEmbedsConfigHelper(internal_export_config)
            export_config._normalized_config = internal_export_config._normalized_config
            return export_config

        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            return self.__class__(
                self._orig_config,
                task=self.task,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
                behavior=behavior,
                preprocessors=self._preprocessors,
            )

        raise ValueError(f"Unsupported behavior provided: `{behavior}`")

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ):
        """Wrap the vision tower with its image-embedding patcher; defer otherwise."""
        model_kwargs = model_kwargs or {}
        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
            return super().patch_model_for_export(model, model_kwargs)
        return LlavaQwen2ImageEmbeddingsModelPatcher(self, model, model_kwargs)

    def rename_ambiguous_inputs(self, inputs):
        """Map the exporter's `pixel_values` to the model's `images` argument."""
        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            model_inputs = {}
            model_inputs["images"] = inputs["pixel_values"]
            return model_inputs
        return super().rename_ambiguous_inputs(inputs)
| 1586 | + |
| 1587 | + |
1428 | 1588 | class InternVLChatConfigBehavior(str, enum.Enum):
|
1429 | 1589 | LANGUAGE = "language"
|
1430 | 1590 | VISION_EMBEDDINGS = "vision_embeddings"
|
@@ -1577,6 +1737,165 @@ def patch_model_for_export(
|
1577 | 1737 | return InternVLChatImageEmbeddingModelPatcher(self, model, model_kwargs)
|
1578 | 1738 |
|
1579 | 1739 |
|
@register_in_tasks_manager(
    "llava-qwen2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
)
class LlavaQwen2OpenVINOConfig(OnnxConfig):
    """OpenVINO export config for llava-qwen2 (nanoLLaVA-style) multimodal models."""

    # NOTE(review): this class is an exact duplicate of the
    # `LlavaQwen2OpenVINOConfig` defined earlier in this file and registers the
    # same "llava-qwen2" model type a second time — presumably the later
    # registration overrides the first (confirm against
    # `register_in_tasks_manager` semantics). One of the two copies should be
    # removed.
    SUPPORTS_PAST = True
    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
    SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaConfigBehavior]
    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)

    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "feature-extraction",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS,
        preprocessors: Optional[List[Any]] = None,
        use_past: bool = False,
    ):
        # `use_past` is accepted but never read in this class; presumably kept
        # for signature compatibility with sibling export configs.
        self._behavior = behavior
        self._orig_config = config
        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            # The vision tower is a separate (remote-code) checkpoint; load its
            # config so dummy inputs are generated from the vision settings.
            config = AutoConfig.from_pretrained(config.mm_vision_tower, trust_remote_code=True)
            if hasattr(config, "vision_config"):
                config = config.vision_config
        super().__init__(
            config=config,
            task=task,
            int_dtype=int_dtype,
            float_dtype=float_dtype,
            preprocessors=preprocessors,
        )

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        # Only the vision-embeddings sub-model consumes pixel inputs.
        if not self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            return {}
        return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        if not self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            return {}
        return {"last_hidden_state": {0: "batch_size"}}

    def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]):
        """Return the sub-module of `model` to export for the requested behavior.

        NOTE(review): falls through returning None for an unrecognized behavior.
        """
        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
            behavior = LlavaConfigBehavior(behavior)

        if behavior == LlavaConfigBehavior.LANGUAGE:
            # Bypass the multimodal wrapper's forward; use the base LM forward.
            model.forward = super(type(model), model).forward
            return model

        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            return model

        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
            text_embedding = model.model.embed_tokens
            # Attach the LM config so downstream export helpers can introspect it.
            text_embedding.config = model.model.config
            return text_embedding

    def with_behavior(
        self,
        behavior: Union[str, LlavaConfigBehavior],
    ):
        """
        Creates a config for different behaviour.
        Args:
            behavior ([`ConfigBehavior`]):
                The behavior to use for the new instance.
        """
        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
            behavior = LlavaConfigBehavior(behavior)

        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
            # Strip the "llava-" prefix to recover the registered LM model type.
            model_type = self._orig_config.model_type.replace("llava-", "")
            model_type = model_type.replace("_", "-")
            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
                raise ValueError(
                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
                )

            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
                raise ValueError(
                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
                )
            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
                "text-generation-with-past"
            ]
            internal_export_config = internal_export_config_class(
                self._orig_config,
                use_past=True,
                use_past_in_inputs=True,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
            )
            # NOTE: mutates a class attribute shared by all instances.
            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
            export_config = InputEmbedOpenvVINOConfig(
                self._orig_config,
                task="feature-extraction",
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
            )
            return export_config

        if behavior == LlavaConfigBehavior.LANGUAGE:
            # Same model-type resolution as the TEXT_EMBEDDINGS branch above.
            model_type = self._orig_config.model_type.replace("llava-", "")
            model_type = model_type.replace("_", "-")

            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
                raise ValueError(
                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
                )

            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
                raise ValueError(
                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
                )
            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
                "text-generation-with-past"
            ]
            internal_export_config = internal_export_config_class(
                self._orig_config,
                use_past=True,
                use_past_in_inputs=True,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
            )
            export_config = LMInputEmbedsConfigHelper(internal_export_config)
            export_config._normalized_config = internal_export_config._normalized_config
            return export_config

        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            return self.__class__(
                self._orig_config,
                task=self.task,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
                behavior=behavior,
                preprocessors=self._preprocessors,
            )

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ):
        # Only the vision tower needs the image-embedding patcher.
        model_kwargs = model_kwargs or {}
        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
            return super().patch_model_for_export(model, model_kwargs)
        return LlavaQwen2ImageEmbeddingsModelPatcher(self, model, model_kwargs)

    def rename_ambiguous_inputs(self, inputs):
        # The underlying model's forward takes `images`, not `pixel_values`.
        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
            model_inputs = {}
            model_inputs["images"] = inputs["pixel_values"]
            return model_inputs
        return super().rename_ambiguous_inputs(inputs)
| 1897 | + |
| 1898 | + |
1580 | 1899 | class PooledProjectionsDummyInputGenerator(DummyInputGenerator):
|
1581 | 1900 | SUPPORTED_INPUT_NAMES = ["pooled_projections"]
|
1582 | 1901 |
|
|
0 commit comments