Fix config saving when the check on misplaced args is broken #966

Merged · 13 commits · Nov 13, 2024
9 changes: 7 additions & 2 deletions optimum/exporters/openvino/convert.py
@@ -71,6 +71,7 @@
_get_open_clip_submodels_fn_and_export_configs,
clear_class_registry,
remove_none_from_dummy_inputs,
save_config,
)


@@ -659,7 +660,11 @@ def export_from_model(
files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
elif library_name != "diffusers":
if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
# some model configs raise an error in this check when loaded without parameter initialization
try:
misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
except KeyError:
misplaced_generation_parameters = {}
if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
@@ -671,7 +676,7 @@
setattr(model.config, param_name, None)

# Saving the model config and preprocessor as this is needed sometimes.
model.config.save_pretrained(output)
save_config(model.config, output)
generation_config = getattr(model, "generation_config", None)
if generation_config is not None:
try:
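The same try/except guard recurs in modeling_base.py and modeling_base_seq2seq.py below. A minimal standalone sketch of the pattern, assuming only that transformers >= 4.45 exposes the private helper used here (the wrapper name is hypothetical):

from transformers import PretrainedConfig


def get_misplaced_generation_parameters(config: PretrainedConfig) -> dict:
    # Some configs raise KeyError inside the private transformers helper when
    # generation attributes were never initialized; treat that as "nothing misplaced".
    try:
        return config._get_non_default_generation_parameters()
    except KeyError:
        return {}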
2 changes: 1 addition & 1 deletion optimum/exporters/openvino/model_configs.py
@@ -1444,7 +1444,7 @@ class InternVLChatConfigBehavior(str, enum.Enum):
@register_in_tasks_manager("internvl-chat", *["image-text-to-text"], library_name="transformers")
class InternVLChatOpenVINOConfig(OnnxConfig):
SUPPORTED_BEHAVIORS = [model_type.value for model_type in InternVLChatConfigBehavior]
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)

def __init__(
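A hedged note on this one-line fix: the class above declares DummyVisionInputGenerator as its dummy-input generator, which resolves image dimensions through the normalized config, so the vision flavor is the one that fits. A sketch, assuming "config" is a loaded InternVL model config with a vision_config section:

from optimum.utils import NormalizedVisionConfig

# The dummy vision input generator reads image_size / num_channels through the
# normalized config; NormalizedTextConfig would not expose these vision fields.
normalized = NormalizedVisionConfig(config.vision_config)  # "config" assumed
print(normalized.image_size, normalized.num_channels)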
18 changes: 18 additions & 0 deletions optimum/exporters/openvino/utils.py
@@ -13,7 +13,9 @@
# limitations under the License.

import inspect
import logging
from collections import namedtuple
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from transformers.utils import is_torch_available
@@ -25,6 +27,9 @@
from optimum.utils import is_diffusers_available


logger = logging.getLogger(__name__)


InputInfo = namedtuple("InputInfo", ["name", "shape", "type", "example"])


@@ -209,3 +214,16 @@ def get_submodels(model):


MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv"]


def save_config(config, save_dir):
try:
config.save_pretrained(save_dir)
except Exception as exp:
logger.warning(
f"Attempt to save config using standard API has failed with {exp}. There may be an issue with model config, please check its correctness before usage."
)
save_dir = Path(save_dir)
save_dir.mkdir(exist_ok=True, parents=True)
output_config_file = save_dir / "config.json"
config.to_json_file(output_config_file, use_diff=True)
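
A hedged usage sketch of the new helper; the checkpoint name is an arbitrary example, and the point is only that config.json lands on disk even when save_pretrained raises:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("gpt2")  # arbitrary example checkpoint
# If config.save_pretrained raises (e.g. via the misplaced-parameters check),
# save_config logs a warning and falls back to config.to_json_file.
save_config(config, "exported_model")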
6 changes: 5 additions & 1 deletion optimum/intel/openvino/modeling_base.py
@@ -136,7 +136,11 @@ def __init__(
self.generation_config = generation_config or GenerationConfig.from_model_config(config)

if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
# some model configs raise an error in this check when loaded without parameter initialization
try:
misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
except KeyError:
misplaced_generation_parameters = {}
if len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
6 changes: 5 additions & 1 deletion optimum/intel/openvino/modeling_base_seq2seq.py
@@ -84,7 +84,11 @@ def __init__(
self.generation_config = generation_config or GenerationConfig.from_model_config(config)

if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
# some model configs raise an error in this check when loaded without parameter initialization
try:
misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
except KeyError:
misplaced_generation_parameters = {}
if len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
215 changes: 200 additions & 15 deletions optimum/intel/openvino/modeling_visual_language.py
@@ -15,6 +15,7 @@
from PIL.Image import Image
from transformers import (
AutoConfig,
AutoImageProcessor,
GenerationConfig,
GenerationMixin,
PretrainedConfig,
@@ -24,6 +25,7 @@

from ...exporters.openvino import main_export
from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
from ...exporters.openvino.utils import save_config
from .. import OVQuantizer
from .configuration import OVConfig, OVWeightQuantizationConfig
from .modeling_base import OVBaseModel, OVModelPart
@@ -319,6 +321,13 @@ def compile(self):
if part_model is not None:
part_model._compile()

def _save_config(self, save_directory):
"""
Saves a model configuration into a directory, so that it can be re-loaded using the
[`from_pretrained`] class method.
"""
save_config(self.config, save_directory)

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
Saves the model to the OpenVINO IR format so that it can be re-loaded using the
@@ -728,9 +737,9 @@ def can_generate(self):
@staticmethod
@abstractmethod
def preprocess_inputs(
processor,
text: str,
image: Optional[Image] = None,
processor: Optional[AutoImageProcessor] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
):
"""
@@ -902,15 +911,23 @@ def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values):

@staticmethod
def preprocess_inputs(
processor,
text: str,
image: Optional[Image] = None,
processor: Optional[AutoImageProcessor] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
):
if image is None:
raise ValueError("Image is required.")
chat_template = [{"role": "user", "content": [{"type": "text", "text": text}, {"type": "image"}]}]
prompt = processor.apply_chat_template(chat_template, add_generation_prompt=True)
if processor is None:
raise ValueError("Processor is required.")
if getattr(processor, "chat_template", None) is not None:
chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}]
if image is not None:
chat_prompt[0]["content"].append({"type": "image"})
prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
else:
if image is not None and "<image>" not in text:
prompt = "<image>\n" + text
else:
prompt = text
inputs = processor(images=image, text=prompt, return_tensors="pt")
return inputs
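
For illustration, a hedged call of the reworked signature, mirroring the keyword-argument style adopted in quantization.py below. The class name _OVLlavaForCausalLM is inferred from context, and the checkpoint and image path are arbitrary examples:

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")  # assumed checkpoint
image = Image.open("example.jpg")  # assumed local file
# With a chat template on the processor the prompt goes through apply_chat_template;
# without one, "<image>\n" is prepended manually.
inputs = _OVLlavaForCausalLM.preprocess_inputs(
    text="What is shown in this image?", image=image, processor=processor
)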

@@ -1209,6 +1226,159 @@ def merge_vision_text_embeddings(
input_embeds = input_embeds.reshape(B, N, C)
return input_embeds, attention_mask, position_ids

def preprocess_inputs(
self,
text: str = "",
image: Optional[Image] = None,
processor: Optional[AutoImageProcessor] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
):
if tokenizer is None:
raise ValueError("Tokenizer is required.")
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

IMG_START_TOKEN = "<img>"
IMG_END_TOKEN = "</img>"
IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose(
[
T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD),
]
)
return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float("inf")
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=28, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height

# calculate the existing image aspect ratio
target_ratios = {
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
}
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)

# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images

def load_image(image, input_size=448, max_num=12):
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values

if image is not None:
if "<image>" not in text:
text = "<image>\n" + text
pixel_values = load_image(image, input_size=self.config.vision_config.image_size)
num_patches = pixel_values.shape[0]
num_image_token = int(
(self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2
* (self.config.downsample_ratio**2)
)
image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN
text = text.replace("<image>", image_tokens, 1)
text_inputs = tokenizer(text, return_tensors="pt")
inputs = dict(text_inputs)
inputs.update({"pixel_values": pixel_values})
else:
inputs = tokenizer(text, return_tensors="pt")
return inputs
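
As a worked check of the arithmetic above, with typical InternVL2 values assumed for the config fields (image_size=448, patch_size=14, downsample_ratio=0.5):

# Hedged arithmetic check, assuming image_size=448, patch_size=14, downsample_ratio=0.5.
image_size, patch_size, downsample_ratio = 448, 14, 0.5
num_image_token = int((image_size // patch_size) ** 2 * downsample_ratio**2)
assert num_image_token == 256  # IMG_CONTEXT tokens per 448x448 tile

# A 896x448 image has aspect ratio 2.0, so find_closest_aspect_ratio picks (2, 1):
# dynamic_preprocess yields two 448x448 crops plus a thumbnail -> 3 tiles in total.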

# internvl has an issue with the _get_non_default_generation_parameters check; as a workaround, override _prepare_generation_config
def _prepare_generation_config(
self, generation_config: Optional[GenerationConfig], **kwargs: Dict
) -> Tuple[GenerationConfig, Dict]:
using_model_generation_config = False
if generation_config is None:
if (
self.generation_config._from_model_config # 1)
and self.generation_config._original_object_hash == hash(self.generation_config) # 2)
):
new_generation_config = GenerationConfig.from_model_config(self.config)
if new_generation_config != self.generation_config: # 4)
warnings.warn(
"You have modified the pretrained model configuration to control generation. This is a"
" deprecated strategy to control generation and will be removed in v5."
" Please use and modify the model generation configuration (see"
" https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
UserWarning,
)
self.generation_config = new_generation_config

generation_config = self.generation_config
using_model_generation_config = True

generation_config = copy.deepcopy(generation_config)
model_kwargs = generation_config.update(**kwargs)
# If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model
if not using_model_generation_config:
if generation_config.bos_token_id is None:
generation_config.bos_token_id = self.generation_config.bos_token_id
if generation_config.eos_token_id is None:
generation_config.eos_token_id = self.generation_config.eos_token_id
if generation_config.pad_token_id is None:
generation_config.pad_token_id = self.generation_config.pad_token_id
if generation_config.decoder_start_token_id is None:
generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id

return generation_config, model_kwargs
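
A hedged illustration of the special-token fallback at the end of the override; "model" is assumed to be a loaded InternVL OVModelForVisualCausalLM instance:

from transformers import GenerationConfig

user_config = GenerationConfig(max_new_tokens=32)  # no bos/eos/pad ids set
prepared, model_kwargs = model._prepare_generation_config(user_config)  # "model" assumed
# The user's config is untouched; the prepared deep copy inherits the model's special tokens.
assert prepared.eos_token_id == model.generation_config.eos_token_id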


class _OVMiniCPMVForCausalLM(OVModelForVisualCausalLM):
additional_parts = ["resampler"]
@@ -1430,14 +1600,22 @@ def merge_vision_text_embeddings(

@staticmethod
def preprocess_inputs(
processor,
text: str,
image: Optional[Image] = None,
processor: Optional[AutoImageProcessor] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
):
if image is None:
raise ValueError("Image is required.")
prompt = f"<|im_start|>user\n(<image>./</image>)\n{text}<|im_end|>\n<|im_start|>assistant\n"
if processor is None:
raise ValueError("Processor is required.")
if getattr(processor, "chat_template", None) is not None:
messages = [{"role": "user", "content": text if image is None else "(<image>./</image>)\n" + text}]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
else:
prompt = (
f"<|im_start|>user\n(<image>./</image>)\n{text}<|im_end|>\n<|im_start|>assistant\n"
if image is not None
else text
)
inputs = processor([prompt], [image], return_tensors="pt")
return inputs

@@ -1615,17 +1793,24 @@ def get_multimodal_embeddings(

@staticmethod
def preprocess_inputs(
processor,
text: str,
image: Optional[Image] = None,
processor: Optional[AutoImageProcessor] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
):
if tokenizer is None:
raise ValueError("Tokenizer is required.")
messages = [{"role": "user", "content": f"<image>\n{text}"}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]
input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
if image is not None and processor is None:
raise ValueError("Processor is required.")
text_content = f"<image>\n{text}" if image is not None else text
messages = [{"role": "user", "content": text_content}]
if tokenizer.chat_template is not None:
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
if image is not None:
text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]
input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
else:
input_ids = tokenizer(text, return_tensors="pt").input_ids
attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
result = {"input_ids": input_ids, "attention_mask": attention_mask}
if image is not None:
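The -200 sentinel above deserves a standalone look. A hedged sketch, with "tokenizer" assumed to be the model's PreTrainedTokenizer:

import torch

prompt = "Describe the scene.<image>\nBe brief."  # any prompt containing "<image>"
# Split on the placeholder, tokenize both halves, and splice in sentinel id -200,
# which marks where image features are merged into the text embeddings later.
chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
input_ids = torch.tensor(chunks[0] + [-200] + chunks[1], dtype=torch.long).unsqueeze(0)
attention_mask = torch.ones_like(input_ids, dtype=torch.int64)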
4 changes: 3 additions & 1 deletion optimum/intel/openvino/quantization.py
@@ -784,7 +784,9 @@ def _prepare_visual_causal_lm_dataset(self, config: OVWeightQuantizationConfig):
image = Image.open(requests.get(image_url, stream=True).raw)

try:
inputs = self.model.preprocess_inputs(processor, instruction, image, tokenizer)
inputs = self.model.preprocess_inputs(
text=instruction, image=image, processor=processor, tokenizer=tokenizer
)
except ValueError as value_error:
if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
raise tokenizer_error