Skip to content

Commit f30c5e1

Browse files
committed
fix convert
1 parent 1123376 commit f30c5e1

File tree

1 file changed

+17
-10
lines changed

1 file changed

+17
-10
lines changed

examples/llava/minicpmv-convert-image-encoder-to-gguf.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,7 @@ def get_input_embeddings(self) -> nn.Module:
413413

414414
import numpy as np
415415
from gguf import *
416+
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
416417

417418
TEXT = "clip.text"
418419
VISION = "clip.vision"
@@ -542,6 +543,15 @@ def bytes_to_unicode():
542543
# model = CLIPModel.from_pretrained(dir_model)
543544
# processor = CLIPProcessor.from_pretrained(dir_model)
544545

546+
minicpmv_version = args.minicpmv_version
547+
emb_dim = 4096
548+
if minicpmv_version == 1:
549+
emb_dim = 2304
550+
elif minicpmv_version == 2:
551+
emb_dim = 4096
552+
elif minicpmv_version == 3:
553+
emb_dim = 3584
554+
545555
default_vision_config = {
546556
"hidden_size": 1152,
547557
"image_size": 980,
@@ -552,8 +562,12 @@ def bytes_to_unicode():
552562
"patch_size": 14,
553563
}
554564

555-
vision_config = SiglipVisionConfig(**default_vision_config)
556-
model = SiglipVisionTransformer(vision_config)
565+
if minicpmv_version == 3:
566+
vision_config = Idefics2VisionConfig(**default_vision_config)
567+
model = Idefics2VisionTransformer(vision_config)
568+
elif minicpmv_version == 3:
569+
vision_config = SiglipVisionConfig(**default_vision_config)
570+
model = SiglipVisionTransformer(vision_config)
557571

558572
processor = None
559573
# if model.attn_pool is not None:
@@ -566,14 +580,7 @@ def bytes_to_unicode():
566580
has_text_encoder = True
567581
has_vision_encoder = True
568582
has_minicpmv_projector = False
569-
minicpmv_version = args.minicpmv_version
570-
emb_dim = 4096
571-
if minicpmv_version == 1:
572-
emb_dim = 2304
573-
elif minicpmv_version == 2:
574-
emb_dim = 4096
575-
elif minicpmv_version == 3:
576-
emb_dim = 3584
583+
577584

578585
if args.text_only:
579586
fname_middle = "text-"

0 commit comments

Comments
 (0)