@@ -413,6 +413,7 @@ def get_input_embeddings(self) -> nn.Module:
413
413
414
414
import numpy as np
415
415
from gguf import *
416
+ from transformers .models .idefics2 .modeling_idefics2 import Idefics2VisionTransformer , Idefics2VisionConfig
416
417
417
418
TEXT = "clip.text"
418
419
VISION = "clip.vision"
@@ -542,6 +543,15 @@ def bytes_to_unicode():
542
543
# model = CLIPModel.from_pretrained(dir_model)
543
544
# processor = CLIPProcessor.from_pretrained(dir_model)
544
545
546
# Pick emb_dim from the requested converter version. Versions that are not
# listed in the table fall back to 4096, the same value used for version 2.
minicpmv_version = args.minicpmv_version
emb_dim = {1: 2304, 2: 4096, 3: 3584}.get(minicpmv_version, 4096)
554
+
545
555
default_vision_config = {
546
556
"hidden_size" : 1152 ,
547
557
"image_size" : 980 ,
@@ -552,8 +562,12 @@ def bytes_to_unicode():
552
562
"patch_size" : 14 ,
553
563
}
554
564
555
- vision_config = SiglipVisionConfig (** default_vision_config )
556
- model = SiglipVisionTransformer (vision_config )
565
# Instantiate the vision tower for the requested converter version.
#
# The earlier chain compared minicpmv_version against 3 in BOTH arms, so the
# SigLIP arm was unreachable and every version other than 3 left
# `vision_config` and `model` unbound. The chain below is total: it always
# binds both names.
#
# NOTE(review): the mapping used here (version 3 -> SigLIP, every other
# version -> Idefics2) is an assumption based on the duplicated condition
# sitting on the SigLIP arm; confirm against the checkpoints being converted.
if minicpmv_version == 3:
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
else:
    vision_config = Idefics2VisionConfig(**default_vision_config)
    model = Idefics2VisionTransformer(vision_config)
557
571
558
572
processor = None
559
573
# if model.attn_pool is not None:
@@ -566,14 +580,7 @@ def bytes_to_unicode():
566
580
has_text_encoder = True
567
581
has_vision_encoder = True
568
582
has_minicpmv_projector = False
569
- minicpmv_version = args .minicpmv_version
570
- emb_dim = 4096
571
- if minicpmv_version == 1 :
572
- emb_dim = 2304
573
- elif minicpmv_version == 2 :
574
- emb_dim = 4096
575
- elif minicpmv_version == 3 :
576
- emb_dim = 3584
583
+
577
584
578
585
if args .text_only :
579
586
fname_middle = "text-"
0 commit comments