@@ -75,7 +75,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_DESCRIPTION     "general.description"
 #define KEY_HAS_TEXT_ENC    "clip.has_text_encoder"
 #define KEY_HAS_VIS_ENC     "clip.has_vision_encoder"
-#define KEY_HAS_LLAVA_PROJ  "clip.has_llava_projector"
+#define KEY_HAS_LLAVA_PROJ  "clip.has_minicpmv_projector"
 #define KEY_USE_GELU        "clip.use_gelu"
 #define KEY_N_EMBD          "clip.%s.embedding_length"
 #define KEY_N_FF            "clip.%s.feed_forward_length"
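The string value of KEY_HAS_LLAVA_PROJ is the GGUF metadata key that clip_model_load looks up (see the @@ -1124 hunk below), so converter scripts must write exactly the same string. A minimal sketch of round-tripping the renamed flag, assuming ggml's gguf API; the helper names are hypothetical, but gguf_set_val_bool, gguf_find_key, and gguf_get_val_bool are the calls clip.cpp itself uses:

    #include "ggml.h" // gguf API; illustrative only, not part of this commit

    // Hypothetical converter-side helper: mark a GGUF file as carrying the
    // MiniCPM-V projector under the renamed key.
    static void mark_minicpmv(struct gguf_context * gctx) {
        gguf_set_val_bool(gctx, "clip.has_minicpmv_projector", true);
    }

    // Hypothetical loader-side helper: mirrors the lookup clip_model_load does.
    static bool read_minicpmv_flag(const struct gguf_context * gctx) {
        const int idx = gguf_find_key(gctx, "clip.has_minicpmv_projector");
        return idx != -1 && gguf_get_val_bool(gctx, idx);
    }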
@@ -526,7 +526,7 @@ struct clip_vision_model {
 struct clip_ctx {
     bool has_text_encoder    = false;
     bool has_vision_encoder  = false;
-    bool has_llava_projector = false;
+    bool has_minicpmv_projector = false;

     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -606,7 +606,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

     const int batch_size = imgs->size;

-    if (ctx->has_llava_projector) {
+    if (ctx->has_minicpmv_projector) {
         GGML_ASSERT(batch_size == 1);
     }

@@ -1124,10 +1124,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, s

         idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ);
         if (idx != -1) {
-            new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
+            new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
         }

-        GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+        GGML_ASSERT(new_clip->has_minicpmv_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
         GGML_ASSERT(new_clip->has_vision_encoder);
         GGML_ASSERT(!new_clip->has_text_encoder);

@@ -1137,7 +1137,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, s
         if (verbosity >= 1) {
             LOG_TEE("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
             LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
             LOG_TEE("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
             LOG_TEE("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
         }
@@ -1939,7 +1939,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }

     int batch_size = imgs->size;
-    if (ctx->has_llava_projector) {
+    if (ctx->has_minicpmv_projector) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }

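Both clip_image_build_graph and clip_image_batch_encode assert batch_size == 1 on the projector path, so callers feed one preprocessed image per call. A hedged caller-side sketch; encode_one is a hypothetical wrapper, while clip_image_f32_batch (with data/size fields) and the clip_image_batch_encode signature come from the clip.h in this tree:

    #include "clip.h" // illustrative wrapper, not part of this commit

    // Submit exactly one preprocessed image, matching the
    // GGML_ASSERT(batch_size == 1) enforced in the hunk above.
    static bool encode_one(struct clip_ctx * ctx, struct clip_image_f32 * img, float * out_embd) {
        struct clip_image_f32_batch batch;
        batch.size = 1;
        batch.data = img;
        return clip_image_batch_encode(ctx, /*n_threads=*/4, &batch, out_embd);
    }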