add temp model support

OpenBMB · Oct 16, 2024 · 8da5562 · 8da5562
1 parent f21723a
commit 8da5562
Show file tree

Hide file tree

Showing 4 changed files with 157 additions and 19 deletions.
diff --git a/examples/llava/README-minicpmv-dev.md b/examples/llava/README-minicpmv-dev.md
@@ -6,40 +6,114 @@ Clone llama.cpp:
 ```bash
 git clone git@github.com:OpenBMB/llama.cpp.git
 cd llama.cpp
-git checkout minicpmv-main-dev
+git checkout minicpmv-main
 ```
 
 ### Usage of MiniCPM-V 2.6
 
-Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-dev-gguf) by us)
+Convert the PyTorch model to gguf files (you can also download the [gguf files](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) that we have already converted)
 
 ```bash
-python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-dev
-python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-dev --minicpmv-projector ../MiniCPM-V-dev/minicpmv.projector --output-dir ../MiniCPM-V-dev/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
-```
-
-add 'res = "llama-bpe"' in convert_hf_to_gguf.py 514 line
-```bash
-python ./convert_hf_to_gguf.py ../MiniCPM-V-dev/model
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 5
+python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
 
 # quantize int4 version
-./llama-quantize ../MiniCPM-V-dev/model/ggml-model-f16.gguf ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf Q4_K_M
+./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
 ```
 
 Build for Linux or Mac
 
 ```bash
 make
+make llama-minicpmv-cli
 ```
 
 Inference on Linux or Mac
 ```
 # run f16 version
-./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
 
 # run quantized int4 version
-./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
 
 # or run in interactive mode
-./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
+
+### Video
+Install FFmpeg
+```
+brew install ffmpeg
+brew install pkg-config
+```
+
+### Windows
+Compiling from source on Windows with `make` is somewhat more complicated; using `cmake` is more convenient.
+
+CPU:
+```
+cmake -B build
+cmake --build build --config Release -t llama-minicpmv-cli
+```
+CUDA:
+```
+cmake -B build -DLLAMA_CUDA=ON
+cmake --build build --config Release -t llama-minicpmv-cli
+```
+
+### Android
+
+#### Build on Android device using Termux
+We found that building on the Android device itself gives better runtime performance, so we recommend building on the device.
+
+[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
+
+Install tools in Termux:
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model inside the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
+
+```bash
+mkdir build-android
+cd build-android
+export NDK=/your_ndk_path
+cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+make
+```
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
+
+Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
+
+(This assumes that you have pushed the built executable files to the `/sdcard/llama.cpp/bin` path using `adb push`.)
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download the models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`:
+
+```
+$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
+$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
+```
+
+Now, you can start chatting:
+```
+$cd /data/data/com.termux/files/home/bin
+$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
 ```
diff --git a/examples/llava/README-minicpmv-l.md b/examples/llava/README-minicpmv-l.md
@@ -0,0 +1,45 @@
+## MiniCPM-V dev l
+
+### Prepare models and code
+
+Clone llama.cpp:
+```bash
+git clone git@github.com:OpenBMB/llama.cpp.git
+cd llama.cpp
+git checkout minicpmv-main-dev
+```
+
+### Usage of MiniCPM-V 2.6
+
+Convert the PyTorch model to gguf files (you can also download the [gguf files](https://huggingface.co/openbmb/MiniCPM-V-l-gguf) that we have already converted)
+
+```bash
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-l
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-l --minicpmv-projector ../MiniCPM-V-l/minicpmv.projector --output-dir ../MiniCPM-V-l/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+```
+
+Add `res = "llama-bpe"` at line 514 of `convert_hf_to_gguf.py`:
+```bash
+python ./convert_hf_to_gguf.py ../MiniCPM-V-l/model
+
+# quantize int4 version
+./llama-quantize ../MiniCPM-V-l/model/ggml-model-f16.gguf ../MiniCPM-V-l/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac
+
+```bash
+make
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./llama-minicpmv-cli -m ../MiniCPM-V-l/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-l/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-V-l/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-l/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-V-l/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-l/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
@@ -658,6 +658,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         else if (ctx->minicpmv_version == 4) {
             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
         }
+        else if (ctx->minicpmv_version == 5) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1536, pos_w * pos_h, 1);
+        }
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
@@ -991,6 +994,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                     n_head = hidden_size/d_head;
                     num_query = 96;
                 }
+                else if (ctx->minicpmv_version == 5) {
+                    hidden_size = 1536;
+                    n_head = hidden_size/d_head;
+                    num_query = 64;
+                }
 
                 struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
                 Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -1632,7 +1640,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
     }
 }
 
-inline float clip(float x, float lower, float upper) {
+inline int clip(int x, int lower, int upper) {
     return std::max(lower, std::min(x, upper));
 }
 
@@ -1836,10 +1844,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
     return refine_size;
 }
 
-inline int clip(int x, int lower, int upper) {
-    return std::max(lower, std::min(x, upper));
-}
-
 static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
     std::vector<int> candidate_split_grids_nums;
     for (int i : {multiple - 1, multiple, multiple + 1}) {
@@ -2230,6 +2234,9 @@ int clip_n_patches(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 4) {
             n_patches = 96;
         }
+        else if (ctx->minicpmv_version == 5) {
+            n_patches = 64;
+        }
     }
 
     return n_patches;
@@ -2450,6 +2457,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             else if (ctx->minicpmv_version == 4) {
                 embed_dim = 4096;
             }
+            else if (ctx->minicpmv_version == 5) {
+                embed_dim = 1536;
+            }
             auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
             float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2677,6 +2687,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 4) {
             return 4096;
         }
+        else if (ctx->minicpmv_version == 5) {
+            return 1536;
+        }
     }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];

diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
@@ -547,12 +547,19 @@ def bytes_to_unicode():
 emb_dim = 4096
 if minicpmv_version == 1:
     emb_dim = 2304
+    block_count = 26
 elif minicpmv_version == 2:
     emb_dim = 4096
+    block_count = 27
 elif minicpmv_version == 3:
     emb_dim = 3584
+    block_count = 27
 elif minicpmv_version == 4:
     emb_dim = 4096
+    block_count = 27
+elif minicpmv_version == 5:
+    emb_dim = 1536
+    block_count = 27
 
 default_vision_config = {
         "hidden_size": 1152,
@@ -629,7 +636,6 @@ def bytes_to_unicode():
     fout.add_uint32("clip.vision.projection_dim", 0)
     fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
     fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
-    block_count = 26
     fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
 
     if processor is not None: