From 8da55624bbf1f30d92b153603feaad344d15add8 Mon Sep 17 00:00:00 2001
From: caitianchi
Date: Wed, 16 Oct 2024 21:12:59 +0800
Subject: [PATCH] add temp model support

---
 examples/llava/README-minicpmv-dev.md         | 100 +++++++++++++++---
 examples/llava/README-minicpmv-l.md           |  45 ++++++++
 examples/llava/clip.cpp                       |  23 +++-
 .../minicpmv-convert-image-encoder-to-gguf.py |   8 +-
 4 files changed, 157 insertions(+), 19 deletions(-)
 create mode 100644 examples/llava/README-minicpmv-l.md

diff --git a/examples/llava/README-minicpmv-dev.md b/examples/llava/README-minicpmv-dev.md
index f1d76a48c8433..6628466a98622 100644
--- a/examples/llava/README-minicpmv-dev.md
+++ b/examples/llava/README-minicpmv-dev.md
@@ -6,40 +6,114 @@ Clone llama.cpp:
 ```bash
 git clone git@github.com:OpenBMB/llama.cpp.git
 cd llama.cpp
-git checkout minicpmv-main-dev
+git checkout minicpmv-main
 ```
 
 ### Usage of MiniCPM-V 2.6
 
-Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-dev-gguf) by us)
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
 
 ```bash
-python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-dev
-python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-dev --minicpmv-projector ../MiniCPM-V-dev/minicpmv.projector --output-dir ../MiniCPM-V-dev/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
-```
-
-add 'res = "llama-bpe"' in convert_hf_to_gguf.py 514 line
-```bash
-python ./convert_hf_to_gguf.py ../MiniCPM-V-dev/model
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 5
+python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
 
 # quantize int4 version
-./llama-quantize ../MiniCPM-V-dev/model/ggml-model-f16.gguf ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf Q4_K_M
+./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
 ```
 
 Build for Linux or Mac
 
 ```bash
 make
+make llama-minicpmv-cli
 ```
 
 Inference on Linux or Mac
 ```
 # run f16 version
-./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
 
 # run quantized int4 version
-./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
 
 # or run in interactive mode
-./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
+
+### Video
+Install FFmpeg:
+```
+brew install ffmpeg
+brew install pkg-config
+```
+
+### Windows
+Compiling from source on Windows with `make` is a little more complicated; using `cmake` is more convenient.
+
+CPU:
+```
+cmake -B build
+cmake --build build --config Release -t llama-minicpmv-cli
+```
+CUDA:
+```
+cmake -B build -DLLAMA_CUDA=ON
+cmake --build build --config Release -t llama-minicpmv-cli
+```
+
+### Android
+
+#### Build on Android device using Termux
+We found that building on the Android device gives better runtime performance, so we recommend building on the device.
+
+[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required).
+
+Install tools in Termux:
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model inside the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your mobile device. Alternatively, you can also do this in Termux:
+
+```bash
+mkdir build-android
+cd build-android
+export NDK=/your_ndk_path
+cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+make
+```
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (on Android 11+ run the command twice).
+
+Finally, copy the built `llama` binaries and the model file to your device storage. Because file permissions on the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to make them executable:
+
+(Assuming you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download the models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`
+
+```
+$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
+$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
+```
+
+Now you can start chatting:
+```
+$cd /data/data/com.termux/files/home/bin
+$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
 ```
diff --git a/examples/llava/README-minicpmv-l.md b/examples/llava/README-minicpmv-l.md
new file mode 100644
index 0000000000000..f9e84204ce5a8
--- /dev/null
+++ b/examples/llava/README-minicpmv-l.md
@@ -0,0 +1,45 @@
+## MiniCPM-V dev l
+
+### Prepare models and code
+
+Clone llama.cpp:
+```bash
+git clone git@github.com:OpenBMB/llama.cpp.git
+cd llama.cpp
+git checkout minicpmv-main-dev
+```
+
+### Usage of MiniCPM-V 2.6
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-l-gguf) by us)
+
+```bash
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-l
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-l --minicpmv-projector ../MiniCPM-V-l/minicpmv.projector --output-dir ../MiniCPM-V-l/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+```
+
+Add `res = "llama-bpe"` at line 514 of convert_hf_to_gguf.py, then run:
+```bash
+python ./convert_hf_to_gguf.py ../MiniCPM-V-l/model
+
+# quantize int4 version
+./llama-quantize ../MiniCPM-V-l/model/ggml-model-f16.gguf ../MiniCPM-V-l/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac
+
+```bash
+make
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./llama-minicpmv-cli -m ../MiniCPM-V-l/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-l/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-V-l/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-l/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-V-l/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-l/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index db2d2c5bf47b2..a2673b8c1b3f4 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -658,6 +658,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         else if (ctx->minicpmv_version == 4) {
             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
         }
+        else if (ctx->minicpmv_version == 5) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1536, pos_w * pos_h, 1);
+        }
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
@@ -991,6 +994,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             n_head = hidden_size/d_head;
             num_query = 96;
         }
+        else if (ctx->minicpmv_version == 5) {
+            hidden_size = 1536;
+            n_head = hidden_size/d_head;
+            num_query = 64;
+        }
 
         struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
         Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -1632,7 +1640,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
     }
 }
 
-inline float clip(float x, float lower, float upper) {
+inline int clip(int x, int lower, int upper) {
     return std::max(lower, std::min(x, upper));
 }
 
@@ -1836,10 +1844,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
     return refine_size;
 }
 
-inline int clip(int x, int lower, int upper) {
-    return std::max(lower, std::min(x, upper));
-}
-
 static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
     std::vector<int> candidate_split_grids_nums;
     for (int i : {multiple - 1, multiple, multiple + 1}) {
@@ -2230,6 +2234,9 @@ int clip_n_patches(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 4) {
             n_patches = 96;
         }
+        else if (ctx->minicpmv_version == 5) {
+            n_patches = 64;
+        }
     }
 
     return n_patches;
@@ -2450,6 +2457,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             else if (ctx->minicpmv_version == 4) {
                 embed_dim = 4096;
             }
+            else if (ctx->minicpmv_version == 5) {
+                embed_dim = 1536;
+            }
             auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
             float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2677,6 +2687,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 4) {
             return 4096;
         }
+        else if (ctx->minicpmv_version == 5) {
+            return 1536;
+        }
     }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
index 6e1a07bccdcb8..6043a270e1d41 100644
--- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
@@ -547,12 +547,19 @@ def bytes_to_unicode():
 emb_dim = 4096
 if minicpmv_version == 1:
     emb_dim = 2304
+    block_count = 26
 elif minicpmv_version == 2:
     emb_dim = 4096
+    block_count = 27
 elif minicpmv_version == 3:
     emb_dim = 3584
+    block_count = 27
 elif minicpmv_version == 4:
     emb_dim = 4096
+    block_count = 27
+elif minicpmv_version == 5:
+    emb_dim = 1536
+    block_count = 27
 
 default_vision_config = {
     "hidden_size": 1152,
@@ -629,7 +636,6 @@ def bytes_to_unicode():
 fout.add_uint32("clip.vision.projection_dim", 0)
 fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
 fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
-block_count = 26
 fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
 
 if processor is not None:
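
Note on the constants above: every `clip.cpp` hunk in this patch threads the same two per-version values through a separate `else if` chain, namely the projector embedding width (4096 for version 4, 1536 for the new version 5) and the resampler query count (96 vs 64, which is also what `clip_n_patches` reports). The sketch below is purely illustrative and is not part of the patch; the struct and helper names are invented here, and only the version 4 and 5 values visible in the diff are included.

```cpp
#include <cstdio>

// Illustrative sketch only (not in clip.cpp): collects the per-version
// constants this patch adds for minicpmv_version 5 next to the existing
// version 4 values, so the 1536/64 vs 4096/96 pairing is easy to see.
struct minicpmv_dims {
    int embed_dim; // width used for pos_embed / clip_n_mmproj_embd
    int n_query;   // resampler query count, also returned by clip_n_patches
};

// Hypothetical helper; versions other than 4 and 5 are omitted here.
static minicpmv_dims minicpmv_dims_for_version(int version) {
    switch (version) {
        case 5:  return {1536, 64}; // the new branch introduced by this patch
        case 4:  // fall through
        default: return {4096, 96};
    }
}

int main() {
    const int versions[] = {4, 5};
    for (int v : versions) {
        const minicpmv_dims d = minicpmv_dims_for_version(v);
        std::printf("minicpmv_version %d: embed_dim=%d, n_query=%d\n", v, d.embed_dim, d.n_query);
    }
    return 0;
}
```

If the per-version chains keep growing, centralizing the values in one table like this could keep the call sites from drifting apart; that is only a suggestion for a follow-up, not something this patch does.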