
Commit: load lora
zkh2016 committed Sep 14, 2024
1 parent 2a16030 commit bafd61f
Showing 2 changed files with 29 additions and 4 deletions.
13 changes: 10 additions & 3 deletions convert-lora-to-ggml.py
@@ -60,7 +60,7 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
 output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

 if os.path.exists(input_model):
-    model = torch.load(input_model, map_location="cpu")
+    model = torch.load(input_model, map_location="cpu", weights_only=True)
 else:
     input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
     # lazy import load_file only if lora is in safetensors format.
@@ -75,7 +75,6 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty

 arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
 name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
-
 with open(input_json, "r") as f:
     params = json.load(f)

@@ -103,6 +102,7 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
     write_file_header(fout, params)
     for k, v in model.items():
         orig_k = k
+        print(orig_k)
         if k.endswith(".default.weight"):
             k = k.replace(".default.weight", ".weight")
         if k in ["llama_proj.weight", "llama_proj.bias"]:
@@ -117,18 +117,25 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
         t = v.detach().numpy()

         prefix = "base_model.model."
         if k.startswith(prefix):
             k = k[len(prefix) :]
+        prefix = "llm."
+        if k.startswith(prefix):
+            k = k[len(prefix) :]

-        lora_suffixes = (".lora_A.weight", ".lora_B.weight")
+        lora_suffixes = (".lora_A.weight", ".lora_B.weight", ".lora_A.default.weight", ".lora_B.default.weight")
+        print("k:", k)
         if k.endswith(lora_suffixes):
             suffix = k[-len(lora_suffixes[0]):]
             k = k[: -len(lora_suffixes[0])]
         else:
             print(f"Error: unrecognized tensor name {orig_k}")
             sys.exit(1)
+        print("k:", k)

         tname = name_map.get_name(k)
+        #model.layers.139.self_attn.v_proj'
+        #llm.model.layers.0.self_attn.q_proj
         if tname is None:
             print(f"Error: could not map tensor name {orig_k}")
             print("  Note: the arch parameter must be specified if the model is not llama")
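For orientation, a minimal sketch (not part of the commit) of the name handling these converter changes extend: a PEFT-style tensor name is stripped of the base_model.model. prefix and the newly handled llm. prefix, the .default.weight spelling is folded into .weight, and the .lora_A/.lora_B suffix is split off before the remaining key is looked up with name_map.get_name(). The helper below and its sample input (modeled on the comments in the diff) are hypothetical.

# Hypothetical sketch of the key normalization performed in the conversion loop above.
def normalize_lora_key(k: str) -> tuple[str, str]:
    """Return (base_name, lora_suffix) for a PEFT-style LoRA tensor name."""
    for prefix in ("base_model.model.", "llm."):
        if k.startswith(prefix):
            k = k[len(prefix):]
    if k.endswith(".default.weight"):
        k = k.replace(".default.weight", ".weight")
    for suffix in (".lora_A.weight", ".lora_B.weight"):
        if k.endswith(suffix):
            return k[:-len(suffix)], suffix
    raise ValueError(f"unrecognized tensor name {k}")

# Sample name modeled on the comment in the diff above:
print(normalize_lora_key(
    "base_model.model.llm.model.layers.0.self_attn.q_proj.lora_A.default.weight"))
# -> ('model.layers.0.self_attn.q_proj', '.lora_A.weight')

The base name produced this way is what the script passes to name_map.get_name(k); the converter itself reads the adapter directory given as sys.argv[1] and writes ggml-adapter-model.bin into the same directory.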
20 changes: 19 additions & 1 deletion examples/llava/minicpmv-cli.cpp
@@ -77,12 +77,30 @@ static struct llava_context * llava_init_context(gpt_params * params) {
     }

     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

     if (ctx_llama == NULL) {
         LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
         return NULL;
     }

+    for (unsigned int i = 0; i < params->lora_adapter.size(); ++i) {
+        const std::string & lora_adapter = std::get<0>(params->lora_adapter[i]);
+        float lora_scale = std::get<1>(params->lora_adapter[i]);
+        int err = llama_model_apply_lora_from_file(model,
+                                                   lora_adapter.c_str(),
+                                                   lora_scale,
+                                                   ((i > 0) || params->lora_base.empty())
+                                                       ? NULL
+                                                       : params->lora_base.c_str(),
+                                                   params->n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            // llama_free(lctx);
+            // llama_free_model(model);
+            // return std::make_tuple(nullptr, nullptr);
+        }
+    }
+
     auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

     ctx_llava->ctx_llama = ctx_llama;
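The commit keeps the failure path non-fatal: when llama_model_apply_lora_from_file returns non-zero, the loop only prints an error, and the cleanup lines (apparently carried over from code that owns an lctx and returns a tuple) remain commented out. A stricter variant is sketched below; it is not part of the commit and assumes llava_init_context may simply return NULL, as it already does when the llama_context cannot be created. The helper name apply_lora_adapters is hypothetical.

// Hypothetical helper (not in the commit): apply every adapter from gpt_params
// and report failure so the caller can clean up instead of continuing.
static bool apply_lora_adapters(llama_model * model, gpt_params * params) {
    for (unsigned int i = 0; i < params->lora_adapter.size(); ++i) {
        const std::string & lora_adapter = std::get<0>(params->lora_adapter[i]);
        float lora_scale = std::get<1>(params->lora_adapter[i]);
        int err = llama_model_apply_lora_from_file(model,
                                                   lora_adapter.c_str(),
                                                   lora_scale,
                                                   ((i > 0) || params->lora_base.empty())
                                                       ? NULL
                                                       : params->lora_base.c_str(),
                                                   params->n_threads);
        if (err != 0) {
            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n",
                    __func__, lora_adapter.c_str());
            return false;
        }
    }
    return true;
}

In llava_init_context this would be called as if (!apply_lora_adapters(model, params)) { llama_free(ctx_llama); llama_free_model(model); return NULL; }, mirroring the commented-out cleanup. The converted ggml-adapter-model.bin reaches params->lora_adapter through the common --lora / --lora-scaled options, assuming this tree's argument parser still wires those flags up the way upstream llama.cpp did at the time.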
