llama.cpp updated + pooling_type param
mgonzs13 committed Mar 3, 2024
1 parent bce4565 commit 3f52989
Showing 4 changed files with 27 additions and 8 deletions.
5 changes: 3 additions & 2 deletions llama_bringup/launch/base.launch.py
@@ -41,7 +41,7 @@ def generate_launch_description():
"n_batch": LaunchConfiguration("n_batch", default=8),

"n_gpu_layers": LaunchConfiguration("n_gpu_layers", default=0),
"split_mode": LaunchConfiguration("split_mode", default="none"),
"split_mode": LaunchConfiguration("split_mode", default="layer"),
"main_gpu": LaunchConfiguration("main_gpu", default=0),
"tensor_split": LaunchConfiguration("tensor_split", default="[0.0]"),

@@ -50,7 +50,7 @@

"rope_freq_base": LaunchConfiguration("rope_freq_base", default=0.0),
"rope_freq_scale": LaunchConfiguration("rope_freq_scale", default=0.0),
"rope_scaling_type": LaunchConfiguration("rope_scaling_type", default="none"),
"rope_scaling_type": LaunchConfiguration("rope_scaling_type", default=""),

"yarn_ext_factor": LaunchConfiguration("yarn_ext_factor", default=-1.0),
"yarn_attn_factor": LaunchConfiguration("yarn_attn_factor", default=1.0),
@@ -77,6 +77,7 @@ def generate_launch_description():
"lora_adapter": LaunchConfiguration("lora_adapter", default=""),
"lora_base": LaunchConfiguration("lora_base", default=""),
"numa": LaunchConfiguration("numa", default="none"),
"pooling_type": LaunchConfiguration("pooling_type", default=""),

"prefix": ParameterValue(LaunchConfiguration("prefix", default=""), value_type=str),
"suffix": ParameterValue(LaunchConfiguration("suffix", default=""), value_type=str),
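Usage sketch (not from this commit): the new and changed defaults above are plain LaunchConfiguration defaults, so they can be overridden via launch_arguments when base.launch.py is included from another launch file. The usual share/llama_bringup/launch install path is assumed here.

import os

from ament_index_python.packages import get_package_share_directory
from launch import LaunchDescription
from launch.actions import IncludeLaunchDescription
from launch.launch_description_sources import PythonLaunchDescriptionSource


def generate_launch_description():
    return LaunchDescription([
        IncludeLaunchDescription(
            PythonLaunchDescriptionSource(
                os.path.join(
                    get_package_share_directory("llama_bringup"),
                    "launch", "base.launch.py")),
            # Override the defaults shown above; any argument not passed here
            # keeps the default declared in base.launch.py.
            launch_arguments={
                "split_mode": "layer",
                "pooling_type": "mean",
            }.items(),
        )
    ])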
8 changes: 5 additions & 3 deletions llama_bringup/llama_bringup/utils.py
@@ -62,7 +62,7 @@ def create_llama_launch(
n_batch: int = 8,

n_gpu_layers: int = 0,
- split_mode: str = "none",
+ split_mode: str = "layer",
main_gpu: int = 0,
tensor_split: str = "[0.0]",

@@ -71,7 +71,7 @@

rope_freq_base: float = 0.0,
rope_freq_scale: float = 0.0,
- rope_scaling_type: str = "none",
+ rope_scaling_type: str = "",

yarn_ext_factor: float = -1.0,
yarn_attn_factor: float = 1.0,
@@ -101,6 +101,7 @@
lora_base_filename: str = "",

numa: str = "none",
+ pooling_type: str = "",

prefix: str = "",
suffix: str = "",
@@ -152,7 +153,8 @@

"model": download_model(model_repo, model_filename),
"lora_base": download_model(lora_base_repo, lora_base_filename),
"numa": str(numa),
"numa": numa,
"pooling_type": pooling_type,

"prefix": prefix,
"suffix": suffix,
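Usage sketch (not from this commit): a launch file calling create_llama_launch and passing the new pooling_type keyword through to the node. The model repo and filename below are placeholders, and create_llama_launch is assumed to return a launch action that includes base.launch.py with these values.

from launch import LaunchDescription
from llama_bringup.utils import create_llama_launch


def generate_launch_description():
    return LaunchDescription([
        create_llama_launch(
            model_repo="your-hf-user/your-model-GGUF",    # placeholder repo
            model_filename="your-model.Q4_K_M.gguf",      # placeholder file
            pooling_type="mean",  # new parameter; "" keeps llama.cpp's own choice
        )
    ])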
20 changes: 18 additions & 2 deletions llama_ros/src/llama_node.cpp
@@ -74,6 +74,7 @@ void LlamaNode::load_params(struct gpt_params &params) {
std::string split_mode;
std::string rope_scaling_type;
std::string numa;
+ std::string pooling_type;

std::vector<double> tensor_split;

@@ -98,9 +99,10 @@ void LlamaNode::load_params(struct gpt_params &params) {
{"model", ""},
{"lora_adapter", ""},
{"lora_base", ""},
{"split_mode", "none"},
{"rope_scaling_type", "none"},
{"split_mode", "layer"},
{"rope_scaling_type", ""},
{"numa", "none"},
{"pooling_type", ""},
{"cache_type_k", "f16"},
{"cache_type_v", "f16"},
{"prompt", ""},
@@ -172,6 +174,7 @@ void LlamaNode::load_params(struct gpt_params &params) {
this->get_parameter("lora_adapter", lora_adapter);
this->get_parameter("lora_base", params.lora_base);
this->get_parameter("numa", numa);
this->get_parameter("pooling_type", pooling_type);

this->get_parameter("n_parallel", params.n_parallel);
this->get_parameter("n_sequences", params.n_sequences);
@@ -215,6 +218,8 @@ void LlamaNode::load_params(struct gpt_params &params) {
params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR;
} else if (rope_scaling_type == "yarn") {
params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN;
+ } else {
+ params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}

// numa
@@ -230,6 +235,17 @@ void LlamaNode::load_params(struct gpt_params &params) {
params.numa = GGML_NUMA_STRATEGY_MIRROR;
}

+ // pooling
+ if (pooling_type == "none") {
+ params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+ } else if (pooling_type == "mean") {
+ params.pooling_type = LLAMA_POOLING_TYPE_MEAN;
+ } else if (pooling_type == "cls") {
+ params.pooling_type = LLAMA_POOLING_TYPE_CLS;
+ } else {
+ params.pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED;
+ }

// initial prompt
if (!file_path.empty()) {
std::ifstream file(file_path.c_str());
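With this change the node maps the pooling_type string onto the llama.cpp enum: "none", "mean" and "cls" select the corresponding pooling type, and any other value, including the empty default, falls back to LLAMA_POOLING_TYPE_UNSPECIFIED so llama.cpp decides. A parameter-level sketch (not from this commit), assuming the package and executable are named llama_ros and llama_node; the executable name is not shown in this diff:

from launch import LaunchDescription
from launch_ros.actions import Node


def generate_launch_description():
    return LaunchDescription([
        Node(
            package="llama_ros",
            executable="llama_node",             # assumed executable name
            parameters=[{
                "model": "/path/to/model.gguf",  # placeholder path
                "split_mode": "layer",
                "pooling_type": "mean",          # "none" | "mean" | "cls", else unspecified
            }],
        )
    ])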
