From 3f5298932c038d748f50b1069eef09fa802682fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81ngel=20Gonz=C3=A1lez=20Santamarta?=
Date: Sun, 3 Mar 2024 19:36:15 +0100
Subject: [PATCH] llama.cpp updated + pooling_type param

---
 llama_bringup/launch/base.launch.py  |  5 +++--
 llama_bringup/llama_bringup/utils.py |  8 +++++---
 llama_ros/llama_cpp                  |  2 +-
 llama_ros/src/llama_node.cpp         | 20 ++++++++++++++++++--
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/llama_bringup/launch/base.launch.py b/llama_bringup/launch/base.launch.py
index 4faea2f5..28e389e3 100644
--- a/llama_bringup/launch/base.launch.py
+++ b/llama_bringup/launch/base.launch.py
@@ -41,7 +41,7 @@ def generate_launch_description():
         "n_batch": LaunchConfiguration("n_batch", default=8),
 
         "n_gpu_layers": LaunchConfiguration("n_gpu_layers", default=0),
-        "split_mode": LaunchConfiguration("split_mode", default="none"),
+        "split_mode": LaunchConfiguration("split_mode", default="layer"),
         "main_gpu": LaunchConfiguration("main_gpu", default=0),
         "tensor_split": LaunchConfiguration("tensor_split", default="[0.0]"),
 
@@ -50,7 +50,7 @@
 
         "rope_freq_base": LaunchConfiguration("rope_freq_base", default=0.0),
         "rope_freq_scale": LaunchConfiguration("rope_freq_scale", default=0.0),
-        "rope_scaling_type": LaunchConfiguration("rope_scaling_type", default="none"),
+        "rope_scaling_type": LaunchConfiguration("rope_scaling_type", default=""),
 
         "yarn_ext_factor": LaunchConfiguration("yarn_ext_factor", default=-1.0),
         "yarn_attn_factor": LaunchConfiguration("yarn_attn_factor", default=1.0),
@@ -77,6 +77,7 @@
         "lora_adapter": LaunchConfiguration("lora_adapter", default=""),
         "lora_base": LaunchConfiguration("lora_base", default=""),
         "numa": LaunchConfiguration("numa", default="none"),
+        "pooling_type": LaunchConfiguration("pooling_type", default=""),
 
         "prefix": ParameterValue(LaunchConfiguration("prefix", default=""), value_type=str),
         "suffix": ParameterValue(LaunchConfiguration("suffix", default=""), value_type=str),
diff --git a/llama_bringup/llama_bringup/utils.py b/llama_bringup/llama_bringup/utils.py
index c9cf6764..c8183f81 100644
--- a/llama_bringup/llama_bringup/utils.py
+++ b/llama_bringup/llama_bringup/utils.py
@@ -62,7 +62,7 @@ def create_llama_launch(
     n_batch: int = 8,
 
     n_gpu_layers: int = 0,
-    split_mode: str = "none",
+    split_mode: str = "layer",
     main_gpu: int = 0,
     tensor_split: str = "[0.0]",
 
@@ -71,7 +71,7 @@
 
     rope_freq_base: float = 0.0,
     rope_freq_scale: float = 0.0,
-    rope_scaling_type: str = "none",
+    rope_scaling_type: str = "",
 
     yarn_ext_factor: float = -1.0,
     yarn_attn_factor: float = 1.0,
@@ -101,6 +101,7 @@
     lora_base_filename: str = "",
 
     numa: str = "none",
+    pooling_type: str = "",
 
     prefix: str = "",
     suffix: str = "",
@@ -152,7 +153,8 @@
         "model": download_model(model_repo, model_filename),
         "lora_base": download_model(lora_base_repo, lora_base_filename),
 
-        "numa": str(numa),
+        "numa": numa,
+        "pooling_type": pooling_type,
 
         "prefix": prefix,
         "suffix": suffix,
diff --git a/llama_ros/llama_cpp b/llama_ros/llama_cpp
index bbde6eb2..67be2ce1 160000
--- a/llama_ros/llama_cpp
+++ b/llama_ros/llama_cpp
@@ -1 +1 @@
-Subproject commit bbde6eb2561153aabbdfac5001c690fe00cad639
+Subproject commit 67be2ce1015d070b3b2cd488bcb041eefb61de72
diff --git a/llama_ros/src/llama_node.cpp b/llama_ros/src/llama_node.cpp
index 574fe97d..8413dd7b 100644
--- a/llama_ros/src/llama_node.cpp
+++ b/llama_ros/src/llama_node.cpp
@@ -74,6 +74,7 @@ void LlamaNode::load_params(struct gpt_params &params) {
   std::string split_mode;
   std::string rope_scaling_type;
   std::string numa;
+  std::string pooling_type;
 
   std::vector<double> tensor_split;
 
@@ -98,9 +99,10 @@ void LlamaNode::load_params(struct gpt_params &params) {
       {"model", ""},
       {"lora_adapter", ""},
       {"lora_base", ""},
-      {"split_mode", "none"},
-      {"rope_scaling_type", "none"},
+      {"split_mode", "layer"},
+      {"rope_scaling_type", ""},
       {"numa", "none"},
+      {"pooling_type", ""},
       {"cache_type_k", "f16"},
       {"cache_type_v", "f16"},
       {"prompt", ""},
@@ -172,6 +174,7 @@ void LlamaNode::load_params(struct gpt_params &params) {
   this->get_parameter("lora_adapter", lora_adapter);
   this->get_parameter("lora_base", params.lora_base);
   this->get_parameter("numa", numa);
+  this->get_parameter("pooling_type", pooling_type);
 
   this->get_parameter("n_parallel", params.n_parallel);
   this->get_parameter("n_sequences", params.n_sequences);
@@ -215,6 +218,8 @@ void LlamaNode::load_params(struct gpt_params &params) {
     params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR;
   } else if (rope_scaling_type == "yarn") {
     params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN;
+  } else {
+    params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
   }
 
   // numa
@@ -230,6 +235,17 @@ void LlamaNode::load_params(struct gpt_params &params) {
     params.numa = GGML_NUMA_STRATEGY_MIRROR;
   }
 
+  // pooling
+  if (pooling_type == "none") {
+    params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+  } else if (pooling_type == "mean") {
+    params.pooling_type = LLAMA_POOLING_TYPE_MEAN;
+  } else if (pooling_type == "cls") {
+    params.pooling_type = LLAMA_POOLING_TYPE_CLS;
+  } else {
+    params.pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED;
+  }
+
   // initial prompt
   if (!file_path.empty()) {
     std::ifstream file(file_path.c_str());
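
Usage sketch (not part of the patch): a minimal launch file setting the new
pooling_type parameter, assuming create_llama_launch() returns a launch action
that can be placed directly in a LaunchDescription, as llama_bringup's own
launch files suggest. The model repo and filename below are placeholders.

    from launch import LaunchDescription
    from llama_bringup.utils import create_llama_launch


    def generate_launch_description():
        return LaunchDescription([
            create_llama_launch(
                # placeholder GGUF model; any repo/file pair served by
                # download_model() would work here
                model_repo="some-org/some-model-GGUF",
                model_filename="some-model.Q4_K_M.gguf",
                # "none", "mean" or "cls"; any other value, including the
                # default "", falls through to LLAMA_POOLING_TYPE_UNSPECIFIED
                # in llama_node.cpp above
                pooling_type="mean",
            )
        ])

Leaving pooling_type at its default ("") selects LLAMA_POOLING_TYPE_UNSPECIFIED,
which defers the pooling choice to llama.cpp.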