
Commit bce4565

llama.cpp updated
mul_mat_q removed; GGML_USE_CUBLAS removed
1 parent 3abe0b3 commit bce4565

File tree

6 files changed: +3 -14 lines changed


README.md

Lines changed: 1 addition & 2 deletions
````diff
@@ -29,11 +29,10 @@ $ colcon build
 
 ### CUDA
 
-To run llama_ros with CUDA, you have to install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) and the following lines in the [CMakeLists.txt](llama_ros/CMakeLists.txt) must be uncommented:
+To run llama_ros with CUDA, you have to install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) and the following line in the [CMakeLists.txt](llama_ros/CMakeLists.txt) must be uncommented:
 
 ```
 option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
-add_compile_definitions(GGML_USE_CUBLAS)
 ```
 
 ## Usage
````

llama_bringup/launch/base.launch.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -58,7 +58,6 @@ def generate_launch_description():
         "yarn_beta_slow": LaunchConfiguration("yarn_beta_slow", default=1.0),
         "yarn_orig_ctx": LaunchConfiguration("yarn_orig_ctx", default=0),
 
-        "mul_mat_q": LaunchConfiguration("mul_mat_q", default=True),
         "embedding": LaunchConfiguration("embedding", default=True),
         "logits_all": LaunchConfiguration("logits_all", default=False),
         "use_mmap": LaunchConfiguration("use_mmap", default=True),
```

llama_bringup/llama_bringup/utils.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -80,7 +80,6 @@ def create_llama_launch(
     yarn_orig_ctx: float = 0,
 
     embedding: bool = True,
-    mul_mat_q: bool = True,
     logits_all: bool = False,
     use_mmap: bool = True,
     use_mlock: bool = False,
@@ -136,7 +135,6 @@ def create_llama_launch(
         "yarn_orig_ctx": str(yarn_orig_ctx),
         "rope_scaling_type": str(rope_scaling_type),
 
-        "mul_mat_q": str(mul_mat_q),
         "embedding": str(embedding),
         "logits_all": str(logits_all),
         "use_mmap": str(use_mmap),
```

llama_ros/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
```diff
@@ -7,7 +7,6 @@ endif()
 
 # cuBLAS
 # option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
-# add_compile_definitions(GGML_USE_CUBLAS)
 
 # find dependencies
 find_package(ament_cmake REQUIRED)
```

llama_ros/llama_cpp (submodule pointer updated to the newer llama.cpp commit; no content diff shown)

llama_ros/src/llama_node.cpp

Lines changed: 1 addition & 7 deletions
```diff
@@ -121,7 +121,6 @@ void LlamaNode::load_params(struct gpt_params &params) {
                                   std::vector<double>({0.0}));
   this->declare_parameters<bool>("", {
                                          {"debug", true},
-                                         {"mul_mat_q", true},
                                          {"embedding", true},
                                          {"logits_all", false},
                                          {"use_mmap", true},
@@ -140,7 +139,6 @@ void LlamaNode::load_params(struct gpt_params &params) {
   this->get_parameter("main_gpu", params.main_gpu);
   this->get_parameter("tensor_split", tensor_split);
 
-  this->get_parameter("mul_mat_q", params.mul_mat_q);
   this->get_parameter("embedding", params.embedding);
   this->get_parameter("logits_all", params.logits_all);
   this->get_parameter("use_mmap", params.use_mmap);
@@ -243,8 +241,7 @@ void LlamaNode::load_params(struct gpt_params &params) {
                  std::istreambuf_iterator<char>(), back_inserter(params.prompt));
   }
 
-  // cublas
-#ifdef GGML_USE_CUBLAS
+  // split tensors
   GGML_ASSERT(tensor_split.size() <= llama_max_devices());
   for (size_t i = 0; i < llama_max_devices(); ++i) {
     if (i < tensor_split.size()) {
@@ -253,9 +250,6 @@ void LlamaNode::load_params(struct gpt_params &params) {
       params.tensor_split[i] = 0.0f;
     }
   }
-
-  params.mul_mat_q = false;
-#endif
 }
 
 void LlamaNode::tokenize_service_callback(
```
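As an illustration only, the tensor-split handling that this hunk makes unconditional (it previously ran only under GGML_USE_CUBLAS) can be sketched in Python; `max_devices` here stands in for `llama_max_devices()`:

```python
# Sketch only: a Python rendering of the tensor-split loop shown in the C++ hunk
# above, which now runs unconditionally instead of behind GGML_USE_CUBLAS.
def fill_tensor_split(tensor_split: list[float], max_devices: int) -> list[float]:
    # Mirrors the GGML_ASSERT in the C++ code.
    assert len(tensor_split) <= max_devices
    # Copy the user-provided per-device fractions, zero-filling the remaining slots.
    return [
        tensor_split[i] if i < len(tensor_split) else 0.0
        for i in range(max_devices)
    ]


# Example: split the model 60/40 across two of four visible devices.
print(fill_tensor_split([0.6, 0.4], 4))  # [0.6, 0.4, 0.0, 0.0]
```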
