From d9ac6ecc95a8acd739dadbb2e465f2d6eac81d7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 10 May 2024 01:03:10 +0200 Subject: [PATCH 1/7] Support multiple chat templates - step 2 --- llama_cpp/llama.py | 3 +++ llama_cpp/llama_chat_format.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4212669eb..2e4172684 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1689,6 +1689,7 @@ def create_chat_completion( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + template_kwargs: Optional[Dict[str, Any]] = None, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -1721,6 +1722,7 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. + template_kwargs: Optional dictionary of arguments to pass to chat template. Returns: Generated chat completion or a stream of chat completion chunks. @@ -1758,6 +1760,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + template_kwargs=template_kwargs, ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 84de989a5..99aa7819c 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -205,11 +205,15 @@ def __call__( function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + template_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> ChatFormatterResponse: def raise_exception(message: str): raise ValueError(message) + if template_kwargs is None: + template_kwargs = {} + prompt = self._environment.render( messages=messages, eos_token=self.eos_token, @@ -220,6 +224,7 @@ def raise_exception(message: str): function_call=function_call, tools=tools, tool_choice=tool_choice, + **template_kwargs, ) stopping_criteria = None @@ -535,6 +540,7 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + template_kwargs: Optional[Dict[str, Any]] = None, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -546,6 +552,7 @@ def chat_completion_handler( function_call=function_call, tools=tools, tool_choice=tool_choice, + template_kwargs=template_kwargs, ) prompt = result.prompt if result.stop is not None: From f5e4c6ba9c9ac9b84f8fba45be3687b79173b2e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 10 May 2024 03:15:08 +0200 Subject: [PATCH 2/7] add usage examples --- README.md | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5f3ff6824..3354681d2 100644 --- a/README.md +++ b/README.md @@ -350,11 +350,14 @@ The model will will format the messages into a single prompt using the following Set `verbose=True` to see the selected chat format. +#### Default chat format + +The following example will use the default chat template as specified in GGUF (or fallback to llama-2). 
+ ```python from llama_cpp import Llama llm = Llama( model_path="path/to/llama-2/llama-model.gguf", - chat_format="llama-2" ) llm.create_chat_completion( messages = [ @@ -367,6 +370,59 @@ llm.create_chat_completion( ) ``` +#### ChatML chat format + +The following example will use the ChatML chat format regardless of GGUF metadata. + +```python +from llama_cpp import Llama +llm = Llama( + model_path="path/to/dolphin/dolphin-model.gguf", + chat_format="chatml" +) +llm.create_chat_completion( + messages = [ + {"role": "system", "content": "You are an uncensored and unbiased AI assistant."}, + { + "role": "user", + "content": "Please give ideas and a detailed plan about how to assemble and train an army of dolphin companions." + } + ] +) +``` + +#### RAG chat format (if present in GGUF metadata) + +The following example will use the RAG chat template as specified in GGUF (only Command R has this for now) and pass on documents. + +```python +from llama_cpp import Llama +llm = Llama( + model_path="path/to/command-r/c4ai-model.gguf", + chat_format="chat_template.rag" +) +llm.create_chat_completion( + messages = [ + { + "role": "user", + "content": "Write a short summary of each document please." + } + ], + template_kwargs = { + "documents": [ + { + "title": "First document", + "content": "...", + }, + { + "title": "Second document", + "content": "...", + } + ] + } +) +``` + Chat completion is available through the [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion) method of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class. For OpenAI API v1 compatibility, you use the [`create_chat_completion_openai_v1`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion_openai_v1) method which will return pydantic models instead of dicts. @@ -427,6 +483,56 @@ llm.create_chat_completion( ### Function Calling +#### Basic function calling through chat template (if supported) + +The following example will use the Tool Use chat template as specified in GGUF (only Command R has this for now). +Many other models could support this if the chat templates were added, while others like [this](https://huggingface.co/CISCai/gorilla-openfunctions-v2-SOTA-GGUF) one supports it with its default template. + +```python +from llama_cpp import Llama +llm = Llama( + model_path="path/to/command-r/c4ai-model.gguf", + chat_format="chat_template.tool_use" +) +llm.create_chat_completion( + messages = [ + { + "role": "user", + "content": "What's the weather like in Oslo?" + } + ], + tools = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": [ "celsius", "fahrenheit" ] + } + }, + "required": [ "location" ] + } + } + }], + tool_choice = { + "type": "function", + "function": { + "name": "get_current_weather" + } + } +) +``` + +#### Built in function calling + The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format. 
```python From aa87f55bb638bfaf20a44ce7542329c18c380f5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 23 May 2024 04:02:25 +0200 Subject: [PATCH 3/7] new model and function calling example --- README.md | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/README.md b/README.md index 3354681d2..8ee176b97 100644 --- a/README.md +++ b/README.md @@ -531,6 +531,63 @@ llm.create_chat_completion( ) ``` +Another model that supports function calling with its default template is [this](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF) one, with full tool_calls support: + +```python +from llama_cpp import Llama +llm = Llama( + model_path="path/to/mistral/mistral-v3-model.gguf" +) +llm.create_chat_completion( + messages = [ + { + "role": "user", + "content": "What's the weather like in Oslo?" + }, + { # The tool_calls is from the response to the above with tool_choice specified + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call__0_get_current_weather_cmpl-...", + "type": "function", + "function": { + "name": "get_current_weather", + "arguments": '{ "location": "Oslo, NO" ,"unit": "celsius"} ' + } + } + ] + }, + { # The tool_call_id is from tool_calls and content is the result from the function call you made + "role": "tool", + "content": 20, + "tool_call_id": "call__0_get_current_weather_cmpl-..." + } + ], + tools=[{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": [ "celsius", "fahrenheit" ] + } + }, + "required": [ "location" ] + } + } + }] +) +``` + #### Built in function calling The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format. From 9e9cf101101c9a31527df88bffc273275ec5622e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 25 May 2024 21:51:06 +0200 Subject: [PATCH 4/7] added grammar example --- README.md | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8ee176b97..23aca0466 100644 --- a/README.md +++ b/README.md @@ -486,7 +486,7 @@ llm.create_chat_completion( #### Basic function calling through chat template (if supported) The following example will use the Tool Use chat template as specified in GGUF (only Command R has this for now). -Many other models could support this if the chat templates were added, while others like [this](https://huggingface.co/CISCai/gorilla-openfunctions-v2-SOTA-GGUF) one supports it with its default template. +Many other models could support this if the chat templates were added, while others like [this](https://huggingface.co/CISCai/gorilla-openfunctions-v2-SOTA-GGUF) one and [this](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF) one supports it with its default template. 
```python from llama_cpp import Llama @@ -531,7 +531,63 @@ llm.create_chat_completion( ) ``` -Another model that supports function calling with its default template is [this](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF) one, with full tool_calls support: +If you need to do more advanced parsing of the tool response, f.ex. if you expect multiple/parallel tool calls try using `grammar` instead of `tool_choice`: + +```python +from llama_cpp import Llama +from llama_cpp.llama_grammar import LlamaGrammar +import json +llm = Llama( + model_path="path/to/gorilla/openfunctions-v2-model.gguf" +) +response = llm.create_chat_completion( + messages = [ + { + "role": "user", + "content": "What's the weather like in Oslo and Stockholm?" + } + ], + tools = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": [ "celsius", "fahrenheit" ] + } + }, + "required": [ "location" ] + } + } + }], + grammar = LlamaGrammar.from_json_schema(json.dumps({ + "type": "array", + "items": { + "type": "object", + "required": [ "name", "arguments" ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "object" + } + } + } + })) +) +json.loads(response["choices"][0]["text"]) +``` + +Here's an example of a full function calling round-trip: ```python from llama_cpp import Llama From 97f028cead6db10db553d8b8e94714a5acee3cc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 26 May 2024 12:51:04 +0200 Subject: [PATCH 5/7] content should be string --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 23aca0466..39302787c 100644 --- a/README.md +++ b/README.md @@ -616,7 +616,7 @@ llm.create_chat_completion( }, { # The tool_call_id is from tool_calls and content is the result from the function call you made "role": "tool", - "content": 20, + "content": "20", "tool_call_id": "call__0_get_current_weather_cmpl-..." } ], From d320b2725d53184f4e8bd3a1a659710cf51daba4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 12 Jun 2024 09:00:06 +0200 Subject: [PATCH 6/7] add documents --- llama_cpp/llama.py | 3 +++ llama_cpp/llama_chat_format.py | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2e4172684..1d6a2f7f8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1689,6 +1689,7 @@ def create_chat_completion( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + documents: Optional[List[Dict[str, str]]] = None, template_kwargs: Optional[Dict[str, Any]] = None, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] @@ -1722,6 +1723,7 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. + documents: A list of documents to use for the chat completion. template_kwargs: Optional dictionary of arguments to pass to chat template. 
Returns: @@ -1760,6 +1762,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + documents=documents, template_kwargs=template_kwargs, ) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 99aa7819c..6e8be9c5a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -205,6 +205,7 @@ def __call__( function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + documents: Optional[List[Dict[str, str]]] = None, template_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> ChatFormatterResponse: @@ -224,6 +225,7 @@ def raise_exception(message: str): function_call=function_call, tools=tools, tool_choice=tool_choice, + documents=documents, **template_kwargs, ) @@ -540,6 +542,7 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + documents: Optional[List[Dict[str, str]]] = None, template_kwargs: Optional[Dict[str, Any]] = None, **kwargs, # type: ignore ) -> Union[ @@ -552,6 +555,7 @@ def chat_completion_handler( function_call=function_call, tools=tools, tool_choice=tool_choice, + documents=documents, template_kwargs=template_kwargs, ) prompt = result.prompt From 6e18f5b459b94ab21650951b55edd619332ebcb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 12 Jun 2024 09:11:26 +0200 Subject: [PATCH 7/7] update example --- README.md | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 39302787c..dd2ac9f6e 100644 --- a/README.md +++ b/README.md @@ -408,18 +408,16 @@ llm.create_chat_completion( "content": "Write a short summary of each document please." } ], - template_kwargs = { - "documents": [ - { - "title": "First document", - "content": "...", - }, - { - "title": "Second document", - "content": "...", - } - ] - } + documents = [ + { + "title": "First document", + "content": "...", + }, + { + "title": "Second document", + "content": "...", + } + ] ) ```
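The `template_kwargs` parameter introduced in [PATCH 1/7] is forwarded as extra keyword arguments into the Jinja2 chat template render, so after [PATCH 7/7] moves `documents` to a dedicated parameter, `template_kwargs` remains useful for any other custom variable a model's GGUF chat template defines. A minimal sketch of that usage (the model path is a placeholder and `enable_thinking` is a hypothetical template variable; it only affects the prompt if the model's template actually references it):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="path/to/your/model.gguf"  # placeholder path
)
llm.create_chat_completion(
    messages = [
        {"role": "user", "content": "Hello!"}
    ],
    # Every key in template_kwargs is passed straight through to the
    # Jinja2 chat template as a variable; keys the template does not
    # reference are simply ignored.
    template_kwargs = {
        "enable_thinking": False  # hypothetical template variable
    }
)
```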