
Support multiple chat templates - step 2 #1440

Open · wants to merge 7 commits into `main`
README.md (218 additions & 1 deletion)
@@ -350,11 +350,14 @@ The model will format the messages into a single prompt using the following

Set `verbose=True` to see the selected chat format.

#### Default chat format

The following example will use the default chat template as specified in the GGUF metadata (falling back to llama-2 if none is present).

```python
from llama_cpp import Llama
llm = Llama(
model_path="path/to/llama-2/llama-model.gguf",
chat_format="llama-2"
)
llm.create_chat_completion(
messages = [
          {"role": "system", "content": "You are an assistant who perfectly describes images."},
          {
              "role": "user",
              "content": "Describe this image in detail please."
          }
      ]
)
```

#### ChatML chat format

The following example will use the ChatML chat format regardless of GGUF metadata.

```python
from llama_cpp import Llama
llm = Llama(
model_path="path/to/dolphin/dolphin-model.gguf",
chat_format="chatml"
)
llm.create_chat_completion(
messages = [
{"role": "system", "content": "You are an uncensored and unbiased AI assistant."},
{
"role": "user",
"content": "Please give ideas and a detailed plan about how to assemble and train an army of dolphin companions."
}
]
)
```

#### RAG chat format (if present in GGUF metadata)

The following example will use the RAG chat template as specified in the GGUF metadata (only Command R has this for now) and pass documents to it.

```python
from llama_cpp import Llama
llm = Llama(
model_path="path/to/command-r/c4ai-model.gguf",
chat_format="chat_template.rag"
)
llm.create_chat_completion(
messages = [
{
"role": "user",
"content": "Write a short summary of each document please."
}
],
documents = [
{
"title": "First document",
"content": "...",
},
{
"title": "Second document",
"content": "...",
}
]
)
```
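
This PR also adds a `template_kwargs` parameter for forwarding extra variables to the chat template (see the `llama.py` changes below). A minimal sketch, assuming the RAG template accepts a `citation_mode` variable (an assumption, not verified against the Command R template):

```python
from llama_cpp import Llama
llm = Llama(
      model_path="path/to/command-r/c4ai-model.gguf",
      chat_format="chat_template.rag"
)
llm.create_chat_completion(
      messages = [
          {
              "role": "user",
              "content": "Write a short summary of each document please."
          }
      ],
      documents = [
          {"title": "First document", "content": "..."},
          {"title": "Second document", "content": "..."}
      ],
      # Extra variables are passed straight through to the Jinja template;
      # "citation_mode" is assumed to be a variable the RAG template accepts.
      template_kwargs = {"citation_mode": "fast"}
)
```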

Chat completion is available through the [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion) method of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class.

For OpenAI API v1 compatibility, use the [`create_chat_completion_openai_v1`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion_openai_v1) method, which returns pydantic models instead of dicts.
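
A minimal sketch (this path may require the optional `openai` package for the response types; fields are then read as attributes rather than dict keys):

```python
from llama_cpp import Llama
llm = Llama(
      model_path="path/to/llama-2/llama-model.gguf"
)
response = llm.create_chat_completion_openai_v1(
      messages = [
          {"role": "user", "content": "Name the planets in the solar system."}
      ]
)
# Pydantic model, so attribute access instead of dict lookups
print(response.choices[0].message.content)
```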
@@ -427,6 +481,169 @@ llm.create_chat_completion(

### Function Calling

#### Basic function calling through the chat template (if supported)

The following example will use the Tool Use chat template as specified in the GGUF metadata (only Command R has this for now).
Many other models could support this if the chat templates were added, while others, like [this one](https://huggingface.co/CISCai/gorilla-openfunctions-v2-SOTA-GGUF) and [this one](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF), already support it with their default template.

```python
from llama_cpp import Llama
llm = Llama(
model_path="path/to/command-r/c4ai-model.gguf",
chat_format="chat_template.tool_use"
)
llm.create_chat_completion(
messages = [
{
"role": "user",
"content": "What's the weather like in Oslo?"
}
],
tools = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": [ "celsius", "fahrenheit" ]
}
},
"required": [ "location" ]
}
}
}],
tool_choice = {
"type": "function",
"function": {
"name": "get_current_weather"
}
}
)
```
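
With `tool_choice` set, the generated arguments come back as a tool call on the response message rather than as plain text. A short sketch of reading them, assuming the call above is assigned to `response` (the response shape is inferred from the round-trip example further below):

```python
import json

# `response` is the return value of the create_chat_completion call above
tool_call = response["choices"][0]["message"]["tool_calls"][0]
arguments = json.loads(tool_call["function"]["arguments"])
print(tool_call["function"]["name"], arguments)  # e.g. get_current_weather {'location': 'Oslo, NO', 'unit': 'celsius'}
```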

If you need to do more advanced parsing of the tool response, e.g. if you expect multiple/parallel tool calls, try using `grammar` instead of `tool_choice`:

```python
from llama_cpp import Llama
from llama_cpp.llama_grammar import LlamaGrammar
import json
llm = Llama(
model_path="path/to/gorilla/openfunctions-v2-model.gguf"
)
response = llm.create_chat_completion(
messages = [
{
"role": "user",
"content": "What's the weather like in Oslo and Stockholm?"
}
],
tools = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": [ "celsius", "fahrenheit" ]
}
},
"required": [ "location" ]
}
}
}],
grammar = LlamaGrammar.from_json_schema(json.dumps({
"type": "array",
"items": {
"type": "object",
"required": [ "name", "arguments" ],
"properties": {
"name": {
"type": "string"
},
"arguments": {
"type": "object"
}
}
}
}))
)
json.loads(response["choices"][0]["message"]["content"])  # the grammar-constrained JSON is returned as regular message content
```

Here's an example of a full function calling round-trip:

```python
from llama_cpp import Llama
llm = Llama(
model_path="path/to/mistral/mistral-v3-model.gguf"
)
llm.create_chat_completion(
messages = [
{
"role": "user",
"content": "What's the weather like in Oslo?"
},
{ # The tool_calls below comes from the response to the request above, made with tool_choice specified
"role": "assistant",
"content": None,
"tool_calls": [
{
"id": "call__0_get_current_weather_cmpl-...",
"type": "function",
"function": {
"name": "get_current_weather",
"arguments": '{ "location": "Oslo, NO" ,"unit": "celsius"} '
}
}
]
},
{ # The tool_call_id matches the id in tool_calls above; content is the result of the function call you made
"role": "tool",
"content": "20",
"tool_call_id": "call__0_get_current_weather_cmpl-..."
}
],
tools=[{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": [ "celsius", "fahrenheit" ]
}
},
"required": [ "location" ]
}
}
}]
)
```
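
The final response is then an ordinary assistant message, read the usual way (assuming the call above is assigned to `response`):

```python
final_message = response["choices"][0]["message"]
print(final_message["content"])  # e.g. a sentence stating that it is 20 degrees in Oslo
```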

#### Built-in function calling

The high-level API supports OpenAI-compatible function and tool calling. This is possible either through the pre-trained `functionary` models' chat format or through the generic `chatml-function-calling` chat format.
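
A minimal sketch of the generic `chatml-function-calling` format, reusing the weather tool from the earlier examples (the model path is illustrative):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="path/to/chatml/llama-model.gguf",  # any chat model works with the generic format
    chat_format="chatml-function-calling"
)
response = llm.create_chat_completion(
    messages = [
        {"role": "user", "content": "What's the weather like in Oslo?"}
    ],
    tools = [{
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                },
                "required": ["location"]
            }
        }
    }],
    tool_choice = {"type": "function", "function": {"name": "get_current_weather"}}
)
print(response["choices"][0]["message"]["tool_calls"])
```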

llama_cpp/llama.py (6 additions & 0 deletions)
@@ -1689,6 +1689,8 @@ def create_chat_completion(
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
documents: Optional[List[Dict[str, str]]] = None,
template_kwargs: Optional[Dict[str, Any]] = None,
) -> Union[
CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
]:
@@ -1721,6 +1723,8 @@
logits_processor: A list of logits processors to use.
grammar: A grammar to use.
logit_bias: A logit bias to use.
documents: A list of documents to use for the chat completion.
template_kwargs: Optional dictionary of arguments to pass to chat template.

Returns:
Generated chat completion or a stream of chat completion chunks.
@@ -1758,6 +1762,8 @@
logits_processor=logits_processor,
grammar=grammar,
logit_bias=logit_bias,
documents=documents,
template_kwargs=template_kwargs,
)

def create_chat_completion_openai_v1(
llama_cpp/llama_chat_format.py (11 additions & 0 deletions)
@@ -205,11 +205,16 @@ def __call__(
function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
tools: Optional[List[llama_types.ChatCompletionTool]] = None,
tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
documents: Optional[List[Dict[str, str]]] = None,
template_kwargs: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> ChatFormatterResponse:
def raise_exception(message: str):
raise ValueError(message)

if template_kwargs is None:
template_kwargs = {}

prompt = self._environment.render(
messages=messages,
eos_token=self.eos_token,
@@ -220,6 +225,8 @@ def raise_exception(message: str):
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
documents=documents,
**template_kwargs,
)

stopping_criteria = None
@@ -535,6 +542,8 @@ def chat_completion_handler(
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
documents: Optional[List[Dict[str, str]]] = None,
template_kwargs: Optional[Dict[str, Any]] = None,
**kwargs, # type: ignore
) -> Union[
llama_types.CreateChatCompletionResponse,
Expand All @@ -546,6 +555,8 @@ def chat_completion_handler(
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
documents=documents,
template_kwargs=template_kwargs,
)
prompt = result.prompt
if result.stop is not None:
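
For reference, a minimal sketch of how the new `template_kwargs` argument flows through `Jinja2ChatFormatter`: everything in the dictionary is forwarded to the Jinja template as extra variables. The template string and the `prefix` variable below are purely illustrative:

```python
from llama_cpp.llama_chat_format import Jinja2ChatFormatter

# Toy template: prints each message on its own line, prefixed by a custom
# variable supplied through template_kwargs.
template = (
    "{% for message in messages %}"
    "{{ prefix }}{{ message['role'] }}: {{ message['content'] }}\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ prefix }}assistant:{% endif %}"
)

formatter = Jinja2ChatFormatter(
    template=template,
    eos_token="</s>",
    bos_token="<s>"
)

result = formatter(
    messages=[{"role": "user", "content": "hello"}],
    template_kwargs={"prefix": ">> "}  # forwarded verbatim to the template
)
print(result.prompt)
# >> user: hello
# >> assistant:
```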