From d9ac6ecc95a8acd739dadbb2e465f2d6eac81d7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 10 May 2024 01:03:10 +0200 Subject: [PATCH 1/7] Support multiple chat templates - step 2 --- llama_cpp/llama.py | 3 +++ llama_cpp/llama_chat_format.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4212669eb..2e4172684 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1689,6 +1689,7 @@ def create_chat_completion( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + template_kwargs: Optional[Dict[str, Any]] = None, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -1721,6 +1722,7 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. + template_kwargs: Optional dictionary of arguments to pass to chat template. Returns: Generated chat completion or a stream of chat completion chunks. @@ -1758,6 +1760,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + template_kwargs=template_kwargs, ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 84de989a5..99aa7819c 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -205,11 +205,15 @@ def __call__( function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + template_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> ChatFormatterResponse: def raise_exception(message: str): raise ValueError(message) + if template_kwargs is None: + template_kwargs = {} + prompt = self._environment.render( messages=messages, eos_token=self.eos_token, @@ -220,6 +224,7 @@ def raise_exception(message: str): function_call=function_call, tools=tools, tool_choice=tool_choice, + **template_kwargs, ) stopping_criteria = None @@ -535,6 +540,7 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + template_kwargs: Optional[Dict[str, Any]] = None, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -546,6 +552,7 @@ def chat_completion_handler( function_call=function_call, tools=tools, tool_choice=tool_choice, + template_kwargs=template_kwargs, ) prompt = result.prompt if result.stop is not None: From f5e4c6ba9c9ac9b84f8fba45be3687b79173b2e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 10 May 2024 03:15:08 +0200 Subject: [PATCH 2/7] add usage examples --- README.md | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5f3ff6824..3354681d2 100644 --- a/README.md +++ b/README.md @@ -350,11 +350,14 @@ The model will will format the messages into a single prompt using the following Set `verbose=True` to see the selected chat format. +#### Default chat format + +The following example will use the default chat template as specified in GGUF (or fallback to llama-2). 
+ ```python from llama_cpp import Llama llm = Llama( model_path="path/to/llama-2/llama-model.gguf", - chat_format="llama-2" ) llm.create_chat_completion( messages = [ @@ -367,6 +370,59 @@ llm.create_chat_completion( ) ``` +#### ChatML chat format + +The following example will use the ChatML chat format regardless of GGUF metadata. + +```python +from llama_cpp import Llama +llm = Llama( + model_path="path/to/dolphin/dolphin-model.gguf", + chat_format="chatml" +) +llm.create_chat_completion( + messages = [ + {"role": "system", "content": "You are an uncensored and unbiased AI assistant."}, + { + "role": "user", + "content": "Please give ideas and a detailed plan about how to assemble and train an army of dolphin companions." + } + ] +) +``` + +#### RAG chat format (if present in GGUF metadata) + +The following example will use the RAG chat template as specified in GGUF (only Command R has this for now) and pass on documents. + +```python +from llama_cpp import Llama +llm = Llama( + model_path="path/to/command-r/c4ai-model.gguf", + chat_format="chat_template.rag" +) +llm.create_chat_completion( + messages = [ + { + "role": "user", + "content": "Write a short summary of each document please." + } + ], + template_kwargs = { + "documents": [ + { + "title": "First document", + "content": "...", + }, + { + "title": "Second document", + "content": "...", + } + ] + } +) +``` + Chat completion is available through the [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion) method of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class. For OpenAI API v1 compatibility, you use the [`create_chat_completion_openai_v1`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion_openai_v1) method which will return pydantic models instead of dicts. @@ -427,6 +483,56 @@ llm.create_chat_completion( ### Function Calling +#### Basic function calling through chat template (if supported) + +The following example will use the Tool Use chat template as specified in GGUF (only Command R has this for now). +Many other models could support this if the chat templates were added, while others like [this](https://huggingface.co/CISCai/gorilla-openfunctions-v2-SOTA-GGUF) one supports it with its default template. + +```python +from llama_cpp import Llama +llm = Llama( + model_path="path/to/command-r/c4ai-model.gguf", + chat_format="chat_template.tool_use" +) +llm.create_chat_completion( + messages = [ + { + "role": "user", + "content": "What's the weather like in Oslo?" + } + ], + tools = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": [ "celsius", "fahrenheit" ] + } + }, + "required": [ "location" ] + } + } + }], + tool_choice = { + "type": "function", + "function": { + "name": "get_current_weather" + } + } +) +``` + +#### Built in function calling + The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format. 
```python From aa87f55bb638bfaf20a44ce7542329c18c380f5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 23 May 2024 04:02:25 +0200 Subject: [PATCH 3/7] new model and function calling example --- README.md | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/README.md b/README.md index 3354681d2..8ee176b97 100644 --- a/README.md +++ b/README.md @@ -531,6 +531,63 @@ llm.create_chat_completion( ) ``` +Another model that supports function calling with its default template is [this](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF) one, with full tool_calls support: + +```python +from llama_cpp import Llama +llm = Llama( + model_path="path/to/mistral/mistral-v3-model.gguf" +) +llm.create_chat_completion( + messages = [ + { + "role": "user", + "content": "What's the weather like in Oslo?" + }, + { # The tool_calls is from the response to the above with tool_choice specified + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call__0_get_current_weather_cmpl-...", + "type": "function", + "function": { + "name": "get_current_weather", + "arguments": '{ "location": "Oslo, NO" ,"unit": "celsius"} ' + } + } + ] + }, + { # The tool_call_id is from tool_calls and content is the result from the function call you made + "role": "tool", + "content": 20, + "tool_call_id": "call__0_get_current_weather_cmpl-..." + } + ], + tools=[{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": [ "celsius", "fahrenheit" ] + } + }, + "required": [ "location" ] + } + } + }] +) +``` + #### Built in function calling The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format. From 9e9cf101101c9a31527df88bffc273275ec5622e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 25 May 2024 21:51:06 +0200 Subject: [PATCH 4/7] added grammar example --- README.md | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8ee176b97..23aca0466 100644 --- a/README.md +++ b/README.md @@ -486,7 +486,7 @@ llm.create_chat_completion( #### Basic function calling through chat template (if supported) The following example will use the Tool Use chat template as specified in GGUF (only Command R has this for now). -Many other models could support this if the chat templates were added, while others like [this](https://huggingface.co/CISCai/gorilla-openfunctions-v2-SOTA-GGUF) one supports it with its default template. +Many other models could support this if the chat templates were added, while others like [this](https://huggingface.co/CISCai/gorilla-openfunctions-v2-SOTA-GGUF) one and [this](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF) one supports it with its default template. 
```python from llama_cpp import Llama @@ -531,7 +531,63 @@ llm.create_chat_completion( ) ``` -Another model that supports function calling with its default template is [this](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF) one, with full tool_calls support: +If you need to do more advanced parsing of the tool response, f.ex. if you expect multiple/parallel tool calls try using `grammar` instead of `tool_choice`: + +```python +from llama_cpp import Llama +from llama_cpp.llama_grammar import LlamaGrammar +import json +llm = Llama( + model_path="path/to/gorilla/openfunctions-v2-model.gguf" +) +response = llm.create_chat_completion( + messages = [ + { + "role": "user", + "content": "What's the weather like in Oslo and Stockholm?" + } + ], + tools = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": [ "celsius", "fahrenheit" ] + } + }, + "required": [ "location" ] + } + } + }], + grammar = LlamaGrammar.from_json_schema(json.dumps({ + "type": "array", + "items": { + "type": "object", + "required": [ "name", "arguments" ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "object" + } + } + } + })) +) +json.loads(response["choices"][0]["text"]) +``` + +Here's an example of a full function calling round-trip: ```python from llama_cpp import Llama From 97f028cead6db10db553d8b8e94714a5acee3cc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 26 May 2024 12:51:04 +0200 Subject: [PATCH 5/7] content should be string --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 23aca0466..39302787c 100644 --- a/README.md +++ b/README.md @@ -616,7 +616,7 @@ llm.create_chat_completion( }, { # The tool_call_id is from tool_calls and content is the result from the function call you made "role": "tool", - "content": 20, + "content": "20", "tool_call_id": "call__0_get_current_weather_cmpl-..." } ], From d320b2725d53184f4e8bd3a1a659710cf51daba4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 12 Jun 2024 09:00:06 +0200 Subject: [PATCH 6/7] add documents --- llama_cpp/llama.py | 3 +++ llama_cpp/llama_chat_format.py | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2e4172684..1d6a2f7f8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1689,6 +1689,7 @@ def create_chat_completion( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + documents: Optional[List[Dict[str, str]]] = None, template_kwargs: Optional[Dict[str, Any]] = None, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] @@ -1722,6 +1723,7 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. + documents: A list of documents to use for the chat completion. template_kwargs: Optional dictionary of arguments to pass to chat template. 
Returns: @@ -1760,6 +1762,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + documents=documents, template_kwargs=template_kwargs, ) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 99aa7819c..6e8be9c5a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -205,6 +205,7 @@ def __call__( function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + documents: Optional[List[Dict[str, str]]] = None, template_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> ChatFormatterResponse: @@ -224,6 +225,7 @@ def raise_exception(message: str): function_call=function_call, tools=tools, tool_choice=tool_choice, + documents=documents, **template_kwargs, ) @@ -540,6 +542,7 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + documents: Optional[List[Dict[str, str]]] = None, template_kwargs: Optional[Dict[str, Any]] = None, **kwargs, # type: ignore ) -> Union[ @@ -552,6 +555,7 @@ def chat_completion_handler( function_call=function_call, tools=tools, tool_choice=tool_choice, + documents=documents, template_kwargs=template_kwargs, ) prompt = result.prompt From 6e18f5b459b94ab21650951b55edd619332ebcb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 12 Jun 2024 09:11:26 +0200 Subject: [PATCH 7/7] update example --- README.md | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 39302787c..dd2ac9f6e 100644 --- a/README.md +++ b/README.md @@ -408,18 +408,16 @@ llm.create_chat_completion( "content": "Write a short summary of each document please." } ], - template_kwargs = { - "documents": [ - { - "title": "First document", - "content": "...", - }, - { - "title": "Second document", - "content": "...", - } - ] - } + documents = [ + { + "title": "First document", + "content": "...", + }, + { + "title": "Second document", + "content": "...", + } + ] ) ```
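The `template_kwargs` parameter introduced in [PATCH 1/7] is forwarded as extra keyword arguments into the Jinja2 chat template render, so after [PATCH 7/7] moves `documents` to a dedicated parameter, `template_kwargs` remains useful for any other custom variable a model's GGUF chat template defines. A minimal sketch of that usage (the model path is a placeholder and `enable_thinking` is a hypothetical template variable; it only affects the prompt if the model's template actually references it):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="path/to/your/model.gguf"  # placeholder path
)
llm.create_chat_completion(
    messages = [
        {"role": "user", "content": "Hello!"}
    ],
    # Every key in template_kwargs is passed straight through to the
    # Jinja2 chat template as a variable; keys the template does not
    # reference are simply ignored.
    template_kwargs = {
        "enable_thinking": False  # hypothetical template variable
    }
)
```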