diff --git a/README.md b/README.md
index 5f3ff6824..dd2ac9f6e 100644
--- a/README.md
+++ b/README.md
@@ -350,11 +350,14 @@ The model will format the messages into a single prompt using the following
 
 Set `verbose=True` to see the selected chat format.
 
+#### Default chat format
+
+The following example will use the default chat template as specified in the GGUF metadata (or fall back to llama-2).
+
 ```python
 from llama_cpp import Llama
 llm = Llama(
       model_path="path/to/llama-2/llama-model.gguf",
-      chat_format="llama-2"
 )
 llm.create_chat_completion(
       messages = [
@@ -367,6 +370,57 @@ llm.create_chat_completion(
 )
 ```
 
+#### ChatML chat format
+
+The following example will use the ChatML chat format regardless of the GGUF metadata.
+
+```python
+from llama_cpp import Llama
+llm = Llama(
+      model_path="path/to/dolphin/dolphin-model.gguf",
+      chat_format="chatml"
+)
+llm.create_chat_completion(
+      messages = [
+          {"role": "system", "content": "You are an uncensored and unbiased AI assistant."},
+          {
+              "role": "user",
+              "content": "Please give ideas and a detailed plan about how to assemble and train an army of dolphin companions."
+          }
+      ]
+)
+```
+
+#### RAG chat format (if present in GGUF metadata)
+
+The following example will use the RAG chat template as specified in the GGUF metadata (only Command R has this for now) and pass in documents.
+
+```python
+from llama_cpp import Llama
+llm = Llama(
+      model_path="path/to/command-r/c4ai-model.gguf",
+      chat_format="chat_template.rag"
+)
+llm.create_chat_completion(
+      messages = [
+          {
+              "role": "user",
+              "content": "Write a short summary of each document please."
+          }
+      ],
+      documents = [
+          {
+              "title": "First document",
+              "content": "...",
+          },
+          {
+              "title": "Second document",
+              "content": "...",
+          }
+      ]
+)
+```
+
 Chat completion is available through the [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion) method of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class.
 
 For OpenAI API v1 compatibility, you use the [`create_chat_completion_openai_v1`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion_openai_v1) method which will return pydantic models instead of dicts.
@@ -427,6 +481,169 @@ llm.create_chat_completion(
 
 ### Function Calling
 
+#### Basic function calling through chat template (if supported)
+
+The following example will use the Tool Use chat template as specified in the GGUF metadata (only Command R has this for now).
+Many other models could support this if the chat templates were added, while others like [this](https://huggingface.co/CISCai/gorilla-openfunctions-v2-SOTA-GGUF) one and [this](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF) one support it with their default templates.
+
+```python
+from llama_cpp import Llama
+llm = Llama(
+      model_path="path/to/command-r/c4ai-model.gguf",
+      chat_format="chat_template.tool_use"
+)
+llm.create_chat_completion(
+      messages = [
+          {
+              "role": "user",
+              "content": "What's the weather like in Oslo?"
+          }
+      ],
+      tools = [{
+          "type": "function",
+          "function": {
+              "name": "get_current_weather",
+              "description": "Get the current weather in a given location",
+              "parameters": {
+                  "type": "object",
+                  "properties": {
+                      "location": {
+                          "type": "string",
+                          "description": "The city and state, e.g. San Francisco, CA"
+                      },
+                      "unit": {
+                          "type": "string",
+                          "enum": [ "celsius", "fahrenheit" ]
+                      }
+                  },
+                  "required": [ "location" ]
+              }
+          }
+      }],
+      tool_choice = {
+          "type": "function",
+          "function": {
+              "name": "get_current_weather"
+          }
+      }
+)
+```
+
+If you need to do more advanced parsing of the tool response, e.g. if you expect multiple/parallel tool calls, try using `grammar` instead of `tool_choice`:
+
+```python
+from llama_cpp import Llama
+from llama_cpp.llama_grammar import LlamaGrammar
+import json
+llm = Llama(
+      model_path="path/to/gorilla/openfunctions-v2-model.gguf"
+)
+response = llm.create_chat_completion(
+      messages = [
+          {
+              "role": "user",
+              "content": "What's the weather like in Oslo and Stockholm?"
+          }
+      ],
+      tools = [{
+          "type": "function",
+          "function": {
+              "name": "get_current_weather",
+              "description": "Get the current weather in a given location",
+              "parameters": {
+                  "type": "object",
+                  "properties": {
+                      "location": {
+                          "type": "string",
+                          "description": "The city and state, e.g. San Francisco, CA"
+                      },
+                      "unit": {
+                          "type": "string",
+                          "enum": [ "celsius", "fahrenheit" ]
+                      }
+                  },
+                  "required": [ "location" ]
+              }
+          }
+      }],
+      grammar = LlamaGrammar.from_json_schema(json.dumps({
+          "type": "array",
+          "items": {
+              "type": "object",
+              "required": [ "name", "arguments" ],
+              "properties": {
+                  "name": {
+                      "type": "string"
+                  },
+                  "arguments": {
+                      "type": "object"
+                  }
+              }
+          }
+      }))
+)
+json.loads(response["choices"][0]["message"]["content"])
+```
+
+Here's an example of a full function calling round-trip:
+
+```python
+from llama_cpp import Llama
+llm = Llama(
+      model_path="path/to/mistral/mistral-v3-model.gguf"
+)
+llm.create_chat_completion(
+      messages = [
+          {
+              "role": "user",
+              "content": "What's the weather like in Oslo?"
+          },
+          { # The tool_calls field comes from the response to the request above, made with tool_choice specified
+              "role": "assistant",
+              "content": None,
+              "tool_calls": [
+                  {
+                      "id": "call__0_get_current_weather_cmpl-...",
+                      "type": "function",
+                      "function": {
+                          "name": "get_current_weather",
+                          "arguments": '{ "location": "Oslo, NO" ,"unit": "celsius"} '
+                      }
+                  }
+              ]
+          },
+          { # The tool_call_id comes from tool_calls above; content is the result of the function call you made
+              "role": "tool",
+              "content": "20",
+              "tool_call_id": "call__0_get_current_weather_cmpl-..."
+          }
+      ],
+      tools=[{
+          "type": "function",
+          "function": {
+              "name": "get_current_weather",
+              "description": "Get the current weather in a given location",
+              "parameters": {
+                  "type": "object",
+                  "properties": {
+                      "location": {
+                          "type": "string",
+                          "description": "The city and state, e.g. San Francisco, CA"
+                      },
+                      "unit": {
+                          "type": "string",
+                          "enum": [ "celsius", "fahrenheit" ]
+                      }
+                  },
+                  "required": [ "location" ]
+              }
+          }
+      }]
+)
+```
+
+#### Built-in function calling
+
 The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format.
 
 ```python
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 4212669eb..1d6a2f7f8 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1689,6 +1689,8 @@ def create_chat_completion(
         logit_bias: Optional[Dict[str, float]] = None,
         logprobs: Optional[bool] = None,
         top_logprobs: Optional[int] = None,
+        documents: Optional[List[Dict[str, str]]] = None,
+        template_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Union[
         CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
     ]:
@@ -1721,6 +1723,8 @@ def create_chat_completion(
             logits_processor: A list of logits processors to use.
             grammar: A grammar to use.
             logit_bias: A logit bias to use.
+            documents: A list of documents to use for the chat completion.
+            template_kwargs: Optional dictionary of arguments to pass to chat template.
 
         Returns:
             Generated chat completion or a stream of chat completion chunks.
@@ -1758,6 +1762,8 @@
             logits_processor=logits_processor,
             grammar=grammar,
             logit_bias=logit_bias,
+            documents=documents,
+            template_kwargs=template_kwargs,
         )
 
     def create_chat_completion_openai_v1(
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 84de989a5..6e8be9c5a 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -205,11 +205,16 @@ def __call__(
         function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
         tools: Optional[List[llama_types.ChatCompletionTool]] = None,
         tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+        documents: Optional[List[Dict[str, str]]] = None,
+        template_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
     ) -> ChatFormatterResponse:
         def raise_exception(message: str):
             raise ValueError(message)
 
+        if template_kwargs is None:
+            template_kwargs = {}
+
         prompt = self._environment.render(
             messages=messages,
             eos_token=self.eos_token,
@@ -220,6 +225,8 @@ def raise_exception(message: str):
             function_call=function_call,
             tools=tools,
             tool_choice=tool_choice,
+            documents=documents,
+            **template_kwargs,
         )
 
         stopping_criteria = None
@@ -535,6 +542,8 @@ def chat_completion_handler(
         logit_bias: Optional[Dict[str, float]] = None,
         logprobs: Optional[bool] = None,
         top_logprobs: Optional[int] = None,
+        documents: Optional[List[Dict[str, str]]] = None,
+        template_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs,  # type: ignore
     ) -> Union[
         llama_types.CreateChatCompletionResponse,
@@ -546,6 +555,8 @@ def chat_completion_handler(
             function_call=function_call,
             tools=tools,
             tool_choice=tool_choice,
+            documents=documents,
+            template_kwargs=template_kwargs,
         )
         prompt = result.prompt
         if result.stop is not None:
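
The README changes above demonstrate the new `documents` parameter but not `template_kwargs`, which this diff forwards into the Jinja2 template render call as `**template_kwargs`. The following is a rough sketch (not part of the patch) of how it might be used; the `citation_mode` key is an assumed template variable and the model path is a placeholder, since which variables are meaningful depends entirely on the chat template shipped with the model.

```python
from llama_cpp import Llama

llm = Llama(
    model_path="path/to/command-r/c4ai-model.gguf",  # placeholder path, as in the README examples
    chat_format="chat_template.rag"
)
response = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": "Write a short summary of each document please."}
    ],
    documents=[
        {"title": "First document", "content": "..."},
        {"title": "Second document", "content": "..."}
    ],
    # Each key in template_kwargs becomes a variable visible to the Jinja2 chat
    # template (it is expanded as **template_kwargs in the render call above).
    # "citation_mode" is only an assumed example of such a variable.
    template_kwargs={"citation_mode": "accurate"}
)
print(response["choices"][0]["message"]["content"])
```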