diff --git a/.github/workflows/bench-docker.yml b/.github/workflows/bench-docker.yml new file mode 100644 index 0000000..c21adec --- /dev/null +++ b/.github/workflows/bench-docker.yml @@ -0,0 +1,89 @@ +name: Harbor Bench Docker image + +on: + workflow_dispatch: + push: + branches: + - 'main' + paths: + - bench/** + release: + types: [published] + +jobs: + build-and-push-image: + runs-on: ubuntu-latest + + permissions: + contents: read + packages: write + + env: + DOCKER_REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }}-bench + TAG: ${{ github.sha }} + + steps: + - name: Check out code + uses: actions/checkout@v4 + + - name: Free Disk Space Before Build + run: | + sudo rm -rf /usr/local/.ghcup + sudo rm -rf /opt/hostedtoolcache/CodeQL + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + # Log in to the GitHub Container Registry only when not running on a pull request event + - name: Login to Docker Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.DOCKER_REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }} + + - name: Debug + run: | + echo "PWD" + pwd + + echo "Workspace" + ls -la + + echo "Bench" + ls -la ./bench + + # Build and push the Docker image to GHCR for the main branch or specific tags + - name: Build and Push Docker Image + if: github.ref == 'refs/heads/main' + uses: docker/build-push-action@v6 + with: + context: ./bench + file: bench/Dockerfile + push: true + tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest + labels: version=${{ github.run_id }} + platforms: linux/amd64,linux/arm64 + + # For tagged releases, build and push the Docker image with the corresponding tag + - name: Build and Push Docker Image (Tagged) + if: startsWith(github.ref, 'refs/tags/') + uses: docker/build-push-action@v6 + with: + context: ./bench + file: bench/Dockerfile + push: true + tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }} + labels: version=${{ github.run_id }} + platforms: linux/amd64,linux/arm64 diff --git a/boost/src/chat.py b/boost/src/chat.py index dc15d1d..b3f252a 100644 --- a/boost/src/chat.py +++ b/boost/src/chat.py @@ -1,87 +1,11 @@ -import random +from typing import Optional -from typing import List, Optional +from chat_node import ChatNode import llm import log logger = log.setup_logger(__name__) -class ChatNode: - id: str - content: str - role: str - - parent: Optional['ChatNode'] - children: List['ChatNode'] - - visits: int - value: float - meta: dict - - def from_conversation(messages): - root_message = messages[0] - node = ChatNode(role=root_message['role'], content=root_message['content']) - - for message in messages[1:]: - node = node.add_child( - ChatNode(role=message['role'], content=message['content']) - ) - - return node - - def __init__(self, **kwargs): - self.id = ''.join( - random.choices('abcdefghijklmnopqrstuvwxyz0987654321', k=4) - ) - self.content = kwargs.get('content', '') - self.role = kwargs.get('role', '') - - self.parent = kwargs.get('parent', None) - self.children = kwargs.get('children', []) - - self.visits = kwargs.get('visits', 0) - self.value = kwargs.get('value', 0.0) - - self.meta = kwargs.get('meta', {}) - - def add_child(self, child: 'ChatNode'): - 
child.parent = self - self.children.append(child) - return child - - def best_child(self): - if not self.children: - return self - return max(self.children, key=lambda c: c.value).best_child() - - def parents(self): - parents = [self] - - while self.parent: - self = self.parent - parents.append(self) - - return parents[::-1] - - def history(self): - node = self - messages = [{ - "role": node.role, - "content": node.content, - }] - - while node.parent: - node = node.parent - messages.append({ - "role": node.role, - "content": node.content, - }) - - return messages[::-1] - - def __str__(self): - return f"{self.role}: {self.content}" - class Chat: tail: ChatNode @@ -94,11 +18,20 @@ def from_conversation(messages): def __init__(self, **kwargs): self.tail = kwargs.get('tail') self.llm = kwargs.get('llm') + self.chat_node_type = ChatNode + + self.Chat = Chat + self.ChatNode = self.chat_node_type + + def has_substring(self, substring): + return any(substring in msg.content for msg in self.plain()) def add_message(self, role, content): logger.debug(f"Chat message: {role}: {content[:50]}") - self.tail = self.tail.add_child(ChatNode(role=role, content=content)) + self.tail = self.tail.add_child( + self.__create_node(role=role, content=content) + ) return self.tail def user(self, content): @@ -107,18 +40,44 @@ def user(self, content): def assistant(self, content): return self.add_message('assistant', content) + def system(self, content): + return self.add_message('system', content) + def plain(self): return self.tail.parents() def history(self): return self.tail.history() + def __create_node(self, **kwargs): + NodeType = self.chat_node_type + return NodeType(**kwargs) + async def advance(self): + """ + Advance the chat completion + + Will not be streamed back to the client + """ + if not self.llm: raise ValueError("Chat: unable to advance without an LLM") - response = await self.llm.chat_completion(self) + response = await self.llm.chat_completion(chat=self) self.assistant(self.llm.get_response_content(response)) + async def emit_advance(self): + """ + Emit the next step in the chat completion + + Will be streamed back to the client + """ + + if not self.llm: + raise ValueError("Chat: unable to advance without an LLM") + + response = await self.llm.stream_chat_completion(chat=self) + self.assistant(response) + def __str__(self): - return '\n'.join([str(msg) for msg in self.parents()]) + return '\n'.join([str(msg) for msg in self.plain()]) diff --git a/boost/src/chat_node.py b/boost/src/chat_node.py new file mode 100644 index 0000000..eea6709 --- /dev/null +++ b/boost/src/chat_node.py @@ -0,0 +1,90 @@ +import random + +from typing import List, Optional +import log + +logger = log.setup_logger(__name__) + +class ChatNode: + id: str + content: str + role: str + + parent: Optional['ChatNode'] + children: List['ChatNode'] + + visits: int + value: float + meta: dict + + def from_conversation(messages): + root_message = messages[0] + node = ChatNode(role=root_message['role'], content=root_message['content']) + + for message in messages[1:]: + node = node.add_child( + ChatNode(role=message['role'], content=message['content']) + ) + + return node + + def __init__(self, **kwargs): + self.id = ''.join( + random.choices('abcdefghijklmnopqrstuvwxyz0987654321', k=4) + ) + self.content = kwargs.get('content', '') + self.role = kwargs.get('role', '') + + self.parent = kwargs.get('parent', None) + self.children = kwargs.get('children', []) + + self.visits = kwargs.get('visits', 0) + self.value = kwargs.get('value', 
0.0) + + self.meta = kwargs.get('meta', {}) + + def add_parent(self, parent: 'ChatNode'): + parent.children.append(self) + self.parent = parent + return self + + def add_child(self, child: 'ChatNode'): + child.parent = self + self.children.append(child) + return child + + def best_child(self): + if not self.children: + return self + return max(self.children, key=lambda c: c.value).best_child() + + def contains(self, substring): + return substring.lower() in self.content.lower() + + def parents(self): + parents = [self] + + while self.parent: + self = self.parent + parents.append(self) + + return parents[::-1] + + def history(self): + node = self + messages = [{ + "role": node.role, + "content": node.content, + }] + + while node.parent: + node = node.parent + messages.append({ + "role": node.role, + "content": node.content, + }) + + return messages[::-1] + + def __str__(self): + return f"{self.role}: {self.content}" \ No newline at end of file diff --git a/boost/src/config.py b/boost/src/config.py index 82631e6..5b9a06a 100644 --- a/boost/src/config.py +++ b/boost/src/config.py @@ -183,49 +183,115 @@ def _convert_value(self, value: str) -> T: # Combining all the sources from # above into a single list -HARBOR_BOOST_APIS = [ +BOOST_APIS = [ *HARBOR_OPENAI_URLS.value, *HARBOR_BOOST_OPENAI_URLS.value, *HARBOR_BOOST_EXTRA_OPENAI_URLS.value ] -HARBOR_BOOST_KEYS = [ +BOOST_KEYS = [ *HARBOR_OPENAI_KEYS.value, *HARBOR_BOOST_OPENAI_KEYS.value, *HARBOR_BOOST_EXTRA_OPENAI_KEYS.value ] # ----------------- MODULES ----------------- -HARBOR_BOOST_MODULES = Config[StrList]( +BOOST_MODS = Config[StrList]( name='HARBOR_BOOST_MODULES', type=StrList, - default='', + default='klmbr;rcn;g1', description='A list of boost modules to load' ) -# ----------------- KLMBR ----------------- +BOOST_FOLDERS = Config[StrList]( + name='HARBOR_BOOST_MODULE_FOLDERS', + type=StrList, + default='modules;custom_modules', + description='A list of folders to load boost modules from' +) + +# ---------------- COMPLETION --------------- -HARBOR_BOOST_KLMBR_PERCENTAGE = Config[int]( +INTERMEDIATE_OUTPUT = Config[bool]( + name='HARBOR_BOOST_INTERMEDIATE_OUTPUT', + type=bool, + default='true', + description='Whether to output intermediate completions' +) + +STATUS_STYLE = Config[str]( + name='HARBOR_BOOST_STATUS_STYLE', + type=str, + default='md:codeblock', + description='The style of status messages' +) + +# ---------------- BEHAVIOR ----------------- + +SERVE_BASE_MODELS = Config[bool]( + name='HARBOR_BOOST_BASE_MODELS', + type=bool, + default='false', + description= + 'When enabled, boost will also serve original models from downstream APIs' +) + +MODEL_FILTER = Config[ConfigDict]( + name='HARBOR_BOOST_MODEL_FILTER', + type=ConfigDict, + default='', + description= + 'When specified, boost will only serve models which IDs match the filter' +) + +API_KEY = Config[str]( + name='HARBOR_BOOST_API_KEY', + type=str, + default='', + description='The API key to use for the boost API' +) + +API_KEYS = Config[StrList]( + name='HARBOR_BOOST_API_KEYS', + type=StrList, + default='', + description='A colon-separated list of API keys to use for the boost API' +) + +EXTRA_KEYS = Config[str]( + name='HARBOR_BOOST_API_KEY_*', + type=str, + default='', + description='Named API keys to use for the boost API' +) + +BOOST_AUTH = [ + key for key in [API_KEY.value, *API_KEYS.value, *EXTRA_KEYS.value] if key +] + +# ------------------ KLMBR ------------------ + +KLMBR_PERCENTAGE = Config[int]( name='HARBOR_BOOST_KLMBR_PERCENTAGE', type=int, default='15', 
description='The percentage of text to modify with the klmbr module' ) -HARBOR_BOOST_KLMBR_MODS = Config[StrList]( +KLMBR_MODS = Config[StrList]( name='HARBOR_BOOST_KLMBR_MODS', type=StrList, default='', description=f'The list of modifications klmbr will apply' ) -HARBOR_BOOST_KLMBR_STRAT = Config[str]( +KLMBR_STRAT = Config[str]( name='HARBOR_BOOST_KLMBR_STRAT', type=str, default='all', description='The strategy that selects messages to modify for the klmbr module' ) -HARBOR_BOOST_KLMBR_STRAT_PARAMS = Config[ConfigDict]( +KLMBR_STRAT_PARAMS = Config[ConfigDict]( name='HARBOR_BOOST_KLMBR_STRAT_PARAMS', type=ConfigDict, default='', @@ -235,43 +301,87 @@ def _convert_value(self, value: str) -> T: # ----------------- RCN ----------------- -HARBOR_BOOST_RCN_STRAT = Config[str]( +RCN_STRAT = Config[str]( name='HARBOR_RCN_STRAT', type=str, default='match', description='The strategy that selects messages to modify for the rcn module' ) -HARBOR_BOOST_RCN_STRAT_PARAMS = Config[ConfigDict]( +RCN_STRAT_PARAMS = Config[ConfigDict]( name='HARBOR_RCN_STRAT', type=ConfigDict, - # Default - last user message + # Default - last user message default='role=user,index=-1', - description= - 'The parameters for the strategy that selects messages to modify for the rcn module' + description='Parameters for rcn message selection' ) # ----------------- G1 ----------------- -HARBOR_BOOST_G1_STRAT = Config[str]( +G1_STRAT = Config[str]( name='HARBOR_G1_STRAT', type=str, default='match', description='The strategy that selects messages to modify for the g1 module' ) -HARBOR_BOOST_G1_STRAT_PARAMS = Config[ConfigDict]( +G1_STRAT_PARAMS = Config[ConfigDict]( name='HARBOR_G1_STRAT_PARAMS', type=ConfigDict, - # Default - last user message + # Default - last user message default='role=user,index=-1', - description= - 'The parameters for the strategy that selects messages to modify for the g1 module' + description='Parameters for g1 message selection' ) -HARBOR_BOOST_G1_MAX_STEPS = Config[int]( +G1_MAX_STEPS = Config[int]( name='HARBOR_G1_MAX_STEPS', type=int, default='15', description='The maximum number of reasoning steps to generate' -) \ No newline at end of file +) + +# ----------------- MCTS ----------------- + +MCTS_STRAT = Config[str]( + name='HARBOR_MCTS_STRAT', + type=str, + default='match', + description='The strategy that selects messages to target for the mcts module' +) + +MCTS_STRAT_PARAMS = Config[ConfigDict]( + name='HARBOR_MCTS_STRAT_PARAMS', + type=ConfigDict, + # Default - last user message + default='role=user,index=-1', + description='Parameters for mcts message selection' +) + +MCTS_MAX_SIMULATIONS = Config[int]( + name='HARBOR_MCTS_MAX_SIMULATIONS', + type=int, + default='2', + description='The maximum number of simulations to run (per iteration)' +) + +MCTS_MAX_ITERATIONS = Config[int]( + name='HARBOR_MCTS_MAX_ITERATIONS', + type=int, + default='2', + description='The maximum number of iterations to run' +) + +MCTS_THOUGHTS = Config[int]( + name='HARBOR_MCTS_THOUGHTS', + type=int, + default='2', + description= + 'The amount of thoughts (node expansions) to generate per simulation' +) + +MCTS_EXPLORATION_CONSTANT = Config[float]( + name='HARBOR_MCTS_EXPLORATION_CONSTANT', + type=float, + default='1.414', + description='The exploration constant for the MCTS algorithm' +) diff --git a/boost/src/custom_modules/.gitkeep b/boost/src/custom_modules/.gitkeep new file mode 100644 index 0000000..058e287 --- /dev/null +++ b/boost/src/custom_modules/.gitkeep @@ -0,0 +1 @@ +# It is our folder and we want it here, 
thanks \ No newline at end of file diff --git a/boost/src/custom_modules/example.py b/boost/src/custom_modules/example.py new file mode 100644 index 0000000..6dfb3c0 --- /dev/null +++ b/boost/src/custom_modules/example.py @@ -0,0 +1,178 @@ +import llm +import log +import chat as ch + +ID_PREFIX = 'example' +logger = log.setup_logger(ID_PREFIX) + + +async def apply(chat: 'ch.Chat', llm: 'llm.LLM'): + """ + 1. Working with a chat and chat nodes instances + This is where you can create some content programmatically, + that will later can be used for retrieving completions from + the downstream model. + """ + logger.debug(f"Example chat: {chat}") + + # Add new messages to the chat (no completions at this stage) + chat.user("Hello!") + chat.assistant("Hi! Would you like to learn more about Harbor Boost?") + chat.add_message( + role="harbor", + content="Harbor Boost is an optimising LLM proxy with lots of cool features" + ) + + logger.debug( + f"Chat history is a plain array of messages, from the tail: {chat.history()}" + ) + logger.debug( + f"Chat plain is a list of chat nodes, from the tail: {chat.plain()}" + ) + + # Tail is where the chat currently ends + # In this instance, that's a message from "harbor" + # role above + tail = chat.tail + + logger.debug( + f'Get all parents leading to a specific chat node: {tail.parents()}' + ) + logger.debug(f'Get one immediate parent: {tail.parent}') + + # We can modify the chat from the tail node directly + new_tail = tail.add_child( + ch.ChatNode(role="harbor", content="Chat nodes are everywhere!") + ) + + # However, such modifications are not reflected in the parent + # chat instance: + logger.debug(chat.tail == tail) # True + logger.debug(chat.tail == new_tail) # False + + # You can set a new tail for the chat, however + chat.tail = new_tail + # However, it's much easier to just work from the chat itself + chat.user('Alright, I think that is mostly it for now. Thanks!') + + # You can create new chat instances as needed + life_chat = ch.Chat.from_conversation( + [ + { + "role": "user", + "content": "What is the meaning of life? Answer with a tongue twister." + } + ] + ) + """ + 3.1 Programmatic messages and statuses + programmatic "public" messages that are streamed + back to the client as they are emitted here + (no way to "undo" or rewrite them) + """ + # You can tweak how status messages are delivered + # via the BOOST_STATUS_STYLE config option. + await llm.emit_status('Status and message examples') + await llm.emit_message("We can emit text at any time. ") + await llm.emit_message( + "\n_Note that you are responsible for correct formatting._\n" + ) + """ + 3.2. Internal LLM completions + "llm" is a representation of the downstream model + that is being boosted. It comes with a few helper + methods that tie up the module workflow together and + is pre-configured to hit the downstream API with expected parameters. + + The completions below are "internal", they are not streamed + back to the client by default. Read further for "streamed" or + "public" completions. 
+ """ + await llm.emit_status('Collecting internal completions...') + word = "Roses" + results = [ + # You can retrieve completion for some plain text + await llm.chat_completion(prompt="Hi!", resolve=True), + # You can include key/value pairs to be formatted in the prompt + await llm.chat_completion( + prompt="Tell me about {word} in ONE SHORT SENTENCE.", + word=word, + resolve=True, + ), + # You can also provide a list of messages + # in the OpenAI-compatible format + await llm.chat_completion( + messages=[ + { + "role": "user", + "content": "Tell me about roses" + }, { + "role": "assistant", + "content": "Sure, I can reply in three words! Here they are:" + } + ], + resolve=True + ), + # You can also provide a chat instance, + # Note that "resolve" is not set - the result + # will be in raw API format + f"\n```json\n{await llm.chat_completion(chat=life_chat)}\n```\n" + ] + # Results will now appear in the user's message + await llm.emit_status('Displaying collected results') + for i, result in enumerate(results): + await llm.emit_message(f"\nResult {i}: {result}\n") + """ + 3.3. Public/Streamed LLM completions + You can decide to stream responses from the downstream LLM + as they are being generated, for example when there's a long + chunk that needs to be retained in the global response. + """ + await llm.emit_status('Response streaming examples') + + # Same signatures as chat_completion + streamed_results = [ + # You can retrieve completion for some plain text + await llm.stream_chat_completion(prompt="Hi!"), + # You can include key/value pairs to be formatted in the prompt + await llm.stream_chat_completion( + prompt="Tell me about {word} in ONE SHORT SENTENCE.", + word=word, + ), + # You can also provide a list of messages + # in the OpenAI-compatible format + await llm.stream_chat_completion( + messages=[ + { + "role": "user", + "content": "Tell me about roses" + }, { + "role": "assistant", + "content": "Sure, I can reply in three words! Here they are:" + } + ], + ), + # You can also provide a chat instance + await llm.stream_chat_completion(chat=life_chat) + ] + # Streamed results are still buffered and available + # for you to use (plain text). + logger.debug(f"Streamed results: {streamed_results}") + + # Note that it's on you to apply formatting that will make + # sense in the context of the global message stream. + await llm.emit_message("\nThose are all results so far.\n") + """ + 4. Final completion + Note that none of the above will actually reach the Client + if the BOOST_INTERMEDIATE_OUTPUT is set to "false". + The "final" completion below, however, will *always* be streamed back. + It accepts all the same inputs as "chat_completion" and "stream_chat_completion" above. + You don't have to call it, but the output will be completely empty if the + "final" completion is not called and intermediate outputs are disabled. + + Think of this as a way to wrap up the module execution and + present the user with the final result. 
+ """ + await llm.emit_status('Final completion') + await llm.stream_final_completion(prompt="Wish me a good luck!",) diff --git a/boost/src/format.py b/boost/src/format.py new file mode 100644 index 0000000..25db750 --- /dev/null +++ b/boost/src/format.py @@ -0,0 +1,18 @@ +from config import STATUS_STYLE + +status_formatters = { + "md:codeblock": "\n```boost\n{status}\n```\n", + "md:h1": "\n\n# {status}\n\n", + "md:h2": "\n\n## {status}\n\n", + "md:h3": "\n\n### {status}\n\n", + "plain": "\n\n{status}\n\n", + "none": "" +} + +def format_status(status: str): + desired_format = STATUS_STYLE.value + + if desired_format not in status_formatters: + desired_format = "md:codeblock" + + return status_formatters[desired_format].format(status=status) diff --git a/boost/src/llm.py b/boost/src/llm.py index 3b4f4b3..d938db7 100644 --- a/boost/src/llm.py +++ b/boost/src/llm.py @@ -1,22 +1,19 @@ from typing import Optional, AsyncGenerator -import httpx -import json +import traceback -import modules.klmbr as klmbr -import modules.rcn as rcn -import modules.g1 as g1 +import json +import asyncio +import time +import httpx -import chat +from config import INTERMEDIATE_OUTPUT +import chat as ch import log +import format +import mods logger = log.setup_logger(__name__) -mods = { - "klmbr": klmbr, - "rcn": rcn, - "g1": g1, -} - class LLM: url: str @@ -26,6 +23,12 @@ class LLM: params: dict module: str + queue: asyncio.Queue + is_streaming: bool + is_final_stream: bool + + cpl_id: int + def __init__(self, **kwargs): self.url = kwargs.get('url') self.headers = kwargs.get('headers', {}) @@ -33,16 +36,27 @@ def __init__(self, **kwargs): self.model = kwargs.get('model') self.params = kwargs.get('params', {}) - messages = kwargs.get('messages', []) - self.messages = messages - self.chat = chat.Chat.from_conversation(messages) + self.chat = self.resolve_chat(**kwargs) + self.messages = self.chat.history() self.module = kwargs.get('module') + self.queue = asyncio.Queue() + self.is_streaming = False + self.is_final_stream = False + + self.cpl_id = 0 + @property def chat_completion_endpoint(self): return f"{self.url}/chat/completions" + def generate_system_fingerprint(self): + return "fp_boost" + + def generate_chunk_id(self): + return f"chatcmpl-{++self.cpl_id}" + def get_response_content(self, response): return response['choices'][0]['message']['content'] @@ -50,7 +64,13 @@ def get_chunk_content(self, chunk): return chunk["choices"][0]["delta"]["content"] def parse_chunk(self, chunk): - chunk_str = chunk.decode('utf-8').split("\n")[0] + if isinstance(chunk, dict): + return chunk + + if isinstance(chunk, bytes): + chunk = chunk.decode('utf-8') + + chunk_str = chunk.split("\n")[0] if chunk_str.startswith("data: "): chunk_str = chunk_str[6:] @@ -60,23 +80,130 @@ def parse_chunk(self, chunk): logger.error(f"Failed to parse chunk: {chunk_str}") return {} - async def apply(self): - logger.debug('Applying boost...') + def output_from_chunk(self, chunk): + return { + "id": chunk["id"], + "object": "chat.completion", + "created": chunk["created"], + "model": self.model, + "system_fingerprint": self.generate_system_fingerprint(), + "choices": + [ + { + "index": choice["index"], + "message": + { + "role": choice["delta"].get("role", "assistant"), + "content": choice["delta"].get("content", "") + }, + "finish_reason": None + } for choice in chunk["choices"] + ], + "usage": { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0 + } + } + + def chunk_from_message(self, message: str): + now = int(time.time()) + + 
return { + "id": + self.generate_chunk_id(), + "object": + "chat.completion.chunk", + "created": + now, + "model": + self.model, + "system_fingerprint": + self.generate_system_fingerprint(), + "choices": + [ + { + "index": 0, + "delta": { + "role": "assistant", + "content": message + }, + "finish_reason": None + } + ] + } + + def chunk_to_string(self, chunk): + if isinstance(chunk, dict): + chunk = f"data: {json.dumps(chunk)}\n\n" + + return chunk + + async def serve(self): + logger.debug('Serving boosted LLM...') if self.module is None: logger.debug("No module specified") return self.stream_chat_completion() - mod = mods.get(self.module) + mod = mods.registry.get(self.module) if mod is None: logger.error(f"Module '{self.module}' not found.") return - logger.debug(f"Applying '{self.module}' to '{self.model}'") - return await mod.apply(chat=self.chat, llm=self) + async def apply_mod(): + logger.debug(f"Applying '{self.module}' to '{self.model}'") + try: + await mod.apply(chat=self.chat, llm=self) + except Exception as e: + logger.error(f"Failed to apply module '{self.module}': {e}") + for line in traceback.format_tb(e.__traceback__): + logger.error(line) + + logger.debug(f"'{self.module}' application complete for '{self.model}'") + await self.emit_done() + + asyncio.create_task(apply_mod()) + return self.response_stream() + + async def generator(self): + self.is_streaming = True + + while self.is_streaming: + chunk = await self.queue.get() + + if chunk is None: + break + yield chunk + + async def response_stream(self): + async for chunk in self.generator(): + # Final stream is always passed back as + # that's the useful payload of a given iteration + if INTERMEDIATE_OUTPUT.value or self.is_final_stream: + yield chunk + + async def emit_status(self, status): + await self.emit_message(format.format_status(status)) + + async def emit_message(self, message): + await self.emit_chunk(self.chunk_from_message(message)) + + async def emit_chunk(self, chunk): + await self.queue.put(self.chunk_to_string(chunk)) + + async def emit_done(self): + await self.queue.put(None) + self.is_streaming = False + + async def stream_final_completion(self, **kwargs): + self.is_final_stream = True + return await self.stream_chat_completion(**kwargs) + + async def stream_chat_completion(self, **kwargs): + chat = self.resolve_chat(**kwargs) - async def stream_chat_completion(self, chat: Optional['chat.Chat'] = None): logger.debug( f"Streaming Chat Completion for '{self.chat_completion_endpoint}" ) @@ -84,6 +211,8 @@ async def stream_chat_completion(self, chat: Optional['chat.Chat'] = None): if chat is None: chat = self.chat + result = "" + async with httpx.AsyncClient(timeout=None) as client: async with client.stream( "POST", @@ -97,11 +226,23 @@ async def stream_chat_completion(self, chat: Optional['chat.Chat'] = None): } ) as response: async for chunk in response.aiter_bytes(): - yield chunk + parsed = self.parse_chunk(chunk) + content = self.get_chunk_content(parsed) + result += content - async def chat_completion(self, chat: Optional['chat.Chat'] = None): - logger.debug(f"Chat Completion for '{self.chat_completion_endpoint}'") + # We emit done after the module + # application has completed + if not '[DONE]' in f"{chunk}": + await self.emit_chunk(chunk) + + return result + async def chat_completion(self, **kwargs): + chat = self.resolve_chat(**kwargs) + should_resolve = kwargs.get("resolve", False) + + logger.debug(f"Chat Completion for '{self.chat_completion_endpoint}'") + logger.debug(f"Chat: {chat}") if chat is 
None: chat = self.chat @@ -114,8 +255,10 @@ async def chat_completion(self, chat: Optional['chat.Chat'] = None): response = await client.post( self.chat_completion_endpoint, headers=self.headers, json=body ) - - return response.json() + result = response.json() + if should_resolve: + return self.get_response_content(result) + return result async def consume_stream(self, stream: AsyncGenerator[bytes, None]): output_obj = None @@ -123,40 +266,31 @@ async def consume_stream(self, stream: AsyncGenerator[bytes, None]): async for chunk_bytes in stream: chunk = self.parse_chunk(chunk_bytes) - if output_obj is None: - # First chunk - init - output_obj = { - "id": chunk["id"], - "object": "chat.completion", - "created": chunk["created"], - "model": self.model, - "system_fingerprint": chunk["system_fingerprint"], - "choices": - [ - { - "index": choice["index"], - "message": - { - "role": choice["delta"].get("role", "assistant"), - "content": "" - }, - "finish_reason": None - } for choice in chunk["choices"] - ], - "usage": - { - "prompt_tokens": 0, - "completion_tokens": 0, - "total_tokens": 0 - } - } - + output_obj = self.output_from_chunk(chunk) chunk_content = self.get_chunk_content(chunk) content += chunk_content - # Set the aggregated content if output_obj: output_obj["choices"][0]["message"]["content"] = content return output_obj + + def resolve_chat( + self, + messages: Optional[list] = None, + chat: Optional['ch.Chat'] = None, + prompt: Optional[str] = None, + **prompt_kwargs + ): + if chat is not None: + return chat + + if messages is not None: + return ch.Chat.from_conversation(messages) + + if prompt is not None: + message = prompt.format(**prompt_kwargs) + return ch.Chat.from_conversation([{"role": "user", "content": message}]) + + return self.chat diff --git a/boost/src/main.py b/boost/src/main.py index 2efa48a..cdb3821 100644 --- a/boost/src/main.py +++ b/boost/src/main.py @@ -1,21 +1,30 @@ -from typing import List, Dict, Any import httpx import json -from fastapi import FastAPI, Request, HTTPException +from fastapi import FastAPI, Request, HTTPException, Depends, Security +from fastapi.security.api_key import APIKeyHeader from fastapi.responses import JSONResponse, StreamingResponse -from config import HARBOR_BOOST_OPENAI_URLS, HARBOR_BOOST_OPENAI_KEYS +from config import MODEL_FILTER, SERVE_BASE_MODELS, BOOST_AUTH from log import setup_logger +import selection import mapper import config +import mods import llm logger = setup_logger(__name__) app = FastAPI() +auth_header = APIKeyHeader(name="Authorization", auto_error=False) # ------------------------------ +async def get_api_key(api_key_header: str = Security(auth_header)): + if len(BOOST_AUTH) == 0: + return + if api_key_header in BOOST_AUTH: + return api_key_header + raise HTTPException(status_code=403, detail="Unauthorized") @app.get("/") @@ -35,34 +44,42 @@ async def health(): @app.get("/v1/models") -async def get_boost_models(): +async def get_boost_models(api_key: str = Depends(get_api_key)): downstream = await mapper.list_downstream() - enabled_modules = config.HARBOR_BOOST_MODULES.value - - proxy_models = [] + enabled_modules = config.BOOST_MODS.value + should_filter = len(MODEL_FILTER.value) > 0 + serve_base_models = SERVE_BASE_MODELS.value + candidates = [] + final = [] for model in downstream: - proxy_models.append(model) + if serve_base_models: + candidates.append(model) + for module in enabled_modules: - mod = llm.mods.get(module) - proxy_models.append(mapper.get_proxy_model(mod, model)) + mod = 
mods.registry.get(module) + candidates.append(mapper.get_proxy_model(mod, model)) + + for model in candidates: + should_serve = True - return JSONResponse(content=proxy_models, status_code=200) + if should_filter: + should_serve = selection.matches_filter(model, MODEL_FILTER.value) + if should_serve: + final.append(model) -async def fetch_stream(url: str, headers: dict, json_data: dict): - async with httpx.AsyncClient() as client: - async with client.stream( - "POST", url, headers=headers, json=json_data - ) as response: - async for chunk in response.aiter_bytes(): - yield chunk + logger.debug(f"Serving {len(final)} models in the API") + + return JSONResponse(content=final, status_code=200) @app.post("/v1/chat/completions") -async def post_boost_chat_completion(request: Request): +async def post_boost_chat_completion(request: Request, api_key: str = Depends(get_api_key)): body = await request.body() + logger.debug(f"Request body: {body}") + try: decoded = body.decode("utf-8") json_body = json.loads(decoded) @@ -71,24 +88,39 @@ async def post_boost_chat_completion(request: Request): logger.debug(f"Invalid JSON in request body: {body[:100]}") raise HTTPException(status_code=400, detail="Invalid JSON in request body") + # Refresh downstream models to ensure + # that we know where to route the requests await mapper.list_downstream() + # Get our proxy model configuration proxy_config = mapper.resolve_request_config(json_body) - proxy_llm = llm.LLM(**proxy_config) + proxy = llm.LLM(**proxy_config) + + # We don't want to trigger potentially + # expensive workflows for title generation + if mapper.is_title_generation_task(proxy): + logger.debug("Detected title generation task, skipping boost") + return JSONResponse(content=await proxy.chat_completion(), status_code=200) - # This is where the boost happens - completion = await proxy_llm.apply() + # This is where the "boost" happens + completion = await proxy.serve() - logger.debug('Completion: %s', completion) + if completion is None: + return JSONResponse( + content={"error": "No completion returned"}, status_code=500 + ) if stream: return StreamingResponse(completion, media_type="text/event-stream") else: - content = await proxy_llm.consume_stream(completion) + content = await proxy.consume_stream(completion) return JSONResponse(content=content, status_code=200) +# ------------ Startup ---------------- -logger.info(f"Boosting: {config.HARBOR_BOOST_APIS}") +logger.info(f"Boosting: {config.BOOST_APIS}") +if len(BOOST_AUTH) == 0: + logger.warn("No API keys specified - boost will accept all requests") if __name__ == "__main__": import uvicorn diff --git a/boost/src/mapper.py b/boost/src/mapper.py index 911c11a..5bdfffa 100644 --- a/boost/src/mapper.py +++ b/boost/src/mapper.py @@ -1,13 +1,12 @@ import httpx from typing import Dict -from fastapi import Request -from log import setup_logger -from llm import mods import config +import mods +import log -logger = setup_logger(__name__) +logger = log.setup_logger(__name__) MODEL_TO_BACKEND: Dict[str, str] = {} @@ -17,8 +16,8 @@ async def list_downstream(): all_models = [] for url, key in zip( - config.HARBOR_BOOST_APIS, - config.HARBOR_BOOST_KEYS, + config.BOOST_APIS, + config.BOOST_KEYS, ): try: endpoint = f"{url}/models" @@ -54,22 +53,21 @@ def get_proxy_model(module, model: dict) -> Dict: "name": f"{module.ID_PREFIX} {model['id']}", } + def resolve_proxy_model(model_id: str) -> Dict: parts = model_id.split("-") - - if parts[0] in mods: + if parts[0] in mods.registry: return "-".join(parts[1:]) - return 
model_id + def resolve_proxy_module(model_id: str) -> Dict: parts = model_id.split("-") - - if parts[0] in mods: + if parts[0] in mods.registry: return parts[0] - return None + def resolve_request_config(body: Dict) -> Dict: model = body.get("model") messages = body.get("messages") @@ -82,20 +80,26 @@ def resolve_request_config(body: Dict) -> Dict: proxy_module = resolve_proxy_module(model) proxy_backend = MODEL_TO_BACKEND.get(proxy_model) - logger.debug(f"Resolved proxy model: {proxy_model}, proxy module: {proxy_module}, proxy backend: {proxy_backend}") - + logger.debug( + f"Resolved proxy model: {proxy_model}, proxy module: {proxy_module}, proxy backend: {proxy_backend}" + ) - proxy_key = config.HARBOR_BOOST_KEYS[config.HARBOR_BOOST_APIS.index(proxy_backend)] + proxy_key = config.BOOST_KEYS[ + config.BOOST_APIS.index(proxy_backend)] return { "url": proxy_backend, - "headers": { - "Authorization": f"Bearer {proxy_key}", - "Content-Type": "application/json", - }, - + "headers": + { + "Authorization": f"Bearer {proxy_key}", + "Content-Type": "application/json", + }, "model": proxy_model, "params": params, "messages": messages, "module": proxy_module, } + +def is_title_generation_task(llm: 'LLM'): + # TODO: Better way to identify? + return llm.chat.has_substring("3-5 word title") diff --git a/boost/src/mods.py b/boost/src/mods.py new file mode 100644 index 0000000..dcde114 --- /dev/null +++ b/boost/src/mods.py @@ -0,0 +1,39 @@ +""" +This module is responsible for loading all the modules +from the configured folders and registering them in +the registry. +""" + +import os +import importlib + +from config import INTERMEDIATE_OUTPUT, BOOST_FOLDERS + +import log + +logger = log.setup_logger(__name__) + +registry = {} + + +def load_folder(folder): + logger.debug(f"Loading modules from '{folder}'") + for filename in os.listdir(folder): + is_target_mod = filename.endswith(".py") and filename != "__init__.py" + + if is_target_mod: + module_name = filename[:-3] + module = importlib.import_module(f"{folder}.{module_name}") + + if hasattr(module, "ID_PREFIX"): + logger.debug(f"Registering '{module.ID_PREFIX}'...") + registry[module_name] = module + + +for folder in BOOST_FOLDERS.value: + load_folder(folder) + +if len(registry) == 0: + logger.warning("No modules loaded. 
Is boost configured correctly?") +else: + logger.info(f"Loaded {len(registry)} modules: {', '.join(registry.keys())}") diff --git a/boost/src/modules/g1.py b/boost/src/modules/g1.py index 5de10b6..e26bacd 100644 --- a/boost/src/modules/g1.py +++ b/boost/src/modules/g1.py @@ -1,71 +1,68 @@ # g1 - the approach from: https://github.com/bklieger-groq/g1 # Harbor also uses same logic for ol1 service -from chat import Chat, ChatNode -from config import HARBOR_BOOST_G1_STRAT, HARBOR_BOOST_G1_STRAT_PARAMS, HARBOR_BOOST_G1_MAX_STEPS -from selection import apply_selection_strategy +from config import G1_STRAT, G1_STRAT_PARAMS, G1_MAX_STEPS import llm import log +import selection +import chat as ch logger = log.setup_logger(__name__) ID_PREFIX = "g1" -async def apply(chat: Chat, llm: 'llm.LLM'): - strat = HARBOR_BOOST_G1_STRAT.value - strat_params = HARBOR_BOOST_G1_STRAT_PARAMS.value - max_steps = HARBOR_BOOST_G1_MAX_STEPS.value +async def apply(chat: 'ch.Chat', llm: 'llm.LLM'): + strat = G1_STRAT.value + strat_params = G1_STRAT_PARAMS.value + max_steps = G1_MAX_STEPS.value debug_info = { "strat": strat, "strat_params": strat_params, "max_steps": max_steps, } - logger.debug(f"g1: {debug_info}") + logger.debug(f"{ID_PREFIX}: {debug_info}") - nodes = apply_selection_strategy( - chat, - strategy=HARBOR_BOOST_G1_STRAT.value, - params=HARBOR_BOOST_G1_STRAT_PARAMS.value - ) + nodes = selection.apply_strategy(chat, strategy=strat, params=strat_params) if (len(nodes) > 1): logger.warning( - "G1: Matched multiple nodes, only the first one will be processed." + f"{ID_PREFIX}: Matched multiple nodes, only the first one will be processed." ) if len(nodes) == 0: - log.info("G1: No nodes matched, skipping.") - return llm.stream_chat_completion(chat) + log.info(f"{ID_PREFIX}: No nodes matched, skipping.") + return await llm.stream_final_completion() node = nodes[0] - g1_chat = Chat( + output = ch.Chat( llm=llm, - tail=ChatNode( + tail=ch.ChatNode( role="system", content= f"""You are an expert AI assistant that explains your reasoning step by step. For each step, provide a title that describes what you're doing in that step, along with the content. Decide if you need another step or if you're ready to give the final answer. In your response write "ACTION" followed by either 'continue' or 'final_answer'. USE AS MANY REASONING STEPS AS POSSIBLE. AT LEAST 3. BE AWARE OF YOUR LIMITATIONS AS AN LLM AND WHAT YOU CAN AND CANNOT DO. IN YOUR REASONING, INCLUDE EXPLORATION OF ALTERNATIVE ANSWERS. CONSIDER YOU MAY BE WRONG, AND IF YOU ARE WRONG IN YOUR REASONING, WHERE IT WOULD BE. FULLY TEST ALL OTHER POSSIBILITIES. YOU CAN BE WRONG. WHEN YOU SAY YOU ARE RE-EXAMINING, ACTUALLY RE-EXAMINE, AND USE ANOTHER APPROACH TO DO SO. DO NOT JUST SAY YOU ARE RE-EXAMINING. USE AT LEAST 3 METHODS TO DERIVE THE ANSWER. USE BEST PRACTICES.""" .strip() ) ) - g1_chat.user(node.content) - g1_chat.assistant( + output.user(node.content) + output.assistant( "Thank you! I will now think step by step following my instructions, starting at the beginning after decomposing the problem." ) + steps = 0 while True: - await g1_chat.advance() - - tail = g1_chat.tail - if tail.role == "assistant" and "final_answer" in tail.content: - break + await llm.emit_status(f'Step: {steps + 1}') + await output.emit_advance() + steps += 1 - if len(g1_chat.history()) >= max_steps: + if output.tail.contains("final_answer") or steps >= max_steps: break - g1_chat.user("Please provide the final answer based on your reasoning above. 
You don't have to mention 'ACTION' in your response.") - - return llm.stream_chat_completion(g1_chat) + output.user( + "Please provide the final answer based on your reasoning above. You don't have to mention 'ACTION' in your response." + ) + await llm.emit_status('Final Answer') + await llm.stream_final_completion(chat=output) diff --git a/boost/src/modules/klmbr.py b/boost/src/modules/klmbr.py index 681071b..8d42565 100644 --- a/boost/src/modules/klmbr.py +++ b/boost/src/modules/klmbr.py @@ -3,12 +3,12 @@ import random -from chat import Chat -from config import HARBOR_BOOST_KLMBR_MODS, HARBOR_BOOST_KLMBR_PERCENTAGE, HARBOR_BOOST_KLMBR_STRAT, HARBOR_BOOST_KLMBR_STRAT_PARAMS -from selection import apply_selection_strategy +from config import KLMBR_MODS, KLMBR_PERCENTAGE, KLMBR_STRAT, KLMBR_STRAT_PARAMS import log import llm +import chat as ch +import selection logger = log.setup_logger(__name__) @@ -164,11 +164,11 @@ def modify_text(**kwargs): return modified_text, word_mapping -async def apply(chat: Chat, llm: 'llm.LLM'): - strat = HARBOR_BOOST_KLMBR_STRAT.value - strat_params = HARBOR_BOOST_KLMBR_STRAT_PARAMS.value - percentage = HARBOR_BOOST_KLMBR_PERCENTAGE.value - mods = HARBOR_BOOST_KLMBR_MODS.value +async def apply(chat: 'ch.Chat', llm: 'llm.LLM'): + strat = KLMBR_STRAT.value + strat_params = KLMBR_STRAT_PARAMS.value + percentage = KLMBR_PERCENTAGE.value + mods = KLMBR_MODS.value debug_info = { "strat": strat, "strat_params": strat_params, @@ -176,15 +176,16 @@ async def apply(chat: Chat, llm: 'llm.LLM'): "mods": mods, } - logger.debug(f"klmbr: {debug_info}") + logger.debug(f"{ID_PREFIX}: {debug_info}") - nodes = apply_selection_strategy(chat, strategy=strat, params=strat_params) + nodes = selection.apply_strategy(chat, strategy=strat, params=strat_params) for node in nodes: content, mapping = modify_text( text=node.content, percentage=percentage, mods=mods ) node.content = content - node.meta["klmbr"] = mapping + node.meta[ID_PREFIX] = mapping - return llm.stream_chat_completion(chat=chat) + await llm.emit_status(llm.chat.tail.content) + await llm.stream_final_completion(chat=chat) diff --git a/boost/src/modules/mcts.py b/boost/src/modules/mcts.py new file mode 100644 index 0000000..6abc9f6 --- /dev/null +++ b/boost/src/modules/mcts.py @@ -0,0 +1,324 @@ +# Recursive Certainty Validation - RCN +# aka "Are you sure? + +import re +from typing import Optional, List +from config import MCTS_STRAT, MCTS_STRAT_PARAMS, MCTS_EXPLORATION_CONSTANT, MCTS_MAX_SIMULATIONS, MCTS_MAX_ITERATIONS, MCTS_THOUGHTS + +import llm +import log +import random +import math +import chat as ch +import selection + +# ============================================================================== + +logger = log.setup_logger(__name__) +ID_PREFIX = "mcts" + +# ============================================================================== + +thoughts_prompt = """ + +Give a suggestion on how this answer can be improved. +WRITE ONLY AN IMPROVEMENT SUGGESTION AND NOTHING ELSE. +YOUR REPLY SHOULD BE A SINGLE SENTENCE. + + + +{question} + + + +{answer} + +""".strip() + +eval_answer_prompt = """ +Given the following text: +"{answer}" + +How well does it answers this question: +"{question}" + +Rate the answer from 1 to 10, where 1 is completely wrong or irrelevant and 10 is a perfect answer. +Reply with a single number between 1 and 10 only. Do not write anything else, it will be discarded. +THINK CAREFULLY AND USE BEST PRACTICES. 
+""".strip() + +analyze_prompt = """ +Iteration Analysis: + +Original question: {question} +Best answer found: {best_answer} +Best score achieved: {best_score} + +Analyze this iteration of the thought process. Consider the following: +1. What aspects of the best answer made it successful? +2. What patterns or approaches led to higher-scoring thoughts? +3. Were there any common pitfalls or irrelevant tangents in lower-scoring thoughts? +4. How can the thought generation process be improved for the next iteration? + +Provide a concise analysis and suggest one specific improvement strategy for the next iteration. +""".strip() + +update_prompt = """ + +Your task is to read the question and the answer below, then analyse the given critique. +When you are done - think about how the answer can be improved based on the critique. +WRITE A REVISED ANSWER THAT ADDRESSES THE CRITIQUE. DO NOT WRITE ANYTHING ELSE. + + +{question} + + +{answer} + + +{improvements} + +""".strip() + +initial_prompt = """ + +Answer the question below. Do not pay attention to, unexpected casing, punctuation or accent marks. + + + +{question} + +""" + + +class MCTSNode(ch.ChatNode): + children: List['MCTSNode'] + exploration_weight: float + max_children = 2 + + def fully_expanded(self): + return len(self.children) >= self.max_children + + def uct_value(self): + epsilon = 1e-6 + + return self.value / (self.visits + + epsilon) + self.exploration_weight * math.sqrt( + math.log(self.parent.visits) / + (self.visits + epsilon) + ) + + def mermaid(self, offset=0, selected=None): + padding = " " * offset + msg = f"{padding}{self.id}({self.id}:{self.visits} - {escape_mermaid(self.content[:25])})\n" + + if selected == self.id: + msg += f"{padding}style {self.id} stroke:#0ff\n" + + for child in self.children: + msg += child.mermaid(offset + 4, selected) + msg += f"{padding}{self.id} --> {child.id}\n" + + return msg + + +class MCTS: + question: str + root: MCTSNode + llm: 'llm.LLM' + selected: Optional['ch.ChatNode'] + exploration_weight: float + thoughts: int + + def __init__(self, **kwargs): + self.question = kwargs.get("question") + self.root = kwargs.get("root") + self.llm = kwargs.get("llm") + self.selected = None + self.exploration_weight = kwargs.get( + "exploration_weight", MCTS_EXPLORATION_CONSTANT.value + ) + self.thoughts = kwargs.get("thoughts", MCTS_THOUGHTS.value) + + async def select(self): + logger.debug("Selecting node...") + node = self.root + while node.children: + node = self.uct_select(node) + return node + + async def expand(self, node): + logger.debug(f"Expanding node {node.id}...") + await self.llm.emit_status(f"Thinking about {node.id}...") + + for _ in range(random.randint(self.thoughts, self.thoughts + 1)): + thought = await self.generate_thought(node.content) + await self.llm.emit_message(f"\nThought: {thought}\n") + new_content = await self.update_approach(node.content, thought) + child = self.create_node(content=new_content, parent=node) + node.add_child(child) + + return random.choice(node.children) + + async def simulate(self, node: MCTSNode): + logger.debug(f"Simulating node {node.id}...") + await self.llm.emit_status(f"Thinking about {node.id}...") + await self.llm.emit_message(self.mermaid()) + return await self.evaluate_answer(node.content) + + def backpropagate(self, node: MCTSNode, score: float): + logger.debug(f"Backpropagating from {node.id}...") + while node: + node.visits += 1 + node.value += score + node = node.parent + + def uct_select(self, node: MCTSNode): + logger.debug(f"Selecting uct 
{node.id}...") + return max(node.children, key=lambda child: child.uct_value()) + + def best_child(self): + return self.root.best_child() + + async def search(self, num_simulations): + logger.debug("Starting search...") + + for _ in range(num_simulations): + leaf = await self.select() + self.selected = leaf + if not leaf.fully_expanded(): + leaf = await self.expand(leaf) + score = await self.simulate(leaf) + self.backpropagate(leaf, score) + + return self.selected + + def create_node(self, **kwargs): + node = MCTSNode(**kwargs) + node.exploration_weight = self.exploration_weight + + return node + + async def generate_thought(self, answer): + return await self.llm.chat_completion( + prompt=thoughts_prompt, + answer=answer, + question=self.question, + resolve=True + ) + + async def analyze_iteration(self, best_answer, best_score): + return await self.llm.chat_completion( + prompt=analyze_prompt, + question=self.question, + best_answer=best_answer, + best_score=best_score, + resolve=True + ) + + async def update_approach(self, answer, improvements): + return await self.llm.chat_completion( + prompt=update_prompt, + question=self.question, + answer=answer, + improvements=improvements, + resolve=True, + ) + + async def evaluate_answer(self, answer): + result = await self.llm.chat_completion( + prompt=eval_answer_prompt, + answer=answer, + question=self.question, + resolve=True, + ) + + try: + score = re.search(r"\d+", result).group() + return int(score) + except AttributeError: + logger.error(f"AnswerEval: unable to parse \"{result[:100]}\"") + return 0 + + def mermaid(self, selected=None): + return f""" +```mermaid +graph LR +{self.root.mermaid(0, selected.id if selected else self.selected.id)} +``` +""" + + +def escape_mermaid(text): + return text.replace('"', """).replace("(", "(").replace(")", ")") + + +# ============================================================================== +async def apply(chat: 'ch.Chat', llm: 'llm.LLM'): + strat = MCTS_STRAT.value + strat_params = MCTS_STRAT_PARAMS.value + exploration_constant = MCTS_EXPLORATION_CONSTANT.value + max_simulations = MCTS_MAX_SIMULATIONS.value + max_iterations = MCTS_MAX_ITERATIONS.value + thoughts = MCTS_THOUGHTS.value + + debug_info = { + "strat": strat, + "strat_params": strat_params, + "exploration_constant": exploration_constant, + "max_simulations": max_simulations, + "max_iterations": max_iterations, + "thoughts": thoughts, + } + + logger.debug(f"{ID_PREFIX}: {debug_info}") + nodes = selection.apply_strategy(chat, strategy=strat, params=strat_params) + + if (len(nodes) > 1): + logger.warning( + f"{ID_PREFIX}: Matched multiple nodes, only the first one will be processed." 
+ ) + + if len(nodes) == 0: + log.info(f"{ID_PREFIX}: No nodes matched, skipping.") + return llm.stream_chat_completion() + + node = nodes[0] + question = node.content + + await llm.emit_status('Preparing initial thoughts...') + mcts_chat = ch.Chat( + llm=llm, + tail=MCTSNode( + role="user", content=initial_prompt.format(question=question) + ) + ) + mcts_chat.chat_node_type = MCTSNode + mcts_chat.llm = llm + await mcts_chat.emit_advance() + + await llm.emit_status('Starting MCTS search...') + mcts = MCTS( + question=question, + root=mcts_chat.tail, + llm=llm, + exploration_weight=exploration_constant, + thoughts=thoughts + ) + + best_answer = None + best_score = -float("inf") + + for i in range(max_iterations): + await llm.emit_status(f"MCTS iteration {i + 1}/{max_iterations}...") + best_node = await mcts.search(max_simulations) + score = await mcts.evaluate_answer(best_node.content) + + if score > best_score: + best_answer = best_node.content + best_score = score + + # Final completion + mcts_chat.assistant(f"Here is the best answer I can think of: {best_answer}") + mcts_chat.user('Thank you, now please summarize it for me.') + await llm.stream_final_completion(chat=mcts_chat) diff --git a/boost/src/modules/rcn.py b/boost/src/modules/rcn.py index 08019e6..5771453 100644 --- a/boost/src/modules/rcn.py +++ b/boost/src/modules/rcn.py @@ -1,52 +1,47 @@ # Recursive Certainty Validation - RCN # aka "Are you sure? -from chat import Chat -from config import HARBOR_BOOST_RCN_STRAT, HARBOR_BOOST_RCN_STRAT_PARAMS -from selection import apply_selection_strategy +from config import RCN_STRAT, RCN_STRAT_PARAMS import llm import log +import chat as ch +import selection logger = log.setup_logger(__name__) ID_PREFIX = "rcn" -async def apply(chat: Chat, llm: 'llm.LLM'): - strat = HARBOR_BOOST_RCN_STRAT.value - strat_params = HARBOR_BOOST_RCN_STRAT_PARAMS.value +async def apply(chat: 'ch.Chat', llm: 'llm.LLM'): + strat = RCN_STRAT.value + strat_params = RCN_STRAT_PARAMS.value debug_info = { "strat": strat, "strat_params": strat_params, } - logger.debug(f"rcn: {debug_info}") - - nodes = apply_selection_strategy( - chat, - strategy=HARBOR_BOOST_RCN_STRAT.value, - params=HARBOR_BOOST_RCN_STRAT_PARAMS.value - ) + logger.debug(f"{ID_PREFIX}: {debug_info}") + nodes = selection.apply_strategy(chat, strategy=strat, params=strat_params) if (len(nodes) > 1): logger.warning( - "RCN: Matched multiple nodes, only the first one will be processed." + f"{ID_PREFIX}: Matched multiple nodes, only the first one will be processed." ) if len(nodes) == 0: - log.info("RCN: No nodes matched, skipping.") + log.info(f"{ID_PREFIX}: No nodes matched, skipping.") return llm.stream_chat_completion(chat) node = nodes[0] question = node.content - rcn_chat = Chat.from_conversation( + output = chat.Chat.from_conversation( [ { "role": "system", - "content": + "content": """ YOU HAVE LIMITATIONS AS AN LLM. DO NOT OVERCOMPLICATE THINGS. YOU MAKE MISTAKES ALL THE TIME, SO BE CAREFUL IN YOUR REASONING. WHEN SOLVING PROBLEMS - DECOMPOSE THEM INTO SMALLER PARTS. SOLVE PARTS ONE BY ONE SEQUENTIALLY. @@ -66,14 +61,15 @@ async def apply(chat: Chat, llm: 'llm.LLM'): } ] ) - rcn_chat.llm = llm + output.llm = llm - await rcn_chat.advance() - rcn_chat.user("Are you sure?") - await rcn_chat.advance() - rcn_chat.user("Is this yout final answer?") - await rcn_chat.advance() - rcn_chat.user("Now prepare your final answer. Write it as a response to this message. 
Do not write anything else.") + await output.advance() + output.user("Are you sure?") + await output.advance() + output.user("Is this yout final answer?") + await output.advance() + output.user( + "Now prepare your final answer. Write it as a response to this message. Do not write anything else." + ) - # This is streamed back - return llm.stream_chat_completion(rcn_chat) + await llm.stream_final_completion(output) diff --git a/boost/src/selection.py b/boost/src/selection.py index 9722492..399214c 100644 --- a/boost/src/selection.py +++ b/boost/src/selection.py @@ -1,4 +1,5 @@ import random +import re from chat import Chat @@ -9,6 +10,7 @@ def percentage(chat: Chat, **kwargs): return nodes[:num_nodes] + def match(chat: Chat, **kwargs): substring = kwargs.get("substring", "") role = kwargs.get("role", "") @@ -27,18 +29,23 @@ def match(chat: Chat, **kwargs): return nodes + def user(chat: Chat): return match(chat, role="user") + def all(chat: Chat): return chat.plain() + def first(chat: Chat): return match(chat, index=0) + def last(chat: Chat): return match(chat, index=-1) + def any(chat: Chat): return [random.choice(chat.plain())] @@ -53,5 +60,35 @@ def any(chat: Chat): "user": user, } -def apply_selection_strategy(chat: Chat, strategy: str, params: dict): - return selection_strategies[strategy](chat, **params) \ No newline at end of file + +def apply_strategy(chat: Chat, strategy: str, params: dict): + return selection_strategies[strategy](chat, **params) + +def match_regex(value, regex): + return bool(re.match(regex, value)) + +def match_substring(value, substring): + return substring in value + +def match_exact(value, target): + return value == target + +def matches_filter(obj: dict, filter: dict): + for key in filter.keys(): + value = filter[key] + field, operation = key.split('.') if '.' 
in key else (key, 'exact') + + if field not in obj: + return False + + if operation == 'regex': + if not match_regex(str(obj[field]), value): + return False + elif operation == 'contains': + if not match_substring(str(obj[field]), value): + return False + else: + if not match_exact(str(obj[field]), value): + return False + + return True \ No newline at end of file diff --git a/compose.boost.yml b/compose.boost.yml index b18379d..45220c0 100644 --- a/compose.boost.yml +++ b/compose.boost.yml @@ -15,14 +15,5 @@ services: - ${HARBOR_VLLM_CACHE}:/root/.cache/vllm ports: - ${HARBOR_BOOST_HOST_PORT}:8000 - environment: - - HARBOR_BOOST_KLMBR_PERCENTAGE=${HARBOR_BOOST_KLMBR_PERCENTAGE} - - HARBOR_BOOST_KLMBR_MODS=${HARBOR_BOOST_KLMBR_MODS} - - HARBOR_BOOST_KLMBR_STRAT=${HARBOR_BOOST_KLMBR_STRAT} - - HARBOR_BOOST_KLMBR_STRAT_PARAMS=${HARBOR_BOOST_KLMBR_STRAT_PARAMS} - - HARBOR_BOOST_RCN_STRAT=${HARBOR_BOOST_RCN_STRAT} - - HARBOR_BOOST_RCN_STRAT_PARAMS=${HARBOR_BOOST_RCN_STRAT_PARAMS} - - HARBOR_BOOST_OPENAI_URLS=${HARBOR_BOOST_OPENAI_URLS} - - HARBOR_BOOST_OPENAI_KEYS=${HARBOR_BOOST_OPENAI_KEYS} networks: - harbor-network diff --git a/harbor.sh b/harbor.sh index bfab92e..9010111 100755 --- a/harbor.sh +++ b/harbor.sh @@ -3300,7 +3300,7 @@ run_stt_command() { # ======================================================================== # Globals -version="0.1.30" +version="0.1.31" harbor_repo_url="https://github.com/av/harbor.git" delimiter="|" scramble_exit_code=42 diff --git a/http-catalog/boost.http b/http-catalog/boost.http index d74e7d0..c4e7ba8 100644 --- a/http-catalog/boost.http +++ b/http-catalog/boost.http @@ -37,7 +37,9 @@ Authorization: sk-fake "messages": [ {"role": "user", "content": "Suggest me a random color"} ], - "temperature": 0 + "temperature": 0, + "max_tokens": 20, + "stream": true } ### @@ -98,4 +100,19 @@ Authorization: sk-fake ], "temperature": 0, "stream": false +} + +### + +POST {{host}}/v1/chat/completions +Content-Type: application/json +Authorization: sk-boost + +{ + "model": "example-llama3.1:8b", + "messages": [ + {"role": "user", "content": "Boost?"} + ], + "temperature": 0, + "stream": false } \ No newline at end of file diff --git a/http-catalog/ollama.http b/http-catalog/ollama.http index e45602f..59a585a 100644 --- a/http-catalog/ollama.http +++ b/http-catalog/ollama.http @@ -36,7 +36,7 @@ curl {{host}}/v1/chat/completions -H 'Content-Type: application/json' -H "Author } ], "max_tokens": 30, - "stream": false + "stream": true }' ### diff --git a/package.json b/package.json index 942189a..2729453 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@av/harbor", - "version": "0.1.30", + "version": "0.1.31", "bin": { "harbor": "./bin/harbor" } diff --git a/profiles/default.env b/profiles/default.env index b909e5e..70abf58 100644 --- a/profiles/default.env +++ b/profiles/default.env @@ -324,6 +324,13 @@ HARBOR_BOOST_HOST_PORT=34131 HARBOR_BOOST_OPENAI_URLS="" HARBOR_BOOST_OPENAI_KEYS="" HARBOR_BOOST_MODULES="klmbr;rcn;g1" +HARBOR_BOOST_MODULE_FOLDERS="modules;custom_modules" +HARBOR_BOOST_INTERMEDIATE_OUTPUT=true +HARBOR_BOOST_STATUS_STYLE="md:codeblock" +HARBOR_BOOST_BASE_MODELS="false" +HARBOR_BOOST_MODEL_FILTER="" +HARBOR_BOOST_API_KEY="" +HARBOR_BOOST_API_KEYS="" # Boost - klmbr HARBOR_BOOST_KLMBR_PERCENTAGE=35 HARBOR_BOOST_KLMBR_MODS="all" @@ -336,6 +343,12 @@ HARBOR_BOOST_RCN_STRAT_PARAMS="role=user,index=-1" HARBOR_BOOST_G1_STRAT="match" HARBOR_BOOST_G1_STRAT_PARAMS="role=user,index=-1" HARBOR_BOOST_G1_MAX_STEPS=15 +# Boost - mcts 
+HARBOR_BOOST_MCTS_STRAT="match"
+HARBOR_BOOST_MCTS_STRAT_PARAMS="role=user,index=-1"
+HARBOR_BOOST_MCTS_MAX_SIMULATIONS=2
+HARBOR_BOOST_MCTS_MAX_ITERATIONS=2
+HARBOR_BOOST_MCTS_THOUGHTS=2
 # OpenHands
 HARBOR_OPENHANDS_HOST_PORT=34141
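For reference, a minimal custom module that the loader introduced in boost/src/mods.py would pick up from custom_modules/ might look like the sketch below. The module name "cheer" and its prompt are hypothetical; the sketch only relies on the ID_PREFIX/apply() contract and the llm.emit_status / llm.stream_final_completion helpers demonstrated in custom_modules/example.py above.

```python
# custom_modules/cheer.py - hypothetical minimal boost module
import llm
import log
import chat as ch

# Keep ID_PREFIX equal to the file name so that mapper.py can route
# "cheer-<model>" requests back to this module via mods.registry.
ID_PREFIX = 'cheer'
logger = log.setup_logger(ID_PREFIX)


async def apply(chat: 'ch.Chat', llm: 'llm.LLM'):
  # Intermediate output; hidden when HARBOR_BOOST_INTERMEDIATE_OUTPUT=false
  await llm.emit_status('cheer: rewriting the last user message')

  # The final completion is always streamed back to the client
  await llm.stream_final_completion(
    prompt="Reply to the following message as cheerfully as you can:\n\n{message}",
    message=chat.tail.content,
  )
```

To try such a module, its name would be appended to HARBOR_BOOST_MODULES (e.g. "klmbr;rcn;g1;cheer") and the boosted model requested as "cheer-llama3.1:8b", mirroring the "example-llama3.1:8b" request in http-catalog/boost.http.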