diff --git a/examples/audio_example.py b/examples/audio_example.py
new file mode 100644
index 00000000..146b0002
--- /dev/null
+++ b/examples/audio_example.py
@@ -0,0 +1,70 @@
+"""
+Example: Multimodal RLM - Audio Support (TTS and Transcription)
+
+This demonstrates the audio capabilities of RLM:
+- speak() for text-to-speech
+- audio_query() for audio transcription/analysis
+"""
+
+import os
+
+from dotenv import load_dotenv
+
+from rlm import RLM
+from rlm.logger import RLMLogger
+
+load_dotenv()
+
+# Get the directory where this script is located
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+logger = RLMLogger(log_dir="./logs")
+
+# Use Gemini which supports audio
+rlm = RLM(
+    backend="gemini",
+    backend_kwargs={
+        "model_name": "gemini-2.5-flash",
+        "api_key": os.getenv("GEMINI_API_KEY"),
+    },
+    environment="local",
+    environment_kwargs={},
+    max_depth=1,
+    logger=logger,
+    verbose=True,
+    enable_multimodal=True,  # Enable multimodal functions (vision_query, audio_query, speak)
+)
+
+# Example 1: Text-to-Speech
+# Ask RLM to generate speech
+context = {
+    "task": "Generate a spoken greeting",
+    "message": "Hello! This is a test of the RLM text-to-speech capability.",
+    "output_path": os.path.join(SCRIPT_DIR, "generated_speech.aiff"),
+}
+
+result = rlm.completion(
+    prompt=context,
+    root_prompt="Use speak(text, output_path) to convert context['message'] to audio and save it to context['output_path']. Return the path.",
+)
+
+print("\n" + "=" * 50)
+print("TTS RESULT:")
+print("=" * 50)
+print(result.response)
+
+
+# Example 2: Audio Analysis (if you have an audio file)
+# Uncomment this section if you have an audio file to analyze:
+"""
+audio_context = {
+    "task": "Transcribe the audio",
+    "audio_file": "/path/to/your/audio.mp3",
+}
+
+result = rlm.completion(
+    prompt=audio_context,
+    root_prompt="Use audio_query to transcribe the audio file.",
+)
+print(result.response)
+"""
diff --git a/examples/multimodal_example.py b/examples/multimodal_example.py
new file mode 100644
index 00000000..8855e734
--- /dev/null
+++ b/examples/multimodal_example.py
@@ -0,0 +1,52 @@
+"""
+Example: Multimodal RLM - Analyzing Images with Vision
+
+This demonstrates the multimodal capabilities of RLM using the
+vision_query() function to analyze images.
+"""
+
+import os
+
+from dotenv import load_dotenv
+
+from rlm import RLM
+from rlm.logger import RLMLogger
+
+load_dotenv()
+
+# Get the directory where this script is located
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+TEST_IMAGE = os.path.join(SCRIPT_DIR, "test_image.png")
+
+logger = RLMLogger(log_dir="./logs")
+
+# Use Gemini which supports vision
+rlm = RLM(
+    backend="gemini",
+    backend_kwargs={
+        "model_name": "gemini-2.5-flash",
+        "api_key": os.getenv("GEMINI_API_KEY"),
+    },
+    environment="local",
+    environment_kwargs={},
+    max_depth=1,
+    logger=logger,
+    verbose=True,
+    enable_multimodal=True,  # Enable multimodal functions (vision_query, audio_query, speak)
+)
+
+# Create a context that includes references to images
+context = {
+    "query": "Analyze the image and tell me what fruits are visible.",
+    "images": [TEST_IMAGE],
+}
+
+result = rlm.completion(
+    prompt=context,
+    root_prompt="What fruits are in the image? Use vision_query to analyze the image.",
+)
+
+print("\n" + "=" * 50)
+print("FINAL RESULT:")
+print("=" * 50)
+print(result.response)
diff --git a/rlm/clients/gemini.py b/rlm/clients/gemini.py
index 7f6dc152..81e686a6 100644
--- a/rlm/clients/gemini.py
+++ b/rlm/clients/gemini.py
@@ -1,5 +1,7 @@
+import base64
 import os
 from collections import defaultdict
+from pathlib import Path
 from typing import Any
 
 from dotenv import load_dotenv
@@ -11,6 +13,119 @@
 load_dotenv()
 
+
+def _load_image_as_part(image_source: str | dict) -> types.Part:
+    """Load an image and return a Gemini Part object.
+
+    Args:
+        image_source: Either a file path (str), URL (str starting with http),
+            or a dict with 'type' and 'data' keys for base64 images.
+
+    Returns:
+        A Gemini Part object containing the image.
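+
+    Example (illustrative; the file path is a placeholder):
+        part = _load_image_as_part("/path/to/image.png")
+        part = _load_image_as_part({"type": "base64", "media_type": "image/png", "data": "..."})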
+    """
+    if isinstance(image_source, dict):
+        # Base64 encoded image: {"type": "base64", "media_type": "image/png", "data": "..."}
+        if image_source.get("type") == "base64":
+            image_bytes = base64.b64decode(image_source["data"])
+            mime_type = image_source.get("media_type", "image/png")
+            return types.Part.from_bytes(data=image_bytes, mime_type=mime_type)
+        # URL format from OpenAI-style: {"type": "image_url", "image_url": {"url": "..."}}
+        elif image_source.get("type") == "image_url":
+            url = image_source["image_url"]["url"]
+            if url.startswith("data:"):
+                # Data URL: data:image/png;base64,...
+                header, data = url.split(",", 1)
+                mime_type = header.split(":")[1].split(";")[0]
+                image_bytes = base64.b64decode(data)
+                return types.Part.from_bytes(data=image_bytes, mime_type=mime_type)
+            else:
+                return types.Part.from_uri(file_uri=url, mime_type="image/jpeg")
+    elif isinstance(image_source, str):
+        if image_source.startswith(("http://", "https://")):
+            # URL
+            return types.Part.from_uri(file_uri=image_source, mime_type="image/jpeg")
+        else:
+            # Local file path
+            path = Path(image_source)
+            if path.exists():
+                mime_type = _get_mime_type(path)
+                with open(path, "rb") as f:
+                    return types.Part.from_bytes(data=f.read(), mime_type=mime_type)
+            else:
+                raise FileNotFoundError(f"Image file not found: {image_source}")
+    raise ValueError(f"Unsupported image source type: {type(image_source)}")
+
+
+def _get_mime_type(path: Path) -> str:
+    """Get MIME type from file extension."""
+    suffix = path.suffix.lower()
+    mime_types = {
+        # Images
+        ".jpg": "image/jpeg",
+        ".jpeg": "image/jpeg",
+        ".png": "image/png",
+        ".gif": "image/gif",
+        ".webp": "image/webp",
+        ".bmp": "image/bmp",
+        # Audio
+        ".mp3": "audio/mpeg",
+        ".wav": "audio/wav",
+        ".ogg": "audio/ogg",
+        ".flac": "audio/flac",
+        ".m4a": "audio/mp4",
+        ".aac": "audio/aac",
+        ".webm": "audio/webm",
+        # Video
+        ".mp4": "video/mp4",
+        ".mpeg": "video/mpeg",
+        ".mov": "video/quicktime",
+        ".avi": "video/x-msvideo",
+        ".mkv": "video/x-matroska",
+    }
+    return mime_types.get(suffix, "application/octet-stream")
+
+
+def _load_audio_as_part(audio_source: str | dict) -> types.Part:
+    """Load an audio file and return a Gemini Part object.
+
+    Args:
+        audio_source: Either a file path (str), URL (str starting with http),
+            or a dict with 'type' and 'data' keys for base64 audio.
+
+    Returns:
+        A Gemini Part object containing the audio.
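+
+    Example (illustrative; the file path is a placeholder):
+        part = _load_audio_as_part("/path/to/your/audio.mp3")
+        part = _load_audio_as_part({"type": "audio_path", "path": "/path/to/your/audio.mp3"})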
+ """ + if isinstance(audio_source, dict): + # Base64 encoded audio + if audio_source.get("type") == "base64": + audio_bytes = base64.b64decode(audio_source["data"]) + mime_type = audio_source.get("media_type", "audio/mpeg") + return types.Part.from_bytes(data=audio_bytes, mime_type=mime_type) + # Path format + elif audio_source.get("type") == "audio_path": + path = Path(audio_source.get("path", "")) + if path.exists(): + mime_type = _get_mime_type(path) + with open(path, "rb") as f: + return types.Part.from_bytes(data=f.read(), mime_type=mime_type) + else: + raise FileNotFoundError(f"Audio file not found: {audio_source.get('path')}") + elif isinstance(audio_source, str): + if audio_source.startswith(("http://", "https://")): + # URL - let Gemini fetch it + return types.Part.from_uri(file_uri=audio_source, mime_type="audio/mpeg") + else: + # Local file path + path = Path(audio_source) + if path.exists(): + mime_type = _get_mime_type(path) + with open(path, "rb") as f: + return types.Part.from_bytes(data=f.read(), mime_type=mime_type) + else: + raise FileNotFoundError(f"Audio file not found: {audio_source}") + raise ValueError(f"Unsupported audio source type: {type(audio_source)}") + DEFAULT_GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") @@ -95,7 +210,18 @@ async def acompletion( def _prepare_contents( self, prompt: str | list[dict[str, Any]] ) -> tuple[list[types.Content] | str, str | None]: - """Prepare contents and extract system instruction for Gemini API.""" + """Prepare contents and extract system instruction for Gemini API. + + Supports multimodal content where message content can be: + - A string (text only) + - A list of content items (text and images mixed) + + Image items can be: + - {"type": "text", "text": "..."} + - {"type": "image_url", "image_url": {"url": "..."}} + - {"type": "image_path", "path": "/path/to/image.png"} + - {"type": "base64", "media_type": "image/png", "data": "..."} + """ system_instruction = None if isinstance(prompt, str): @@ -110,20 +236,80 @@ def _prepare_contents( if role == "system": # Gemini handles system instruction separately - system_instruction = content - elif role == "user": - contents.append(types.Content(role="user", parts=[types.Part(text=content)])) - elif role == "assistant": - # Gemini uses "model" instead of "assistant" - contents.append(types.Content(role="model", parts=[types.Part(text=content)])) + if isinstance(content, str): + system_instruction = content + elif isinstance(content, list): + # Extract text from system message list + system_parts = [] + for item in content: + if isinstance(item, str): + system_parts.append(item) + elif isinstance(item, dict) and item.get("type") == "text": + system_parts.append(item.get("text", "")) + system_instruction = "\n".join(system_parts) + elif role in ("user", "assistant"): + gemini_role = "user" if role == "user" else "model" + parts = self._content_to_parts(content) + if parts: + contents.append(types.Content(role=gemini_role, parts=parts)) else: # Default to user role for unknown roles - contents.append(types.Content(role="user", parts=[types.Part(text=content)])) + parts = self._content_to_parts(content) + if parts: + contents.append(types.Content(role="user", parts=parts)) return contents, system_instruction raise ValueError(f"Invalid prompt type: {type(prompt)}") + def _content_to_parts(self, content: str | list) -> list[types.Part]: + """Convert message content to Gemini Parts. + + Args: + content: Either a string or a list of content items. + + Returns: + List of Gemini Part objects. 
+ """ + if isinstance(content, str): + return [types.Part(text=content)] + + if isinstance(content, list): + parts = [] + for item in content: + if isinstance(item, str): + parts.append(types.Part(text=item)) + elif isinstance(item, dict): + item_type = item.get("type", "text") + if item_type == "text": + parts.append(types.Part(text=item.get("text", ""))) + elif item_type in ("image_url", "image_path", "base64"): + try: + # Use image_path for local files + if item_type == "image_path": + image_part = _load_image_as_part(item.get("path", "")) + else: + image_part = _load_image_as_part(item) + parts.append(image_part) + except Exception as e: + # If image loading fails, add error as text + parts.append(types.Part(text=f"[Image load error: {e}]")) + elif item_type == "audio_path": + try: + audio_part = _load_audio_as_part(item.get("path", "")) + parts.append(audio_part) + except Exception as e: + parts.append(types.Part(text=f"[Audio load error: {e}]")) + elif item_type == "audio_url": + try: + audio_part = _load_audio_as_part(item.get("url", "")) + parts.append(audio_part) + except Exception as e: + parts.append(types.Part(text=f"[Audio load error: {e}]")) + return parts + + return [types.Part(text=str(content))] + def _track_cost(self, response: types.GenerateContentResponse, model: str): self.model_call_counts[model] += 1 diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py index 5f303928..f9a0a45b 100644 --- a/rlm/core/rlm.py +++ b/rlm/core/rlm.py @@ -21,6 +21,7 @@ format_iteration, ) from rlm.utils.prompts import ( + RLM_MULTIMODAL_SYSTEM_PROMPT, RLM_SYSTEM_PROMPT, QueryMetadata, build_rlm_system_prompt, @@ -52,6 +53,7 @@ def __init__( logger: RLMLogger | None = None, verbose: bool = False, persistent: bool = False, + enable_multimodal: bool = False, ): """ Args: @@ -68,6 +70,7 @@ def __init__( logger: The logger to use for the RLM. verbose: Whether to print verbose output in rich to console. persistent: If True, reuse the environment across completion() calls for multi-turn conversations. + enable_multimodal: If True, enable multimodal functions (vision_query, audio_query, speak) in the REPL environment. 
""" # Store config for spawning per-completion self.backend = backend @@ -90,7 +93,16 @@ def __init__( self.depth = depth self.max_depth = max_depth self.max_iterations = max_iterations - self.system_prompt = custom_system_prompt if custom_system_prompt else RLM_SYSTEM_PROMPT + self.enable_multimodal = enable_multimodal + + # Select system prompt: custom > multimodal > base + if custom_system_prompt: + self.system_prompt = custom_system_prompt + elif enable_multimodal: + self.system_prompt = RLM_MULTIMODAL_SYSTEM_PROMPT + else: + self.system_prompt = RLM_SYSTEM_PROMPT + self.logger = logger self.verbose = VerbosePrinter(enabled=verbose) @@ -165,6 +177,7 @@ def _spawn_completion_context(self, prompt: str | dict[str, Any]): env_kwargs["lm_handler_address"] = (lm_handler.host, lm_handler.port) env_kwargs["context_payload"] = prompt env_kwargs["depth"] = self.depth + 1 # Environment depth is RLM depth + 1 + env_kwargs["enable_multimodal"] = self.enable_multimodal # Pass multimodal flag to environment environment: BaseEnv = get_environment(self.environment_type, env_kwargs) if self.persistent: diff --git a/rlm/environments/local_repl.py b/rlm/environments/local_repl.py index 5aa1b696..91d502c6 100644 --- a/rlm/environments/local_repl.py +++ b/rlm/environments/local_repl.py @@ -125,6 +125,7 @@ def __init__( setup_code: str | None = None, persistent: bool = False, depth: int = 1, + enable_multimodal: bool = False, **kwargs, ): super().__init__(persistent=persistent, depth=depth, **kwargs) @@ -135,6 +136,7 @@ def __init__( self._lock = threading.Lock() self._context_count: int = 0 self._history_count: int = 0 + self.enable_multimodal = enable_multimodal # Setup globals, locals, and modules in environment. self.setup() @@ -159,10 +161,17 @@ def setup(self): # Track LLM calls made during code execution self._pending_llm_calls: list[RLMChatCompletion] = [] - # Add helper functions + # Add core helper functions (always available) self.globals["FINAL_VAR"] = self._final_var self.globals["llm_query"] = self._llm_query self.globals["llm_query_batched"] = self._llm_query_batched + + # Add multimodal helper functions only if multimodal is enabled + if self.enable_multimodal: + self.globals["vision_query"] = self._vision_query + self.globals["vision_query_batched"] = self._vision_query_batched + self.globals["audio_query"] = self._audio_query + self.globals["speak"] = self._speak def _final_var(self, variable_name: str) -> str: """Return the value of a variable as a final answer.""" @@ -228,6 +237,215 @@ def _llm_query_batched(self, prompts: list[str], model: str | None = None) -> li except Exception as e: return [f"Error: LM query failed - {e}"] * len(prompts) + def _vision_query( + self, prompt: str, images: list[str], model: str | None = None + ) -> str: + """Query a vision-capable LM with text and images. + + Args: + prompt: The text prompt describing what to analyze. + images: List of image paths or URLs to analyze. + model: Optional model name to use (if handler has multiple clients). + + Returns: + The LM's response analyzing the images. 
+
+        Example:
+            description = vision_query("What objects are in this image?", ["photo.jpg"])
+        """
+        if not self.lm_handler_address:
+            return "Error: No LM handler configured"
+
+        try:
+            # Build multimodal content list
+            content = [{"type": "text", "text": prompt}]
+            for img in images:
+                if img.startswith(("http://", "https://")):
+                    content.append({
+                        "type": "image_url",
+                        "image_url": {"url": img}
+                    })
+                else:
+                    content.append({
+                        "type": "image_path",
+                        "path": img
+                    })
+
+            # Send as a message with multimodal content
+            multimodal_prompt = [{"role": "user", "content": content}]
+            request = LMRequest(prompt=multimodal_prompt, model=model, depth=self.depth)
+            response = send_lm_request(self.lm_handler_address, request)
+
+            if not response.success:
+                return f"Error: {response.error}"
+
+            # Track this LLM call
+            self._pending_llm_calls.append(response.chat_completion)
+            return response.chat_completion.response
+        except Exception as e:
+            return f"Error: Vision query failed - {e}"
+
+    def _vision_query_batched(
+        self, prompts: list[str], images_list: list[list[str]], model: str | None = None
+    ) -> list[str]:
+        """Query a vision-capable LM with multiple prompts and images concurrently.
+
+        Args:
+            prompts: List of text prompts.
+            images_list: List of image lists, one per prompt.
+            model: Optional model name to use.
+
+        Returns:
+            List of responses in the same order as input prompts.
+
+        Example:
+            results = vision_query_batched(
+                ["What's in image 1?", "What's in image 2?"],
+                [["img1.jpg"], ["img2.jpg"]]
+            )
+        """
+        if not self.lm_handler_address:
+            return ["Error: No LM handler configured"] * len(prompts)
+
+        if len(prompts) != len(images_list):
+            return ["Error: prompts and images_list must have same length"] * len(prompts)
+
+        try:
+            # Build multimodal prompts
+            multimodal_prompts = []
+            for prompt, images in zip(prompts, images_list):
+                content = [{"type": "text", "text": prompt}]
+                for img in images:
+                    if img.startswith(("http://", "https://")):
+                        content.append({
+                            "type": "image_url",
+                            "image_url": {"url": img}
+                        })
+                    else:
+                        content.append({
+                            "type": "image_path",
+                            "path": img
+                        })
+                multimodal_prompts.append([{"role": "user", "content": content}])
+
+            responses = send_lm_request_batched(
+                self.lm_handler_address, multimodal_prompts, model=model, depth=self.depth
+            )
+
+            results = []
+            for response in responses:
+                if not response.success:
+                    results.append(f"Error: {response.error}")
+                else:
+                    self._pending_llm_calls.append(response.chat_completion)
+                    results.append(response.chat_completion.response)
+
+            return results
+        except Exception as e:
+            return [f"Error: Vision query failed - {e}"] * len(prompts)
+
+    def _audio_query(
+        self, prompt: str, audio_files: list[str], model: str | None = None
+    ) -> str:
+        """Query an LM with audio files for transcription or analysis.
+
+        Args:
+            prompt: The text prompt describing what to do with the audio.
+            audio_files: List of audio file paths or URLs to analyze.
+            model: Optional model name to use (if handler has multiple clients).
+
+        Returns:
+            The LM's response analyzing or transcribing the audio.
+
+        Example:
+            transcript = audio_query("Transcribe this audio", ["recording.mp3"])
+            analysis = audio_query("What is the speaker's tone?", ["speech.wav"])
+        """
+        if not self.lm_handler_address:
+            return "Error: No LM handler configured"
+
+        try:
+            # Build multimodal content list with audio
+            content = [{"type": "text", "text": prompt}]
+            for audio_file in audio_files:
+                if audio_file.startswith(("http://", "https://")):
+                    content.append({
+                        "type": "audio_url",
+                        "url": audio_file
+                    })
+                else:
+                    content.append({
+                        "type": "audio_path",
+                        "path": audio_file
+                    })
+
+            # Send as a message with multimodal content
+            multimodal_prompt = [{"role": "user", "content": content}]
+            request = LMRequest(prompt=multimodal_prompt, model=model, depth=self.depth)
+            response = send_lm_request(self.lm_handler_address, request)
+
+            if not response.success:
+                return f"Error: {response.error}"
+
+            # Track this LLM call
+            self._pending_llm_calls.append(response.chat_completion)
+            return response.chat_completion.response
+        except Exception as e:
+            return f"Error: Audio query failed - {e}"
+
+    def _speak(self, text: str, output_path: str | None = None) -> str:
+        """Generate speech from text using text-to-speech.
+
+        Args:
+            text: The text to convert to speech.
+            output_path: Optional path to save the audio file.
+                If not provided, saves to temp directory.
+
+        Returns:
+            Path to the generated audio file.
+
+        Example:
+            audio_path = speak("Hello, this is a test.")
+            print(f"Audio saved to: {audio_path}")
+
+        Note: This uses the system's TTS capabilities or Gemini's TTS if available.
+        """
+        import subprocess
+
+        # Generate a default output path if the caller did not provide one
+        caller_provided_path = output_path is not None
+        if output_path is None:
+            output_path = os.path.join(self.temp_dir, f"speech_{uuid.uuid4().hex[:8]}.aiff")
+
+        try:
+            # Use macOS 'say' command for TTS (works on Mac)
+            # This is a fallback - ideally we'd use Gemini's TTS API
+            result = subprocess.run(
+                ["say", "-o", output_path, text],
+                capture_output=True,
+                text=True,
+                timeout=30
+            )
+
+            if result.returncode == 0:
+                return output_path
+            else:
+                return f"Error: TTS failed - {result.stderr}"
+        except FileNotFoundError:
+            # 'say' command not available (not macOS)
+            # Try using pyttsx3 or other TTS libraries
+            try:
+                import pyttsx3
+                engine = pyttsx3.init()
+                if not caller_provided_path:
+                    # Prefer an .mp3 default over .aiff when falling back to pyttsx3
+                    output_path = os.path.join(self.temp_dir, f"speech_{uuid.uuid4().hex[:8]}.mp3")
+                engine.save_to_file(text, output_path)
+                engine.runAndWait()
+                return output_path
+            except ImportError:
+                return "Error: TTS not available. Install pyttsx3 or use macOS."
+        except Exception as e:
+            return f"Error: TTS failed - {e}"
+
     def load_context(self, context_payload: dict | list | str):
         """Load context into the environment as context_0 (and 'context' alias)."""
         self.add_context(context_payload, 0)
diff --git a/rlm/utils/prompts.py b/rlm/utils/prompts.py
index f69b2292..5751b6af 100644
--- a/rlm/utils/prompts.py
+++ b/rlm/utils/prompts.py
@@ -2,7 +2,7 @@
 from rlm.core.types import QueryMetadata
 
-# System prompt for the REPL environment with explicit final answer checking
+# Base system prompt for the REPL environment (text-only, no multimodal functions)
 RLM_SYSTEM_PROMPT = textwrap.dedent(
     """You are tasked with answering a query with associated context. You can access, transform, and analyze this context interactively in a REPL environment that can recursively query sub-LLMs, which you are strongly encouraged to use as much as possible. You will be queried iteratively until you provide a final answer.
@@ -38,19 +38,101 @@
 As another example, when the context isn't that long (e.g. >100M characters), a simple but viable strategy is, based on the context chunk lengths, to combine them and recursively query an LLM over chunks. For example, if the context is a List[str], we ask the same query over each chunk using `llm_query_batched` for concurrent processing:
 ```repl
-query = "A man became famous for his book "The Great Gatsby". How many jobs did he have?"
+query = "A man became famous for his book \"The Great Gatsby\". How many jobs did he have?"
 # Suppose our context is ~1M chars, and we want each sub-LLM query to be ~0.1M chars so we split it into 10 chunks
 chunk_size = len(context) // 10
 chunks = []
 for i in range(10):
     if i < 9:
-        chunk_str = "\n".join(context[i*chunk_size:(i+1)*chunk_size])
+        chunk_str = "\\n".join(context[i*chunk_size:(i+1)*chunk_size])
     else:
-        chunk_str = "\n".join(context[i*chunk_size:])
+        chunk_str = "\\n".join(context[i*chunk_size:])
     chunks.append(chunk_str)
 
 # Use batched query for concurrent processing - much faster than sequential calls!
-prompts = [f"Try to answer the following query: {{query}}. Here are the documents:\n{{chunk}}. Only answer if you are confident in your answer based on the evidence." for chunk in chunks]
+prompts = [f"Try to answer the following query: {{query}}. Here are the documents:\\n{{chunk}}. Only answer if you are confident in your answer based on the evidence." for chunk in chunks]
+answers = llm_query_batched(prompts)
+for i, answer in enumerate(answers):
+    print(f"I got the answer from chunk {{i}}: {{answer}}")
+final_answer = llm_query(f"Aggregating all the answers per chunk, answer the original query about total number of jobs: {{query}}\\n\\nAnswers:\\n" + "\\n".join(answers))
+```
+
+As a final example, after analyzing the context and realizing it's separated by Markdown headers, we can maintain state through buffers by chunking the context by headers, and iteratively querying an LLM over it:
+```repl
+# After finding out the context is separated by Markdown headers, we can chunk, summarize, and answer
+import re
+sections = re.split(r'### (.+)', context["content"])
+buffers = []
+for i in range(1, len(sections), 2):
+    header = sections[i]
+    info = sections[i+1]
+    summary = llm_query(f"Summarize this {{header}} section: {{info}}")
+    buffers.append(f"{{header}}: {{summary}}")
+final_answer = llm_query(f"Based on these summaries, answer the original query: {{query}}\\n\\nSummaries:\\n" + "\\n".join(buffers))
+```
+In the next step, we can return FINAL_VAR(final_answer).
+
+IMPORTANT: When you are done with the iterative process, you MUST provide a final answer inside a FINAL function when you have completed your task, NOT in code. Do not use these tags unless you have completed your task. You have two options:
+1. Use FINAL(your final answer here) to provide the answer directly
+2. Use FINAL_VAR(variable_name) to return a variable you have created in the REPL environment as your final output
+
+Think step by step carefully, plan, and execute this plan immediately in your response -- do not just say "I will do this" or "I will do that". Output to the REPL environment and recursive LLMs as much as possible. Remember to explicitly answer the original query in your final answer.
+"""
+)
+
+# Multimodal system prompt (includes vision, audio, and TTS functions)
+RLM_MULTIMODAL_SYSTEM_PROMPT = textwrap.dedent(
+    """You are tasked with answering a query with associated context. You can access, transform, and analyze this context interactively in a REPL environment that can recursively query sub-LLMs, which you are strongly encouraged to use as much as possible. You will be queried iteratively until you provide a final answer.
+
+The REPL environment is initialized with:
+1. A `context` variable that contains extremely important information about your query. You should check the content of the `context` variable to understand what you are working with. Make sure you look through it sufficiently as you answer your query.
+2. A `llm_query` function that allows you to query an LLM (that can handle around 500K chars) inside your REPL environment.
+3. A `llm_query_batched` function that allows you to query multiple prompts concurrently: `llm_query_batched(prompts: List[str]) -> List[str]`. This is much faster than sequential `llm_query` calls when you have multiple independent queries. Results are returned in the same order as the input prompts.
+4. A `vision_query` function for analyzing images: `vision_query(prompt: str, images: List[str]) -> str`. Pass image file paths or URLs to analyze visual content.
+5. A `vision_query_batched` function for analyzing multiple images concurrently: `vision_query_batched(prompts: List[str], images_list: List[List[str]]) -> List[str]`.
+6. An `audio_query` function for transcribing or analyzing audio: `audio_query(prompt: str, audio_files: List[str]) -> str`. Pass audio file paths to transcribe speech or analyze audio content.
+7. A `speak` function for text-to-speech: `speak(text: str) -> str`. Returns the path to the generated audio file.
+8. The ability to use `print()` statements to view the output of your REPL code and continue your reasoning.
+
+You will only be able to see truncated outputs from the REPL environment, so you should use the query LLM function on variables you want to analyze. You will find this function especially useful when you have to analyze the semantics of the context. Use these variables as buffers to build up your final answer.
+Make sure to explicitly look through the entire context in REPL before answering your query. An example strategy is to first look at the context and figure out a chunking strategy, then break up the context into smart chunks, and query an LLM per chunk with a particular question and save the answers to a buffer, then query an LLM with all the buffers to produce your final answer.
+
+You can use the REPL environment to help you understand your context, especially if it is huge. Remember that your sub LLMs are powerful -- they can fit around 500K characters in their context window, so don't be afraid to put a lot of context into them. For example, a viable strategy is to feed 10 documents per sub-LLM query. Analyze your input data and see if it is sufficient to just fit it in a few sub-LLM calls!
+
+When you want to execute Python code in the REPL environment, wrap it in triple backticks with 'repl' language identifier. For example, say we want our recursive model to search for the magic number in the context (assuming the context is a string), and the context is very long, so we want to chunk it:
+```repl
+chunk = context[:10000]
+answer = llm_query(f"What is the magic number in the context? Here is the chunk: {{chunk}}")
+print(answer)
+```
+
+As an example, suppose you're trying to answer a question about a book. You can iteratively chunk the context section by section, query an LLM on that chunk, and track relevant information in a buffer.
+```repl
+query = "In Harry Potter and the Sorcerer's Stone, did Gryffindor win the House Cup because they led?"
+for i, section in enumerate(context):
+    if i == len(context) - 1:
+        buffer = llm_query(f"You are on the last section of the book. So far you know that: {{buffers}}. Gather from this last section to answer {{query}}. Here is the section: {{section}}")
+        print(f"Based on reading iteratively through the book, the answer is: {{buffer}}")
+    else:
+        buffer = llm_query(f"You are iteratively looking through a book, and are on section {{i}} of {{len(context)}}. Gather information to help answer {{query}}. Here is the section: {{section}}")
+        print(f"After section {{i}} of {{len(context)}}, you have tracked: {{buffer}}")
+```
+
+As another example, when the context isn't that long (e.g. >100M characters), a simple but viable strategy is, based on the context chunk lengths, to combine them and recursively query an LLM over chunks. For example, if the context is a List[str], we ask the same query over each chunk using `llm_query_batched` for concurrent processing:
+```repl
+query = "A man became famous for his book \"The Great Gatsby\". How many jobs did he have?"
+# Suppose our context is ~1M chars, and we want each sub-LLM query to be ~0.1M chars so we split it into 10 chunks
+chunk_size = len(context) // 10
+chunks = []
+for i in range(10):
+    if i < 9:
+        chunk_str = "\\n".join(context[i*chunk_size:(i+1)*chunk_size])
+    else:
+        chunk_str = "\\n".join(context[i*chunk_size:])
+    chunks.append(chunk_str)
+
+# Use batched query for concurrent processing - much faster than sequential calls!
+prompts = [f"Try to answer the following query: {{query}}. Here are the documents:\\n{{chunk}}. Only answer if you are confident in your answer based on the evidence." for chunk in chunks]
+answers = llm_query_batched(prompts)
+for i, answer in enumerate(answers):
+    print(f"I got the answer from chunk {{i}}: {{answer}}")