Merged
38 commits
58b2019 feat(multi-agent): add multi-agent design docs (Benjamin-eecs, Aug 31, 2025)
2c10414 feat(multi-agent): init multi-agent env (Benjamin-eecs, Aug 31, 2025)
ec9342e feat(multi-agent): add multi-agent example (Benjamin-eecs, Aug 31, 2025)
986273e feat(multi-agent): add multi-agent env testing code (Benjamin-eecs, Aug 31, 2025)
f8e8c58 feat(multi-agent): add multi-agent env testing code (Benjamin-eecs, Aug 31, 2025)
32ec41e feat(multi-agent): init multi-agent env (Benjamin-eecs, Aug 31, 2025)
228ba43 feat(multi-agent): add multi-agent env example (Benjamin-eecs, Aug 31, 2025)
78644e6 feat(multi-agent): init multi-agent env (Benjamin-eecs, Aug 31, 2025)
07d31a1 feat(multi-agent): add multi-agent env example (Benjamin-eecs, Aug 31, 2025)
c8d670d feat(multi-agent): add multi-agent env testing code (Benjamin-eecs, Aug 31, 2025)
8890677 feat(multi-agent): add multi-agent env docs (Benjamin-eecs, Aug 31, 2025)
139b862 chore: clean code base (Benjamin-eecs, Aug 31, 2025)
d2ab9ff chore: clean code base (Benjamin-eecs, Aug 31, 2025)
cafb7d9 chore: clean code base (Benjamin-eecs, Aug 31, 2025)
5f909ee chore: add license (Benjamin-eecs, Aug 31, 2025)
d34c74d chore: add license (Benjamin-eecs, Aug 31, 2025)
66c5f3a docs: update design docs (Benjamin-eecs, Aug 31, 2025)
2ec9c7d docs: update README (Benjamin-eecs, Aug 31, 2025)
4234f6c fix: gem multi-agent env (Benjamin-eecs, Sep 4, 2025)
0ee8ef8 fix: gem multi-agent example (Benjamin-eecs, Sep 4, 2025)
fa7d4fe fix: update tests (Benjamin-eecs, Sep 10, 2025)
e3948f1 refactor: unified multi-agent env api design (Benjamin-eecs, Sep 10, 2025)
1cc46b8 refactor: unified multi-agent env api design (Benjamin-eecs, Sep 10, 2025)
8bad403 Merge remote-tracking branch 'upstream/main' into feature/multi_agent (Benjamin-eecs, Sep 10, 2025)
4e52e70 fix: update tests (Benjamin-eecs, Sep 10, 2025)
f018879 refactor: update multi-agent env example (Benjamin-eecs, Sep 10, 2025)
017d0b9 feat: multi-agent env code (Benjamin-eecs, Sep 29, 2025)
a9e081e Merge remote-tracking branch 'upstream/main' into feature/multi_agent (Benjamin-eecs, Sep 29, 2025)
d36f2f1 feat: multi-agent env code (Benjamin-eecs, Sep 29, 2025)
7969822 feat: multi-agent env code (Benjamin-eecs, Oct 1, 2025)
b3f2de2 merge: resolve conflicts (Benjamin-eecs, Oct 3, 2025)
7e14108 feat: multi-agent env example (Benjamin-eecs, Oct 3, 2025)
d32732c merge: resolve conflicts (Benjamin-eecs, Oct 3, 2025)
194a473 chore: make format (Benjamin-eecs, Oct 3, 2025)
df03390 Merge branch 'axon-rl:main' into feature/multi_agent (Benjamin-eecs, Oct 5, 2025)
3b0324e Merge remote-tracking branch 'upstream/main' into feature/multi_agent (Benjamin-eecs, Oct 12, 2025)
29e9157 Merge remote-tracking branch 'origin/feature/multi_agent' into featur… (Benjamin-eecs, Oct 12, 2025)
501c415 feat: multi-agent env code (Benjamin-eecs, Oct 12, 2025)
51 changes: 51 additions & 0 deletions examples/multiagent/README.md
@@ -0,0 +1,51 @@
# Multi-Agent Examples for GEM

This directory contains multi-agent environment examples using GEM's MultiAgentEnv framework.

## TAU-bench Retail Integration

The `tau_bench_retail/` directory contains the official integration of the TAU-bench Retail benchmark into GEM. TAU-bench evaluates tool-augmented LLM agents on realistic customer-service tasks in a retail environment.
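
The environment follows a dict-keyed, Gymnasium-style multi-agent API: `reset` and `step` return per-agent dicts indexed by agent id (here, `"assistant"`). A minimal interaction sketch, with signatures taken from the code in this PR (the hard-coded action is purely illustrative):

```python
import json

from tau_bench_env import TauBenchEnv

# Reset returns per-agent dicts keyed by agent id.
env = TauBenchEnv(task_split="test")
observations, infos = env.reset(task_index=0)

# The assistant acts by submitting a JSON-encoded action; "respond"
# sends free-form text to the simulated user (illustrative content).
action = json.dumps({"name": "respond", "kwargs": {"content": "Hi, how can I help?"}})
observations, rewards, terminations, truncations, env_infos = env.step({"assistant": action})
print(rewards["assistant"], terminations["assistant"])
```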

### Setup

1. Clone the TAU-bench repository:
```bash
cd tau_bench_retail
git clone https://github.com/sierra-research/tau-bench.git
```

2. Set your API key:
```bash
export OPENAI_API_KEY="your-key-here"
```

3. Run the evaluation:
```bash
python run_eval.py
```

### Directory Structure

```
multiagent/
└── tau_bench_retail/
├── tau_bench_env.py # GEM environment wrapper for TAU-bench
├── tau_bench_agent.py # Agent with tool-calling capabilities
├── run_eval.py # Evaluation script
└── tau-bench/ # Cloned TAU-bench repository (git ignored)
└── tau_bench/
└── envs/
└── retail/ # TAU-bench retail assets
├── data/ # JSON data files
├── tools/ # Tool implementations
├── tasks_*.py # Task definitions
└── wiki.md # Agent policy
```

## Performance

TAU-bench Retail: **78/115 (67.8% Pass@1)**

## Available Tools

The environment exposes 16 customer-service tools, covering order management, user identification, information retrieval, and support functions.
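
Tools are invoked through the same `step` interface: the agent serializes each call as JSON with the tool name and keyword arguments, mirroring `tau_bench_agent.py` (the tool name and arguments below are illustrative; the authoritative schemas live in `env.tool_definitions`):

```python
import json

# Illustrative tool call; real tool names and arguments come from
# env.tool_definitions (the 16 TAU-bench retail tools).
action = json.dumps({
    "name": "get_order_details",
    "kwargs": {"order_id": "#W0000000"},
})
observations, rewards, terminations, truncations, env_infos = env.step({"assistant": action})
print(observations["assistant"])  # the tool's output, fed back to the model
```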
5 changes: 5 additions & 0 deletions examples/multiagent/tau_bench_retail/.gitignore
@@ -0,0 +1,5 @@
experiments/results/
*.pyc
__pycache__/
.DS_Store
tau-bench/
66 changes: 66 additions & 0 deletions examples/multiagent/tau_bench_retail/README.md
@@ -0,0 +1,66 @@
# TAU-bench Retail - GEM MultiAgentEnv Integration

A clean implementation of the TAU-bench Retail benchmark using GEM's MultiAgentEnv API.

**Performance**: 78/115 (67.8% Pass@1), exceeding the 60.4% target.

## Setup

### 1. Clone TAU-bench Repository

```bash
# Option 1: Clone to the default location (inside the tau_bench_retail directory)
cd examples/multiagent/tau_bench_retail
git clone https://github.com/sierra-research/tau-bench.git

# Option 2: Clone anywhere and set environment variable
git clone https://github.com/sierra-research/tau-bench.git /path/to/tau-bench
export TAU_BENCH_PATH=/path/to/tau-bench
```

### 2. Install Dependencies
```bash
# Install GEM
cd /path/to/gem/
pip install -e .

# Install TAU-bench
cd /path/to/gem/examples/multiagent/tau_bench_retail/tau-bench
pip install -e .
```

### 3. Set API Keys

```bash
# Required for OpenAI models
export OPENAI_API_KEY="your-key"

# Optional: For OpenRouter models (Gemini, Claude, DeepSeek)
export OPENROUTER_API_KEY="your-key"
```

### 4. Run Evaluation

```bash
python run_eval.py
```

## Files

- `tau_bench_env.py` - GEM MultiAgentEnv environment wrapper
- `tau_bench_agent.py` - Agent with OpenRouter-style tool calling
- `run_eval.py` - Evaluation runner (115 test tasks)
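
These pieces can also be driven directly for a single task, without the full evaluation loop. A minimal sketch using the constructors and `solve` signature from the files above:

```python
from tau_bench_agent import TauBenchAgent
from tau_bench_env import TauBenchEnv

# One environment/agent pair, one test task.
env = TauBenchEnv(task_split="test", user_model="gpt-4o", user_provider="openai")
agent = TauBenchAgent(model="gpt-4o", provider="openai", temperature=0.0)

result = agent.solve(env, task_index=0, max_num_steps=30)
print(result["reward"], result["num_steps"])
```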

## Model Support

Supported models via `run_eval.py`:
- OpenAI: `gpt-4o`
- OpenRouter: `google/gemini-2.0-flash-001`, `deepseek/deepseek-chat`, `anthropic/claude-3.5-sonnet`

For OpenRouter models:
```bash
export OPENROUTER_API_KEY="your-key"
```
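
Switching models is a matter of editing the config variables in `run_eval.py`'s main block; for example, to use an OpenRouter-hosted agent model while keeping the user simulator on `gpt-4o`:

```python
# In run_eval.py's __main__ block:
model = "deepseek/deepseek-chat"
provider = "openrouter"
user_model = "gpt-4o"
user_provider = "openai"
```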
63 changes: 63 additions & 0 deletions examples/multiagent/tau_bench_retail/run_eval.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
import os
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed

sys.path.insert(0, os.path.dirname(__file__))
from tau_bench_agent import TauBenchAgent
from tau_bench_env import TauBenchEnv


def eval_task(args):
    """Run a single TAU-bench task and return (task_index, reward)."""
    task_idx, model, provider, user_model, user_provider = args
    try:
        env = TauBenchEnv(
            task_split="test", user_model=user_model, user_provider=user_provider
        )
        agent = TauBenchAgent(model=model, provider=provider, temperature=0.0)
        result = agent.solve(env, task_index=task_idx)
        return task_idx, result["reward"]
    except Exception as e:
        print(f"Task {task_idx} error: {e}")
        return task_idx, 0.0


if __name__ == "__main__":
    # OpenAI: model="gpt-4o", provider="openai"
    # Gemini: model="google/gemini-2.0-flash-001", provider="openrouter"
    # DeepSeek: model="deepseek/deepseek-chat", provider="openrouter"
    # Claude: model="anthropic/claude-3.5-sonnet", provider="openrouter"

    model = "gpt-4o"
    provider = "openai"
    user_model = "gpt-4o"
    user_provider = "openai"

    print(f"Running 115 tasks with {model} via {provider}")
    print(f"User simulator: {user_model} via {user_provider}")
    print("=" * 60)

    tasks = [(i, model, provider, user_model, user_provider) for i in range(115)]
    results = []
    passed = 0

    # Evaluate tasks concurrently; each worker builds its own env/agent pair.
    with ThreadPoolExecutor(max_workers=32) as executor:
        futures = {executor.submit(eval_task, args): args[0] for args in tasks}

        for future in as_completed(futures):
            task_idx, reward = future.result()
            results.append((task_idx, reward))

            if reward > 0:
                passed += 1

            completed = len(results)
            print(
                f"Task {task_idx}: {'✓' if reward > 0 else '✗'} | "
                f"{completed}/115 | Pass@1: {passed}/{completed} ({100*passed/completed:.1f}%)"
            )

    print(f"\n{'='*60}")
    print(f"FINAL: {passed}/115 ({100*passed/115:.1f}%)")
    print("Target: 60.4%")
    print(f"{'='*60}")
87 changes: 87 additions & 0 deletions examples/multiagent/tau_bench_retail/tau_bench_agent.py
@@ -0,0 +1,87 @@
#!/usr/bin/env python3
import json
from typing import Any, Dict, List

from litellm import completion


class TauBenchAgent:
    """Agent using OpenRouter-style tool calling pattern."""

    def __init__(
        self, model: str = "gpt-4o", provider: str = "openai", temperature: float = 0.0
    ):
        self.model = model
        self.provider = provider
        self.temperature = temperature

    def solve(
        self, env, task_index: int = 0, max_num_steps: int = 30
    ) -> Dict[str, Any]:
        observations, infos = env.reset(task_index=task_index)

        # Seed the conversation with the TAU-bench policy wiki and the
        # simulated user's opening message.
        messages: List[Dict[str, Any]] = [
            {"role": "system", "content": env.wiki},
            {"role": "user", "content": observations["assistant"]},
        ]

        reward = 0.0
        num_steps = 0

        for _ in range(max_num_steps):
            request = {
                "model": self.model,
                "messages": messages,
                "tools": env.tool_definitions,
                "temperature": self.temperature,
            }

            response = completion(custom_llm_provider=self.provider, **request)
            response_message = response.choices[0].message
            messages.append(response_message.model_dump())

            if hasattr(response_message, "tool_calls") and response_message.tool_calls:
                # Execute each requested tool call against the environment.
                for tool_call in response_message.tool_calls:
                    tool_name = tool_call.function.name
                    tool_args = json.loads(tool_call.function.arguments)

                    action_json = json.dumps({"name": tool_name, "kwargs": tool_args})
                    observations, rewards, terminations, truncations, env_infos = (
                        env.step({"assistant": action_json})
                    )

                    reward = rewards.get("assistant", 0.0)
                    messages.append(
                        {
                            "role": "tool",
                            "tool_call_id": tool_call.id,
                            "content": observations["assistant"],
                        }
                    )

                    num_steps += 1
                    if terminations.get("assistant", False):
                        break
            else:
                # No tool call: treat the text reply as a "respond" action
                # addressed to the simulated user.
                content = response_message.content or ""
                action_json = json.dumps(
                    {"name": "respond", "kwargs": {"content": content}}
                )

                observations, rewards, terminations, truncations, env_infos = env.step(
                    {"assistant": action_json}
                )

                reward = rewards.get("assistant", 0.0)
                messages.append({"role": "user", "content": observations["assistant"]})
                num_steps += 1

            if terminations.get("assistant", False):
                break

        return {
            "reward": reward,
            "task_id": env.task.user_id,
            "task_index": task_index,
            "num_steps": num_steps,
        }