
Commit d639b2f

Merge branch 'main' into feat/nilauth-credit
2 parents: 4bd80a4 + df2a37b

File tree: 13 files changed, +745 −9 lines

docker/compose/docker-compose.llama-8b-gpu.yml

Lines changed: 4 additions & 3 deletions

@@ -20,21 +20,22 @@ services:
         condition: service_healthy
     command: >
       --model meta-llama/Llama-3.1-8B-Instruct
-      --gpu-memory-utilization 0.20
+      --gpu-memory-utilization 0.95
       --max-model-len 10000
       --max-num-batched-tokens 10000
       --tensor-parallel-size 1
-      --enable-auto-tool-choice
      --tool-call-parser llama3_json
      --uvicorn-log-level warning
+      --enable-auto-tool-choice
+      --chat-template /opt/vllm/templates/llama3.1_tool_json.jinja
     environment:
       - SVC_HOST=llama_8b_gpu
       - SVC_PORT=8000
       - ETCD_HOST=etcd
       - ETCD_PORT=2379
       - TOOL_SUPPORT=true
     volumes:
-      - hugging_face_models:/root/.cache/huggingface # cache models
+      - hugging_face_models:/root/.cache/huggingface
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
       interval: 30s
docker/compose/docker-compose.nilai-prod-2.yml

Lines changed: 3 additions & 1 deletion

@@ -32,14 +32,16 @@ services:
       --enable-auto-tool-choice
       --tool-call-parser llama3_json
       --uvicorn-log-level warning
+      --enable-auto-tool-choice
+      --chat-template /opt/vllm/templates/llama3.1_tool_json.jinja
     environment:
       - SVC_HOST=llama_8b_gpu
       - SVC_PORT=8000
       - ETCD_HOST=etcd
       - ETCD_PORT=2379
       - TOOL_SUPPORT=true
     volumes:
-      - hugging_face_models:/root/.cache/huggingface # cache models
+      - hugging_face_models:/root/.cache/huggingface
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
       interval: 30s

docker/vllm.Dockerfile

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@ FROM vllm/vllm-openai:v0.10.1
 # ENV EXEC_PATH=nilai_models.models.${MODEL_NAME}:app

 COPY --link . /daemon/
+COPY --link vllm_templates /opt/vllm/templates

 WORKDIR /daemon/nilai-models/

nilai-api/pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -37,6 +37,7 @@ dependencies = [
     "trafilatura>=1.7.0",
     "secretvaults",
     "nilauth-credit-middleware>=0.1.0",
+    "e2b-code-interpreter>=1.0.3",
 ]

nilai-api/src/nilai_api/handlers/tools/code_execution.py (new file; path inferred from the imports in this commit)

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
from __future__ import annotations

import asyncio
import logging

from e2b_code_interpreter import Sandbox

logger = logging.getLogger(__name__)


def _run_in_sandbox_sync(code: str) -> str:
    """Execute Python code in an e2b sandbox and return the textual output, or stdout if available."""
    try:
        with Sandbox.create() as sandbox:
            exec_ = sandbox.run_code(code)
            if exec_.text:
                return exec_.text
            if getattr(exec_, "logs", None) and getattr(exec_.logs, "stdout", None):
                return "\n".join(exec_.logs.stdout)
            return ""
    except Exception as e:
        logger.error("Error executing code in sandbox: %s", e)
        raise


async def execute_python(code: str) -> str:
    """Execute Python code in an e2b Code Interpreter sandbox and return the textual output.

    This function is async-safe and runs the blocking execution in a thread.
    """
    logger.info("Executing Python code asynchronously")
    try:
        result = await asyncio.to_thread(_run_in_sandbox_sync, code)
        logger.info("Python code execution completed successfully")
        return result
    except Exception as e:
        logger.error(f"Error in async Python code execution: {e}")
        raise
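
A quick usage sketch for the helper above (illustrative only, not part of the commit; it assumes the package path inferred earlier and that an E2B API key is available in the environment for the e2b SDK to read):

import asyncio

from nilai_api.handlers.tools import code_execution  # path assumed, see note above


async def main() -> None:
    # Runs the snippet in a fresh e2b sandbox; returns the result text or captured stdout.
    output = await code_execution.execute_python("print(6 * 7)")
    print(output)  # expected to contain "42"


if __name__ == "__main__":
    asyncio.run(main())
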
nilai-api/src/nilai_api/handlers/tools/tool_router.py (new file; path inferred from the import in private.py below)

Lines changed: 189 additions & 0 deletions

@@ -0,0 +1,189 @@
from __future__ import annotations

import json
import uuid
from typing import List, Optional, Tuple, cast
from nilai_common import (
    Message,
    MessageAdapter,
    ChatRequest,
    ChatCompletion,
    ChatCompletionMessage,
    ChatCompletionMessageToolCall,
    ChatToolFunction,
)

from . import code_execution
from openai import AsyncOpenAI

import logging

logger = logging.getLogger(__name__)


async def route_and_execute_tool_call(
    tool_call: ChatCompletionMessageToolCall,
) -> Message:
    """Route a single tool call to its implementation and return a tool message.

    The returned message is a dict compatible with OpenAI's ChatCompletionMessageParam
    with role="tool".
    """
    func_name = tool_call.function.name
    arguments = tool_call.function.arguments or "{}"

    if func_name == "execute_python":
        # arguments is a JSON string
        try:
            args = json.loads(arguments)
        except Exception:
            args = {}
        code = args.get("code", "")
        result = await code_execution.execute_python(code)
        logger.info(f"[tool] execute_python result: {result}")
        return MessageAdapter.new_tool_message(
            name="execute_python",
            content=result,
            tool_call_id=tool_call.id,
        )

    # Unknown tool: return an error message to the model
    return MessageAdapter.new_tool_message(
        name=func_name,
        content=f"Tool '{func_name}' not implemented",
        tool_call_id=tool_call.id,
    )


async def process_tool_calls(
    tool_calls: List[ChatCompletionMessageToolCall],
) -> List[Message]:
    """Process a list of tool calls and return their corresponding tool messages.

    Routes each tool call to its implementation and collects the results as
    tool messages that can be appended to the conversation history.
    """
    msgs: List[Message] = []
    for tc in tool_calls:
        msg = await route_and_execute_tool_call(tc)
        msgs.append(msg)
    return msgs


def extract_tool_calls_from_response_message(
    response_message: ChatCompletionMessage,
) -> List[ChatCompletionMessageToolCall]:
    """Return tool calls from a ChatCompletionMessage, parsing content if needed.

    Many models may emit function-calling either via the structured `tool_calls`
    field or encode it as JSON in the assistant `content`. This helper returns a
    normalized list of `ChatCompletionMessageToolCall` objects, using a
    best-effort parse of the content when `tool_calls` is empty.
    """
    if response_message.tool_calls:
        return cast(List[ChatCompletionMessageToolCall], response_message.tool_calls)

    try:
        adapter = MessageAdapter(
            raw=cast(
                Message,
                response_message.model_dump(exclude_unset=True),
            )
        )
        content: Optional[str] = adapter.extract_text()
    except Exception:
        content = response_message.content

    if not content:
        return []

    try:
        data = json.loads(content)
    except Exception:
        return []

    if not isinstance(data, dict):
        return []

    # Support multiple possible schemas
    fn = data.get("function")
    if isinstance(fn, dict) and "name" in fn:
        name = fn.get("name")
        args = fn.get("parameters", {})
    else:
        # Fallbacks for other schemas
        name = data.get("name") or data.get("tool") or data.get("function_name")
        raw_args = data.get("arguments")
        try:
            args = (
                (json.loads(raw_args) if isinstance(raw_args, str) else raw_args)
                or data.get("parameters", {})
                or {}
            )
        except Exception:
            args = data.get("parameters", {}) or {}

    if not isinstance(name, str) or not name:
        return []

    try:
        tool_call = ChatCompletionMessageToolCall(
            id=f"call_{uuid.uuid4()}",
            type="function",
            function=ChatToolFunction(name=name, arguments=json.dumps(args)),
        )
    except Exception:
        return []

    return [tool_call]


async def handle_tool_workflow(
    client: AsyncOpenAI,
    req: ChatRequest,
    current_messages: List[Message],
    first_response: ChatCompletion,
) -> Tuple[ChatCompletion, int, int]:
    """Execute the tool workflow if requested and return the final completion and usage.

    - Extracts tool calls from the first response (structured or JSON in content)
    - Executes tools and appends tool messages
    - Runs a follow-up completion providing tool outputs
    - Returns the final ChatCompletion and aggregated usage (prompt, completion)
    """
    logger.info("[tools] evaluating tool workflow for response")

    prompt_tokens = first_response.usage.prompt_tokens if first_response.usage else 0
    completion_tokens = (
        first_response.usage.completion_tokens if first_response.usage else 0
    )

    response_message = first_response.choices[0].message
    tool_calls = extract_tool_calls_from_response_message(response_message)
    logger.info(f"[tools] extracted tool_calls: {tool_calls}")

    if not tool_calls:
        return first_response, 0, 0

    assistant_tool_call_msg = MessageAdapter.new_assistant_tool_call_message(tool_calls)
    current_messages = [*current_messages, assistant_tool_call_msg]

    tool_messages = await process_tool_calls(tool_calls)
    current_messages.extend(tool_messages)

    request_kwargs = {
        "model": req.model,
        "messages": current_messages, # type: ignore[arg-type]
        "top_p": req.top_p,
        "temperature": req.temperature,
        "max_tokens": req.max_tokens,
        "tool_choice": "none",
    }

    logger.info("[tools] performing follow-up completion with tool outputs")
    second: ChatCompletion = await client.chat.completions.create(**request_kwargs) # type: ignore
    if second.usage:
        prompt_tokens += second.usage.prompt_tokens
        completion_tokens += second.usage.completion_tokens

    return second, prompt_tokens, completion_tokens
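
The content-parsing fallback in extract_tool_calls_from_response_message accepts a couple of loose JSON shapes when the model writes the call into assistant content instead of tool_calls. A small sketch of payloads it would normalize (values are illustrative):

import json

# Shape 1: nested "function" object carrying "parameters"
payload_a = {"function": {"name": "execute_python", "parameters": {"code": "print(2 + 2)"}}}

# Shape 2: flat "name" plus "arguments", where "arguments" may itself be a JSON string
payload_b = {"name": "execute_python", "arguments": json.dumps({"code": "print(2 + 2)"})}

# Either form is converted into a single ChatCompletionMessageToolCall whose
# function.name is "execute_python", whose function.arguments is the
# JSON-encoded argument dict, and whose id is a generated "call_<uuid>".
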

nilai-api/src/nilai_api/routers/private.py

Lines changed: 29 additions & 3 deletions

@@ -9,6 +9,7 @@
 from nilai_api.credit import LLMMeter, LLMUsage
 from nilai_api.handlers.nilrag import handle_nilrag
 from nilai_api.handlers.web_search import handle_web_search
+from nilai_api.handlers.tools.tool_router import handle_tool_workflow

 from fastapi import APIRouter, Body, Depends, HTTPException, status, Request
 from fastapi.responses import StreamingResponse
@@ -158,7 +159,7 @@ async def chat_completion_web_search_rate_limit(request: Request) -> bool:
         chat_request = ChatRequest(**body)
     except ValueError:
         raise HTTPException(status_code=400, detail="Invalid request body")
-    return getattr(chat_request, "web_search", False)
+    return bool(chat_request.web_search)


 @router.post("/v1/chat/completions", tags=["Chat"], response_model=None)
@@ -402,6 +403,8 @@ async def chat_completion_stream_generator() -> AsyncGenerator[str, None]:
     }
     if req.tools:
         request_kwargs["tools"] = req.tools # type: ignore
+        request_kwargs["tool_choice"] = req.tool_choice
+
     logger.info(f"[chat] call start request_id={request_id}")
     logger.info(f"[chat] call message: {current_messages}")
     t_call = time.monotonic()
@@ -410,11 +413,20 @@ async def chat_completion_stream_generator() -> AsyncGenerator[str, None]:
         f"[chat] call done request_id={request_id} duration_ms={(time.monotonic() - t_call) * 1000:.0f}"
     )
     logger.info(f"[chat] call response: {response}")
+
+    # Handle tool workflow fully inside tools.router
+    (
+        final_completion,
+        agg_prompt_tokens,
+        agg_completion_tokens,
+    ) = await handle_tool_workflow(client, req, current_messages, response)
+    logger.info(f"[chat] call final_completion: {final_completion}")
     model_response = SignedChatCompletion(
-        **response.model_dump(),
+        **final_completion.model_dump(),
         signature="",
         sources=sources,
     )
+
     logger.info(
         f"[chat] model_response request_id={request_id} duration_ms={(time.monotonic() - t_call) * 1000:.0f}"
     )
@@ -424,7 +436,21 @@ async def chat_completion_stream_generator() -> AsyncGenerator[str, None]:
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail="Model response does not contain usage statistics",
         )
-    # Update token usage
+
+    if agg_prompt_tokens or agg_completion_tokens:
+        total_prompt_tokens = response.usage.prompt_tokens
+        total_completion_tokens = response.usage.completion_tokens
+
+        total_prompt_tokens += agg_prompt_tokens
+        total_completion_tokens += agg_completion_tokens
+
+        model_response.usage.prompt_tokens = total_prompt_tokens
+        model_response.usage.completion_tokens = total_completion_tokens
+        model_response.usage.total_tokens = (
+            total_prompt_tokens + total_completion_tokens
+        )
+
+    # Update token usage in DB
     await UserManager.update_token_usage(
         auth_info.user.userid,
         prompt_tokens=model_response.usage.prompt_tokens,
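
With these router changes, a client can declare the execute_python tool in its request and let the API run the tool workflow (first completion, sandboxed execution, follow-up completion). A hedged sketch of such a request follows; the base URL, auth handling, and exact tool schema are assumptions for illustration, not taken from this commit:

from openai import OpenAI

# Hypothetical client setup; the real base URL and auth depend on the deployment.
client = OpenAI(base_url="https://nilai.example.com/v1", api_key="<user-api-key>")

tools = [
    {
        "type": "function",
        "function": {
            "name": "execute_python",
            "description": "Execute Python code in a sandbox and return its output",
            "parameters": {
                "type": "object",
                "properties": {"code": {"type": "string"}},
                "required": ["code"],
            },
        },
    }
]

resp = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Compute 2**32 with the tool."}],
    tools=tools,
    tool_choice="auto",
)
# The returned completion is the follow-up answer produced after the tool ran;
# its usage reflects the aggregated prompt/completion tokens handled above.
print(resp.choices[0].message.content)
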

packages/nilai-common/src/nilai_common/__init__.py

Lines changed: 8 additions & 0 deletions

@@ -3,6 +3,10 @@
     ChatRequest,
     SignedChatCompletion,
     Choice,
+    ChatCompletion,
+    ChatCompletionMessage,
+    ChatCompletionMessageToolCall,
+    ChatToolFunction,
     HealthCheckResponse,
     ModelEndpoint,
     ModelMetadata,
@@ -29,6 +33,10 @@
     "ChatRequest",
     "SignedChatCompletion",
     "Choice",
+    "ChatCompletion",
+    "ChatCompletionMessage",
+    "ChatCompletionMessageToolCall",
+    "ChatToolFunction",
     "ModelMetadata",
     "Usage",
     "AttestationReport",
