11 changes: 11 additions & 0 deletions litellm/litellm_core_utils/litellm_logging.py
@@ -1542,6 +1542,17 @@ def _is_recognized_call_type_for_logging(
"""
Returns True if the call type is recognized for logging (eg. ModelResponse, ModelResponseStream, etc.)
"""
# Ensure Text-to-Speech calls are recognized for logging even if the
# provider returns a lightweight streaming adapter instead of
# HttpxBinaryResponseContent. This guarantees building a
# standard_logging_object for TTS so downstream proxy callbacks can
# track spend and budgets.
try:
if self.call_type in (CallTypes.speech.value, CallTypes.aspeech.value):
return True
except Exception:
        # If call_type is missing for any reason, fall through to the type checks below
pass
if (
isinstance(logging_result, ModelResponse)
or isinstance(logging_result, ModelResponseStream)
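A minimal sketch of the effect, assuming only `CallTypes` from litellm.types.utils (the fake result class is hypothetical): a TTS result that is not one of the recognized response types is still accepted for logging, because the call type alone now qualifies it.

    from litellm.types.utils import CallTypes

    class _FakeStreamingTTSResult:
        # Hypothetical lightweight adapter: not a ModelResponse,
        # ModelResponseStream, or HttpxBinaryResponseContent.
        pass

    def is_recognized(call_type: str, result: object) -> bool:
        # Mirrors the new check: TTS call types short-circuit to True
        # before any isinstance() checks on the result object.
        if call_type in (CallTypes.speech.value, CallTypes.aspeech.value):
            return True
        return False  # isinstance checks on ModelResponse etc. follow here

    assert is_recognized(CallTypes.aspeech.value, _FakeStreamingTTSResult())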
47 changes: 40 additions & 7 deletions litellm/llms/openai/openai.py
@@ -63,6 +63,26 @@
openAIGPT5Config = OpenAIGPT5Config()


class _DeferredOpenAITTSStream:
"""
Opens the OpenAI streaming context only when aiter_bytes() is consumed,
keeping the upstream stream alive while the proxy yields chunks.
"""

def __init__(self, client: AsyncOpenAI, request_kwargs: dict):
self._client = client
self._request_kwargs = request_kwargs
self._hidden_params: dict = {}

async def aiter_bytes(self, chunk_size: int = 1024):
async with self._client.audio.speech.with_streaming_response.create(
**self._request_kwargs
) as streamed:
async for chunk in streamed.http_response.aiter_bytes(
chunk_size=chunk_size
):
yield chunk

class MistralEmbeddingConfig:
"""
Reference: https://docs.mistral.ai/api/#operation/createEmbedding
@@ -1433,6 +1453,8 @@ def audio_speech(
client=client,
)

        # For the sync path, fall back to the simple non-streaming create() call
        # (preserves existing behavior for synchronous speech()). The proxy uses
        # the async path; real streaming is handled in async_audio_speech via the
        # deferred stream.
response = cast(OpenAI, openai_client).audio.speech.create(
model=model,
voice=voice, # type: ignore
@@ -1467,14 +1489,25 @@
),
)

        # Return a deferred streaming object so the proxy can iterate without
        # prematurely closing the upstream stream. This replaces the buffered
        # `await openai_client.audio.speech.create(...)` call that returned
        # HttpxBinaryResponseContent.
        request_kwargs = {
            "model": model,
            "voice": voice,
            "input": input,
            **optional_params,
        }
        deferred = _DeferredOpenAITTSStream(
            client=openai_client, request_kwargs=request_kwargs
        )

        # Adapt to the HttpxBinaryResponseContent interface by exposing
        # aiter_bytes(). `deferred.aiter_bytes(...)` is already an async
        # generator, so it can be returned directly without re-wrapping.
        class _Adapter:
            _hidden_params: dict = {}

            async def aiter_bytes(self, chunk_size: int = 1024):
                return deferred.aiter_bytes(chunk_size=chunk_size)

        return _Adapter()  # type: ignore


class OpenAIFilesAPI(BaseLLM):
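From a caller's perspective, the deferred stream only opens the upstream HTTP request when iteration begins. A minimal sketch, assuming a configured AsyncOpenAI client (OPENAI_API_KEY set) and the import path used by the tests below; the model, voice, and output path are illustrative:

    import asyncio

    from openai import AsyncOpenAI

    from litellm.llms.openai.openai import _DeferredOpenAITTSStream

    async def main():
        stream = _DeferredOpenAITTSStream(
            client=AsyncOpenAI(),
            request_kwargs={
                "model": "gpt-4o-mini-tts",
                "voice": "shimmer",
                "input": "hello world",
            },
        )
        # No request has been sent yet; it opens here, on first iteration,
        # and the context stays open until the last chunk is yielded.
        with open("out.mp3", "wb") as f:
            async for chunk in stream.aiter_bytes(chunk_size=1024):
                f.write(chunk)

    asyncio.run(main())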
11 changes: 11 additions & 0 deletions litellm/utils.py
@@ -780,6 +780,17 @@ def function_setup( # noqa: PLR0915
call_type == CallTypes.aspeech.value or call_type == CallTypes.speech.value
):
messages = kwargs.get("input", "speech")
        # Ensure a metadata dict exists for TTS calls so the Logging object
        # (constructed below from kwargs) can record the input string, which
        # the cost calculator uses for character-based TTS pricing.
        try:
            if isinstance(messages, str):
                kwargs.setdefault("metadata", {})
        except Exception:
            pass
elif (
call_type == CallTypes.aresponses.value
or call_type == CallTypes.responses.value
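The metadata guarantee above exists so the cost calculator can price TTS by input length rather than tokens. A hedged sketch of character-based pricing (the per-character rate below is a placeholder for illustration, not litellm's actual cost map):

    def estimate_tts_cost(input_text: str, price_per_character: float = 1.5e-5) -> float:
        # Hypothetical rate; real rates come from litellm's model cost map.
        # The calculator needs the raw input string, which is why
        # function_setup records it for speech/aspeech calls.
        return len(input_text) * price_per_character

    print(estimate_tts_cost("hello world"))  # 11 characters -> 0.000165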
136 changes: 136 additions & 0 deletions scripts/verify_tts_streaming.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
import argparse
import contextlib
import os
import sys
import time
from typing import Optional

import httpx


def build_url(base_url: str, endpoint_path: str) -> str:
if base_url.endswith("/"):
base_url = base_url[:-1]
if not endpoint_path.startswith("/"):
endpoint_path = "/" + endpoint_path
return base_url + endpoint_path


def main() -> int:
parser = argparse.ArgumentParser(description="Verify TTS streaming via chunked transfer")
parser.add_argument(
"--base-url",
default=os.environ.get("OPENAI_BASE_URL", "http://0.0.0.0:4000"),
help="Base URL for the API (default from OPENAI_BASE_URL or http://0.0.0.0:4000)",
)
parser.add_argument(
"--endpoint-path",
default="/v1/audio/speech",
help="Endpoint path to call (e.g. /v1/audio/speech or /openai/audio/speech)",
)
parser.add_argument(
"--model",
default="gpt-4o-mini-tts",
help="Model name (default: gpt-4o-mini-tts)",
)
parser.add_argument(
"--voice",
default="shimmer",
help="Voice to use (default: shimmer)",
)
parser.add_argument(
"--input",
default=(
"Once upon a time, in a bustling city nestled between rolling hills and a sparkling river, there lived a young inventor named Elara. Elara was known throughout the city for her boundless curiosity and her knack for creating marvelous contraptions from the most ordinary of objects. One day, while exploring the attic of her late grandfather’s house, she stumbled upon a dusty, leather-bound journal filled with cryptic notes and intricate sketches of a mysterious machine. Intrigued, Elara spent days deciphering the journal, piecing together the purpose of the device. It was said to be a portal, capable of bridging worlds and connecting distant realms. Driven by excitement and a sense of adventure, Elara gathered the necessary parts—cogs, wires, crystals, and a peculiar brass key—and began assembling the machine in her workshop. As she tightened the final bolt and inserted the key, the device hummed to life, casting a shimmering blue light across the room. With a deep breath, Elara stepped forward and activated the portal. Instantly, she was enveloped in a whirlwind of colors and sounds, feeling herself transported beyond the boundaries of her world. When the light faded, she found herself standing in a lush, enchanted forest, where trees whispered secrets and fantastical creatures roamed freely. Elara realized she had crossed into a realm of endless possibilities, where her inventions could shape the very fabric of reality. Determined to explore and learn, she set off down a winding path, eager to uncover the wonders and challenges that awaited her in this extraordinary new world. And so began Elara’s greatest adventure, one that would test her ingenuity, courage, and heart, and ultimately reveal the true power of imagination and discovery."
),
help="Text to synthesize",
)
parser.add_argument(
"--response-format",
default="mp3",
help="Audio response format (default: mp3)",
)
parser.add_argument(
"--output",
default=None,
help="Optional path to write audio to (if omitted, data is discarded)",
)
parser.add_argument(
"--http2",
action="store_true",
help="Enable HTTP/2 (default: off). Leave off to see chunked headers in HTTP/1.1",
)
args = parser.parse_args()

api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
print("ERROR: OPENAI_API_KEY is not set in the environment", file=sys.stderr)
return 2

url = build_url(args.base_url, args.endpoint_path)
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
"Accept": "audio/mpeg",
}
json_body = {
"model": args.model,
"input": args.input,
"voice": args.voice,
"response_format": args.response_format,
}

print(f"Requesting: {url}")
print(f"HTTP/2: {'on' if args.http2 else 'off'} (HTTP/1.1 if off)")

# Force HTTP/1.1 by default to make Transfer-Encoding: chunked visible when streaming.
# For HTTP/2, chunked header will not be present even when streaming works.
start_req = time.time()
first_byte_at: Optional[float] = None
total_bytes = 0

with httpx.Client(http2=args.http2, timeout=None) as client:
with client.stream("POST", url, headers=headers, json=json_body) as resp:
status = resp.status_code
# Print key headers that indicate buffering vs streaming
cl = resp.headers.get("content-length")
te = resp.headers.get("transfer-encoding")
server = resp.headers.get("server")
print(f"Status: {status}")
print(f"Content-Type: {resp.headers.get('content-type')}")
print(f"Content-Length: {cl}")
print(f"Transfer-Encoding: {te}")
print(f"Server: {server}")

# Stream body
sink_cm = open(args.output, "wb") if args.output else contextlib.nullcontext()
with sink_cm as sink:
for chunk in resp.iter_bytes():
                    if first_byte_at is None:
first_byte_at = time.time()
print(
f"First byte after {first_byte_at - start_req:.3f}s"
)
total_bytes += len(chunk)
if sink and hasattr(sink, "write"):
sink.write(chunk) # type: ignore

end = time.time()
print(f"Total bytes: {total_bytes}")
print(f"Total time: {end - start_req:.3f}s")
if first_byte_at:
print(f"Time to first byte: {first_byte_at - start_req:.3f}s")

print()
print("Interpretation:")
print("- If Content-Length is absent and Transfer-Encoding is chunked (HTTP/1.1), it streamed.")
print("- If Content-Length is present, the response was buffered by an intermediary or origin.")
print("- Even with HTTP/2 (no chunked header), early first byte indicates streaming.")
return 0


if __name__ == "__main__":
raise SystemExit(main())


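An example invocation against a locally running proxy, using the flags defined above (URL and output path are illustrative):

    OPENAI_API_KEY=sk-... python scripts/verify_tts_streaming.py \
        --base-url http://localhost:4000 --output out.mp3

On a streaming response over HTTP/1.1, expect no Content-Length header, Transfer-Encoding: chunked, and a time to first byte well below the total time; a present Content-Length indicates the response was buffered somewhere along the path.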
73 changes: 73 additions & 0 deletions tests/test_litellm/test_tts_deferred_streaming.py
@@ -0,0 +1,73 @@
import asyncio
from types import SimpleNamespace

from litellm.llms.openai.openai import _DeferredOpenAITTSStream


class _FakeHTTPResponse:
def __init__(self, chunks):
self._chunks = chunks

async def aiter_bytes(self, chunk_size: int = 1024):
for c in self._chunks:
await asyncio.sleep(0)
yield c


class _FakeStreamed:
def __init__(self, chunks, enter_counter):
self.http_response = _FakeHTTPResponse(chunks)
self._enter_counter = enter_counter

async def __aenter__(self):
self._enter_counter["count"] += 1
return self

async def __aexit__(self, exc_type, exc, tb):
return False


class _FakeContextFactory:
def __init__(self, chunks, enter_counter):
self._chunks = chunks
self._enter_counter = enter_counter

def __call__(self, **kwargs):
# Return an async context manager compatible object
return _FakeStreamed(self._chunks, self._enter_counter)


def _make_fake_client(chunks, enter_counter):
client = SimpleNamespace()
client.audio = SimpleNamespace()
client.audio.speech = SimpleNamespace()
client.audio.speech.with_streaming_response = SimpleNamespace()
# create(**kwargs) should return an async context manager
client.audio.speech.with_streaming_response.create = _FakeContextFactory(chunks, enter_counter)
return client


def test_deferred_streaming_yields_bytes():
chunks = [b"one", b"two", b"three"]
enter_counter = {"count": 0}
client = _make_fake_client(chunks, enter_counter)
stream = _DeferredOpenAITTSStream(
client=client,
request_kwargs={"model": "x", "voice": "y", "input": "z"},
)

# Ensure stream context not opened until iteration
assert enter_counter["count"] == 0

async def _collect():
out_local = []
async for b in stream.aiter_bytes(chunk_size=2):
out_local.append(b)
return out_local

out = asyncio.run(_collect())
assert out == chunks
# Ensure context was opened exactly once during iteration
assert enter_counter["count"] == 1

33 changes: 33 additions & 0 deletions tests/test_litellm/test_tts_logging_standard_payload.py
@@ -0,0 +1,33 @@
import pytest

from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
from litellm.types.utils import CallTypes

class _DeferredTTSAdapter:
    """Mimics the deferred TTS streaming adapter returned by async_audio_speech."""

    _hidden_params = {}

    async def aiter_bytes(self, chunk_size: int = 1024):
        async def _gen():
            yield b"bytes"

        return _gen()

@pytest.mark.asyncio
async def test_aspeech_logging_builds_standard_payload_for_tts():
logging_obj = LiteLLMLogging(
model="gpt-4o-mini-tts",
messages=[],
stream=False,
litellm_call_id="test-call",
function_id="test-func",
call_type=CallTypes.aspeech.value,
start_time=None,
kwargs={"input": "hello world"},
)

result = _DeferredTTSAdapter()
await logging_obj.async_success_handler(result=result)

assert "standard_logging_object" in logging_obj.model_call_details, (
"standard_logging_object should be built for TTS/aspeech responses"
)
sl = logging_obj.model_call_details["standard_logging_object"]
    assert isinstance(sl, dict), "standard_logging_object should be a dict payload"