From 7e0b132c04e3561092e68baa4488f6551b43ab75 Mon Sep 17 00:00:00 2001
From: Manuel Fittko
Date: Thu, 25 Sep 2025 12:48:12 +0200
Subject: [PATCH 1/8] Enhance OpenAI audio speech methods to support streaming
 responses using context managers, allowing efficient byte iteration without
 buffering.
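
For reference, a minimal sketch of the OpenAI SDK pattern this change builds
on (the model/voice values are illustrative; with_streaming_response and
iter_bytes are the SDK's documented streaming interface):

    from openai import OpenAI

    client = OpenAI()
    # with_streaming_response defers reading the body, so bytes can be
    # forwarded as the server produces them instead of buffering the whole
    # audio file in memory first.
    with client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts",
        voice="alloy",
        input="hello world",
    ) as response:
        for chunk in response.iter_bytes(chunk_size=1024):
            ...  # forward each chunk to the caller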
---
 litellm/llms/openai/openai.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/litellm/llms/openai/openai.py b/litellm/llms/openai/openai.py
index 3347e5332425..7c36b14da7d1 100644
--- a/litellm/llms/openai/openai.py
+++ b/litellm/llms/openai/openai.py
@@ -1433,13 +1433,14 @@ def audio_speech(
             client=client,
         )
 
-        response = cast(OpenAI, openai_client).audio.speech.create(
+        # Use a streaming response via context manager so callers can aiter_bytes without buffering
+        with cast(OpenAI, openai_client).audio.speech.with_streaming_response.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
-        )
-        return HttpxBinaryResponseContent(response=response.response)
+        ) as streamed:
+            return HttpxBinaryResponseContent(response=streamed.http_response)
 
     async def async_audio_speech(
         self,
@@ -1467,14 +1468,14 @@ async def async_audio_speech(
             ),
         )
 
-        response = await openai_client.audio.speech.create(
+        # Use a streaming response via context manager so callers can aiter_bytes without buffering
+        async with openai_client.audio.speech.with_streaming_response.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
-        )
-
-        return HttpxBinaryResponseContent(response=response.response)
+        ) as streamed:
+            return HttpxBinaryResponseContent(response=streamed.http_response)
 
 
 class OpenAIFilesAPI(BaseLLM):

From f69183aa88b6cd586c118078cfa61504a2d12061 Mon Sep 17 00:00:00 2001
From: Manuel Fittko
Date: Thu, 25 Sep 2025 13:29:04 +0200
Subject: [PATCH 2/8] Implement deferred streaming for OpenAI audio speech
 methods, allowing efficient byte iteration without prematurely closing the
 upstream stream. Enhances the async audio speech path while keeping the
 existing synchronous behavior unchanged.
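
Illustrative consumption from inside an async proxy endpoint (a sketch only:
it assumes a FastAPI/Starlette StreamingResponse, and the litellm.aspeech
arguments are abbreviated):

    import litellm
    from fastapi.responses import StreamingResponse

    result = await litellm.aspeech(
        model="openai/gpt-4o-mini-tts", voice="alloy", input=text
    )
    # aiter_bytes() opens the upstream connection only once iteration
    # starts, so chunks are forwarded as they arrive instead of the whole
    # file being buffered first.
    return StreamingResponse(
        result.aiter_bytes(chunk_size=1024), media_type="audio/mpeg"
    )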
---
 litellm/llms/openai/openai.py | 54 ++++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/litellm/llms/openai/openai.py b/litellm/llms/openai/openai.py
index 7c36b14da7d1..dc66d08166a8 100644
--- a/litellm/llms/openai/openai.py
+++ b/litellm/llms/openai/openai.py
@@ -63,6 +63,26 @@
 openAIGPT5Config = OpenAIGPT5Config()
 
 
+class _DeferredOpenAITTSStream:
+    """
+    Opens the OpenAI streaming context only when aiter_bytes() is consumed,
+    keeping the upstream stream alive while the proxy yields chunks.
+    """
+
+    def __init__(self, client: AsyncOpenAI, request_kwargs: dict):
+        self._client = client
+        self._request_kwargs = request_kwargs
+        self._hidden_params: dict = {}
+
+    async def aiter_bytes(self, chunk_size: int = 1024):
+        async with self._client.audio.speech.with_streaming_response.create(
+            **self._request_kwargs
+        ) as streamed:
+            async for chunk in streamed.http_response.aiter_bytes(
+                chunk_size=chunk_size
+            ):
+                yield chunk
+
 class MistralEmbeddingConfig:
     """
     Reference: https://docs.mistral.ai/api/#operation/createEmbedding
@@ -1433,14 +1453,15 @@ def audio_speech(
             client=client,
         )
 
-        # Use a streaming response via context manager so callers can aiter_bytes without buffering
-        with cast(OpenAI, openai_client).audio.speech.with_streaming_response.create(
+        # For the sync path, fall back to the simple non-streaming create (keeps sync speech() behavior)
+        # The proxy uses the async path; real streaming is handled in async_audio_speech via the deferred stream.
+        response = cast(OpenAI, openai_client).audio.speech.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
-        ) as streamed:
-            return HttpxBinaryResponseContent(response=streamed.http_response)
+        )
+        return HttpxBinaryResponseContent(response=response.response)
 
     async def async_audio_speech(
         self,
@@ -1468,14 +1489,25 @@ async def async_audio_speech(
             ),
         )
 
-        # Use a streaming response via context manager so callers can aiter_bytes without buffering
-        async with openai_client.audio.speech.with_streaming_response.create(
-            model=model,
-            voice=voice,  # type: ignore
-            input=input,
+        # Return a deferred streaming object so the proxy can iterate without prematurely closing upstream
+        request_kwargs = {
+            "model": model,
+            "voice": voice,
+            "input": input,
             **optional_params,
-        ) as streamed:
-            return HttpxBinaryResponseContent(response=streamed.http_response)
+        }
+        deferred = _DeferredOpenAITTSStream(
+            client=openai_client, request_kwargs=request_kwargs
+        )
+        # Adapt to the HttpxBinaryResponseContent interface; delegate directly
+        # so aiter_bytes() returns an async iterator usable with `async for`.
+        class _Adapter:
+            _hidden_params: dict = {}
+
+            def aiter_bytes(self, chunk_size: int = 1024):
+                return deferred.aiter_bytes(chunk_size=chunk_size)
+
+        return _Adapter()  # type: ignore
 
 
 class OpenAIFilesAPI(BaseLLM):

From 2b77d76208c91e7bf84e724b4260f849416601bc Mon Sep 17 00:00:00 2001
From: Manuel Fittko
Date: Thu, 25 Sep 2025 14:26:25 +0200
Subject: [PATCH 3/8] Add verify_tts_streaming.py to test TTS streaming
 behavior (headers, TTFB, bytes)
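
Example run against a local proxy, using the script's own flags (assumes
OPENAI_API_KEY is exported and a proxy is listening on port 4000):

    python scripts/verify_tts_streaming.py \
        --base-url http://0.0.0.0:4000 \
        --endpoint-path /v1/audio/speech \
        --output out.mp3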
---
 scripts/verify_tts_streaming.py | 136 ++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 scripts/verify_tts_streaming.py

diff --git a/scripts/verify_tts_streaming.py b/scripts/verify_tts_streaming.py
new file mode 100644
index 000000000000..01ee07142085
--- /dev/null
+++ b/scripts/verify_tts_streaming.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+import argparse
+import contextlib
+import os
+import sys
+import time
+from typing import Optional
+
+import httpx
+
+
+def build_url(base_url: str, endpoint_path: str) -> str:
+    if base_url.endswith("/"):
+        base_url = base_url[:-1]
+    if not endpoint_path.startswith("/"):
+        endpoint_path = "/" + endpoint_path
+    return base_url + endpoint_path
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Verify TTS streaming via chunked transfer")
+    parser.add_argument(
+        "--base-url",
+        default=os.environ.get("OPENAI_BASE_URL", "http://0.0.0.0:4000"),
+        help="Base URL for the API (default from OPENAI_BASE_URL or http://0.0.0.0:4000)",
+    )
+    parser.add_argument(
+        "--endpoint-path",
+        default="/v1/audio/speech",
+        help="Endpoint path to call (e.g. /v1/audio/speech or /openai/audio/speech)",
+    )
+    parser.add_argument(
+        "--model",
+        default="gpt-4o-mini-tts",
+        help="Model name (default: gpt-4o-mini-tts)",
+    )
+    parser.add_argument(
+        "--voice",
+        default="shimmer",
+        help="Voice to use (default: shimmer)",
+    )
+    parser.add_argument(
+        "--input",
+        default=(
+            "Once upon a time, in a bustling city nestled between rolling hills and a sparkling river, there lived a young inventor named Elara. Elara was known throughout the city for her boundless curiosity and her knack for creating marvelous contraptions from the most ordinary of objects. One day, while exploring the attic of her late grandfather’s house, she stumbled upon a dusty, leather-bound journal filled with cryptic notes and intricate sketches of a mysterious machine. Intrigued, Elara spent days deciphering the journal, piecing together the purpose of the device. It was said to be a portal, capable of bridging worlds and connecting distant realms. Driven by excitement and a sense of adventure, Elara gathered the necessary parts—cogs, wires, crystals, and a peculiar brass key—and began assembling the machine in her workshop. As she tightened the final bolt and inserted the key, the device hummed to life, casting a shimmering blue light across the room. With a deep breath, Elara stepped forward and activated the portal. Instantly, she was enveloped in a whirlwind of colors and sounds, feeling herself transported beyond the boundaries of her world. When the light faded, she found herself standing in a lush, enchanted forest, where trees whispered secrets and fantastical creatures roamed freely. Elara realized she had crossed into a realm of endless possibilities, where her inventions could shape the very fabric of reality. Determined to explore and learn, she set off down a winding path, eager to uncover the wonders and challenges that awaited her in this extraordinary new world. And so began Elara’s greatest adventure, one that would test her ingenuity, courage, and heart, and ultimately reveal the true power of imagination and discovery."
+        ),
+        help="Text to synthesize",
+    )
+    parser.add_argument(
+        "--response-format",
+        default="mp3",
+        help="Audio response format (default: mp3)",
+    )
+    parser.add_argument(
+        "--output",
+        default=None,
+        help="Optional path to write audio to (if omitted, data is discarded)",
+    )
+    parser.add_argument(
+        "--http2",
+        action="store_true",
+        help="Enable HTTP/2 (default: off). Leave off to see chunked headers in HTTP/1.1",
+    )
+    args = parser.parse_args()
+
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        print("ERROR: OPENAI_API_KEY is not set in the environment", file=sys.stderr)
+        return 2
+
+    url = build_url(args.base_url, args.endpoint_path)
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+        "Accept": "audio/mpeg",
+    }
+    json_body = {
+        "model": args.model,
+        "input": args.input,
+        "voice": args.voice,
+        "response_format": args.response_format,
+    }
+
+    print(f"Requesting: {url}")
+    print(f"HTTP/2: {'on' if args.http2 else 'off'} (HTTP/1.1 if off)")
+
+    # Force HTTP/1.1 by default to make Transfer-Encoding: chunked visible when streaming.
+    # With HTTP/2, the chunked header will not be present even when streaming works.
+    start_req = time.time()
+    first_byte_at: Optional[float] = None
+    total_bytes = 0
+
+    with httpx.Client(http2=args.http2, timeout=None) as client:
+        with client.stream("POST", url, headers=headers, json=json_body) as resp:
+            status = resp.status_code
+            # Print key headers that indicate buffering vs streaming
+            cl = resp.headers.get("content-length")
+            te = resp.headers.get("transfer-encoding")
+            server = resp.headers.get("server")
+            print(f"Status: {status}")
+            print(f"Content-Type: {resp.headers.get('content-type')}")
+            print(f"Content-Length: {cl}")
+            print(f"Transfer-Encoding: {te}")
+            print(f"Server: {server}")
+
+            # Stream body
+            sink_cm = open(args.output, "wb") if args.output else contextlib.nullcontext()
+            with sink_cm as sink:
+                for chunk in resp.iter_bytes():
+                    if first_byte_at is None:
+                        first_byte_at = time.time()
+                        print(
+                            f"First byte after {first_byte_at - start_req:.3f}s"
+                        )
+                    total_bytes += len(chunk)
+                    if sink and hasattr(sink, "write"):
+                        sink.write(chunk)  # type: ignore
+
+    end = time.time()
+    print(f"Total bytes: {total_bytes}")
+    print(f"Total time: {end - start_req:.3f}s")
+    if first_byte_at:
+        print(f"Time to first byte: {first_byte_at - start_req:.3f}s")
+
+    print()
+    print("Interpretation:")
+    print("- If Content-Length is absent and Transfer-Encoding is chunked (HTTP/1.1), it streamed.")
+    print("- If Content-Length is present, the response was buffered by an intermediary or origin.")
+    print("- Even with HTTP/2 (no chunked header), an early first byte indicates streaming.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+
+

From 10ba952756fa8d152aa1e546d14abf1fb8a6d1f2 Mon Sep 17 00:00:00 2001
From: Manuel Fittko
Date: Thu, 25 Sep 2025 14:29:54 +0200
Subject: [PATCH 4/8] test: add minimal deferred TTS streaming unit test
 (skipped if async plugin missing)

---
 tests/litellm/test_tts_deferred_streaming.py | 63 ++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 tests/litellm/test_tts_deferred_streaming.py

diff --git a/tests/litellm/test_tts_deferred_streaming.py b/tests/litellm/test_tts_deferred_streaming.py
new file mode 100644
index 000000000000..7625d801d1ee
--- /dev/null
+++ b/tests/litellm/test_tts_deferred_streaming.py
@@ -0,0 +1,63 @@
+import asyncio
+
+import pytest
+
+from litellm.llms.openai.openai import _DeferredOpenAITTSStream
+
+
+class _FakeHTTPResponse:
+    def __init__(self, chunks):
+        self._chunks = chunks
+
+    async def aiter_bytes(self, chunk_size: int = 1024):
+        for c in self._chunks:
+            await asyncio.sleep(0)
+            yield c
+
+
+class _FakeStreamed:
+    def __init__(self, chunks):
+        self.http_response = _FakeHTTPResponse(chunks)
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb):
+        return False
+
+
+class _FakeContextFactory:
+    def __init__(self, chunks):
+        self._chunks = chunks
+
+    def __call__(self, **kwargs):
+        # Return an async context manager compatible object
+        return _FakeStreamed(self._chunks)
+
+
+class _FakeClientNS:
+    pass
+
+
+def _make_fake_client(chunks):
+    client = _FakeClientNS()
+    client.audio = _FakeClientNS()
+    client.audio.speech = _FakeClientNS()
+    client.audio.speech.with_streaming_response = _FakeClientNS()
+    # create(**kwargs) should return an async context manager
+    client.audio.speech.with_streaming_response.create = _FakeContextFactory(chunks)
+    return client
+
+
+@pytest.mark.asyncio
+async def test_deferred_streaming_yields_bytes():
+    chunks = [b"one", b"two", b"three"]
+    client = _make_fake_client(chunks)
+    stream = _DeferredOpenAITTSStream(client=client, request_kwargs={"model": "x", "voice": "y", "input": "z"})
+
+    out = []
+    async for b in stream.aiter_bytes(chunk_size=2):
+        out.append(b)
+
+    assert out == chunks
+

From 517cf69fa8aed1ca66a5eefc6823c2ad718b6a19 Mon Sep 17 00:00:00 2001
From: Manuel Fittko
Date: Thu, 25 Sep 2025 15:17:07 +0200
Subject: [PATCH 5/8] refactor: replace custom fake client class with
 SimpleNamespace for cleaner test setup in TTS deferred streaming tests

---
 tests/litellm/test_tts_deferred_streaming.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/litellm/test_tts_deferred_streaming.py b/tests/litellm/test_tts_deferred_streaming.py
index 7625d801d1ee..3e4a8e78ce28 100644
--- a/tests/litellm/test_tts_deferred_streaming.py
+++ b/tests/litellm/test_tts_deferred_streaming.py
@@ -1,6 +1,7 @@
 import asyncio
 
 import pytest
+from types import SimpleNamespace
 
 from litellm.llms.openai.openai import _DeferredOpenAITTSStream
 
@@ -35,15 +36,11 @@ def __call__(self, **kwargs):
         return _FakeStreamed(self._chunks)
 
 
-class _FakeClientNS:
-    pass
-
-
 def _make_fake_client(chunks):
-    client = _FakeClientNS()
-    client.audio = _FakeClientNS()
-    client.audio.speech = _FakeClientNS()
-    client.audio.speech.with_streaming_response = _FakeClientNS()
+    client = SimpleNamespace()
+    client.audio = SimpleNamespace()
+    client.audio.speech = SimpleNamespace()
+    client.audio.speech.with_streaming_response = SimpleNamespace()
     # create(**kwargs) should return an async context manager
     client.audio.speech.with_streaming_response.create = _FakeContextFactory(chunks)
     return client

From 38c0c58632d0c6f3a7e6ebee758664a7de39818d Mon Sep 17 00:00:00 2001
From: Manuel Fittko
Date: Thu, 25 Sep 2025 15:24:41 +0200
Subject: [PATCH 6/8] test: enhance deferred TTS streaming test to verify
 context manager behavior and ensure proper streaming iteration

---
 tests/litellm/test_tts_deferred_streaming.py | 43 +++++++++++++------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/tests/litellm/test_tts_deferred_streaming.py b/tests/litellm/test_tts_deferred_streaming.py
index 3e4a8e78ce28..a21c3e5b54f2 100644
--- a/tests/litellm/test_tts_deferred_streaming.py
+++ b/tests/litellm/test_tts_deferred_streaming.py
@@ -1,6 +1,5 @@
 import asyncio
 
-import pytest
 from types import SimpleNamespace
 
 from litellm.llms.openai.openai import _DeferredOpenAITTSStream
@@ -17,10 +16,12 @@ async def aiter_bytes(self, chunk_size: int = 1024):
 
 
 class _FakeStreamed:
-    def __init__(self, chunks):
+    def __init__(self, chunks, enter_counter):
         self.http_response = _FakeHTTPResponse(chunks)
+        self._enter_counter = enter_counter
 
     async def __aenter__(self):
+        self._enter_counter["count"] += 1
         return self
 
     async def __aexit__(self, exc_type, exc, tb):
@@ -28,33 +29,45 @@ async def __aexit__(self, exc_type, exc, tb):
 
 
 class _FakeContextFactory:
-    def __init__(self, chunks):
+    def __init__(self, chunks, enter_counter):
         self._chunks = chunks
+        self._enter_counter = enter_counter
 
     def __call__(self, **kwargs):
         # Return an async context manager compatible object
-        return _FakeStreamed(self._chunks)
+        return _FakeStreamed(self._chunks, self._enter_counter)
 
 
-def _make_fake_client(chunks):
+def _make_fake_client(chunks, enter_counter):
     client = SimpleNamespace()
     client.audio = SimpleNamespace()
     client.audio.speech = SimpleNamespace()
     client.audio.speech.with_streaming_response = SimpleNamespace()
     # create(**kwargs) should return an async context manager
-    client.audio.speech.with_streaming_response.create = _FakeContextFactory(chunks)
+    client.audio.speech.with_streaming_response.create = _FakeContextFactory(chunks, enter_counter)
     return client
 
 
-@pytest.mark.asyncio
-async def test_deferred_streaming_yields_bytes():
+def test_deferred_streaming_yields_bytes():
     chunks = [b"one", b"two", b"three"]
-    client = _make_fake_client(chunks)
-    stream = _DeferredOpenAITTSStream(client=client, request_kwargs={"model": "x", "voice": "y", "input": "z"})
-
-    out = []
-    async for b in stream.aiter_bytes(chunk_size=2):
-        out.append(b)
-
+    enter_counter = {"count": 0}
+    client = _make_fake_client(chunks, enter_counter)
+    stream = _DeferredOpenAITTSStream(
+        client=client,
+        request_kwargs={"model": "x", "voice": "y", "input": "z"},
+    )
+
+    # Ensure the stream context is not opened until iteration
+    assert enter_counter["count"] == 0
+
+    async def _collect():
+        out_local = []
+        async for b in stream.aiter_bytes(chunk_size=2):
+            out_local.append(b)
+        return out_local
+
+    out = asyncio.run(_collect())
     assert out == chunks
+    # Ensure the context was opened exactly once during iteration
+    assert enter_counter["count"] == 1
 

From 23af7503c4fae2e92fe4d8288439c23c47f754c4 Mon Sep 17 00:00:00 2001
From: Manuel Fittko
Date: Fri, 26 Sep 2025 08:54:13 +0200
Subject: [PATCH 7/8] move test file to where it belongs

---
 tests/{litellm => test_litellm}/test_tts_deferred_streaming.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{litellm => test_litellm}/test_tts_deferred_streaming.py (100%)

diff --git a/tests/litellm/test_tts_deferred_streaming.py b/tests/test_litellm/test_tts_deferred_streaming.py
similarity index 100%
rename from tests/litellm/test_tts_deferred_streaming.py
rename to tests/test_litellm/test_tts_deferred_streaming.py

From 4d71977723865c98165b95c857cf79d73a014201 Mon Sep 17 00:00:00 2001
From: Manuel Fittko
Date: Fri, 26 Sep 2025 12:29:55 +0200
Subject: [PATCH 8/8] feat(logging): enhance TTS logging to ensure standard
 payload construction for speech calls and add unit tests for verification

---
 litellm/litellm_core_utils/litellm_logging.py | 11 +++++++
 litellm/utils.py                              | 11 +++++++
 .../test_tts_logging_standard_payload.py      | 33 +++++++++++++++++++
 3 files changed, 55 insertions(+)
 create mode 100644 tests/test_litellm/test_tts_logging_standard_payload.py

diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py
index 64986970d001..1fa63ccdd2b6 100644
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@@ -1542,6 +1542,17 @@ def _is_recognized_call_type_for_logging(
         """
         Returns True if the call type is recognized for logging (eg. ModelResponse, ModelResponseStream, etc.)
         """
+        # Ensure Text-to-Speech calls are recognized for logging even if the
+        # provider returns a lightweight streaming adapter instead of
+        # HttpxBinaryResponseContent. This guarantees that a
+        # standard_logging_object is built for TTS so downstream proxy
+        # callbacks can track spend and budgets.
+        try:
+            if self.call_type in (CallTypes.speech.value, CallTypes.aspeech.value):
+                return True
+        except Exception:
+            # If call_type is missing for any reason, fall through to the type checks
+            pass
         if (
             isinstance(logging_result, ModelResponse)
             or isinstance(logging_result, ModelResponseStream)
diff --git a/litellm/utils.py b/litellm/utils.py
index 0721b023d2bb..e4e7a2c02dc6 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -780,6 +780,17 @@ def function_setup(  # noqa: PLR0915
             call_type == CallTypes.aspeech.value or call_type == CallTypes.speech.value
         ):
             messages = kwargs.get("input", "speech")
+            # Ensure the TTS input is recorded on the logging object for
+            # character-based pricing in the cost calculator.
+            try:
+                if isinstance(messages, str):
+                    # Guarantee a metadata dict exists; the Logging object
+                    # constructed below reads it from kwargs when populating
+                    # model_call_details.
+                    kwargs.setdefault("metadata", {})
+            except Exception:
+                # Never let logging setup break the underlying call
+                pass
         elif (
             call_type == CallTypes.aresponses.value
             or call_type == CallTypes.responses.value
diff --git a/tests/test_litellm/test_tts_logging_standard_payload.py b/tests/test_litellm/test_tts_logging_standard_payload.py
new file mode 100644
index 000000000000..5af338f65ef1
--- /dev/null
+++ b/tests/test_litellm/test_tts_logging_standard_payload.py
@@ -0,0 +1,33 @@
+import pytest
+
+from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
+from litellm.types.utils import CallTypes
+
+class _DeferredTTSAdapter:
+    _hidden_params = {}
+    def aiter_bytes(self, chunk_size: int = 1024):
+        async def _gen():
+            yield b"bytes"
+        return _gen()
+
+@pytest.mark.asyncio
+async def test_aspeech_logging_builds_standard_payload_for_tts():
+    logging_obj = LiteLLMLogging(
+        model="gpt-4o-mini-tts",
+        messages=[],
+        stream=False,
+        litellm_call_id="test-call",
+        function_id="test-func",
+        call_type=CallTypes.aspeech.value,
+        start_time=None,
+        kwargs={"input": "hello world"},
+    )
+
+    result = _DeferredTTSAdapter()
+    await logging_obj.async_success_handler(result=result)
+
+    assert "standard_logging_object" in logging_obj.model_call_details, (
+        "standard_logging_object should be built for TTS/aspeech responses"
+    )
+    sl = logging_obj.model_call_details["standard_logging_object"]
+    assert sl is None or isinstance(sl, dict)