11 changes: 11 additions & 0 deletions litellm/litellm_core_utils/litellm_logging.py
@@ -1542,6 +1542,17 @@ def _is_recognized_call_type_for_logging(
"""
Returns True if the call type is recognized for logging (eg. ModelResponse, ModelResponseStream, etc.)
"""
# Ensure Text-to-Speech calls are recognized for logging even if the
# provider returns a lightweight streaming adapter instead of
# HttpxBinaryResponseContent. This guarantees building a
# standard_logging_object for TTS so downstream proxy callbacks can
# track spend and budgets.
try:
if self.call_type in (CallTypes.speech.value, CallTypes.aspeech.value):
return True
except Exception:
        # If call_type is missing for any reason, fall through to the type checks below
pass
if (
isinstance(logging_result, ModelResponse)
or isinstance(logging_result, ModelResponseStream)
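A minimal sketch of the effect, assuming only `CallTypes` from litellm.types.utils (the fake result class is hypothetical): a TTS result that is not one of the recognized response types is still accepted for logging, because the call type alone now qualifies it.

    from litellm.types.utils import CallTypes

    class _FakeStreamingTTSResult:
        # Hypothetical lightweight adapter: not a ModelResponse,
        # ModelResponseStream, or HttpxBinaryResponseContent.
        pass

    def is_recognized(call_type: str, result: object) -> bool:
        # Mirrors the new check: TTS call types short-circuit to True
        # before any isinstance() checks on the result object.
        if call_type in (CallTypes.speech.value, CallTypes.aspeech.value):
            return True
        return False  # isinstance checks on ModelResponse etc. follow here

    assert is_recognized(CallTypes.aspeech.value, _FakeStreamingTTSResult())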
47 changes: 40 additions & 7 deletions litellm/llms/openai/openai.py
@@ -63,6 +63,26 @@
openAIGPT5Config = OpenAIGPT5Config()


class _DeferredOpenAITTSStream:
"""
Opens the OpenAI streaming context only when aiter_bytes() is consumed,
keeping the upstream stream alive while the proxy yields chunks.
"""

def __init__(self, client: AsyncOpenAI, request_kwargs: dict):
self._client = client
self._request_kwargs = request_kwargs
self._hidden_params: dict = {}

async def aiter_bytes(self, chunk_size: int = 1024):
async with self._client.audio.speech.with_streaming_response.create(
**self._request_kwargs
) as streamed:
async for chunk in streamed.http_response.aiter_bytes(
chunk_size=chunk_size
):
yield chunk

class MistralEmbeddingConfig:
"""
Reference: https://docs.mistral.ai/api/#operation/createEmbedding
@@ -1433,6 +1453,8 @@ def audio_speech(
client=client,
)

        # For the sync path, fall back to the simple non-streaming create() call
        # (preserves existing behavior for synchronous speech()). The proxy uses
        # the async path; real streaming is handled in async_audio_speech via the
        # deferred stream.
response = cast(OpenAI, openai_client).audio.speech.create(
model=model,
voice=voice, # type: ignore
@@ -1467,14 +1489,25 @@
),
)

        # Return a deferred streaming object so the proxy can iterate without
        # prematurely closing the upstream stream. This replaces the buffered
        # `await openai_client.audio.speech.create(...)` call that returned
        # HttpxBinaryResponseContent.
        request_kwargs = {
            "model": model,
            "voice": voice,
            "input": input,
            **optional_params,
        }
        deferred = _DeferredOpenAITTSStream(
            client=openai_client, request_kwargs=request_kwargs
        )

        # Adapt to the HttpxBinaryResponseContent interface by exposing
        # aiter_bytes(). `deferred.aiter_bytes(...)` is already an async
        # generator, so it can be returned directly without re-wrapping.
        class _Adapter:
            _hidden_params: dict = {}

            async def aiter_bytes(self, chunk_size: int = 1024):
                return deferred.aiter_bytes(chunk_size=chunk_size)

        return _Adapter()  # type: ignore


class OpenAIFilesAPI(BaseLLM):
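From a caller's perspective, the deferred stream only opens the upstream HTTP request when iteration begins. A minimal sketch, assuming a configured AsyncOpenAI client (OPENAI_API_KEY set) and the import path used by the tests below; the model, voice, and output path are illustrative:

    import asyncio

    from openai import AsyncOpenAI

    from litellm.llms.openai.openai import _DeferredOpenAITTSStream

    async def main():
        stream = _DeferredOpenAITTSStream(
            client=AsyncOpenAI(),
            request_kwargs={
                "model": "gpt-4o-mini-tts",
                "voice": "shimmer",
                "input": "hello world",
            },
        )
        # No request has been sent yet; it opens here, on first iteration,
        # and the context stays open until the last chunk is yielded.
        with open("out.mp3", "wb") as f:
            async for chunk in stream.aiter_bytes(chunk_size=1024):
                f.write(chunk)

    asyncio.run(main())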
11 changes: 11 additions & 0 deletions litellm/utils.py
@@ -780,6 +780,17 @@ def function_setup( # noqa: PLR0915
call_type == CallTypes.aspeech.value or call_type == CallTypes.speech.value
):
messages = kwargs.get("input", "speech")
        # Ensure a metadata dict exists for TTS calls so the Logging object
        # (constructed below from kwargs) can record the input string, which
        # the cost calculator uses for character-based TTS pricing.
        try:
            if isinstance(messages, str):
                kwargs.setdefault("metadata", {})
        except Exception:
            pass
elif (
call_type == CallTypes.aresponses.value
or call_type == CallTypes.responses.value
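The metadata guarantee above exists so the cost calculator can price TTS by input length rather than tokens. A hedged sketch of character-based pricing (the per-character rate below is a placeholder for illustration, not litellm's actual cost map):

    def estimate_tts_cost(input_text: str, price_per_character: float = 1.5e-5) -> float:
        # Hypothetical rate; real rates come from litellm's model cost map.
        # The calculator needs the raw input string, which is why
        # function_setup records it for speech/aspeech calls.
        return len(input_text) * price_per_character

    print(estimate_tts_cost("hello world"))  # 11 characters -> 0.000165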
136 changes: 136 additions & 0 deletions scripts/verify_tts_streaming.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
import argparse
import contextlib
import os
import sys
import time
from typing import Optional

import httpx


def build_url(base_url: str, endpoint_path: str) -> str:
if base_url.endswith("/"):
base_url = base_url[:-1]
if not endpoint_path.startswith("/"):
endpoint_path = "/" + endpoint_path
return base_url + endpoint_path


def main() -> int:
parser = argparse.ArgumentParser(description="Verify TTS streaming via chunked transfer")
parser.add_argument(
"--base-url",
default=os.environ.get("OPENAI_BASE_URL", "http://0.0.0.0:4000"),
help="Base URL for the API (default from OPENAI_BASE_URL or http://0.0.0.0:4000)",
)
parser.add_argument(
"--endpoint-path",
default="/v1/audio/speech",
help="Endpoint path to call (e.g. /v1/audio/speech or /openai/audio/speech)",
)
parser.add_argument(
"--model",
default="gpt-4o-mini-tts",
help="Model name (default: gpt-4o-mini-tts)",
)
parser.add_argument(
"--voice",
default="shimmer",
help="Voice to use (default: shimmer)",
)
parser.add_argument(
"--input",
default=(
"Once upon a time, in a bustling city nestled between rolling hills and a sparkling river, there lived a young inventor named Elara. Elara was known throughout the city for her boundless curiosity and her knack for creating marvelous contraptions from the most ordinary of objects. One day, while exploring the attic of her late grandfather’s house, she stumbled upon a dusty, leather-bound journal filled with cryptic notes and intricate sketches of a mysterious machine. Intrigued, Elara spent days deciphering the journal, piecing together the purpose of the device. It was said to be a portal, capable of bridging worlds and connecting distant realms. Driven by excitement and a sense of adventure, Elara gathered the necessary parts—cogs, wires, crystals, and a peculiar brass key—and began assembling the machine in her workshop. As she tightened the final bolt and inserted the key, the device hummed to life, casting a shimmering blue light across the room. With a deep breath, Elara stepped forward and activated the portal. Instantly, she was enveloped in a whirlwind of colors and sounds, feeling herself transported beyond the boundaries of her world. When the light faded, she found herself standing in a lush, enchanted forest, where trees whispered secrets and fantastical creatures roamed freely. Elara realized she had crossed into a realm of endless possibilities, where her inventions could shape the very fabric of reality. Determined to explore and learn, she set off down a winding path, eager to uncover the wonders and challenges that awaited her in this extraordinary new world. And so began Elara’s greatest adventure, one that would test her ingenuity, courage, and heart, and ultimately reveal the true power of imagination and discovery."
),
help="Text to synthesize",
)
parser.add_argument(
"--response-format",
default="mp3",
help="Audio response format (default: mp3)",
)
parser.add_argument(
"--output",
default=None,
help="Optional path to write audio to (if omitted, data is discarded)",
)
parser.add_argument(
"--http2",
action="store_true",
help="Enable HTTP/2 (default: off). Leave off to see chunked headers in HTTP/1.1",
)
args = parser.parse_args()

api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
print("ERROR: OPENAI_API_KEY is not set in the environment", file=sys.stderr)
return 2

url = build_url(args.base_url, args.endpoint_path)
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
"Accept": "audio/mpeg",
}
json_body = {
"model": args.model,
"input": args.input,
"voice": args.voice,
"response_format": args.response_format,
}

print(f"Requesting: {url}")
print(f"HTTP/2: {'on' if args.http2 else 'off'} (HTTP/1.1 if off)")

# Force HTTP/1.1 by default to make Transfer-Encoding: chunked visible when streaming.
# For HTTP/2, chunked header will not be present even when streaming works.
start_req = time.time()
first_byte_at: Optional[float] = None
total_bytes = 0

with httpx.Client(http2=args.http2, timeout=None) as client:
with client.stream("POST", url, headers=headers, json=json_body) as resp:
status = resp.status_code
# Print key headers that indicate buffering vs streaming
cl = resp.headers.get("content-length")
te = resp.headers.get("transfer-encoding")
server = resp.headers.get("server")
print(f"Status: {status}")
print(f"Content-Type: {resp.headers.get('content-type')}")
print(f"Content-Length: {cl}")
print(f"Transfer-Encoding: {te}")
print(f"Server: {server}")

# Stream body
sink_cm = open(args.output, "wb") if args.output else contextlib.nullcontext()
with sink_cm as sink:
for chunk in resp.iter_bytes():
                    if first_byte_at is None:
first_byte_at = time.time()
print(
f"First byte after {first_byte_at - start_req:.3f}s"
)
total_bytes += len(chunk)
if sink and hasattr(sink, "write"):
sink.write(chunk) # type: ignore

end = time.time()
print(f"Total bytes: {total_bytes}")
print(f"Total time: {end - start_req:.3f}s")
if first_byte_at:
print(f"Time to first byte: {first_byte_at - start_req:.3f}s")

print()
print("Interpretation:")
print("- If Content-Length is absent and Transfer-Encoding is chunked (HTTP/1.1), it streamed.")
print("- If Content-Length is present, the response was buffered by an intermediary or origin.")
print("- Even with HTTP/2 (no chunked header), early first byte indicates streaming.")
return 0


if __name__ == "__main__":
raise SystemExit(main())


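An example invocation against a locally running proxy, using the flags defined above (URL and output path are illustrative):

    OPENAI_API_KEY=sk-... python scripts/verify_tts_streaming.py \
        --base-url http://localhost:4000 --output out.mp3

On a streaming response over HTTP/1.1, expect no Content-Length header, Transfer-Encoding: chunked, and a time to first byte well below the total time; a present Content-Length indicates the response was buffered somewhere along the path.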
73 changes: 73 additions & 0 deletions tests/test_litellm/test_tts_deferred_streaming.py
@@ -0,0 +1,73 @@
import asyncio
from types import SimpleNamespace

from litellm.llms.openai.openai import _DeferredOpenAITTSStream


class _FakeHTTPResponse:
def __init__(self, chunks):
self._chunks = chunks

async def aiter_bytes(self, chunk_size: int = 1024):
for c in self._chunks:
await asyncio.sleep(0)
yield c


class _FakeStreamed:
def __init__(self, chunks, enter_counter):
self.http_response = _FakeHTTPResponse(chunks)
self._enter_counter = enter_counter

async def __aenter__(self):
self._enter_counter["count"] += 1
return self

async def __aexit__(self, exc_type, exc, tb):
return False


class _FakeContextFactory:
def __init__(self, chunks, enter_counter):
self._chunks = chunks
self._enter_counter = enter_counter

def __call__(self, **kwargs):
# Return an async context manager compatible object
return _FakeStreamed(self._chunks, self._enter_counter)


def _make_fake_client(chunks, enter_counter):
client = SimpleNamespace()
client.audio = SimpleNamespace()
client.audio.speech = SimpleNamespace()
client.audio.speech.with_streaming_response = SimpleNamespace()
# create(**kwargs) should return an async context manager
client.audio.speech.with_streaming_response.create = _FakeContextFactory(chunks, enter_counter)
return client


def test_deferred_streaming_yields_bytes():
chunks = [b"one", b"two", b"three"]
enter_counter = {"count": 0}
client = _make_fake_client(chunks, enter_counter)
stream = _DeferredOpenAITTSStream(
client=client,
request_kwargs={"model": "x", "voice": "y", "input": "z"},
)

# Ensure stream context not opened until iteration
assert enter_counter["count"] == 0

async def _collect():
out_local = []
async for b in stream.aiter_bytes(chunk_size=2):
out_local.append(b)
return out_local

out = asyncio.run(_collect())
assert out == chunks
# Ensure context was opened exactly once during iteration
assert enter_counter["count"] == 1

33 changes: 33 additions & 0 deletions tests/test_litellm/test_tts_logging_standard_payload.py
@@ -0,0 +1,33 @@
import pytest

from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
from litellm.types.utils import CallTypes

class _DeferredTTSAdapter:
    """Mimics the deferred TTS streaming adapter returned by async_audio_speech."""

    _hidden_params = {}

    async def aiter_bytes(self, chunk_size: int = 1024):
        async def _gen():
            yield b"bytes"

        return _gen()

@pytest.mark.asyncio
async def test_aspeech_logging_builds_standard_payload_for_tts():
logging_obj = LiteLLMLogging(
model="gpt-4o-mini-tts",
messages=[],
stream=False,
litellm_call_id="test-call",
function_id="test-func",
call_type=CallTypes.aspeech.value,
start_time=None,
kwargs={"input": "hello world"},
)

result = _DeferredTTSAdapter()
await logging_obj.async_success_handler(result=result)

assert "standard_logging_object" in logging_obj.model_call_details, (
"standard_logging_object should be built for TTS/aspeech responses"
)
sl = logging_obj.model_call_details["standard_logging_object"]
    assert isinstance(sl, dict), "standard_logging_object should be a dict payload"