Dynamic trim gap #16

Closed
wants to merge 13 commits
5 changes: 4 additions & 1 deletion api/src/core/config.py
@@ -19,8 +19,11 @@ class Settings(BaseSettings):
voices_dir: str = "voices"
sample_rate: int = 24000
max_chunk_size: int = 300 # Maximum size of text chunks for processing
gap_trim_ms: int = 250 # Amount to trim from streaming chunk ends in milliseconds

gap_trim_ms: int = 25 # Amount to trim from streaming chunk ends in milliseconds
dynamic_gap_trim_padding_ms: int = 410 # Padding to add to dynamic gap trim
dynamic_gap_trim_padding_char_multiplier: dict[str,float] = {".":1,"!":0.9,"?":1,",":0.8}

# ONNX Optimization Settings
onnx_num_threads: int = 4 # Number of threads for intra-op parallelism
onnx_inter_op_threads: int = 4 # Number of threads for inter-op parallelism
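For context, the two new settings are meant to combine as follows: the final punctuation character of a text chunk selects a multiplier, which scales the base padding before it is converted into a sample count. A rough sketch of that calculation (illustrative only, not part of the diff; the helper name and the standalone sample_rate are assumptions):

    # Illustrative only: how dynamic_gap_trim_padding_ms and the per-character
    # multiplier could be turned into a padding length in samples.
    sample_rate = 24000  # matches Settings.sample_rate above

    def end_padding_samples(chunk_text: str, settings) -> int:
        multiplier = 1.0
        stripped = chunk_text.strip()
        if stripped:
            # Look up the multiplier for the chunk's final character, defaulting to 1.0
            multiplier = settings.dynamic_gap_trim_padding_char_multiplier.get(stripped[-1], 1.0)
        return int(settings.dynamic_gap_trim_padding_ms * sample_rate * multiplier / 1000)

    # e.g. a chunk ending in "," with the defaults: 410 ms * 0.8 -> 7872 samples at 24 kHz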
2 changes: 1 addition & 1 deletion api/src/routers/openai_compatible.py
@@ -95,7 +95,7 @@ async def create_speech(
"wav": "audio/wav",
"pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}")

logger.debug(f"Stream requested: {request.stream}")
# Check if streaming is requested (default for OpenAI client)
if request.stream:
# Stream audio chunks as they're generated
83 changes: 68 additions & 15 deletions api/src/services/audio.py
@@ -1,7 +1,9 @@
"""Audio conversion service"""

from io import BytesIO
from pydoc import text

import math
import numpy as np
import scipy.io.wavfile as wavfile
import soundfile as sf
@@ -19,22 +21,66 @@ def __init__(self):
self.sample_rate = 24000 # Sample rate of the audio
self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

def normalize(
self, audio_data: np.ndarray, is_last_chunk: bool = False
) -> np.ndarray:
"""Convert audio data to int16 range and trim chunk boundaries"""
if len(audio_data) == 0:
raise ValueError("Audio data cannot be empty")
self.samples_to_pad_start= int(50 * self.sample_rate / 1000)

def find_first_last_non_silent(self,audio_data: np.ndarray, chunk:str,speed: float, silence_threshold_db: int = -45,is_last_chunk:bool=False) -> tuple[int, int]:
"""
Finds the indices of the first and last non-silent samples in audio data.
"""


pad_multiplier=1
split_character=chunk.strip()
if len(split_character) > 0:
split_character=split_character[-1]
if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
pad_multiplier=settings.dynamic_gap_trim_padding_char_multiplier[split_character]

if not is_last_chunk:
samples_to_pad_end= max(int((settings.dynamic_gap_trim_padding_ms * self.sample_rate * pad_multiplier) / 1000) - self.samples_to_pad_start, 0)
else:
samples_to_pad_end=self.samples_to_pad_start
# Convert dBFS threshold to amplitude
amplitude_threshold = self.int16_max * (10 ** (silence_threshold_db / 20))

# Find all samples above the silence threshold
non_silent_index_start, non_silent_index_end = None,None

for X in range(0,len(audio_data)):
if audio_data[X] > amplitude_threshold:
non_silent_index_start=X
break

# Scan backwards for the last sample above the silence threshold
for X in range(len(audio_data) - 1, -1, -1):
if audio_data[X] > amplitude_threshold:
non_silent_index_end=X
break

# Handle the case where the entire audio is silent
if non_silent_index_start == None or non_silent_index_end == None:
return 0, len(audio_data)

return max(non_silent_index_start - self.samples_to_pad_start,0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed),len(audio_data))

def normalize(self, audio_data: np.ndarray, chunk:str, speed:float, is_last_chunk: bool = False) -> np.ndarray:
"""Normalize audio data to int16 range and trim chunk boundaries"""
# Convert to float32 if not already

audio_float = audio_data.astype(np.float32)

# Trim for non-final chunks
if not is_last_chunk and len(audio_float) > self.samples_to_trim:

audio_float = audio_float[:-self.samples_to_trim]

# Direct scaling like the non-streaming version
return (audio_float * 32767).astype(np.int16)

audio_int=(audio_float * self.int16_max).astype(np.int16)

start_index,end_index=self.find_first_last_non_silent(audio_int,chunk,speed)


# Trim to the detected non-silent span
return audio_int[start_index:end_index]



class AudioService:
@@ -59,11 +105,15 @@ def convert_audio(
audio_data: np.ndarray,
sample_rate: int,
output_format: str,
speed: float=1,
is_first_chunk: bool = True,
is_last_chunk: bool = False,
normalizer: AudioNormalizer = None,
format_settings: dict = None,
stream: bool = True,

chunk: str = ""

) -> bytes:
"""Convert audio data to specified format

@@ -93,11 +143,14 @@ def convert_audio(

try:
# Always normalize audio to ensure proper amplitude scaling
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = normalizer.normalize(
audio_data, is_last_chunk=is_last_chunk
)

if stream:
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = normalizer.normalize(audio_data,chunk,speed, is_last_chunk=is_last_chunk)
else:
normalized_audio = audio_data


if output_format == "pcm":
# Raw 16-bit PCM samples, no header
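As a quick numeric check of the silence threshold used above (not part of the diff): converting -45 dBFS to an int16 amplitude gives 32767 * 10^(-45/20), roughly 184, so samples above that value are treated as non-silent. A minimal usage sketch of the new normalize() signature, using placeholder audio and the import path the new tests use:

    import numpy as np
    from api.src.services.audio import AudioNormalizer  # import path assumed from the new tests

    int16_max = np.iinfo(np.int16).max             # 32767
    threshold = int16_max * (10 ** (-45 / 20))     # ~184.3: samples above this count as non-silent

    # 0.5 s silence, 0.5 s tone, 0.5 s silence at 24 kHz (placeholder audio, not real TTS output)
    sr = 24000
    tone = (0.25 * np.sin(2 * np.pi * 220 * np.arange(sr // 2) / sr)).astype(np.float32)
    audio = np.concatenate([np.zeros(sr // 2, dtype=np.float32), tone, np.zeros(sr // 2, dtype=np.float32)])

    normalizer = AudioNormalizer()
    trimmed = normalizer.normalize(audio, "Hello world.", 1.0, is_last_chunk=False)
    print(len(audio), len(trimmed))  # leading/trailing silence is cut back to the configured padding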
15 changes: 10 additions & 5 deletions api/src/services/tts_service.py
@@ -51,6 +51,8 @@ def _generate_audio_internal(
start_time = time.time()

try:
stream_normalizer = AudioNormalizer()

# Normalize text once at the start
if not text:
raise ValueError("Text is empty after preprocessing")
@@ -71,10 +73,10 @@
if stitch_long_output:
# Preprocess all chunks to phonemes/tokens
chunks_data = []
for chunk in chunker.split_text(text):
for index,chunk in enumerate(chunker.split_text(text)):
try:
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
chunks_data.append((chunk, tokens))
chunks_data.append((chunk, tokens, index))
except Exception as e:
logger.error(
f"Failed to process chunk: '{chunk}'. Error: {str(e)}"
@@ -86,12 +88,13 @@

# Generate audio for all chunks
audio_chunks = []
for chunk, tokens in chunks_data:
for chunk, tokens, chunk_index in chunks_data:
try:
chunk_audio = TTSModel.generate_from_tokens(
tokens, voicepack, speed
)
if chunk_audio is not None:
chunk_audio=stream_normalizer.normalize(chunk_audio,chunk,speed,(chunk_index == len(chunks_data) - 1))
audio_chunks.append(chunk_audio)
else:
logger.error(f"No audio generated for chunk: '{chunk}'")
@@ -113,8 +116,8 @@
else:
# Process single chunk
phonemes, tokens = TTSModel.process_text(text, voice[0])
audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)

chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
audio = stream_normalizer.normalize(chunk_audio,text,speed,True)
processing_time = time.time() - start_time
return audio, processing_time

@@ -182,9 +185,11 @@ async def generate_audio_stream(
chunk_audio,
24000,
output_format,
speed,
is_first_chunk=is_first,
normalizer=stream_normalizer,
is_last_chunk=(next_chunk is None), # Last if no next chunk
chunk=current_chunk,
stream=True # Ensure proper streaming format handling
)

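The streaming path above marks the final chunk via is_last_chunk=(next_chunk is None), which implies the text chunks are consumed with a one-step lookahead. A rough sketch of that pattern (illustrative only; the helper name and chunk strings are made up):

    # Illustrative lookahead: yield (current, next) pairs so the caller can tell
    # whether the current chunk is the final one (next is None).
    def with_lookahead(chunks):
        iterator = iter(chunks)
        current = next(iterator, None)
        while current is not None:
            upcoming = next(iterator, None)
            yield current, upcoming
            current = upcoming

    for current_chunk, next_chunk in with_lookahead(["Hello.", "How are you?", "Goodbye!"]):
        is_last = next_chunk is None
        print(current_chunk, "-> last" if is_last else "-> more to come")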
112 changes: 112 additions & 0 deletions api/tests/test_audio_normalizer.py
@@ -0,0 +1,112 @@
"""Tests for AudioNormalizer"""

import numpy as np
import pytest
from api.src.services.audio import AudioNormalizer

@pytest.fixture
def normalizer():
"""Create an AudioNormalizer instance"""
return AudioNormalizer()

@pytest.fixture
def silent_audio():
"""Generate silent audio data"""
return np.zeros(24000, dtype=np.int16) # 1 second of silence

@pytest.fixture
def speech_audio():
"""Generate audio data with speech-like content"""
# Create 1 second of audio with speech in the middle
audio = np.zeros(24000, dtype=np.int16)
# Add speech-like content from 0.25s to 0.75s (leaving silence at start/end)
speech_start = 6000 # 0.25s * 24000
speech_end = 18000 # 0.75s * 24000
# Generate non-zero values for speech section
audio[speech_start:speech_end] = np.random.randint(-32768//2, 32767//2, speech_end-speech_start, dtype=np.int16)
return audio

def test_find_first_last_non_silent_all_silent(normalizer, silent_audio):
"""Test silence detection with completely silent audio"""
start, end = normalizer.find_first_last_non_silent(silent_audio,"",1)
assert start == 0
assert end == len(silent_audio)

def test_find_first_last_non_silent_with_speech(normalizer, speech_audio):
"""Test silence detection with audio containing speech"""
start, end = normalizer.find_first_last_non_silent(speech_audio,"",1)

# Should detect speech section with padding
# Start should be before 0.25s (with 50ms padding)
assert start < 6000
# End should be after 0.75s (with dynamic padding)
assert end > 18000
# But shouldn't extend beyond audio length
assert end <= len(speech_audio)

def test_normalize_streaming_chunks(normalizer):
"""Test normalizing streaming audio chunks"""
# Create three 100ms chunks
chunk_samples = 2400 # 100ms at 24kHz
chunks = []

# First chunk: silence then speech
chunk1 = np.zeros(chunk_samples, dtype=np.float32)
chunk1[1200:] = np.random.random(1200) * 2 - 1 # Speech in second half
chunks.append(chunk1)

# Second chunk: all speech
chunk2 = (np.random.random(chunk_samples) * 2 - 1).astype(np.float32)
chunks.append(chunk2)

# Third chunk: speech then silence
chunk3 = np.zeros(chunk_samples, dtype=np.float32)
chunk3[:1200] = np.random.random(1200) * 2 - 1 # Speech in first half
chunks.append(chunk3)

# Process chunks
results = []
for i, chunk in enumerate(chunks):
is_last = (i == len(chunks) - 1)
normalized = normalizer.normalize(chunk,"",1, is_last_chunk=is_last)
results.append(normalized)

# Verify results
# First chunk should trim silence from start but keep end for continuity
assert len(results[0]) < len(chunk1)
# Middle chunk should be similar length to input
assert abs(len(results[1]) - len(chunk2)) < 100
# Last chunk should trim silence from end
assert len(results[2]) < len(chunk3)

def test_normalize_amplitude(normalizer):
"""Test audio amplitude normalization"""
# Create audio with values outside int16 range
audio = np.random.random(1000) * 1e5

result = normalizer.normalize(audio,"",1)

# Check result is within int16 range
assert np.max(np.abs(result)) <= 32767
assert result.dtype == np.int16

def test_padding_behavior(normalizer, speech_audio):
"""Test start and end padding behavior"""
result = normalizer.normalize(speech_audio,"",1)

# Find actual speech content in result (non-zero values)
non_zero = np.nonzero(result)[0]
first_speech = non_zero[0]
last_speech = non_zero[-1]

# Verify we have some padding before first speech
# Should be close to 50ms (1200 samples at 24kHz)
start_padding = first_speech
assert 0 < start_padding <= 1200

# Verify we have some padding after last speech
# Should be close to dynamic_gap_trim_padding_ms - 50ms
end_padding = len(result) - last_speech - 1
expected_end_padding = int((410 - 50) * 24000 / 1000) # ~8640 samples
padding_tolerance = 100 # Allow some variation
assert abs(end_padding - expected_end_padding) < padding_tolerance