Dynamic trim gap #16

Closed
wants to merge 13 commits
5 changes: 4 additions & 1 deletion api/src/core/config.py
@@ -19,8 +19,11 @@ class Settings(BaseSettings):
voices_dir: str = "voices"
sample_rate: int = 24000
max_chunk_size: int = 300 # Maximum size of text chunks for processing
gap_trim_ms: int = 250 # Amount to trim from streaming chunk ends in milliseconds

gap_trim_ms: int = 25 # Amount to trim from streaming chunk ends in milliseconds
dynamic_gap_trim_padding_ms: int = 410 # Padding to add to dynamic gap trim
dynamic_gap_trim_padding_char_multiplier: dict[str,float] = {".":1,"!":0.9,"?":1,",":0.8}

# ONNX Optimization Settings
onnx_num_threads: int = 4 # Number of threads for intra-op parallelism
onnx_inter_op_threads: int = 4 # Number of threads for inter-op parallelism
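For context, the two new settings are meant to combine as follows: the final punctuation character of a text chunk selects a multiplier, which scales the base padding before it is converted into a sample count. A rough sketch of that calculation (illustrative only, not part of the diff; the helper name and the standalone sample_rate are assumptions):

    # Illustrative only: how dynamic_gap_trim_padding_ms and the per-character
    # multiplier could be turned into a padding length in samples.
    sample_rate = 24000  # matches Settings.sample_rate above

    def end_padding_samples(chunk_text: str, settings) -> int:
        multiplier = 1.0
        stripped = chunk_text.strip()
        if stripped:
            # Look up the multiplier for the chunk's final character, defaulting to 1.0
            multiplier = settings.dynamic_gap_trim_padding_char_multiplier.get(stripped[-1], 1.0)
        return int(settings.dynamic_gap_trim_padding_ms * sample_rate * multiplier / 1000)

    # e.g. a chunk ending in "," with the defaults: 410 ms * 0.8 -> 7872 samples at 24 kHz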
2 changes: 1 addition & 1 deletion api/src/routers/openai_compatible.py
@@ -95,7 +95,7 @@ async def create_speech(
"wav": "audio/wav",
"pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}")

logger.debug(f"Stream requested: {request.stream}")
# Check if streaming is requested (default for OpenAI client)
if request.stream:
# Stream audio chunks as they're generated
83 changes: 68 additions & 15 deletions api/src/services/audio.py
@@ -1,7 +1,9 @@
"""Audio conversion service"""

from io import BytesIO
from pydoc import text

import math
import numpy as np
import scipy.io.wavfile as wavfile
import soundfile as sf
@@ -19,22 +21,66 @@ def __init__(self):
self.sample_rate = 24000 # Sample rate of the audio
self.samples_to_trim = int(self.chunk_trim_ms * self.sample_rate / 1000)

def normalize(
self, audio_data: np.ndarray, is_last_chunk: bool = False
) -> np.ndarray:
"""Convert audio data to int16 range and trim chunk boundaries"""
if len(audio_data) == 0:
raise ValueError("Audio data cannot be empty")
self.samples_to_pad_start= int(50 * self.sample_rate / 1000)

def find_first_last_non_silent(self,audio_data: np.ndarray, chunk:str,speed: float, silence_threshold_db: int = -45,is_last_chunk:bool=False) -> tuple[int, int]:
"""
Finds the indices of the first and last non-silent samples in audio data.
"""


pad_multiplier=1
split_character=chunk.strip()
if len(split_character) > 0:
split_character=split_character[-1]
if split_character in settings.dynamic_gap_trim_padding_char_multiplier:
pad_multiplier=settings.dynamic_gap_trim_padding_char_multiplier[split_character]

if not is_last_chunk:
samples_to_pad_end= max(int((settings.dynamic_gap_trim_padding_ms * self.sample_rate * pad_multiplier) / 1000) - self.samples_to_pad_start, 0)
else:
samples_to_pad_end=self.samples_to_pad_start
# Convert dBFS threshold to amplitude
amplitude_threshold = self.int16_max * (10 ** (silence_threshold_db / 20))

# Find all samples above the silence threshold
non_silent_index_start, non_silent_index_end = None,None

for X in range(0,len(audio_data)):
if audio_data[X] > amplitude_threshold:
non_silent_index_start=X
break

# Scan backwards for the last sample above the silence threshold
for X in range(len(audio_data) - 1, -1, -1):
if audio_data[X] > amplitude_threshold:
non_silent_index_end=X
break

# Handle the case where the entire audio is silent
if non_silent_index_start == None or non_silent_index_end == None:
return 0, len(audio_data)

return max(non_silent_index_start - self.samples_to_pad_start,0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed),len(audio_data))

def normalize(self, audio_data: np.ndarray, chunk:str, speed:float, is_last_chunk: bool = False) -> np.ndarray:
"""Normalize audio data to int16 range and trim chunk boundaries"""
# Convert to float32 if not already

audio_float = audio_data.astype(np.float32)

# Trim for non-final chunks
if not is_last_chunk and len(audio_float) > self.samples_to_trim:

audio_float = audio_float[:-self.samples_to_trim]

# Direct scaling like the non-streaming version
return (audio_float * 32767).astype(np.int16)

audio_int=(audio_float * self.int16_max).astype(np.int16)

start_index,end_index=self.find_first_last_non_silent(audio_int,chunk,speed)


# Trim to the detected non-silent span
return audio_int[start_index:end_index]



class AudioService:
@@ -59,11 +105,15 @@ def convert_audio(
audio_data: np.ndarray,
sample_rate: int,
output_format: str,
speed: float=1,
is_first_chunk: bool = True,
is_last_chunk: bool = False,
normalizer: AudioNormalizer = None,
format_settings: dict = None,
stream: bool = True,

chunk: str = ""

) -> bytes:
"""Convert audio data to specified format

@@ -93,11 +143,14 @@ def convert_audio(

try:
# Always normalize audio to ensure proper amplitude scaling
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = normalizer.normalize(
audio_data, is_last_chunk=is_last_chunk
)

if stream:
if normalizer is None:
normalizer = AudioNormalizer()
normalized_audio = normalizer.normalize(audio_data,chunk,speed, is_last_chunk=is_last_chunk)
else:
normalized_audio = audio_data


if output_format == "pcm":
# Raw 16-bit PCM samples, no header
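As a quick numeric check of the silence threshold used above (not part of the diff): converting -45 dBFS to an int16 amplitude gives 32767 * 10^(-45/20), roughly 184, so samples above that value are treated as non-silent. A minimal usage sketch of the new normalize() signature, using placeholder audio and the import path the new tests use:

    import numpy as np
    from api.src.services.audio import AudioNormalizer  # import path assumed from the new tests

    int16_max = np.iinfo(np.int16).max             # 32767
    threshold = int16_max * (10 ** (-45 / 20))     # ~184.3: samples above this count as non-silent

    # 0.5 s silence, 0.5 s tone, 0.5 s silence at 24 kHz (placeholder audio, not real TTS output)
    sr = 24000
    tone = (0.25 * np.sin(2 * np.pi * 220 * np.arange(sr // 2) / sr)).astype(np.float32)
    audio = np.concatenate([np.zeros(sr // 2, dtype=np.float32), tone, np.zeros(sr // 2, dtype=np.float32)])

    normalizer = AudioNormalizer()
    trimmed = normalizer.normalize(audio, "Hello world.", 1.0, is_last_chunk=False)
    print(len(audio), len(trimmed))  # leading/trailing silence is cut back to the configured padding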
15 changes: 10 additions & 5 deletions api/src/services/tts_service.py
@@ -51,6 +51,8 @@ def _generate_audio_internal(
start_time = time.time()

try:
stream_normalizer = AudioNormalizer()

# Normalize text once at the start
if not text:
raise ValueError("Text is empty after preprocessing")
@@ -71,10 +73,10 @@
if stitch_long_output:
# Preprocess all chunks to phonemes/tokens
chunks_data = []
for chunk in chunker.split_text(text):
for index,chunk in enumerate(chunker.split_text(text)):
try:
phonemes, tokens = TTSModel.process_text(chunk, voice[0])
chunks_data.append((chunk, tokens))
chunks_data.append((chunk, tokens, index))
except Exception as e:
logger.error(
f"Failed to process chunk: '{chunk}'. Error: {str(e)}"
@@ -86,12 +88,13 @@

# Generate audio for all chunks
audio_chunks = []
for chunk, tokens in chunks_data:
for chunk, tokens, chunk_index in chunks_data:
try:
chunk_audio = TTSModel.generate_from_tokens(
tokens, voicepack, speed
)
if chunk_audio is not None:
chunk_audio=stream_normalizer.normalize(chunk_audio,chunk,speed,(chunk_index == len(chunks_data) - 1))
audio_chunks.append(chunk_audio)
else:
logger.error(f"No audio generated for chunk: '{chunk}'")
@@ -113,8 +116,8 @@
else:
# Process single chunk
phonemes, tokens = TTSModel.process_text(text, voice[0])
audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)

chunk_audio = TTSModel.generate_from_tokens(tokens, voicepack, speed)
audio = stream_normalizer.normalize(chunk_audio,text,speed,True)
processing_time = time.time() - start_time
return audio, processing_time

@@ -182,9 +185,11 @@ async def generate_audio_stream(
chunk_audio,
24000,
output_format,
speed,
is_first_chunk=is_first,
normalizer=stream_normalizer,
is_last_chunk=(next_chunk is None), # Last if no next chunk
chunk=current_chunk,
stream=True # Ensure proper streaming format handling
)

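The streaming path above marks the final chunk via is_last_chunk=(next_chunk is None), which implies the text chunks are consumed with a one-step lookahead. A rough sketch of that pattern (illustrative only; the helper name and chunk strings are made up):

    # Illustrative lookahead: yield (current, next) pairs so the caller can tell
    # whether the current chunk is the final one (next is None).
    def with_lookahead(chunks):
        iterator = iter(chunks)
        current = next(iterator, None)
        while current is not None:
            upcoming = next(iterator, None)
            yield current, upcoming
            current = upcoming

    for current_chunk, next_chunk in with_lookahead(["Hello.", "How are you?", "Goodbye!"]):
        is_last = next_chunk is None
        print(current_chunk, "-> last" if is_last else "-> more to come")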
112 changes: 112 additions & 0 deletions api/tests/test_audio_normalizer.py
@@ -0,0 +1,112 @@
"""Tests for AudioNormalizer"""

import numpy as np
import pytest
from api.src.services.audio import AudioNormalizer

@pytest.fixture
def normalizer():
"""Create an AudioNormalizer instance"""
return AudioNormalizer()

@pytest.fixture
def silent_audio():
"""Generate silent audio data"""
return np.zeros(24000, dtype=np.int16) # 1 second of silence

@pytest.fixture
def speech_audio():
"""Generate audio data with speech-like content"""
# Create 1 second of audio with speech in the middle
audio = np.zeros(24000, dtype=np.int16)
# Add speech-like content from 0.25s to 0.75s (leaving silence at start/end)
speech_start = 6000 # 0.25s * 24000
speech_end = 18000 # 0.75s * 24000
# Generate non-zero values for speech section
audio[speech_start:speech_end] = np.random.randint(-32768//2, 32767//2, speech_end-speech_start, dtype=np.int16)
return audio

def test_find_first_last_non_silent_all_silent(normalizer, silent_audio):
"""Test silence detection with completely silent audio"""
start, end = normalizer.find_first_last_non_silent(silent_audio,"",1)
assert start == 0
assert end == len(silent_audio)

def test_find_first_last_non_silent_with_speech(normalizer, speech_audio):
"""Test silence detection with audio containing speech"""
start, end = normalizer.find_first_last_non_silent(speech_audio,"",1)

# Should detect speech section with padding
# Start should be before 0.25s (with 50ms padding)
assert start < 6000
# End should be after 0.75s (with dynamic padding)
assert end > 18000
# But shouldn't extend beyond audio length
assert end <= len(speech_audio)

def test_normalize_streaming_chunks(normalizer):
"""Test normalizing streaming audio chunks"""
# Create three 100ms chunks
chunk_samples = 2400 # 100ms at 24kHz
chunks = []

# First chunk: silence then speech
chunk1 = np.zeros(chunk_samples, dtype=np.float32)
chunk1[1200:] = np.random.random(1200) * 2 - 1 # Speech in second half
chunks.append(chunk1)

# Second chunk: all speech
chunk2 = (np.random.random(chunk_samples) * 2 - 1).astype(np.float32)
chunks.append(chunk2)

# Third chunk: speech then silence
chunk3 = np.zeros(chunk_samples, dtype=np.float32)
chunk3[:1200] = np.random.random(1200) * 2 - 1 # Speech in first half
chunks.append(chunk3)

# Process chunks
results = []
for i, chunk in enumerate(chunks):
is_last = (i == len(chunks) - 1)
normalized = normalizer.normalize(chunk,"",1, is_last_chunk=is_last)
results.append(normalized)

# Verify results
# First chunk should trim silence from start but keep end for continuity
assert len(results[0]) < len(chunk1)
# Middle chunk should be similar length to input
assert abs(len(results[1]) - len(chunk2)) < 100
# Last chunk should trim silence from end
assert len(results[2]) < len(chunk3)

def test_normalize_amplitude(normalizer):
"""Test audio amplitude normalization"""
# Create audio with values outside int16 range
audio = np.random.random(1000) * 1e5

result = normalizer.normalize(audio,"",1)

# Check result is within int16 range
assert np.max(np.abs(result)) <= 32767
assert result.dtype == np.int16

def test_padding_behavior(normalizer, speech_audio):
"""Test start and end padding behavior"""
result = normalizer.normalize(speech_audio,"",1)

# Find actual speech content in result (non-zero values)
non_zero = np.nonzero(result)[0]
first_speech = non_zero[0]
last_speech = non_zero[-1]

# Verify we have some padding before first speech
# Should be close to 50ms (1200 samples at 24kHz)
start_padding = first_speech
assert 0 < start_padding <= 1200

# Verify we have some padding after last speech
# Should be close to dynamic_gap_trim_padding_ms - 50ms
end_padding = len(result) - last_speech - 1
expected_end_padding = int((410 - 50) * 24000 / 1000) # ~8640 samples
padding_tolerance = 100 # Allow some variation
assert abs(end_padding - expected_end_padding) < padding_tolerance