
Commit d64c34a

Author: lijiaping

add chunk_size option;

1 parent: ee99967

File tree

3 files changed: 13 additions, 2 deletions

app/schemas.py

Lines changed: 6 additions & 0 deletions
@@ -296,6 +296,12 @@ class WhsiperModelParams(BaseModel):
     batch_size: int = Field(
         Query(8, description="The preferred batch size for inference")
     )
+    chunk_size: int = Field(
+        Query(
+            20,
+            description="Chunk size for merging VAD segments. Default is 20, reduce this if the chunk is too long.",
+        )
+    )
     compute_type: ComputeType = Field(
         Query("float16", description="Type of computation")
     )
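
Because the field wraps a FastAPI Query, chunk_size becomes an optional query parameter on the service's transcription endpoints. A minimal client sketch follows; the endpoint path, port, and file name are assumptions for illustration, not taken from this commit.

# Hypothetical request exercising the new query parameter; the endpoint
# path, host, and input file are assumed, not part of this commit.
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/speech-to-text",  # assumed endpoint path
        params={"batch_size": 8, "chunk_size": 10},  # reduce chunk_size if chunks run long
        files={"file": f},
    )
print(resp.json())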

app/services.py

Lines changed: 1 addition & 0 deletions
@@ -128,6 +128,7 @@ def process_transcribe(
         vad_options_params.model_dump(),
         model_params.language,
         model_params.batch_size,
+        model_params.chunk_size,
         model_params.model,
         model_params.device,
         model_params.device_index,
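
process_transcribe forwards these values to transcribe_with_whisper positionally, so the new argument must be inserted exactly where chunk_size sits in the updated signature: after batch_size, before model. A self-contained sketch of that ordering constraint, using a hypothetical stub rather than the repo's function:

# Stub mimicking the relevant slice of transcribe_with_whisper's
# updated signature; names and defaults here are illustrative.
def transcribe_stub(language, batch_size=16, chunk_size=20, model="large-v2"):
    return {"language": language, "batch_size": batch_size,
            "chunk_size": chunk_size, "model": model}

# chunk_size is passed between batch_size and model, mirroring the
# positional call added to process_transcribe in this commit.
print(transcribe_stub("en", 8, 10, "large-v2"))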

app/whisperx_services.py

Lines changed: 6 additions & 2 deletions
@@ -32,6 +32,7 @@ def transcribe_with_whisper(
     vad_options,
     language,
     batch_size: int = 16,
+    chunk_size: int = 20,
     model: str = WHISPER_MODEL,
     device: str = device,
     device_index: int = 0,
@@ -44,6 +45,7 @@ def transcribe_with_whisper(
     Args:
         audio (Audio): The audio to transcribe.
         batch_size (int): Batch size for transcription (default 16).
+        chunk_size (int): Chunk size for transcription (default 20).
         model (str): Name of the Whisper model to use.
         device (str): Device to use for PyTorch inference.
         device_index (int): Device index to use for FasterWhisper inference.
@@ -88,7 +90,7 @@ def transcribe_with_whisper(
         threads=faster_whisper_threads,
     )
     logger.debug("Transcription model loaded successfully")
-    result = model.transcribe(audio=audio, batch_size=batch_size, language=language)
+    result = model.transcribe(audio=audio, batch_size=batch_size, chunk_size=chunk_size, language=language)
 
     # Log GPU memory before cleanup
     if torch.cuda.is_available():
@@ -250,10 +252,11 @@ def process_audio_common(params: SpeechToTextProcessingParams, session):
     )
 
     logger.debug(
-        "Transcription parameters - task: %s, language: %s, batch_size: %d, model: %s, device: %s, device_index: %d, compute_type: %s, threads: %d",
+        "Transcription parameters - task: %s, language: %s, batch_size: %d, chunk_size: %d, model: %s, device: %s, device_index: %d, compute_type: %s, threads: %d",
         params.whisper_model_params.task,
         params.whisper_model_params.language,
         params.whisper_model_params.batch_size,
+        params.whisper_model_params.chunk_size,
         params.whisper_model_params.model,
         params.whisper_model_params.device,
         params.whisper_model_params.device_index,
@@ -268,6 +271,7 @@ def process_audio_common(params: SpeechToTextProcessingParams, session):
         vad_options=params.vad_options,
         language=params.whisper_model_params.language,
         batch_size=params.whisper_model_params.batch_size,
+        chunk_size=params.whisper_model_params.chunk_size,
         model=params.whisper_model_params.model,
         device=params.whisper_model_params.device,
         device_index=params.whisper_model_params.device_index,
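
The new keyword ends up in whisperx's own transcribe call, where chunk_size bounds how VAD segments are merged into transcription chunks. A minimal sketch against whisperx directly; the device, model name, and input file are assumptions, not part of this commit.

# Standalone whisperx usage mirroring the updated transcribe call;
# "cuda", "large-v2", and "sample.wav" are assumed for illustration.
import whisperx

device = "cuda"
audio = whisperx.load_audio("sample.wav")
model = whisperx.load_model("large-v2", device, compute_type="float16")

# Smaller chunk_size values produce shorter merged VAD chunks.
result = model.transcribe(audio, batch_size=8, chunk_size=20, language="en")
print(result["segments"])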
