
Commit d64c34a

Author: lijiaping

add chunk_size option;

1 parent: ee99967

File tree

3 files changed: 13 additions, 2 deletions

app/schemas.py

Lines changed: 6 additions & 0 deletions
@@ -296,6 +296,12 @@ class WhsiperModelParams(BaseModel):
     batch_size: int = Field(
         Query(8, description="The preferred batch size for inference")
     )
+    chunk_size: int = Field(
+        Query(
+            20,
+            description="Chunk size for merging VAD segments. Default is 20, reduce this if the chunk is too long.",
+        )
+    )
     compute_type: ComputeType = Field(
         Query("float16", description="Type of computation")
     )
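
Because the field wraps a FastAPI Query, chunk_size becomes an optional query parameter on the service's transcription endpoints. A minimal client sketch follows; the endpoint path, port, and file name are assumptions for illustration, not taken from this commit.

# Hypothetical request exercising the new query parameter; the endpoint
# path, host, and input file are assumed, not part of this commit.
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/speech-to-text",  # assumed endpoint path
        params={"batch_size": 8, "chunk_size": 10},  # reduce chunk_size if chunks run long
        files={"file": f},
    )
print(resp.json())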

app/services.py

Lines changed: 1 addition & 0 deletions
@@ -128,6 +128,7 @@ def process_transcribe(
         vad_options_params.model_dump(),
         model_params.language,
         model_params.batch_size,
+        model_params.chunk_size,
         model_params.model,
         model_params.device,
         model_params.device_index,
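
process_transcribe forwards these values to transcribe_with_whisper positionally, so the new argument must be inserted exactly where chunk_size sits in the updated signature: after batch_size, before model. A self-contained sketch of that ordering constraint, using a hypothetical stub rather than the repo's function:

# Stub mimicking the relevant slice of transcribe_with_whisper's
# updated signature; names and defaults here are illustrative.
def transcribe_stub(language, batch_size=16, chunk_size=20, model="large-v2"):
    return {"language": language, "batch_size": batch_size,
            "chunk_size": chunk_size, "model": model}

# chunk_size is passed between batch_size and model, mirroring the
# positional call added to process_transcribe in this commit.
print(transcribe_stub("en", 8, 10, "large-v2"))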

app/whisperx_services.py

Lines changed: 6 additions & 2 deletions
@@ -32,6 +32,7 @@ def transcribe_with_whisper(
     vad_options,
     language,
     batch_size: int = 16,
+    chunk_size: int = 20,
     model: str = WHISPER_MODEL,
     device: str = device,
     device_index: int = 0,
@@ -44,6 +45,7 @@ def transcribe_with_whisper(
     Args:
         audio (Audio): The audio to transcribe.
         batch_size (int): Batch size for transcription (default 16).
+        chunk_size (int): Chunk size for transcription (default 20).
         model (str): Name of the Whisper model to use.
         device (str): Device to use for PyTorch inference.
         device_index (int): Device index to use for FasterWhisper inference.
@@ -88,7 +90,7 @@ def transcribe_with_whisper(
         threads=faster_whisper_threads,
     )
     logger.debug("Transcription model loaded successfully")
-    result = model.transcribe(audio=audio, batch_size=batch_size, language=language)
+    result = model.transcribe(audio=audio, batch_size=batch_size, chunk_size=chunk_size, language=language)
 
     # Log GPU memory before cleanup
     if torch.cuda.is_available():
@@ -250,10 +252,11 @@ def process_audio_common(params: SpeechToTextProcessingParams, session):
     )
 
     logger.debug(
-        "Transcription parameters - task: %s, language: %s, batch_size: %d, model: %s, device: %s, device_index: %d, compute_type: %s, threads: %d",
+        "Transcription parameters - task: %s, language: %s, batch_size: %d, chunk_size: %d, model: %s, device: %s, device_index: %d, compute_type: %s, threads: %d",
         params.whisper_model_params.task,
         params.whisper_model_params.language,
         params.whisper_model_params.batch_size,
+        params.whisper_model_params.chunk_size,
         params.whisper_model_params.model,
         params.whisper_model_params.device,
         params.whisper_model_params.device_index,
@@ -268,6 +271,7 @@ def process_audio_common(params: SpeechToTextProcessingParams, session):
         vad_options=params.vad_options,
         language=params.whisper_model_params.language,
         batch_size=params.whisper_model_params.batch_size,
+        chunk_size=params.whisper_model_params.chunk_size,
         model=params.whisper_model_params.model,
         device=params.whisper_model_params.device,
         device_index=params.whisper_model_params.device_index,
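
The new keyword ends up in whisperx's own transcribe call, where chunk_size bounds how VAD segments are merged into transcription chunks. A minimal sketch against whisperx directly; the device, model name, and input file are assumptions, not part of this commit.

# Standalone whisperx usage mirroring the updated transcribe call;
# "cuda", "large-v2", and "sample.wav" are assumed for illustration.
import whisperx

device = "cuda"
audio = whisperx.load_audio("sample.wav")
model = whisperx.load_model("large-v2", device, compute_type="float16")

# Smaller chunk_size values produce shorter merged VAD chunks.
result = model.transcribe(audio, batch_size=8, chunk_size=20, language="en")
print(result["segments"])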
