Skip to content

Commit

Permalink
Merge pull request #59 from pavelzbornik:dev
Browse files Browse the repository at this point in the history
Add support for distilled and custom models in README and schemas
  • Loading branch information
pavelzbornik authored Jan 10, 2025
2 parents ea87c5b + 4345221 commit 685187a
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 3 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ WhisperX supports these model sizes:
- `small`, `small.en`
- `medium`, `medium.en`
- `large`, `large-v1`, `large-v2`, `large-v3`, `large-v3-turbo`
- Distilled models: `distil-large-v2`, `distil-medium.en`, `distil-small.en`, `distil-large-v3`
- Custom models: [`nyrahealth/faster_CrisperWhisper`](https://github.com/nyrahealth/CrisperWhisper)

Set the default model in `.env` using `WHISPER_MODEL=` (default: `tiny`)

Expand Down
5 changes: 5 additions & 0 deletions app/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,11 @@ class WhisperModel(str, Enum):
large_v2 = "large-v2"
large_v3 = "large-v3"
large_v3_turbo = "large-v3-turbo"
distil_large_v2 = "distil-large-v2"
distil_medium_en = "distil-medium.en"
distil_small_en = "distil-small.en"
distil_large_v3 = "distil-large-v3"
faster_crisper_whisper = "nyrahealth/faster_CrisperWhisper"


class Device(str, Enum):
Expand Down
8 changes: 5 additions & 3 deletions app/whisperx_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,15 @@ def transcribe_with_whisper(

logger.debug(
"Loading model with config - model: %s, device: %s, compute_type: %s, threads: %d, task: %s, language: %s",
model,
model.value,
device,
compute_type,
faster_whisper_threads,
task,
language,
)
model = load_model(
model,
model.value,
device,
device_index=device_index,
compute_type=compute_type,
Expand All @@ -90,7 +90,9 @@ def transcribe_with_whisper(
threads=faster_whisper_threads,
)
logger.debug("Transcription model loaded successfully")
result = model.transcribe(audio=audio, batch_size=batch_size, chunk_size=chunk_size, language=language)
result = model.transcribe(
audio=audio, batch_size=batch_size, chunk_size=chunk_size, language=language
)

# Log GPU memory before cleanup
if torch.cuda.is_available():
Expand Down

0 comments on commit 685187a

Please sign in to comment.