Added Whisper TTS #324

Open · wants to merge 5 commits into `restructure`
13 changes: 13 additions & 0 deletions src/text2speech/Whisper_TTS/Dockerfile
@@ -0,0 +1,13 @@
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime

WORKDIR /app

COPY requirements.txt /app/

RUN pip install --no-cache-dir -r requirements.txt

COPY . /app/

EXPOSE 8000

CMD ["python", "api.py"]
107 changes: 107 additions & 0 deletions src/text2speech/Whisper_TTS/README.md
@@ -0,0 +1,107 @@
This project uses WhisperSpeech to convert text to speech, with an optional voice-cloning mode.
Reference: https://github.com/collabora/WhisperSpeech

## Endpoints

### Text-to-Speech

**Endpoint:** `/tts`
**Method:** `POST`
**Description:** Converts text to speech and returns an audio file.

**Request Body:**
```json
{
  "text": "Your text here",
  "language": "en",
  "cps": 15,
  "use_voice_cloning": false
}
```

- `text`: The text to be converted to speech.
- `language`: The language for the speech output (default: "en").
- `cps`: Characters per second for speech synthesis (default: 10.5).
- `use_voice_cloning`: Optional flag to enable voice cloning (default: false).

**Example Request:**

Basic TTS:
```bash
curl -X POST http://localhost:8000/tts \
  -H "Content-Type: application/json" \
  -d '{
        "text": "Hello, world! This is a test of the text-to-speech system.",
        "language": "en",
        "cps": 15
      }' \
  --output output.wav
```

Text-to-Speech with Voice Cloning:
```bash
curl -X POST http://localhost:8000/tts \
  -H "Content-Type: application/json" \
  -d '{
        "text": "This is a voice cloning test.",
        "language": "en",
        "cps": 15,
        "use_voice_cloning": true
      }' \
  --output cloned_output.wav
```
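
The same request can also be made from Python. Below is a minimal client sketch; it assumes the `requests` package is installed and the server is running locally, neither of which is part of this PR:
```python
import requests

# Hypothetical client: send a TTS request and save the returned WAV file.
payload = {
    "text": "Hello, world! This is a test of the text-to-speech system.",
    "language": "en",
    "cps": 15,
    "use_voice_cloning": False,
}

response = requests.post("http://localhost:8000/tts", json=payload, timeout=300)
response.raise_for_status()

with open("output.wav", "wb") as f:
    f.write(response.content)
```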

**Notes:**

- For voice cloning to work, ensure that `reference_audio.wav` is present in the same directory as the server script.

## Docker Deployment

1. **Build the Docker Image:**

   ```bash
   docker build -t quart-tts-gpu-app .
   ```

2. **Run the Docker Container:**

   ```bash
   docker run --gpus all -p 8000:8000 quart-tts-gpu-app
   ```

## Setting Up Without Docker

1. **Clone the Repository:**

2. **Set Up a Virtual Environment (Optional but recommended):**

   ```bash
   python -m venv venv
   source venv/bin/activate  # On Windows, use `venv\Scripts\activate`
   ```

3. **Install Dependencies:**

   Ensure `requirements.txt` is present in the directory, then run:
   ```bash
   pip install --no-cache-dir -r requirements.txt
   ```

4. **Prepare the Reference Audio (For Voice Cloning):**

   Place `reference_audio.wav` in the same directory as your `api.py` script if you plan to use voice cloning.

5. **Run the Quart Application:**

   ```bash
   python api.py
   ```

   The service will be accessible at `http://localhost:8000`.

6. **Example Output:**

   - Output with cloning:
     https://github.com/user-attachments/assets/fa4bc138-e17a-4e1c-a88a-8dd5f37e0db4

   - Output without cloning:
     https://github.com/user-attachments/assets/1bb4ee27-3730-4cca-85f0-f72629d7c640

47 changes: 47 additions & 0 deletions src/text2speech/Whisper_TTS/api.py
@@ -0,0 +1,47 @@
from quart import Quart, request, send_file
from model import Model
from request import ModelRequest
import logging
import os

app = Quart(__name__)
model = None

logging.basicConfig(level=logging.DEBUG)

@app.before_serving
async def startup():
    global model
    app.logger.info("Initializing model...")
    model = Model()
    app.logger.info("Model initialized successfully")

@app.route('/tts', methods=['POST'])
async def text_to_speech():
    global model
    app.logger.info("Received request for text-to-speech")
    try:
        data = await request.get_json()
        app.logger.debug(f"Received data: {data}")

        use_voice_cloning = data.get('use_voice_cloning', False)
        req = ModelRequest(data)

        if use_voice_cloning:
            app.logger.info("Voice cloning requested")
            reference_audio_path = 'reference_audio.wav'
            if not os.path.exists(reference_audio_path):
                raise FileNotFoundError("Reference audio file not found in the current directory")
            with open(reference_audio_path, 'rb') as voice_cloning_audio:
                result = await model.inference(req, voice_cloning_audio)
        else:
            result = await model.inference(req)

        app.logger.info("Text-to-speech process completed successfully")
        return await send_file(result['output_file'])
    except Exception as e:
        app.logger.error(f"Error in text_to_speech: {str(e)}")
        return {"error": str(e)}, 500

if __name__ == "__main__":
    app.run(host='0.0.0.0', port=8000, debug=True)
37 changes: 37 additions & 0 deletions src/text2speech/Whisper_TTS/model.py
@@ -0,0 +1,37 @@
import torchaudio
from whisperspeech.pipeline import Pipeline
from request import ModelRequest
import uuid

class Model:

    def __init__(self):
        self.pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')

    def generate_uuid(self):
        return uuid.uuid4()

    async def inference(self, request: ModelRequest, voice_cloning_audio=None):
        reference_audio_path = None
        if request.use_voice_cloning and voice_cloning_audio:
            reference_audio_path = 'reference_audio.wav'
        audio = self.text_to_speech(request.text, language=request.language, cps=request.cps, reference_audio_path=reference_audio_path)

        output_file = f"audio/output_{self.generate_uuid()}.wav" if not reference_audio_path else f"audio/output_cloned_{self.generate_uuid()}.wav"
        self.save_audio(audio, output_file)
        return {"output_file": output_file}

    def text_to_speech(self, text, language='en', cps=10.5, reference_audio_path=None):
        if reference_audio_path:
            audio = self.pipe.generate(text, lang=language, cps=cps, speaker=reference_audio_path)
        else:
            audio = self.pipe.generate(text, lang=language, cps=cps)

        audio_cpu = audio.cpu().squeeze()
        if audio_cpu.dim() == 1:
            audio_cpu = audio_cpu.unsqueeze(0)
        return audio_cpu

    def save_audio(self, audio, output_file, sample_rate=24000):
        torchaudio.save(output_file, audio, sample_rate=sample_rate, encoding="PCM_F")
        print(f"Generated audio file: {output_file}")
Binary file added src/text2speech/Whisper_TTS/reference_audio.wav
Binary file not shown.
6 changes: 6 additions & 0 deletions src/text2speech/Whisper_TTS/request.py
@@ -0,0 +1,6 @@
class ModelRequest:
    def __init__(self, data):
        self.text = data.get('text', '')
        self.language = data.get('language', 'en')
        self.cps = data.get('cps', 10.5)
        self.use_voice_cloning = data.get('use_voice_cloning', False)
5 changes: 5 additions & 0 deletions src/text2speech/Whisper_TTS/requirements.txt
@@ -0,0 +1,5 @@
quart
torch
torchaudio
whisperspeech
webdataset