Merge pull request #21 from matatonic/dev
0.13.0
matatonic committed Jun 25, 2024
2 parents 18c73ce + 34bf525 commit 65c03e3
Showing 20 changed files with 491 additions and 160 deletions.
71 changes: 70 additions & 1 deletion .github/workflows/build-docker.yml
@@ -55,6 +55,7 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64

# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
@@ -66,8 +67,9 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64

build-and-push-alt-image:
build-and-push-min-image:
runs-on: ubuntu-latest

permissions:
@@ -113,6 +115,7 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64

# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
@@ -124,4 +127,70 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64

build-and-push-rocm-image:
runs-on: ubuntu-latest

permissions:
contents: read
packages: write

env:
# Set up environment variables for the job
USE_ROCM: 1
DOCKER_REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}-rocm
TAG: ${{ github.sha }}

steps:
- name: Check out code
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
install: true

# Log in to the GitHub Container Registry only when not running on a pull request event
- name: Login to Docker Registry
uses: docker/login-action@v2
with:
registry: ${{ env.DOCKER_REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

# Build and push the Docker image to GHCR for the main branch or specific tags
- name: Build and Push Docker Image
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
build-args: |
USE_ROCM=1
# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
if: startsWith(github.ref, 'refs/tags/')
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
build-args: |
USE_ROCM=1
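
For a local build outside CI, a rough equivalent of the ROCm job above is sketched below; the image tag is illustrative, and `USE_ROCM=1` is the same build argument the workflow passes so the Dockerfile picks up `requirements-rocm.txt`.

```shell
# Sketch: local equivalent of the ROCm CI job (the tag name is illustrative).
# USE_ROCM=1 makes the Dockerfile swap requirements-rocm.txt in as requirements.txt.
docker build --build-arg USE_ROCM=1 -t openedai-speech:rocm .
```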
19 changes: 12 additions & 7 deletions Dockerfile
@@ -1,23 +1,28 @@
FROM python:3.11-slim

RUN apt-get update && \
apt-get install --no-install-recommends -y curl ffmpeg git && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ARG TARGETPLATFORM
RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
ENV PATH="/root/.cargo/bin:${PATH}"
RUN apt-get clean && rm -rf /var/lib/apt/lists/*

WORKDIR /app
RUN mkdir -p voices config

COPY requirements.txt /app/
ARG USE_ROCM
ENV USE_ROCM=${USE_ROCM}

COPY requirements*.txt /app/
RUN if [ "${USE_ROCM}" = "1" ]; then mv /app/requirements-rocm.txt /app/requirements.txt; fi
RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt

COPY speech.py openedai.py say.py *.sh README.md LICENSE /app/
COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /app/config/
COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/

ARG PRELOAD_MODEL
ENV PRELOAD_MODEL=${PRELOAD_MODEL}
ENV TTS_HOME=voices
ENV HF_HOME=voices
ENV OPENEDAI_LOG_LEVEL=INFO
ENV COQUI_TOS_AGREED=1

CMD bash startup.sh
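
The `TARGETPLATFORM` checks above only come into play for cross-platform builds; as a sketch (assuming Docker Buildx and QEMU emulation are set up), an arm64 build that exercises those branches could be run like this, since `--platform` is what populates `TARGETPLATFORM` inside the Dockerfile.

```shell
# Sketch: cross-build the arm64 image locally (assumes buildx + QEMU are configured;
# the tag name is illustrative). --platform sets TARGETPLATFORM, which enables the
# build-essential and rustup steps used to compile wheels on non-amd64 platforms.
docker buildx build --platform linux/arm64 -t openedai-speech:arm64 .
```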
18 changes: 9 additions & 9 deletions Dockerfile.min
@@ -1,20 +1,20 @@
FROM python:3.11-slim

RUN apt-get update && \
apt-get install --no-install-recommends -y curl ffmpeg && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ARG TARGETPLATFORM
RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
ENV PATH="/root/.cargo/bin:${PATH}"
RUN apt-get clean && rm -rf /var/lib/apt/lists/*

WORKDIR /app
RUN mkdir -p voices config

RUN --mount=type=cache,target=/root/.cache/pip pip install piper-tts==1.2.0 pyyaml fastapi uvicorn loguru numpy\<2


COPY speech.py openedai.py say.py *.sh README.md LICENSE /app/
COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /app/config/
COPY requirements*.txt /app/
RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements-min.txt
COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/

ENV TTS_HOME=voices
ENV HF_HOME=voices
ENV OPENEDAI_LOG_LEVEL=INFO

CMD bash startup.min.sh
137 changes: 83 additions & 54 deletions README.md
@@ -19,13 +19,24 @@ Details:
* Model `tts-1-hd` via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, but requires around 4GB GPU VRAM)
* Custom cloned voices can be used for tts-1-hd, See: [Custom Voices Howto](#custom-voices-howto)
* 🌐 [Multilingual](#multilingual) support with XTTS voices
* [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
* Occasionally, certain words or symbols may sound incorrect; you can fix them with regex via `pre_process_map.yaml` (see the sketch below)
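
As a purely hypothetical illustration of such a fix (the real structure is defined by `pre_process_map.default.yaml`, so treat the layout below as an assumption), a regex/replacement pair might look like:

```yaml
# Hypothetical entry, assuming the file holds [regex, replacement] pairs;
# check pre_process_map.default.yaml for the actual structure.
- - '\bTTS\b'
  - 'text to speech'
```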


If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.

## Recent Changes

Version 0.13.0, 2024-06-25

* Added [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
* Initial prebuilt arm64 image support (Apple M-series, Raspberry Pi; MPS is not supported in XTTS/torch), thanks to @JakeStevenson and @hchasens
* Initial attempt at AMD GPU (ROCm 5.7) support
* Parler-tts support removed
* Moved the `*.default.yaml` files to the root folder
* Run the docker image as a service by default (`restart: unless-stopped`)
* Added `audio_reader.py` for streaming text input and reading long texts

Version 0.12.3, 2024-06-17

* Additional logging details for BadRequests (400)
@@ -75,62 +86,68 @@ Version: 0.7.3, 2024-03-20

## Installation instructions

1) Copy the `sample.env` to `speech.env` (customize if needed)
### Create a `speech.env` environment file

Copy the `sample.env` to `speech.env` (customize if needed)
```bash
cp sample.env speech.env
```

2. Option: Docker (**recommended**) (prebuilt images are available)

Run the server:
```shell
docker compose up
```
#### Defaults
```bash
TTS_HOME=voices
HF_HOME=voices
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
#EXTRA_ARGS=--log-level DEBUG
#USE_ROCM=1
```
For a minimal docker image with only piper support (<1GB vs. 8GB), use `docker compose -f docker-compose.min.yml up`

To install the docker image as a service, edit the `docker-compose.yml` and uncomment `restart: unless-stopped`, then start the service with: `docker compose up -d`


2. Option: Manual installation:
### Option A: Manual installation
```shell
# install curl and ffmpeg
sudo apt install curl ffmpeg
# Create & activate a new virtual environment (optional but recommended)
python -m venv .venv
source .venv/bin/activate
# Install the Python requirements
# - use requirements-rocm.txt for AMD GPU (ROCm support)
# - use requirements-min.txt for piper only (CPU only)
pip install -r requirements.txt
# run the server
bash startup.sh
```

> On first run, the voice models will be downloaded automatically. This might take a while depending on your network connection.
## Usage
### Option B: Docker Image (*recommended*)

```
usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST] [-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
#### Nvidia GPU (cuda)

OpenedAI Speech API Server
```shell
docker compose up
```

options:
-h, --help show this help message and exit
--xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
Set the log level (default: INFO)
#### AMD GPU (ROCm support)

```shell
docker compose -f docker-compose.rocm.yml up
```

## API Documentation
#### ARM64 (Apple M-series, Raspberry Pi)

* [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech)
* [OpenAI API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech)
> XTTS only has CPU support here and will be very slow; you can use the Nvidia image for XTTS on CPU (slow), or use the piper-only image (recommended)
#### CPU only, No GPU (piper only)

> This is a minimal docker image with only piper support (<1GB vs. 8GB).
```shell
docker compose -f docker-compose.min.yml up
```


### Sample API Usage
## Sample Usage

You can use it like this:

@@ -147,7 +164,7 @@ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -
Or just like this:

```shell
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
"input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
```

@@ -175,34 +192,25 @@ with client.audio.speech.with_streaming_response.create(
Also see the `say.py` sample application for an example of how to use the openai-python API.

```shell
python say.py -t "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
# play the audio, requires 'pip install playsound'
python say.py -t "The quick brown fox jumped over the lazy dog." -p
# save to a file in flac format
python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac
```

```
usage: say.py [-h] [-m MODEL] [-v VOICE] [-f {mp3,aac,opus,flac}] [-s SPEED] [-t TEXT] [-i INPUT] [-o OUTPUT] [-p]
Text to speech using the OpenAI API
options:
-h, --help show this help message and exit
-m MODEL, --model MODEL
The model to use (default: tts-1)
-v VOICE, --voice VOICE
The voice of the speaker (default: alloy)
-f {mp3,aac,opus,flac}, --format {mp3,aac,opus,flac}
The output audio format (default: mp3)
-s SPEED, --speed SPEED
playback speed, 0.25-4.0 (default: 1.0)
-t TEXT, --text TEXT Provide text to read on the command line (default: None)
-i INPUT, --input INPUT
Read text from a file (default is to read from stdin) (default: None)
-o OUTPUT, --output OUTPUT
The filename to save the output to (default: None)
-p, --playsound Play the audio (default: False)
```
You can also try the included `audio_reader.py` for listening to longer text and streamed input.

Example usage:
```bash
python audio_reader.py -s 2 < LICENSE # read the software license - fast
```

## OpenAI API Documentation and Guide

* [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech)
* [OpenAI API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech)


## Custom Voices Howto

### Piper
@@ -251,13 +259,13 @@ For example:
...
tts-1-hd:
me:
model: xtts_v2.0.2 # you can specify different xtts versions
model: xtts
speaker: voices/me.wav # this could be you
```

## Multilingual

Multilingual support was added in version 0.11.0 and is available only with the XTTS v2 model.
Multilingual cloning support was added in version 0.11.0 and is available only with the XTTS v2 model. To use multilingual voices with piper, simply download a language-specific voice.

Coqui XTTSv2 has support for 16 languages: English (`en`), Spanish (`es`), French (`fr`), German (`de`), Italian (`it`), Portuguese (`pt`), Polish (`pl`), Turkish (`tr`), Russian (`ru`), Dutch (`nl`), Czech (`cs`), Arabic (`ar`), Chinese (`zh-cn`), Japanese (`ja`), Hungarian (`hu`) and Korean (`ko`).
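
As a sketch of what a multilingual voice entry could look like (assuming `voice_to_speaker.yaml` accepts a per-voice `language:` option for XTTS voices; verify the exact key against `voice_to_speaker.default.yaml`), with an invented voice name:

```yaml
# Assumed example only: the voice name is invented and the 'language' key is a guess;
# verify against voice_to_speaker.default.yaml before using.
tts-1-hd:
  maria:
    model: xtts
    speaker: voices/maria.wav # a Portuguese voice sample
    language: pt
```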

@@ -284,3 +292,24 @@ Remove:
These lines were added to the `config/pre_process_map.yaml` config file by default before version 0.11.0:

4) Your new multi-lingual speaker voice is ready to use!


## Custom Fine-Tuned Model Support

Adding a custom xtts model is simple. Here is an example of how to add a custom fine-tuned 'halo' XTTS model.

1) Save the model folder under `voices/` (all 4 files are required, including the vocab.json from the model)
```
openedai-speech$ ls voices/halo/
config.json vocab.json model.pth sample.wav
```
2) Add the custom voice entry under the `tts-1-hd` section of `config/voice_to_speaker.yaml`:
```yaml
tts-1-hd:
...
halo:
model: halo # This name is required to be unique
speaker: voices/halo/sample.wav # voice sample is required
model_path: voices/halo
```
3) The model will be loaded when you access the voice for the first time (`--preload` doesn't work with custom models yet)
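
With the entry above in place, the custom voice can be requested by name like any other voice; for example, with the included `say.py` client (the output filename is arbitrary):

```shell
# Request the custom 'halo' voice configured under tts-1-hd above.
python say.py -t "Hello from the halo model." -m tts-1-hd -v halo -f mp3 -o halo.mp3
```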