-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from rmcpantoja/piper
Piper implementation
- Loading branch information
Showing
13 changed files
with
532 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Builds the 32-bit Windows distribution of VeTube with PyInstaller and,
# for tag pushes, attaches the resulting zip to a GitHub release.
name: VeTube-x86

on:
  push:
    tags: ["*"]
    branches: [ master , piper ]
  pull_request:
    branches: [ master , piper ]
  workflow_dispatch:

jobs:
  build:
    runs-on: windows-latest

    steps:
      - name: Source checkout
        uses: actions/checkout@v3

      - name: Configure Python
        uses: actions/setup-python@v4
        with:
          # Quoted so the version is always read as a string.
          python-version: "3.10.11"
          architecture: x86

      - name: Install dependencies
        run: |
          pip install --upgrade pip wheel setuptools
          pip install -r requirements.txt
          pip install pyinstaller gdown
          pip install --upgrade pyzmq httpx httpcore future
          git clone https://github.com/mush42/espeak-phonemizer-windows
      - name: Compiling
        run: |
          pyinstaller VeTube.py
          gdown 1ZtF6zus0A7kC9Lwr_kTUbw0MiOoZq29H -O dist/VeTube/bootstrap.exe
          cp -R doc dist/VeTube/
          cp -R locales dist/VeTube/
          cp -R readme dist/VeTube/
          cp -R sounds dist/VeTube/
          cp -R espeak-phonemizer-windows/espeak_phonemizer dist/VeTube/
      - name: Create zip
        run: |
          cd dist
          7z a ../VeTube-x86.zip VeTube/
          cd ..
      - name: Upload zip
        uses: actions/upload-artifact@v3
        with:
          name: VeTube-x86
          # Upload the zip created above (it lives in the workspace root, not
          # in dist/), so the release job receives the file it publishes.
          path: VeTube-x86.zip
          if-no-files-found: error

  vetube_release:
    runs-on: windows-latest
    if: ${{ startsWith(github.ref, 'refs/tags/') }}
    needs: ["build"]
    steps:
      - uses: actions/checkout@v3
      - name: download
        uses: actions/download-artifact@v3
        with:
          # Naming the artifact extracts it into the working directory;
          # without a name every artifact lands in its own sub-folder and
          # the `files` pattern below would not match.
          name: VeTube-x86
      - name: Release
        uses: softprops/action-gh-release@v1
        with:
          files: VeTube-x86.zip
          fail_on_unmatched_files: true
          prerelease: ${{ contains(github.ref, '-') }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Builds the 64-bit Windows distribution of VeTube with PyInstaller and,
# for tag pushes, attaches the resulting zip to a GitHub release.
name: VeTube-x64

on:
  push:
    tags: ["*"]
    branches: [ master , piper ]
  pull_request:
    branches: [ master , piper ]
  workflow_dispatch:

jobs:
  build:
    runs-on: windows-latest

    steps:
      - name: Source checkout
        uses: actions/checkout@v3

      - name: Configure Python
        uses: actions/setup-python@v4
        with:
          # Quoted so the version is always read as a string.
          python-version: "3.10.11"
          architecture: x64

      - name: Install dependencies
        run: |
          pip install --upgrade pip wheel setuptools
          pip install -r requirements.txt
          pip install pyinstaller gdown
          pip install --upgrade pyzmq httpx httpcore future
          git clone https://github.com/mush42/espeak-phonemizer-windows
      - name: Compiling
        run: |
          pyinstaller VeTube.py
          gdown 1ZtF6zus0A7kC9Lwr_kTUbw0MiOoZq29H -O dist/VeTube/bootstrap.exe
          cp -R doc dist/VeTube/
          cp -R locales dist/VeTube/
          cp -R readme dist/VeTube/
          cp -R sounds dist/VeTube/
          cp -R espeak-phonemizer-windows/espeak_phonemizer dist/VeTube/
      - name: Create zip
        run: |
          cd dist
          7z a ../VeTube-x64.zip VeTube/
          cd ..
      - name: Upload zip
        uses: actions/upload-artifact@v3
        with:
          name: VeTube-x64
          # Upload the zip created above (it lives in the workspace root, not
          # in dist/), so the release job receives the file it publishes.
          path: VeTube-x64.zip
          if-no-files-found: error

  vetube_release:
    runs-on: windows-latest
    if: ${{ startsWith(github.ref, 'refs/tags/') }}
    needs: ["build"]
    steps:
      - uses: actions/checkout@v3
      - name: download
        uses: actions/download-artifact@v3
        with:
          # Naming the artifact extracts it into the working directory;
          # without a name every artifact lands in its own sub-folder and
          # the `files` pattern below would not match.
          name: VeTube-x64
      - name: Release
        uses: softprops/action-gh-release@v1
        with:
          files: VeTube-x64.zip
          fail_on_unmatched_files: true
          prerelease: ${{ contains(github.ref, '-') }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,9 @@ | ||
__pycache__/ | ||
*.pyc | ||
*.pyc | ||
*.onnx | ||
*.onnx.json | ||
piper/voices/* | ||
piper/voices/*/*.onnx | ||
piper/voices/*/*.onnx.json | ||
data.json | ||
keys.txt |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
import io | ||
import json | ||
import logging | ||
import wave | ||
from dataclasses import dataclass | ||
from pathlib import Path | ||
from typing import List, Mapping, Optional, Sequence, Union | ||
|
||
import numpy as np | ||
import onnxruntime | ||
from espeak_phonemizer import Phonemizer | ||
|
||
_LOGGER = logging.getLogger(__name__) | ||
|
||
# Marker symbols looked up in the voice's phoneme_id_map during synthesis.
_BOS = "^"  # placed in front of the phoneme sequence
_EOS = "$"  # its ids are appended once, after all phonemes
_PAD = "_"  # its ids are inserted after every recognised phoneme
|
||
|
||
@dataclass
class PiperConfig:
    """Settings for one voice, as built by ``load_config`` from its JSON file."""

    # Taken from the "num_symbols" key; not referenced by the visible code.
    num_symbols: int
    # Taken from "num_speakers"; values above 1 are treated as multi-speaker.
    num_speakers: int
    # Taken from audio.sample_rate; returned together with synthesized audio.
    sample_rate: int
    # Taken from espeak.voice; passed to the Phonemizer constructor.
    espeak_voice: str
    # Defaults used by Piper.synthesize when the caller passes None; the three
    # values are packed into the model's "scales" input.
    length_scale: float
    noise_scale: float
    noise_w: float
    # Maps a phoneme symbol to the id sequence fed to the model.
    phoneme_id_map: Mapping[str, Sequence[int]]
|
||
|
||
class Piper:
    """A voice that turns text into audio samples with an ONNX model.

    The voice configuration is read with ``load_config``, text is converted
    to phonemes with ``espeak_phonemizer.Phonemizer`` and the phoneme ids are
    run through an ``onnxruntime.InferenceSession``.
    """

    def __init__(
        self,
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ):
        """Load the voice configuration, the phonemizer and the ONNX model.

        Args:
            model_path: Path of the ``.onnx`` model file.
            config_path: Path of the JSON configuration. Defaults to
                ``model_path`` with ``.json`` appended.
            use_cuda: Use the CUDA execution provider instead of the CPU one.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        self.config = load_config(config_path)
        self.phonemizer = Phonemizer(self.config.espeak_voice)

        if use_cuda:
            providers = ["CUDAExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]

        self.model = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=onnxruntime.SessionOptions(),
            providers=providers,
        )

    def synthesize(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> "tuple[np.ndarray, int]":
        """Synthesize audio samples from text.

        Args:
            text: Text to speak.
            speaker_id: Speaker to use. When omitted for a model with more
                than one speaker, speaker 0 is used.
            length_scale: Overrides the configured ``length_scale``.
            noise_scale: Overrides the configured ``noise_scale``.
            noise_w: Overrides the configured ``noise_w``.

        Returns:
            A ``(audio, sample_rate)`` pair: the samples produced by
            ``audio_float_to_int16`` and the configured sample rate.

        Raises:
            KeyError: If the phoneme id map has no entry for the padding or
                end-of-sequence symbol.
        """
        # Fall back to the voice's configured values for anything not given.
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        id_map = self.config.phoneme_id_map
        pad_ids = id_map[_PAD]

        # The phonemizer output is walked symbol by symbol; presumably it is
        # a string of phoneme characters.
        phonemes_str = self.phonemizer.phonemize(text)
        phonemes = [_BOS] + list(phonemes_str)
        phoneme_ids: List[int] = []

        for phoneme in phonemes:
            if phoneme in id_map:
                # Every known phoneme is followed by the padding ids.
                phoneme_ids.extend(id_map[phoneme])
                phoneme_ids.extend(pad_ids)
            else:
                _LOGGER.warning("No id for phoneme: %s", phoneme)

        phoneme_ids.extend(id_map[_EOS])

        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        # Only pick the default speaker when the caller did not choose one.
        # (The previous check was inverted and replaced every requested
        # speaker with 0 while leaving an unspecified speaker unset.)
        if (self.config.num_speakers > 1) and (speaker_id is None):
            speaker_id = 0

        inputs = {
            "input": phoneme_ids_array,
            "input_lengths": phoneme_ids_lengths,
            "scales": scales,
        }
        if speaker_id is not None:
            # The speaker input is fed only when there is a speaker to select.
            inputs["sid"] = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        audio = self.model.run(None, inputs)[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())
        return audio, self.config.sample_rate
|
||
|
||
def load_config(config_path: Union[str, Path]) -> PiperConfig:
    """Build a ``PiperConfig`` from a voice's JSON configuration file.

    Args:
        config_path: Location of the UTF-8 encoded JSON file.

    Returns:
        The parsed configuration. Values missing from the optional
        ``inference`` section fall back to 0.667 (noise scale), 1.0 (length
        scale) and 0.8 (noise w).

    Raises:
        KeyError: If a required key is absent from the file.
    """
    with open(config_path, "r", encoding="utf-8") as handle:
        raw = json.load(handle)

    # The "inference" section is optional; every value in it has a default.
    tuning = raw.get("inference", {})

    symbol_count = raw["num_symbols"]
    speaker_count = raw["num_speakers"]
    rate = raw["audio"]["sample_rate"]
    voice_name = raw["espeak"]["voice"]
    noise = tuning.get("noise_scale", 0.667)
    length = tuning.get("length_scale", 1.0)
    noise_width = tuning.get("noise_w", 0.8)
    id_map = raw["phoneme_id_map"]

    return PiperConfig(
        num_symbols=symbol_count,
        num_speakers=speaker_count,
        sample_rate=rate,
        espeak_voice=voice_name,
        noise_scale=noise,
        length_scale=length,
        noise_w=noise_width,
        phoneme_id_map=id_map,
    )
|
||
|
||
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio to its peak and convert it to int16 samples.

    Args:
        audio: Samples to convert. Any array-like is accepted.
        max_wav_value: Magnitude the loudest sample is scaled to.

    Returns:
        An int16 array with the same shape as ``audio``. An empty input
        yields an empty int16 array.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # np.max has no identity for an empty array and would raise.
        return audio.astype("int16")

    # The peak is floored at 0.01 so silent audio is not divided by zero.
    peak = max(0.01, np.max(np.abs(audio)))
    audio_norm = audio * (max_wav_value / peak)
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    return audio_norm.astype("int16")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import logging | ||
from functools import partial | ||
from pathlib import Path | ||
import sounddevice as sd | ||
from . import Piper | ||
|
||
class piperSpeak:
    """Loads a Piper voice on demand and plays synthesized speech."""

    def __init__(self, model_path):
        """Remember the model path and set the default synthesis settings.

        Args:
            model_path: Path handed to ``Piper`` when the voice is loaded.
        """
        self.model_path = model_path
        self.speaker_id = None
        self.length_scale = 1
        self.noise_scale = 0.667
        self.noise_w = 0.8
        self.synthesize = None
        self.voice = None

    def load_model(self):
        """Return the voice, creating it on first use.

        The voice is returned on every call, including the one that loads it.
        """
        if self.voice is None:
            self.voice = Piper(self.model_path)
        return self.voice

    def set_rate(self, new_scale):
        """Set the length scale passed to the voice when speaking."""
        self.length_scale = new_scale

    def set_speaker(self, sid):
        """Set the speaker id passed to the voice when speaking."""
        self.speaker_id = sid

    def is_multispeaker(self):
        """Return True when the voice's configuration has more than one speaker.

        The voice is loaded first if it has not been loaded yet.
        """
        return self.load_model().config.num_speakers > 1

    def list_speakers(self):
        """Return the speakers of a multi-speaker voice.

        Returns:
            The configuration's ``speaker_id_map`` when it has one; otherwise
            a mapping of ``"0"``, ``"1"``, ... to the matching integer ids.

        Raises:
            Exception: If the voice has a single speaker.
        """
        if not self.is_multispeaker():
            raise Exception("This is not a multispeaker model!")

        config = self.voice.config
        speaker_map = getattr(config, "speaker_id_map", None)
        if speaker_map is None:
            # The configuration does not always carry a speaker map; fall
            # back to the numeric ids so callers still get a usable listing.
            speaker_map = {str(index): index for index in range(config.num_speakers)}
        return speaker_map

    def speak(self, text):
        """Synthesize ``text`` with the current settings and start playback.

        A multi-speaker voice with no speaker selected uses speaker 0.
        """
        self.synthesize = self.load_model()
        if self.speaker_id is None and self.is_multispeaker():
            self.set_speaker(0)
        audio_norm, sample_rate = self.voice.synthesize(
            text,
            self.speaker_id,
            self.length_scale,
            self.noise_scale,
            self.noise_w,
        )
        sd.play(audio_norm, sample_rate)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# lector: | ||
from accessible_output2.outputs import auto, sapi5 | ||
from .Piper import Piper, speaker | ||
import glob | ||
""" | ||
Esto es un gestionador de TTS. Permite manejar el uso de diferentes motores de texto a voz como: | ||
1. accessible output2 | ||
2. Piper | ||
""" | ||
def configurar_tts(lector):
    """Return the text-to-speech backend selected by ``lector``.

    Args:
        lector: One of ``"auto"``, ``"sapi5"`` or ``"piper"``.

    Returns:
        A new ``auto.Auto`` or ``sapi5.SAPI5`` object, or the imported
        ``speaker`` object for ``"piper"``.

    Raises:
        Exception: If ``lector`` is none of the supported names.
    """
    if lector == "auto":
        return auto.Auto()
    if lector == "sapi5":
        return sapi5.SAPI5()
    if lector == "piper":
        return speaker
    # Anything else is not a known reader.
    raise Exception("Lector no soportado.")
|
||
def detect_onnx_models(path):
    """Find ``.onnx`` files located one directory level below ``path``.

    Args:
        path: Directory whose immediate sub-directories are searched.

    Returns:
        ``None`` when nothing is found, the single path as a string when
        exactly one model is found, otherwise the list of paths.
    """
    found = glob.glob(path + '/*/*.onnx')
    if not found:
        return None
    # A lone match is returned bare rather than as a one-element list.
    return found[0] if len(found) == 1 else found
Oops, something went wrong.