forked from ggerganov/whisper.cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
talk, talk-llama : pass text_to_speak as a file (ggerganov#1865)
* talk-llama: pass file instead of arg

it is too hard to quote text in a portable way

* talk-llama: pass heard_ok as a file

* talk-llama: let eleven-labs.py accept options

Options: -v voice, -s savefile, -p (--play)

* talk-llama: check installed commands in "speak"

Pass "-q" to eleven-labs.py to skip checking whether elevenlabs is installed

* talk-llama: pass voice_id again

in order to sync talk with talk-llama

* talk: sync with talk-llama

Passing text_to_speak as a file is safer and more portable
cf. https://stackoverflow.com/a/59036879/45375

* talk and talk-llama: get all installed voices in speak.ps1

* talk and talk-llama: get voices from api

* talk and talk-llama: add more options to eleven-labs.py

and remove DEFAULT_VOICE because it is deprecated (https://www.reddit.com/r/ElevenLabs/comments/1830abt/what_happened_to_bella/)

```
usage: eleven-labs.py [-q] [-l] [-h] [-n NAME | -v NUMBER] [-f KEY=VAL] [-s FILE | -p] [TEXTFILE]

options:
  -q, --quick           skip checking the required library

action:
  TEXTFILE              read the text file (default: stdin)
  -l, --list            show the list of voices and exit
  -h, --help            show this help and exit

voice selection:
  -n NAME, --name NAME  get a voice object by name (default: Arnold)
  -v NUMBER, --voice NUMBER
                        get a voice object by number (see --list)
  -f KEY=VAL, --filter KEY=VAL
                        filter voices by labels (default: "use case=narration")
                        this option can be used multiple times
                        filtering will be disabled if the first -f has no "=" (e.g. -f "any")

output:
  -s FILE, --save FILE  save the TTS to a file (default: audio.mp3)
  -p, --play            play the TTS with ffplay
```

* examples: add speak_with_file() as suggested in the review

* talk and talk-llama: ignore to_speak.txt
- Loading branch information
Showing
13 changed files
with
258 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
audio.mp3 | ||
to_speak.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,80 @@ | ||
import sys | ||
import importlib.util | ||
import argparse | ||
import textwrap | ||
|
||
if importlib.util.find_spec("elevenlabs") is None: | ||
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'") | ||
sys.exit() | ||
parser = argparse.ArgumentParser(add_help=False, | ||
formatter_class=argparse.RawTextHelpFormatter) | ||
parser.add_argument("-q", "--quick", action="store_true", | ||
help="skip checking the required library") | ||
|
||
modes = parser.add_argument_group("action") | ||
modes.add_argument("inputfile", metavar="TEXTFILE", | ||
nargs='?', type=argparse.FileType(), default=sys.stdin, | ||
help="read the text file (default: stdin)") | ||
modes.add_argument("-l", "--list", action="store_true", | ||
help="show the list of voices and exit") | ||
modes.add_argument("-h", "--help", action="help", | ||
help="show this help and exit") | ||
|
||
selopts = parser.add_argument_group("voice selection") | ||
selmodes = selopts.add_mutually_exclusive_group() | ||
selmodes.add_argument("-n", "--name", | ||
default="Arnold", | ||
help="get a voice object by name (default: Arnold)") | ||
selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER", | ||
help="get a voice object by number (see --list)") | ||
selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL", | ||
default=["use case=narration"], | ||
help=textwrap.dedent('''\ | ||
filter voices by labels (default: "use case=narration") | ||
this option can be used multiple times | ||
filtering will be disabled if the first -f has no "=" (e.g. -f "any") | ||
''')) | ||
|
||
outmodes = parser.add_argument_group("output") | ||
outgroup = outmodes.add_mutually_exclusive_group() | ||
outgroup.add_argument("-s", "--save", metavar="FILE", | ||
default="audio.mp3", | ||
help="save the TTS to a file (default: audio.mp3)") | ||
outgroup.add_argument("-p", "--play", action="store_true", | ||
help="play the TTS with ffplay") | ||
|
||
args = parser.parse_args() | ||
|
||
from elevenlabs import generate, play, save | ||
if not args.quick: | ||
import importlib.util | ||
if importlib.util.find_spec("elevenlabs") is None: | ||
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'") | ||
sys.exit() | ||
|
||
# Get a Voice object, by name or UUID | ||
voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh | ||
from elevenlabs import voices, generate, play, save | ||
|
||
if args.filter and "=" in args.filter[0]: | ||
voicelist = voices() | ||
for f in args.filter: | ||
label, value = f.split("=") | ||
voicelist = filter(lambda x: x.labels.get(label) == value, voicelist) | ||
voicelist = list(voicelist) | ||
else: | ||
voicelist = list(voices()) | ||
|
||
if args.list: | ||
for i, v in enumerate(voicelist): | ||
print(str(i) + ": " + v.name + " " + str(v.labels)) | ||
sys.exit() | ||
|
||
if args.voice: | ||
voice = voicelist[args.voice % len(voicelist)] | ||
else: | ||
voice = args.name | ||
# if -n should consult -f, use the following | ||
#voice = next(x for x in voicelist if x.name == args.name) | ||
|
||
# Generate the TTS | ||
audio = generate( | ||
text=str(sys.argv[2:]), | ||
voice=voice | ||
text=str(args.inputfile.read()), | ||
voice=voice | ||
) | ||
|
||
# Save the TTS to a file | ||
save(audio, "audio.mp3") | ||
if args.play: | ||
play(audio) | ||
else: | ||
save(audio, args.save) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,40 @@ | ||
#!/bin/bash | ||
|
||
# Usage: | ||
# speak.sh <voice_id> <text-to-speak> | ||
|
||
# espeak | ||
# Mac OS: brew install espeak | ||
# Linux: apt-get install espeak | ||
# | ||
#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2" | ||
|
||
# piper | ||
# | ||
# https://github.com/rhasspy/piper | ||
# | ||
# Tested with Linux: | ||
# | ||
#echo "$2" | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw - | ||
# speak <voice_id> <textfile> | ||
|
||
function installed() { command -v $1 >/dev/null 2>&1; } | ||
|
||
if installed espeak; then | ||
espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2 | ||
|
||
elif installed piper && installed aplay; then | ||
cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw - | ||
|
||
# for Mac | ||
say "$2" | ||
elif installed say; then | ||
say -f $2 | ||
|
||
# Eleven Labs | ||
# To use it, install the elevenlabs module from pip (pip install elevenlabs) | ||
# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY=' | ||
#Keep the line commented to use the free version whitout api key | ||
# | ||
#export ELEVEN_API_KEY=your_api_key | ||
#wd=$(dirname $0) | ||
#script=$wd/eleven-labs.py | ||
#python3 $script $1 "$2" >/dev/null 2>&1 | ||
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1 | ||
elif installed python3 && \ | ||
python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \ | ||
installed ffplay; then | ||
# It's possible to use the API for free with limited number of characters. | ||
# To increase this limit register to https://beta.elevenlabs.io to get an api key | ||
# and paste it after 'ELEVEN_API_KEY=' | ||
# Keep the line commented to use the free version without api key | ||
#export ELEVEN_API_KEY=your_api_key | ||
wd=$(dirname $0) | ||
script=$wd/eleven-labs.py | ||
python3 $script -q -p -v $1 $2 >/dev/null 2>&1 | ||
|
||
# Uncomment to keep the audio file | ||
#python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1 | ||
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1 | ||
|
||
else | ||
echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),' | ||
echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,' | ||
echo 'or elevenlabs ("pip install elevenlabs") with ffplay.' | ||
echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)' | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
@powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2 | ||
@powershell -ExecutionPolicy Bypass -F examples\talk-llama\speak.ps1 %1 %2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,14 @@ | ||
# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser | ||
param( | ||
# voice options are David or Zira | ||
[Parameter(Mandatory=$true)][string]$voice, | ||
[Parameter(Mandatory=$true)][string]$text | ||
[Parameter(Mandatory=$true)][int]$voicenum, | ||
[Parameter(Mandatory=$true)][string]$textfile | ||
) | ||
|
||
Add-Type -AssemblyName System.Speech; | ||
$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer; | ||
$speak.SelectVoice("Microsoft $voice Desktop"); | ||
$voiceoptions = $speak.GetInstalledVoices("en-US"); | ||
$voice = $voiceoptions[$voicenum % $voiceoptions.count]; | ||
$speak.SelectVoice($voice.VoiceInfo.Name); | ||
$speak.Rate="0"; | ||
$text = Get-Content -Path $textfile; | ||
$speak.Speak($text); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
audio.mp3 | ||
to_speak.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,80 @@ | ||
import sys | ||
import importlib.util | ||
import argparse | ||
import textwrap | ||
|
||
if importlib.util.find_spec("elevenlabs") is None: | ||
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'") | ||
sys.exit() | ||
parser = argparse.ArgumentParser(add_help=False, | ||
formatter_class=argparse.RawTextHelpFormatter) | ||
parser.add_argument("-q", "--quick", action="store_true", | ||
help="skip checking the required library") | ||
|
||
modes = parser.add_argument_group("action") | ||
modes.add_argument("inputfile", metavar="TEXTFILE", | ||
nargs='?', type=argparse.FileType(), default=sys.stdin, | ||
help="read the text file (default: stdin)") | ||
modes.add_argument("-l", "--list", action="store_true", | ||
help="show the list of voices and exit") | ||
modes.add_argument("-h", "--help", action="help", | ||
help="show this help and exit") | ||
|
||
selopts = parser.add_argument_group("voice selection") | ||
selmodes = selopts.add_mutually_exclusive_group() | ||
selmodes.add_argument("-n", "--name", | ||
default="Arnold", | ||
help="get a voice object by name (default: Arnold)") | ||
selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER", | ||
help="get a voice object by number (see --list)") | ||
selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL", | ||
default=["use case=narration"], | ||
help=textwrap.dedent('''\ | ||
filter voices by labels (default: "use case=narration") | ||
this option can be used multiple times | ||
filtering will be disabled if the first -f has no "=" (e.g. -f "any") | ||
''')) | ||
|
||
outmodes = parser.add_argument_group("output") | ||
outgroup = outmodes.add_mutually_exclusive_group() | ||
outgroup.add_argument("-s", "--save", metavar="FILE", | ||
default="audio.mp3", | ||
help="save the TTS to a file (default: audio.mp3)") | ||
outgroup.add_argument("-p", "--play", action="store_true", | ||
help="play the TTS with ffplay") | ||
|
||
args = parser.parse_args() | ||
|
||
from elevenlabs import generate, play, save | ||
if not args.quick: | ||
import importlib.util | ||
if importlib.util.find_spec("elevenlabs") is None: | ||
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'") | ||
sys.exit() | ||
|
||
# Get a Voice object, by name or UUID | ||
voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh | ||
from elevenlabs import voices, generate, play, save | ||
|
||
if args.filter and "=" in args.filter[0]: | ||
voicelist = voices() | ||
for f in args.filter: | ||
label, value = f.split("=") | ||
voicelist = filter(lambda x: x.labels.get(label) == value, voicelist) | ||
voicelist = list(voicelist) | ||
else: | ||
voicelist = list(voices()) | ||
|
||
if args.list: | ||
for i, v in enumerate(voicelist): | ||
print(str(i) + ": " + v.name + " " + str(v.labels)) | ||
sys.exit() | ||
|
||
if args.voice: | ||
voice = voicelist[args.voice % len(voicelist)] | ||
else: | ||
voice = args.name | ||
# if -n should consult -f, use the following | ||
#voice = next(x for x in voicelist if x.name == args.name) | ||
|
||
# Generate the TTS | ||
audio = generate( | ||
text=str(sys.argv[2:]), | ||
voice=voice | ||
text=str(args.inputfile.read()), | ||
voice=voice | ||
) | ||
|
||
# Save the TTS to a file | ||
save(audio, "audio.mp3") | ||
if args.play: | ||
play(audio) | ||
else: | ||
save(audio, args.save) |
Oops, something went wrong.