Skip to content

Commit

Permalink
Merge pull request #40 from jhudsl/39-voice-cloning
Browse files Browse the repository at this point in the history
Voice Cloning
  • Loading branch information
howardbaik authored Feb 15, 2024
2 parents f3ab315 + 8d540da commit dc2c169
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 23 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ inst/doc
.Rhistory
.DS_Store
docs
*.wav
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Imports:
knitr,
magrittr,
conrad,
reticulate,
tidyr,
tuneR,
utils,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export(tts_auth)
export(tts_bind_wav)
export(tts_coqui)
export(tts_coqui_auth)
export(tts_coqui_vc)
export(tts_coqui_voices)
export(tts_default_voice)
export(tts_google)
Expand Down
7 changes: 7 additions & 0 deletions R/aaa_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,10 @@ coqui_path_missing <- paste(
"If you've already downloaded the software, use function",
"'set_coqui_path(path = \"path/to/coqui/tts\")' to point R to your local coqui tts Executable File"
)

# Open a local audio (.wav, .mp3) or video (.mp4) file with the system's
# `open` command.
#
# `path` is tilde-expanded and then shell-quoted before being spliced into the
# command line, so file names containing spaces or shell metacharacters reach
# `open` as one literal argument instead of being split or interpreted by the
# shell. Tilde expansion is done in R because a quoted `~` is no longer
# expanded by the shell.
#
# NOTE(review): `open` is presumably the macOS launcher; other platforms would
# need a different command -- confirm the intended platform support.
#
# Returns the value of the underlying `system()` call.
system_open <- function(path) {
  system(paste0("open ", shQuote(path.expand(path))))
}


123 changes: 104 additions & 19 deletions R/tts.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Text-to-Speech (Speech Synthesis)
#'
#' @description Convert text-to-speech using various engines, including Amazon
#' Polly, Coqui TTS, Google Cloud Text-to-Speech API, and Microsoft Cognitive
#' Polly, Coqui TTS and XTTS (Voice-Cloning), Google Cloud Text-to-Speech API, and Microsoft Cognitive
#' Services Text to Speech REST API.
#'
#' With the exception of Coqui TTS, all these engines are accessible as R
Expand All @@ -11,6 +11,7 @@
#' * [conrad](https://github.com/fhdsl/conrad) is a client to the Microsoft Cognitive Services Text to Speech REST API
#'
#' @param text A character vector of text to be spoken
#' @param speaker_wav (Coqui Voice Cloning only) Speaker voice to clone
#' @param exec_path System path to Coqui TTS executable
#' @param output_format Format of output files: "mp3" or "wav"
#' @param voice Full voice name
Expand Down Expand Up @@ -49,6 +50,10 @@
# Coqui TTS
#' tts("Hello world! This is Coqui TTS", service = "coqui")
#'
#' # Coqui Voice Cloning (XTTS)
#' # TODO: add a voice-cloning example
#'
#'
# Google Cloud Text-to-Speech API
#' tts("Hello world! This is Google Cloud", service = "google")
#'
Expand All @@ -57,21 +62,37 @@
#' }
tts = function(
text,
speaker_wav = "speaker.wav",
output_format = c("mp3", "wav"),
service = c("amazon", "google", "microsoft", "coqui"),
service = c("coqui", "coqui-vc", "amazon", "google", "microsoft"),
bind_audio = TRUE,
...) {

service = match.arg(service)
if (!tts_auth(service = service)) {
warning(paste0("Service ", service, " not authorized/unavailable"))
}

output_format = match.arg(output_format)
if (service == "google") {
res = tts_google(

if (service == "coqui") {
cli::cli_alert_info("This service does not support MP3 format; will produce a WAV audio output.")
use_coqui()
coqui_path <- getOption("path_to_coqui")

res <- tts_coqui(
text = text,
output_format = output_format,
exec_path = coqui_path,
output_format = "wav",
bind_audio = bind_audio,
...)
}
if (service == "coqui-vc") {
cli::cli_alert_info("This service does not support MP3 format; will produce a WAV audio output.")
# TODO: Specify Python version, just as we specify path to coqui above

res <- tts_coqui_vc(
text = text,
speaker_wav = speaker_wav,
bind_audio = bind_audio,
...)
}
Expand All @@ -82,22 +103,17 @@ tts = function(
bind_audio = bind_audio,
...)
}
if (service == "microsoft") {
res = tts_microsoft(
if (service == "google") {
res = tts_google(
text = text,
output_format = output_format,
bind_audio = bind_audio,
...)
}
if (service == "coqui") {
cli::cli_alert_info("Coqui TTS does not support MP3 format; will produce a WAV audio output.")
use_coqui()
coqui_path <- getOption("path_to_coqui")

res <- tts_coqui(
if (service == "microsoft") {
res = tts_microsoft(
text = text,
exec_path = coqui_path,
output_format = "wav",
output_format = output_format,
bind_audio = bind_audio,
...)
}
Expand Down Expand Up @@ -415,9 +431,11 @@ tts_coqui <- function(
}, FUN.VALUE = character(1L), USE.NAMES = FALSE)
out = lapply(res, tts_audio_read,
output_format = audio_type)
df = dplyr::tibble(original_text = string,
text = string_processed,
wav = out, file = normalizePath(res))

# Output
dplyr::tibble(original_text = string,
text = string_processed,
wav = out, file = normalizePath(res))
})
}

Expand All @@ -443,3 +461,70 @@ tts_coqui <- function(
}
res
}

#' @export
#' @rdname tts
tts_coqui_vc <- function(
    text,
    speaker_wav,
    language = "en",
    python_version = "/opt/homebrew/Caskroom/miniforge/base/bin/python",
    gpu = FALSE,
    bind_audio = TRUE,
    save_local = FALSE,
    save_local_dest = NULL,
    ...) {
  # Voice cloning with the Coqui XTTS v2 model, driven through reticulate.
  #
  # Arguments:
  #   text            Character vector; each element is synthesized separately.
  #   speaker_wav     Path(s) to the reference recording(s) of the voice to
  #                   clone. Must exist on disk.
  #   language        Language code forwarded to the model's `tts_to_file()`.
  #   python_version  Path handed to `reticulate::use_python()`.
  #                   NOTE(review): the default is a machine-specific Homebrew
  #                   path; callers on other setups must override it.
  #   gpu             Forwarded to the Python `TTS()` constructor.
  #   bind_audio      If TRUE, the result is passed through `tts_bind_wav()`.
  #   save_local      If TRUE, copy the generated file(s) to `save_local_dest`.
  #   save_local_dest Destination for the copy; required when `save_local` is
  #                   TRUE, otherwise only an alert is shown.
  #   ...             Currently unused.
  #
  # Returns a tibble with one row per synthesized chunk (columns include
  # `index`, `original_text`, `text`, `wav`, `file`, `audio_type`), possibly
  # reshaped by `tts_bind_wav()` when `bind_audio` is TRUE.

  # Fail fast: check the reference recording before the (slow) model load, and
  # resolve it to an absolute path so `~` and relative paths mean the same
  # thing to Python as they do to R.
  if (!is.character(speaker_wav) || length(speaker_wav) == 0L ||
      !all(file.exists(speaker_wav))) {
    stop("'speaker_wav' must be the path to an existing audio file",
         call. = FALSE)
  }
  speaker_wav <- normalizePath(speaker_wav, mustWork = TRUE)

  # Specify version of Python to be used by reticulate
  reticulate::use_python(python_version)
  # Import the Python TTS API and load the XTTS v2 model. The handle is named
  # `tts_model` so it does not shadow the package's own `tts()` function.
  tts_api <- reticulate::import("TTS.api")
  model_name <- "tts_models/multilingual/multi-dataset/xtts_v2"
  tts_model <- tts_api$TTS(model_name, gpu = gpu)

  # Execute model: one tibble per element of `text`
  res <- lapply(text, function(string) {
    string_processed <- tts_split_text(string, limit = 600)

    files <- vapply(string_processed, function(chunk) {
      output_path <- tts_temp_audio("wav")
      tts_model$tts_to_file(
        text = chunk,
        # Integer literal: reticulate converts a bare R `600` (a double) to a
        # Python float, whereas a token count should arrive as a Python int.
        max_new_tokens = 600L,
        file_path = output_path,
        speaker_wav = speaker_wav,
        language = language
      )
      # Output file path
      output_path
    }, FUN.VALUE = character(1L), USE.NAMES = FALSE)
    wavs <- lapply(files, tts_audio_read, output_format = "wav")

    # Output
    dplyr::tibble(
      original_text = string,
      text = string_processed,
      wav = wavs,
      file = normalizePath(files)
    )
  })

  # Post-processing: stack the per-input tibbles and record which input each
  # row came from.
  names(res) <- seq_along(text)
  res <- dplyr::bind_rows(res, .id = "index")
  res$index <- as.numeric(res$index)
  res$audio_type <- "wav"

  if (bind_audio) {
    res <- tts_bind_wav(res)
  }
  if ("wav" %in% colnames(res)) {
    res$duration <- vapply(res$wav, wav_duration, FUN.VALUE = numeric(1))
  }
  # Copy and paste audio file into local folder
  if (save_local) {
    if (!is.null(save_local_dest)) {
      file.copy(normalizePath(res$file), save_local_dest)
    } else {
      cli::cli_alert_danger("Provide local destination where audio file will be saved")
    }
  }
  res
}
3 changes: 2 additions & 1 deletion R/tts_auth.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
tts_auth = function(service = c("amazon",
"google",
"microsoft",
"coqui"),
"coqui",
"coqui-vc"),
key_or_json_file = NULL,
...) {
service = match.arg(service)
Expand Down
24 changes: 22 additions & 2 deletions man/tts.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/tts_auth.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit dc2c169

Please sign in to comment.