From 1fc2161d648bdb0de89180502609c282c20b3c41 Mon Sep 17 00:00:00 2001 From: Howard Baek <50791792+howardbaek@users.noreply.github.com> Date: Mon, 27 Nov 2023 17:36:58 -0800 Subject: [PATCH 01/10] Add more meat to `tts_coqui_vc()` --- R/tts.R | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/R/tts.R b/R/tts.R index 54e9e0c..6a49e52 100644 --- a/R/tts.R +++ b/R/tts.R @@ -443,3 +443,74 @@ tts_coqui <- function( } res } + + +#' @export +#' @rdname tts +tts_coqui_vc <- function( + text, + speaker_wav, + language = "en", + gpu = FALSE, + bind_audio = TRUE, + save_local = FALSE, + save_local_dest = NULL, + ...) { + # TODO: Read Managing an R Package’s Python Dependencies + # https://rstudio.github.io/reticulate/articles/python_dependencies.html + + # Specify version of Python to be used by reticulate + reticulate::use_python("/opt/homebrew/Caskroom/miniforge/base/bin/python") + # Import TTS + TTS_api <- reticulate::import("TTS.api") + # Model name + model_name <- "tts_models/multilingual/multi-dataset/xtts_v2" + # TTS + tts <- TTS_api$TTS(model_name, gpu = gpu) + + + # Execute model + res = lapply(text, function(string) { + string_processed = tts_split_text(string, limit = 600) + + res = vapply(string_processed, function(tt) { + output_path = tts_temp_audio("wav") + tts$tts_to_file(text = tt, + max_new_tokens = 600, + file_path = output_path, + speaker_wav = speaker_wav, + language = language) + # Output file path + output_path + }, FUN.VALUE = character(1L), USE.NAMES = FALSE) + out = lapply(res, tts_audio_read, + output_format = "wav") + + # Output + dplyr::tibble(original_text = string, + text = string_processed, + wav = out, file = normalizePath(res)) + }) + + # Post-processing + names(res) = seq_along(text) + res = dplyr::bind_rows(res, .id = "index") + res$index = as.numeric(res$index) + res$audio_type = "wav" + + if (bind_audio) { + res = tts_bind_wav(res) + } + if ("wav" %in% colnames(res)) { + res$duration = vapply(res$wav, wav_duration, FUN.VALUE = numeric(1)) + } + # Copy and paste audio file into local folder + if (save_local) { + if (!is.null(save_local_dest)) { + file.copy(normalizePath(res$file), save_local_dest) + } else { + cli::cli_alert_danger("Provide local destination where audio file will be saved") + } + } + res +} From 7bfafeab2fee38175865250782e26dc72ed18a3e Mon Sep 17 00:00:00 2001 From: Howard Baek <50791792+howardbaek@users.noreply.github.com> Date: Mon, 27 Nov 2023 17:52:00 -0800 Subject: [PATCH 02/10] Finish writing `tts_coqui_vc()` --- R/tts.R | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/R/tts.R b/R/tts.R index 6a49e52..8f1de36 100644 --- a/R/tts.R +++ b/R/tts.R @@ -415,9 +415,11 @@ tts_coqui <- function( }, FUN.VALUE = character(1L), USE.NAMES = FALSE) out = lapply(res, tts_audio_read, output_format = audio_type) - df = dplyr::tibble(original_text = string, - text = string_processed, - wav = out, file = normalizePath(res)) + + # Output + dplyr::tibble(original_text = string, + text = string_processed, + wav = out, file = normalizePath(res)) }) } @@ -493,10 +495,10 @@ tts_coqui_vc <- function( }) # Post-processing - names(res) = seq_along(text) - res = dplyr::bind_rows(res, .id = "index") - res$index = as.numeric(res$index) - res$audio_type = "wav" + names(res) <- seq_along(text) + res <- dplyr::bind_rows(res, .id = "index") + res$index <- as.numeric(res$index) + res$audio_type <- "wav" if (bind_audio) { res = tts_bind_wav(res) From ba710b85a236bfcd045f0dea0f17205938315c2c Mon Sep 17 00:00:00 2001 From: Howard Baek <50791792+howardbaek@users.noreply.github.com> Date: Thu, 30 Nov 2023 17:56:22 -0800 Subject: [PATCH 03/10] gitignore wav audio files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 174f712..ad78f46 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ inst/doc .Rhistory .DS_Store docs +*.wav From aaf3f7415eb41936c99abdd19d140058a6d60e68 Mon Sep 17 00:00:00 2001 From: Howard Baek <50791792+howardbaek@users.noreply.github.com> Date: Thu, 30 Nov 2023 18:01:32 -0800 Subject: [PATCH 04/10] Start incorporating `tts_coqui_vc()` into `tts()` --- R/tts.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/tts.R b/R/tts.R index 8f1de36..4ceb6fe 100644 --- a/R/tts.R +++ b/R/tts.R @@ -1,7 +1,7 @@ #' Text-to-Speech (Speech Synthesis) #' #' @description Convert text-to-speech using various engines, including Amazon -#' Polly, Coqui TTS, Google Cloud Text-to-Speech API, and Microsoft Cognitive +#' Polly, Coqui TTS and XTTS (Voice-Cloning), Google Cloud Text-to-Speech API, and Microsoft Cognitive #' Services Text to Speech REST API. #' #' With the exception of Coqui TTS, all these engines are accessible as R @@ -49,6 +49,10 @@ # Coqui TTS #' tts("Hello world! This is Coqui TTS", service = "coqui") #' +#' Coqui Voice Cloning (XTTS) +#' TODO +#' +#' # Google Cloud Text-to-Speech API #' tts("Hello world! This is Google Cloud", service = "google") #' @@ -58,7 +62,7 @@ tts = function( text, output_format = c("mp3", "wav"), - service = c("amazon", "google", "microsoft", "coqui"), + service = c("coqui", "coqui-vc", "amazon", "google", "microsoft"), bind_audio = TRUE, ...) { From 2ea92389a4564b6fbab665071f16ce82b72dc532 Mon Sep 17 00:00:00 2001 From: Howard Baek <50791792+howardbaek@users.noreply.github.com> Date: Wed, 3 Jan 2024 10:58:57 -0800 Subject: [PATCH 05/10] Add `reticulate` to Imports --- DESCRIPTION | 1 + R/tts.R | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a3c3521..8485444 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,6 +39,7 @@ Imports: knitr, magrittr, conrad, + reticulate, tidyr, tuneR, utils, diff --git a/R/tts.R b/R/tts.R index 4ceb6fe..f71fb7d 100644 --- a/R/tts.R +++ b/R/tts.R @@ -462,9 +462,6 @@ tts_coqui_vc <- function( save_local = FALSE, save_local_dest = NULL, ...) { - # TODO: Read Managing an R Package’s Python Dependencies - # https://rstudio.github.io/reticulate/articles/python_dependencies.html - # Specify version of Python to be used by reticulate reticulate::use_python("/opt/homebrew/Caskroom/miniforge/base/bin/python") # Import TTS @@ -474,7 +471,6 @@ tts_coqui_vc <- function( # TTS tts <- TTS_api$TTS(model_name, gpu = gpu) - # Execute model res = lapply(text, function(string) { string_processed = tts_split_text(string, limit = 600) From a6986ed2773d19044f553adca495284c42944231 Mon Sep 17 00:00:00 2001 From: Howard Baek <50791792+howardbaek@users.noreply.github.com> Date: Wed, 3 Jan 2024 11:01:18 -0800 Subject: [PATCH 06/10] Add function to open audio files --- R/aaa_utils.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/R/aaa_utils.R b/R/aaa_utils.R index bf47e64..2c1dfc2 100644 --- a/R/aaa_utils.R +++ b/R/aaa_utils.R @@ -167,3 +167,10 @@ coqui_path_missing <- paste( "If you've already downloaded the software, use function", "'set_coqui_path(path = \"path/to/coqui/tts\")' to point R to your local coqui tts Executable File" ) + +# Open private audio files +system_open <- function(path) { + system(paste0("open ", path)) +} + + From a66844e9897a499e5c68a09b2731699cebab2131 Mon Sep 17 00:00:00 2001 From: Howard Baek <50791792+howardbaek@users.noreply.github.com> Date: Wed, 3 Jan 2024 11:28:45 -0800 Subject: [PATCH 07/10] WIP: `tts()` and `tts_auth()` --- R/tts.R | 45 +++++++++++++++++++++++++++++---------------- R/tts_auth.R | 3 ++- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/R/tts.R b/R/tts.R index f71fb7d..f88deca 100644 --- a/R/tts.R +++ b/R/tts.R @@ -11,6 +11,7 @@ #' * [conrad](https://github.com/fhdsl/conrad) is a client to the Microsoft Cognitive Services Text to Speech REST API #' #' @param text A character vector of text to be spoken +#' @param speaker_wav (Coqui Voice Cloning only) Speaker voice to clone #' @param exec_path System path to Coqui TTS executable #' @param output_format Format of output files: "mp3" or "wav" #' @param voice Full voice name @@ -61,6 +62,7 @@ #' } tts = function( text, + speaker_wav = "speaker.wav", output_format = c("mp3", "wav"), service = c("coqui", "coqui-vc", "amazon", "google", "microsoft"), bind_audio = TRUE, @@ -70,12 +72,27 @@ tts = function( if (!tts_auth(service = service)) { warning(paste0("Service ", service, " not authorized/unavailable")) } - output_format = match.arg(output_format) - if (service == "google") { - res = tts_google( + + if (service == "coqui") { + cli::cli_alert_info("This service does not support MP3 format; will produce a WAV audio output.") + use_coqui() + coqui_path <- getOption("path_to_coqui") + + res <- tts_coqui( text = text, - output_format = output_format, + exec_path = coqui_path, + output_format = "wav", + bind_audio = bind_audio, + ...) + } + if (service == "coqui-vc") { + cli::cli_alert_info("This service does not support MP3 format; will produce a WAV audio output.") + # TODO: Specify Python version, just as we specify path to coqui above + + res <- tts_coqui_vc( + text = text, + speaker_wav = speaker_wav, bind_audio = bind_audio, ...) } @@ -86,22 +103,17 @@ tts = function( bind_audio = bind_audio, ...) } - if (service == "microsoft") { - res = tts_microsoft( + if (service == "google") { + res = tts_google( text = text, output_format = output_format, bind_audio = bind_audio, ...) } - if (service == "coqui") { - cli::cli_alert_info("Coqui TTS does not support MP3 format; will produce a WAV audio output.") - use_coqui() - coqui_path <- getOption("path_to_coqui") - - res <- tts_coqui( + if (service == "microsoft") { + res = tts_microsoft( text = text, - exec_path = coqui_path, - output_format = "wav", + output_format = output_format, bind_audio = bind_audio, ...) } @@ -457,17 +469,18 @@ tts_coqui_vc <- function( text, speaker_wav, language = "en", + python_version = "/opt/homebrew/Caskroom/miniforge/base/bin/python", gpu = FALSE, bind_audio = TRUE, save_local = FALSE, save_local_dest = NULL, ...) { # Specify version of Python to be used by reticulate - reticulate::use_python("/opt/homebrew/Caskroom/miniforge/base/bin/python") + reticulate::use_python(python_version) # Import TTS TTS_api <- reticulate::import("TTS.api") # Model name - model_name <- "tts_models/multilingual/multi-dataset/xtts_v2" + model_name = "tts_models/multilingual/multi-dataset/xtts_v2" # TTS tts <- TTS_api$TTS(model_name, gpu = gpu) diff --git a/R/tts_auth.R b/R/tts_auth.R index 6b2afd5..f8b77df 100644 --- a/R/tts_auth.R +++ b/R/tts_auth.R @@ -31,7 +31,8 @@ tts_auth = function(service = c("amazon", "google", "microsoft", - "coqui"), + "coqui", + "coqui-vc"), key_or_json_file = NULL, ...) { service = match.arg(service) From 619994fbc6e8aefca8533a294792712ee08db068 Mon Sep 17 00:00:00 2001 From: Howard Baek <50791792+howardbaek@users.noreply.github.com> Date: Thu, 4 Jan 2024 15:17:33 -0800 Subject: [PATCH 08/10] Add `tts_coqui_vc()` to NAMESPACE --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 3b2cb2d..0bbf01b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(tts_auth) export(tts_bind_wav) export(tts_coqui) export(tts_coqui_auth) +export(tts_coqui_vc) export(tts_coqui_voices) export(tts_default_voice) export(tts_google) From 56ae97a4dfcb56783227bd64dbf6927f1f7a3ffd Mon Sep 17 00:00:00 2001 From: Howard Baek <50791792+howardbaek@users.noreply.github.com> Date: Thu, 4 Jan 2024 15:17:39 -0800 Subject: [PATCH 09/10] Document --- R/tts.R | 1 - man/tts.Rd | 24 ++++++++++++++++++++++-- man/tts_auth.Rd | 2 +- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/R/tts.R b/R/tts.R index f88deca..956c81b 100644 --- a/R/tts.R +++ b/R/tts.R @@ -462,7 +462,6 @@ tts_coqui <- function( res } - #' @export #' @rdname tts tts_coqui_vc <- function( diff --git a/man/tts.Rd b/man/tts.Rd index 6e95f75..f99dd5b 100644 --- a/man/tts.Rd +++ b/man/tts.Rd @@ -6,12 +6,14 @@ \alias{tts_google} \alias{tts_microsoft} \alias{tts_coqui} +\alias{tts_coqui_vc} \title{Text-to-Speech (Speech Synthesis)} \usage{ tts( text, + speaker_wav = "speaker.wav", output_format = c("mp3", "wav"), - service = c("amazon", "google", "microsoft", "coqui"), + service = c("coqui", "coqui-vc", "amazon", "google", "microsoft"), bind_audio = TRUE, ... ) @@ -57,10 +59,24 @@ tts_coqui( save_local_dest = NULL, ... ) + +tts_coqui_vc( + text, + speaker_wav, + language = "en", + python_version = "/opt/homebrew/Caskroom/miniforge/base/bin/python", + gpu = FALSE, + bind_audio = TRUE, + save_local = FALSE, + save_local_dest = NULL, + ... +) } \arguments{ \item{text}{A character vector of text to be spoken} +\item{speaker_wav}{(Coqui Voice Cloning only) Speaker voice to clone} + \item{output_format}{Format of output files: "mp3" or "wav"} \item{service}{Service to use (Amazon, Google, Microsoft, or Coqui)} @@ -101,7 +117,7 @@ A standardized \code{tibble} featuring the following columns: } \description{ Convert text-to-speech using various engines, including Amazon -Polly, Coqui TTS, Google Cloud Text-to-Speech API, and Microsoft Cognitive +Polly, Coqui TTS and XTTS (Voice-Cloning), Google Cloud Text-to-Speech API, and Microsoft Cognitive Services Text to Speech REST API. With the exception of Coqui TTS, all these engines are accessible as R @@ -119,6 +135,10 @@ tts("Hello world! This is Amazon Polly", service = "amazon") tts("Hello world! This is Coqui TTS", service = "coqui") +Coqui Voice Cloning (XTTS) +TODO + + tts("Hello world! This is Google Cloud", service = "google") tts("Hello world! This is Microsoft", service = "microsoft") diff --git a/man/tts_auth.Rd b/man/tts_auth.Rd index 388f655..bcc1fcb 100644 --- a/man/tts_auth.Rd +++ b/man/tts_auth.Rd @@ -9,7 +9,7 @@ \title{Authentication for Text-to-Speech (Speech Synthesis) Engines} \usage{ tts_auth( - service = c("amazon", "google", "microsoft", "coqui"), + service = c("amazon", "google", "microsoft", "coqui", "coqui-vc"), key_or_json_file = NULL, ... ) From 8d540da95b23a14b60925c6ca4183bb40319d193 Mon Sep 17 00:00:00 2001 From: Howard Baek <50791792+howardbaek@users.noreply.github.com> Date: Wed, 10 Jan 2024 12:18:51 -0800 Subject: [PATCH 10/10] Improve comments on `system_open()` --- R/aaa_utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/aaa_utils.R b/R/aaa_utils.R index 2c1dfc2..83677c0 100644 --- a/R/aaa_utils.R +++ b/R/aaa_utils.R @@ -168,7 +168,7 @@ coqui_path_missing <- paste( "'set_coqui_path(path = \"path/to/coqui/tts\")' to point R to your local coqui tts Executable File" ) -# Open private audio files +# Open private audio (.wav, .mp3) or video (.mp4) files system_open <- function(path) { system(paste0("open ", path)) }