From d6abc085a7d788838103b519ee61310973f2142c Mon Sep 17 00:00:00 2001 From: Quan HL Date: Wed, 16 Oct 2024 12:55:52 +0700 Subject: [PATCH 1/2] support playht3.0 languages --- lib/routes/api/speech-credentials.js | 4 +- lib/utils/speech-data/tts-languages-playht.js | 152 ++++++++++++++++++ lib/utils/speech-utils.js | 72 ++++++--- 3 files changed, 201 insertions(+), 27 deletions(-) create mode 100644 lib/utils/speech-data/tts-languages-playht.js diff --git a/lib/routes/api/speech-credentials.js b/lib/routes/api/speech-credentials.js index 79326a0f..f8ede015 100644 --- a/lib/routes/api/speech-credentials.js +++ b/lib/routes/api/speech-credentials.js @@ -875,7 +875,7 @@ router.get('/:sid/test', async(req, res) => { router.get('/speech/supportedLanguagesAndVoices', async(req, res) => { const {logger, getTtsVoices} = req.app.locals; try { - const {vendor, label} = req.query; + const {vendor, label, create_new} = req.query; if (!vendor) { throw new DbErrorBadRequest('vendor is required'); } @@ -883,7 +883,7 @@ router.get('/speech/supportedLanguagesAndVoices', async(req, res) => { const service_provider_sid = req.user.service_provider_sid || req.body.service_provider_sid || parseServiceProviderSid(req); - const credentials = await SpeechCredential.getSpeechCredentialsByVendorAndLabel( + const credentials = create_new ? null : await SpeechCredential.getSpeechCredentialsByVendorAndLabel( service_provider_sid, account_sid, vendor, label); const tmp = credentials && credentials.length > 0 ? credentials[0] : null; const cred = tmp ? 
JSON.parse(decrypt(tmp.credential)) : null; diff --git a/lib/utils/speech-data/tts-languages-playht.js b/lib/utils/speech-data/tts-languages-playht.js new file mode 100644 index 00000000..c2ec7588 --- /dev/null +++ b/lib/utils/speech-data/tts-languages-playht.js @@ -0,0 +1,152 @@ +// languages.js + +module.exports = [ + { + name: 'English', + value: 'english' + }, + { + name: 'Mandarin', + value: 'mandarin' + }, + { + name: 'Hindi', + value: 'hindi' + }, + { + name: 'Japanese', + value: 'japanese' + }, + { + name: 'Korean', + value: 'korean' + }, + { + name: 'Arabic', + value: 'arabic' + }, + { + name: 'Spanish', + value: 'spanish' + }, + { + name: 'French', + value: 'french' + }, + { + name: 'Italian', + value: 'italian' + }, + { + name: 'Portuguese', + value: 'portuguese' + }, + { + name: 'German', + value: 'german' + }, + { + name: 'Dutch', + value: 'dutch' + }, + { + name: 'Swedish', + value: 'swedish' + }, + { + name: 'Czech', + value: 'czech' + }, + { + name: 'Polish', + value: 'polish' + }, + { + name: 'Russian', + value: 'russian' + }, + { + name: 'Bulgarian', + value: 'bulgarian' + }, + { + name: 'Hebrew', + value: 'hebrew' + }, + { + name: 'Greek', + value: 'greek' + }, + { + name: 'Turkish', + value: 'turkish' + }, + { + name: 'Afrikaans', + value: 'afrikaans' + }, + { + name: 'Xhosa', + value: 'xhosa' + }, + { + name: 'Tagalog', + value: 'tagalog' + }, + { + name: 'Malay', + value: 'malay' + }, + { + name: 'Indonesian', + value: 'indonesian' + }, + { + name: 'Bengali', + value: 'bengali' + }, + { + name: 'Serbian', + value: 'serbian' + }, + { + name: 'Thai', + value: 'thai' + }, + { + name: 'Urdu', + value: 'urdu' + }, + { + name: 'Croatian', + value: 'croatian' + }, + { + name: 'Hungarian', + value: 'hungarian' + }, + { + name: 'Danish', + value: 'danish' + }, + { + name: 'Amharic', + value: 'amharic' + }, + { + name: 'Albanian', + value: 'albanian' + }, + { + name: 'Catalan', + value: 'catalan' + }, + { + name: 'Ukrainian', + value: 'ukrainian' + }, + 
{ + name: 'Galician', + value: 'galician' + } +]; diff --git a/lib/utils/speech-utils.js b/lib/utils/speech-utils.js index 99993cd6..ad72b588 100644 --- a/lib/utils/speech-utils.js +++ b/lib/utils/speech-utils.js @@ -25,6 +25,7 @@ const TtsModelDeepgram = require('./speech-data/tts-model-deepgram'); const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs'); const TtsModelWhisper = require('./speech-data/tts-model-whisper'); const TtsModelPlayHT = require('./speech-data/tts-model-playht'); +const ttsLanguagesPlayHt = require('./speech-data/tts-languages-playht'); const TtsModelRimelabs = require('./speech-data/tts-model-rimelabs'); const SttGoogleLanguagesVoices = require('./speech-data/stt-google'); @@ -40,6 +41,7 @@ const SttSpeechmaticsLanguagesVoices = require('./speech-data/stt-speechmatics') const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai'); const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio'); + const testSonioxStt = async(logger, credentials) => { const api_key = credentials; const soniox = new SpeechClient(api_key); @@ -869,6 +871,7 @@ const fetchLayHTVoices = async(credential) => { async function getLanguagesVoicesForPlayHT(credential) { if (credential) { + const {voice_engine} = credential; const [cloned_voice, voices] = await fetchLayHTVoices(credential); const list_voices = [...cloned_voice, ...voices]; @@ -876,38 +879,57 @@ async function getLanguagesVoicesForPlayHT(credential) { let name = `${d.name} -${concat(d.accent)}${concat(d.age)}${concat(d.gender)}${concat(d.loudness)}` + `${concat(d.style)}${concat(d.tempo)}${concat(d.texture)}` ; name = name.endsWith(',') ? name.trim().slice(0, -1) : name; + name += !d.language_code ? ' - Custom Voice' : ''; + return { value: `${d.id}`, name }; }; - const ttsVoices = list_voices.reduce((acc, voice) => { - // Play3.0 support all voice for PlayHT2.0* - const filteredVoiceEngine = credential.voice_engine === 'Play3.0' ? 
- `${credential.voice_engine}_PlayHT2.0_PlayHT2.0-turbo` : credential.voice_engine; - if (!filteredVoiceEngine.includes(voice.voice_engine)) { + const buildPlay30Payload = () => { + // PlayHT3.0 can play different languages with different voices. + // All voices are added to the English language by default; other languages will get their voices from English. + const ttsVoices = ttsLanguagesPlayHt.map((l) => ({ + ...l, + voices: l.value === 'english' ? list_voices.map((v) => buildVoice(v)) : [] + })); + return tranform(ttsVoices, undefined, TtsModelPlayHT); + }; + + const buildPayload = () => { + const ttsVoices = list_voices.reduce((acc, voice) => { + if (!voice_engine.includes(voice.voice_engine)) { + return acc; + } + const languageCode = voice.language_code; + // custom voice does not have language code + if (!languageCode) { + voice.language_code = 'en'; + voice.language = 'Custom-English'; + } + const existingLanguage = acc.find((lang) => lang.value === languageCode); + if (existingLanguage) { + existingLanguage.voices.push(buildVoice(voice)); + } else { + acc.push({ + value: voice.language_code, + name: voice.language, + voices: [buildVoice(voice)] + }); + } return acc; - } - const languageCode = voice.language_code; - // custom voice does not have language code - if (!languageCode) { - voice.language_code = 'en'; - voice.language = 'Custom-English'; - } - const existingLanguage = acc.find((lang) => lang.value === languageCode); - if (existingLanguage) { - existingLanguage.voices.push(buildVoice(voice)); - } else { - acc.push({ - value: voice.language_code, - name: voice.language, - voices: [buildVoice(voice)] - }); - } - return acc; - }, []); - return tranform(ttsVoices, undefined, TtsModelPlayHT); + }, []); + return tranform(ttsVoices, undefined, TtsModelPlayHT); + }; + + switch (voice_engine) { + case 'Play3.0': + return buildPlay30Payload(); + + default: + return buildPayload(); + } } return tranform(TtsPlayHtLanguagesVoices, undefined, TtsModelPlayHT); } 
From 32d50b5649883b1f6fb51d959c45782fa9b31715 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Wed, 16 Oct 2024 18:26:33 +0700 Subject: [PATCH 2/2] update speech utils version --- package-lock.json | 8 ++++---- package.json | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/package-lock.json b/package-lock.json index 3665b2be..441aacdb 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,7 +19,7 @@ "@jambonz/lamejs": "^1.2.2", "@jambonz/mw-registrar": "^0.2.7", "@jambonz/realtimedb-helpers": "^0.8.10", - "@jambonz/speech-utils": "^0.1.18", + "@jambonz/speech-utils": "^0.1.19", "@jambonz/time-series": "^0.2.8", "@jambonz/verb-specifications": "^0.0.72", "@soniox/soniox-node": "^1.2.2", @@ -2224,9 +2224,9 @@ } }, "node_modules/@jambonz/speech-utils": { - "version": "0.1.18", - "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.1.18.tgz", - "integrity": "sha512-GlcPvUIKcyiiH4cfUPXyYZtP1HIIdFbrqYUmeTmeBaOuZUrJ0xW+TAp/pbysh54vgPnAfcS43Y3ciULx0S3IjQ==", + "version": "0.1.19", + "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.1.19.tgz", + "integrity": "sha512-3R4o1zJYnUIqMsT2GmYAItcAnhEz6NVTNqk5aHSAvBLFSMpqdeEPrTPaKVmQAPS8aIY7crhbJmJx8L3i2xDzDg==", "license": "MIT", "dependencies": { "@aws-sdk/client-polly": "^3.496.0", diff --git a/package.json b/package.json index 39e48afb..87f965d8 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,7 @@ "@jambonz/lamejs": "^1.2.2", "@jambonz/mw-registrar": "^0.2.7", "@jambonz/realtimedb-helpers": "^0.8.10", - "@jambonz/speech-utils": "^0.1.18", + "@jambonz/speech-utils": "^0.1.19", "@jambonz/time-series": "^0.2.8", "@jambonz/verb-specifications": "^0.0.72", "@soniox/soniox-node": "^1.2.2",