support verbio speech (#323)

* support verbio speech * wip * update speech version * update verb specification
jambonz · May 29, 2024 · d33d0aa · d33d0aa
1 parent ffe9cb2
commit d33d0aa
Show file tree

Hide file tree

Showing 8 changed files with 212 additions and 15 deletions.
diff --git a/app.js b/app.js
@@ -54,6 +54,7 @@ const {
   getTtsSize,
   purgeTtsCache,
   getAwsAuthToken,
+  getVerbioAccessToken,
   synthAudio
 } = require('@jambonz/speech-utils')({}, logger);
 const {
@@ -99,6 +100,7 @@ app.locals = {
   getTtsVoices,
   getTtsSize,
   getAwsAuthToken,
+  getVerbioAccessToken,
   purgeTtsCache,
   synthAudio,
   lookupAppBySid,

diff --git a/db/jambones.sqs b/db/jambones.sqs
@@ -551,7 +551,7 @@
         </location>
         <size>
             <width>293.00</width>
-            <height>560.00</height>
+            <height>540.00</height>
         </size>
         <zorder>6</zorder>
         <SQLField>
@@ -3111,11 +3111,11 @@
         <SourceSidebarWidth><![CDATA[312.000000]]></SourceSidebarWidth>
         <SQLEditorFileFormatVersion><![CDATA[4]]></SQLEditorFileFormatVersion>
         <uid><![CDATA[58C99A00-06C9-478C-A667-C63842E088F3]]></uid>
-        <windowHeight><![CDATA[870.000000]]></windowHeight>
-        <windowLocationX><![CDATA[-1164.000000]]></windowLocationX>
-        <windowLocationY><![CDATA[1131.000000]]></windowLocationY>
-        <windowScrollOrigin><![CDATA[{0.5, 0}]]></windowScrollOrigin>
-        <windowWidth><![CDATA[1512.000000]]></windowWidth>
+        <windowHeight><![CDATA[1027.000000]]></windowHeight>
+        <windowLocationX><![CDATA[1728.000000]]></windowLocationX>
+        <windowLocationY><![CDATA[65.000000]]></windowLocationY>
+        <windowScrollOrigin><![CDATA[{1, 0}]]></windowScrollOrigin>
+        <windowWidth><![CDATA[1675.000000]]></windowWidth>
     </SQLDocumentInfo>
     <AllowsIndexRenamingOnInsert><![CDATA[1]]></AllowsIndexRenamingOnInsert>
     <defaultLabelExpanded><![CDATA[1]]></defaultLabelExpanded>

diff --git a/lib/routes/api/speech-credentials.js b/lib/routes/api/speech-credentials.js
@@ -8,7 +8,9 @@ const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = req
 const {decryptCredential, testWhisper, testDeepgramTTS,
   getLanguagesAndVoicesForVendor,
   testPlayHT,
-  testRimelabs} = require('../../utils/speech-utils');
+  testRimelabs,
+  testVerbioTts,
+  testVerbioStt} = require('../../utils/speech-utils');
 const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
 const {
   testGoogleTts,
@@ -116,6 +118,7 @@ const encryptCredential = (obj) => {
     role_arn,
     region,
     client_id,
+    client_secret,
     secret,
     nuance_tts_uri,
     nuance_stt_uri,
@@ -140,6 +143,7 @@ const encryptCredential = (obj) => {
     model_id,
     user_id,
     voice_engine,
+    engine_version,
     options
   } = obj;
 
@@ -255,6 +259,13 @@ const encryptCredential = (obj) => {
       const whisperData = JSON.stringify({api_key, model_id});
       return encrypt(whisperData);
 
+    case 'verbio': // engine_version, client_id and client_secret are all required; stored as one encrypted JSON
+      assert(engine_version, 'invalid verbio speech credential: engine_version is required');
+      assert(client_id, 'invalid verbio speech credential: client_id is required');
+      assert(client_secret, 'invalid verbio speech credential: client_secret is required');
+      const verbioData = JSON.stringify({client_id, client_secret, engine_version});
+      return encrypt(verbioData);
+
     default:
       if (vendor.startsWith('custom:')) {
         const customData = JSON.stringify({auth_token, custom_stt_url, custom_tts_url});
@@ -501,7 +512,7 @@ router.put('/:sid', async(req, res) => {
  * Test a credential
  */
 router.get('/:sid/test', async(req, res) => {
-  const {logger, synthAudio} = req.app.locals;
+  const {logger, synthAudio, getVerbioAccessToken} = req.app.locals;
   try {
     const sid = parseSpeechCredentialSid(req);
     const creds = await SpeechCredential.retrieve(sid);
@@ -672,8 +683,7 @@ router.get('/:sid/test', async(req, res) => {
           SpeechCredential.sttTestResult(sid, false);
         }
       }
-    }
-    else if (cred.vendor === 'deepgram') {
+    } else if (cred.vendor === 'deepgram') {
       const {api_key} = credential;
       if (cred.use_for_tts) {
         try {
@@ -803,6 +813,27 @@ router.get('/:sid/test', async(req, res) => {
           SpeechCredential.ttsTestResult(sid, false);
         }
       }
+    } else if (cred.vendor === 'verbio') {
+      if (cred.use_for_tts) {
+        try {
+          await testVerbioTts(logger, synthAudio, credential);
+          results.tts.status = 'ok';
+          SpeechCredential.ttsTestResult(sid, true);
+        } catch (err) {
+          results.tts = {status: 'fail', reason: err.message};
+          SpeechCredential.ttsTestResult(sid, false);
+        }
+      }
+      if (cred.use_for_stt) {
+        try {
+          await testVerbioStt(logger, getVerbioAccessToken, credential);
+          results.stt.status = 'ok';
+          SpeechCredential.sttTestResult(sid, true);
+        } catch (err) {
+          results.stt = {status: 'fail', reason: err.message};
+          SpeechCredential.sttTestResult(sid, false);
+        }
+      }
     }
 
     res.status(200).json(results);

diff --git a/lib/utils/speech-data/stt-verbio.js b/lib/utils/speech-data/stt-verbio.js
@@ -0,0 +1,14 @@
+module.exports = [ // entries carrying `version` are offered only when it equals the credential's engine_version
+  { name: 'US English', value: 'en-US' },
+  { name: 'British English', value: 'en-GB' },
+  { name: 'LATAM Spanish', value: 'es-419' },
+  { name: 'Spanish', value: 'es' },
+  { name: 'Catalan', value: 'ca-ES', version: 'v2' },
+  { name: 'Brazilian Portuguese', value: 'pt-BR' },
+  { name: 'French', value: 'fr', version: 'v1' },
+  { name: 'Canadian French', value: 'fr-CA', version: 'v1' },
+  { name: 'German', value: 'de', version: 'v1' },
+  { name: 'Italian', value: 'it', version: 'v1' },
+  { name: 'Turkish', value: 'tr', version: 'v1' },
+  { name: 'Japanese', value: 'ja', version: 'v1' },
+];
diff --git a/lib/utils/speech-data/tts-verbio.js b/lib/utils/speech-data/tts-verbio.js
@@ -0,0 +1,62 @@
+// Static catalog of Verbio TTS voices, grouped by language.
+// getLanguagesVoicesForVerbio (lib/utils/speech-utils.js) returns this list when the live
+// voice lookup throws, so it follows the same shape parseVerbioLanguagesVoices produces:
+//   {value: <language code>, name: <display name>, voices: [{value: <voice id>, name}]}
+// Keep exactly one entry per language code; add further voices to that entry's `voices`.
+module.exports = [
+  {
+    value: 'en-US',
+    name: 'US English',
+    voices: [
+      {
+        value: 'tommy_en_us',
+        name: 'Tommy-Male',
+      },
+    ],
+  },
+  {
+    value: 'es-ES',
+    name: 'Castilian Spanish',
+    voices: [
+      {
+        value: 'david_es_es',
+        name: 'David-Male',
+      },
+    ],
+  },
+  {
+    // both Peruvian Spanish voices belong to a single language entry
+    value: 'es-PE',
+    name: 'Peruvian Spanish',
+    voices: [
+      {
+        value: 'miguel_es_pe',
+        name: 'Miguel-Male',
+      },
+      {
+        value: 'luz_es_pe',
+        name: 'Luz-Female',
+      },
+    ],
+  },
+  {
+    value: 'pt-BR',
+    name: 'Brazilian Portuguese',
+    voices: [
+      {
+        value: 'bel_pt_br',
+        name: 'Bel-Female',
+      },
+    ],
+  },
+  {
+    value: 'ca-ES',
+    name: 'Catalan',
+    voices: [
+      {
+        value: 'anna_ca',
+        name: 'Anna-Female',
+      },
+    ],
+  },
+];
diff --git a/lib/utils/speech-utils.js b/lib/utils/speech-utils.js
@@ -18,6 +18,7 @@ const TtsNvidiaLanguagesVoices = require('./speech-data/tts-nvidia');
 const TtsElevenlabsLanguagesVoices = require('./speech-data/tts-elevenlabs');
 const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
 const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
+const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');
 
 const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
 const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs');
@@ -35,6 +36,7 @@ const SttNvidiaLanguagesVoices = require('./speech-data/stt-nvidia');
 const SttCobaltLanguagesVoices = require('./speech-data/stt-cobalt');
 const SttSonioxLanguagesVoices = require('./speech-data/stt-soniox');
 const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai');
+const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
 
 const testSonioxStt = async(logger, credentials) => {
   const api_key = credentials;
@@ -366,6 +368,43 @@ const testWellSaidStt = async(logger, credentials) => {
   return true;
 };
 
+// Credential check for Verbio TTS: synthesize a short phrase; logs and rethrows on failure.
+const testVerbioTts = async(logger, synthAudio, credentials) => {
+  const noopStats = {increment: () => {}, histogram: () => {}}; // no-op first argument for synthAudio
+  try {
+    await synthAudio(
+      noopStats,
+      {
+        vendor: 'verbio',
+        credentials,
+        language: 'en-US',
+        // voice id uses underscores, matching the catalog in speech-data/tts-verbio.js
+        voice: 'tommy_en_us',
+        text: 'Hi there and welcome to jambones!'
+      }
+    );
+  } catch (err) {
+    logger.info({err}, 'synth Verbio returned error');
+    throw err;
+  }
+};
+// Credential check for Verbio STT: get an access token, then POST data/test_audio.wav to the recognize endpoint.
+const testVerbioStt = async(logger, getVerbioAccessToken, credentials) => {
+  try { // the token fetch is inside the try so that auth failures are logged too
+    const token = await getVerbioAccessToken(credentials);
+    const post = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'json', {
+      'Authorization': `Bearer ${token.access_token}`,
+      'User-Agent': 'jambonz', 'Content-Type': 'audio/wav'
+    });
+    const json = await post('/api/v1/recognize?language=en-US&version=V1',
+      fs.readFileSync(`${__dirname}/../../data/test_audio.wav`));
+    logger.debug({json}, 'successfully speech to text from verbio');
+  } catch (err) {
+    logger.info({err}, 'testVerbioStt returned error');
+    throw err;
+  }
+};
+
 const testAssemblyStt = async(logger, credentials) => {
   const {api_key} = credentials;
 
@@ -512,6 +551,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
     const o = JSON.parse(decrypt(credential));
     obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
     obj.model_id = o.model_id;
+  } else if ('verbio' === obj.vendor) {
+    const o = JSON.parse(decrypt(credential));
+    obj.client_id = o.client_id;
+    obj.client_secret = isObscureKey ? obscureKey(o.client_secret) : o.client_secret;
+    obj.engine_version = o.engine_version;
   }
 }
 
@@ -568,6 +612,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
       return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
     case 'whisper':
       return await getLanguagesVoicesForWhisper(credential, getTtsVoices, logger);
+    case 'verbio':
+      return await getLanguagesVoicesForVerbio(credential, getTtsVoices, logger);
     default:
       logger.info(`invalid vendor ${vendor}, return empty result`);
       throw new Error(`Invalid vendor ${vendor}`);
@@ -816,6 +862,23 @@ async function getLanguagesVoicesForWhisper(credential) {
   return tranform(TtsWhisperLanguagesVoices, undefined, TtsModelWhisper);
 }
 
+// Build the Verbio language/voice catalog: STT languages come from the static list filtered by
+// engine version; TTS voices are fetched via getTtsVoices, falling back to the static list on error.
+async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
+  // tolerate a missing credentials object instead of throwing a TypeError before the try block
+  const engineVersion = credentials ? credentials.engine_version : undefined;
+  // entries without a version are always offered; versioned ones only for the matching engine
+  const stt = SttVerbioLanguagesVoices.filter((v) => !v.version || v.version === engineVersion);
+  try {
+    const data = await getTtsVoices({vendor: 'verbio', credentials});
+    const voices = parseVerbioLanguagesVoices(data);
+    return tranform(voices, stt, undefined);
+  } catch (err) {
+    logger.info({err}, 'there is error while fetching verbio speech voices');
+    return tranform(TtsVerbioLanguagesVoices, stt, undefined);
+  }
+}
+
 function tranform(tts, stt, models) {
   return {
     ...(tts && {tts}),
@@ -943,6 +1006,29 @@ function parseMicrosoftLanguagesVoices(data) {
   }, []);
 }
 
+/**
+ * Group a flat list of Verbio voice records by language.
+ * Reads `language`, `voice_id` and `name` from every record; the language code is used both
+ * as the entry's value and as its display name. Languages keep first-occurrence order and
+ * voices keep input order within their language.
+ * @param {Array<object>} data - voice records (presumably what getTtsVoices returns; verify)
+ * @returns {Array<object>} entries shaped {value, name, voices: [{value, name}]}
+ */
+function parseVerbioLanguagesVoices(data) {
+  const languages = [];
+  data.forEach(({language, voice_id, name}) => {
+    const entry = {value: voice_id, name};
+    const known = languages.find((lang) => lang.value === language);
+    if (known) {
+      // language already listed: just append this voice
+      known.voices.push(entry);
+      return;
+    }
+    languages.push({value: language, name: language, voices: [entry]});
+  });
+  return languages;
+}
+
 module.exports = {
   testGoogleTts,
   testGoogleStt,
@@ -966,5 +1052,7 @@ module.exports = {
   getSpeechCredential,
   decryptCredential,
   testWhisper,
+  testVerbioTts,
+  testVerbioStt,
   getLanguagesAndVoicesForVendor
 };
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -31,7 +31,7 @@
     "@jambonz/realtimedb-helpers": "^0.8.9",
     "@jambonz/speech-utils": "^0.1.3",
     "@jambonz/time-series": "^0.2.8",
-    "@jambonz/verb-specifications": "^0.0.69",
+    "@jambonz/verb-specifications": "^0.0.72",
     "@soniox/soniox-node": "^1.2.2",
     "argon2": "^0.40.1",
     "assemblyai": "^4.3.4",