Skip to content

Commit

Permalink
support verbio speech (#323)
Browse files Browse the repository at this point in the history
* support verbio speech

* wip

* update speech version

* update verb specification
  • Loading branch information
xquanluu authored May 29, 2024
1 parent ffe9cb2 commit d33d0aa
Show file tree
Hide file tree
Showing 8 changed files with 212 additions and 15 deletions.
2 changes: 2 additions & 0 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ const {
getTtsSize,
purgeTtsCache,
getAwsAuthToken,
getVerbioAccessToken,
synthAudio
} = require('@jambonz/speech-utils')({}, logger);
const {
Expand Down Expand Up @@ -99,6 +100,7 @@ app.locals = {
getTtsVoices,
getTtsSize,
getAwsAuthToken,
getVerbioAccessToken,
purgeTtsCache,
synthAudio,
lookupAppBySid,
Expand Down
12 changes: 6 additions & 6 deletions db/jambones.sqs
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@
</location>
<size>
<width>293.00</width>
<height>560.00</height>
<height>540.00</height>
</size>
<zorder>6</zorder>
<SQLField>
Expand Down Expand Up @@ -3111,11 +3111,11 @@
<SourceSidebarWidth><![CDATA[312.000000]]></SourceSidebarWidth>
<SQLEditorFileFormatVersion><![CDATA[4]]></SQLEditorFileFormatVersion>
<uid><![CDATA[58C99A00-06C9-478C-A667-C63842E088F3]]></uid>
<windowHeight><![CDATA[870.000000]]></windowHeight>
<windowLocationX><![CDATA[-1164.000000]]></windowLocationX>
<windowLocationY><![CDATA[1131.000000]]></windowLocationY>
<windowScrollOrigin><![CDATA[{0.5, 0}]]></windowScrollOrigin>
<windowWidth><![CDATA[1512.000000]]></windowWidth>
<windowHeight><![CDATA[1027.000000]]></windowHeight>
<windowLocationX><![CDATA[1728.000000]]></windowLocationX>
<windowLocationY><![CDATA[65.000000]]></windowLocationY>
<windowScrollOrigin><![CDATA[{1, 0}]]></windowScrollOrigin>
<windowWidth><![CDATA[1675.000000]]></windowWidth>
</SQLDocumentInfo>
<AllowsIndexRenamingOnInsert><![CDATA[1]]></AllowsIndexRenamingOnInsert>
<defaultLabelExpanded><![CDATA[1]]></defaultLabelExpanded>
Expand Down
39 changes: 35 additions & 4 deletions lib/routes/api/speech-credentials.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = req
const {decryptCredential, testWhisper, testDeepgramTTS,
getLanguagesAndVoicesForVendor,
testPlayHT,
testRimelabs} = require('../../utils/speech-utils');
testRimelabs,
testVerbioTts,
testVerbioStt} = require('../../utils/speech-utils');
const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
const {
testGoogleTts,
Expand Down Expand Up @@ -116,6 +118,7 @@ const encryptCredential = (obj) => {
role_arn,
region,
client_id,
client_secret,
secret,
nuance_tts_uri,
nuance_stt_uri,
Expand All @@ -140,6 +143,7 @@ const encryptCredential = (obj) => {
model_id,
user_id,
voice_engine,
engine_version,
options
} = obj;

Expand Down Expand Up @@ -255,6 +259,13 @@ const encryptCredential = (obj) => {
const whisperData = JSON.stringify({api_key, model_id});
return encrypt(whisperData);

case 'verbio':
assert(engine_version, 'invalid verbio speech credential: client_id is required');
assert(client_id, 'invalid verbio speech credential: client_id is required');
assert(client_secret, 'invalid verbio speech credential: secret is required');
const verbioData = JSON.stringify({client_id, client_secret, engine_version});
return encrypt(verbioData);

default:
if (vendor.startsWith('custom:')) {
const customData = JSON.stringify({auth_token, custom_stt_url, custom_tts_url});
Expand Down Expand Up @@ -501,7 +512,7 @@ router.put('/:sid', async(req, res) => {
* Test a credential
*/
router.get('/:sid/test', async(req, res) => {
const {logger, synthAudio} = req.app.locals;
const {logger, synthAudio, getVerbioAccessToken} = req.app.locals;
try {
const sid = parseSpeechCredentialSid(req);
const creds = await SpeechCredential.retrieve(sid);
Expand Down Expand Up @@ -672,8 +683,7 @@ router.get('/:sid/test', async(req, res) => {
SpeechCredential.sttTestResult(sid, false);
}
}
}
else if (cred.vendor === 'deepgram') {
} else if (cred.vendor === 'deepgram') {
const {api_key} = credential;
if (cred.use_for_tts) {
try {
Expand Down Expand Up @@ -803,6 +813,27 @@ router.get('/:sid/test', async(req, res) => {
SpeechCredential.ttsTestResult(sid, false);
}
}
} else if (cred.vendor === 'verbio') {
if (cred.use_for_tts) {
try {
await testVerbioTts(logger, synthAudio, credential);
results.tts.status = 'ok';
SpeechCredential.ttsTestResult(sid, true);
} catch (err) {
results.tts = {status: 'fail', reason: err.message};
SpeechCredential.ttsTestResult(sid, false);
}
}
if (cred.use_for_stt) {
try {
await testVerbioStt(logger, getVerbioAccessToken, credential);
results.stt.status = 'ok';
SpeechCredential.sttTestResult(sid, true);
} catch (err) {
results.stt = {status: 'fail', reason: err.message};
SpeechCredential.sttTestResult(sid, false);
}
}
}

res.status(200).json(results);
Expand Down
14 changes: 14 additions & 0 deletions lib/utils/speech-data/stt-verbio.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
module.exports = [
{ name: 'US English', value: 'en-US' },
{ name: 'British English', value: 'en-GB' },
{ name: 'LATAM Spanish', value: 'en-USes-419' },
{ name: 'Spanish', value: 'es' },
{ name: 'Catalan', value: 'ca-ES', version: 'v2' },
{ name: 'Brazilian Portuguese', value: 'pt-BR' },
{ name: 'French', value: 'fr', version: 'v1' },
{ name: 'Canadian French', value: 'fr-CA', version: 'v1' },
{ name: 'German', value: 'de', version: 'v1' },
{ name: 'Italian', value: 'it', version: 'v1' },
{ name: 'Turkish', value: 'tr', version: 'v1' },
{ name: 'Japanese', value: 'ja', version: 'v1' },
];
62 changes: 62 additions & 0 deletions lib/utils/speech-data/tts-verbio.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
module.exports = [
{
value: 'en-US',
name: 'US English',
voices: [
{
value: 'tommy_en_us',
name: 'Tommy-Male',
},
],
},
{
value: 'es-ES',
name: 'Castilian Spanish',
voices: [
{
value: 'david_es_es',
name: 'David-Male',
},
],
},
{
value: 'es-PE',
name: 'Peruvian Spanish',
voices: [
{
value: 'miguel_es_pe',
name: 'Miguel-Male',
},
],
},
{
value: 'es-PE',
name: 'Peruvian Spanish',
voices: [
{
value: 'luz_es_pe',
name: 'Luz-Female',
},
],
},
{
value: 'pt-BR',
name: 'Brazilian Portuguese',
voices: [
{
value: 'bel_pt_br',
name: 'Bel-Female',
},
],
},
{
value: 'ca-ES',
name: 'Catalan',
voices: [
{
value: 'anna_ca',
name: 'Anna-Female',
},
],
},
];
88 changes: 88 additions & 0 deletions lib/utils/speech-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ const TtsNvidiaLanguagesVoices = require('./speech-data/tts-nvidia');
const TtsElevenlabsLanguagesVoices = require('./speech-data/tts-elevenlabs');
const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');

const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs');
Expand All @@ -35,6 +36,7 @@ const SttNvidiaLanguagesVoices = require('./speech-data/stt-nvidia');
const SttCobaltLanguagesVoices = require('./speech-data/stt-cobalt');
const SttSonioxLanguagesVoices = require('./speech-data/stt-soniox');
const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai');
const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');

const testSonioxStt = async(logger, credentials) => {
const api_key = credentials;
Expand Down Expand Up @@ -366,6 +368,43 @@ const testWellSaidStt = async(logger, credentials) => {
return true;
};

/**
 * Check Verbio TTS credentials by synthesizing one short test phrase.
 *
 * @param {object} logger - logger exposing `info(obj, msg)`
 * @param {Function} synthAudio - called as `synthAudio(stats, options)`
 * @param {object} credentials - forwarded unchanged inside the options
 * @returns {Promise<void>} resolves once synthAudio succeeds
 * @throws rethrows whatever synthAudio throws, after logging it at info level
 */
const testVerbioTts = async(logger, synthAudio, credentials) => {
  // First argument expected by synthAudio; presumably a stats client.
  // No-op callbacks are supplied so nothing is recorded for this test call.
  const noopStats = {
    increment: () => {},
    histogram: () => {}
  };
  const request = {
    vendor: 'verbio',
    credentials,
    language: 'en-US',
    // NOTE(review): speech-data/tts-verbio.js spells this voice 'tommy_en_us';
    // confirm which id Verbio actually expects.
    voice: 'tommy_en-us',
    text: 'Hi there and welcome to jambones!'
  };

  try {
    await synthAudio(noopStats, request);
  } catch (err) {
    logger.info({err}, 'synth Verbio returned error');
    throw err;
  }
};
/**
 * Check Verbio STT credentials by fetching an access token and sending the
 * bundled test WAV file to Verbio's REST recognize endpoint.
 *
 * @param {object} logger - logger exposing `debug` and `info`
 * @param {Function} getVerbioAccessToken - async function called with the
 *   credentials; its result must carry an `access_token` property
 * @param {object} credentials - forwarded unchanged to getVerbioAccessToken
 * @returns {Promise<void>} resolves when the recognize request succeeds
 * @throws rethrows any error from the token fetch or the recognize request,
 *   after logging it at info level
 */
const testVerbioStt = async(logger, getVerbioAccessToken, credentials) => {
  try {
    // Fetch the token inside the try so authentication failures are logged
    // the same way as recognition failures.
    const token = await getVerbioAccessToken(credentials);
    const post = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'json', {
      'Authorization': `Bearer ${token.access_token}`,
      'User-Agent': 'jambonz',
      'Content-Type': 'audio/wav'
    });
    // NOTE(review): the request always uses language=en-US and version=V1,
    // regardless of the credential's engine_version; confirm this is intended.
    const json = await post('/api/v1/recognize?language=en-US&version=V1',
      fs.readFileSync(`${__dirname}/../../data/test_audio.wav`));
    logger.debug({json}, 'successfully speech to text from verbio');
  } catch (err) {
    logger.info({err}, 'testVerbioStt returned error');
    throw err;
  }
};

const testAssemblyStt = async(logger, credentials) => {
const {api_key} = credentials;

Expand Down Expand Up @@ -512,6 +551,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.model_id = o.model_id;
} else if ('verbio' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.client_id = o.client_id;
obj.client_secret = isObscureKey ? obscureKey(o.client_secret) : o.client_secret;
obj.engine_version = o.engine_version;
}
}

Expand Down Expand Up @@ -568,6 +612,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
case 'whisper':
return await getLanguagesVoicesForWhisper(credential, getTtsVoices, logger);
case 'verbio':
return await getLanguagesVoicesForVerbio(credential, getTtsVoices, logger);
default:
logger.info(`invalid vendor ${vendor}, return empty result`);
throw new Error(`Invalid vendor ${vendor}`);
Expand Down Expand Up @@ -816,6 +862,23 @@ async function getLanguagesVoicesForWhisper(credential) {
return tranform(TtsWhisperLanguagesVoices, undefined, TtsModelWhisper);
}

/**
 * Build the Verbio language/voice catalogue for a credential.
 *
 * The STT list is the static table, keeping entries that either have no
 * `version` or whose `version` equals `credentials.engine_version`. The TTS
 * list is fetched through `getTtsVoices`; if that (or parsing its result)
 * fails, the failure is logged and the static TTS table is used instead.
 *
 * @param {object} credentials - Verbio credentials; `engine_version` is read
 * @param {Function} getTtsVoices - async function called with `{vendor, credentials}`
 * @param {object} logger - logger exposing `info`
 * @returns {Promise<object>} the result of `tranform(tts, stt, undefined)`
 */
async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
  const isAvailable = ({version}) => !version || credentials.engine_version === version;
  const stt = SttVerbioLanguagesVoices.filter(isAvailable);

  try {
    const fetched = await getTtsVoices({vendor: 'verbio', credentials});
    return tranform(parseVerbioLanguagesVoices(fetched), stt, undefined);
  } catch (err) {
    logger.info({err}, 'there is error while fetching verbio speech voices');
    return tranform(TtsVerbioLanguagesVoices, stt, undefined);
  }
}

function tranform(tts, stt, models) {
return {
...(tts && {tts}),
Expand Down Expand Up @@ -943,6 +1006,29 @@ function parseMicrosoftLanguagesVoices(data) {
}, []);
}

/**
 * Group a flat list of Verbio voices by language.
 *
 * Each input item is read for `language`, `voice_id` and `name`. The output
 * has one entry per distinct `language`, in first-seen order, where both
 * `value` and `name` are the language code and `voices` lists every voice
 * for that language as `{value: voice_id, name}`.
 *
 * @param {Array<object>} data - flat voice list
 * @returns {Array<object>} languages with their grouped voices
 */
function parseVerbioLanguagesVoices(data) {
  const languages = [];

  data.forEach((item) => {
    const code = item.language;
    const voiceEntry = {value: item.voice_id, name: item.name};
    const known = languages.find(({value}) => value === code);

    if (!known) {
      // First voice seen for this language: open a new group.
      languages.push({value: code, name: code, voices: [voiceEntry]});
      return;
    }
    known.voices.push(voiceEntry);
  });

  return languages;
}

module.exports = {
testGoogleTts,
testGoogleStt,
Expand All @@ -966,5 +1052,7 @@ module.exports = {
getSpeechCredential,
decryptCredential,
testWhisper,
testVerbioTts,
testVerbioStt,
getLanguagesAndVoicesForVendor
};
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"@jambonz/realtimedb-helpers": "^0.8.9",
"@jambonz/speech-utils": "^0.1.3",
"@jambonz/time-series": "^0.2.8",
"@jambonz/verb-specifications": "^0.0.69",
"@jambonz/verb-specifications": "^0.0.72",
"@soniox/soniox-node": "^1.2.2",
"argon2": "^0.40.1",
"assemblyai": "^4.3.4",
Expand Down

0 comments on commit d33d0aa

Please sign in to comment.