Skip to content

Commit

Permalink
BREAKING CHANGE: Allow specifying voice name for Microsoft TTS
Browse files Browse the repository at this point in the history
Reference voices by name.
  • Loading branch information
jishi committed Dec 11, 2016
1 parent 78e2cdb commit a49f35b
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 20 deletions.
37 changes: 26 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -305,9 +305,7 @@ Example:
"voicerss": "Your api key for TTS with voicerss",
"microsoft": {
"key": "Your api for Bing speech API",
"gender": "Female",
"name": "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)",
"language": "en-US"
"name": "ZiraRUS"
},
"port": 5005,
"securePort": 5006,
Expand Down Expand Up @@ -440,25 +438,41 @@ The following configuration is available (the entered values except key are defa
{
"microsoft": {
"key": "Your api for Bing speech API",
"gender": "Female",
"name": "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)",
"language": "en-US"
"name": "ZiraRUS"
}
}
```

If you change language, you need to change the name matching the gender for that language, according to this list: https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/API-Reference-REST/BingVoiceOutput#SupLocales. This one doesn't support providing language directly in the request for this reason.
You change the language by specifying a voice name that corresponds to the desired language.
The name should be specified according to this list: https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/API-Reference-REST/BingVoiceOutput#SupLocales
where the name is the rightmost part of the voice font name (without the optional Apollo suffix). Example:

`Microsoft Server Speech Text to Speech Voice (ar-EG, Hoda)` name should be specified as `Hoda`

`Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)` name should be specified as `Stefan`

`Microsoft Server Speech Text to Speech Voice (en-US, BenjaminRUS)` name should be specified as `BenjaminRUS`

Action is:

/[Room name]/say/[phrase][/[announce volume]]
/sayall/[phrase][/[announce volume]]
/[Room name]/say/[phrase][/[name]][/[announce volume]]
/sayall/[phrase][/[name]][/[announce volume]]

Example:

/Office/say/Hello, dinner is ready
/Office/say/Hello, dinner is ready/BenjaminRUS
/Office/say/Guten morgen/Stefan
/sayall/Hello, dinner is ready
/Office/say/Hello, dinner is ready/90
/Office/say/Guten morgen/Stefan/90

Supported voices are:

Hoda, Hedda, Stefan, Catherine, Linda, Susan, George, Ravi, ZiraRUS, BenjaminRUS, Laura, Pablo, Raul, Caroline, Julie, Paul, Cosimo, Ayumi, Ichiro, Daniel, Irina, Pavel, HuihuiRUS, Yaoyao, Kangkang, Tracy, Danny, Yating, Zhiwei

See https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/API-Reference-REST/BingVoiceOutput#SupLocales to identify
which language and gender each voice name maps to.

#### AWS Polly

Expand Down Expand Up @@ -496,10 +510,11 @@ Action is:
Example:

/Office/say/Hello, dinner is ready
/Office/say/Hej, maten är klar/Joanna
/Office/say/Hello, dinner is ready/Nicole
/Office/say/Hej, maten är klar/Astrid
/sayall/Hello, dinner is ready
/Office/say/Hello, dinner is ready/90
/Office/say/Hej, maten är klar/Russell/90
/Office/say/Hej, maten är klar/Astrid/90

This is the current list of voice names and their corresponding language and accent (as of Dec 2016).
To get a current list of voices, you would need to use the AWS CLI and invoke the describe-voices command.
Expand Down
10 changes: 9 additions & 1 deletion lib/actions/say.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,15 @@ let port;
let system;

function say(player, values) {
const text = decodeURIComponent(values[0]);
let text;
try {
text = decodeURIComponent(values[0]);
} catch (err) {
if (err instanceof URIError) {
err.message = `The encoded phrase ${values[0]} could not be URI decoded. Make sure your url encoded values (%xx) are within valid ranges. xx should be hexadecimal representations`;
}
return Promise.reject(err);
}
let announceVolume;
let language;

Expand Down
10 changes: 9 additions & 1 deletion lib/actions/sayall.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,15 @@ let port;
let system;

function sayAll(player, values) {
const text = decodeURIComponent(values[0]);
let text;
try {
text = decodeURIComponent(values[0]);
} catch (err) {
if (err instanceof URIError) {
err.message = `The encoded phrase ${values[0]} could not be URI decoded. Make sure your url encoded values (%xx) are within valid ranges. xx should be hexadecimal representations`;
}
return Promise.reject(err);
}
let announceVolume;
let language;

Expand Down
53 changes: 46 additions & 7 deletions lib/tts-providers/microsoft.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ const APP_ID = '9aa44d9e6ec14da99231a9166fd50b0f';
const INSTANCE_ID = crypto.randomBytes(16).toString('hex');
const TOKEN_EXPIRATION = 590000; // 9:50 minutes in ms
const DEFAULT_SETTINGS = {
language: 'en-US',
gender: 'Female',
name: 'Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)'
name: 'ZiraRUS'
};

let bearerToken;
Expand All @@ -39,15 +37,19 @@ function format(lang, gender, name, text) {
return `<speak version='1.0' xml:lang='en-us'><voice xml:lang='${lang}' xml:gender='${gender}' name='${name}'>${text}</voice></speak>`;
}

function microsoft(phrase, language) {
function microsoft(phrase, voiceName) {
if (!globalSettings.microsoft || !globalSettings.microsoft.key) {
return Promise.resolve();
}

const settings = Object.assign({}, DEFAULT_SETTINGS, globalSettings.microsoft);

if (voiceName) {
settings.name = voiceName;
}

const phraseHash = crypto.createHash('sha1').update(phrase).digest('hex');
const filename = `microsoft-${phraseHash}-${settings.language}-${settings.gender}.wav`;
const filename = `microsoft-${phraseHash}-${settings.name}.wav`;
const filepath = path.resolve(globalSettings.webroot, 'tts', filename);

const expectedUri = `/tts/${filename}`;
Expand All @@ -65,7 +67,12 @@ function microsoft(phrase, language) {
}

return promise.then(() => {
const ssml = format(settings.language, settings.gender, settings.name, phrase);
const voice = VOICE[settings.name];
if (!voice) {
throw new Error(`Voice name ${settings.name} could not be located in the list of valid voice names`);
}

const ssml = format(voice.language, voice.gender, voice.font, phrase);
return request({
uri: 'https://speech.platform.bing.com/synthesize',
method: 'POST',
Expand Down Expand Up @@ -99,4 +106,36 @@ function microsoft(phrase, language) {
});
}

module.exports = microsoft;
/**
 * Lookup table from short voice name (e.g. `ZiraRUS`) to the attributes
 * passed to `format()` when building the SSML request:
 *   - language: locale tag of the voice
 *   - gender:   'Female' or 'Male'
 *   - font:     full voice font name
 *
 * The table has a null prototype so that a voice name such as `constructor`,
 * `toString` or `__proto__` resolves to `undefined` instead of an inherited
 * Object.prototype member; callers that test `!VOICE[name]` then reject such
 * names as unknown voices. It is frozen because it is shared, read-only data.
 */
const VOICE = Object.freeze(Object.assign(Object.create(null), {
  Hoda: { language: 'ar-EG', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (ar-EG, Hoda)' },
  Hedda: { language: 'de-DE', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (de-DE, Hedda)' },
  Stefan: { language: 'de-DE', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)' },
  Catherine: { language: 'en-AU', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-AU, Catherine)' },
  Linda: { language: 'en-CA', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-CA, Linda)' },
  Susan: { language: 'en-GB', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-GB, Susan, Apollo)' },
  George: { language: 'en-GB', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (en-GB, George, Apollo)' },
  Ravi: { language: 'en-IN', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (en-IN, Ravi, Apollo)' },
  ZiraRUS: { language: 'en-US', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)' },
  BenjaminRUS: { language: 'en-US', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (en-US, BenjaminRUS)' },
  Laura: { language: 'es-ES', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (es-ES, Laura, Apollo)' },
  Pablo: { language: 'es-ES', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (es-ES, Pablo, Apollo)' },
  Raul: { language: 'es-MX', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (es-MX, Raul, Apollo)' },
  Caroline: { language: 'fr-CA', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (fr-CA, Caroline)' },
  Julie: { language: 'fr-FR', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (fr-FR, Julie, Apollo)' },
  Paul: { language: 'fr-FR', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (fr-FR, Paul, Apollo)' },
  Cosimo: { language: 'it-IT', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (it-IT, Cosimo, Apollo)' },
  Ayumi: { language: 'ja-JP', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (ja-JP, Ayumi, Apollo)' },
  Ichiro: { language: 'ja-JP', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (ja-JP, Ichiro, Apollo)' },
  Daniel: { language: 'pt-BR', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (pt-BR, Daniel, Apollo)' },
  Irina: { language: 'ru-RU', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (ru-RU, Irina, Apollo)' },
  Pavel: { language: 'ru-RU', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (ru-RU, Pavel, Apollo)' },
  HuihuiRUS: { language: 'zh-CN', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-CN, HuihuiRUS)' },
  Yaoyao: { language: 'zh-CN', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-CN, Yaoyao, Apollo)' },
  Kangkang: { language: 'zh-CN', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (zh-CN, Kangkang, Apollo)' },
  Tracy: { language: 'zh-HK', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-HK, Tracy, Apollo)' },
  Danny: { language: 'zh-HK', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (zh-HK, Danny, Apollo)' },
  Yating: { language: 'zh-TW', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-TW, Yating, Apollo)' },
  Zhiwei: { language: 'zh-TW', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (zh-TW, Zhiwei, Apollo)' }
}));

module.exports = microsoft;

0 comments on commit a49f35b

Please sign in to comment.