BREAKING CHANGE: Allow specifying voice name for Microsoft TTS

Reference voices by name.
jishi · Dec 11, 2016 · a49f35b · a49f35b
1 parent 78e2cdb
commit a49f35b
Show file tree

Hide file tree

Showing 4 changed files with 90 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -305,9 +305,7 @@ Example:
 	  "voicerss": "Your api key for TTS with voicerss",
 	  "microsoft": {
 	    "key": "Your api for Bing speech API",
-	    "gender": "Female",
-        "name": "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)",
-        "language": "en-US"
+	    "name": "ZiraRUS"
 	  },
 	  "port": 5005,
 	  "securePort": 5006,
@@ -440,25 +438,41 @@ The following configuration is available (the entered values except key are defa
 	{
 	  "microsoft": {
 	    "key": "Your api for Bing speech API",
-	    "gender": "Female",
-        "name": "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)",
-        "language": "en-US"
+	    "name": "ZiraRUS"
 	  }
 	}
 ```
 
-If you change language, you need to change the name matching the gender for that language, according to this list: https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/API-Reference-REST/BingVoiceOutput#SupLocales. This one doesn't support providing language directly in the request for this reason.
+You change the language by specifying a voice name that corresponds to the desired language.
+The name should be specified according to this list: https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/API-Reference-REST/BingVoiceOutput#SupLocales
+where the name is the rightmost part of the voice font name (without the optional Apollo suffix). Examples:
+
+`Microsoft Server Speech Text to Speech Voice (ar-EG, Hoda)` name should be specified as `Hoda`
+
+`Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)` name should be specified as `Stefan`
+
+`Microsoft Server Speech Text to Speech Voice (en-US, BenjaminRUS)` name should be specified as `BenjaminRUS`
 
 Action is:
 
-	/[Room name]/say/[phrase][/[announce volume]]
-	/sayall/[phrase][/[announce volume]]
+	/[Room name]/say/[phrase][/[name]][/[announce volume]]
+	/sayall/[phrase][/[name]][/[announce volume]]
 
 Example:
 
 	/Office/say/Hello, dinner is ready
+	/Office/say/Hello, dinner is ready/BenjaminRUS
+	/Office/say/Guten morgen/Stefan
 	/sayall/Hello, dinner is ready
 	/Office/say/Hello, dinner is ready/90
+	/Office/say/Guten morgen/Stefan/90
+
+Supported voices are:
+
+Hoda, Hedda, Stefan, Catherine, Linda, Susan, George, Ravi, ZiraRUS, BenjaminRUS, Laura, Pablo, Raul, Caroline, Julie, Paul, Cosimo, Ayumi, Ichiro, Daniel, Irina, Pavel, HuihuiRUS, Yaoyao, Kangkang, Tracy, Danny, Yating, Zhiwei
+
+See https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/API-Reference-REST/BingVoiceOutput#SupLocales to identify
+which language and gender each voice maps to.
 
 #### AWS Polly
 
@@ -496,10 +510,11 @@ Action is:
 Example:
 
 	/Office/say/Hello, dinner is ready
-	/Office/say/Hej, maten är klar/Joanna
+	/Office/say/Hello, dinner is ready/Nicole
+	/Office/say/Hej, maten är klar/Astrid
 	/sayall/Hello, dinner is ready
 	/Office/say/Hello, dinner is ready/90
-	/Office/say/Hej, maten är klar/Russell/90
+	/Office/say/Hej, maten är klar/Astrid/90
 
 This is the current list of voice names and their corresponding language and accent (as of Dec 2016).
 To get a current list of voices, you would need to use the AWS CLI and invoke the describe-voices command.

diff --git a/lib/actions/say.js b/lib/actions/say.js
@@ -11,7 +11,15 @@ let port;
 let system;
 
 function say(player, values) {
-  const text = decodeURIComponent(values[0]);
+  let text;
+  try {
+    text = decodeURIComponent(values[0]);
+  } catch (err) {
+    if (err instanceof URIError) {
+      err.message = `The encoded phrase ${values[0]} could not be URI decoded. Make sure your url encoded values (%xx) are within valid ranges. xx should be hexadecimal representations`;
+    }
+    return Promise.reject(err);
+  }
   let announceVolume;
   let language;
 

diff --git a/lib/actions/sayall.js b/lib/actions/sayall.js
@@ -7,7 +7,15 @@ let port;
 let system;
 
 function sayAll(player, values) {
-  const text = decodeURIComponent(values[0]);
+  let text;
+  try {
+    text = decodeURIComponent(values[0]);
+  } catch (err) {
+    if (err instanceof URIError) {
+      err.message = `The encoded phrase ${values[0]} could not be URI decoded. Make sure your url encoded values (%xx) are within valid ranges. xx should be hexadecimal representations`;
+    }
+    return Promise.reject(err);
+  }
   let announceVolume;
   let language;
 

diff --git a/lib/tts-providers/microsoft.js b/lib/tts-providers/microsoft.js
@@ -10,9 +10,7 @@ const APP_ID = '9aa44d9e6ec14da99231a9166fd50b0f';
 const INSTANCE_ID = crypto.randomBytes(16).toString('hex');
 const TOKEN_EXPIRATION = 590000; // 9:50 minutes in ms
 const DEFAULT_SETTINGS = {
-  language: 'en-US',
-  gender: 'Female',
-  name: 'Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)'
+  name: 'ZiraRUS'
 };
 
 let bearerToken;
@@ -39,15 +37,19 @@ function format(lang, gender, name, text) {
   return `<speak version='1.0' xml:lang='en-us'><voice xml:lang='${lang}' xml:gender='${gender}' name='${name}'>${text}</voice></speak>`;
 }
 
-function microsoft(phrase, language) {
+function microsoft(phrase, voiceName) {
   if (!globalSettings.microsoft || !globalSettings.microsoft.key) {
     return Promise.resolve();
   }
 
   const settings = Object.assign({}, DEFAULT_SETTINGS, globalSettings.microsoft);
 
+  if (voiceName) {
+    settings.name = voiceName;
+  }
+
   const phraseHash = crypto.createHash('sha1').update(phrase).digest('hex');
-  const filename =  `microsoft-${phraseHash}-${settings.language}-${settings.gender}.wav`;
+  const filename =  `microsoft-${phraseHash}-${settings.name}.wav`;
   const filepath = path.resolve(globalSettings.webroot, 'tts', filename);
 
   const expectedUri = `/tts/${filename}`;
@@ -65,7 +67,12 @@ function microsoft(phrase, language) {
   }
 
   return promise.then(() => {
-    const ssml = format(settings.language, settings.gender, settings.name, phrase);
+    const voice = VOICE[settings.name];
+    if (!voice) {
+      throw new Error(`Voice name ${settings.name} could not be located in the list of valid voice names`);
+    }
+
+    const ssml = format(voice.language, voice.gender, voice.font, phrase);
     return request({
       uri: 'https://speech.platform.bing.com/synthesize',
       method: 'POST',
@@ -99,4 +106,36 @@ function microsoft(phrase, language) {
   });
 }
 
-module.exports = microsoft;
+const VOICE = {
+  Hoda: { language: 'ar-EG', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (ar-EG, Hoda)' },
+  Hedda: { language: 'de-DE', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (de-DE, Hedda)' },
+  Stefan: { language: 'de-DE', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)' },
+  Catherine: { language: 'en-AU', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-AU, Catherine)' },
+  Linda: { language: 'en-CA', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-CA, Linda)' },
+  Susan: { language: 'en-GB', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-GB, Susan, Apollo)' },
+  George: { language: 'en-GB', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (en-GB, George, Apollo)' },
+  Ravi: { language: 'en-IN', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (en-IN, Ravi, Apollo)' },
+  ZiraRUS: { language: 'en-US', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)' },
+  BenjaminRUS: { language: 'en-US', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (en-US, BenjaminRUS)' },
+  Laura: { language: 'es-ES', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (es-ES, Laura, Apollo)' },
+  Pablo: { language: 'es-ES', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (es-ES, Pablo, Apollo)' },
+  Raul: { language: 'es-MX', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (es-MX, Raul, Apollo)' },
+  Caroline: { language: 'fr-CA', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (fr-CA, Caroline)' },
+  Julie: { language: 'fr-FR', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (fr-FR, Julie, Apollo)' },
+  Paul: { language: 'fr-FR', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (fr-FR, Paul, Apollo)' },
+  Cosimo: { language: 'it-IT', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (it-IT, Cosimo, Apollo)' },
+  Ayumi: { language: 'ja-JP', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (ja-JP, Ayumi, Apollo)' },
+  Ichiro: { language: 'ja-JP', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (ja-JP, Ichiro, Apollo)' },
+  Daniel: { language: 'pt-BR', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (pt-BR, Daniel, Apollo)' },
+  Irina: { language: 'ru-RU', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (ru-RU, Irina, Apollo)' },
+  Pavel: { language: 'ru-RU', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (ru-RU, Pavel, Apollo)' },
+  HuihuiRUS: { language: 'zh-CN', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-CN, HuihuiRUS)' },
+  Yaoyao: { language: 'zh-CN', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-CN, Yaoyao, Apollo)' },
+  Kangkang: { language: 'zh-CN', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (zh-CN, Kangkang, Apollo)' },
+  Tracy: { language: 'zh-HK', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-HK, Tracy, Apollo)' },
+  Danny: { language: 'zh-HK', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (zh-HK, Danny, Apollo)' },
+  Yating: { language: 'zh-TW', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-TW, Yating, Apollo)' },
+  Zhiwei: { language: 'zh-TW', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (zh-TW, Zhiwei, Apollo)' }
+};
+
+module.exports = microsoft;