diff --git a/README.md b/README.md index 1be1d0e..91a6756 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ the JSON Web Token needed to use that proxy (See Appendix B). // Create the talking head avatar const nodeAvatar = document.getElementById('avatar'); head = new TalkingHead( nodeAvatar, { - ttsEndpoint: "./gtts/", + ttsEndpoint: "/gtts/", jwtGet: jwtGet }); ``` @@ -77,6 +77,7 @@ Option | Description `ttsVolume` | Google text-to-speech volume gain (in dB) in the range [-96.0, 16.0]. Default is `0`. `ttsTrimStart` | Trim the viseme sequence start relative to the beginning of the audio (shift in milliseconds). Default is `0`. `ttsTrimEnd`| Trim the viseme sequence end relative to the end of the audio (shift in milliseconds). Default is `300`. +`lipsyncLang`| Lip-sync language. Currently 'en' and 'fi' are supported. Default is `fi`. `pcmSampleRate` | PCM (signed 16bit little endian) sample rate used in `speakAudio` in Hz. Default is `22050`. `modelPixelRatio` | Sets the device's pixel ratio. Default is `1`. `modelFPS` | Frames per second. Default is `30`. @@ -117,9 +118,9 @@ The following table lists some of the key methods. See the source code for the r Method | Description --- | --- -`showAvatar(avatar, [onprogress=null])` | Load and show the specified avatar. The `avatar` object must include the `url` for GLB file. Optional properties are `body` for either male `M` or female `F` body form, `ttsLang`, `ttsVoice`, `ttsRate`, `ttsPitch`, `ttsVolume`, `avatarMood` and `avatarMute`. +`showAvatar(avatar, [onprogress=null])` | Load and show the specified avatar. The `avatar` object must include the `url` for GLB file. Optional properties are `body` for either male `M` or female `F` body form, `lipsyncLang`, `ttsLang`, `ttsVoice`, `ttsRate`, `ttsPitch`, `ttsVolume`, `avatarMood` and `avatarMute`. `setView(view, [opt])` | Set view. Supported views are `"full"`, `"upper"` and `"head"`. 
Options `opt` can be used to set `cameraDistance`, `cameraX`, `cameraY`, `cameraRotateX`, `cameraRotateY`. -`speakText(text, [opt={}], [onsubtitles=null], [excludes=[]])` | Add the `text` string to the speech queue. The text can contain face emojis. Options `opt` can be used to set text-specific `ttsLang`, `ttsVoice`, `ttsRate`, `ttsPitch`, `ttsVolume`, `avatarMood`, `avatarMute`. Optional callback function `onsubtitles` is called whenever a new subtitle is to be written with the parameter of the added string. The optional `excludes` is an array of [start,end] indices to be excluded from audio but to be included in the subtitles. +`speakText(text, [opt={}], [onsubtitles=null], [excludes=[]])` | Add the `text` string to the speech queue. The text can contain face emojis. Options `opt` can be used to set text-specific `lipsyncLang`, `ttsLang`, `ttsVoice`, `ttsRate`, `ttsPitch`, `ttsVolume`, `avatarMood`, `avatarMute`. Optional callback function `onsubtitles` is called whenever a new subtitle is to be written with the parameter of the added string. The optional `excludes` is an array of [start,end] indices to be excluded from audio but to be included in the subtitles. `speakAudio(audio, [onsubtitles=null])` | Add the `audio` object to the speech queue. This method was added to support external TTS services such as ElevenLabs WebSocket API. The audio object contains ArrayBuffer chunks in `audio` array, characters in `chars` array, starting times for each character in milliseconds in `ts` array, and durations for each character in milliseconds in `ds` array. As of now, the only supported format is PCM signed 16bit little endian. `speakMarker(onmarker)` | Add a marker to the speech queue. The callback function `onmarker` is called when the queue processes the event. `lookAt(x,y,t)` | Make the avatar's head turn to look at the screen position (`x`,`y`) for `t` milliseconds. @@ -252,6 +253,15 @@ the Web Speech API events for syncronization, but the results were not good. 
Note that the ElevenLabs WebSocket API returns the word-to-audio alignment information, which is great for this purpose. +**Any future plans for the project?** + +This is a small side-project for me, so I don't have any big plans for it. +That said, there are some companies that are currently developing +text-to-avatar and text-to-animation features. If and when they get released +as APIs, I will probably take a look at them and see if they can be integrated +in some way to the project. + + --- ### See also @@ -301,7 +311,7 @@ RewriteEngine On RewriteMap jwtverify "prg:/etc/httpd/jwtverify" apache:apache ``` -4. Make a forward proxy for each service in which you add the required API key and protect the proxy with the JWT token verifier. Below is an example config for OpenAI API proxy using Apache 2.4 web server. Google TTS proxy would follow the same pattern passing the request to `https://eu-texttospeech.googleapis.com/v1/text:synthesize` (in EU). +4. Make a proxy for each service in which you add the required API key and protect the proxy with the JWT token verifier. Below is an example config for OpenAI API proxy using Apache 2.4 web server. Google TTS proxy would follow the same pattern passing the request to `https://eu-texttospeech.googleapis.com/v1/text:synthesize` (in EU). ```apacheconf # OpenAI API diff --git a/index.html b/index.html index 8f3e31d..86507dd 100755 --- a/index.html +++ b/index.html @@ -441,15 +441,15 @@ } // i18n -// NOTE: Default UI language is Finnish +// Default UI language is English function i18nWord(w,l) { - l = l || cfg('theme-lang') || 'fi'; + l = l || cfg('theme-lang') || 'en'; return (( i18n[l] && i18n[l][w] ) ? 
i18n[l][w] : w); } function i18nTranslate(l) { - l = l || cfg('theme-lang') || 'fi'; + l = l || cfg('theme-lang') || 'en'; // Text d3.selectAll("[data-i18n-text]").nodes().forEach( n => { @@ -572,19 +572,37 @@ // Speak audio if ( (r.isFinal || r.normalizedAlignment) && elevenOutputMsg ) { - head.speakAudio( elevenOutputMsg, node ? addText.bind(null,node) : null ); + head.speakAudio( elevenOutputMsg, { lipsyncLang: cfg('voice-lipsync-lang') }, node ? addText.bind(null,node) : null ); elevenOutputMsg = null; } if ( !r.isFinal ) { // New part if ( r.normalizedAlignment ) { - elevenOutputMsg = { - audio: [], - chars: r.normalizedAlignment.chars, - ts: r.normalizedAlignment.charStartTimesMs, - ds: r.normalizedAlignment.charDurationsMs - }; + elevenOutputMsg = { audio: [], words: [], times: [], durations: [] }; + + // Parse chars to words + let word = ''; + let time = 0; + let duration = 0; + for( let i=0; i -
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- +
+
+
+
+
+
+
+
+
@@ -3360,6 +3376,26 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/modules/lipsync-en.mjs b/modules/lipsync-en.mjs new file mode 100644 index 0000000..f4a546b --- /dev/null +++ b/modules/lipsync-en.mjs @@ -0,0 +1,372 @@ + +/** +* @class English lip-sync processor +* @author Mika Suominen +*/ + +class LipsyncEn { + + /** + * @constructor + */ + constructor() { + + // English words to Oculus visemes, algorithmic rules adapted from: + // NRL Report 7948, "Automatic Translation of English Text to Phonetics by Means of Letter-to-Sound Rules" (1976) + // by HONEY SUE EL.OVITZ, RODNEY W. JOHNSON, ASTRID McHUGH, AND JOHN E. SHORE + // Available at: https://apps.dtic.mil/sti/pdfs/ADA021929.pdf + this.rules = { + 'A': [ + "[A] =aa", " [ARE] =aa RR", " [AR]O=aa RR", "[AR]#=E RR", + " ^[AS]#=E SS", "[A]WA=aa", "[AW]=aa", " :[ANY]=E nn I", + "[A]^+#=E", "#:[ALLY]=aa nn I", " [AL]#=aa nn", "[AGAIN]=aa kk E nn", + "#:[AG]E=I kk", "[A]^+:#=aa", ":[A]^+ =E", "[A]^%=E", + " [ARR]=aa RR", "[ARR]=aa RR", " :[AR] =aa RR", "[AR] =E", + "[AR]=aa RR", "[AIR]=E RR", "[AI]=E", "[AY]=E", "[AU]=aa", + "#:[AL] =aa nn", "#:[ALS] =aa nn SS", "[ALK]=aa kk", "[AL]^=aa nn", + " :[ABLE]=E PP aa nn", "[ABLE]=aa PP aa nn", "[ANG]+=E nn kk", "[A]=aa" + ], + + 'B': [ + " [BE]^#=PP I", "[BEING]=PP I I nn", " [BOTH] =PP O TH", + " [BUS]#=PP I SS", "[BUIL]=PP I nn", "[B]=PP" + ], + + 'C': [ + " [CH]^=kk", "^E[CH]=kk", "[CH]=CH", " S[CI]#=SS aa", + "[CI]A=SS", "[CI]O=SS", "[CI]EN=SS", "[C]+=SS", + "[CK]=kk", "[COM]%=kk aa PP", "[C]=kk" + ], + + 'D': [ + "#:[DED] =DD I DD", ".E[D] =DD", "#^:E[D] =DD", " [DE]^#=DD I", + " [DO] =DD U", " [DOES]=DD aa SS", " [DOING]=DD U I nn", + " [DOW]=DD aa", "[DU]A=kk U", "[D]=DD" + ], + + 'E': [ + "#:[E] =", "' ^:[E] =", " :[E] =I", "#[ED] =DD", "#:[E]D =", + "[EV]ER=E FF", "[E]^%=I", "[ERI]#=I RR I", "[ERI]=E RR I", + "#:[ER]#=E", "[ER]#=E RR", "[ER]=E", " [EVEN]=I FF E nn", + "#:[E]W=", "@[EW]=U", "[EW]=I U", "[E]O=I", "#:&[ES] =I SS", + "#:[E]S =", "#:[ELY] =nn I", "#:[EMENT]=PP E nn DD", "[EFUL]=FF U nn", + "[EE]=I", 
"[EARN]=E nn", " [EAR]^=E", "[EAD]=E DD", "#:[EA] =I aa", + "[EA]SU=E", "[EA]=I", "[EIGH]=E", "[EI]=I", " [EYE]=aa", "[EY]=I", + "[EU]=I U", "[E]=E" + ], + + 'F': [ + "[FUL]=FF U nn", "[F]=FF" + ], + + 'G': [ + "[GIV]=kk I FF", " [G]I^=kk", "[GE]T=kk E", "SU[GGES]=kk kk E SS", + "[GG]=kk", " B#[G]=kk", "[G]+=kk", "[GREAT]=kk RR E DD", + "#[GH]=", "[G]=kk" + ], + + 'H': [ + " [HAV]=I aa FF", " [HERE]=I I RR", " [HOUR]=aa EE", "[HOW]=I aa", + "[H]#=I", "[H]=" + ], + + 'I': [ + " [IN]=I nn", " [I] =aa", "[IN]D=aa nn", "[IER]=I E", + "#:R[IED] =I DD", "[IED] =aa DD", "[IEN]=I E nn", "[IE]T=aa E", + " :[I]%=aa", "[I]%=I", "[IE]=I", "[I]^+:#=I", "[IR]#=aa RR", + "[IZ]%=aa SS", "[IS]%=aa SS", "[I]D%=aa", "+^[I]^+=I", + "[I]T%=aa", "#^:[I]^+=I", "[I]^+=aa", "[IR]=E", "[IGH]=aa", + "[ILD]=aa nn DD", "[IGN] =aa nn", "[IGN]^=aa nn", "[IGN]%=aa nn", + "[IQUE]=I kk", "[I]=I" + ], + + 'J': [ + "[J]=kk" + ], + + 'K': [ + " [K]N=", "[K]=kk" + ], + + 'L': [ + "[LO]C#=nn O", "L[L]=", "#^:[L]%=aa nn", "[LEAD]=nn I DD", "[L]=nn" + ], + + 'M': [ + "[MOV]=PP U FF", "[M]=PP" + ], + + 'N': [ + "E[NG]+=nn kk", "[NG]R=nn kk", "[NG]#=nn kk", "[NGL]%=nn kk aa nn", + "[NG]=nn", "[NK]=nn kk", " [NOW] =nn aa", "[N]=nn" + ], + + 'O': [ + "[OF] =aa FF", "[OROUGH]=E O", "#:[OR] =E", "#:[ORS] =E SS", + "[OR]=aa RR", " [ONE]=FF aa nn", "[OW]=O", " [OVER]=O FF E", + "[OV]=aa FF", "[O]^%=O", "[O]^EN=O", "[O]^I#=O", "[OL]D=O nn", + "[OUGHT]=aa DD", "[OUGH]=aa FF", " [OU]=aa", "H[OU]S#=aa", + "[OUS]=aa SS", "[OUR]=aa RR", "[OULD]=U DD", "^[OU]^L=aa", + "[OUP]=U OO", "[OU]=aa", "[OY]=O", "[OING]=O I nn", "[OI]=O", + "[OOR]=aa RR", "[OOK]=U kk", "[OOD]=U DD", "[OO]=U", "[O]E=O", + "[O] =O", "[OA]=O", " [ONLY]=O nn nn I", " [ONCE]=FF aa nn SS", + "[ON ' T]=O nn DD", "C[O]N=aa", "[O]NG=aa", " ^:[O]N=aa", + "I[ON]=aa nn", "#:[ON] =aa nn", "#^[ON]=aa nn", "[O]ST =O", + "[OF]^=aa FF", "[OTHER]=aa TH E", "[OSS] =aa SS", "#^:[OM]=aa PP", + "[O]=aa" + ], + + 'P': [ + "[PH]=FF", "[PEOP]=PP I PP", "[POW]=PP aa", 
"[PUT] =PP U DD", + "[P]=PP" + ], + + 'Q': [ + "[QUAR]=kk FF aa RR", "[QU]=kk FF", "[Q]=kk" + ], + + 'R': [ + " [RE]^#=RR I", "[R]=RR" + ], + + 'S': [ + "[SH]=SS", "#[SION]=SS aa nn", "[SOME]=SS aa PP", "#[SUR]#=SS E", + "[SUR]#=SS E", "#[SU]#=SS U", "#[SSU]#=SS U", "#[SED] =SS DD", + "#[S]#=SS", "[SAID]=SS E DD", "^[SION]=SS aa nn", "[S]S=", + ".[S] =SS", "#:.E[S] =SS", "#^:##[S] =SS", "#^:#[S] =SS", + "U[S] =SS", " :#[S] =SS", " [SCH]=SS kk", "[S]C+=", + "#[SM]=SS PP", "#[SN] '=SS aa nn", "[S]=SS" + ], + + 'T': [ + " [THE] =TH aa", "[TO] =DD U", "[THAT] =TH aa DD", " [THIS] =TH I SS", + " [THEY]=TH E", " [THERE]=TH E RR", "[THER]=TH E", "[THEIR]=TH E RR", + " [THAN] =TH aa nn", " [THEM] =TH E PP", "[THESE] =TH I SS", + " [THEN]=TH E nn", "[THROUGH]=TH RR U", "[THOSE]=TH O SS", + "[THOUGH] =TH O", " [THUS]=TH aa SS", "[TH]=TH", "#:[TED] =DD I DD", + "S[TI]#N=CH", "[TI]O=SS", "[TI]A=SS", "[TIEN]=SS aa nn", + "[TUR]#=CH E", "[TU]A=CH U", " [TWO]=DD U", "[T]=DD" + ], + + 'U': [ + " [UN]I=I U nn", " [UN]=aa nn", " [UPON]=aa PP aa nn", + "@[UR]#=U RR", "[UR]#=I U RR", "[UR]=E", "[U]^ =aa", + "[U]^^=aa", "[UY]=aa", " G[U]#=", "G[U]%=", "G[U]#=FF", + "#N[U]=I U", "@[U]=I", "[U]=I U" + ], + + 'V': [ + "[VIEW]=FF I U", "[V]=FF" + ], + + 'W': [ + " [WERE]=FF E", "[WA]S=FF aa", "[WA]T=FF aa", "[WHERE]=FF E RR", + "[WHAT]=FF aa DD", "[WHOL]=I O nn", "[WHO]=I U", "[WH]=FF", + "[WAR]=FF aa RR", "[WOR]^=F EF", "[WR]=RR", "[W]=FF" + ], + + 'X': [ + " [X]=SS", "[X]=kk SS" + ], + + 'Y': [ + "[YOUNG]=I aa nn", " [YOU]=I U", " [YES]=I E SS", " [Y]=I", + "#^:[Y] =I", "#^:[Y]I=I", " :[Y] =aa", " :[Y]#=aa", + " :[Y]^+:#=I", " :[Y]^#=I", "[Y]=I" + ], + + 'Z': [ + "[Z]=SS" + ] + }; + + const ops = { + '#': '[AEIOUY]+', // One or more vowels AEIOUY + "'": '[BCDFGHJKLMNPQRSTVWXZ]+', // One or more consonants BCDFGHJKLMNPQRSTVWXZ + '.': '[BDVGJLMNRWZ]+', // One voiced consonant BDVGJLMNRWZ + '$': '[BDVGJLMNRWZ][EI]', // One consonant followed by E or I + '%': '(ER|E|ES|ED|ING|ELY)', // One 
of ER, E, ES, ED, ING, ELY + '&': '([SCGZXJ]|CH|SH)', // One of S, C, G, Z, X, J, CH, SH + '@': '([TSRDLZNJ]|TH|CH|SH)', // One of T, S, R, D, L, Z, N, J, TH, CH, SH + '^': '[BCDFGHJKLMNPQRSTVWXZ]', // One consonant BCDFGHJKLMNPQRSTVWXZ + '+': '[EIY]', // One of E, I, Y + ':': '[BCDFGHJKLMNPQRSTVWXZ]*', // Zero or more consonants BCDFGHJKLMNPQRSTVWXZ + ' ': '\\b' // Start/end of the word + }; + + // Convert rules to regex + Object.keys(this.rules).forEach( key => { + this.rules[key] = this.rules[key].map( rule => { + const posL = rule.indexOf('['); + const posR = rule.indexOf(']'); + const posE = rule.indexOf('='); + const strLeft = rule.substring(0,posL); + const strLetters = rule.substring(posL+1,posR); + const strRight = rule.substring(posR+1,posE); + const strVisemes = rule.substring(posE+1); + + const o = { regex: '', move: 0, visemes: [] }; + + let exp = ''; + exp += [...strLeft].map( x => ops[x] || x ).join(''); + const ctxLetters = [...strLetters]; + ctxLetters[0] = ctxLetters[0].toLowerCase(); + exp += ctxLetters.join(''); + o.move = ctxLetters.length; + exp += [...strRight].map( x => ops[x] || x ).join(''); + o.regex = new RegExp(exp); + + if ( strVisemes.length ) { + strVisemes.split(' ').forEach( viseme => { + o.visemes.push(viseme); + }); + } + + return o; + }); + }); + + // Pauses in relative units to visemes + this.durations = { ' ': 1, ',': 3, '-':0.5 }; + + // English number words + this.digits = ['oh', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']; + this.ones = ['','one','two','three','four','five','six','seven','eight','nine']; + this.tens = ['','','twenty','thirty','forty','fifty','sixty','seventy','eighty','ninety']; + this.teens = ['ten','eleven','twelve','thirteen','fourteen','fifteen','sixteen','seventeen','eighteen','nineteen']; + } + + convert_digit_by_digit(num) { + num = String(num).split(""); + var numWords = ""; + for(let m=0; m=1000000){ + return this.convert_millions(Math.floor(num/1000000))+" million 
"+this.convert_thousands(num%1000000); + } + else { + return this.convert_thousands(num); + } + } + + convert_thousands(num){ + if (num>=1000){ + return this.convert_hundreds(Math.floor(num/1000))+" thousand "+this.convert_hundreds(num%1000); + } + else{ + return this.convert_hundreds(num); + } + } + + convert_hundreds(num){ + if (num>99){ + return this.ones[Math.floor(num/100)]+" hundred "+this.convert_tens(num%100); + } + else{ + return this.convert_tens(num); + } + } + + convert_tens(num){ + if (num<10) return this.ones[num]; + else if (num>=10 && num<20) return this.teens[num-10]; + else{ + return this.tens[Math.floor(num/10)]+" "+this.ones[num%10]; + } + } + + convertNumberToWords(num){ + if (num==0) { + return "zero"; + } else if ((num<1000&&num>99)||(num>10000&&num<1000000)) { //read area and zip codes digit by digit + return this.convert_digit_by_digit(num); + } else if ((num > 1000 && num < 2000)||(num>2009&&num<3000)) { //read years as two sets of two digits + return this.convert_sets_of_two(num); + } else { + return this.convert_millions(num); + } + } + + + /** + * Preprocess text: + * - convert symbols to words + * - convert numbers to words + * - filter out characters that should be left unspoken + * @param {string} s Text + * @return {string} Pre-processed text. + */ + preProcessText(s) { + return s.replace('/[#_*\'\":;]/g','') // FIXME: the pattern is a quoted string, not a regex literal, so nothing is filtered here; switching to a real regex would also strip apostrophes ("don't" -> "dont") + .replaceAll('%',' percentage ') + .replaceAll('€',' euros ') + .replaceAll('&',' and ') + .replaceAll('+',' plus ') + .replace(/(\D)\1\1+/g, "$1$1") // max 2 repeating chars + .replaceAll(' ',' ') // Only one repeating space + .replace(/(\d)\,(\d)/g, '$1 point $2') // Number separator + .replace(/\d+/g, this.convertNumberToWords.bind(this)) // Numbers to words + .trim(); + } + + + /** + * Convert word to Oculus LipSync Visemes and durations + * @param {string} w Text + * @return {Object} Oculus LipSync Visemes and durations. 
+ */ + wordsToVisemes(w) { + let o = { words: w.toUpperCase(), visemes: [], times: [], durations: [], i:0 }; + let t = 0; + + const chars = [...o.words]; + while( o.i < chars.length ) { + const c = chars[o.i]; + const ruleset = this.rules[c]; + if ( ruleset ) { + for(let i=0; i { + if ( o.visemes.length && o.visemes[ o.visemes.length - 1 ] === viseme ) { + o.durations[ o.durations.length - 1 ] += 0.7; + t += 0.7; + } else { + o.visemes.push( viseme ); + o.times.push(t); + o.durations.push(1); + t++; + } + }) + o.i += rule.move; + break; + } + } + } else { + o.i++; + t += this.durations[c] || 0; + } + } + + return o; + } + +} + +export { LipsyncEn }; diff --git a/modules/lipsync-fi.mjs b/modules/lipsync-fi.mjs new file mode 100644 index 0000000..39051d9 --- /dev/null +++ b/modules/lipsync-fi.mjs @@ -0,0 +1,117 @@ + +/** +* @class Finnish lip-sync processor +* @author Mika Suominen +*/ + +class LipsyncFi { + + /** + * @constructor + */ + constructor() { + // Finnish letters to visemes. And yes, it is this simple in Finnish! + this.visemes = { + 'a': 'aa', 'e': 'E', 'i': 'I', 'o': 'O', 'u': 'U', 'y': 'U', 'ä': 'aa', + 'ö': 'O', 'b': 'PP', 'c': 'SS', 'd': 'DD', 'f': 'FF', 'g': 'kk', + 'h': 'O', 'j': 'I', 'k': 'kk', 'l': 'nn', 'm': 'PP', 'n': 'nn', + 'p': 'PP', 'q': 'kk', 'r': 'RR','s': 'SS', 't': 'DD', 'v': 'FF', + 'w': 'FF', 'x': 'SS', 'z': 'SS' + }; + + // Pauses in relative units to visemes + this.durations = { ' ': 1, ',': 3, '-':0.5 }; + } + + /** + * Convert the number string into Finnish words. 
+ * @param {string} x Number string + * @return {string} The number in words in Finnish + */ + numberToFinnishWords(x) { + const w = []; + const dg = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', + 'seitsemän', 'kahdeksan', 'yhdeksän', "kymmenen","yksitoista","kaksitoista", + "kolmetoista","neljätoista","viisitoista","kuusitoista",'seitsemäntoista', + 'kahdeksantoista', 'yhdeksäntoista']; + let n = parseFloat(x); + if ( Number.isNaN(n) ) return x; // parseFloat yields NaN (never undefined) for non-numeric input + let p = (n,z,w0,w1,w2) => { + if ( n < z ) return n; + const d = Math.floor(n/z); + w.push( w0 + ((d === 1) ? w1 : this.numberToFinnishWords(d.toString()) + w2) ); + return n - d * z; + } + if ( n < 0 ) { + w.push('miinus '); + n = Math.abs(n); + } + n = p(n,1000000000,' ','miljardi',' miljardia'); + n = p(n,1000000,' ','miljoona',' miljoonaa'); + n = p(n,1000,'', 'tuhat','tuhatta'); + n = p(n,100,' ','sata','sataa'); + if ( n >= 20 ) n = p(n,10,'','','kymmentä'); // dg only covers 0..19, so 20 itself must go through the tens branch + if ( n >= 1) { + let d = Math.floor(n); + w.push( dg[d] ); + n -= d; + } + if ( n >= 0 && parseFloat(x) < 1) w.push( 'nolla' ); + if ( n > 0 ) { + let d = (n % 1).toFixed(1) * 10; + if ( d > 0 ) w.push( ' pilkku ' + dg[d] ); + } + return w.join('').trim(); + } + + + /** + * Preprocess text: + * - convert symbols to words + * - convert numbers to words + * - filter out characters that should be left unspoken + * @param {string} s Text + * @return {string} Pre-processed text. 
+ */ + preProcessText(s) { + return s.replace('/[#_*\'\":;]/g','') + .replaceAll('%',' prosenttia ') + .replaceAll('€',' euroa ') + .replaceAll('&',' ja ') + .replaceAll('+',' plus ') + .replace(/(\D)\1\1+/g, "$1$1") // max 2 repeating chars + .replaceAll(' ',' ') // Only one repeating space + .replace(/(\d)\,(\d)/g, '$1 pilkku $2') // Number separator + .replace(/\d+/g, this.numberToFinnishWords.bind(this)) // Numbers to words + .trim(); + } + + /** + * Convert words to Oculus LipSync Visemes and durations + * @param {string} w Words + * @return {Object} Oculus LipSync Visemes and durations. + */ + wordsToVisemes(w) { + let o = { words: w, visemes: [], times: [], durations: [] }; + let t = 0; + + const chars = [...w]; + for( let i=0; i { + this.visemeNames.forEach( x => { this.morphs.forEach( y => y.morphTargetInfluences[y.morphTargetDictionary['viseme_'+x]] ); }); } /** - * Convert the number string into Finnish words. - * @param {string} x Number string - * @return {string} The number in words in Finnish - */ - numberToFinnishWords(x) { - const w = []; - const dg = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', - 'seitsemän', 'kahdeksan', 'yhdeksän', "kymmenen","yksitoista","kaksitoista", - "kolmetoista","neljätoista","viisitoista","kuusitoista",'seitsemäntoista', - 'kahdeksantoista', 'yhdeksäntoista']; - let n = parseFloat(x); - if ( n === undefined ) return x; - let p = (n,z,w0,w1,w2) => { - if ( n < z ) return n; - const d = Math.floor(n/z); - w.push( w0 + ((d === 1) ? 
w1 : this.numberToFinnishWords(d.toString()) + w2) ); - return n - d * z; - } - if ( n < 0 ) { - w.push('miinus '); - n = Math.abs(n); - } - n = p(n,1000000000,' ','miljardi',' miljardia'); - n = p(n,1000000,' ','miljoona',' miljoonaa'); - n = p(n,1000,'', 'tuhat','tuhatta'); - n = p(n,100,' ','sata','sataa'); - if ( n > 20 ) n = p(n,10,'','','kymmentä'); - if ( n >= 1) { - let d = Math.floor(n); - w.push( dg[d] ); - n -= d; - } - if ( n >= 0 && parseFloat(x) < 1) w.push( 'nolla' ); - if ( n > 0 ) { - let d = (n % 1).toFixed(1) * 10; - if ( d > 0 ) w.push( ' pilkku ' + dg[d] ); - } - return w.join('').trim(); + * Preprocess text for tts/lipsync, including: + * - convert symbols/numbers to words + * - filter out characters that should be left unspoken + * @param {string} s Text + * @param {string} lang Language + * @return {string} Pre-processsed text. + */ + lipsyncPreProcessText(s,lang) { + const o = this.lipsync[lang] || Object.values(this.lipsync)[0]; + return o.preProcessText(s); } - /** - * Convert symbols to Finnish words. - * @param {string} s String + * Convert words to Oculus LipSync Visemes. + * @param {string} w Word + * @param {string} lang Language + * @return {Lipsync} Lipsync object. */ - symbolsToFinnishWords(s) { - return s.replace('/[#_*\'\":;]/g','') - .replaceAll('%',' prosenttia ') - .replaceAll('€',' euroa ') - .replaceAll('&',' ja ') - .replaceAll('+',' plus ') - .replace(/(\D)\1\1+/g, "$1$1") // max 2 repeating chars - .replaceAll(' ',' ') // Only one repeating space - .replace(/(\d)\,(\d)/g, '$1 pilkku $2') // Number separator - .replace(/\d+/g, this.numberToFinnishWords.bind(this)) // Numbers to words - .trim(); + lipsyncWordsToVisemes(w,lang) { + const o = this.lipsync[lang] || Object.values(this.lipsync)[0]; + return o.wordsToVisemes(w); } + /** * Add text to the speech queue. * @param {string} s Text. 
- * @param {Options} [opt=null] Text-specific options for TTS language, voice, rate and pitch, mood and mute + * @param {Options} [opt=null] Text-specific options for lipsync/TTS language, voice, rate and pitch, mood and mute * @param {subtitlesfn} [onsubtitles=null] Callback when a subtitle is written * @param {number[][]} [excludes=null] Array of [start, end] index arrays to not speak */ @@ -1736,6 +1717,7 @@ class TalkingHead { const dividersWord = /[ !\.\?\n\p{Extended_Pictographic}]/ug; const speakables = /[\p{L}\p{N},]/ug; const emojis = /[\p{Extended_Pictographic}]/ug; + const lipsyncLang = opt.lipsyncLang || this.avatar.lipsyncLang || this.opt.lipsyncLang; let t = 0; // time counter let markdownWord = ''; // markdown word @@ -1763,7 +1745,7 @@ class TalkingHead { // Add to text-to-speech sentence if ( textWord.length ) { - textWord = this.symbolsToFinnishWords(textWord); + textWord = this.lipsyncPreProcessText(textWord, lipsyncLang); textSentence += ' ' + textWord; } @@ -1781,19 +1763,18 @@ class TalkingHead { // Push visemes to animation queue if ( textWord.length ) { - const chars = [...textWord]; - for( let j=0; j 0 ) { + for( let j=0; j