Commit 3dfcf6b

✨ feat: Add openai tts
1 parent a115c3b commit 3dfcf6b

30 files changed: +301 −36 lines changed

.eslintignore

Lines changed: 0 additions & 1 deletion
@@ -24,7 +24,6 @@ __test__
 # production
 dist
 es
-lib
 logs

 # misc

.prettierignore

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ __snapshots__
 # production
 dist
 es
-lib
 logs

 # umi

api/index.ts

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 import qs from 'query-string';

 import cors from '../lib/cors';
+import { fetchMicrosoftSpeech } from '../lib/fetchMicrosoftSpeech';
 import { SsmlOptions } from '../lib/genSSML';
-import { postMicrosoftSpeech } from '../lib/postMicrosoftSpeech';

 export const config = {
   runtime: 'edge',
@@ -11,7 +11,7 @@ export const config = {
 export default async (req: Request) => {
   const { text, ...options }: SsmlOptions & { text: string } = qs.parseUrl(req.url).query as any;

-  const res = await fetch(...postMicrosoftSpeech(text, options));
+  const res = await fetchMicrosoftSpeech(text, options);

   const newResponse = new Response(res.body, res);


lib/postMicrosoftSpeech.ts renamed to lib/fetchMicrosoftSpeech.ts

Lines changed: 10 additions & 12 deletions
@@ -5,7 +5,7 @@ import { type SsmlOptions, genSSML } from './genSSML';
 const API =
   'https://southeastasia.api.speech.microsoft.com/accfreetrial/texttospeech/acc/v3.0-beta1/vcg/speak';

-export const postMicrosoftSpeech = (text: string, options: SsmlOptions): [any, any] => {
+export const fetchMicrosoftSpeech = async (text: string, options: SsmlOptions) => {
   const data = JSON.stringify({
     offsetInPlainText: 0,
     properties: {
@@ -15,7 +15,7 @@ export const postMicrosoftSpeech = (text: string, options: SsmlOptions): [any, any] => {
     ttsAudioFormat: 'audio-24khz-160kbitrate-mono-mp3',
   });

-  const DEFAULT_HEADERS = {
+  const DEFAULT_HEADERS = new Headers({
     'accept': '*/*',
     'accept-language': 'zh-CN,zh;q=0.9',
     'authority': 'southeastasia.api.speech.microsoft.com',
@@ -30,15 +30,13 @@ export const postMicrosoftSpeech = (text: string, options: SsmlOptions): [any, any] => {
     'sec-fetch-site': 'same-site',
     'user-agent':
       'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
-  };
+  });

-  return [
-    API,
-    {
-      body: data,
-      headers: DEFAULT_HEADERS,
-      method: 'POST',
-      responseType: 'arraybuffer',
-    },
-  ];
+  return await fetch(API, {
+    body: data,
+    headers: DEFAULT_HEADERS,
+    method: 'POST',
+    // @ts-ignore
+    responseType: 'arraybuffer',
+  });
 };
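
The rename here reflects a behavioral change: the old `postMicrosoftSpeech` returned a `[url, init]` tuple that callers spread into `fetch(...)`, while the new `fetchMicrosoftSpeech` performs the request itself and returns a standard `Response`, which is what lets `api/index.ts` above pipe `res.body` straight into a new `Response`. A minimal consumption sketch, not part of the commit (function name and error handling are illustrative):

```ts
// Illustrative sketch: consuming the refactored helper outside the edge route.
// Assumes a Fetch-capable runtime; `text` and `options` are placeholders.
import { fetchMicrosoftSpeech } from '../lib/fetchMicrosoftSpeech';
import type { SsmlOptions } from '../lib/genSSML';

const synthesize = async (text: string, options: SsmlOptions): Promise<ArrayBuffer> => {
  const res = await fetchMicrosoftSpeech(text, options);
  if (!res.ok) throw new Error(`Microsoft Speech request failed: ${res.status}`);
  // The payload is audio-24khz-160kbitrate-mono-mp3, per the request body above.
  return res.arrayBuffer();
};
```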

package.json

Lines changed: 2 additions & 1 deletion
@@ -27,7 +27,7 @@
     "docs:dev": "dumi dev",
     "doctor": "father doctor",
     "postinstall": "npm run setup",
-    "lint": "eslint \"{src,tests}/**/*.{js,jsx,ts,tsx}\" --fix",
+    "lint": "eslint \"{src,api,lib}/**/*.{js,jsx,ts,tsx}\" --fix",
     "lint:md": "remark . --quiet --frail --output",
     "prepare": "husky install",
     "prepublishOnly": "npm run build",
@@ -65,6 +65,7 @@
     "query-string": "^8",
     "ssml-document": "^1",
     "swr": "^2",
+    "url-join": "^5.0.0",
     "uuid": "^9"
   },
   "devDependencies": {

src/const/api.ts

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+import urlJoin from 'url-join';
+
+export const MICROSOFT_SPEECH_PROXY_URL = process.env.MICROSOFT_SPEECH_PROXY_URL || '';
+export const AZURE_SPEECH_KEY = process.env.AZURE_SPEECH_KEY || '';
+export const AZURE_SPEECH_REGION = process.env.AZURE_SPEECH_REGION || '';
+export const OPENAI_API_KEY = process.env.OPENAI_API_KEY || '';
+export const OPENAI_PROXY_URL = process.env.OPENAI_PROXY_URL || 'https://api.openai.com/v1';
+export const OPENAI_TTS_URL = (api?: string) => urlJoin(api || OPENAI_PROXY_URL, 'audio/speech');
+export const OPENAI_STT_URL = (api: string) =>
+  urlJoin(api || OPENAI_PROXY_URL, 'audio/transcriptions');
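
These constants centralize the environment-based configuration consumed by the services below, and `url-join` (added to `package.json` above) handles slash differences when a custom proxy is supplied. One small asymmetry: `OPENAI_STT_URL` declares `api` as required even though it still falls back to `OPENAI_PROXY_URL`, while `OPENAI_TTS_URL` marks it optional. A quick sketch of the composition (the proxy URL is a made-up example):

```ts
// Illustrative only: how the URL helpers compose endpoints with url-join.
import urlJoin from 'url-join';

const OPENAI_PROXY_URL = process.env.OPENAI_PROXY_URL || 'https://api.openai.com/v1';
const OPENAI_TTS_URL = (api?: string) => urlJoin(api || OPENAI_PROXY_URL, 'audio/speech');

OPENAI_TTS_URL();                               // 'https://api.openai.com/v1/audio/speech'
OPENAI_TTS_URL('https://my-proxy.example/v1/'); // 'https://my-proxy.example/v1/audio/speech' (trailing slash handled)
```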

src/data/openaiVoiceList.json

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+["alloy", "echo", "fable", "onyx", "nova", "shimmer"]

src/index.ts

Lines changed: 7 additions & 0 deletions
@@ -1,11 +1,18 @@
+export { fetchAzureSpeech } from './services/fetchAzureSpeech';
+export { fetchEdgeSpeech } from './services/fetchEdgeSpeech';
+export { fetchMicrosoftSpeech } from './services/fetchMicrosoftSpeech';
+export { fetchOpenaiSTT } from './services/fetchOpenaiSTT';
+export { fetchOpenaiTTS } from './services/fetchOpenaiTTS';
 export { useAzureSpeech } from './useAzureSpeech';
 export { useEdgeSpeech } from './useEdgeSpeech';
 export { useMicrosoftSpeech } from './useMicrosoftSpeech';
+export { useOpenaiTTS } from './useOpenaiTTS';
 export { usePersistedSpeechRecognition } from './useSpeechRecognition/usePersistedSpeechRecognition';
 export { useSpeechRecognition } from './useSpeechRecognition/useSpeechRecognition';
 export { useSpeechSynthes } from './useSpeechSynthes';
 export {
   getAzureVoiceList,
   getEdgeVoiceList,
+  getOpenaiVoiceList,
   getSpeechSynthesVoiceList,
 } from './utils/getVoiceList';
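
With these exports the low-level `fetch*` services become part of the public surface alongside the hooks. A hypothetical consumer-side import (the package name is an assumption, and `getOpenaiVoiceList` is presumably backed by the `openaiVoiceList.json` data above):

```ts
// Hypothetical usage; the '@lobehub/tts' package name is assumed, not confirmed by this diff.
import { fetchOpenaiTTS, getOpenaiVoiceList, useOpenaiTTS } from '@lobehub/tts';
```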

src/services/postAzureSpeech.ts renamed to src/services/fetchAzureSpeech.ts

Lines changed: 5 additions & 3 deletions
@@ -8,6 +8,8 @@ import {
   SpeechSynthesizer,
 } from 'microsoft-cognitiveservices-speech-sdk';

+import { AZURE_SPEECH_KEY, AZURE_SPEECH_REGION } from '@/const/api';
+
 import { type SsmlOptions, genSSML } from '../utils/genSSML';

 export interface AzureSpeechOptions extends SsmlOptions {
@@ -18,12 +20,12 @@ export interface AzureSpeechOptions extends SsmlOptions {
 }

 // 纯文本生成语音
-export const postAzureSpeech = async (
+export const fetchAzureSpeech = async (
   text: string,
   { api, ...options }: AzureSpeechOptions,
 ): Promise<AudioBufferSourceNode> => {
-  const key = api.key || process.env.AZURE_SPEECH_KEY || '';
-  const region = api.key || process.env.AZURE_SPEECH_REGION || '';
+  const key = api.key || AZURE_SPEECH_KEY;
+  const region = api.key || AZURE_SPEECH_REGION;
   const speechConfig = SpeechConfig.fromSubscription(key, region);
   speechConfig.setProperty(PropertyId.SpeechServiceResponse_RequestSentenceBoundary, 'true');
   speechConfig.speechSynthesisOutputFormat = SpeechSynthesisOutputFormat.Webm24Khz16BitMonoOpus;

src/services/postEdgeSpeech.ts renamed to src/services/fetchEdgeSpeech.ts

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ import { getHeadersAndData } from '../utils/getHeadersAndData';
 const API = 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1';
 const TOKEN = '6A5AA1D4EAFF4E9FB37E23D68491D6F4';

-export const postEdgeSpeech = async (text: string, options: SsmlOptions) => {
+export const fetchEdgeSpeech = async (text: string, options: SsmlOptions) => {
   const connectId = uuidv4().replaceAll('-', '');
   const date = new Date().toString();
   const audioContext = new AudioContext();

src/services/postMicrosoftSpeech.ts renamed to src/services/fetchMicrosoftSpeech.ts

Lines changed: 8 additions & 2 deletions
@@ -1,22 +1,28 @@
 import qs from 'query-string';

+import { MICROSOFT_SPEECH_PROXY_URL } from '@/const/api';
+
 import { type SsmlOptions } from '../utils/genSSML';

 export interface MicrosoftSpeechOptions extends SsmlOptions {
   api?: string;
 }

-export const postMicrosoftSpeech = async (
+export const fetchMicrosoftSpeech = async (
   text: string,
   { api, ...options }: MicrosoftSpeechOptions,
 ): Promise<AudioBufferSourceNode> => {
   const response: Response = await fetch(
     qs.stringifyUrl({
       query: { text, ...options },
-      url: api || process.env.MICROSOFT_SPEECH_PROXY_URL || '',
+      url: api || MICROSOFT_SPEECH_PROXY_URL,
     }),
   );

+  if (!response.ok) {
+    throw new Error('Network response was not ok');
+  }
+
   const audioData = await response.arrayBuffer();
   const audioContext = new AudioContext();
   const audioBufferSource = audioContext.createBufferSource();
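
Besides the rename, the service now reads its default endpoint from `@/const/api` and fails fast on non-2xx responses. A usage sketch, not part of the commit; the proxy URL comes from the docs below, and the voice name and option shape are assumptions about `SsmlOptions`:

```ts
// Illustrative usage of the client-side Microsoft Speech service.
import { fetchMicrosoftSpeech } from './fetchMicrosoftSpeech';

const play = async () => {
  const source = await fetchMicrosoftSpeech('这是一段使用 Microsoft Speech 的语音演示', {
    api: 'https://lobe-tts-preview.vercel.app/api/index', // deployed edge proxy (see index.md below)
    name: 'zh-CN-XiaoxiaoNeural', // assumed voice id from the Edge voice list
  });
  // The service already connected the node to the AudioContext destination.
  source.start();
};
```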

src/services/fetchOpenaiSTT.ts

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+import { v4 as uuidv4 } from 'uuid';
+
+import { OPENAI_API_KEY, OPENAI_STT_URL } from '@/const/api';
+
+export interface OpenaiTtsOptions {
+  api: {
+    key: string;
+    proxy: string;
+  };
+  model?: 'whisper-1';
+}
+
+// 纯文本生成语音
+export const fetchOpenaiSTT = async (
+  speech: Blob,
+  { api, model = 'whisper-1' }: OpenaiTtsOptions,
+): Promise<string> => {
+  const key = api.key || OPENAI_API_KEY;
+  const url = OPENAI_STT_URL(api.proxy);
+
+  const headers = new Headers({
+    'Authorization': `Bearer ${key}`,
+    'Content-Type': 'multipart/form-data',
+  });
+
+  const body = new FormData();
+  body.append('file', speech, `${uuidv4()}.webm`);
+  body.append('model', model);
+
+  const response: Response = await fetch(url, { body, headers, method: 'POST' });
+
+  if (!response.ok) {
+    throw new Error('Network response was not ok');
+  }
+
+  const json = await response.json();
+
+  return json?.text;
+};
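
Despite the reused `OpenaiTtsOptions` name and the copied Chinese comment ("generate speech from plain text"), this service does the reverse: it posts a recorded `Blob` to Whisper and returns the transcription. One caveat for anyone adapting it: when a `FormData` body is used, browsers normally generate the `multipart/form-data` boundary themselves, so hard-coding the `Content-Type` header can drop that boundary. A usage sketch with a hypothetical recording blob:

```ts
// Illustrative usage: transcribe a recorded webm Blob with Whisper.
import { fetchOpenaiSTT } from './fetchOpenaiSTT';

const transcribe = async (recording: Blob) => {
  const text = await fetchOpenaiSTT(recording, {
    // Placeholder key; an empty proxy falls back to OPENAI_PROXY_URL via OPENAI_STT_URL.
    api: { key: 'sk-***', proxy: '' },
    model: 'whisper-1',
  });
  console.log(text);
};
```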

src/services/fetchOpenaiTTS.ts

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+import { OPENAI_API_KEY, OPENAI_TTS_URL } from '@/const/api';
+
+import { type SsmlOptions } from '../utils/genSSML';
+
+export type OpenaiVoice = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
+
+export interface OpenaiTtsOptions extends SsmlOptions {
+  api: {
+    key: string;
+    proxy: string;
+  };
+  model?: 'tts-1' | 'tts-1-hd';
+  name: OpenaiVoice;
+}
+
+// 纯文本生成语音
+export const fetchOpenaiTTS = async (
+  text: string,
+  { api, model = 'tts-1', ...options }: OpenaiTtsOptions,
+): Promise<AudioBufferSourceNode> => {
+  const key = api.key || OPENAI_API_KEY;
+  const url = OPENAI_TTS_URL(api.proxy);
+
+  const headers = new Headers({
+    'Authorization': `Bearer ${key}`,
+    'Content-Type': 'application/json',
+  });
+
+  const body = JSON.stringify({
+    input: text,
+    model,
+    voice: options.name,
+  });
+
+  const response: Response = await fetch(url, { body, headers, method: 'POST' });
+
+  if (!response.ok) {
+    throw new Error('Network response was not ok');
+  }
+
+  const audioData = await response.arrayBuffer();
+  const audioContext = new AudioContext();
+  const audioBufferSource = audioContext.createBufferSource();
+  audioBufferSource.buffer = await audioContext.decodeAudioData(audioData);
+  audioBufferSource.connect(audioContext.destination);
+  return audioBufferSource;
+};
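
This is the core of the commit: plain text goes to OpenAI's `audio/speech` endpoint, and the audio response is decoded into an `AudioBufferSourceNode` that is already wired to the default `AudioContext` destination, mirroring the other services. A usage sketch, assuming the remaining `SsmlOptions` fields are optional (key and text are placeholders):

```ts
// Illustrative usage: synthesize speech with OpenAI TTS and start playback.
import { fetchOpenaiTTS } from './fetchOpenaiTTS';

const speak = async () => {
  const source = await fetchOpenaiTTS('Hello from lobe-tts', {
    api: { key: 'sk-***', proxy: '' }, // empty proxy falls back to OPENAI_PROXY_URL
    model: 'tts-1',
    name: 'alloy',
  });
  source.start(); // the node is already connected to the AudioContext destination
};
```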

src/useAzureSpeech/index.md

Lines changed: 1 addition & 1 deletion
@@ -8,4 +8,4 @@ title: useAzureSpeech

 - ENV: `AZURE_SPEECH_KEY` `AZURE_SPEECH_REGION`

-<code src="./demos/AzureSpeech.tsx" nopadding></code>
+<code src="./demos/index.tsx" nopadding></code>

src/useAzureSpeech/index.ts

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 import { useState } from 'react';
 import useSWR from 'swr';

-import { AzureSpeechOptions, postAzureSpeech } from '../services/postAzureSpeech';
+import { AzureSpeechOptions, fetchAzureSpeech } from '../services/fetchAzureSpeech';

 export const useAzureSpeech = (defaultText: string, options: AzureSpeechOptions) => {
   const [data, setDate] = useState<AudioBufferSourceNode>();
@@ -11,7 +11,7 @@ export const useAzureSpeech = (defaultText: string, options: AzureSpeechOptions) => {

   const { isLoading } = useSWR(
     shouldFetch ? [options.name, text].join('-') : null,
-    () => postAzureSpeech(text, options),
+    () => fetchAzureSpeech(text, options),
     {
       onError: () => setShouldFetch(false),
       onSuccess: (audioBufferSource) => {

src/useEdgeSpeech/index.md

Lines changed: 1 addition & 1 deletion
@@ -6,4 +6,4 @@ title: useEdgeSpeech

 ## hooks

-<code src="./demos/EdgeSpeech.tsx" nopadding></code>
+<code src="./demos/index.tsx" nopadding></code>

src/useEdgeSpeech/index.ts

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 import { useState } from 'react';
 import useSWR from 'swr';

-import { postEdgeSpeech } from '../services/postEdgeSpeech';
+import { fetchEdgeSpeech } from '../services/fetchEdgeSpeech';
 import { SsmlOptions } from '../utils/genSSML';

 export const useEdgeSpeech = (defaultText: string, options: SsmlOptions) => {
@@ -12,7 +12,7 @@ export const useEdgeSpeech = (defaultText: string, options: SsmlOptions) => {

   const { isLoading } = useSWR(
     shouldFetch ? [options.name, text].join('-') : null,
-    () => postEdgeSpeech(text, options),
+    () => fetchEdgeSpeech(text, options),
     {
       onError: () => setShouldFetch(false),
       onSuccess: (audioBufferSource) => {

src/useMicrosoftSpeech/demos/MicrosoftSpeech.tsx renamed to src/useMicrosoftSpeech/demos/index.tsx

Lines changed: 1 addition & 2 deletions
@@ -6,14 +6,13 @@ import { Flexbox } from 'react-layout-kit';

 const defaultText = '这是一段使用 Microsoft Speech 的语音演示';

-const API = 'https://lobe-tts-preview.vercel.app/api/index';
 export default () => {
   const store = useCreateStore();
   const options: any = useControls(
     {
       api: {
         label: 'MICROSOFT_SPEECH_PROXY_URL',
-        value: API,
+        value: '',
       },
       name: {
         options: getEdgeVoiceList(),

src/useMicrosoftSpeech/index.md

Lines changed: 1 addition & 1 deletion
@@ -11,4 +11,4 @@ title: useMicrosoftSpeech

 > ex: <https://lobe-tts-preview.vercel.app/api/index>

-<code src="./demos/MicrosoftSpeech.tsx" nopadding></code>
+<code src="./demos/index.tsx" nopadding></code>

src/useMicrosoftSpeech/index.ts

Lines changed: 5 additions & 2 deletions
@@ -1,7 +1,10 @@
 import { useState } from 'react';
 import useSWR from 'swr';

-import { type MicrosoftSpeechOptions, postMicrosoftSpeech } from '../services/postMicrosoftSpeech';
+import {
+  type MicrosoftSpeechOptions,
+  fetchMicrosoftSpeech,
+} from '../services/fetchMicrosoftSpeech';

 export const useMicrosoftSpeech = (defaultText: string, options: MicrosoftSpeechOptions) => {
   const [data, setDate] = useState<AudioBufferSourceNode>();
@@ -11,7 +14,7 @@ export const useMicrosoftSpeech = (defaultText: string, options: MicrosoftSpeechOptions) => {

   const { isLoading } = useSWR(
     shouldFetch ? [options.name, text].join('-') : null,
-    () => postMicrosoftSpeech(text, options),
+    () => fetchMicrosoftSpeech(text, options),
     {
       onError: () => setShouldFetch(false),
       onSuccess: (audioBufferSource) => {
