diff --git a/.changeset/thin-worlds-tie.md b/.changeset/thin-worlds-tie.md new file mode 100644 index 0000000000..5506452e80 --- /dev/null +++ b/.changeset/thin-worlds-tie.md @@ -0,0 +1,6 @@ +--- +'@mastra/speech-openai': minor +'@mastra/voice-openai': minor +--- + +Deprecate @mastra/speech-openai for @mastra/voice-openai diff --git a/.gitignore b/.gitignore index 597ee2532c..a0d5f6dec1 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,7 @@ openapi-ts-error* .secrets # Local Netlify folder .netlify -.npmrc \ No newline at end of file +.npmrc + +# Test output directories +voice/**/test-output*/ diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9b9e4822dc..44fff26549 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -2076,7 +2076,7 @@ importers: version: 7.50.0(@types/node@22.13.4) '@rollup/plugin-image': specifier: ^3.0.3 - version: 3.0.3(rollup@4.34.8) + version: 3.0.3(rollup@3.29.5) '@size-limit/preset-small-lib': specifier: ^11.1.4 version: 11.2.0(size-limit@11.2.0) @@ -2137,7 +2137,7 @@ importers: version: 7.50.0(@types/node@22.13.4) '@rollup/plugin-image': specifier: ^3.0.3 - version: 3.0.3(rollup@3.29.5) + version: 3.0.3(rollup@4.34.8) '@size-limit/preset-small-lib': specifier: ^11.1.4 version: 11.2.0(size-limit@11.2.0) @@ -3452,9 +3452,6 @@ importers: typescript: specifier: ^5.7.3 version: 5.7.3 - vitest: - specifier: ^2.1.8 - version: 2.1.9(@edge-runtime/vm@3.2.0)(@types/node@22.13.4)(jsdom@20.0.3(bufferutil@4.0.9)(canvas@2.11.2(encoding@0.1.13))(utf-8-validate@6.0.5))(terser@5.39.0) speech/playai: dependencies: @@ -3943,6 +3940,34 @@ importers: specifier: ^5.7.3 version: 5.7.3 + voice/openai: + dependencies: + '@mastra/core': + specifier: workspace:^ + version: link:../../packages/core + openai: + specifier: ^4.28.0 + version: 4.85.2(encoding@0.1.13)(ws@8.18.0(bufferutil@4.0.9)(utf-8-validate@6.0.5))(zod@3.24.2) + zod: + specifier: ^3.24.1 + version: 3.24.2 + devDependencies: + '@microsoft/api-extractor': + specifier: ^7.49.2 + version: 7.50.0(@types/node@22.13.4) + '@types/node': + specifier: ^22.13.1 + version: 22.13.4 + tsup: + specifier: ^8.3.6 + version: 8.3.6(@microsoft/api-extractor@7.50.0(@types/node@22.13.4))(@swc/core@1.10.18(@swc/helpers@0.5.15))(jiti@2.4.2)(postcss@8.5.2)(tsx@4.19.3)(typescript@5.7.3)(yaml@2.7.0) + typescript: + specifier: ^5.7.3 + version: 5.7.3 + vitest: + specifier: ^2.1.8 + version: 2.1.9(@edge-runtime/vm@3.2.0)(@types/node@22.13.4)(jsdom@20.0.3(bufferutil@4.0.9)(canvas@2.11.2(encoding@0.1.13))(utf-8-validate@6.0.5))(terser@5.39.0) + packages: '@ai-sdk/anthropic@1.1.9': diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index 730506a27e..9b274d33d0 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -5,6 +5,7 @@ packages: - "vector-stores/*" - "stores/*" - "speech/*" + - "voice/*" - "client-sdks/*" - "!packages/cli/admin" - "integration-generator/*" diff --git a/speech/openai/README.md b/speech/openai/README.md index 0891b8068f..d6e90171ea 100644 --- a/speech/openai/README.md +++ b/speech/openai/README.md @@ -1,67 +1,25 @@ -# @mastra/speech-openai +# @mastra/speech-openai (DEPRECATED) -OpenAI Speech integration for Mastra, providing Text-to-Speech (TTS) capabilities using OpenAI's advanced speech models. +⚠️ **This package is deprecated.** Please use [@mastra/voice-openai](https://github.com/mastra-ai/mastra/tree/main/voice/openai) instead. 
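+
+The API surface changed along with the package name: `OpenAITTS.generate()` and `OpenAITTS.stream()` are replaced by a single `OpenAIVoice.speak()`, which returns an audio stream. A minimal before/after sketch (installation and imports are covered in the migration steps below; assumes `OPENAI_API_KEY` is set):
+
+```typescript
+// Before (@mastra/speech-openai)
+const tts = new OpenAITTS({ model: { name: 'tts-1' } });
+const { audioResult } = await tts.generate({ voice: 'alloy', text: 'Hello!' });
+
+// After (@mastra/voice-openai)
+const voice = new OpenAIVoice({ speaker: 'alloy' });
+const audioStream = await voice.speak('Hello!');
+```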
-## Installation +## Migration -```bash -npm install @mastra/speech-openai -``` - -## Configuration +The new package `@mastra/voice-openai` provides both Text-to-Speech and Speech-to-Text capabilities. To migrate: -The module requires the following environment variable: +1. Install the new package: ```bash -OPENAI_API_KEY=your_api_key +npm uninstall @mastra/speech-openai +npm install @mastra/voice-openai ``` -## Usage +2. Update your imports: ```typescript +// Old import { OpenAITTS } from '@mastra/speech-openai'; - -// Initialize with configuration -const tts = new OpenAITTS({ - model: { - name: 'alloy', // Default voice - apiKey: 'your-api-key', // Optional, can use OPENAI_API_KEY env var - }, -}); - -// List available voices -const voices = await tts.voices(); - -// Generate speech -const result = await tts.generate({ - voice: 'alloy', - text: 'Hello from Mastra!', -}); - -// Stream speech -const stream = await tts.stream({ - voice: 'alloy', - text: 'Hello from Mastra!', -}); +// New +import { OpenAIVoice } from '@mastra/voice-openai'; ``` -## Features - -- High-quality Text-to-Speech synthesis -- Multiple voice options -- Streaming support -- Natural and expressive speech output -- Fast generation times - -## Voice Options - -OpenAI provides several high-quality voices: - -- alloy (Neutral) -- echo (Male) -- fable (Male) -- onyx (Male) -- nova (Female) -- shimmer (Female) - -View the complete list in the `voices.ts` file or [OpenAI's documentation](https://platform.openai.com/docs/guides/text-to-speech). +For detailed migration instructions and new features, please refer to the [@mastra/voice-openai documentation](https://github.com/mastra-ai/mastra/tree/main/voice/openai). diff --git a/speech/openai/package.json b/speech/openai/package.json index 7924c2c4f4..974b8c08f3 100644 --- a/speech/openai/package.json +++ b/speech/openai/package.json @@ -1,7 +1,7 @@ { "name": "@mastra/speech-openai", "version": "0.1.3-alpha.0", - "description": "Mastra OpenAI speech integration", + "description": "Mastra OpenAI speech integration (deprecated, please use @mastra/voice-openai instead)", "type": "module", "main": "dist/index.js", "types": "dist/index.d.ts", @@ -16,8 +16,7 @@ }, "scripts": { "build": "tsup src/index.ts --format esm --experimental-dts --clean --treeshake", - "build:watch": "pnpm build --watch", - "test": "vitest run" + "test": "echo \"deprecated\"" }, "dependencies": { "@mastra/core": "workspace:^", @@ -28,7 +27,6 @@ "@microsoft/api-extractor": "^7.49.2", "@types/node": "^22.13.1", "tsup": "^8.0.1", - "typescript": "^5.7.3", - "vitest": "^2.1.8" + "typescript": "^5.7.3" } } diff --git a/speech/openai/src/index.test.ts b/speech/openai/src/index.test.ts deleted file mode 100644 index 2e7cfe279f..0000000000 --- a/speech/openai/src/index.test.ts +++ /dev/null @@ -1,114 +0,0 @@ -import { createWriteStream, writeFileSync } from 'fs'; -import path from 'path'; - -import { OpenAITTS } from './index.js'; - -describe('OpenAITTS Integration Tests', () => { - let tts: OpenAITTS; - - beforeAll(() => { - tts = new OpenAITTS({ - model: { - name: 'tts-1', - }, - }); - }); - - describe('stream', () => { - it('should stream audio data to file', async () => { - const { audioResult } = await tts.stream({ - text: 'Test streaming', - voice: 'alloy', - }); - - return new Promise((resolve, reject) => { - const outputPath = path.join(process.cwd(), 'test-outputs/stream-test.mp3'); - const fileStream = createWriteStream(outputPath); - const chunks: Buffer[] = []; - - audioResult.on('data', (chunk: Buffer) => 
{ - chunks.push(chunk); - }); - - audioResult.pipe(fileStream); - - fileStream.on('finish', () => { - expect(chunks.length).toBeGreaterThan(0); - resolve(undefined); - }); - - audioResult.on('error', reject); - fileStream.on('error', reject); - }); - }), - 50000; - - it('should stream with different parameters and save to file', async () => { - const { audioResult } = await tts.stream({ - text: 'Testing with different voice and speed', - voice: 'nova', - speed: 1.2, - }); - - return new Promise((resolve, reject) => { - const outputPath = path.join(process.cwd(), 'test-outputs/stream-test-params.mp3'); - const fileStream = createWriteStream(outputPath); - - audioResult.pipe(fileStream); - - fileStream.on('finish', resolve); - audioResult.on('error', reject); - fileStream.on('error', reject); - }); - }); - }); - - describe('generate', () => { - it('should return a complete audio buffer and save to file', async () => { - const { audioResult } = await tts.generate({ - text: 'Hello World', - voice: 'alloy', - }); - - expect(Buffer.isBuffer(audioResult)).toBeTruthy(); - expect(audioResult.length).toBeGreaterThan(0); - - const outputPath = path.join(process.cwd(), 'test-outputs/open-aigenerate-test.mp3'); - writeFileSync(outputPath, audioResult); - }); - - it('should work with different parameters and save to file', async () => { - const { audioResult } = await tts.generate({ - text: 'Test with parameters', - voice: 'nova', - speed: 1.5, - }); - - expect(Buffer.isBuffer(audioResult)).toBeTruthy(); - - const outputPath = path.join(process.cwd(), 'test-outputs/open-nova-aigenerate-test.mp3'); - writeFileSync(outputPath, audioResult); - }); - }); - - // Error cases - describe('error handling', () => { - it('should handle invalid voice names', async () => { - await expect( - tts.stream({ - text: 'Test', - voice: 'invalid_voice', - }), - ).rejects.toThrow(); - }); - - it('should handle empty text', async () => { - await expect( - tts.stream({ - text: '', - voice: 'alloy', - }), - ).rejects.toThrow(); - }); - }); -}); diff --git a/speech/openai/src/index.ts b/speech/openai/src/index.ts index e84512585a..96ca0c09cd 100644 --- a/speech/openai/src/index.ts +++ b/speech/openai/src/index.ts @@ -7,6 +7,10 @@ interface OpenAITTSConfig { apiKey?: string; } +throw new Error( + '@mastra/speech-openai is deprecated. Please use @mastra/voice-openai instead, which provides both Text-to-Speech and Speech-to-Text capabilities.', +); + export class OpenAITTS extends MastraTTS { client: OpenAI; constructor({ model }: { model: OpenAITTSConfig }) { diff --git a/voice/openai/CHANGELOG.md b/voice/openai/CHANGELOG.md new file mode 100644 index 0000000000..71691707c1 --- /dev/null +++ b/voice/openai/CHANGELOG.md @@ -0,0 +1,8 @@ +# @mastra/voice-openai + +## 0.1.0 + +### Changes + +- `@mastra/speech-openai` is now deprecated. Please use `@mastra/voice-openai` instead. +- This package provides both Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities through OpenAI's API. diff --git a/voice/openai/README.md b/voice/openai/README.md new file mode 100644 index 0000000000..29637ae231 --- /dev/null +++ b/voice/openai/README.md @@ -0,0 +1,91 @@ +# @mastra/voice-openai + +OpenAI Voice integration for Mastra, providing both Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities using OpenAI's advanced models. 
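+
+Both capabilities live on one class, so you can round-trip audio in a few lines (a minimal sketch, assuming `OPENAI_API_KEY` is set; see Usage below for full configuration):
+
+```typescript
+import { OpenAIVoice } from '@mastra/voice-openai';
+
+const voice = new OpenAIVoice();
+const audio = await voice.speak('Round-trip test'); // Text-to-Speech
+const text = await voice.listen(audio); // Speech-to-Text
+```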
+
+## Installation
+
+```bash
+npm install @mastra/voice-openai
+```
+
+## Configuration
+
+The module requires an OpenAI API key, which can be provided through environment variables or directly in the configuration:
+
+```bash
+OPENAI_API_KEY=your_api_key
+```
+
+## Usage
+
+```typescript
+import { OpenAIVoice } from '@mastra/voice-openai';
+
+// Create voice with both speech and listening capabilities
+const voice = new OpenAIVoice({
+  speechModel: {
+    name: 'tts-1', // or 'tts-1-hd' for higher quality
+    apiKey: 'your-api-key', // Optional, can use OPENAI_API_KEY env var
+  },
+  listeningModel: {
+    name: 'whisper-1',
+    apiKey: 'your-api-key', // Optional, can use OPENAI_API_KEY env var
+  },
+  speaker: 'alloy', // Default voice
+});
+
+// Or create a speech-only voice
+const speechVoice = new OpenAIVoice({
+  speechModel: {
+    name: 'tts-1',
+    apiKey: 'your-api-key',
+  },
+  speaker: 'nova',
+});
+
+// Or create a listening-only voice
+const listeningVoice = new OpenAIVoice({
+  listeningModel: {
+    name: 'whisper-1',
+    apiKey: 'your-api-key',
+  },
+});
+
+// List available voices
+const speakers = await voice.getSpeakers();
+
+// Generate speech
+const audioStream = await voice.speak('Hello from Mastra!', {
+  speaker: 'nova', // Optional: override default speaker
+  speed: 1.0, // Optional: adjust speech speed
+});
+
+// Convert speech to text
+const text = await voice.listen(audioStream, {
+  filetype: 'mp3', // speak() output is MP3 by default
+});
+```
+
+## Features
+
+- High-quality Text-to-Speech synthesis
+- Accurate Speech-to-Text transcription
+- Multiple voice options
+- Natural and expressive speech output
+- Fast processing times
+
+## Voice Options
+
+OpenAI provides several high-quality voices:
+
+- alloy (Neutral)
+- echo (Male)
+- fable (Male)
+- onyx (Male)
+- nova (Female)
+- shimmer (Female)
+- ash (Male)
+- coral (Female)
+- sage (Male)
+
+View the complete list in OpenAI's [Text to Speech documentation](https://platform.openai.com/docs/guides/text-to-speech).
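+
+## Saving Audio to a File
+
+`speak()` resolves to a Node.js readable stream (MP3-encoded by default), so it can be piped anywhere a stream is accepted. A minimal sketch, assuming `OPENAI_API_KEY` is set and the code runs in an async context:
+
+```typescript
+import { createWriteStream } from 'fs';
+import { pipeline } from 'stream/promises';
+import { OpenAIVoice } from '@mastra/voice-openai';
+
+const voice = new OpenAIVoice();
+
+// speak() returns a readable stream of encoded audio
+const audio = await voice.speak('Saving this to disk.');
+
+// Pipe the stream straight into a file
+await pipeline(audio, createWriteStream('speech.mp3'));
+```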
diff --git a/voice/openai/__fixtures__/voice-test.m4a b/voice/openai/__fixtures__/voice-test.m4a
new file mode 100644
index 0000000000..515a9a28ee
Binary files /dev/null and b/voice/openai/__fixtures__/voice-test.m4a differ
diff --git a/voice/openai/package.json b/voice/openai/package.json
new file mode 100644
index 0000000000..ce540f9443
--- /dev/null
+++ b/voice/openai/package.json
@@ -0,0 +1,34 @@
+{
+  "name": "@mastra/voice-openai",
+  "version": "0.1.0-alpha.1",
+  "description": "Mastra OpenAI voice integration",
+  "type": "module",
+  "main": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "exports": {
+    ".": {
+      "import": {
+        "types": "./dist/index.d.ts",
+        "default": "./dist/index.js"
+      }
+    },
+    "./package.json": "./package.json"
+  },
+  "scripts": {
+    "build": "tsup src/index.ts --format esm --experimental-dts --clean --treeshake",
+    "build:watch": "pnpm build --watch",
+    "test": "vitest run"
+  },
+  "dependencies": {
+    "@mastra/core": "workspace:^",
+    "openai": "^4.28.0",
+    "zod": "^3.24.1"
+  },
+  "devDependencies": {
+    "@microsoft/api-extractor": "^7.49.2",
+    "@types/node": "^22.13.1",
+    "tsup": "^8.3.6",
+    "typescript": "^5.7.3",
+    "vitest": "^2.1.8"
+  }
+}
diff --git a/voice/openai/src/index.test.ts b/voice/openai/src/index.test.ts
new file mode 100644
index 0000000000..e0a4aa9a54
--- /dev/null
+++ b/voice/openai/src/index.test.ts
@@ -0,0 +1,190 @@
+import { writeFileSync, mkdirSync, createReadStream } from 'fs';
+import path from 'path';
+import { PassThrough } from 'stream';
+import { describe, expect, it, beforeAll } from 'vitest';
+
+import { OpenAIVoice } from './index.js';
+
+describe('OpenAIVoice Integration Tests', () => {
+  let voice: OpenAIVoice;
+  const outputDir = path.join(process.cwd(), 'test-outputs');
+
+  beforeAll(() => {
+    try {
+      mkdirSync(outputDir, { recursive: true });
+    } catch (err) {
+      // Ignore if directory already exists
+    }
+
+    voice = new OpenAIVoice({
+      speechModel: {
+        name: 'tts-1',
+      },
+      listeningModel: {
+        name: 'whisper-1',
+      },
+    });
+  });
+
+  describe('getSpeakers', () => {
+    it('should list available voices', async () => {
+      const speakers = await voice.getSpeakers();
+      expect(speakers).toContainEqual({ voiceId: 'alloy' });
+      expect(speakers).toContainEqual({ voiceId: 'nova' });
+    });
+  });
+
+  it('should initialize with default parameters', async () => {
+    const defaultVoice = new OpenAIVoice();
+    const speakers = await defaultVoice.getSpeakers();
+    expect(speakers).toBeInstanceOf(Array);
+    expect(speakers.length).toBeGreaterThan(0);
+  });
+
+  describe('speak', () => {
+    it('should speak with default parameters', async () => {
+      const defaultVoice = new OpenAIVoice();
+      const audioStream = await defaultVoice.speak('Hello with defaults');
+
+      const chunks: Buffer[] = [];
+      for await (const chunk of audioStream) {
+        chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+      }
+      const audioBuffer = Buffer.concat(chunks);
+
+      expect(audioBuffer.length).toBeGreaterThan(0);
+    });
+
+    it('should generate audio stream from text', async () => {
+      const audioStream = await voice.speak('Hello World', {
+        speaker: 'alloy',
+      });
+
+      const chunks: Buffer[] = [];
+      for await (const chunk of audioStream) {
+        chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+      }
+      const audioBuffer = Buffer.concat(chunks);
+
+      expect(audioBuffer.length).toBeGreaterThan(0);
+
+      const outputPath = path.join(outputDir, 'speech-test.mp3');
+      writeFileSync(outputPath, audioBuffer);
+    }, 10000);
+
+    it('should work with different parameters', async () => {
+      const audioStream = await voice.speak('Test with parameters', {
+        speaker: 'nova',
+        speed: 0.5,
+      });
+
+      const chunks: Buffer[] = [];
+      for await (const chunk of audioStream) {
+        chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+      }
+      const audioBuffer = Buffer.concat(chunks);
+
+      expect(audioBuffer.length).toBeGreaterThan(0);
+
+      const outputPath = path.join(outputDir, 'speech-test-params.mp3');
+      writeFileSync(outputPath, audioBuffer);
+    }, 10000);
+
+    it('should accept text stream as input', async () => {
+      const inputStream = new PassThrough();
+      inputStream.end('Hello from stream');
+
+      const audioStream = await voice.speak(inputStream, {
+        speaker: 'alloy',
+      });
+
+      const chunks: Buffer[] = [];
+      for await (const chunk of audioStream) {
+        chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+      }
+      const audioBuffer = Buffer.concat(chunks);
+
+      expect(audioBuffer.length).toBeGreaterThan(0);
+
+      const outputPath = path.join(outputDir, 'speech-stream-input.mp3');
+      writeFileSync(outputPath, audioBuffer);
+    }, 10000);
+  });
+
+  describe('listen', () => {
+    it('should listen with default parameters', async () => {
+      const defaultVoice = new OpenAIVoice();
+      const audioStream = await defaultVoice.speak('Listening test with defaults');
+
+      const text = await defaultVoice.listen(audioStream);
+
+      expect(text).toBeTruthy();
+      expect(typeof text).toBe('string');
+      expect(text.toLowerCase()).toContain('listening test');
+    });
+
+    it('should transcribe audio from fixture file', async () => {
+      const fixturePath = path.join(process.cwd(), '__fixtures__', 'voice-test.m4a');
+      const audioStream = createReadStream(fixturePath);
+
+      const text = await voice.listen(audioStream, {
+        filetype: 'm4a',
+      });
+
+      expect(text).toBeTruthy();
+      expect(typeof text).toBe('string');
+      expect(text.length).toBeGreaterThan(0);
+    }, 15000);
+
+    it('should transcribe audio stream', async () => {
+      // First generate some test audio
+      const audioStream = await voice.speak('This is a test for transcription', {
+        speaker: 'alloy',
+      });
+
+      // Then transcribe it (speak() emits MP3 by default)
+      const text = await voice.listen(audioStream, {
+        filetype: 'mp3',
+      });
+
+      expect(text).toBeTruthy();
+      expect(typeof text).toBe('string');
+      expect(text.toLowerCase()).toContain('test');
+    }, 15000);
+
+    it('should accept options', async () => {
+      const audioStream = await voice.speak('Test with language option', {
+        speaker: 'nova',
+      });
+
+      const text = await voice.listen(audioStream, {
+        language: 'en',
+        filetype: 'mp3',
+      });
+
+      expect(text).toBeTruthy();
+      expect(typeof text).toBe('string');
+      expect(text.toLowerCase()).toContain('test');
+    }, 15000);
+  });
+
+  // Error cases
+  describe('error handling', () => {
+    it('should handle invalid speaker names', async () => {
+      await expect(
+        voice.speak('Test', {
+          speaker: 'invalid_voice',
+        }),
+      ).rejects.toThrow();
+    });
+
+    it('should handle empty text', async () => {
+      await expect(
+        voice.speak('', {
+          speaker: 'alloy',
+        }),
+      ).rejects.toThrow();
+    });
+  });
+});
diff --git a/voice/openai/src/index.ts b/voice/openai/src/index.ts
new file mode 100644
index 0000000000..70ce916afc
--- /dev/null
+++ b/voice/openai/src/index.ts
@@ -0,0 +1,204 @@
+import { MastraVoice } from '@mastra/core/voice';
+import OpenAI from 'openai';
+import { PassThrough } from 'stream';
+
+type OpenAIVoiceId = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer' | 'ash' | 'coral' | 'sage';
+type OpenAIModel = 'tts-1' | 'tts-1-hd' | 'whisper-1';
+
+export interface OpenAIConfig {
+  name?: OpenAIModel;
+  apiKey?: string;
+}
+
+export interface OpenAIVoiceConfig {
+  speech?: {
+    model: 'tts-1' | 'tts-1-hd';
+    apiKey?: string;
+    speaker?: OpenAIVoiceId;
+  };
+  listening?: {
+    model: 'whisper-1';
+    apiKey?: string;
+  };
+}
+
+export class OpenAIVoice extends MastraVoice {
+  speechClient?: OpenAI;
+  listeningClient?: OpenAI;
+
+  /**
+   * Constructs an instance of OpenAIVoice with optional configurations for speech and listening models.
+   *
+   * @param {Object} [config] - Configuration options for the OpenAIVoice instance.
+   * @param {OpenAIConfig} [config.listeningModel] - Configuration for the listening model, including model name and API key.
+   * @param {OpenAIConfig} [config.speechModel] - Configuration for the speech model, including model name and API key.
+   * @param {string} [config.speaker] - The default speaker's voice to use for speech synthesis.
+   * @throws {Error} - Throws an error if no API key is available for either the speech or listening model.
+   */
+  constructor({
+    listeningModel,
+    speechModel,
+    speaker,
+  }: {
+    listeningModel?: OpenAIConfig;
+    speechModel?: OpenAIConfig;
+    speaker?: string;
+  } = {}) {
+    const defaultApiKey = process.env.OPENAI_API_KEY;
+    const defaultSpeechModel = {
+      name: 'tts-1',
+      apiKey: defaultApiKey,
+    };
+    const defaultListeningModel = {
+      name: 'whisper-1',
+      apiKey: defaultApiKey,
+    };
+
+    super({
+      speechModel: {
+        name: speechModel?.name ?? defaultSpeechModel.name,
+        apiKey: speechModel?.apiKey ?? defaultSpeechModel.apiKey,
+      },
+      listeningModel: {
+        name: listeningModel?.name ?? defaultListeningModel.name,
+        apiKey: listeningModel?.apiKey ?? defaultListeningModel.apiKey,
+      },
+      speaker: speaker ?? 'alloy',
+    });
+
+    // Only create a client when a key is available, so that speech-only and
+    // listening-only configurations both work.
+    const speechApiKey = speechModel?.apiKey || defaultApiKey;
+    if (speechApiKey) {
+      this.speechClient = new OpenAI({ apiKey: speechApiKey });
+    }
+
+    const listeningApiKey = listeningModel?.apiKey || defaultApiKey;
+    if (listeningApiKey) {
+      this.listeningClient = new OpenAI({ apiKey: listeningApiKey });
+    }
+
+    if (!this.speechClient && !this.listeningClient) {
+      throw new Error('At least one of OPENAI_API_KEY, speechModel.apiKey, or listeningModel.apiKey must be set');
+    }
+  }
+
+  /**
+   * Retrieves a list of available speakers for the speech model.
+   *
+   * @returns {Promise<Array<{ voiceId: string }>>} - A promise that resolves to an array of objects,
+   * each containing a `voiceId` representing an available speaker.
+   * @throws {Error} - Throws an error if the speech model is not configured.
+   */
+  async getSpeakers(): Promise<Array<{ voiceId: string }>> {
+    if (!this.speechModel) {
+      throw new Error('Speech model not configured');
+    }
+
+    return [
+      { voiceId: 'alloy' },
+      { voiceId: 'echo' },
+      { voiceId: 'fable' },
+      { voiceId: 'onyx' },
+      { voiceId: 'nova' },
+      { voiceId: 'shimmer' },
+      { voiceId: 'ash' },
+      { voiceId: 'coral' },
+      { voiceId: 'sage' },
+    ];
+  }
+
+  /**
+   * Converts text (or a readable stream of text) into speech using the configured speech model.
+   *
+   * @param {string | NodeJS.ReadableStream} input - The text, or a readable stream of text, to be converted into speech.
+   * @param {Object} [options] - Optional parameters for the speech synthesis.
+   * @param {string} [options.speaker] - The speaker's voice to use for the speech synthesis.
+   * @param {number} [options.speed] - The speed at which the speech should be synthesized.
+   * @returns {Promise<NodeJS.ReadableStream>} - A promise that resolves to a readable stream of the synthesized audio.
+   * @throws {Error} - Throws an error if the speech model is not configured or if the input text is empty.
+   */
+  async speak(
+    input: string | NodeJS.ReadableStream,
+    options?: {
+      speaker?: string;
+      speed?: number;
+      [key: string]: any;
+    },
+  ): Promise<NodeJS.ReadableStream> {
+    if (!this.speechClient) {
+      throw new Error('Speech model not configured');
+    }
+
+    if (typeof input !== 'string') {
+      // Drain the text stream into a single string before synthesis
+      const chunks: Buffer[] = [];
+      for await (const chunk of input) {
+        chunks.push(Buffer.from(chunk));
+      }
+      input = Buffer.concat(chunks).toString('utf-8');
+    }
+
+    if (input.trim().length === 0) {
+      throw new Error('Input text is empty');
+    }
+
+    const audio = await this.traced(async () => {
+      const response = await this.speechClient!.audio.speech.create({
+        model: this.speechModel?.name ?? 'tts-1',
+        voice: (options?.speaker ?? this.speaker) as OpenAIVoiceId,
+        input,
+        speed: options?.speed || 1.0,
+      });
+
+      // Buffer the full response and expose it as a readable stream
+      const passThrough = new PassThrough();
+      const buffer = Buffer.from(await response.arrayBuffer());
+      passThrough.end(buffer);
+      return passThrough;
+    }, 'voice.openai.speak')();
+
+    return audio;
+  }
+
+  /**
+   * Transcribes audio from a given stream using the configured listening model.
+   *
+   * @param {NodeJS.ReadableStream} audioStream - The audio stream to be transcribed.
+   * @param {Object} [options] - Optional parameters for the transcription.
+   * @param {string} [options.filetype] - The file type of the audio stream.
+   *                                      Supported types include 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm'.
+   * @returns {Promise<string>} - A promise that resolves to the transcribed text.
+   * @throws {Error} - Throws an error if the listening model is not configured.
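+   *
+   * @example
+   * // Hypothetical usage (createReadStream comes from 'fs'); `language` is an
+   * // optional ISO-639-1 hint forwarded to the transcription request.
+   * const text = await voice.listen(createReadStream('clip.m4a'), { filetype: 'm4a', language: 'en' });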
+ */ + async listen( + audioStream: NodeJS.ReadableStream, + options?: { + filetype?: 'mp3' | 'mp4' | 'mpeg' | 'mpga' | 'm4a' | 'wav' | 'webm'; + [key: string]: any; + }, + ): Promise { + if (!this.listeningClient) { + throw new Error('Listening model not configured'); + } + + const chunks: Buffer[] = []; + for await (const chunk of audioStream) { + chunks.push(Buffer.from(chunk)); + } + const audioBuffer = Buffer.concat(chunks); + + const text = await this.traced(async () => { + const { filetype, ...otherOptions } = options || {}; + const file = new File([audioBuffer], `audio.${filetype || 'mp3'}`); + + const response = await this.listeningClient!.audio.transcriptions.create({ + model: this.listeningModel?.name || 'whisper-1', + file: file as any, + ...otherOptions, + }); + + return response.text; + }, 'voice.openai.listen')(); + + return text; + } +} diff --git a/voice/openai/test-outputs/speech-stream-input.mp3 b/voice/openai/test-outputs/speech-stream-input.mp3 new file mode 100644 index 0000000000..a36ab1de83 Binary files /dev/null and b/voice/openai/test-outputs/speech-stream-input.mp3 differ diff --git a/voice/openai/test-outputs/speech-test-params.mp3 b/voice/openai/test-outputs/speech-test-params.mp3 new file mode 100644 index 0000000000..33fa259e0f Binary files /dev/null and b/voice/openai/test-outputs/speech-test-params.mp3 differ diff --git a/voice/openai/test-outputs/speech-test.mp3 b/voice/openai/test-outputs/speech-test.mp3 new file mode 100644 index 0000000000..78a5dd9e34 Binary files /dev/null and b/voice/openai/test-outputs/speech-test.mp3 differ diff --git a/voice/openai/tsconfig.json b/voice/openai/tsconfig.json new file mode 100644 index 0000000000..6750fddcd4 --- /dev/null +++ b/voice/openai/tsconfig.json @@ -0,0 +1,5 @@ +{ + "extends": "../../tsconfig.node.json", + "include": ["src/**/*"], + "exclude": ["node_modules", "**/*.test.ts"] +} diff --git a/speech/openai/vitest.config.ts b/voice/openai/vitest.config.ts similarity index 100% rename from speech/openai/vitest.config.ts rename to voice/openai/vitest.config.ts