An open-source Golang TTS API aggregation library that unifies Text-to-Speech (TTS) APIs from major providers.
- 🎯 Unified Interface: Provides a unified TTS interface to simplify multi-provider integration
- 🌐 Multi-Provider Support: Supports mainstream TTS services including Google, OpenAI, Alibaba Cloud, Azure, and Tencent Cloud
- 🎤 Voice Cloning: Supports voice cloning functionality for Alibaba Cloud and Azure
- 🚀 Easy to Use: Create TTS instances with simple factory methods
- 📦 Lightweight: No external dependencies; uses only the Go standard library
- ✅ Google Cloud Text-to-Speech
- ✅ OpenAI TTS API
- ✅ Alibaba Cloud Intelligent Speech (supports voice cloning)
- ✅ Azure Cognitive Services Speech (supports voice cloning)
- ✅ Tencent Cloud Speech Synthesis
go get github.com/ttshub/ttshub

package main
import (
	"context"
	"os"

	"github.com/ttshub/ttshub"
)
func main() {
// Create credentials
credentials := &ttshub.Credentials{
APIKey: "your-api-key",
}
// Create TTS instance (using OpenAI as example)
tts, err := ttshub.NewTTS(ttshub.ProviderOpenAI, credentials)
if err != nil {
panic(err)
}
// Set synthesis options
options := ttshub.DefaultSynthesisOptions()
options.Language = "zh-CN"
options.Voice = "alloy"
options.Speed = 1.0
// Synthesize speech
audio, err := tts.Synthesize(context.Background(), "Hello, World!", options)
if err != nil {
panic(err)
}
// Save audio file
// os.WriteFile("output.mp3", audio, 0644)
}credentials := &ttshub.Credentials{
APIKey: "your-google-api-key",
}
tts, err := ttshub.NewTTS(ttshub.ProviderGoogle, credentials)

credentials := &ttshub.Credentials{
APIKey: "your-openai-api-key",
}
tts, err := ttshub.NewTTS(ttshub.ProviderOpenAI, credentials)

credentials := &ttshub.Credentials{
AccessKeyID: "your-access-key-id",
AccessKeySecret: "your-access-key-secret",
Region: "cn-shanghai",
}
tts, err := ttshub.NewTTS(ttshub.ProviderAliyun, credentials)

credentials := &ttshub.Credentials{
APIKey: "your-azure-subscription-key",
Region: "eastus",
}
tts, err := ttshub.NewTTS(ttshub.ProviderAzure, credentials)

credentials := &ttshub.Credentials{
AccessKeyID: "your-secret-id",
AccessKeySecret: "your-secret-key",
Region: "ap-beijing",
}
tts, err := ttshub.NewTTS(ttshub.ProviderTencent, credentials)

voices, err := tts.ListVoices(context.Background())
if err != nil {
panic(err)
}
for _, voice := range voices {
fmt.Printf("ID: %s, Name: %s, Language: %s, Gender: %s\n",
voice.ID, voice.Name, voice.Language, voice.Gender)
}

reader, err := tts.SynthesizeStream(context.Background(), "Long text content...", options)
if err != nil {
panic(err)
}
// Read audio stream
buffer := make([]byte, 4096)
for {
	n, err := reader.Read(buffer)
	if n > 0 {
		// Process audio data. Read may return bytes together with an error
		// (including io.EOF), so the n bytes are consumed before err is
		// inspected.
		chunk := buffer[:n]
		_ = chunk
	}
	if err == io.EOF {
		break
	}
	if err != nil {
		// Any other error is fatal; ignoring it would loop forever on a
		// reader that keeps failing.
		panic(err)
	}
}

// Prepare audio samples
samples := []ttshub.VoiceSample{
{
AudioData: audioBytes, // Audio data read from file
AudioFormat: "wav",
Text: "Corresponding text content", // Optional
},
}
// Set clone options
cloneOptions := ttshub.DefaultCloneOptions()
cloneOptions.Name = "My Custom Voice"
cloneOptions.Description = "This is a test voice"
cloneOptions.Language = "zh-CN"
// Create cloned voice
clonedVoiceID, err := tts.CloneVoice(context.Background(), samples, cloneOptions)
if err != nil {
panic(err)
}
fmt.Printf("Cloned Voice ID: %s\n", clonedVoiceID)
// Synthesize using cloned voice
audio, err := tts.SynthesizeWithClonedVoice(
context.Background(),
"Text synthesized with cloned voice",
clonedVoiceID,
options,
)
if err != nil {
panic(err)
}
// List all cloned voices
clonedVoices, err := tts.ListClonedVoices(context.Background())
if err != nil {
panic(err)
}
for _, voice := range clonedVoices {
fmt.Printf("ID: %s, Name: %s, Status: %s\n", voice.ID, voice.Name, voice.Status)
}
// Delete cloned voice
err = tts.DeleteClonedVoice(context.Background(), clonedVoiceID)
if err != nil {
panic(err)
}

Note: Google, OpenAI, and Tencent Cloud do not support voice cloning. Calling the related methods returns the ErrVoiceCloningNotSupported error.
// TTS is the provider-independent text-to-speech interface. It covers plain
// synthesis, streaming synthesis, voice listing, and the voice-cloning
// lifecycle (create, use, list, delete).
type TTS interface {
	// Synthesize converts text to speech and returns the audio as a byte slice.
	Synthesize(ctx context.Context, text string, options *SynthesisOptions) ([]byte, error)
	// SynthesizeStream converts text to speech and returns the audio as a
	// stream to be read incrementally.
	SynthesizeStream(ctx context.Context, text string, options *SynthesisOptions) (io.Reader, error)
	// ListVoices lists available voices.
	ListVoices(ctx context.Context) ([]Voice, error)
	// CloneVoice creates a voice cloning model from the given samples and
	// returns the ID of the cloned voice.
	CloneVoice(ctx context.Context, samples []VoiceSample, options *CloneOptions) (string, error)
	// SynthesizeWithClonedVoice synthesizes text using the cloned voice
	// identified by clonedVoiceID.
	SynthesizeWithClonedVoice(ctx context.Context, text string, clonedVoiceID string, options *SynthesisOptions) ([]byte, error)
	// ListClonedVoices lists all created cloned voices.
	ListClonedVoices(ctx context.Context) ([]ClonedVoice, error)
	// DeleteClonedVoice deletes the cloned voice model identified by
	// clonedVoiceID.
	DeleteClonedVoice(ctx context.Context, clonedVoiceID string) error
}

// SynthesisOptions holds the parameters that control a synthesis request.
// The ranges noted on the fields are typical values, not enforced limits.
type SynthesisOptions struct {
	Language          string                 // Language code, e.g., "zh-CN", "en-US"
	Voice             string                 // Voice name or ID
	Speed             float64                // Speech rate, typically range 0.5-2.0
	Pitch             float64                // Pitch, typically range -20.0 to 20.0
	Volume            float64                // Volume, typically range 0.0-1.0
	AudioFormat       string                 // Audio format, e.g., "mp3", "wav", "pcm"
	SampleRate        int                    // Sample rate, e.g., 16000, 22050, 44100
	AdditionalOptions map[string]interface{} // Provider-specific additional options
}
- Credentials: Different providers require different credentials. Please refer to each provider's documentation to obtain the corresponding keys.
- Parameter Mapping: Parameter ranges may differ between providers. The library automatically maps and converts them.
- Error Handling: Always handle all possible errors.
- Context Cancellation: Supports request cancellation via context.
- Voice Cloning: Currently, only Alibaba Cloud and Azure support voice cloning. Calling the related methods on other providers returns an error.
- Voice Cloning Training: Voice cloning requires uploading audio samples for training, which may take a long time. Please be patient.
For detailed information about each provider's API introduction, documentation links, and pricing, please refer to:
Issues and Pull Requests are welcome!
MIT License