Skip to content

gotoailab/ttshub

Folders and files

NameName
Last commit message
Last commit date

Latest commit

Β 

History

4 Commits
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 

Repository files navigation

ttshub

An open-source Golang TTS API aggregation library that unifies Text-to-Speech (TTS) APIs from major providers.

Features

  • 🎯 Unified Interface: Provides a unified TTS interface to simplify multi-provider integration
  • 🔌 Multi-Provider Support: Supports mainstream TTS services including Google, OpenAI, Alibaba Cloud, Azure, and Tencent Cloud
  • 🎀 Voice Cloning: Supports voice cloning functionality for Alibaba Cloud and Azure
  • 🚀 Easy to Use: Create TTS instances with simple factory methods
  • 📦 Lightweight: No external dependencies; uses only the Go standard library

Supported Providers

  • ✅ Google Cloud Text-to-Speech
  • ✅ OpenAI TTS API
  • ✅ Alibaba Cloud Intelligent Speech (supports voice cloning)
  • ✅ Azure Cognitive Services Speech (supports voice cloning)
  • ✅ Tencent Cloud Speech Synthesis

Installation

go get github.com/ttshub/ttshub

Quick Start

Basic Usage

package main

import (
    "context"
    "os"

    "github.com/ttshub/ttshub"
)

// main demonstrates the basic workflow: build credentials, create a TTS
// client for one provider, synthesize a short phrase, and save the result.
func main() {
    // Create credentials
    credentials := &ttshub.Credentials{
        APIKey: "your-api-key",
    }

    // Create TTS instance (using OpenAI as example)
    tts, err := ttshub.NewTTS(ttshub.ProviderOpenAI, credentials)
    if err != nil {
        panic(err)
    }

    // Set synthesis options
    options := ttshub.DefaultSynthesisOptions()
    options.Language = "zh-CN"
    options.Voice = "alloy"
    options.Speed = 1.0

    // Synthesize speech
    audio, err := tts.Synthesize(context.Background(), "Hello, World!", options)
    if err != nil {
        panic(err)
    }

    // Save audio file. Using the result is required: Go refuses to compile
    // a function that declares a local variable (audio) and never reads it.
    if err := os.WriteFile("output.mp3", audio, 0644); err != nil {
        panic(err)
    }
}

Using Different Providers

Google TTS

credentials := &ttshub.Credentials{
    APIKey: "your-google-api-key",
}
tts, err := ttshub.NewTTS(ttshub.ProviderGoogle, credentials)

OpenAI TTS

credentials := &ttshub.Credentials{
    APIKey: "your-openai-api-key",
}
tts, err := ttshub.NewTTS(ttshub.ProviderOpenAI, credentials)

Alibaba Cloud TTS

credentials := &ttshub.Credentials{
    AccessKeyID:     "your-access-key-id",
    AccessKeySecret: "your-access-key-secret",
    Region:          "cn-shanghai",
}
tts, err := ttshub.NewTTS(ttshub.ProviderAliyun, credentials)

Azure TTS

credentials := &ttshub.Credentials{
    APIKey: "your-azure-subscription-key",
    Region: "eastus",
}
tts, err := ttshub.NewTTS(ttshub.ProviderAzure, credentials)

Tencent Cloud TTS

credentials := &ttshub.Credentials{
    AccessKeyID:     "your-secret-id",
    AccessKeySecret: "your-secret-key",
    Region:          "ap-beijing",
}
tts, err := ttshub.NewTTS(ttshub.ProviderTencent, credentials)

List Available Voices

voices, err := tts.ListVoices(context.Background())
if err != nil {
    panic(err)
}

for _, voice := range voices {
    fmt.Printf("ID: %s, Name: %s, Language: %s, Gender: %s\n",
        voice.ID, voice.Name, voice.Language, voice.Gender)
}

Stream Synthesis

// Start streaming synthesis; the returned reader yields audio as it is read.
reader, err := tts.SynthesizeStream(context.Background(), "Long text content...", options)
if err != nil {
    panic(err)
}

// Read audio stream
buffer := make([]byte, 4096)
for {
    n, err := reader.Read(buffer)
    // Handle the bytes before inspecting err: an io.Reader may return
    // n > 0 together with io.EOF, and that final chunk must not be dropped.
    if n > 0 {
        chunk := buffer[:n] // only the first n bytes are valid
        // Process audio data
        _ = chunk // replace with real handling (write to a file, player, ...)
    }
    if err == io.EOF {
        break
    }
    if err != nil {
        // Any other error is fatal for the stream; without this check the
        // loop would spin forever on a persistent read failure.
        panic(err)
    }
}

Voice Cloning (Alibaba Cloud and Azure Only)

// Prepare audio samples
samples := []ttshub.VoiceSample{
    {
        AudioData:   audioBytes, // Audio data read from file
        AudioFormat: "wav",
        Text:        "Corresponding text content", // Optional
    },
}

// Set clone options
cloneOptions := ttshub.DefaultCloneOptions()
cloneOptions.Name = "My Custom Voice"
cloneOptions.Description = "This is a test voice"
cloneOptions.Language = "zh-CN"

// Create cloned voice
clonedVoiceID, err := tts.CloneVoice(context.Background(), samples, cloneOptions)
if err != nil {
    panic(err)
}

fmt.Printf("Cloned Voice ID: %s\n", clonedVoiceID)

// Synthesize using cloned voice
// NOTE(review): training may take a long time, so the voice may not be
// usable immediately after CloneVoice returns — presumably the Status
// reported by ListClonedVoices should be checked first; confirm per provider.
audio, err := tts.SynthesizeWithClonedVoice(
    context.Background(),
    "Text synthesized with cloned voice",
    clonedVoiceID,
    options,
)
if err != nil {
    panic(err)
}

// Use the result; an unused local variable is a compile error in Go.
fmt.Printf("Synthesized %d bytes of audio\n", len(audio))

// List all cloned voices
clonedVoices, err := tts.ListClonedVoices(context.Background())
if err != nil {
    panic(err)
}

for _, voice := range clonedVoices {
    fmt.Printf("ID: %s, Name: %s, Status: %s\n", voice.ID, voice.Name, voice.Status)
}

// Delete cloned voice
err = tts.DeleteClonedVoice(context.Background(), clonedVoiceID)
if err != nil {
    panic(err)
}

Note: Google, OpenAI, and Tencent Cloud do not support voice cloning. Calling the cloning-related methods on these providers returns the ErrVoiceCloningNotSupported error.

API Documentation

TTS Interface

// TTS is the provider-independent text-to-speech contract. Every method
// takes a context.Context as its first argument. Providers without voice
// cloning support return ErrVoiceCloningNotSupported from the clone-related
// methods.
type TTS interface {
    // Synthesize turns text into audio and returns the audio bytes.
    Synthesize(ctx context.Context, text string, options *SynthesisOptions) ([]byte, error)

    // SynthesizeStream turns text into audio and exposes it as a reader.
    SynthesizeStream(ctx context.Context, text string, options *SynthesisOptions) (io.Reader, error)

    // ListVoices reports the voices that are available.
    ListVoices(ctx context.Context) ([]Voice, error)

    // CloneVoice builds a cloned voice model from the given samples and
    // returns the ID of the cloned voice.
    CloneVoice(ctx context.Context, samples []VoiceSample, options *CloneOptions) (string, error)

    // SynthesizeWithClonedVoice turns text into audio using the cloned
    // voice identified by clonedVoiceID.
    SynthesizeWithClonedVoice(ctx context.Context, text, clonedVoiceID string, options *SynthesisOptions) ([]byte, error)

    // ListClonedVoices reports every cloned voice that has been created.
    ListClonedVoices(ctx context.Context) ([]ClonedVoice, error)

    // DeleteClonedVoice removes the cloned voice model with the given ID.
    DeleteClonedVoice(ctx context.Context, clonedVoiceID string) error
}

SynthesisOptions

// SynthesisOptions collects the tunable parameters of a synthesis request.
// The ranges below are typical values, not enforced limits.
type SynthesisOptions struct {
    // Language is the language code, e.g., "zh-CN", "en-US".
    Language string
    // Voice is the voice name or ID.
    Voice string
    // Speed is the speech rate, typically in the range 0.5-2.0.
    Speed float64
    // Pitch is the pitch, typically in the range -20.0 to 20.0.
    Pitch float64
    // Volume is the volume, typically in the range 0.0-1.0.
    Volume float64
    // AudioFormat is the audio format, e.g., "mp3", "wav", "pcm".
    AudioFormat string
    // SampleRate is the sample rate, e.g., 16000, 22050, 44100.
    SampleRate int
    // AdditionalOptions holds provider-specific additional options.
    AdditionalOptions map[string]interface{}
}

Notes

  1. Credentials: Different providers require different credentials. Please refer to each provider's documentation to obtain the corresponding keys.
  2. Parameter Mapping: Parameter ranges may differ between providers. The library automatically maps and converts them.
  3. Error Handling: Always handle all possible errors.
  4. Context Cancellation: Supports request cancellation via context.
  5. Voice Cloning: Currently, only Alibaba Cloud and Azure support voice cloning. Other providers will return an error when calling related methods.
  6. Voice Cloning Training: Voice cloning requires uploading audio samples for training, which may take a long time. Please be patient.

Provider Information

For detailed information about each provider's API introduction, documentation links, and pricing, please refer to:

Contributing

Issues and Pull Requests are welcome!

License

MIT License

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages