From 7dbfb8348527983c4f4210bfe6194dfda791354e Mon Sep 17 00:00:00 2001 From: tars90percent Date: Wed, 28 Jan 2026 19:41:47 +0800 Subject: [PATCH 1/3] Add MiniMax TTS support --- cmd/api_key.go | 49 +++++ cmd/speak.go | 346 ++++++++++++++++++++++++++--- go.mod | 1 + go.sum | 2 + internal/minimax/client.go | 435 +++++++++++++++++++++++++++++++++++++ internal/minimax/doc.go | 2 + 6 files changed, 806 insertions(+), 29 deletions(-) create mode 100644 internal/minimax/client.go create mode 100644 internal/minimax/doc.go diff --git a/cmd/api_key.go b/cmd/api_key.go index 8b22232..c8125f8 100644 --- a/cmd/api_key.go +++ b/cmd/api_key.go @@ -26,6 +26,33 @@ func ensureAPIKey() error { return nil } +func ensureAPIKeyForProvider(provider string) error { + if provider == "minimax" { + return ensureMiniMaxAPIKey() + } + return ensureAPIKey() +} + +func ensureMiniMaxAPIKey() error { + if cfg.APIKey == "" { + key, err := resolveMiniMaxAPIKeyFromFile() + if err != nil { + return err + } + cfg.APIKey = key + } + if cfg.APIKey == "" { + cfg.APIKey = os.Getenv("MINIMAX_API_KEY") + } + if cfg.APIKey == "" { + cfg.APIKey = os.Getenv("SAG_API_KEY") + } + if cfg.APIKey == "" { + return fmt.Errorf("missing MiniMax API key (set --api-key, --api-key-file, or MINIMAX_API_KEY)") + } + return nil +} + func resolveAPIKeyFromFile() (string, error) { path := cfg.APIKeyFile if path == "" { @@ -47,3 +74,25 @@ func resolveAPIKeyFromFile() (string, error) { } return key, nil } + +func resolveMiniMaxAPIKeyFromFile() (string, error) { + path := cfg.APIKeyFile + if path == "" { + path = os.Getenv("MINIMAX_API_KEY_FILE") + } + if path == "" { + path = os.Getenv("SAG_API_KEY_FILE") + } + if path == "" { + return "", nil + } + data, err := os.ReadFile(path) + if err != nil { + return "", fmt.Errorf("read api key file: %w", err) + } + key := strings.TrimSpace(string(data)) + if key == "" { + return "", fmt.Errorf("api key file %q is empty", path) + } + return key, nil +} diff --git a/cmd/speak.go b/cmd/speak.go index e043407..c7644fe 100644 --- a/cmd/speak.go +++ b/cmd/speak.go @@ -13,6 +13,7 @@ import ( "github.com/steipete/sag/internal/audio" "github.com/steipete/sag/internal/elevenlabs" + "github.com/steipete/sag/internal/minimax" "github.com/spf13/cobra" ) @@ -44,6 +45,11 @@ const defaultWPM = 175 // matches macOS `say` default rate var playToSpeakers = audio.StreamToSpeakers +const ( + providerElevenLabs = "elevenlabs" + providerMiniMax = "minimax" +) + func init() { opts := speakOptions{ modelID: "eleven_v3", @@ -55,39 +61,63 @@ func init() { cmd := &cobra.Command{ Use: "speak [text]", - Short: "Speak the provided text using ElevenLabs TTS (default: stream to speakers)", + Short: "Speak the provided text using TTS (default: stream to speakers)", Long: "If no text argument is provided, the command reads from stdin.\n\nTip: run `sag prompting` for model-specific prompting tips and recommended flag combinations.", Args: cobra.ArbitraryArgs, PreRunE: func(_ *cobra.Command, _ []string) error { - return ensureAPIKey() + return ensureAPIKeyForProvider(detectProvider(opts.modelID)) }, RunE: func(cmd *cobra.Command, args []string) error { if err := applyRateAndSpeed(&opts); err != nil { return err } + provider := detectProvider(opts.modelID) forceVoiceID := cmd.Flags().Changed("voice-id") voiceInput := opts.voiceID if voiceInput == "" { - if env := os.Getenv("ELEVENLABS_VOICE_ID"); env != "" { - voiceInput = env - forceVoiceID = true - } else if env := os.Getenv("SAG_VOICE_ID"); env != "" { - voiceInput = env - forceVoiceID = true + if provider == providerMiniMax { + if env := os.Getenv("MINIMAX_VOICE_ID"); env != "" { + voiceInput = env + forceVoiceID = true + } else if env := os.Getenv("SAG_VOICE_ID"); env != "" { + voiceInput = env + forceVoiceID = true + } + } else { + if env := os.Getenv("ELEVENLABS_VOICE_ID"); env != "" { + voiceInput = env + forceVoiceID = true + } else if env := os.Getenv("SAG_VOICE_ID"); env != "" { + voiceInput = env + forceVoiceID = true + } } } - client := elevenlabs.NewClient(cfg.APIKey, cfg.BaseURL) + elevenClient := elevenlabs.NewClient(cfg.APIKey, cfg.BaseURL) + miniClient := minimax.NewClient(cfg.APIKey, minimaxBaseURL()) - voiceID, err := resolveVoice(cmd.Context(), client, voiceInput, forceVoiceID) - if err != nil { - return err - } - if voiceID == "" { - // Likely printed voices for '?' request. - return nil + switch provider { + case providerMiniMax: + voiceID, err := resolveMiniMaxVoice(cmd.Context(), miniClient, voiceInput, forceVoiceID) + if err != nil { + return err + } + if voiceID == "" { + return nil + } + opts.voiceID = voiceID + default: + voiceID, err := resolveVoice(cmd.Context(), elevenClient, voiceInput, forceVoiceID) + if err != nil { + return err + } + if voiceID == "" { + // Likely printed voices for '?' request. + return nil + } + opts.voiceID = voiceID } - opts.voiceID = voiceID text, err := resolveText(args, opts.inputFile) if err != nil { @@ -96,7 +126,11 @@ func init() { // If user provided output path with a known extension, infer a compatible format. if opts.outputPath != "" { - if inferred := inferFormatFromExt(opts.outputPath); inferred != "" { + if provider == providerMiniMax { + if inferred := inferMiniMaxFormatFromExt(opts.outputPath); inferred != "" { + opts.outputFmt = inferred + } + } else if inferred := inferFormatFromExt(opts.outputPath); inferred != "" { opts.outputFmt = inferred } // Disable playback when -o is set, unless --play was explicitly provided @@ -108,25 +142,45 @@ func init() { ctx, cancel := context.WithTimeout(cmd.Context(), 90*time.Second) defer cancel() - payload, err := buildTTSRequest(cmd, opts, text) - if err != nil { - return err - } - start := time.Now() var bytes int64 - if opts.stream { - n, err := streamAndPlay(ctx, client, opts, payload) - bytes = n + switch provider { + case providerMiniMax: + payload, err := buildMiniMaxTTSRequest(opts, text) if err != nil { return err } - } else { - n, err := convertAndPlay(ctx, client, opts, payload) - bytes = n + if opts.stream { + n, err := streamAndPlayMiniMax(ctx, miniClient, opts, payload) + bytes = n + if err != nil { + return err + } + } else { + n, err := convertAndPlayMiniMax(ctx, miniClient, opts, payload) + bytes = n + if err != nil { + return err + } + } + default: + payload, err := buildTTSRequest(cmd, opts, text) if err != nil { return err } + if opts.stream { + n, err := streamAndPlay(ctx, elevenClient, opts, payload) + bytes = n + if err != nil { + return err + } + } else { + n, err := convertAndPlay(ctx, elevenClient, opts, payload) + bytes = n + if err != nil { + return err + } + } } if opts.metrics { fmt.Fprintf(os.Stderr, "metrics: chars=%d bytes=%d model=%s voice=%s stream=%t latencyTier=%d dur=%s\n", @@ -427,6 +481,93 @@ func convertAndPlay(ctx context.Context, client *elevenlabs.Client, opts speakOp return n, nil } +func streamAndPlayMiniMax(ctx context.Context, client *minimax.Client, opts speakOptions, payload minimax.TTSRequest) (int64, error) { + resp, err := client.StreamTTS(ctx, opts.voiceID, payload) + if err != nil { + return 0, err + } + defer func() { + _ = resp.Close() + }() + + writers := make([]io.Writer, 0, 2) + var file io.WriteCloser + if opts.outputPath != "" { + if err := os.MkdirAll(filepath.Dir(opts.outputPath), 0o755); err != nil { + return 0, err + } + file, err = os.Create(opts.outputPath) + if err != nil { + return 0, err + } + defer func() { + _ = file.Close() + }() + writers = append(writers, file) + } + + if opts.play { + pr, pw := io.Pipe() + writers = append(writers, pw) + mw := io.MultiWriter(writers...) + + copyErr := make(chan error, 1) + copyN := make(chan int64, 1) + go func() { + n, err := io.Copy(mw, resp) + copyN <- n + copyErr <- err + _ = pw.Close() + }() + + playErr := playToSpeakers(ctx, pr) + copyNVal := <-copyN + copyErrVal := <-copyErr + if copyErrVal != nil { + return copyNVal, copyErrVal + } + return copyNVal, playErr + } + + if len(writers) == 0 { + return 0, errors.New("nothing to do: enable --play or provide --output") + } + + mw := io.MultiWriter(writers...) + n, err := io.Copy(mw, resp) + return n, err +} + +func convertAndPlayMiniMax(ctx context.Context, client *minimax.Client, opts speakOptions, payload minimax.TTSRequest) (int64, error) { + data, err := client.ConvertTTS(ctx, opts.voiceID, payload) + if err != nil { + return 0, err + } + n := int64(len(data)) + + if opts.outputPath != "" { + if err := os.MkdirAll(filepath.Dir(opts.outputPath), 0o755); err != nil { + return n, err + } + if err := os.WriteFile(opts.outputPath, data, 0o644); err != nil { + return n, err + } + } + + if opts.play { + pr, pw := io.Pipe() + go func() { + _, _ = pw.Write(data) + _ = pw.Close() + }() + return n, playToSpeakers(ctx, pr) + } + if opts.outputPath == "" { + return n, errors.New("nothing to do: enable --play or provide --output") + } + return n, nil +} + func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput string, forceID bool) (string, error) { voiceInput = strings.TrimSpace(voiceInput) if voiceInput == "" { @@ -515,6 +656,68 @@ func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput str return "", fmt.Errorf("voice %q not found; try 'sag voices' or -v '?'", voiceInput) } +func resolveMiniMaxVoice(ctx context.Context, client *minimax.Client, voiceInput string, forceID bool) (string, error) { + voiceInput = strings.TrimSpace(voiceInput) + if voiceInput == "" { + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + voices, err := client.ListVoices(ctx) + if err != nil { + return "", fmt.Errorf("voice not specified and failed to fetch voices: %w", err) + } + if len(voices) == 0 { + return "", errors.New("no voices available; specify --voice or set MINIMAX_VOICE_ID") + } + fmt.Fprintf(os.Stderr, "defaulting to voice %s (%s)\n", voices[0].Name, voices[0].VoiceID) + return voices[0].VoiceID, nil + } + if voiceInput == "?" { + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + voices, err := client.ListVoices(ctx) + if err != nil { + return "", err + } + w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) + if _, err := fmt.Fprintf(w, "VOICE ID\tNAME\tCATEGORY\n"); err != nil { + return "", err + } + for _, v := range voices { + if _, err := fmt.Fprintf(w, "%s\t%s\t%s\n", v.VoiceID, v.Name, v.Category); err != nil { + return "", err + } + } + if err := w.Flush(); err != nil { + return "", err + } + return "", nil + } + if forceID { + return voiceInput, nil + } + + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + voices, err := client.ListVoices(ctx) + if err != nil { + return voiceInput, nil + } + voiceInputLower := strings.ToLower(voiceInput) + for _, v := range voices { + if strings.ToLower(v.VoiceID) == voiceInputLower || strings.ToLower(v.Name) == voiceInputLower { + fmt.Fprintf(os.Stderr, "using voice %s (%s)\n", v.Name, v.VoiceID) + return v.VoiceID, nil + } + } + for _, v := range voices { + if strings.Contains(strings.ToLower(v.Name), voiceInputLower) { + fmt.Fprintf(os.Stderr, "using voice %s (%s)\n", v.Name, v.VoiceID) + return v.VoiceID, nil + } + } + return voiceInput, nil +} + func looksLikeVoiceID(voiceInput string) bool { return len(voiceInput) >= 15 && !strings.ContainsRune(voiceInput, ' ') } @@ -539,3 +742,88 @@ func inferFormatFromExt(path string) string { return "" } } + +func inferMiniMaxFormatFromExt(path string) string { + ext := strings.ToLower(filepath.Ext(path)) + switch ext { + case ".mp3": + return "mp3" + case ".wav", ".wave": + return "wav" + case ".flac": + return "flac" + default: + return "" + } +} + +func detectProvider(modelID string) string { + modelID = strings.ToLower(strings.TrimSpace(modelID)) + if strings.HasPrefix(modelID, "speech-") { + return providerMiniMax + } + return providerElevenLabs +} + +func minimaxBaseURL() string { + host := strings.TrimSpace(os.Getenv("MINIMAX_API_HOST")) + if host == "" { + host = strings.TrimSpace(os.Getenv("MINIMAX_BASE_URL")) + } + if host == "" { + return "" + } + if strings.HasPrefix(host, "http://") || strings.HasPrefix(host, "https://") { + return host + } + return "https://" + host +} + +func buildMiniMaxTTSRequest(opts speakOptions, text string) (minimax.TTSRequest, error) { + format, err := normalizeMiniMaxFormat(opts.outputFmt) + if err != nil { + return minimax.TTSRequest{}, err + } + if opts.stream && format != "mp3" { + return minimax.TTSRequest{}, errors.New("MiniMax streaming supports mp3 only; use --no-stream for wav/flac") + } + if opts.play && format != "mp3" { + return minimax.TTSRequest{}, errors.New("MiniMax playback supports mp3 only; use --output without --play for wav/flac") + } + + speed := opts.speed + return minimax.TTSRequest{ + Model: opts.modelID, + Text: text, + Speed: speed, + Volume: 1.0, + Pitch: 0, + AudioFormat: format, + SampleRate: 32000, + Bitrate: 128000, + Channel: 1, + }, nil +} + +func normalizeMiniMaxFormat(format string) (string, error) { + format = strings.ToLower(strings.TrimSpace(format)) + switch format { + case "", "mp3", "wav", "flac": + if format == "" { + return "mp3", nil + } + return format, nil + case "mp3_44100_128": + return "mp3", nil + case "pcm_44100": + return "wav", nil + default: + if strings.HasPrefix(format, "mp3_") { + return "mp3", nil + } + if strings.HasPrefix(format, "pcm_") { + return "wav", nil + } + return "", fmt.Errorf("format %q not supported for MiniMax (use mp3, wav, flac)", format) + } +} diff --git a/go.mod b/go.mod index adfa966..2cc43cd 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/steipete/sag go 1.24.0 require ( + github.com/coder/websocket v1.8.14 github.com/ebitengine/oto/v3 v3.4.0 github.com/hajimehoshi/go-mp3 v0.3.4 github.com/spf13/cobra v1.10.2 diff --git a/go.sum b/go.sum index 280fd5c..e58cdd8 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= +github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/ebitengine/oto/v3 v3.4.0 h1:br0PgASsEWaoWn38b2Goe7m1GKFYfNgnsjSd5Gg+/bQ= github.com/ebitengine/oto/v3 v3.4.0/go.mod h1:IOleLVD0m+CMak3mRVwsYY8vTctQgOM0iiL6S7Ar7eI= diff --git a/internal/minimax/client.go b/internal/minimax/client.go new file mode 100644 index 0000000..bb98781 --- /dev/null +++ b/internal/minimax/client.go @@ -0,0 +1,435 @@ +package minimax + +import ( + "bytes" + "context" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "path" + "strings" + "time" + + "github.com/coder/websocket" + "github.com/coder/websocket/wsjson" +) + +const defaultBaseURL = "https://api.minimax.io" + +// Client talks to the MiniMax TTS API. +type Client struct { + baseURL string + apiKey string + httpClient *http.Client +} + +// NewClient returns a client configured with the given API key and base URL. +func NewClient(apiKey, baseURL string) *Client { + if baseURL == "" { + baseURL = defaultBaseURL + } + return &Client{ + baseURL: baseURL, + apiKey: apiKey, + httpClient: &http.Client{ + Timeout: 60 * time.Second, + }, + } +} + +// Voice represents a MiniMax voice entry. +type Voice struct { + VoiceID string + Name string + Category string + Description string +} + +type voiceEntry struct { + VoiceID string `json:"voice_id"` + VoiceName string `json:"voice_name"` + Description []string `json:"description,omitempty"` +} + +type listVoicesRequest struct { + VoiceType string `json:"voice_type"` +} + +type listVoicesResponse struct { + SystemVoice []voiceEntry `json:"system_voice"` + VoiceCloning []voiceEntry `json:"voice_cloning"` + VoiceGeneration []voiceEntry `json:"voice_generation"` + BaseResp *baseResp `json:"base_resp,omitempty"` +} + +// ListVoices fetches available voices. +func (c *Client) ListVoices(ctx context.Context) ([]Voice, error) { + u, err := c.httpURL("/v1/get_voice") + if err != nil { + return nil, err + } + + reqBody, err := json.Marshal(listVoicesRequest{VoiceType: "all"}) + if err != nil { + return nil, err + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, u, bytes.NewReader(reqBody)) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+c.apiKey) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, err + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 400 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("list voices failed: %s: %s", resp.Status, strings.TrimSpace(string(body))) + } + + var payload listVoicesResponse + if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { + return nil, err + } + if err := payload.BaseResp.err(); err != nil { + return nil, err + } + + voices := make([]Voice, 0, len(payload.SystemVoice)+len(payload.VoiceCloning)+len(payload.VoiceGeneration)) + appendVoices := func(category string, entries []voiceEntry) { + for _, v := range entries { + name := strings.TrimSpace(v.VoiceName) + if name == "" { + name = v.VoiceID + } + voices = append(voices, Voice{ + VoiceID: v.VoiceID, + Name: name, + Category: category, + Description: strings.Join(v.Description, " "), + }) + } + } + appendVoices("system", payload.SystemVoice) + appendVoices("voice_cloning", payload.VoiceCloning) + appendVoices("voice_generation", payload.VoiceGeneration) + return voices, nil +} + +// TTSRequest configures a text-to-speech request payload. +type TTSRequest struct { + Model string + Text string + Speed float64 + Volume float64 + Pitch int + AudioFormat string + SampleRate int + Bitrate int + Channel int + LanguageBoost string + ContinuousSound *bool +} + +type baseResp struct { + StatusCode int `json:"status_code"` + StatusMsg string `json:"status_msg"` +} + +func (b *baseResp) err() error { + if b == nil { + return nil + } + if b.StatusCode == 0 { + return nil + } + msg := strings.TrimSpace(b.StatusMsg) + if msg == "" { + msg = "unknown error" + } + return fmt.Errorf("minimax error: %s (code=%d)", msg, b.StatusCode) +} + +type voiceSetting struct { + VoiceID string `json:"voice_id"` + Speed float64 `json:"speed"` + Vol float64 `json:"vol"` + Pitch int `json:"pitch"` +} + +type audioSetting struct { + Format string `json:"format,omitempty"` + SampleRate int `json:"sample_rate,omitempty"` + Bitrate int `json:"bitrate,omitempty"` + Channel int `json:"channel,omitempty"` +} + +type t2aRequest struct { + Model string `json:"model"` + Text string `json:"text"` + Stream bool `json:"stream"` + OutputFormat string `json:"output_format,omitempty"` + VoiceSetting voiceSetting `json:"voice_setting"` + AudioSetting audioSetting `json:"audio_setting,omitempty"` + LanguageBoost string `json:"language_boost,omitempty"` + ContinuousSound *bool `json:"continuous_sound,omitempty"` +} + +type t2aResponse struct { + Data struct { + Audio string `json:"audio"` + } `json:"data"` + BaseResp *baseResp `json:"base_resp,omitempty"` +} + +// ConvertTTS downloads the full audio before returning. +func (c *Client) ConvertTTS(ctx context.Context, voiceID string, req TTSRequest) ([]byte, error) { + u, err := c.httpURL("/v1/t2a_v2") + if err != nil { + return nil, err + } + + payload := t2aRequest{ + Model: req.Model, + Text: req.Text, + Stream: false, + OutputFormat: "hex", + VoiceSetting: buildVoiceSetting(voiceID, req), + AudioSetting: buildAudioSetting(req), + LanguageBoost: req.LanguageBoost, + ContinuousSound: req.ContinuousSound, + } + bodyBytes, err := json.Marshal(payload) + if err != nil { + return nil, err + } + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, u, bytes.NewReader(bodyBytes)) + if err != nil { + return nil, err + } + httpReq.Header.Set("Authorization", "Bearer "+c.apiKey) + httpReq.Header.Set("Content-Type", "application/json") + httpReq.Header.Set("Accept", "application/json") + + resp, err := c.httpClient.Do(httpReq) + if err != nil { + return nil, err + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 400 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("convert TTS failed: %s: %s", resp.Status, strings.TrimSpace(string(body))) + } + + var response t2aResponse + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + return nil, err + } + if err := response.BaseResp.err(); err != nil { + return nil, err + } + if response.Data.Audio == "" { + return nil, errors.New("minimax response missing audio") + } + + data, err := hex.DecodeString(response.Data.Audio) + if err != nil { + return nil, fmt.Errorf("decode audio hex: %w", err) + } + return data, nil +} + +type wsTaskStart struct { + Event string `json:"event"` + Model string `json:"model"` + VoiceSetting voiceSetting `json:"voice_setting"` + AudioSetting audioSetting `json:"audio_setting,omitempty"` + LanguageBoost string `json:"language_boost,omitempty"` + ContinuousSound *bool `json:"continuous_sound,omitempty"` +} + +type wsTaskContinue struct { + Event string `json:"event"` + Text string `json:"text"` +} + +type wsMessage struct { + Event string `json:"event"` + Data *wsData `json:"data,omitempty"` + BaseResp *baseResp `json:"base_resp,omitempty"` + IsFinal bool `json:"is_final,omitempty"` +} + +type wsData struct { + Audio string `json:"audio,omitempty"` +} + +type cancelReadCloser struct { + *io.PipeReader + cancel func() +} + +func (c *cancelReadCloser) Close() error { + c.cancel() + return c.PipeReader.Close() +} + +// StreamTTS streams MP3 audio from MiniMax via WebSocket. +func (c *Client) StreamTTS(ctx context.Context, voiceID string, req TTSRequest) (io.ReadCloser, error) { + wsURL, err := c.wsURL("/ws/v1/t2a_v2") + if err != nil { + return nil, err + } + ctx, cancel := context.WithCancel(ctx) + pr, pw := io.Pipe() + + go func() { + defer cancel() + defer func() { _ = pw.Close() }() + + header := http.Header{} + header.Set("Authorization", "Bearer "+c.apiKey) + conn, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: header}) + if err != nil { + _ = pw.CloseWithError(err) + return + } + defer func() { + _ = conn.Close(websocket.StatusNormalClosure, "done") + }() + + if err := readWSUntilEvent(ctx, conn, "connected_success"); err != nil { + _ = pw.CloseWithError(err) + return + } + + start := wsTaskStart{ + Event: "task_start", + Model: req.Model, + VoiceSetting: buildVoiceSetting(voiceID, req), + AudioSetting: buildAudioSetting(req), + LanguageBoost: req.LanguageBoost, + ContinuousSound: req.ContinuousSound, + } + if err := wsjson.Write(ctx, conn, start); err != nil { + _ = pw.CloseWithError(err) + return + } + if err := readWSUntilEvent(ctx, conn, "task_started"); err != nil { + _ = pw.CloseWithError(err) + return + } + + if err := wsjson.Write(ctx, conn, wsTaskContinue{Event: "task_continue", Text: req.Text}); err != nil { + _ = pw.CloseWithError(err) + return + } + + for { + var msg wsMessage + if err := wsjson.Read(ctx, conn, &msg); err != nil { + _ = pw.CloseWithError(err) + return + } + if err := msg.BaseResp.err(); err != nil { + _ = pw.CloseWithError(err) + return + } + if msg.Event == "task_failed" { + _ = pw.CloseWithError(errors.New("minimax stream failed")) + return + } + if msg.Data != nil && msg.Data.Audio != "" { + chunk, err := hex.DecodeString(msg.Data.Audio) + if err != nil { + _ = pw.CloseWithError(fmt.Errorf("decode audio chunk: %w", err)) + return + } + if len(chunk) > 0 { + if _, err := pw.Write(chunk); err != nil { + return + } + } + } + if msg.IsFinal || msg.Event == "task_finished" { + return + } + } + }() + + return &cancelReadCloser{PipeReader: pr, cancel: cancel}, nil +} + +func readWSUntilEvent(ctx context.Context, conn *websocket.Conn, want string) error { + for { + var msg wsMessage + if err := wsjson.Read(ctx, conn, &msg); err != nil { + return err + } + if err := msg.BaseResp.err(); err != nil { + return err + } + if msg.Event == "task_failed" { + return errors.New("minimax task failed") + } + if msg.Event == want { + return nil + } + } +} + +func buildVoiceSetting(voiceID string, req TTSRequest) voiceSetting { + return voiceSetting{ + VoiceID: voiceID, + Speed: req.Speed, + Vol: req.Volume, + Pitch: req.Pitch, + } +} + +func buildAudioSetting(req TTSRequest) audioSetting { + return audioSetting{ + Format: req.AudioFormat, + SampleRate: req.SampleRate, + Bitrate: req.Bitrate, + Channel: req.Channel, + } +} + +func (c *Client) httpURL(endpoint string) (string, error) { + u, err := url.Parse(c.baseURL) + if err != nil { + return "", err + } + u.Path = path.Join(u.Path, endpoint) + return u.String(), nil +} + +func (c *Client) wsURL(endpoint string) (string, error) { + u, err := url.Parse(c.baseURL) + if err != nil { + return "", err + } + switch u.Scheme { + case "http": + u.Scheme = "ws" + case "https": + u.Scheme = "wss" + case "ws", "wss": + default: + u.Scheme = "wss" + } + u.Path = path.Join(u.Path, endpoint) + return u.String(), nil +} diff --git a/internal/minimax/doc.go b/internal/minimax/doc.go new file mode 100644 index 0000000..599152f --- /dev/null +++ b/internal/minimax/doc.go @@ -0,0 +1,2 @@ +// Package minimax provides a small client for the MiniMax TTS API. +package minimax From 214ca4d7d7501f99e60c935c8eff76cdc48676b2 Mon Sep 17 00:00:00 2001 From: Vincent Wu Date: Thu, 29 Jan 2026 03:13:49 +0800 Subject: [PATCH 2/3] docs: mention MiniMax TTS option --- README.md | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 78974c3..e456f49 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# sag 🗣️ — “Mac-style speech with ElevenLabs” +# sag 🗣️ — “Mac-style speech with ElevenLabs and MiniMax” -One-liner TTS that works like `say`: stream to speakers by default, list voices, or save audio files. +One-liner TTS that works like `say`: stream to speakers by default, list voices, or save audio files. Defaults to ElevenLabs, with MiniMax available via `speech-*` model IDs. ## Install Homebrew (macOS): @@ -15,18 +15,20 @@ go install ./cmd/sag Requires Go 1.24+. ## Configuration -- `ELEVENLABS_API_KEY` (required) -- `--api-key-file` or `ELEVENLABS_API_KEY_FILE`/`SAG_API_KEY_FILE` to load the key from a file -- Optional defaults: `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID` +- ElevenLabs: `ELEVENLABS_API_KEY` (or `SAG_API_KEY`) +- MiniMax: `MINIMAX_API_KEY` (or `SAG_API_KEY`) +- `--api-key-file` or `ELEVENLABS_API_KEY_FILE`/`MINIMAX_API_KEY_FILE`/`SAG_API_KEY_FILE` to load the key from a file +- Optional defaults: `ELEVENLABS_VOICE_ID`, `MINIMAX_VOICE_ID`, or `SAG_VOICE_ID` +- Optional: `MINIMAX_API_HOST` or `MINIMAX_BASE_URL` to override the MiniMax base URL ## Usage Features: - macOS `say`-style default: `sag "Hello"` routes to `speak` automatically. - Streaming playback to speakers with optional file output. -- Voice discovery via `sag voices` and `-v ?`. +- Voice discovery via `sag voices` (ElevenLabs) and `-v ?` (provider-specific). - Speed/rate controls, latency tiers, and format inference from output extension. -- Model selection via `--model-id` (defaults to `eleven_v3`; use `eleven_multilingual_v2` for a stable baseline). +- Model selection via `--model-id` (defaults to `eleven_v3`; use `eleven_multilingual_v2` for a stable baseline, `speech-*` for MiniMax). Speak (streams audio): ```bash @@ -52,6 +54,8 @@ sag speak -v Roger --stream --latency-tier 3 "Faster start" sag speak -v Roger --speed 1.2 "Talk a bit faster" sag speak -v Roger --model-id eleven_multilingual_v2 "Use stable v2 baseline" sag speak -v Roger --output out.wav --format pcm_44100 "Wave output" +sag speak --model-id speech-01 -v ? "List MiniMax voices" +sag speak --model-id speech-01 --output out.flac --stream=false "MiniMax file output" ``` Key flags (subset): @@ -94,7 +98,11 @@ Highlights: ## Models / engines -`sag` supports any ElevenLabs `model_id` via `--model-id` (we pass it through). Practical defaults + common IDs: +Provider selection: +- ElevenLabs (default): any ElevenLabs `model_id` via `--model-id` (we pass it through). +- MiniMax: use a `speech-*` model ID to route requests to MiniMax. Streaming/playback is MP3-only; use `--stream=false` for WAV/FLAC output. + +Practical defaults + common ElevenLabs IDs: | Engine | `--model-id` | Prompting style | Best for | |---|---|---|---| @@ -123,6 +131,6 @@ Notes: - Build: `go build ./cmd/sag` ## Limitations -- ElevenLabs account and API key required. +- ElevenLabs or MiniMax account and API key required (per provider). - Voice defaults to first available if not provided. - Non-mac platforms: playback still works via `go-mp3` + `oto`, but device selection flags are no-ops. From e59f3679a97ac23a3e9b56a9cee55db9293af5fc Mon Sep 17 00:00:00 2001 From: tars90percent Date: Thu, 29 Jan 2026 19:52:35 +0800 Subject: [PATCH 3/3] Use HTTP SSE for MiniMax streaming --- cmd/speak.go | 197 ++++++++++++++++++-- go.mod | 1 - go.sum | 2 - internal/minimax/client.go | 364 +++++++++++++++++++++---------------- 4 files changed, 391 insertions(+), 173 deletions(-) diff --git a/cmd/speak.go b/cmd/speak.go index c7644fe..002265e 100644 --- a/cmd/speak.go +++ b/cmd/speak.go @@ -39,6 +39,20 @@ type speakOptions struct { speakerBoost bool noSpeakerBoost bool + + minimaxVolume float64 + minimaxPitch int + minimaxEmotion string + minimaxLanguage string + minimaxAccent string + minimaxTone []string + minimaxTextNormalization bool + minimaxLatexRead bool + minimaxContinuousSound bool + minimaxVoiceModifyPitch int + minimaxVoiceModifyIntensity int + minimaxVoiceModifyTimbre int + minimaxVoiceModifySoundEffects string } const defaultWPM = 175 // matches macOS `say` default rate @@ -146,7 +160,7 @@ func init() { var bytes int64 switch provider { case providerMiniMax: - payload, err := buildMiniMaxTTSRequest(opts, text) + payload, err := buildMiniMaxTTSRequest(cmd, opts, text) if err != nil { return err } @@ -211,6 +225,19 @@ func init() { cmd.Flags().StringVar(&opts.lang, "lang", "", "Language code (2-letter ISO 639-1; influences normalization; when set)") cmd.Flags().BoolVar(&opts.metrics, "metrics", false, "Print request metrics to stderr (chars, bytes, duration, etc.)") cmd.Flags().StringVarP(&opts.inputFile, "input-file", "f", "", "Read text from file (use '-' for stdin), matching macOS say -f") + cmd.Flags().Float64Var(&opts.minimaxVolume, "volume", 0, "MiniMax voice volume (0..10; when set)") + cmd.Flags().IntVar(&opts.minimaxPitch, "pitch", 0, "MiniMax voice pitch (-12..12; when set)") + cmd.Flags().StringVar(&opts.minimaxEmotion, "emotion", "", "MiniMax voice emotion (model dependent)") + cmd.Flags().StringVar(&opts.minimaxLanguage, "language", "", "MiniMax language boost (e.g. English, Chinese,Yue; when set)") + cmd.Flags().StringVar(&opts.minimaxAccent, "accent", "", "Alias for --language (MiniMax language boost)") + cmd.Flags().StringArrayVar(&opts.minimaxTone, "tone", nil, "MiniMax pronunciation tone override (repeatable, e.g. \"omg/oh my god\")") + cmd.Flags().BoolVar(&opts.minimaxTextNormalization, "text-normalization", false, "MiniMax text normalization (improves digit reading; when set)") + cmd.Flags().BoolVar(&opts.minimaxLatexRead, "latex-read", false, "MiniMax LaTeX formula reading (Chinese only; when set)") + cmd.Flags().BoolVar(&opts.minimaxContinuousSound, "continuous-sound", false, "MiniMax continuous sound for smoother transitions (when set)") + cmd.Flags().IntVar(&opts.minimaxVoiceModifyPitch, "voice-modify-pitch", 0, "MiniMax voice modify pitch (-100..100; when set)") + cmd.Flags().IntVar(&opts.minimaxVoiceModifyIntensity, "voice-modify-intensity", 0, "MiniMax voice modify intensity (-100..100; when set)") + cmd.Flags().IntVar(&opts.minimaxVoiceModifyTimbre, "voice-modify-timbre", 0, "MiniMax voice modify timbre (-100..100; when set)") + cmd.Flags().StringVar(&opts.minimaxVoiceModifySoundEffects, "voice-modify-sound-effects", "", "MiniMax voice modify sound effects (e.g. spacious_echo, auditorium_echo, lofi_telephone, robotic)") cmd.Flags().Bool("progress", false, "Accepted for macOS say compatibility (no-op)") cmd.Flags().String("network-send", "", "Accepted for macOS say compatibility (not implemented)") cmd.Flags().String("audio-device", "", "Accepted for macOS say compatibility (not implemented)") @@ -591,11 +618,13 @@ func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput str return "", err } w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) - if _, err := fmt.Fprintf(w, "VOICE ID\tNAME\tCATEGORY\n"); err != nil { + if _, err := fmt.Fprintf(w, "VOICE ID\tNAME\tCATEGORY\tDESCRIPTION\n"); err != nil { return "", err } for _, v := range voices { - if _, err := fmt.Fprintf(w, "%s\t%s\t%s\n", v.VoiceID, v.Name, v.Category); err != nil { + desc := strings.ReplaceAll(v.Description, "\t", " ") + desc = strings.ReplaceAll(desc, "\n", " ") + if _, err := fmt.Fprintf(w, "%s\t%s\t%s\t%s\n", v.VoiceID, v.Name, v.Category, desc); err != nil { return "", err } } @@ -779,29 +808,161 @@ func minimaxBaseURL() string { return "https://" + host } -func buildMiniMaxTTSRequest(opts speakOptions, text string) (minimax.TTSRequest, error) { +func buildMiniMaxTTSRequest(cmd *cobra.Command, opts speakOptions, text string) (minimax.TTSRequest, error) { + flags := cmd.Flags() + format, err := normalizeMiniMaxFormat(opts.outputFmt) if err != nil { return minimax.TTSRequest{}, err } - if opts.stream && format != "mp3" { - return minimax.TTSRequest{}, errors.New("MiniMax streaming supports mp3 only; use --no-stream for wav/flac") + formatExplicit := flags.Changed("format") || opts.outputPath != "" + if formatExplicit { + if opts.stream && format != "mp3" { + return minimax.TTSRequest{}, errors.New("MiniMax streaming supports mp3 only; use --no-stream for wav/flac") + } + if opts.play && format != "mp3" { + return minimax.TTSRequest{}, errors.New("MiniMax playback supports mp3 only; use --output without --play for wav/flac") + } + } else { + format = "" } - if opts.play && format != "mp3" { - return minimax.TTSRequest{}, errors.New("MiniMax playback supports mp3 only; use --output without --play for wav/flac") + + var speedPtr *float64 + if flags.Changed("speed") || flags.Changed("rate") { + speed := opts.speed + speedPtr = &speed + } + + var volumePtr *float64 + if flags.Changed("volume") { + if opts.minimaxVolume <= 0 || opts.minimaxVolume > 10 { + return minimax.TTSRequest{}, errors.New("volume must be between 0 and 10 (exclusive 0)") + } + volume := opts.minimaxVolume + volumePtr = &volume + } + + var pitchPtr *int + if flags.Changed("pitch") { + if opts.minimaxPitch < -12 || opts.minimaxPitch > 12 { + return minimax.TTSRequest{}, errors.New("pitch must be between -12 and 12") + } + pitch := opts.minimaxPitch + pitchPtr = &pitch + } + + emotion := strings.TrimSpace(opts.minimaxEmotion) + if flags.Changed("emotion") && emotion == "" { + return minimax.TTSRequest{}, errors.New("emotion cannot be empty") + } + + var textNormPtr *bool + if flags.Changed("text-normalization") { + v := opts.minimaxTextNormalization + textNormPtr = &v + } + + var latexReadPtr *bool + if flags.Changed("latex-read") { + v := opts.minimaxLatexRead + latexReadPtr = &v + } + + var continuousSoundPtr *bool + if flags.Changed("continuous-sound") { + v := opts.minimaxContinuousSound + continuousSoundPtr = &v + } + + var languageBoost string + if flags.Changed("language") || flags.Changed("accent") { + lang := strings.TrimSpace(opts.minimaxLanguage) + accent := strings.TrimSpace(opts.minimaxAccent) + if lang != "" && accent != "" && lang != accent { + return minimax.TTSRequest{}, errors.New("choose only one of --language or --accent (or set the same value)") + } + if lang != "" { + languageBoost = lang + } else { + languageBoost = accent + } + if languageBoost == "" { + return minimax.TTSRequest{}, errors.New("language/accent cannot be empty") + } + } + + var tone []string + if flags.Changed("tone") { + for _, entry := range opts.minimaxTone { + value := strings.TrimSpace(entry) + if value == "" { + return minimax.TTSRequest{}, errors.New("tone entries cannot be empty") + } + tone = append(tone, value) + } + } + + var voiceModify *minimax.VoiceModify + var voiceModifyPitch *int + var voiceModifyIntensity *int + var voiceModifyTimbre *int + var voiceModifySoundEffects *string + if flags.Changed("voice-modify-pitch") { + if opts.minimaxVoiceModifyPitch < -100 || opts.minimaxVoiceModifyPitch > 100 { + return minimax.TTSRequest{}, errors.New("voice-modify-pitch must be between -100 and 100") + } + v := opts.minimaxVoiceModifyPitch + voiceModifyPitch = &v + } + if flags.Changed("voice-modify-intensity") { + if opts.minimaxVoiceModifyIntensity < -100 || opts.minimaxVoiceModifyIntensity > 100 { + return minimax.TTSRequest{}, errors.New("voice-modify-intensity must be between -100 and 100") + } + v := opts.minimaxVoiceModifyIntensity + voiceModifyIntensity = &v + } + if flags.Changed("voice-modify-timbre") { + if opts.minimaxVoiceModifyTimbre < -100 || opts.minimaxVoiceModifyTimbre > 100 { + return minimax.TTSRequest{}, errors.New("voice-modify-timbre must be between -100 and 100") + } + v := opts.minimaxVoiceModifyTimbre + voiceModifyTimbre = &v + } + if flags.Changed("voice-modify-sound-effects") { + value := strings.TrimSpace(opts.minimaxVoiceModifySoundEffects) + if value == "" { + return minimax.TTSRequest{}, errors.New("voice-modify-sound-effects cannot be empty") + } + voiceModifySoundEffects = &value + } + if voiceModifyPitch != nil || voiceModifyIntensity != nil || voiceModifyTimbre != nil || voiceModifySoundEffects != nil { + voiceModify = &minimax.VoiceModify{ + Pitch: voiceModifyPitch, + Intensity: voiceModifyIntensity, + Timbre: voiceModifyTimbre, + SoundEffects: voiceModifySoundEffects, + } + } + + var pronunciationDict *minimax.PronunciationDict + if len(tone) > 0 { + pronunciationDict = &minimax.PronunciationDict{Tone: tone} } - speed := opts.speed return minimax.TTSRequest{ - Model: opts.modelID, - Text: text, - Speed: speed, - Volume: 1.0, - Pitch: 0, - AudioFormat: format, - SampleRate: 32000, - Bitrate: 128000, - Channel: 1, + Model: opts.modelID, + Text: text, + Speed: speedPtr, + Volume: volumePtr, + Pitch: pitchPtr, + Emotion: emotion, + TextNormalization: textNormPtr, + LatexRead: latexReadPtr, + AudioFormat: format, + LanguageBoost: languageBoost, + ContinuousSound: continuousSoundPtr, + PronunciationDict: pronunciationDict, + VoiceModify: voiceModify, }, nil } diff --git a/go.mod b/go.mod index 2cc43cd..adfa966 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,6 @@ module github.com/steipete/sag go 1.24.0 require ( - github.com/coder/websocket v1.8.14 github.com/ebitengine/oto/v3 v3.4.0 github.com/hajimehoshi/go-mp3 v0.3.4 github.com/spf13/cobra v1.10.2 diff --git a/go.sum b/go.sum index e58cdd8..280fd5c 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,3 @@ -github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= -github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/ebitengine/oto/v3 v3.4.0 h1:br0PgASsEWaoWn38b2Goe7m1GKFYfNgnsjSd5Gg+/bQ= github.com/ebitengine/oto/v3 v3.4.0/go.mod h1:IOleLVD0m+CMak3mRVwsYY8vTctQgOM0iiL6S7Ar7eI= diff --git a/internal/minimax/client.go b/internal/minimax/client.go index bb98781..7211b8f 100644 --- a/internal/minimax/client.go +++ b/internal/minimax/client.go @@ -1,6 +1,7 @@ package minimax import ( + "bufio" "bytes" "context" "encoding/hex" @@ -13,9 +14,6 @@ import ( "path" "strings" "time" - - "github.com/coder/websocket" - "github.com/coder/websocket/wsjson" ) const defaultBaseURL = "https://api.minimax.io" @@ -127,17 +125,22 @@ func (c *Client) ListVoices(ctx context.Context) ([]Voice, error) { // TTSRequest configures a text-to-speech request payload. type TTSRequest struct { - Model string - Text string - Speed float64 - Volume float64 - Pitch int - AudioFormat string - SampleRate int - Bitrate int - Channel int - LanguageBoost string - ContinuousSound *bool + Model string + Text string + Speed *float64 + Volume *float64 + Pitch *int + Emotion string + TextNormalization *bool + LatexRead *bool + AudioFormat string + SampleRate int + Bitrate int + Channel int + LanguageBoost string + ContinuousSound *bool + PronunciationDict *PronunciationDict + VoiceModify *VoiceModify } type baseResp struct { @@ -160,10 +163,13 @@ func (b *baseResp) err() error { } type voiceSetting struct { - VoiceID string `json:"voice_id"` - Speed float64 `json:"speed"` - Vol float64 `json:"vol"` - Pitch int `json:"pitch"` + VoiceID string `json:"voice_id"` + Speed *float64 `json:"speed,omitempty"` + Vol *float64 `json:"vol,omitempty"` + Pitch *int `json:"pitch,omitempty"` + Emotion string `json:"emotion,omitempty"` + TextNormalization *bool `json:"text_normalization,omitempty"` + LatexRead *bool `json:"latex_read,omitempty"` } type audioSetting struct { @@ -173,15 +179,35 @@ type audioSetting struct { Channel int `json:"channel,omitempty"` } +// PronunciationDict configures pronunciation overrides. +type PronunciationDict struct { + Tone []string `json:"tone,omitempty"` +} + +// VoiceModify configures voice effects. +type VoiceModify struct { + Pitch *int `json:"pitch,omitempty"` + Intensity *int `json:"intensity,omitempty"` + Timbre *int `json:"timbre,omitempty"` + SoundEffects *string `json:"sound_effects,omitempty"` +} + type t2aRequest struct { - Model string `json:"model"` - Text string `json:"text"` - Stream bool `json:"stream"` - OutputFormat string `json:"output_format,omitempty"` - VoiceSetting voiceSetting `json:"voice_setting"` - AudioSetting audioSetting `json:"audio_setting,omitempty"` - LanguageBoost string `json:"language_boost,omitempty"` - ContinuousSound *bool `json:"continuous_sound,omitempty"` + Model string `json:"model"` + Text string `json:"text"` + Stream bool `json:"stream"` + StreamOptions *t2aStreamOptions `json:"stream_options,omitempty"` + OutputFormat string `json:"output_format,omitempty"` + VoiceSetting voiceSetting `json:"voice_setting"` + AudioSetting audioSetting `json:"audio_setting,omitempty"` + LanguageBoost string `json:"language_boost,omitempty"` + ContinuousSound *bool `json:"continuous_sound,omitempty"` + PronunciationDict *PronunciationDict `json:"pronunciation_dict,omitempty"` + VoiceModify *VoiceModify `json:"voice_modify,omitempty"` +} + +type t2aStreamOptions struct { + ExcludeAggregatedAudio bool `json:"exclude_aggregated_audio,omitempty"` } type t2aResponse struct { @@ -191,6 +217,16 @@ type t2aResponse struct { BaseResp *baseResp `json:"base_resp,omitempty"` } +type t2aStreamData struct { + Audio string `json:"audio,omitempty"` + Status int `json:"status,omitempty"` +} + +type t2aStreamResponse struct { + Data *t2aStreamData `json:"data,omitempty"` + BaseResp *baseResp `json:"base_resp,omitempty"` +} + // ConvertTTS downloads the full audio before returning. func (c *Client) ConvertTTS(ctx context.Context, voiceID string, req TTSRequest) ([]byte, error) { u, err := c.httpURL("/v1/t2a_v2") @@ -199,14 +235,16 @@ func (c *Client) ConvertTTS(ctx context.Context, voiceID string, req TTSRequest) } payload := t2aRequest{ - Model: req.Model, - Text: req.Text, - Stream: false, - OutputFormat: "hex", - VoiceSetting: buildVoiceSetting(voiceID, req), - AudioSetting: buildAudioSetting(req), - LanguageBoost: req.LanguageBoost, - ContinuousSound: req.ContinuousSound, + Model: req.Model, + Text: req.Text, + Stream: false, + OutputFormat: "hex", + VoiceSetting: buildVoiceSetting(voiceID, req), + AudioSetting: buildAudioSetting(req), + LanguageBoost: req.LanguageBoost, + ContinuousSound: req.ContinuousSound, + PronunciationDict: req.PronunciationDict, + VoiceModify: req.VoiceModify, } bodyBytes, err := json.Marshal(payload) if err != nil { @@ -250,31 +288,6 @@ func (c *Client) ConvertTTS(ctx context.Context, voiceID string, req TTSRequest) return data, nil } -type wsTaskStart struct { - Event string `json:"event"` - Model string `json:"model"` - VoiceSetting voiceSetting `json:"voice_setting"` - AudioSetting audioSetting `json:"audio_setting,omitempty"` - LanguageBoost string `json:"language_boost,omitempty"` - ContinuousSound *bool `json:"continuous_sound,omitempty"` -} - -type wsTaskContinue struct { - Event string `json:"event"` - Text string `json:"text"` -} - -type wsMessage struct { - Event string `json:"event"` - Data *wsData `json:"data,omitempty"` - BaseResp *baseResp `json:"base_resp,omitempty"` - IsFinal bool `json:"is_final,omitempty"` -} - -type wsData struct { - Audio string `json:"audio,omitempty"` -} - type cancelReadCloser struct { *io.PipeReader cancel func() @@ -285,116 +298,181 @@ func (c *cancelReadCloser) Close() error { return c.PipeReader.Close() } -// StreamTTS streams MP3 audio from MiniMax via WebSocket. +// StreamTTS streams MP3 audio from MiniMax via HTTP (SSE). func (c *Client) StreamTTS(ctx context.Context, voiceID string, req TTSRequest) (io.ReadCloser, error) { - wsURL, err := c.wsURL("/ws/v1/t2a_v2") + u, err := c.httpURL("/v1/t2a_v2") if err != nil { return nil, err } + + payload := t2aRequest{ + Model: req.Model, + Text: req.Text, + Stream: true, + StreamOptions: &t2aStreamOptions{ExcludeAggregatedAudio: true}, + OutputFormat: "hex", + VoiceSetting: buildVoiceSetting(voiceID, req), + AudioSetting: buildAudioSetting(req), + LanguageBoost: req.LanguageBoost, + ContinuousSound: req.ContinuousSound, + PronunciationDict: req.PronunciationDict, + VoiceModify: req.VoiceModify, + } + bodyBytes, err := json.Marshal(payload) + if err != nil { + return nil, err + } + ctx, cancel := context.WithCancel(ctx) - pr, pw := io.Pipe() + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, u, bytes.NewReader(bodyBytes)) + if err != nil { + cancel() + return nil, err + } + httpReq.Header.Set("Authorization", "Bearer "+c.apiKey) + httpReq.Header.Set("Content-Type", "application/json") + httpReq.Header.Set("Accept", "text/event-stream") + resp, err := c.httpClient.Do(httpReq) + if err != nil { + cancel() + return nil, err + } + if resp.StatusCode >= 400 { + body, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + cancel() + return nil, fmt.Errorf("stream TTS failed: %s: %s", resp.Status, strings.TrimSpace(string(body))) + } + + pr, pw := io.Pipe() go func() { defer cancel() - defer func() { _ = pw.Close() }() - - header := http.Header{} - header.Set("Authorization", "Bearer "+c.apiKey) - conn, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: header}) - if err != nil { + defer func() { _ = resp.Body.Close() }() + if err := readMiniMaxStream(ctx, resp.Body, pw); err != nil && !errors.Is(err, context.Canceled) { _ = pw.CloseWithError(err) return } - defer func() { - _ = conn.Close(websocket.StatusNormalClosure, "done") - }() + _ = pw.Close() + }() - if err := readWSUntilEvent(ctx, conn, "connected_success"); err != nil { - _ = pw.CloseWithError(err) - return - } + return &cancelReadCloser{PipeReader: pr, cancel: cancel}, nil +} - start := wsTaskStart{ - Event: "task_start", - Model: req.Model, - VoiceSetting: buildVoiceSetting(voiceID, req), - AudioSetting: buildAudioSetting(req), - LanguageBoost: req.LanguageBoost, - ContinuousSound: req.ContinuousSound, - } - if err := wsjson.Write(ctx, conn, start); err != nil { - _ = pw.CloseWithError(err) - return - } - if err := readWSUntilEvent(ctx, conn, "task_started"); err != nil { - _ = pw.CloseWithError(err) - return +func readMiniMaxStream(ctx context.Context, body io.Reader, pw *io.PipeWriter) error { + reader := bufio.NewReader(body) + var dataLines []string + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: } - if err := wsjson.Write(ctx, conn, wsTaskContinue{Event: "task_continue", Text: req.Text}); err != nil { - _ = pw.CloseWithError(err) - return + line, err := reader.ReadString('\n') + if err != nil && err != io.EOF { + return err + } + if err == io.EOF && len(line) == 0 { + break } - for { - var msg wsMessage - if err := wsjson.Read(ctx, conn, &msg); err != nil { - _ = pw.CloseWithError(err) - return - } - if err := msg.BaseResp.err(); err != nil { - _ = pw.CloseWithError(err) - return - } - if msg.Event == "task_failed" { - _ = pw.CloseWithError(errors.New("minimax stream failed")) - return - } - if msg.Data != nil && msg.Data.Audio != "" { - chunk, err := hex.DecodeString(msg.Data.Audio) + line = strings.TrimRight(line, "\r\n") + if line == "" { + if len(dataLines) > 0 { + done, err := handleMiniMaxStreamPayload(strings.Join(dataLines, "\n"), pw) if err != nil { - _ = pw.CloseWithError(fmt.Errorf("decode audio chunk: %w", err)) - return + return err } - if len(chunk) > 0 { - if _, err := pw.Write(chunk); err != nil { - return - } + if done { + return nil } + dataLines = dataLines[:0] } - if msg.IsFinal || msg.Event == "task_finished" { - return + } else if strings.HasPrefix(line, "data:") { + dataLines = append(dataLines, strings.TrimSpace(strings.TrimPrefix(line, "data:"))) + } else if strings.HasPrefix(line, ":") || strings.HasPrefix(line, "event:") || strings.HasPrefix(line, "id:") || strings.HasPrefix(line, "retry:") { + // Ignore SSE metadata/comments. + } else { + trimmed := strings.TrimSpace(line) + if strings.HasPrefix(trimmed, "{") || strings.HasPrefix(trimmed, "[") { + done, err := handleMiniMaxStreamPayload(trimmed, pw) + if err != nil { + return err + } + if done { + return nil + } } } - }() - return &cancelReadCloser{PipeReader: pr, cancel: cancel}, nil -} + if err == io.EOF { + break + } + } -func readWSUntilEvent(ctx context.Context, conn *websocket.Conn, want string) error { - for { - var msg wsMessage - if err := wsjson.Read(ctx, conn, &msg); err != nil { + if len(dataLines) > 0 { + done, err := handleMiniMaxStreamPayload(strings.Join(dataLines, "\n"), pw) + if err != nil { return err } - if err := msg.BaseResp.err(); err != nil { - return err + if done { + return nil } - if msg.Event == "task_failed" { - return errors.New("minimax task failed") + } + return nil +} + +func handleMiniMaxStreamPayload(payload string, pw *io.PipeWriter) (bool, error) { + payload = strings.TrimSpace(payload) + if payload == "" { + return false, nil + } + + var items []t2aStreamResponse + if strings.HasPrefix(payload, "[") { + if err := json.Unmarshal([]byte(payload), &items); err != nil { + return false, err } - if msg.Event == want { - return nil + } else { + var item t2aStreamResponse + if err := json.Unmarshal([]byte(payload), &item); err != nil { + return false, err } + items = append(items, item) } + + for _, item := range items { + if err := item.BaseResp.err(); err != nil { + return false, err + } + if item.Data != nil && item.Data.Audio != "" { + chunk, err := hex.DecodeString(item.Data.Audio) + if err != nil { + return false, fmt.Errorf("decode audio chunk: %w", err) + } + if len(chunk) > 0 { + if _, err := pw.Write(chunk); err != nil { + return false, err + } + } + } + if item.Data != nil && item.Data.Status == 2 { + return true, nil + } + } + return false, nil } func buildVoiceSetting(voiceID string, req TTSRequest) voiceSetting { return voiceSetting{ - VoiceID: voiceID, - Speed: req.Speed, - Vol: req.Volume, - Pitch: req.Pitch, + VoiceID: voiceID, + Speed: req.Speed, + Vol: req.Volume, + Pitch: req.Pitch, + Emotion: req.Emotion, + TextNormalization: req.TextNormalization, + LatexRead: req.LatexRead, } } @@ -415,21 +493,3 @@ func (c *Client) httpURL(endpoint string) (string, error) { u.Path = path.Join(u.Path, endpoint) return u.String(), nil } - -func (c *Client) wsURL(endpoint string) (string, error) { - u, err := url.Parse(c.baseURL) - if err != nil { - return "", err - } - switch u.Scheme { - case "http": - u.Scheme = "ws" - case "https": - u.Scheme = "wss" - case "ws", "wss": - default: - u.Scheme = "wss" - } - u.Path = path.Join(u.Path, endpoint) - return u.String(), nil -}