From 7dbfb8348527983c4f4210bfe6194dfda791354e Mon Sep 17 00:00:00 2001
From: tars90percent <tars@minimaxi.com>
Date: Wed, 28 Jan 2026 19:41:47 +0800
Subject: [PATCH 1/3] Add MiniMax TTS support

---
 cmd/api_key.go             |  49 +++++
 cmd/speak.go               | 346 ++++++++++++++++++++++++++---
 go.mod                     |   1 +
 go.sum                     |   2 +
 internal/minimax/client.go | 435 +++++++++++++++++++++++++++++++++++++
 internal/minimax/doc.go    |   2 +
 6 files changed, 806 insertions(+), 29 deletions(-)
 create mode 100644 internal/minimax/client.go
 create mode 100644 internal/minimax/doc.go

diff --git a/cmd/api_key.go b/cmd/api_key.go
index 8b22232..c8125f8 100644
--- a/cmd/api_key.go
+++ b/cmd/api_key.go
@@ -26,6 +26,33 @@ func ensureAPIKey() error {
 	return nil
 }
 
+func ensureAPIKeyForProvider(provider string) error { // picks the API-key lookup for provider
+	if provider == providerMiniMax { // shared constant, the same value detectProvider returns
+		return ensureMiniMaxAPIKey()
+	}
+	return ensureAPIKey() // every other provider goes through the default lookup
+}
+
+// ensureMiniMaxAPIKey fills an unset cfg.APIKey from the key file, MINIMAX_API_KEY, then SAG_API_KEY.
+func ensureMiniMaxAPIKey() error {
+	if cfg.APIKey != "" {
+		return nil
+	}
+	key, err := resolveMiniMaxAPIKeyFromFile()
+	if err != nil {
+		return err
+	}
+	for _, name := range []string{"MINIMAX_API_KEY", "SAG_API_KEY"} {
+		if key == "" {
+			key = os.Getenv(name)
+		}
+	}
+	if cfg.APIKey = key; key == "" {
+		return fmt.Errorf("missing MiniMax API key (set --api-key, --api-key-file, or MINIMAX_API_KEY)")
+	}
+	return nil
+}
+
 func resolveAPIKeyFromFile() (string, error) {
 	path := cfg.APIKeyFile
 	if path == "" {
@@ -47,3 +74,25 @@ func resolveAPIKeyFromFile() (string, error) {
 	}
 	return key, nil
 }
+
+// resolveMiniMaxAPIKeyFromFile reads the key from cfg.APIKeyFile, else from the file named by
+// MINIMAX_API_KEY_FILE or SAG_API_KEY_FILE; it returns "" and nil when no file is configured.
+func resolveMiniMaxAPIKeyFromFile() (string, error) {
+	keyPath := cfg.APIKeyFile
+	for _, name := range []string{"MINIMAX_API_KEY_FILE", "SAG_API_KEY_FILE"} {
+		if keyPath == "" {
+			keyPath = os.Getenv(name)
+		}
+	}
+	if keyPath == "" {
+		return "", nil
+	}
+	raw, err := os.ReadFile(keyPath)
+	if err != nil {
+		return "", fmt.Errorf("read api key file: %w", err)
+	}
+	if key := strings.TrimSpace(string(raw)); key != "" {
+		return key, nil
+	}
+	return "", fmt.Errorf("api key file %q is empty", keyPath)
+}
diff --git a/cmd/speak.go b/cmd/speak.go
index e043407..c7644fe 100644
--- a/cmd/speak.go
+++ b/cmd/speak.go
@@ -13,6 +13,7 @@ import (
 
 	"github.com/steipete/sag/internal/audio"
 	"github.com/steipete/sag/internal/elevenlabs"
+	"github.com/steipete/sag/internal/minimax"
 
 	"github.com/spf13/cobra"
 )
@@ -44,6 +45,11 @@ const defaultWPM = 175 // matches macOS `say` default rate
 
 var playToSpeakers = audio.StreamToSpeakers
 
+const (
+	providerElevenLabs = "elevenlabs"
+	providerMiniMax    = "minimax"
+)
+
 func init() {
 	opts := speakOptions{
 		modelID:   "eleven_v3",
@@ -55,39 +61,63 @@ func init() {
 
 	cmd := &cobra.Command{
 		Use:   "speak [text]",
-		Short: "Speak the provided text using ElevenLabs TTS (default: stream to speakers)",
+		Short: "Speak the provided text using TTS (default: stream to speakers)",
 		Long:  "If no text argument is provided, the command reads from stdin.\n\nTip: run `sag prompting` for model-specific prompting tips and recommended flag combinations.",
 		Args:  cobra.ArbitraryArgs,
 		PreRunE: func(_ *cobra.Command, _ []string) error {
-			return ensureAPIKey()
+			return ensureAPIKeyForProvider(detectProvider(opts.modelID))
 		},
 		RunE: func(cmd *cobra.Command, args []string) error {
 			if err := applyRateAndSpeed(&opts); err != nil {
 				return err
 			}
 
+			provider := detectProvider(opts.modelID)
 			forceVoiceID := cmd.Flags().Changed("voice-id")
 			voiceInput := opts.voiceID
 			if voiceInput == "" {
-				if env := os.Getenv("ELEVENLABS_VOICE_ID"); env != "" {
-					voiceInput = env
-					forceVoiceID = true
-				} else if env := os.Getenv("SAG_VOICE_ID"); env != "" {
-					voiceInput = env
-					forceVoiceID = true
+				if provider == providerMiniMax {
+					if env := os.Getenv("MINIMAX_VOICE_ID"); env != "" {
+						voiceInput = env
+						forceVoiceID = true
+					} else if env := os.Getenv("SAG_VOICE_ID"); env != "" {
+						voiceInput = env
+						forceVoiceID = true
+					}
+				} else {
+					if env := os.Getenv("ELEVENLABS_VOICE_ID"); env != "" {
+						voiceInput = env
+						forceVoiceID = true
+					} else if env := os.Getenv("SAG_VOICE_ID"); env != "" {
+						voiceInput = env
+						forceVoiceID = true
+					}
 				}
 			}
-			client := elevenlabs.NewClient(cfg.APIKey, cfg.BaseURL)
+			elevenClient := elevenlabs.NewClient(cfg.APIKey, cfg.BaseURL)
+			miniClient := minimax.NewClient(cfg.APIKey, minimaxBaseURL())
 
-			voiceID, err := resolveVoice(cmd.Context(), client, voiceInput, forceVoiceID)
-			if err != nil {
-				return err
-			}
-			if voiceID == "" {
-				// Likely printed voices for '?' request.
-				return nil
+			switch provider {
+			case providerMiniMax:
+				voiceID, err := resolveMiniMaxVoice(cmd.Context(), miniClient, voiceInput, forceVoiceID)
+				if err != nil {
+					return err
+				}
+				if voiceID == "" {
+					return nil
+				}
+				opts.voiceID = voiceID
+			default:
+				voiceID, err := resolveVoice(cmd.Context(), elevenClient, voiceInput, forceVoiceID)
+				if err != nil {
+					return err
+				}
+				if voiceID == "" {
+					// Likely printed voices for '?' request.
+					return nil
+				}
+				opts.voiceID = voiceID
 			}
-			opts.voiceID = voiceID
 
 			text, err := resolveText(args, opts.inputFile)
 			if err != nil {
@@ -96,7 +126,11 @@ func init() {
 
 			// If user provided output path with a known extension, infer a compatible format.
 			if opts.outputPath != "" {
-				if inferred := inferFormatFromExt(opts.outputPath); inferred != "" {
+				if provider == providerMiniMax {
+					if inferred := inferMiniMaxFormatFromExt(opts.outputPath); inferred != "" {
+						opts.outputFmt = inferred
+					}
+				} else if inferred := inferFormatFromExt(opts.outputPath); inferred != "" {
 					opts.outputFmt = inferred
 				}
 				// Disable playback when -o is set, unless --play was explicitly provided
@@ -108,25 +142,45 @@ func init() {
 			ctx, cancel := context.WithTimeout(cmd.Context(), 90*time.Second)
 			defer cancel()
 
-			payload, err := buildTTSRequest(cmd, opts, text)
-			if err != nil {
-				return err
-			}
-
 			start := time.Now()
 			var bytes int64
-			if opts.stream {
-				n, err := streamAndPlay(ctx, client, opts, payload)
-				bytes = n
+			switch provider {
+			case providerMiniMax:
+				payload, err := buildMiniMaxTTSRequest(opts, text)
 				if err != nil {
 					return err
 				}
-			} else {
-				n, err := convertAndPlay(ctx, client, opts, payload)
-				bytes = n
+				if opts.stream {
+					n, err := streamAndPlayMiniMax(ctx, miniClient, opts, payload)
+					bytes = n
+					if err != nil {
+						return err
+					}
+				} else {
+					n, err := convertAndPlayMiniMax(ctx, miniClient, opts, payload)
+					bytes = n
+					if err != nil {
+						return err
+					}
+				}
+			default:
+				payload, err := buildTTSRequest(cmd, opts, text)
 				if err != nil {
 					return err
 				}
+				if opts.stream {
+					n, err := streamAndPlay(ctx, elevenClient, opts, payload)
+					bytes = n
+					if err != nil {
+						return err
+					}
+				} else {
+					n, err := convertAndPlay(ctx, elevenClient, opts, payload)
+					bytes = n
+					if err != nil {
+						return err
+					}
+				}
 			}
 			if opts.metrics {
 				fmt.Fprintf(os.Stderr, "metrics: chars=%d bytes=%d model=%s voice=%s stream=%t latencyTier=%d dur=%s\n",
@@ -427,6 +481,93 @@ func convertAndPlay(ctx context.Context, client *elevenlabs.Client, opts speakOp
 	return n, nil
 }
 
+func streamAndPlayMiniMax(ctx context.Context, client *minimax.Client, opts speakOptions, payload minimax.TTSRequest) (int64, error) {
+	resp, err := client.StreamTTS(ctx, opts.voiceID, payload)
+	if err != nil {
+		return 0, err
+	}
+	defer func() {
+		_ = resp.Close()
+	}()
+
+	writers := make([]io.Writer, 0, 2)
+	var file io.WriteCloser
+	if opts.outputPath != "" {
+		if err := os.MkdirAll(filepath.Dir(opts.outputPath), 0o755); err != nil {
+			return 0, err
+		}
+		file, err = os.Create(opts.outputPath)
+		if err != nil {
+			return 0, err
+		}
+		defer func() {
+			_ = file.Close()
+		}()
+		writers = append(writers, file)
+	}
+
+	if opts.play {
+		pr, pw := io.Pipe()
+		writers = append(writers, pw)
+		mw := io.MultiWriter(writers...)
+
+		copyErr := make(chan error, 1)
+		copyN := make(chan int64, 1)
+		go func() {
+			n, err := io.Copy(mw, resp)
+			copyN <- n
+			copyErr <- err
+			_ = pw.Close()
+		}()
+
+		playErr := playToSpeakers(ctx, pr)
+		copyNVal := <-copyN
+		copyErrVal := <-copyErr
+		if copyErrVal != nil {
+			return copyNVal, copyErrVal
+		}
+		return copyNVal, playErr
+	}
+
+	if len(writers) == 0 {
+		return 0, errors.New("nothing to do: enable --play or provide --output")
+	}
+
+	mw := io.MultiWriter(writers...)
+	n, err := io.Copy(mw, resp)
+	return n, err
+}
+
+func convertAndPlayMiniMax(ctx context.Context, client *minimax.Client, opts speakOptions, payload minimax.TTSRequest) (int64, error) {
+	data, err := client.ConvertTTS(ctx, opts.voiceID, payload)
+	if err != nil {
+		return 0, err
+	}
+	n := int64(len(data))
+
+	if opts.outputPath != "" {
+		if err := os.MkdirAll(filepath.Dir(opts.outputPath), 0o755); err != nil {
+			return n, err
+		}
+		if err := os.WriteFile(opts.outputPath, data, 0o644); err != nil {
+			return n, err
+		}
+	}
+
+	if opts.play {
+		pr, pw := io.Pipe()
+		go func() {
+			_, _ = pw.Write(data)
+			_ = pw.Close()
+		}()
+		return n, playToSpeakers(ctx, pr)
+	}
+	if opts.outputPath == "" {
+		return n, errors.New("nothing to do: enable --play or provide --output")
+	}
+	return n, nil
+}
+
 func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput string, forceID bool) (string, error) {
 	voiceInput = strings.TrimSpace(voiceInput)
 	if voiceInput == "" {
@@ -515,6 +656,68 @@ func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput str
 	return "", fmt.Errorf("voice %q not found; try 'sag voices' or -v '?'", voiceInput)
 }
 
+func resolveMiniMaxVoice(ctx context.Context, client *minimax.Client, voiceInput string, forceID bool) (string, error) {
+	voiceInput = strings.TrimSpace(voiceInput)
+	if voiceInput == "" {
+		ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+		defer cancel()
+		voices, err := client.ListVoices(ctx)
+		if err != nil {
+			return "", fmt.Errorf("voice not specified and failed to fetch voices: %w", err)
+		}
+		if len(voices) == 0 {
+			return "", errors.New("no voices available; specify --voice or set MINIMAX_VOICE_ID")
+		}
+		fmt.Fprintf(os.Stderr, "defaulting to voice %s (%s)\n", voices[0].Name, voices[0].VoiceID)
+		return voices[0].VoiceID, nil
+	}
+	if voiceInput == "?" {
+		ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+		defer cancel()
+		voices, err := client.ListVoices(ctx)
+		if err != nil {
+			return "", err
+		}
+		w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
+		if _, err := fmt.Fprintf(w, "VOICE ID\tNAME\tCATEGORY\n"); err != nil {
+			return "", err
+		}
+		for _, v := range voices {
+			if _, err := fmt.Fprintf(w, "%s\t%s\t%s\n", v.VoiceID, v.Name, v.Category); err != nil {
+				return "", err
+			}
+		}
+		if err := w.Flush(); err != nil {
+			return "", err
+		}
+		return "", nil
+	}
+	if forceID {
+		return voiceInput, nil
+	}
+
+	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+	defer cancel()
+	voices, err := client.ListVoices(ctx)
+	if err != nil {
+		return voiceInput, nil
+	}
+	voiceInputLower := strings.ToLower(voiceInput)
+	for _, v := range voices {
+		if strings.ToLower(v.VoiceID) == voiceInputLower || strings.ToLower(v.Name) == voiceInputLower {
+			fmt.Fprintf(os.Stderr, "using voice %s (%s)\n", v.Name, v.VoiceID)
+			return v.VoiceID, nil
+		}
+	}
+	for _, v := range voices {
+		if strings.Contains(strings.ToLower(v.Name), voiceInputLower) {
+			fmt.Fprintf(os.Stderr, "using voice %s (%s)\n", v.Name, v.VoiceID)
+			return v.VoiceID, nil
+		}
+	}
+	return voiceInput, nil
+}
+
 func looksLikeVoiceID(voiceInput string) bool {
 	return len(voiceInput) >= 15 && !strings.ContainsRune(voiceInput, ' ')
 }
@@ -539,3 +742,88 @@ func inferFormatFromExt(path string) string {
 		return ""
 	}
 }
+
+func inferMiniMaxFormatFromExt(path string) string {
+	ext := strings.ToLower(filepath.Ext(path))
+	switch ext {
+	case ".mp3":
+		return "mp3"
+	case ".wav", ".wave":
+		return "wav"
+	case ".flac":
+		return "flac"
+	default:
+		return ""
+	}
+}
+
+func detectProvider(modelID string) string {
+	modelID = strings.ToLower(strings.TrimSpace(modelID))
+	if strings.HasPrefix(modelID, "speech-") {
+		return providerMiniMax
+	}
+	return providerElevenLabs
+}
+
+func minimaxBaseURL() string {
+	host := strings.TrimSpace(os.Getenv("MINIMAX_API_HOST"))
+	if host == "" {
+		host = strings.TrimSpace(os.Getenv("MINIMAX_BASE_URL"))
+	}
+	if host == "" {
+		return ""
+	}
+	if strings.HasPrefix(host, "http://") || strings.HasPrefix(host, "https://") {
+		return host
+	}
+	return "https://" + host
+}
+
+func buildMiniMaxTTSRequest(opts speakOptions, text string) (minimax.TTSRequest, error) {
+	format, err := normalizeMiniMaxFormat(opts.outputFmt)
+	if err != nil {
+		return minimax.TTSRequest{}, err
+	}
+	if opts.stream && format != "mp3" {
+		return minimax.TTSRequest{}, errors.New("MiniMax streaming supports mp3 only; use --no-stream for wav/flac")
+	}
+	if opts.play && format != "mp3" {
+		return minimax.TTSRequest{}, errors.New("MiniMax playback supports mp3 only; use --output without --play for wav/flac")
+	}
+
+	speed := opts.speed
+	return minimax.TTSRequest{
+		Model:       opts.modelID,
+		Text:        text,
+		Speed:       speed,
+		Volume:      1.0,
+		Pitch:       0,
+		AudioFormat: format,
+		SampleRate:  32000,
+		Bitrate:     128000,
+		Channel:     1,
+	}, nil
+}
+
+func normalizeMiniMaxFormat(format string) (string, error) {
+	format = strings.ToLower(strings.TrimSpace(format))
+	switch format {
+	case "", "mp3", "wav", "flac":
+		if format == "" {
+			return "mp3", nil
+		}
+		return format, nil
+	case "mp3_44100_128":
+		return "mp3", nil
+	case "pcm_44100":
+		return "wav", nil
+	default:
+		if strings.HasPrefix(format, "mp3_") {
+			return "mp3", nil
+		}
+		if strings.HasPrefix(format, "pcm_") {
+			return "wav", nil
+		}
+		return "", fmt.Errorf("format %q not supported for MiniMax (use mp3, wav, flac)", format)
+	}
+}
diff --git a/go.mod b/go.mod
index adfa966..2cc43cd 100644
--- a/go.mod
+++ b/go.mod
@@ -3,6 +3,7 @@ module github.com/steipete/sag
 go 1.24.0
 
 require (
+	github.com/coder/websocket v1.8.14
 	github.com/ebitengine/oto/v3 v3.4.0
 	github.com/hajimehoshi/go-mp3 v0.3.4
 	github.com/spf13/cobra v1.10.2
diff --git a/go.sum b/go.sum
index 280fd5c..e58cdd8 100644
--- a/go.sum
+++ b/go.sum
@@ -1,3 +1,5 @@
+github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g=
+github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg=
 github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
 github.com/ebitengine/oto/v3 v3.4.0 h1:br0PgASsEWaoWn38b2Goe7m1GKFYfNgnsjSd5Gg+/bQ=
 github.com/ebitengine/oto/v3 v3.4.0/go.mod h1:IOleLVD0m+CMak3mRVwsYY8vTctQgOM0iiL6S7Ar7eI=
diff --git a/internal/minimax/client.go b/internal/minimax/client.go
new file mode 100644
index 0000000..bb98781
--- /dev/null
+++ b/internal/minimax/client.go
@@ -0,0 +1,435 @@
+package minimax
+
+import (
+	"bytes"
+	"context"
+	"encoding/hex"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"path"
+	"strings"
+	"time"
+
+	"github.com/coder/websocket"
+	"github.com/coder/websocket/wsjson"
+)
+
+const defaultBaseURL = "https://api.minimax.io"
+
+// Client talks to the MiniMax TTS API.
+type Client struct {
+	baseURL    string
+	apiKey     string
+	httpClient *http.Client
+}
+
+// NewClient returns a client configured with the given API key and base URL.
+func NewClient(apiKey, baseURL string) *Client {
+	if baseURL == "" {
+		baseURL = defaultBaseURL
+	}
+	return &Client{
+		baseURL: baseURL,
+		apiKey:  apiKey,
+		httpClient: &http.Client{
+			Timeout: 60 * time.Second,
+		},
+	}
+}
+
+// Voice represents a MiniMax voice entry.
+type Voice struct {
+	VoiceID     string
+	Name        string
+	Category    string
+	Description string
+}
+
+type voiceEntry struct {
+	VoiceID     string   `json:"voice_id"`
+	VoiceName   string   `json:"voice_name"`
+	Description []string `json:"description,omitempty"`
+}
+
+type listVoicesRequest struct {
+	VoiceType string `json:"voice_type"`
+}
+
+type listVoicesResponse struct {
+	SystemVoice     []voiceEntry `json:"system_voice"`
+	VoiceCloning    []voiceEntry `json:"voice_cloning"`
+	VoiceGeneration []voiceEntry `json:"voice_generation"`
+	BaseResp        *baseResp    `json:"base_resp,omitempty"`
+}
+
+// ListVoices fetches available voices.
+func (c *Client) ListVoices(ctx context.Context) ([]Voice, error) {
+	u, err := c.httpURL("/v1/get_voice")
+	if err != nil {
+		return nil, err
+	}
+
+	reqBody, err := json.Marshal(listVoicesRequest{VoiceType: "all"})
+	if err != nil {
+		return nil, err
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, u, bytes.NewReader(reqBody))
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("Authorization", "Bearer "+c.apiKey)
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 400 {
+		body, _ := io.ReadAll(resp.Body)
+		return nil, fmt.Errorf("list voices failed: %s: %s", resp.Status, strings.TrimSpace(string(body)))
+	}
+
+	var payload listVoicesResponse
+	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
+		return nil, err
+	}
+	if err := payload.BaseResp.err(); err != nil {
+		return nil, err
+	}
+
+	voices := make([]Voice, 0, len(payload.SystemVoice)+len(payload.VoiceCloning)+len(payload.VoiceGeneration))
+	appendVoices := func(category string, entries []voiceEntry) {
+		for _, v := range entries {
+			name := strings.TrimSpace(v.VoiceName)
+			if name == "" {
+				name = v.VoiceID
+			}
+			voices = append(voices, Voice{
+				VoiceID:     v.VoiceID,
+				Name:        name,
+				Category:    category,
+				Description: strings.Join(v.Description, " "),
+			})
+		}
+	}
+	appendVoices("system", payload.SystemVoice)
+	appendVoices("voice_cloning", payload.VoiceCloning)
+	appendVoices("voice_generation", payload.VoiceGeneration)
+	return voices, nil
+}
+
+// TTSRequest configures a text-to-speech request payload.
+type TTSRequest struct {
+	Model           string
+	Text            string
+	Speed           float64
+	Volume          float64
+	Pitch           int
+	AudioFormat     string
+	SampleRate      int
+	Bitrate         int
+	Channel         int
+	LanguageBoost   string
+	ContinuousSound *bool
+}
+
+type baseResp struct {
+	StatusCode int    `json:"status_code"`
+	StatusMsg  string `json:"status_msg"`
+}
+
+func (b *baseResp) err() error {
+	if b == nil {
+		return nil
+	}
+	if b.StatusCode == 0 {
+		return nil
+	}
+	msg := strings.TrimSpace(b.StatusMsg)
+	if msg == "" {
+		msg = "unknown error"
+	}
+	return fmt.Errorf("minimax error: %s (code=%d)", msg, b.StatusCode)
+}
+
+type voiceSetting struct {
+	VoiceID string  `json:"voice_id"`
+	Speed   float64 `json:"speed"`
+	Vol     float64 `json:"vol"`
+	Pitch   int     `json:"pitch"`
+}
+
+type audioSetting struct {
+	Format     string `json:"format,omitempty"`
+	SampleRate int    `json:"sample_rate,omitempty"`
+	Bitrate    int    `json:"bitrate,omitempty"`
+	Channel    int    `json:"channel,omitempty"`
+}
+
+type t2aRequest struct {
+	Model           string       `json:"model"`
+	Text            string       `json:"text"`
+	Stream          bool         `json:"stream"`
+	OutputFormat    string       `json:"output_format,omitempty"`
+	VoiceSetting    voiceSetting `json:"voice_setting"`
+	AudioSetting    audioSetting `json:"audio_setting,omitempty"`
+	LanguageBoost   string       `json:"language_boost,omitempty"`
+	ContinuousSound *bool        `json:"continuous_sound,omitempty"`
+}
+
+type t2aResponse struct {
+	Data struct {
+		Audio string `json:"audio"`
+	} `json:"data"`
+	BaseResp *baseResp `json:"base_resp,omitempty"`
+}
+
+// ConvertTTS downloads the full audio before returning.
+func (c *Client) ConvertTTS(ctx context.Context, voiceID string, req TTSRequest) ([]byte, error) {
+	u, err := c.httpURL("/v1/t2a_v2")
+	if err != nil {
+		return nil, err
+	}
+
+	payload := t2aRequest{
+		Model:           req.Model,
+		Text:            req.Text,
+		Stream:          false,
+		OutputFormat:    "hex",
+		VoiceSetting:    buildVoiceSetting(voiceID, req),
+		AudioSetting:    buildAudioSetting(req),
+		LanguageBoost:   req.LanguageBoost,
+		ContinuousSound: req.ContinuousSound,
+	}
+	bodyBytes, err := json.Marshal(payload)
+	if err != nil {
+		return nil, err
+	}
+
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, u, bytes.NewReader(bodyBytes))
+	if err != nil {
+		return nil, err
+	}
+	httpReq.Header.Set("Authorization", "Bearer "+c.apiKey)
+	httpReq.Header.Set("Content-Type", "application/json")
+	httpReq.Header.Set("Accept", "application/json")
+
+	resp, err := c.httpClient.Do(httpReq)
+	if err != nil {
+		return nil, err
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 400 {
+		body, _ := io.ReadAll(resp.Body)
+		return nil, fmt.Errorf("convert TTS failed: %s: %s", resp.Status, strings.TrimSpace(string(body)))
+	}
+
+	var response t2aResponse
+	if err := json.NewDecoder(resp.Body).Decode(&response); err != nil {
+		return nil, err
+	}
+	if err := response.BaseResp.err(); err != nil {
+		return nil, err
+	}
+	if response.Data.Audio == "" {
+		return nil, errors.New("minimax response missing audio")
+	}
+
+	data, err := hex.DecodeString(response.Data.Audio)
+	if err != nil {
+		return nil, fmt.Errorf("decode audio hex: %w", err)
+	}
+	return data, nil
+}
+
+type wsTaskStart struct {
+	Event           string       `json:"event"`
+	Model           string       `json:"model"`
+	VoiceSetting    voiceSetting `json:"voice_setting"`
+	AudioSetting    audioSetting `json:"audio_setting,omitempty"`
+	LanguageBoost   string       `json:"language_boost,omitempty"`
+	ContinuousSound *bool        `json:"continuous_sound,omitempty"`
+}
+
+type wsTaskContinue struct {
+	Event string `json:"event"`
+	Text  string `json:"text"`
+}
+
+type wsMessage struct {
+	Event    string    `json:"event"`
+	Data     *wsData   `json:"data,omitempty"`
+	BaseResp *baseResp `json:"base_resp,omitempty"`
+	IsFinal  bool      `json:"is_final,omitempty"`
+}
+
+type wsData struct {
+	Audio string `json:"audio,omitempty"`
+}
+
+type cancelReadCloser struct {
+	*io.PipeReader
+	cancel func()
+}
+
+func (c *cancelReadCloser) Close() error {
+	c.cancel()
+	return c.PipeReader.Close()
+}
+
+// StreamTTS streams MP3 audio from MiniMax via WebSocket.
+func (c *Client) StreamTTS(ctx context.Context, voiceID string, req TTSRequest) (io.ReadCloser, error) {
+	wsURL, err := c.wsURL("/ws/v1/t2a_v2")
+	if err != nil {
+		return nil, err
+	}
+	ctx, cancel := context.WithCancel(ctx)
+	pr, pw := io.Pipe()
+
+	go func() {
+		defer cancel()
+		defer func() { _ = pw.Close() }()
+
+		header := http.Header{}
+		header.Set("Authorization", "Bearer "+c.apiKey)
+		conn, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: header})
+		if err != nil {
+			_ = pw.CloseWithError(err)
+			return
+		}
+		defer func() {
+			_ = conn.Close(websocket.StatusNormalClosure, "done")
+		}()
+
+		if err := readWSUntilEvent(ctx, conn, "connected_success"); err != nil {
+			_ = pw.CloseWithError(err)
+			return
+		}
+
+		start := wsTaskStart{
+			Event:           "task_start",
+			Model:           req.Model,
+			VoiceSetting:    buildVoiceSetting(voiceID, req),
+			AudioSetting:    buildAudioSetting(req),
+			LanguageBoost:   req.LanguageBoost,
+			ContinuousSound: req.ContinuousSound,
+		}
+		if err := wsjson.Write(ctx, conn, start); err != nil {
+			_ = pw.CloseWithError(err)
+			return
+		}
+		if err := readWSUntilEvent(ctx, conn, "task_started"); err != nil {
+			_ = pw.CloseWithError(err)
+			return
+		}
+
+		if err := wsjson.Write(ctx, conn, wsTaskContinue{Event: "task_continue", Text: req.Text}); err != nil {
+			_ = pw.CloseWithError(err)
+			return
+		}
+
+		for {
+			var msg wsMessage
+			if err := wsjson.Read(ctx, conn, &msg); err != nil {
+				_ = pw.CloseWithError(err)
+				return
+			}
+			if err := msg.BaseResp.err(); err != nil {
+				_ = pw.CloseWithError(err)
+				return
+			}
+			if msg.Event == "task_failed" {
+				_ = pw.CloseWithError(errors.New("minimax stream failed"))
+				return
+			}
+			if msg.Data != nil && msg.Data.Audio != "" {
+				chunk, err := hex.DecodeString(msg.Data.Audio)
+				if err != nil {
+					_ = pw.CloseWithError(fmt.Errorf("decode audio chunk: %w", err))
+					return
+				}
+				if len(chunk) > 0 {
+					if _, err := pw.Write(chunk); err != nil {
+						return
+					}
+				}
+			}
+			if msg.IsFinal || msg.Event == "task_finished" {
+				return
+			}
+		}
+	}()
+
+	return &cancelReadCloser{PipeReader: pr, cancel: cancel}, nil
+}
+
+func readWSUntilEvent(ctx context.Context, conn *websocket.Conn, want string) error {
+	for {
+		var msg wsMessage
+		if err := wsjson.Read(ctx, conn, &msg); err != nil {
+			return err
+		}
+		if err := msg.BaseResp.err(); err != nil {
+			return err
+		}
+		if msg.Event == "task_failed" {
+			return errors.New("minimax task failed")
+		}
+		if msg.Event == want {
+			return nil
+		}
+	}
+}
+
+func buildVoiceSetting(voiceID string, req TTSRequest) voiceSetting {
+	return voiceSetting{
+		VoiceID: voiceID,
+		Speed:   req.Speed,
+		Vol:     req.Volume,
+		Pitch:   req.Pitch,
+	}
+}
+
+func buildAudioSetting(req TTSRequest) audioSetting {
+	return audioSetting{
+		Format:     req.AudioFormat,
+		SampleRate: req.SampleRate,
+		Bitrate:    req.Bitrate,
+		Channel:    req.Channel,
+	}
+}
+
+func (c *Client) httpURL(endpoint string) (string, error) {
+	u, err := url.Parse(c.baseURL)
+	if err != nil {
+		return "", err
+	}
+	u.Path = path.Join(u.Path, endpoint)
+	return u.String(), nil
+}
+
+func (c *Client) wsURL(endpoint string) (string, error) {
+	u, err := url.Parse(c.baseURL)
+	if err != nil {
+		return "", err
+	}
+	switch u.Scheme {
+	case "http":
+		u.Scheme = "ws"
+	case "https":
+		u.Scheme = "wss"
+	case "ws", "wss":
+	default:
+		u.Scheme = "wss"
+	}
+	u.Path = path.Join(u.Path, endpoint)
+	return u.String(), nil
+}
diff --git a/internal/minimax/doc.go b/internal/minimax/doc.go
new file mode 100644
index 0000000..599152f
--- /dev/null
+++ b/internal/minimax/doc.go
@@ -0,0 +1,2 @@
+// Package minimax provides a small client for the MiniMax TTS API.
+package minimax

From 214ca4d7d7501f99e60c935c8eff76cdc48676b2 Mon Sep 17 00:00:00 2001
From: Vincent Wu <vincentwu@Electric-Sheep.local>
Date: Thu, 29 Jan 2026 03:13:49 +0800
Subject: [PATCH 2/3] docs: mention MiniMax TTS option

---
 README.md | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 78974c3..e456f49 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-# sag 🗣️ — “Mac-style speech with ElevenLabs”
+# sag 🗣️ — “Mac-style speech with ElevenLabs and MiniMax”
 
-One-liner TTS that works like `say`: stream to speakers by default, list voices, or save audio files.
+One-liner TTS that works like `say`: stream to speakers by default, list voices, or save audio files. Defaults to ElevenLabs, with MiniMax available via `speech-*` model IDs.
 
 ## Install
 Homebrew (macOS):
@@ -15,18 +15,20 @@ go install ./cmd/sag
 Requires Go 1.24+.
 
 ## Configuration
-- `ELEVENLABS_API_KEY` (required)
-- `--api-key-file` or `ELEVENLABS_API_KEY_FILE`/`SAG_API_KEY_FILE` to load the key from a file
-- Optional defaults: `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID`
+- ElevenLabs: `ELEVENLABS_API_KEY` (or `SAG_API_KEY`)
+- MiniMax: `MINIMAX_API_KEY` (or `SAG_API_KEY`)
+- `--api-key-file` or `ELEVENLABS_API_KEY_FILE`/`MINIMAX_API_KEY_FILE`/`SAG_API_KEY_FILE` to load the key from a file
+- Optional defaults: `ELEVENLABS_VOICE_ID`, `MINIMAX_VOICE_ID`, or `SAG_VOICE_ID`
+- Optional: `MINIMAX_API_HOST` or `MINIMAX_BASE_URL` to override the MiniMax base URL
 
 ## Usage
 
 Features:
 - macOS `say`-style default: `sag "Hello"` routes to `speak` automatically.
 - Streaming playback to speakers with optional file output.
-- Voice discovery via `sag voices` and `-v ?`.
+- Voice discovery via `sag voices` (ElevenLabs) and `-v ?` (provider-specific).
 - Speed/rate controls, latency tiers, and format inference from output extension.
-- Model selection via `--model-id` (defaults to `eleven_v3`; use `eleven_multilingual_v2` for a stable baseline).
+- Model selection via `--model-id` (defaults to `eleven_v3`; use `eleven_multilingual_v2` for a stable baseline, `speech-*` for MiniMax).
 
 Speak (streams audio):
 ```bash
@@ -52,6 +54,8 @@ sag speak -v Roger --stream --latency-tier 3 "Faster start"
 sag speak -v Roger --speed 1.2 "Talk a bit faster"
 sag speak -v Roger --model-id eleven_multilingual_v2 "Use stable v2 baseline"
 sag speak -v Roger --output out.wav --format pcm_44100 "Wave output"
+sag speak --model-id speech-01 -v '?' "List MiniMax voices"
+sag speak --model-id speech-01 --output out.flac --stream=false "MiniMax file output"
 ```
 
 Key flags (subset):
@@ -94,7 +98,11 @@ Highlights:
 
 ## Models / engines
 
-`sag` supports any ElevenLabs `model_id` via `--model-id` (we pass it through). Practical defaults + common IDs:
+Provider selection:
+- ElevenLabs (default): any ElevenLabs `model_id` via `--model-id` (we pass it through).
+- MiniMax: use a `speech-*` model ID to route requests to MiniMax. Streaming and speaker playback are MP3-only; for WAV/FLAC, write to a file with `--output` and `--stream=false` (without `--play`).
+
+Practical defaults + common ElevenLabs IDs:
 
 | Engine | `--model-id` | Prompting style | Best for |
 |---|---|---|---|
@@ -123,6 +131,6 @@ Notes:
   - Build: `go build ./cmd/sag`
 
 ## Limitations
-- ElevenLabs account and API key required.
+- ElevenLabs or MiniMax account and API key required (per provider).
 - Voice defaults to first available if not provided.
 - Non-mac platforms: playback still works via `go-mp3` + `oto`, but device selection flags are no-ops.

From e59f3679a97ac23a3e9b56a9cee55db9293af5fc Mon Sep 17 00:00:00 2001
From: tars90percent <tars@minimaxi.com>
Date: Thu, 29 Jan 2026 19:52:35 +0800
Subject: [PATCH 3/3] Use HTTP SSE for MiniMax streaming

---
 cmd/speak.go               | 197 ++++++++++++++++++--
 go.mod                     |   1 -
 go.sum                     |   2 -
 internal/minimax/client.go | 364 +++++++++++++++++++++----------------
 4 files changed, 391 insertions(+), 173 deletions(-)

diff --git a/cmd/speak.go b/cmd/speak.go
index c7644fe..002265e 100644
--- a/cmd/speak.go
+++ b/cmd/speak.go
@@ -39,6 +39,20 @@ type speakOptions struct {
 
 	speakerBoost   bool
 	noSpeakerBoost bool
+
+	minimaxVolume                  float64
+	minimaxPitch                   int
+	minimaxEmotion                 string
+	minimaxLanguage                string
+	minimaxAccent                  string
+	minimaxTone                    []string
+	minimaxTextNormalization       bool
+	minimaxLatexRead               bool
+	minimaxContinuousSound         bool
+	minimaxVoiceModifyPitch        int
+	minimaxVoiceModifyIntensity    int
+	minimaxVoiceModifyTimbre       int
+	minimaxVoiceModifySoundEffects string
 }
 
 const defaultWPM = 175 // matches macOS `say` default rate
@@ -146,7 +160,7 @@ func init() {
 			var bytes int64
 			switch provider {
 			case providerMiniMax:
-				payload, err := buildMiniMaxTTSRequest(opts, text)
+				payload, err := buildMiniMaxTTSRequest(cmd, opts, text)
 				if err != nil {
 					return err
 				}
@@ -211,6 +225,19 @@ func init() {
 	cmd.Flags().StringVar(&opts.lang, "lang", "", "Language code (2-letter ISO 639-1; influences normalization; when set)")
 	cmd.Flags().BoolVar(&opts.metrics, "metrics", false, "Print request metrics to stderr (chars, bytes, duration, etc.)")
 	cmd.Flags().StringVarP(&opts.inputFile, "input-file", "f", "", "Read text from file (use '-' for stdin), matching macOS say -f")
+	cmd.Flags().Float64Var(&opts.minimaxVolume, "volume", 0, "MiniMax voice volume (>0..10; when set)")
+	cmd.Flags().IntVar(&opts.minimaxPitch, "pitch", 0, "MiniMax voice pitch (-12..12; when set)")
+	cmd.Flags().StringVar(&opts.minimaxEmotion, "emotion", "", "MiniMax voice emotion (model dependent)")
+	cmd.Flags().StringVar(&opts.minimaxLanguage, "language", "", "MiniMax language boost (e.g. English, Chinese,Yue; when set)")
+	cmd.Flags().StringVar(&opts.minimaxAccent, "accent", "", "Alias for --language (MiniMax language boost)")
+	cmd.Flags().StringArrayVar(&opts.minimaxTone, "tone", nil, "MiniMax pronunciation tone override (repeatable, e.g. \"omg/oh my god\")")
+	cmd.Flags().BoolVar(&opts.minimaxTextNormalization, "text-normalization", false, "MiniMax text normalization (improves digit reading; when set)")
+	cmd.Flags().BoolVar(&opts.minimaxLatexRead, "latex-read", false, "MiniMax LaTeX formula reading (Chinese only; when set)")
+	cmd.Flags().BoolVar(&opts.minimaxContinuousSound, "continuous-sound", false, "MiniMax continuous sound for smoother transitions (when set)")
+	cmd.Flags().IntVar(&opts.minimaxVoiceModifyPitch, "voice-modify-pitch", 0, "MiniMax voice modify pitch (-100..100; when set)")
+	cmd.Flags().IntVar(&opts.minimaxVoiceModifyIntensity, "voice-modify-intensity", 0, "MiniMax voice modify intensity (-100..100; when set)")
+	cmd.Flags().IntVar(&opts.minimaxVoiceModifyTimbre, "voice-modify-timbre", 0, "MiniMax voice modify timbre (-100..100; when set)")
+	cmd.Flags().StringVar(&opts.minimaxVoiceModifySoundEffects, "voice-modify-sound-effects", "", "MiniMax voice modify sound effects (e.g. spacious_echo, auditorium_echo, lofi_telephone, robotic)")
 	cmd.Flags().Bool("progress", false, "Accepted for macOS say compatibility (no-op)")
 	cmd.Flags().String("network-send", "", "Accepted for macOS say compatibility (not implemented)")
 	cmd.Flags().String("audio-device", "", "Accepted for macOS say compatibility (not implemented)")
@@ -591,11 +618,13 @@ func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput str
 			return "", err
 		}
 		w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
-		if _, err := fmt.Fprintf(w, "VOICE ID\tNAME\tCATEGORY\n"); err != nil {
+		if _, err := fmt.Fprintf(w, "VOICE ID\tNAME\tCATEGORY\tDESCRIPTION\n"); err != nil {
 			return "", err
 		}
 		for _, v := range voices {
-			if _, err := fmt.Fprintf(w, "%s\t%s\t%s\n", v.VoiceID, v.Name, v.Category); err != nil {
+			desc := strings.ReplaceAll(v.Description, "\t", " ")
+			desc = strings.ReplaceAll(desc, "\n", " ")
+			if _, err := fmt.Fprintf(w, "%s\t%s\t%s\t%s\n", v.VoiceID, v.Name, v.Category, desc); err != nil {
 				return "", err
 			}
 		}
@@ -779,29 +808,161 @@ func minimaxBaseURL() string {
 	return "https://" + host
 }
 
-func buildMiniMaxTTSRequest(opts speakOptions, text string) (minimax.TTSRequest, error) {
+func buildMiniMaxTTSRequest(cmd *cobra.Command, opts speakOptions, text string) (minimax.TTSRequest, error) {
+	flags := cmd.Flags()
+
 	format, err := normalizeMiniMaxFormat(opts.outputFmt)
 	if err != nil {
 		return minimax.TTSRequest{}, err
 	}
-	if opts.stream && format != "mp3" {
-		return minimax.TTSRequest{}, errors.New("MiniMax streaming supports mp3 only; use --no-stream for wav/flac")
+	formatExplicit := flags.Changed("format") || opts.outputPath != ""
+	if formatExplicit {
+		if opts.stream && format != "mp3" {
+			return minimax.TTSRequest{}, errors.New("MiniMax streaming supports mp3 only; use --no-stream for wav/flac")
+		}
+		if opts.play && format != "mp3" {
+			return minimax.TTSRequest{}, errors.New("MiniMax playback supports mp3 only; use --output without --play for wav/flac")
+		}
+	} else {
+		format = ""
 	}
-	if opts.play && format != "mp3" {
-		return minimax.TTSRequest{}, errors.New("MiniMax playback supports mp3 only; use --output without --play for wav/flac")
+
+	var speedPtr *float64
+	if flags.Changed("speed") || flags.Changed("rate") {
+		speed := opts.speed
+		speedPtr = &speed
+	}
+
+	var volumePtr *float64
+	if flags.Changed("volume") {
+		if opts.minimaxVolume <= 0 || opts.minimaxVolume > 10 {
+			return minimax.TTSRequest{}, errors.New("volume must be between 0 and 10 (exclusive 0)")
+		}
+		volume := opts.minimaxVolume
+		volumePtr = &volume
+	}
+
+	var pitchPtr *int
+	if flags.Changed("pitch") {
+		if opts.minimaxPitch < -12 || opts.minimaxPitch > 12 {
+			return minimax.TTSRequest{}, errors.New("pitch must be between -12 and 12")
+		}
+		pitch := opts.minimaxPitch
+		pitchPtr = &pitch
+	}
+
+	emotion := strings.TrimSpace(opts.minimaxEmotion)
+	if flags.Changed("emotion") && emotion == "" {
+		return minimax.TTSRequest{}, errors.New("emotion cannot be empty")
+	}
+
+	var textNormPtr *bool
+	if flags.Changed("text-normalization") {
+		v := opts.minimaxTextNormalization
+		textNormPtr = &v
+	}
+
+	var latexReadPtr *bool
+	if flags.Changed("latex-read") {
+		v := opts.minimaxLatexRead
+		latexReadPtr = &v
+	}
+
+	var continuousSoundPtr *bool
+	if flags.Changed("continuous-sound") {
+		v := opts.minimaxContinuousSound
+		continuousSoundPtr = &v
+	}
+
+	var languageBoost string
+	if flags.Changed("language") || flags.Changed("accent") {
+		lang := strings.TrimSpace(opts.minimaxLanguage)
+		accent := strings.TrimSpace(opts.minimaxAccent)
+		if lang != "" && accent != "" && lang != accent {
+			return minimax.TTSRequest{}, errors.New("choose only one of --language or --accent (or set the same value)")
+		}
+		if lang != "" {
+			languageBoost = lang
+		} else {
+			languageBoost = accent
+		}
+		if languageBoost == "" {
+			return minimax.TTSRequest{}, errors.New("language/accent cannot be empty")
+		}
+	}
+
+	var tone []string
+	if flags.Changed("tone") {
+		for _, entry := range opts.minimaxTone {
+			value := strings.TrimSpace(entry)
+			if value == "" {
+				return minimax.TTSRequest{}, errors.New("tone entries cannot be empty")
+			}
+			tone = append(tone, value)
+		}
+	}
+
+	var voiceModify *minimax.VoiceModify
+	var voiceModifyPitch *int
+	var voiceModifyIntensity *int
+	var voiceModifyTimbre *int
+	var voiceModifySoundEffects *string
+	if flags.Changed("voice-modify-pitch") {
+		if opts.minimaxVoiceModifyPitch < -100 || opts.minimaxVoiceModifyPitch > 100 {
+			return minimax.TTSRequest{}, errors.New("voice-modify-pitch must be between -100 and 100")
+		}
+		v := opts.minimaxVoiceModifyPitch
+		voiceModifyPitch = &v
+	}
+	if flags.Changed("voice-modify-intensity") {
+		if opts.minimaxVoiceModifyIntensity < -100 || opts.minimaxVoiceModifyIntensity > 100 {
+			return minimax.TTSRequest{}, errors.New("voice-modify-intensity must be between -100 and 100")
+		}
+		v := opts.minimaxVoiceModifyIntensity
+		voiceModifyIntensity = &v
+	}
+	if flags.Changed("voice-modify-timbre") {
+		if opts.minimaxVoiceModifyTimbre < -100 || opts.minimaxVoiceModifyTimbre > 100 {
+			return minimax.TTSRequest{}, errors.New("voice-modify-timbre must be between -100 and 100")
+		}
+		v := opts.minimaxVoiceModifyTimbre
+		voiceModifyTimbre = &v
+	}
+	if flags.Changed("voice-modify-sound-effects") {
+		value := strings.TrimSpace(opts.minimaxVoiceModifySoundEffects)
+		if value == "" {
+			return minimax.TTSRequest{}, errors.New("voice-modify-sound-effects cannot be empty")
+		}
+		voiceModifySoundEffects = &value
+	}
+	if voiceModifyPitch != nil || voiceModifyIntensity != nil || voiceModifyTimbre != nil || voiceModifySoundEffects != nil {
+		voiceModify = &minimax.VoiceModify{
+			Pitch:        voiceModifyPitch,
+			Intensity:    voiceModifyIntensity,
+			Timbre:       voiceModifyTimbre,
+			SoundEffects: voiceModifySoundEffects,
+		}
+	}
+
+	var pronunciationDict *minimax.PronunciationDict
+	if len(tone) > 0 {
+		pronunciationDict = &minimax.PronunciationDict{Tone: tone}
 	}
 
-	speed := opts.speed
 	return minimax.TTSRequest{
-		Model:       opts.modelID,
-		Text:        text,
-		Speed:       speed,
-		Volume:      1.0,
-		Pitch:       0,
-		AudioFormat: format,
-		SampleRate:  32000,
-		Bitrate:     128000,
-		Channel:     1,
+		Model:             opts.modelID,
+		Text:              text,
+		Speed:             speedPtr,
+		Volume:            volumePtr,
+		Pitch:             pitchPtr,
+		Emotion:           emotion,
+		TextNormalization: textNormPtr,
+		LatexRead:         latexReadPtr,
+		AudioFormat:       format,
+		LanguageBoost:     languageBoost,
+		ContinuousSound:   continuousSoundPtr,
+		PronunciationDict: pronunciationDict,
+		VoiceModify:       voiceModify,
 	}, nil
 }
 
diff --git a/go.mod b/go.mod
index 2cc43cd..adfa966 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,6 @@ module github.com/steipete/sag
 go 1.24.0
 
 require (
-	github.com/coder/websocket v1.8.14
 	github.com/ebitengine/oto/v3 v3.4.0
 	github.com/hajimehoshi/go-mp3 v0.3.4
 	github.com/spf13/cobra v1.10.2
diff --git a/go.sum b/go.sum
index e58cdd8..280fd5c 100644
--- a/go.sum
+++ b/go.sum
@@ -1,5 +1,3 @@
-github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g=
-github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg=
 github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
 github.com/ebitengine/oto/v3 v3.4.0 h1:br0PgASsEWaoWn38b2Goe7m1GKFYfNgnsjSd5Gg+/bQ=
 github.com/ebitengine/oto/v3 v3.4.0/go.mod h1:IOleLVD0m+CMak3mRVwsYY8vTctQgOM0iiL6S7Ar7eI=
diff --git a/internal/minimax/client.go b/internal/minimax/client.go
index bb98781..7211b8f 100644
--- a/internal/minimax/client.go
+++ b/internal/minimax/client.go
@@ -1,6 +1,7 @@
 package minimax
 
 import (
+	"bufio"
 	"bytes"
 	"context"
 	"encoding/hex"
@@ -13,9 +14,6 @@ import (
 	"path"
 	"strings"
 	"time"
-
-	"github.com/coder/websocket"
-	"github.com/coder/websocket/wsjson"
 )
 
 const defaultBaseURL = "https://api.minimax.io"
@@ -127,17 +125,22 @@ func (c *Client) ListVoices(ctx context.Context) ([]Voice, error) {
 
 // TTSRequest configures a text-to-speech request payload.
 type TTSRequest struct {
-	Model           string
-	Text            string
-	Speed           float64
-	Volume          float64
-	Pitch           int
-	AudioFormat     string
-	SampleRate      int
-	Bitrate         int
-	Channel         int
-	LanguageBoost   string
-	ContinuousSound *bool
+	Model             string
+	Text              string
+	Speed             *float64
+	Volume            *float64
+	Pitch             *int
+	Emotion           string
+	TextNormalization *bool
+	LatexRead         *bool
+	AudioFormat       string
+	SampleRate        int
+	Bitrate           int
+	Channel           int
+	LanguageBoost     string
+	ContinuousSound   *bool
+	PronunciationDict *PronunciationDict
+	VoiceModify       *VoiceModify
 }
 
 type baseResp struct {
@@ -160,10 +163,13 @@ func (b *baseResp) err() error {
 }
 
 type voiceSetting struct {
-	VoiceID string  `json:"voice_id"`
-	Speed   float64 `json:"speed"`
-	Vol     float64 `json:"vol"`
-	Pitch   int     `json:"pitch"`
+	VoiceID           string   `json:"voice_id"`
+	Speed             *float64 `json:"speed,omitempty"`
+	Vol               *float64 `json:"vol,omitempty"`
+	Pitch             *int     `json:"pitch,omitempty"`
+	Emotion           string   `json:"emotion,omitempty"`
+	TextNormalization *bool    `json:"text_normalization,omitempty"`
+	LatexRead         *bool    `json:"latex_read,omitempty"`
 }
 
 type audioSetting struct {
@@ -173,15 +179,35 @@ type audioSetting struct {
 	Channel    int    `json:"channel,omitempty"`
 }
 
+// PronunciationDict configures pronunciation overrides.
+type PronunciationDict struct {
+	Tone []string `json:"tone,omitempty"`
+}
+
+// VoiceModify configures voice effects.
+type VoiceModify struct {
+	Pitch        *int    `json:"pitch,omitempty"`
+	Intensity    *int    `json:"intensity,omitempty"`
+	Timbre       *int    `json:"timbre,omitempty"`
+	SoundEffects *string `json:"sound_effects,omitempty"`
+}
+
 type t2aRequest struct {
-	Model           string       `json:"model"`
-	Text            string       `json:"text"`
-	Stream          bool         `json:"stream"`
-	OutputFormat    string       `json:"output_format,omitempty"`
-	VoiceSetting    voiceSetting `json:"voice_setting"`
-	AudioSetting    audioSetting `json:"audio_setting,omitempty"`
-	LanguageBoost   string       `json:"language_boost,omitempty"`
-	ContinuousSound *bool        `json:"continuous_sound,omitempty"`
+	Model             string             `json:"model"`
+	Text              string             `json:"text"`
+	Stream            bool               `json:"stream"`
+	StreamOptions     *t2aStreamOptions  `json:"stream_options,omitempty"`
+	OutputFormat      string             `json:"output_format,omitempty"`
+	VoiceSetting      voiceSetting       `json:"voice_setting"`
+	AudioSetting      audioSetting       `json:"audio_setting,omitempty"`
+	LanguageBoost     string             `json:"language_boost,omitempty"`
+	ContinuousSound   *bool              `json:"continuous_sound,omitempty"`
+	PronunciationDict *PronunciationDict `json:"pronunciation_dict,omitempty"`
+	VoiceModify       *VoiceModify       `json:"voice_modify,omitempty"`
+}
+
+type t2aStreamOptions struct {
+	ExcludeAggregatedAudio bool `json:"exclude_aggregated_audio,omitempty"`
 }
 
 type t2aResponse struct {
@@ -191,6 +217,16 @@ type t2aResponse struct {
 	BaseResp *baseResp `json:"base_resp,omitempty"`
 }
 
+type t2aStreamData struct {
+	Audio  string `json:"audio,omitempty"`
+	Status int    `json:"status,omitempty"`
+}
+
+type t2aStreamResponse struct {
+	Data     *t2aStreamData `json:"data,omitempty"`
+	BaseResp *baseResp      `json:"base_resp,omitempty"`
+}
+
 // ConvertTTS downloads the full audio before returning.
 func (c *Client) ConvertTTS(ctx context.Context, voiceID string, req TTSRequest) ([]byte, error) {
 	u, err := c.httpURL("/v1/t2a_v2")
@@ -199,14 +235,16 @@ func (c *Client) ConvertTTS(ctx context.Context, voiceID string, req TTSRequest)
 	}
 
 	payload := t2aRequest{
-		Model:           req.Model,
-		Text:            req.Text,
-		Stream:          false,
-		OutputFormat:    "hex",
-		VoiceSetting:    buildVoiceSetting(voiceID, req),
-		AudioSetting:    buildAudioSetting(req),
-		LanguageBoost:   req.LanguageBoost,
-		ContinuousSound: req.ContinuousSound,
+		Model:             req.Model,
+		Text:              req.Text,
+		Stream:            false,
+		OutputFormat:      "hex",
+		VoiceSetting:      buildVoiceSetting(voiceID, req),
+		AudioSetting:      buildAudioSetting(req),
+		LanguageBoost:     req.LanguageBoost,
+		ContinuousSound:   req.ContinuousSound,
+		PronunciationDict: req.PronunciationDict,
+		VoiceModify:       req.VoiceModify,
 	}
 	bodyBytes, err := json.Marshal(payload)
 	if err != nil {
@@ -250,31 +288,6 @@ func (c *Client) ConvertTTS(ctx context.Context, voiceID string, req TTSRequest)
 	return data, nil
 }
 
-type wsTaskStart struct {
-	Event           string       `json:"event"`
-	Model           string       `json:"model"`
-	VoiceSetting    voiceSetting `json:"voice_setting"`
-	AudioSetting    audioSetting `json:"audio_setting,omitempty"`
-	LanguageBoost   string       `json:"language_boost,omitempty"`
-	ContinuousSound *bool        `json:"continuous_sound,omitempty"`
-}
-
-type wsTaskContinue struct {
-	Event string `json:"event"`
-	Text  string `json:"text"`
-}
-
-type wsMessage struct {
-	Event    string    `json:"event"`
-	Data     *wsData   `json:"data,omitempty"`
-	BaseResp *baseResp `json:"base_resp,omitempty"`
-	IsFinal  bool      `json:"is_final,omitempty"`
-}
-
-type wsData struct {
-	Audio string `json:"audio,omitempty"`
-}
-
 type cancelReadCloser struct {
 	*io.PipeReader
 	cancel func()
@@ -285,116 +298,181 @@ func (c *cancelReadCloser) Close() error {
 	return c.PipeReader.Close()
 }
 
-// StreamTTS streams MP3 audio from MiniMax via WebSocket.
+// StreamTTS streams MP3 audio from MiniMax via HTTP (SSE).
 func (c *Client) StreamTTS(ctx context.Context, voiceID string, req TTSRequest) (io.ReadCloser, error) {
-	wsURL, err := c.wsURL("/ws/v1/t2a_v2")
+	u, err := c.httpURL("/v1/t2a_v2")
 	if err != nil {
 		return nil, err
 	}
+
+	payload := t2aRequest{
+		Model:             req.Model,
+		Text:              req.Text,
+		Stream:            true,
+		StreamOptions:     &t2aStreamOptions{ExcludeAggregatedAudio: true},
+		OutputFormat:      "hex",
+		VoiceSetting:      buildVoiceSetting(voiceID, req),
+		AudioSetting:      buildAudioSetting(req),
+		LanguageBoost:     req.LanguageBoost,
+		ContinuousSound:   req.ContinuousSound,
+		PronunciationDict: req.PronunciationDict,
+		VoiceModify:       req.VoiceModify,
+	}
+	bodyBytes, err := json.Marshal(payload)
+	if err != nil {
+		return nil, err
+	}
+
 	ctx, cancel := context.WithCancel(ctx)
-	pr, pw := io.Pipe()
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, u, bytes.NewReader(bodyBytes))
+	if err != nil {
+		cancel()
+		return nil, err
+	}
+	httpReq.Header.Set("Authorization", "Bearer "+c.apiKey)
+	httpReq.Header.Set("Content-Type", "application/json")
+	httpReq.Header.Set("Accept", "text/event-stream")
 
+	resp, err := c.httpClient.Do(httpReq)
+	if err != nil {
+		cancel()
+		return nil, err
+	}
+	if resp.StatusCode >= 400 {
+		body, _ := io.ReadAll(resp.Body)
+		_ = resp.Body.Close()
+		cancel()
+		return nil, fmt.Errorf("stream TTS failed: %s: %s", resp.Status, strings.TrimSpace(string(body)))
+	}
+
+	pr, pw := io.Pipe()
 	go func() {
 		defer cancel()
-		defer func() { _ = pw.Close() }()
-
-		header := http.Header{}
-		header.Set("Authorization", "Bearer "+c.apiKey)
-		conn, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: header})
-		if err != nil {
+		defer func() { _ = resp.Body.Close() }()
+		if err := readMiniMaxStream(ctx, resp.Body, pw); err != nil && !errors.Is(err, context.Canceled) {
 			_ = pw.CloseWithError(err)
 			return
 		}
-		defer func() {
-			_ = conn.Close(websocket.StatusNormalClosure, "done")
-		}()
+		_ = pw.Close()
+	}()
 
-		if err := readWSUntilEvent(ctx, conn, "connected_success"); err != nil {
-			_ = pw.CloseWithError(err)
-			return
-		}
+	return &cancelReadCloser{PipeReader: pr, cancel: cancel}, nil
+}
 
-		start := wsTaskStart{
-			Event:           "task_start",
-			Model:           req.Model,
-			VoiceSetting:    buildVoiceSetting(voiceID, req),
-			AudioSetting:    buildAudioSetting(req),
-			LanguageBoost:   req.LanguageBoost,
-			ContinuousSound: req.ContinuousSound,
-		}
-		if err := wsjson.Write(ctx, conn, start); err != nil {
-			_ = pw.CloseWithError(err)
-			return
-		}
-		if err := readWSUntilEvent(ctx, conn, "task_started"); err != nil {
-			_ = pw.CloseWithError(err)
-			return
+func readMiniMaxStream(ctx context.Context, body io.Reader, pw *io.PipeWriter) error {
+	reader := bufio.NewReader(body)
+	var dataLines []string
+	for {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
 		}
 
-		if err := wsjson.Write(ctx, conn, wsTaskContinue{Event: "task_continue", Text: req.Text}); err != nil {
-			_ = pw.CloseWithError(err)
-			return
+		line, err := reader.ReadString('\n')
+		if err != nil && err != io.EOF {
+			return err
+		}
+		if err == io.EOF && len(line) == 0 {
+			break
 		}
 
-		for {
-			var msg wsMessage
-			if err := wsjson.Read(ctx, conn, &msg); err != nil {
-				_ = pw.CloseWithError(err)
-				return
-			}
-			if err := msg.BaseResp.err(); err != nil {
-				_ = pw.CloseWithError(err)
-				return
-			}
-			if msg.Event == "task_failed" {
-				_ = pw.CloseWithError(errors.New("minimax stream failed"))
-				return
-			}
-			if msg.Data != nil && msg.Data.Audio != "" {
-				chunk, err := hex.DecodeString(msg.Data.Audio)
+		line = strings.TrimRight(line, "\r\n")
+		if line == "" {
+			if len(dataLines) > 0 {
+				done, err := handleMiniMaxStreamPayload(strings.Join(dataLines, "\n"), pw)
 				if err != nil {
-					_ = pw.CloseWithError(fmt.Errorf("decode audio chunk: %w", err))
-					return
+					return err
 				}
-				if len(chunk) > 0 {
-					if _, err := pw.Write(chunk); err != nil {
-						return
-					}
+				if done {
+					return nil
 				}
+				dataLines = dataLines[:0]
 			}
-			if msg.IsFinal || msg.Event == "task_finished" {
-				return
+		} else if strings.HasPrefix(line, "data:") {
+			dataLines = append(dataLines, strings.TrimSpace(strings.TrimPrefix(line, "data:")))
+		} else if strings.HasPrefix(line, ":") || strings.HasPrefix(line, "event:") || strings.HasPrefix(line, "id:") || strings.HasPrefix(line, "retry:") {
+			// Ignore SSE metadata/comments.
+		} else {
+			trimmed := strings.TrimSpace(line)
+			if strings.HasPrefix(trimmed, "{") || strings.HasPrefix(trimmed, "[") {
+				done, err := handleMiniMaxStreamPayload(trimmed, pw)
+				if err != nil {
+					return err
+				}
+				if done {
+					return nil
+				}
 			}
 		}
-	}()
 
-	return &cancelReadCloser{PipeReader: pr, cancel: cancel}, nil
-}
+		if err == io.EOF {
+			break
+		}
+	}
 
-func readWSUntilEvent(ctx context.Context, conn *websocket.Conn, want string) error {
-	for {
-		var msg wsMessage
-		if err := wsjson.Read(ctx, conn, &msg); err != nil {
+	if len(dataLines) > 0 {
+		done, err := handleMiniMaxStreamPayload(strings.Join(dataLines, "\n"), pw)
+		if err != nil {
 			return err
 		}
-		if err := msg.BaseResp.err(); err != nil {
-			return err
+		if done {
+			return nil
 		}
-		if msg.Event == "task_failed" {
-			return errors.New("minimax task failed")
+	}
+	return nil
+}
+
+func handleMiniMaxStreamPayload(payload string, pw *io.PipeWriter) (bool, error) {
+	payload = strings.TrimSpace(payload)
+	if payload == "" {
+		return false, nil
+	}
+
+	var items []t2aStreamResponse
+	if strings.HasPrefix(payload, "[") {
+		if err := json.Unmarshal([]byte(payload), &items); err != nil {
+			return false, err
 		}
-		if msg.Event == want {
-			return nil
+	} else {
+		var item t2aStreamResponse
+		if err := json.Unmarshal([]byte(payload), &item); err != nil {
+			return false, err
 		}
+		items = append(items, item)
 	}
+
+	for _, item := range items {
+		if err := item.BaseResp.err(); err != nil {
+			return false, err
+		}
+		if item.Data != nil && item.Data.Audio != "" {
+			chunk, err := hex.DecodeString(item.Data.Audio)
+			if err != nil {
+				return false, fmt.Errorf("decode audio chunk: %w", err)
+			}
+			if len(chunk) > 0 {
+				if _, err := pw.Write(chunk); err != nil {
+					return false, err
+				}
+			}
+		}
+		if item.Data != nil && item.Data.Status == 2 {
+			return true, nil
+		}
+	}
+	return false, nil
 }
 
 func buildVoiceSetting(voiceID string, req TTSRequest) voiceSetting {
 	return voiceSetting{
-		VoiceID: voiceID,
-		Speed:   req.Speed,
-		Vol:     req.Volume,
-		Pitch:   req.Pitch,
+		VoiceID:           voiceID,
+		Speed:             req.Speed,
+		Vol:               req.Volume,
+		Pitch:             req.Pitch,
+		Emotion:           req.Emotion,
+		TextNormalization: req.TextNormalization,
+		LatexRead:         req.LatexRead,
 	}
 }
 
@@ -415,21 +493,3 @@ func (c *Client) httpURL(endpoint string) (string, error) {
 	u.Path = path.Join(u.Path, endpoint)
 	return u.String(), nil
 }
-
-func (c *Client) wsURL(endpoint string) (string, error) {
-	u, err := url.Parse(c.baseURL)
-	if err != nil {
-		return "", err
-	}
-	switch u.Scheme {
-	case "http":
-		u.Scheme = "ws"
-	case "https":
-		u.Scheme = "wss"
-	case "ws", "wss":
-	default:
-		u.Scheme = "wss"
-	}
-	u.Path = path.Join(u.Path, endpoint)
-	return u.String(), nil
-}