Skip to content

Commit

Permalink
fix: fixed tts_server.py on macOS
Browse files Browse the repository at this point in the history
  • Loading branch information
ErikBjare committed Jan 24, 2025
1 parent db212bd commit ac3e2a0
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 45 deletions.
109 changes: 70 additions & 39 deletions gptme/tools/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
else:
console.log("TTS disabled: server not available")
except (ImportError, OSError):
# will happen if tts extras not installed
# sounddevice may throw OSError("PortAudio library not found")
_available = False
# fmt: on
Expand Down Expand Up @@ -203,6 +204,67 @@ def clean_for_speech(content: str) -> str:
return content.strip()


def get_output_device():
    """Get the best available output device and its sample rate.

    Candidates are tried in this order:

    1. the system default output device, if it reports output channels;
    2. the first output-capable device that belongs to the Core Audio host API;
    3. the first output-capable device on any host API.

    Returns:
        tuple: (device_index, sample_rate)

    Raises:
        RuntimeError: If no suitable output device is found
    """
    devices = sd.query_devices()
    log.debug("Available audio devices:")
    for i, dev in enumerate(devices):
        log.debug(
            f"  [{i}] {dev['name']} (in: {dev['max_input_channels']}, "
            f"out: {dev['max_output_channels']}, hostapi: {dev['hostapi']})"
        )

    # First try: use system default output device
    try:
        default_output = sd.default.device[1]
        if default_output is not None:
            device_info = sd.query_devices(default_output)
            if device_info["max_output_channels"] > 0:
                log.debug(f"Using system default output device: {device_info['name']}")
                return default_output, int(device_info["default_samplerate"])
    except Exception as e:
        # Best effort only: any failure here falls through to the manual search.
        log.debug(f"Could not use default device: {e}")

    # Second try: prefer CoreAudio devices.
    # A device's "hostapi" field is an index into sd.query_hostapis(), and the
    # order of that list depends on the platform/build, so the Core Audio entry
    # is looked up by name instead of assuming a fixed index.
    coreaudio_apis = {
        api_index
        for api_index, api in enumerate(sd.query_hostapis())
        if api["name"].replace(" ", "").lower() == "coreaudio"
    }
    output_device = next(
        (
            i
            for i, d in enumerate(devices)
            if d["max_output_channels"] > 0 and d["hostapi"] in coreaudio_apis
        ),
        None,
    )

    # Third try: any device with output channels
    if output_device is None:
        output_device = next(
            (i for i, d in enumerate(devices) if d["max_output_channels"] > 0),
            None,
        )

    if output_device is None:
        raise RuntimeError(
            "No suitable audio output device found. "
            "Available devices:\n"
            + "\n".join(f"  {i}: {d['name']}" for i, d in enumerate(devices))
        )

    device_info = sd.query_devices(output_device)
    device_sr = int(device_info["default_samplerate"])

    log.debug(f"Selected output device: {output_device} ({device_info['name']})")
    log.debug(f"Sample rate: {device_sr}")

    return output_device, device_sr


def audio_player_thread():
"""Background thread for playing audio."""
log.debug("Audio player thread started")
Expand All @@ -221,22 +283,13 @@ def audio_player_thread():
f"Playing audio: shape={data.shape}, sr={sample_rate}, vol={current_volume}"
)

# Play audio using explicit device index
devices = sd.query_devices()
output_device = next(
(
i
for i, d in enumerate(devices)
if d["max_output_channels"] > 0 and d["hostapi"] == 2
),
None,
)
if output_device is None:
log.error("No suitable output device found")
# Get output device
try:
output_device, _ = get_output_device()
log.debug(f"Playing on device: {output_device}")
except RuntimeError as e:
log.error(str(e))
continue

device_info = sd.query_devices(output_device)
log.debug(f"Playing on device: {output_device} ({device_info['name']})")
sd.play(data, sample_rate, device=output_device)
sd.wait() # Wait until audio is finished playing
log.debug("Finished playing audio chunk")
Expand Down Expand Up @@ -300,30 +353,8 @@ def speak(text, block=False, interrupt=True, clean=True):
chunks = [c.replace("gptme", "gpt-me") for c in chunks] # Fix pronunciation

try:
# Find the current output device
devices = sd.query_devices()
output_device = next(
(
i
for i, d in enumerate(devices)
if d["max_output_channels"] > 0 and d["hostapi"] == 2
),
None,
)
if output_device is None:
raise RuntimeError("No suitable output device found")

device_info = sd.query_devices(output_device)
device_sr = int(device_info["default_samplerate"])

log.debug("Available audio devices:")
for i, dev in enumerate(devices):
log.debug(
f" [{i}] {dev['name']} (in: {dev['max_input_channels']}, out: {dev['max_output_channels']}, hostapi: {dev['hostapi']})"
)

log.debug(f"Selected output device: {output_device} ({device_info['name']})")
log.debug(f"Sample rate: {device_sr}")
# Get output device and sample rate
output_device, device_sr = get_output_device()

# Ensure playback thread is running
ensure_playback_thread()
Expand Down
15 changes: 9 additions & 6 deletions scripts/tts_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import glob
import io
import logging
import subprocess
import shutil
import sys
from pathlib import Path

Expand All @@ -35,13 +35,19 @@
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from phonemizer.backend.espeak.wrapper import EspeakWrapper

# Directory containing this script; the Kokoro-82M checkout is expected next to it.
script_dir = Path(__file__).parent

# Add Kokoro-82M to Python path
kokoro_path = (script_dir / "Kokoro-82M").absolute()
sys.path.insert(0, str(kokoro_path))

# on macOS, use workaround for espeak detection
if sys.platform == "darwin":
    # Original known-good location. It is tied to one specific Homebrew espeak
    # version, so it is only used as-is when it actually exists.
    _ESPEAK_LIBRARY = "/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib"
    if not Path(_ESPEAK_LIBRARY).exists():
        # Fall back to searching common Homebrew locations (Apple Silicon and
        # Intel prefixes, espeak and espeak-ng). If nothing matches, the
        # original path is kept so behaviour is unchanged from before.
        _espeak_patterns = (
            "/opt/homebrew/Cellar/espeak*/*/lib/libespeak*.dylib",
            "/usr/local/Cellar/espeak*/*/lib/libespeak*.dylib",
            "/opt/homebrew/lib/libespeak*.dylib",
            "/usr/local/lib/libespeak*.dylib",
        )
        _ESPEAK_LIBRARY = next(
            (
                match
                for pattern in _espeak_patterns
                for match in sorted(glob.glob(pattern))
            ),
            _ESPEAK_LIBRARY,
        )
    EspeakWrapper.set_library(_ESPEAK_LIBRARY)

from kokoro import generate # fmt: skip
from models import build_model # fmt: skip

Expand Down Expand Up @@ -75,12 +81,9 @@ def load_voice(voice_name: str):


def _check_espeak():
    """Verify that an espeak executable is available on PATH.

    Looks for either `espeak` or `espeak-ng` using `shutil.which`, so no
    subprocess is spawned.

    Raises:
        RuntimeError: If neither `espeak` nor `espeak-ng` can be found.
    """
    if not any([shutil.which("espeak"), shutil.which("espeak-ng")]):
        raise RuntimeError(
            "Failed to find `espeak` or `espeak-ng`. Try to install it using 'sudo apt-get install espeak-ng' or equivalent"
        ) from None


Expand Down

0 comments on commit ac3e2a0

Please sign in to comment.