ManimCommunity · osolmaz · Feb 25, 2024 · Feb 20, 2024 · Feb 20, 2024 · Feb 25, 2024
@@ -42,6 +42,10 @@ Speech services
    :members:
    :show-inheritance:
 
+.. automodule:: manim_voiceover.services.elevenlabs
+   :members:
+   :show-inheritance:
+
 
 Defaults
 ~~~~~~~~

@@ -32,6 +32,11 @@ Manim Voiceover defines the :py:class:`~~base.SpeechService` class for adding ne
      - No
      - Yes
      - Azure gives 500min/month free TTS quota. However, registration still needs a credit or debit card. See `Azure free account FAQ <https://azure.microsoft.com/en-us/free/free-account-faq/>`__ for more details.
+   * - :py:class:`~elevenlabs.ElevenLabsService`
+     - Very good, human-like
+     - No
+     - Yes
+     - `ElevenLabs <https://elevenlabs.io/>`__ develops very advanced generative voice AI models. It offers a range of realistic and emotive voices, and also allows you to clone your own voice by uploading a few minutes of your speech.
    * - :py:class:`~coqui.CoquiService`
      - Good, human-like
      - Yes
@@ -144,3 +149,36 @@ Install Manim Voiceover with the ``pyttsx3`` extra in order to use :py:class:`~p
    pip install "manim-voiceover[pyttsx3]"
 
 Refer to the `example usage <https://github.com/ManimCommunity/manim-voiceover/blob/main/examples/pyttsx3-example.py>`__ to get started.
+
+
+:py:class:`~elevenlabs.ElevenLabsService`
+******************************************
+
+`ElevenLabs <https://www.elevenlabs.io/>`__ offers one of the most natural-sounding speech service APIs. To use it, you will need to create an account at `ElevenLabs <https://elevenlabs.io/sign-up>`__.
+
+.. tip::
+    ElevenLabs currently offers free TTS of 10,000 characters/month and up to 3 custom voices.
+
+Install Manim Voiceover with the ``elevenlabs`` extra in order to use :py:class:`~elevenlabs.ElevenLabsService`:
+
+.. code:: sh
+
+   pip install "manim-voiceover[elevenlabs]"
+
+Then, you need to find out your API key.
+
+- Sign in to the `ElevenLabs portal <https://www.elevenlabs.io/>`__ and go to your profile to obtain the key.
+- Set the environment variable ``ELEVEN_API_KEY`` to your key.
+
+Alternatively, create a file called ``.env`` that contains your authentication
+information in the same directory where you call Manim.
+
+.. code:: sh
+
+   ELEVEN_API_KEY="..." # insert your API key here
+
+Check out `ElevenLabs
+docs <https://elevenlabs.io/docs/api-reference/python-text-to-speech-guide#getting-started>`__
+for more details.
+
+Refer to the `example usage <https://github.com/ManimCommunity/manim-voiceover/blob/main/examples/elevenlabs-example.py>`__ to get started.
@@ -0,0 +1,45 @@
+from manim import *
+
+from manim_voiceover import VoiceoverScene
+from manim_voiceover.services.elevenlabs import ElevenLabsService
+
+
+class ElevenLabsExample(VoiceoverScene):
+    def construct(self):
+        # Set speech service using defaults, without voice_name or voice_id
+        # If none of voice_name or voice_id is passed, it defaults to the
+        # first voice in the list returned by `voices()`
+        #
+        # self.set_speech_service(ElevenLabsService())
+        #
+        # Set speech service using voice_name
+        #
+        # self.set_speech_service(ElevenLabsService(voice_name="Adam"))
+        #
+        # Set speech service using voice_id
+        #
+        # self.set_speech_service(ElevenLabsService(voice_id="29vD33N1CtxCmqQRPOHJ"))
+
+        # customise voice by passing voice_settings
+        self.set_speech_service(
+            ElevenLabsService(
+                voice_name="Adam",
+                voice_settings={"stability": 0.001, "similarity_boost": 0.25},
+            )
+        )
+        circle = Circle()
+        square = Square().shift(2 * RIGHT)
+
+        with self.voiceover(text="This circle is drawn as I speak.") as tracker:
+            self.play(Create(circle), run_time=tracker.duration)
+
+        with self.voiceover(text="Let's shift it to the left 2 units.") as tracker:
+            self.play(circle.animate.shift(2 * LEFT), run_time=tracker.duration)
+
+        with self.voiceover(text="Now, let's transform it into a square.") as tracker:
+            self.play(Transform(circle, square), run_time=tracker.duration)
+
+        with self.voiceover(text="Thank you for watching."):
+            self.play(Uncreate(circle))
+
+        self.wait()
@@ -0,0 +1,181 @@
+import os
+import sys
+from pathlib import Path
+from typing import List, Optional, Union
+
+from dotenv import find_dotenv, load_dotenv
+from manim import logger
+
+from manim_voiceover.helper import create_dotenv_file
+
+try:
+    import elevenlabs
+except ImportError:
+    logger.error(
+        'Missing packages. Run `pip install "manim-voiceover[elevenlabs]"` '
+        "to use ElevenLabs API."
+    )
+
+from elevenlabs import Voice, VoiceSettings, generate, save, voices
+
+from manim_voiceover.helper import create_dotenv_file, remove_bookmarks
+from manim_voiceover.services.base import SpeechService
+
+load_dotenv(find_dotenv(usecwd=True))
+
+
+def create_dotenv_elevenlabs():
+    logger.info(
+        "Check out https://voiceover.manim.community/en/stable/services.html#elevenlabs"
+        " to learn how to create an account and get your subscription key."
+    )
+    try:
+        os.environ["ELEVEN_API_KEY"]
+    except KeyError:
+        if not create_dotenv_file(["ELEVEN_API_KEY"]):
+            raise Exception(
+                "The environment variables ELEVEN_API_KEY are not set. "
+                "Please set them or create a .env file with the variables."
+            )
+        logger.info("The .env file has been created. Please run Manim again.")
+        sys.exit()
+
+
+create_dotenv_elevenlabs()
+
+
+class ElevenLabsService(SpeechService):
+    """Speech service for ElevenLabs API."""
+
+    def __init__(
+        self,
+        voice_name: Optional[str] = None,
+        voice_id: Optional[str] = None,
+        model: str = "eleven_monolingual_v1",
+        voice_settings: Optional[Union[VoiceSettings, dict]] = None,
+        transcription_model: str = "base",
+        **kwargs,
+    ):
+        """
+        Args:
+            voice_name (str, optional): The name of the voice to use.
+                See the
+                `API page <https://elevenlabs.io/docs/api-reference/text-to-speech>`
+                for reference. Defaults to `None`.
+                If none of `voice_name` or `voice_id` is be provided,
+                it uses default available voice.
+            voice_id (str, Optional): The id of the voice to use.
+                See the
+                `API page <https://elevenlabs.io/docs/api-reference/text-to-speech>`
+                for reference. Defaults to `None`. If none of `voice_name`
+                or `voice_id` must be provided, it uses default available voice.
+            model (str, optional): The name of the model to use. See the `API
+                page: <https://elevenlabs.io/docs/api-reference/text-to-speech>`
+                for reference. Defaults to `eleven_monolingual_v1`
+            voice_settings (Union[VoiceSettings, dict], optional): The voice
+                settings to use.
+                See the
+                `Docs: <https://elevenlabs.io/docs/speech-synthesis/voice-settings>`
+                for reference.
+                It is a dictionary, with keys: `stability` (Required, number),
+                `similarity_boost` (Required, number),
+                `style` (Optional, number, default 0), `use_speaker_boost`
+                (Optional, boolean, True).
+        """
+        if not voice_name and not voice_id:
+            logger.warn(
+                "None of `voice_name` or `voice_id` provided. "
+                "Will be using default voice."
+            )
+
+        available_voices: List[Voice] = voices()
+
+        if voice_name:
+            selected_voice = [v for v in available_voices if v.name == voice_name]
+        elif voice_id:
+            selected_voice = [v for v in available_voices if v.voice_id == voice_id]
+        else:
+            selected_voice = None
+
+        if selected_voice:
+            self.voice = selected_voice[0]
+        else:
+            logger.warn(
+                "Given `voice_name` or `voice_id` not found (or not provided). "
+                f"Defaulting to {available_voices[0].name}"
+            )
+            self.voice = available_voices[0]
+
+        self.model = model
+
+        if voice_settings:
+            if isinstance(voice_settings, dict):
+                if not voice_settings.get("stability") or not voice_settings.get(
+                    "similarity_boost"
+                ):
+                    raise KeyError(
+                        "Missing required keys: 'stability' and 'similarity_boost'. "
+                        "Required for setting voice setting"
+                    )
+                self.voice_settings = VoiceSettings(
+                    stability=voice_settings["stability"],
+                    similarity_boost=voice_settings["similarity_boost"],
+                    style=voice_settings.get("style", 0),
+                    use_speaker_boost=voice_settings.get("use_speaker_boost", True),
+                )
+            elif isinstance(voice_settings, VoiceSettings):
+                self.voice_settings = voice_settings
+            else:
+                raise TypeError(
+                    "voice_settings must be a VoiceSettings object or a dictionary"
+                )
+
+            # apply voice settings to voice
+            self.voice = Voice(
+                voice_id=self.voice.voice_id, settings=self.voice_settings
+            )
+        SpeechService.__init__(self, transcription_model=transcription_model, **kwargs)
+
+    def generate_from_text(
+        self,
+        text: str,
+        cache_dir: Optional[str] = None,
+        path: Optional[str] = None,
+        **kwargs,
+    ) -> dict:
+        if cache_dir is None:
+            cache_dir = self.cache_dir  # type: ignore
+
+        input_text = remove_bookmarks(text)
+        input_data = {
+            "input_text": input_text,
+            "service": "elevenlabs",
+            "voice_id": self.voice.voice_id,
+            "model": self.model,
+            "voice_settings": self.voice.model_dump(exclude_none=True),
+        }
+
+        # if not config.disable_caching:
+        cached_result = self.get_cached_result(input_data, cache_dir)
+
+        if cached_result is not None:
+            return cached_result
+
+        if path is None:
+            audio_path = self.get_audio_basename(input_data) + ".mp3"
+        else:
+            audio_path = path
+        try:
+            audio = generate(text=text, voice=self.voice, model=self.model)
+            save(audio, str(Path(cache_dir) / audio_path))  # type: ignore
+        except Exception as e:
+            logger.error(e)
+            raise Exception("Failed to initialize ElevenLabs.")
+
+        json_dict = {
+            "input_text": text,
+            "input_data": input_data,
+            "original_audio": audio_path,
+        }
+
+        return json_dict