cartesia-ai · noahlt · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024
diff --git a/fern/definition/voices.yml b/fern/definition/voices.yml
@@ -42,6 +42,31 @@ types:
       language: tts.SupportedLanguage
       base_voice_id: optional<BaseVoiceId>
 
+  VoiceMetadata:  # Descriptive fields of a voice; the `clone` endpoint responds with this type.
+    properties:
+      id: VoiceId
+      user_id:
+        type: string
+        docs: |
+          The ID of the user who owns the voice.
+      is_public:
+        type: boolean
+        docs: |
+          Whether the voice is publicly accessible.
+      name:  # no `&name` anchor: `*name` in CreateVoiceRequest predates this hunk, so one is declared earlier
+        type: string
+        docs: |
+          The name of the voice.
+      description: &description  # NOTE(review): if `&description` is also declared earlier, drop this anchor too
+        type: string
+        docs: |
+          The description of the voice.
+      created_at:
+        type: datetime
+        docs: |
+          The date and time the voice was created.
+      language: tts.SupportedLanguage
+
   CreateVoiceRequest:
     properties:
       name: *name
@@ -130,6 +155,11 @@ types:
       - IdSpecifier
       - EmbeddingSpecifier
 
+  CloneMode:  # Tradeoff selector sent as `mode` in the clone request.
+    enum:
+      - similarity  # sounds more like the source clip; may reproduce background noise
+      - stability  # sounds like a studio recording; may be less similar to the source clip
+
 service:
   base-path: /voices
   auth: true
@@ -185,6 +215,77 @@ service:
       request: MixVoicesRequest
       response: EmbeddingResponse
 
+    clone:  # "Clone Voice": POST an audio clip plus settings; the response is a VoiceMetadata.
+      path: /clone
+      method: POST
+      display-name: Clone Voice
+      request:
+        name: CloneVoiceRequest
+        body:
+          properties:
+            clip: file  # the audio clip to clone from
+            name:
+              type: string
+              docs: |
+                The name of the voice.
+            description:
+              type: optional<string>
+              docs: |
+                A description for the voice.
+            language:
+              type: tts.SupportedLanguage
+              docs: |
+                The language of the voice.
+            mode:  # one of the CloneMode enum values
+              type: CloneMode
+              docs: |
+                Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
+            enhance:  # declared as plain boolean, not optional<...>; both examples below send it
+              type: boolean
+              docs: |
+                Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
+            transcript:  # declared optional<string>; only the Similarity example sends it
+              type: optional<string>
+              docs: |
+                Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
+      response: VoiceMetadata
+      examples:
+        - name: Stability  # mode: stability, no transcript
+          request:
+            # clip: file (the upload itself is left out of this example)
+            name: "A high-stability cloned voice"
+            description: "Copied from Cartesia docs"
+            mode: stability
+            language: en
+            enhance: true
+          response:
+            body:  # name and description repeat the values sent in the request
+              id: "df076429-66c6-4f3e-b98b-aee4e2c925ab"
+              user_id: "482aa35e-d86c-42a4-b818-7bdcfe40a858"
+              is_public: false
+              name: "A high-stability cloned voice"
+              description: "Copied from Cartesia docs"
+              created_at: "2024-11-13T07:06:22.476564Z"
+              language: en
+        - name: Similarity  # mode: similarity, with a transcript
+          request:
+            # clip: file (the upload itself is left out of this example)
+            name: "A high-similarity cloned voice"
+            description: "Copied from Cartesia docs"
+            mode: similarity
+            language: en
+            transcript: "A transcript of the words spoken in the audio clip."
+            enhance: true
+          response:
+            body:  # name and description repeat the values sent in the request
+              id: "40248dd5-bfe9-48e2-93f7-ea3f9d5c7f72"
+              user_id: "482aa35e-d86c-42a4-b818-7bdcfe40a858"
+              is_public: false
+              name: "A high-similarity cloned voice"
+              description: "Copied from Cartesia docs"
+              created_at: "2024-11-13T07:06:22.476564Z"
+              language: en
+
     cloneFromClip:
       path: /clone/clip
       method: POST