@@ -501,6 +501,150 @@ class LanguageDetectionOptions(BaseModel):
501501 )
502502
503503
class SpeakerType(str, Enum):
    """
    Kind of label used to identify speakers in speech understanding.

    Members mix in `str`, so each member compares equal to its plain
    string value (for example `SpeakerType.role == "role"`).
    """

    role = "role"
    "Identify speakers by their role"

    # A member called `name` is legal on an Enum: `SpeakerType.name` is the
    # member itself, while `<member>.name` still gives the member's own name.
    name = "name"
    "Identify speakers by their name"
514+
515+
class SpeakerIdentificationRequest(BaseModel):
    """
    Request settings for the speaker identification feature of
    speech understanding.

    `speaker_type` must always be supplied; `known_values` may be left out,
    in which case it defaults to `None`.
    """

    speaker_type: SpeakerType
    "The type of speaker identification to perform"

    # The "required when speaker_type is 'role'" rule is only stated in the
    # description below; this model declares no validator that enforces it.
    known_values: Optional[List[str]] = None
    "Known speaker values (required when speaker_type is 'role')"
526+
527+
class TranslationRequest(BaseModel):
    """
    Request settings for the translation feature of speech understanding.

    Only `target_languages` has to be provided; the two boolean switches
    both default to `False`.
    """

    target_languages: List[str]
    "List of target language codes to translate the transcript into"

    formal: Optional[bool] = False
    "Whether to use formal language in translations (default: False)"

    match_original_utterance: Optional[bool] = False
    "Whether to match the original utterance structure in translations (default: False)"
541+
542+
class CustomFormattingRequest(BaseModel):
    """
    Request settings for the custom formatting feature of
    speech understanding.

    Every pattern is optional and defaults to `None`.
    """

    date: Optional[str] = None
    "Custom date format pattern (e.g., 'mm/dd/yyyy')"

    phone_number: Optional[str] = None
    "Custom phone number format pattern (e.g., '(xxx)xxx-xxxx')"

    email: Optional[str] = None
    "Custom email format pattern (e.g., 'username@domain.com')"
556+
557+
class SpeechUnderstandingFeatureRequests(BaseModel):
    """
    Container grouping the per-feature speech understanding requests.

    Each feature has its own optional slot; a slot left at `None` carries
    no configuration for that feature.
    """

    speaker_identification: Optional[SpeakerIdentificationRequest] = None
    "Speaker identification configuration"

    translation: Optional[TranslationRequest] = None
    "Translation configuration"

    custom_formatting: Optional[CustomFormattingRequest] = None
    "Custom formatting configuration"
571+
572+
class SpeechUnderstandingRequest(BaseModel):
    """
    Top-level speech understanding request for LLM Gateway features.

    Wraps the per-feature requests under the single `request` field,
    which is optional and defaults to `None`.
    """

    request: Optional[SpeechUnderstandingFeatureRequests] = None
    "The speech understanding feature requests"
580+
581+
class SpeakerIdentificationResponse(BaseModel):
    """
    Result of the speaker identification feature.

    `status` is always required; `mapping` is optional and defaults
    to `None`.
    """

    status: str
    "Status of the speaker identification feature (e.g., 'success')"

    mapping: Optional[Dict[str, str]] = None
    "Mapping of original speaker labels to identified speaker labels"
592+
593+
class CustomFormattingResponse(BaseModel):
    """
    Result of the custom formatting feature.

    Holds the entity mapping, the formatted transcript text and the
    formatted utterances, all optional, together with a required `status`.
    """

    mapping: Optional[Dict[str, str]] = None
    "Mapping of original entities to formatted entities"

    formatted_text: Optional[str] = None
    "Full transcript text with formatted entities"

    # Utterances are kept as untyped dictionaries; their keys are not
    # described by this model.
    formatted_utterances: Optional[List[Dict[str, Any]]] = None
    "List of utterances with formatted text"

    # Required, even though it is declared after the optional fields.
    status: str
    "Status of the custom formatting feature"
610+
611+
class TranslationResponse(BaseModel):
    """
    Result of the translation feature.

    Carries only the required `status` of the feature.
    """

    status: str
    "Status of the translation feature"
619+
620+
class SpeechUnderstandingFeatureResponses(BaseModel):
    """
    Container grouping the per-feature speech understanding results.

    Each feature has its own optional slot, defaulting to `None`.
    """

    speaker_identification: Optional[SpeakerIdentificationResponse] = None
    "Speaker identification results including status and mapping"

    translation: Optional[TranslationResponse] = None
    "Translation results"

    custom_formatting: Optional[CustomFormattingResponse] = None
    "Custom formatting results"
634+
635+
class SpeechUnderstandingResponse(BaseModel):
    """
    Speech understanding section of a response.

    Pairs the original feature requests (`request`) with the per-feature
    results (`response`); both are optional and default to `None`.
    """

    request: Optional[SpeechUnderstandingFeatureRequests] = None
    "The original speech understanding request"

    response: Optional[SpeechUnderstandingFeatureResponses] = None
    "The speech understanding feature responses"
646+
647+
504648class SpeakerOptions (BaseModel ):
505649 """
506650 Speaker options for controlling speaker diarization parameters
@@ -671,6 +815,9 @@ class RawTranscriptionConfig(BaseModel):
671815 language_codes : Optional [List [Union [str , LanguageCode ]]] = None
672816 "List of language codes detected in the audio file when language detection is enabled"
673817
818+ speech_understanding : Optional [SpeechUnderstandingRequest ] = None
819+ "Speech understanding configuration for LLM Gateway features"
820+
674821 model_config = ConfigDict (extra = "allow" )
675822
676823
@@ -719,6 +866,7 @@ def __init__(
719866 speech_models : Optional [List [str ]] = None ,
720867 prompt : Optional [str ] = None ,
721868 keyterms_prompt : Optional [List [str ]] = None ,
869+ speech_understanding : Optional [SpeechUnderstandingRequest ] = None ,
722870 ) -> None :
723871 """
724872 Args:
@@ -760,6 +908,7 @@ def __init__(
760908 language_detection_options: Options for controlling the behavior or Automatic Language Detection.
761909 speech_threshold: Reject audio files that contain less than this fraction of speech. Valid values are in the range [0,1] inclusive.
762910 raw_transcription_config: Create the config from a `RawTranscriptionConfig`
911+ speech_understanding: Speech understanding configuration for LLM Gateway features (speaker identification, translation, custom formatting)
763912 """
764913 self ._raw_transcription_config = (
765914 raw_transcription_config
@@ -813,6 +962,7 @@ def __init__(
813962 self .speech_models = speech_models
814963 self .prompt = prompt
815964 self .keyterms_prompt = keyterms_prompt
965+ self .speech_understanding = speech_understanding
816966
817967 @property
818968 def raw (self ) -> RawTranscriptionConfig :
@@ -871,6 +1021,18 @@ def keyterms_prompt(self, keyterms_prompt: Optional[List[str]]) -> None:
8711021 "Sets the prompt to use for the transcription."
8721022 self ._raw_transcription_config .keyterms_prompt = keyterms_prompt
8731023
@property
def speech_understanding(self) -> Optional[SpeechUnderstandingRequest]:
    """
    The speech understanding configuration for LLM Gateway features.

    Read straight from the wrapped raw transcription config.
    """
    raw_config = self._raw_transcription_config
    return raw_config.speech_understanding
1028+
@speech_understanding.setter
def speech_understanding(
    self, speech_understanding: Optional[SpeechUnderstandingRequest]
) -> None:
    """
    Sets the speech understanding configuration for LLM Gateway features.

    The value is stored on the wrapped raw transcription config.
    """
    raw_config = self._raw_transcription_config
    raw_config.speech_understanding = speech_understanding
1035+
8741036 @property
8751037 def punctuate (self ) -> Optional [bool ]:
8761038 "Returns the status of the Automatic Punctuation feature."
@@ -1649,6 +1811,8 @@ class UtteranceWord(Word):
16491811
class Utterance(UtteranceWord):
    """
    An utterance: the fields inherited from `UtteranceWord` plus the
    words it is made of and, optionally, translations of its text.
    """

    words: List[UtteranceWord]

    # NOTE(review): keys are presumably target language codes - confirm
    # against the API response.
    translated_texts: Optional[Dict[str, str]] = None
    "Translations of the utterance text when translation is enabled"
16521816
16531817
16541818class Chapter (BaseModel ):
@@ -1940,6 +2104,9 @@ class BaseTranscript(BaseModel):
19402104 keyterms_prompt : Optional [List [str ]] = None
19412105 "The list of key terms used to generate the transcript with the Slam-1 speech model. Can't be used together with `prompt`."
19422106
2107+ speech_understanding : Optional [SpeechUnderstandingRequest ] = None
2108+ "Speech understanding configuration for LLM Gateway features"
2109+
19432110
19442111class TranscriptRequest (BaseTranscript ):
19452112 """
@@ -2014,6 +2181,12 @@ class TranscriptResponse(BaseTranscript):
20142181 keyterms_prompt : Optional [List [str ]] = None
20152182 "When Slam-1 is enabled, the list of key terms used to generate the transcript"
20162183
2184+ speech_understanding : Optional [SpeechUnderstandingResponse ] = None
2185+ "Speech understanding response when enabled"
2186+
2187+ translated_texts : Optional [Dict [str , str ]] = None
2188+ "Translations of the full transcript text when translation is enabled"
2189+
20172190 def __init__ (self , ** data : Any ):
20182191 # cleanup the response before creating the object
20192192 if not data .get ("iab_categories_result" ) or (
0 commit comments