Skip to content

Commit 8ed6493

Browse files
he-james and AssemblyAI authored
chore: sync sdk code with DeepLearning repo (#143)
Co-authored-by: AssemblyAI <engineering.sdk@assemblyai.com>
1 parent 9f7f5df commit 8ed6493

File tree

2 files changed

+174
-1
lines changed

2 files changed

+174
-1
lines changed

assemblyai/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.45.2"
1+
__version__ = "0.45.3"

assemblyai/types.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,150 @@ class LanguageDetectionOptions(BaseModel):
501501
)
502502

503503

504+
class SpeakerType(str, Enum):
    """
    Speaker identification type for speech understanding.

    The class mixes in `str`, so every member is also a plain string equal
    to its value.
    """

    role = "role"
    "Identify speakers by their role"

    name = "name"
    "Identify speakers by their name"
514+
515+
516+
class SpeakerIdentificationRequest(BaseModel):
    """
    Speaker identification configuration for speech understanding
    """

    speaker_type: SpeakerType
    "The type of speaker identification to perform"

    # NOTE(review): the description below says the values are required for
    # the 'role' speaker type, but this model declares no validator for that
    # rule -- presumably it is enforced by the API; confirm.
    known_values: Optional[List[str]] = None
    "Known speaker values (required when speaker_type is 'role')"
526+
527+
528+
class TranslationRequest(BaseModel):
    """
    Translation configuration for speech understanding
    """

    # The only required field; the two flags below default to False.
    target_languages: List[str]
    "List of target language codes to translate the transcript into"

    formal: Optional[bool] = False
    "Whether to use formal language in translations (default: False)"

    match_original_utterance: Optional[bool] = False
    "Whether to match the original utterance structure in translations (default: False)"
541+
542+
543+
class CustomFormattingRequest(BaseModel):
    """
    Custom formatting configuration for speech understanding

    Every field is an optional format pattern string that defaults to None.
    """

    date: Optional[str] = None
    "Custom date format pattern (e.g., 'mm/dd/yyyy')"

    phone_number: Optional[str] = None
    "Custom phone number format pattern (e.g., '(xxx)xxx-xxxx')"

    email: Optional[str] = None
    "Custom email format pattern (e.g., 'username@domain.com')"
556+
557+
558+
class SpeechUnderstandingFeatureRequests(BaseModel):
    """
    Speech understanding feature requests

    Groups the per-feature request models; each one is optional and
    defaults to None.
    """

    speaker_identification: Optional[SpeakerIdentificationRequest] = None
    "Speaker identification configuration"

    translation: Optional[TranslationRequest] = None
    "Translation configuration"

    custom_formatting: Optional[CustomFormattingRequest] = None
    "Custom formatting configuration"
571+
572+
573+
class SpeechUnderstandingRequest(BaseModel):
    """
    Speech understanding request configuration for LLM Gateway features

    Wraps the feature requests under a single `request` field.
    """

    request: Optional[SpeechUnderstandingFeatureRequests] = None
    "The speech understanding feature requests"
580+
581+
582+
class SpeakerIdentificationResponse(BaseModel):
    """
    Speaker identification response containing status and mapping
    """

    # Required; `mapping` below is optional and defaults to None.
    status: str
    "Status of the speaker identification feature (e.g., 'success')"

    mapping: Optional[Dict[str, str]] = None
    "Mapping of original speaker labels to identified speaker labels"
592+
593+
594+
class CustomFormattingResponse(BaseModel):
    """
    Custom formatting response containing mapping and formatted texts
    """

    mapping: Optional[Dict[str, str]] = None
    "Mapping of original entities to formatted entities"

    formatted_text: Optional[str] = None
    "Full transcript text with formatted entities"

    # Utterances are kept as untyped dictionaries rather than a model class.
    formatted_utterances: Optional[List[Dict[str, Any]]] = None
    "List of utterances with formatted text"

    # The only required field of this model.
    status: str
    "Status of the custom formatting feature"
610+
611+
612+
class TranslationResponse(BaseModel):
    """
    Translation response containing status
    """

    status: str
    "Status of the translation feature"
619+
620+
621+
class SpeechUnderstandingFeatureResponses(BaseModel):
    """
    Speech understanding feature responses grouped together

    Each per-feature response is optional and defaults to None.
    """

    speaker_identification: Optional[SpeakerIdentificationResponse] = None
    "Speaker identification results including status and mapping"

    translation: Optional[TranslationResponse] = None
    "Translation results"

    custom_formatting: Optional[CustomFormattingResponse] = None
    "Custom formatting results"
634+
635+
636+
class SpeechUnderstandingResponse(BaseModel):
    """
    Speech understanding response containing both request and response
    """

    request: Optional[SpeechUnderstandingFeatureRequests] = None
    "The original speech understanding request"

    response: Optional[SpeechUnderstandingFeatureResponses] = None
    "The speech understanding feature responses"
646+
647+
504648
class SpeakerOptions(BaseModel):
505649
"""
506650
Speaker options for controlling speaker diarization parameters
@@ -671,6 +815,9 @@ class RawTranscriptionConfig(BaseModel):
671815
language_codes: Optional[List[Union[str, LanguageCode]]] = None
672816
"List of language codes detected in the audio file when language detection is enabled"
673817

818+
speech_understanding: Optional[SpeechUnderstandingRequest] = None
819+
"Speech understanding configuration for LLM Gateway features"
820+
674821
model_config = ConfigDict(extra="allow")
675822

676823

@@ -719,6 +866,7 @@ def __init__(
719866
speech_models: Optional[List[str]] = None,
720867
prompt: Optional[str] = None,
721868
keyterms_prompt: Optional[List[str]] = None,
869+
speech_understanding: Optional[SpeechUnderstandingRequest] = None,
722870
) -> None:
723871
"""
724872
Args:
@@ -760,6 +908,7 @@ def __init__(
760908
language_detection_options: Options for controlling the behavior or Automatic Language Detection.
761909
speech_threshold: Reject audio files that contain less than this fraction of speech. Valid values are in the range [0,1] inclusive.
762910
raw_transcription_config: Create the config from a `RawTranscriptionConfig`
911+
speech_understanding: Speech understanding configuration for LLM Gateway features (speaker identification, translation, custom formatting)
763912
"""
764913
self._raw_transcription_config = (
765914
raw_transcription_config
@@ -813,6 +962,7 @@ def __init__(
813962
self.speech_models = speech_models
814963
self.prompt = prompt
815964
self.keyterms_prompt = keyterms_prompt
965+
self.speech_understanding = speech_understanding
816966

817967
@property
818968
def raw(self) -> RawTranscriptionConfig:
@@ -871,6 +1021,18 @@ def keyterms_prompt(self, keyterms_prompt: Optional[List[str]]) -> None:
8711021
"Sets the prompt to use for the transcription."
8721022
self._raw_transcription_config.keyterms_prompt = keyterms_prompt
8731023

1024+
@property
def speech_understanding(self) -> Optional[SpeechUnderstandingRequest]:
    "The speech understanding configuration for LLM Gateway features."
    # The value is read from the wrapped raw transcription config.
    return self._raw_transcription_config.speech_understanding
1028+
1029+
@speech_understanding.setter
def speech_understanding(
    self, speech_understanding: Optional[SpeechUnderstandingRequest]
) -> None:
    "Sets the speech understanding configuration for LLM Gateway features."
    # The value is written straight through to the wrapped raw config.
    self._raw_transcription_config.speech_understanding = speech_understanding
1035+
8741036
@property
8751037
def punctuate(self) -> Optional[bool]:
8761038
"Returns the status of the Automatic Punctuation feature."
@@ -1649,6 +1811,8 @@ class UtteranceWord(Word):
16491811

16501812
class Utterance(UtteranceWord):
    # The individual words of the utterance (required).
    words: List[UtteranceWord]

    # Optional; None by default.
    # NOTE(review): keys are presumably target language codes -- confirm
    # against the API response.
    translated_texts: Optional[Dict[str, str]] = None
    "Translations of the utterance text when translation is enabled"
16521816

16531817

16541818
class Chapter(BaseModel):
@@ -1940,6 +2104,9 @@ class BaseTranscript(BaseModel):
19402104
keyterms_prompt: Optional[List[str]] = None
19412105
"The list of key terms used to generate the transcript with the Slam-1 speech model. Can't be used together with `prompt`."
19422106

2107+
speech_understanding: Optional[SpeechUnderstandingRequest] = None
2108+
"Speech understanding configuration for LLM Gateway features"
2109+
19432110

19442111
class TranscriptRequest(BaseTranscript):
19452112
"""
@@ -2014,6 +2181,12 @@ class TranscriptResponse(BaseTranscript):
20142181
keyterms_prompt: Optional[List[str]] = None
20152182
"When Slam-1 is enabled, the list of key terms used to generate the transcript"
20162183

2184+
speech_understanding: Optional[SpeechUnderstandingResponse] = None
2185+
"Speech understanding response when enabled"
2186+
2187+
translated_texts: Optional[Dict[str, str]] = None
2188+
"Translations of the full transcript text when translation is enabled"
2189+
20172190
def __init__(self, **data: Any):
20182191
# cleanup the response before creating the object
20192192
if not data.get("iab_categories_result") or (

0 commit comments

Comments
 (0)