From 225bd036edf39ea23589e1ad2220246494ea466d Mon Sep 17 00:00:00 2001
From: Patrick Loeber <50772274+patrickloeber@users.noreply.github.com>
Date: Fri, 20 Oct 2023 12:20:42 -0400
Subject: [PATCH] Fixes and improvements for `recognize_assemblyai()` method:

- Adds support for an `AudioData` instance. Before, the method only worked with a path to a file
- Adds more error handling
- Removes the inner `read_file` function, since the requests module handles chunking automatically
- Removes "content-type" from the header, since it is not needed
- Adds a docstring
- Adds an example code snippet in `examples/audio_transcribe.py`
- Lists AssemblyAI in the README
---
 README.rst                     |   3 +-
 examples/audio_transcribe.py   |  21 +++++++
 speech_recognition/__init__.py | 100 ++++++++++++++++++++++++---------
 3 files changed, 96 insertions(+), 28 deletions(-)

diff --git a/README.rst b/README.rst
index 410e289d..5a24e2b1 100644
--- a/README.rst
+++ b/README.rst
@@ -30,6 +30,7 @@ Speech recognition engine/API support:
 * `CMU Sphinx `__ (works offline)
 * Google Speech Recognition
 * `Google Cloud Speech API `__
+* `AssemblyAI API `__
 * `Wit.ai `__
 * `Microsoft Azure Speech `__
 * `Microsoft Bing Voice Recognition (Deprecated) `__
@@ -202,7 +203,7 @@ The solution is to decrease this threshold, or call ``recognizer_instance.adjust
 The recognizer doesn't understand my particular language/dialect.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Try setting the recognition language to your language/dialect. To do this, see the documentation for ``recognizer_instance.recognize_sphinx``, ``recognizer_instance.recognize_google``, ``recognizer_instance.recognize_wit``, ``recognizer_instance.recognize_bing``, ``recognizer_instance.recognize_api``, ``recognizer_instance.recognize_houndify``, and ``recognizer_instance.recognize_ibm``.
+Try setting the recognition language to your language/dialect. To do this, see the documentation for ``recognizer_instance.recognize_sphinx``, ``recognizer_instance.recognize_google``, ``recognizer_instance.recognize_wit``, ``recognizer_instance.recognize_bing``, ``recognizer_instance.recognize_api``, ``recognizer_instance.recognize_houndify``, ``recognizer_instance.recognize_ibm``, and ``recognizer_instance.recognize_assemblyai``.
 
 For example, if your language/dialect is British English, it is better to use ``"en-GB"`` as the language rather than ``"en-US"``.
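To illustrate the first bullet above, here is a minimal sketch of calling the patched method with an ``AudioData`` instance; the recognizer, file name, and token below are placeholders, not part of the patch:

```
import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("speech.wav") as source:  # placeholder audio file
    audio = r.record(source)  # an AudioData instance

# new in this patch: AudioData is accepted directly and converted to FLAC before upload
try:
    r.recognize_assemblyai(audio, api_token="YOUR_ASSEMBLYAI_TOKEN")
except sr.TranscriptionNotReady as e:
    job_name = e.job_name  # transcription id, used to query the transcript later
```

A plain path to an audio file still works as before, as shown in the docstring and example below.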
diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py
index 7806023f..7d9e8869 100644
--- a/examples/audio_transcribe.py
+++ b/examples/audio_transcribe.py
@@ -87,3 +87,24 @@
     print("IBM Speech to Text could not understand audio")
 except sr.RequestError as e:
     print("Could not request results from IBM Speech to Text service; {0}".format(e))
+
+# recognize speech using the AssemblyAI API
+ASSEMBLYAI_API_TOKEN = "INSERT ASSEMBLYAI API TOKEN HERE"  # get a free token at https://www.assemblyai.com/
+
+# First submit the file for transcription and obtain the job_name that corresponds to the transcription id
+try:
+    r.recognize_assemblyai(audio, api_token=ASSEMBLYAI_API_TOKEN)
+except sr.TranscriptionNotReady as e:
+    job_name = e.job_name
+except sr.TranscriptionFailed as e:
+    print(e)
+except sr.RequestError as e:
+    print("Could not request results from AssemblyAI service; {0}".format(e))
+
+# Wait a little bit, then query the transcript with the job_name
+try:
+    print("AssemblyAI thinks you said " + r.recognize_assemblyai(audio_data=None, api_token=ASSEMBLYAI_API_TOKEN, job_name=job_name)[0])
+except sr.TranscriptionFailed as e:
+    print(e)
+except sr.RequestError as e:
+    print("Could not request results from AssemblyAI service; {0}".format(e))
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index bbff8dad..0400bf1a 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1289,33 +1289,51 @@ def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, sec
     def recognize_assemblyai(self, audio_data, api_token, job_name=None, **kwargs):
         """
-        Wraps the AssemblyAI STT service.
+        Performs speech recognition using the AssemblyAI API.
+        https://www.assemblyai.com/
+
+        Args:
+            audio_data: Can be an ``AudioData`` instance or a str with a path to a file.
+            api_token: An AssemblyAI API token.
+            job_name: The name of the job, which corresponds to the transcription id. If no job_name is given, the file is submitted for
+                transcription and a ``speech_recognition.TranscriptionNotReady`` exception is raised. The final transcript can then be
+                queried at a later time by passing the job_name.
+
+        Raises a ``speech_recognition.TranscriptionFailed`` exception if the speech recognition operation failed or if the API token isn't valid.
+        Raises a ``speech_recognition.RequestError`` exception if an API request failed, e.g. if there is no internet connection.
+
+        Example:
+        ```
+        try:
+            r.recognize_assemblyai(audio_data=audio, api_token=your_token)
+        except sr.TranscriptionNotReady as e:
+            job_name = e.job_name
+
+        # wait a little bit...
+        result = r.recognize_assemblyai(audio_data=None, api_token=your_token, job_name=job_name)
+        ```
         """
-        def read_file(filename, chunk_size=5242880):
-            with open(filename, 'rb') as _file:
-                while True:
-                    data = _file.read(chunk_size)
-                    if not data:
-                        break
-                    yield data
+        headers = {"authorization": api_token}
 
         check_existing = audio_data is None and job_name
         if check_existing:
             # Query status.
             transciption_id = job_name
             endpoint = f"https://api.assemblyai.com/v2/transcript/{transciption_id}"
-            headers = {
-                "authorization": api_token,
-            }
-            response = requests.get(endpoint, headers=headers)
+
+            try:
+                response = requests.get(endpoint, headers=headers)
+            except requests.exceptions.RequestException as e:
+                raise RequestError("recognition request failed: {}".format(e))
+
             data = response.json()
             status = data['status']
 
             if status == 'error':
                 # Handle error.
-                exc = TranscriptionFailed()
+                exc = TranscriptionFailed("Transcription failed: {}".format(data["error"]))
                 exc.job_name = None
                 exc.file_key = None
                 raise exc
@@ -1332,24 +1350,52 @@ def read_file(filename, chunk_size=5242880):
             exc.file_key = None
             raise exc
         else:
-            # Upload file.
-            headers = {'authorization': api_token}
-            response = requests.post('https://api.assemblyai.com/v2/upload',
-                                      headers=headers,
-                                      data=read_file(audio_data))
-            upload_url = response.json()['upload_url']
+            # Upload file and queue it for transcription.
+            # This path raises a TranscriptionNotReady exception that contains the job_name.
+            # The job_name can then be used at a later point to query the transcript.
+            if isinstance(audio_data, AudioData):
+                # convert to flac first
+                upload_data = audio_data.get_flac_data(
+                    convert_rate=None if audio_data.sample_rate >= 8000 else 8000,  # audio samples should be at least 8 kHz
+                    convert_width=None if audio_data.sample_width >= 2 else 2  # audio samples should be at least 16-bit
+                )
+            else:
+                # assume audio_data is a path to a file; pass an open file object
+                # so that the requests module streams the upload in chunks
+                upload_data = open(audio_data, "rb")
+
+            try:
+                response = requests.post('https://api.assemblyai.com/v2/upload',
+                                         headers=headers,
+                                         data=upload_data)
+            except requests.exceptions.RequestException as e:
+                raise RequestError("recognition request failed: {}".format(e))
+            finally:
+                if hasattr(upload_data, "close"):
+                    upload_data.close()  # close the file object opened for a path upload
+
+            data = response.json()
+            if "error" in data:
+                exc = TranscriptionFailed("Transcription failed: {}".format(data["error"]))
+                exc.job_name = None
+                exc.file_key = None
+                raise exc
+
+            upload_url = data['upload_url']
 
             # Queue file for transcription.
             endpoint = "https://api.assemblyai.com/v2/transcript"
-            json = {
-                "audio_url": upload_url
-            }
-            headers = {
-                "authorization": api_token,
-                "content-type": "application/json"
-            }
-            response = requests.post(endpoint, json=json, headers=headers)
+            json = {"audio_url": upload_url}
+
+            try:
+                response = requests.post(endpoint, json=json, headers=headers)
+            except requests.exceptions.RequestException as e:
+                raise RequestError("recognition request failed: {}".format(e))
+            data = response.json()
+
+            if "error" in data:
+                exc = TranscriptionFailed("Transcription failed: {}".format(data["error"]))
+                exc.job_name = None
+                exc.file_key = None
+                raise exc
+
             transciption_id = data['id']
             exc = TranscriptionNotReady()
             exc.job_name = transciption_id
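The example in `examples/audio_transcribe.py` only says to "wait a little bit" before querying the transcript. Below is a minimal, self-contained polling sketch for that second step, assuming (this part is not shown in the patch) that querying a job that is still processing raises ``TranscriptionNotReady`` again; the file name, token, and sleep interval are placeholders:

```
import time

import speech_recognition as sr

r = sr.Recognizer()

# submit an audio file by path; the method raises TranscriptionNotReady carrying the job_name
try:
    r.recognize_assemblyai("speech.wav", api_token="YOUR_ASSEMBLYAI_TOKEN")  # placeholder file and token
except sr.TranscriptionNotReady as e:
    job_name = e.job_name

# poll until the transcript is ready; TranscriptionFailed and RequestError propagate as real errors
while True:
    try:
        result = r.recognize_assemblyai(audio_data=None, api_token="YOUR_ASSEMBLYAI_TOKEN", job_name=job_name)
        print("AssemblyAI thinks you said " + result[0])
        break
    except sr.TranscriptionNotReady:
        time.sleep(5)  # assumed polling interval while AssemblyAI is still processing
```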