From 2b5163fdb3bb0eb958d8b6d9ae1cfb4d28b878bb Mon Sep 17 00:00:00 2001 From: Adam Sypniewski Date: Tue, 8 Nov 2022 12:01:22 -0500 Subject: [PATCH] Added Deepgram as a speech recognition provider. --- README.rst | 2 + examples/audio_transcribe.py | 9 ++ examples/extended_results.py | 10 ++ examples/microphone_recognition.py | 9 ++ examples/special_recognizer_features.py | 9 ++ reference/library-reference.rst | 11 +++ speech_recognition/__init__.py | 126 ++++++++++++++++++++++++ tests/test_recognition.py | 7 ++ tests/test_special_features.py | 6 ++ 9 files changed, 189 insertions(+) diff --git a/README.rst b/README.rst index d4aadafc..bd7d0648 100644 --- a/README.rst +++ b/README.rst @@ -39,6 +39,7 @@ Speech recognition engine/API support: * `Tensorflow `__ * `Vosk API `__ (works offline) * `OpenAI whisper `__ (works offline) +* `Deepgram `__ **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details. @@ -377,6 +378,7 @@ Authors tb0hdan (Bohdan Turkynewych) Thynix (Steve Dougherty) beeedy (Broderick Carlin) + ajsyp (Adam Sypniewski) Please report bugs and suggestions at the `issue tracker `__! diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py index 7806023f..167f208d 100644 --- a/examples/audio_transcribe.py +++ b/examples/audio_transcribe.py @@ -87,3 +87,12 @@ print("IBM Speech to Text could not understand audio") except sr.RequestError as e: print("Could not request results from IBM Speech to Text service; {0}".format(e)) + +# recognize speech using Deepgram Speech to Text +DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API SECRET HERE" # Deepgram API secrets are 40-character lowercase hexadecimal strings. 
+try: + print("Deepgram thinks you said " + r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET)) +except sr.UnknownValueError: + print("Deepgram could not understand audio") +except sr.RequestError as e: + print("Could not request results from Deepgram; {0}".format(e)) diff --git a/examples/extended_results.py b/examples/extended_results.py index 599c67f2..eaff9297 100644 --- a/examples/extended_results.py +++ b/examples/extended_results.py @@ -87,3 +87,13 @@ print("IBM Speech to Text could not understand audio") except sr.RequestError as e: print("Could not request results from IBM Speech to Text service; {0}".format(e)) + +# recognize speech using Deepgram Speech to Text +DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE" # Deepgram API key secrets are 40-character lowercase hexadecimal strings. +try: + print("Deepgram results:") + pprint(r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET, show_all=True)) +except sr.UnknownValueError: + print("Deepgram could not understand audio") +except sr.RequestError as e: + print("Could not request results from Deepgram; {0}".format(e)) diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py index 56168b29..f64d1959 100644 --- a/examples/microphone_recognition.py +++ b/examples/microphone_recognition.py @@ -92,3 +92,12 @@ print("Whisper could not understand audio") except sr.RequestError as e: print("Could not request results from Whisper") + +# recognize speech using Deepgram Speech to Text +DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE" # Deepgram API key secrets are 40-character lowercase hexadecimal strings. 
+try: + print("Deepgram thinks you said " + r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET)) +except sr.UnknownValueError: + print("Deepgram could not understand audio") +except sr.RequestError as e: + print("Could not request results from Deepgram; {0}".format(e)) diff --git a/examples/special_recognizer_features.py b/examples/special_recognizer_features.py index f4365297..e7a00b8a 100644 --- a/examples/special_recognizer_features.py +++ b/examples/special_recognizer_features.py @@ -44,3 +44,12 @@ print("Google Cloud Speech could not understand audio") except sr.RequestError as e: print("Could not request results from Google Cloud Speech service; {0}".format(e)) + +# boost keyword detection in speech using Deepgram Speech to Text +DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE" # Deepgram API key secrets are 40-character lowercase hexadecimal strings. +try: + print("Deepgram thinks you said " + r.recognize_deepgram(audio_en, key=DEEPGRAM_API_SECRET, keywords=['elephant:10'])) +except sr.UnknownValueError: + print("Deepgram could not understand audio") +except sr.RequestError as e: + print("Could not request results from Deepgram; {0}".format(e)) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 7323bd9b..74777291 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -314,6 +314,17 @@ You can translate the result to english with Whisper by passing translate=True Other values are passed directly to whisper. 
See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options +``recognizer_instance.recognize_deepgram(audio_data: AudioData, key: str, tier: Optional[str] = "enhanced", model: Optional[str] = "general", version: Optional[str] = "latest", language: Optional[str] = "en-US", detect_language: bool = False, punctuate: bool = True, profanity_filter: bool = False, redact: Optional[str] = None, diarize: bool = False, diarize_version: Optional[str] = None, ner: bool = True, multichannel: bool = False, alternatives: int = 1, numerals: bool = True, search: Optional[Iterable[str]] = None, replace: Optional[Dict[str, str]] = None, keywords: Optional[Iterable[str]] = None, paragraphs: bool = False, summarize: bool = False, detect_topics: bool = False, utterances: bool = False, utt_split: Optional[float] = None, show_all: bool = False) -> Union[str, Dict[str, Any]]`` +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +Performs speech recognition of ``audio_data`` (an ``AudioData`` instance) using the Deepgram speech recognition API. + +Deepgram uses an API secret to authenticate users and authorize requests. To obtain an API secret, create an account with `Deepgram `__. The API secret is a 40-character hexadecimal string that can only be retrieved at creation time. It is identified using a UUID, which is not the API secret and shouldn't be used here. + +If ``show_all`` is false (the default), returns the most likely transcript string; otherwise, returns the raw API JSON response. + +Details of the various features can be found in the `Deepgram Documentation `__. 
+
 
 ``AudioSource``
 ---------------
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 5abd6118..62fca56a 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1726,6 +1726,133 @@ def recognize_vosk(self, audio_data, language='en'):
 
         return finalRecognition
 
+    def recognize_deepgram(
+        self,
+        audio_data,
+        key,
+        tier='enhanced',
+        model='general',
+        version='latest',
+        language='en-US',
+        detect_language=False,
+        punctuate=True,
+        profanity_filter=False,
+        redact=None,
+        diarize=False,
+        diarize_version=None,
+        ner=True,
+        multichannel=False,
+        alternatives=1,
+        numerals=True,
+        search=None,
+        replace=None,
+        keywords=None,
+        paragraphs=False,
+        summarize=False,
+        detect_topics=False,
+        utterances=False,
+        utt_split=None,
+        show_all=False
+    ):
+        """
+        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance) using the Deepgram speech recognition API.
+
+        Deepgram uses an API secret to authenticate users and authorize requests. To obtain an API secret, create an account with `Deepgram <https://deepgram.com/>`__. The API secret is a 40-character hexadecimal string that can only be retrieved at creation time. It is identified using a UUID, which is not the API secret and shouldn't be used here.
+
+        Every option other than ``audio_data``, ``key`` and ``show_all`` is sent to Deepgram as a query parameter of the same name; options set to ``None`` are omitted. ``search`` and ``keywords`` are lists (or tuples) of strings and are sent as repeated parameters; ``replace`` is a dictionary mapping each term to its replacement. Details of the various features can be found in the `Deepgram Documentation <https://developers.deepgram.com/>`__.
+
+        If ``show_all`` is false (the default), returns the most likely transcript string of the first channel; otherwise, returns the raw API response as a JSON dictionary.
+
+        Raises a ``speech_recognition.UnknownValueError`` exception if the response contains no transcript or the transcript is empty. Raises a ``speech_recognition.RequestError`` exception if the request is rejected (for example because the key isn't valid) or if the connection fails.
+        """
+        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
+        assert isinstance(key, str), "``key`` must be a string"
+        assert tier is None or (isinstance(tier, str) and tier in {'base', 'enhanced'}), "invalid ``tier``"
+        assert model is None or isinstance(model, str), "``model`` must be None or a string"
+        assert version is None or isinstance(version, str), "``version`` must be None or a string"
+        assert language is None or isinstance(language, str), "``language`` must be None or a string"
+        assert isinstance(detect_language, bool), "``detect_language`` must be a bool"
+        assert isinstance(punctuate, bool), "``punctuate`` must be a bool"
+        assert isinstance(profanity_filter, bool), "``profanity_filter`` must be a bool"
+        assert redact is None or isinstance(redact, str), "``redact`` must be None or a string"
+        assert isinstance(diarize, bool), "``diarize`` must be a bool"
+        assert diarize_version is None or isinstance(diarize_version, str), "``diarize_version`` must be None or a string"
+        assert isinstance(ner, bool), "``ner`` must be a bool"
+        assert isinstance(multichannel, bool), "``multichannel`` must be a bool"
+        assert isinstance(alternatives, int) and not isinstance(alternatives, bool) and alternatives > 0, "``alternatives`` must be a positive integer"
+        assert isinstance(numerals, bool), "``numerals`` must be a bool"
+        assert search is None or (isinstance(search, (list, tuple)) and all(isinstance(s, str) for s in search)), "``search`` must be None or a list of strings"
+        assert replace is None or (isinstance(replace, dict) and all(isinstance(k, str) and isinstance(v, str) for k, v in replace.items())), "``replace`` must be None or a dictionary with string keys and values"
+        assert keywords is None or (isinstance(keywords, (list, tuple)) and all(isinstance(s, str) for s in keywords)), "``keywords`` must be None or a list of strings"
+        assert isinstance(paragraphs, bool), "``paragraphs`` must be a bool"
+        assert isinstance(summarize, bool), "``summarize`` must be a bool"
+        assert isinstance(detect_topics, bool), "``detect_topics`` must be a bool"
+        assert isinstance(utterances, bool), "``utterances`` must be a bool"
+        assert utt_split is None or (isinstance(utt_split, (int, float)) and utt_split > 0), "``utt_split`` must be None or positive real number"
+
+        def convert_bool(x):
+            # booleans go on the query string as lowercase ``true``/``false``; everything else is passed through unchanged
+            return str(x).lower() if isinstance(x, bool) else x
+
+        options = (
+            ('tier', tier),
+            ('model', model),
+            ('version', version),
+            ('language', language),
+            ('detect_language', detect_language),
+            ('punctuate', punctuate),
+            ('profanity_filter', profanity_filter),
+            ('redact', redact),
+            ('diarize', diarize),
+            ('diarize_version', diarize_version),
+            ('ner', ner),
+            ('multichannel', multichannel),
+            ('alternatives', alternatives),
+            ('numerals', numerals),
+            ('paragraphs', paragraphs),
+            ('summarize', summarize),
+            ('detect_topics', detect_topics),
+            ('utterances', utterances),
+            ('utt_split', utt_split),
+        )
+        params = [(name, convert_bool(value)) for name, value in options if value is not None]
+        if search is not None:
+            params.extend(('search', term) for term in search)
+        if keywords is not None:
+            params.extend(('keywords', keyword) for keyword in keywords)
+        if replace is not None:
+            # a bare colon separates a term from its replacement, so colons inside either one are pre-escaped as ``%3a``; NOTE(review): ``urlencode`` below escapes the ``%`` again - confirm Deepgram expects this double encoding
+            for find, substitute in replace.items():
+                params.append(('replace', '{}:{}'.format(find.replace(':', '%3a'), substitute.replace(':', '%3a'))))
+
+        url = 'https://api.deepgram.com/v1/listen?{}'.format(urlencode(params))
+        request = Request(url, data=audio_data.get_wav_data(), headers={'authorization': f'token {key}'})
+        try:
+            response = urlopen(request, timeout=self.operation_timeout)
+        except HTTPError as e:
+            raise RequestError("recognition request failed: {}".format(e.reason))
+        except URLError as e:
+            raise RequestError("recognition connection failed: {}".format(e.reason))
+
+        # always release the connection, even if the body is not valid JSON
+        try:
+            result = json.load(response)
+        finally:
+            response.close()
+
+        if show_all:
+            return result
+
+        # a response without a usable transcript means the audio could not be understood
+        try:
+            transcript = result['results']['channels'][0]['alternatives'][0]['transcript']
+        except (KeyError, IndexError, TypeError):
+            raise UnknownValueError()
+        if not transcript:
+            raise UnknownValueError()
+        return transcript
+
+
 def get_flac_converter():
     """Returns the
absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" flac_converter = shutil_which("flac") # check for installed version first diff --git a/tests/test_recognition.py b/tests/test_recognition.py index 5759d657..0bfcc18b 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -97,5 +97,12 @@ def test_whisper_chinese(self): with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳") + @unittest.skipUnless("DEEPGRAM_API_SECRET" in os.environ, "requires Deepgram API secret to be specified in DEEPGRAM_API_SECRET environment variables") + def test_deepgram(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_deepgram(audio, key=os.environ["DEEPGRAM_API_SECRET"]), "123") + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_special_features.py b/tests/test_special_features.py index f249356e..0d6123a9 100644 --- a/tests/test_special_features.py +++ b/tests/test_special_features.py @@ -25,6 +25,12 @@ def assertSameWords(self, tested, reference, msg=None): if set_tested != set_reference: raise self.failureException(msg if msg is not None else "%r doesn't consist of the same words as %r" % (tested, reference)) + @unittest.skipUnless("DEEPGRAM_API_SECRET" in os.environ, "requires Deepgram API secret to be specified in DEEPGRAM_API_SECRET environment variables") + def test_deepgram_keywords(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_deepgram(audio, key=os.environ["DEEPGRAM_API_SECRET"], tier='base', keywords=['elephant:1000000']), "elephant elephant elephant") + if __name__ == "__main__": unittest.main()