diff --git a/README.rst b/README.rst
index d4aadafc..bd7d0648 100644
--- a/README.rst
+++ b/README.rst
@@ -39,6 +39,7 @@ Speech recognition engine/API support:
* `Tensorflow `__
* `Vosk API `__ (works offline)
* `OpenAI whisper `__ (works offline)
+* `Deepgram `__
**Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.
@@ -377,6 +378,7 @@ Authors
tb0hdan (Bohdan Turkynewych)
Thynix (Steve Dougherty)
beeedy (Broderick Carlin)
+ ajsyp (Adam Sypniewski)
Please report bugs and suggestions at the `issue tracker `__!
diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py
index 7806023f..167f208d 100644
--- a/examples/audio_transcribe.py
+++ b/examples/audio_transcribe.py
@@ -87,3 +87,12 @@
print("IBM Speech to Text could not understand audio")
except sr.RequestError as e:
print("Could not request results from IBM Speech to Text service; {0}".format(e))
+
+# recognize speech using Deepgram Speech to Text
+DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE" # Deepgram API key secrets are 40-character lowercase hexadecimal strings.
+try:
+ print("Deepgram thinks you said " + r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET))
+except sr.UnknownValueError:
+ print("Deepgram could not understand audio")
+except sr.RequestError as e:
+ print("Could not request results from Deepgram; {0}".format(e))
diff --git a/examples/extended_results.py b/examples/extended_results.py
index 599c67f2..eaff9297 100644
--- a/examples/extended_results.py
+++ b/examples/extended_results.py
@@ -87,3 +87,13 @@
print("IBM Speech to Text could not understand audio")
except sr.RequestError as e:
print("Could not request results from IBM Speech to Text service; {0}".format(e))
+
+# recognize speech using Deepgram Speech to Text
+DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE" # Deepgram API key secrets are 40-character lowercase hexadecimal strings.
+try:
+ print("Deepgram results:")
+ pprint(r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET, show_all=True))
+except sr.UnknownValueError:
+ print("Deepgram could not understand audio")
+except sr.RequestError as e:
+ print("Could not request results from Deepgram; {0}".format(e))
diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py
index 56168b29..f64d1959 100644
--- a/examples/microphone_recognition.py
+++ b/examples/microphone_recognition.py
@@ -92,3 +92,12 @@
print("Whisper could not understand audio")
except sr.RequestError as e:
print("Could not request results from Whisper")
+
+# recognize speech using Deepgram Speech to Text
+DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE" # Deepgram API key secrets are 40-character lowercase hexadecimal strings.
+try:
+ print("Deepgram thinks you said " + r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET))
+except sr.UnknownValueError:
+ print("Deepgram could not understand audio")
+except sr.RequestError as e:
+ print("Could not request results from Deepgram; {0}".format(e))
diff --git a/examples/special_recognizer_features.py b/examples/special_recognizer_features.py
index f4365297..e7a00b8a 100644
--- a/examples/special_recognizer_features.py
+++ b/examples/special_recognizer_features.py
@@ -44,3 +44,12 @@
print("Google Cloud Speech could not understand audio")
except sr.RequestError as e:
print("Could not request results from Google Cloud Speech service; {0}".format(e))
+
+# boost keyword detection in speech using Deepgram Speech to Text
+DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE" # Deepgram API key secrets are 40-character lowercase hexadecimal strings.
+try:
+ print("Deepgram thinks you said " + r.recognize_deepgram(audio_en, key=DEEPGRAM_API_SECRET, keywords=['elephant:10']))
+except sr.UnknownValueError:
+ print("Deepgram could not understand audio")
+except sr.RequestError as e:
+ print("Could not request results from Deepgram; {0}".format(e))
diff --git a/reference/library-reference.rst b/reference/library-reference.rst
index 7323bd9b..74777291 100644
--- a/reference/library-reference.rst
+++ b/reference/library-reference.rst
@@ -314,6 +314,17 @@ You can translate the result to english with Whisper by passing translate=True
Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options
+``recognizer_instance.recognize_deepgram(audio_data: AudioData, key: str, tier: Optional[str] = "enhanced", model: Optional[str] = "general", version: Optional[str] = "latest", language: Optional[str] = "en-US", detect_language: bool = False, punctuate: bool = True, profanity_filter: bool = False, redact: Optional[str] = None, diarize: bool = False, diarize_version: Optional[str] = None, ner: bool = True, multichannel: bool = False, alternatives: int = 1, numerals: bool = True, search: Optional[Iterable[str]] = None, replace: Optional[Dict[str, str]] = None, keywords: Optional[Iterable[str]] = None, paragraphs: bool = False, summarize: bool = False, detect_topics: bool = False, utterances: bool = False, utt_split: Optional[float] = None, show_all: bool = False) -> Union[str, Dict[str, Any]]``
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+Performs speech recognition of ``audio_data`` (an ``AudioData`` instance) using the Deepgram speech recognition API.
+
+Deepgram uses an API secret to authenticate users and authorize requests. To obtain an API secret, create an account with `Deepgram `__. The API secret is a 40-character lowercase hexadecimal string that can only be retrieved when it is created. Each secret is also identified by a UUID; the UUID is not the API secret and should not be passed here.
+
+If ``show_all`` is false (the default), returns the most likely transcript string; otherwise, returns the raw API JSON response.
+
+Details of the various features can be found in the `Deepgram Documentation `__.
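+
+As a minimal usage sketch (the WAV file name below is only an illustration), a typical call might look like::
+
+    import speech_recognition as sr
+
+    r = sr.Recognizer()
+    with sr.AudioFile("example.wav") as source:
+        audio = r.record(source)
+    print(r.recognize_deepgram(audio, key="YOUR 40-CHARACTER API SECRET"))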
+
``AudioSource``
---------------
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 5abd6118..62fca56a 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1726,6 +1726,132 @@ def recognize_vosk(self, audio_data, language='en'):
return finalRecognition
+ def recognize_deepgram(
+ self,
+ audio_data,
+ key,
+ tier='enhanced',
+ model='general',
+ version='latest',
+ language='en-US',
+ detect_language=False,
+ punctuate=True,
+ profanity_filter=False,
+ redact=None,
+ diarize=False,
+ diarize_version=None,
+ ner=True,
+ multichannel=False,
+ alternatives=1,
+ numerals=True,
+ search=None,
+ replace=None,
+ keywords=None,
+ paragraphs=False,
+ summarize=False,
+ detect_topics=False,
+ utterances=False,
+ utt_split=None,
+ show_all=False
+ ):
+ """
+ Performs speech recognition of ``audio_data`` (an ``AudioData`` instance) using the Deepgram speech recognition API.
+
+ Deepgram uses an API secret to authenticate users and authorize requests. To obtain an API secret, create an account with `Deepgram `__. The API secret is a 40-character lowercase hexadecimal string that can only be retrieved when it is created. Each secret is also identified by a UUID; the UUID is not the API secret and should not be passed here.
+
+ If ``show_all`` is false (the default), returns the most likely transcript string; otherwise, returns the raw API JSON response.
+
+ Details of the various features can be found in the `Deepgram Documentation `__.
+ """
+ assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
+ assert isinstance(key, str), "``key`` must be a string"
+ assert tier is None or (isinstance(tier, str) and tier in {'base', 'enhanced'}), "invalid ``tier``"
+ assert model is None or isinstance(model, str), "``model`` must be None or a string"
+ assert version is None or isinstance(version, str), "``version`` must be None or a string"
+ assert language is None or isinstance(language, str), "``language`` must be None or a string"
+ assert isinstance(detect_language, bool), "``detect_language`` must be a bool"
+ assert isinstance(punctuate, bool), "``punctuate`` must be a bool"
+ assert isinstance(profanity_filter, bool), "``profanity_filter`` must be a bool"
+ assert redact is None or isinstance(redact, str), "``redact`` must be None or a string"
+ assert isinstance(diarize, bool), "``diarize`` must be a bool"
+ assert diarize_version is None or isinstance(diarize_version, str), "``diarize_version`` must be None or a string"
+ assert isinstance(ner, bool), "``ner`` must be a bool"
+ assert isinstance(multichannel, bool), "``multichannel`` must be a bool"
+ assert isinstance(alternatives, int) and alternatives > 0, "``alternatives`` must be a positive integer"
+ assert isinstance(numerals, bool), "``numerals`` must be a bool"
+ assert search is None or (isinstance(search, list) and all(isinstance(s, str) for s in search)), "``search`` must be None or a list of strings"
+ assert replace is None or (isinstance(replace, dict) and all(isinstance(k, str) and isinstance(v, str) for k, v in replace.items())), "``replace`` must be None or a dictionary with string keys and values"
+ assert keywords is None or (isinstance(keywords, list) and all(isinstance(s, str) for s in keywords)), "``keywords`` must be None or a list of strings"
+ assert isinstance(paragraphs, bool), "``paragraphs`` must be a bool"
+ assert isinstance(summarize, bool), "``summarize`` must be a bool"
+ assert isinstance(detect_topics, bool), "``detect_topics`` must be a bool"
+ assert isinstance(utterances, bool), "``utterances`` must be a bool"
+ assert utt_split is None or (isinstance(utt_split, (int, float)) and utt_split > 0), "``utt_split`` must be None or positive real number"
+
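+ # helper: convert Python booleans to the lowercase "true"/"false" strings used in the query string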
+ def convert_bool(x):
+ if isinstance(x, bool):
+ return str(x).lower()
+ else:
+ return x
+
+ params = [
+ (p[0], convert_bool(p[1])) for p in (
+ ('tier', tier),
+ ('model', model),
+ ('version', version),
+ ('language', language),
+ ('detect_language', detect_language),
+ ('punctuate', punctuate),
+ ('profanity_filter', profanity_filter),
+ ('redact', redact),
+ ('diarize', diarize),
+ ('diarize_version', diarize_version),
+ ('ner', ner),
+ ('multichannel', multichannel),
+ ('alternatives', alternatives),
+ ('numerals', numerals),
+ ('paragraphs', paragraphs),
+ ('summarize', summarize),
+ ('detect_topics', detect_topics),
+ ('utterances', utterances),
+ ('utt_split', utt_split),
+ ) if p[1] is not None
+ ]
+ if search is not None:
+ for s in search:
+ params.append(('search', s))
+ if keywords is not None:
+ for k in keywords:
+ params.append(('keywords', k))
+ if replace is not None:
+ for k, v in replace.items():
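+ # each replacement is sent as a "term:replacement" pair; escape colons inside either part so they are not mistaken for the separator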
+ k = k.replace(':', '%3a')
+ v = v.replace(':', '%3a')
+ params.append(('replace', f'{k}:{v}'))
+
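+ # authenticate the request with the Deepgram API secret using the "token" authorization scheme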
+ headers = {
+ 'authorization': f'token {key}',
+ }
+ url = 'https://api.deepgram.com/v1/listen?{}'.format(urlencode(params))
+ data = audio_data.get_wav_data()
+
+ request = Request(url, data, headers)
+ try:
+ response = urlopen(request, timeout=self.operation_timeout)
+ except HTTPError as e:
+ raise RequestError("recognition request failed: {}".format(e.reason))
+ except URLError as e:
+ raise RequestError("recognition connection failed: {}".format(e.reason))
+
+ result = json.load(response)
+
+ if show_all:
+ return result
+
+ # mirror the other recognizers: raise UnknownValueError if the response does not contain a usable transcript
+ try:
+ return result['results']['channels'][0]['alternatives'][0]['transcript']
+ except (KeyError, IndexError):
+ raise UnknownValueError()
+
+
def get_flac_converter():
"""Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found."""
flac_converter = shutil_which("flac") # check for installed version first
diff --git a/tests/test_recognition.py b/tests/test_recognition.py
index 5759d657..0bfcc18b 100644
--- a/tests/test_recognition.py
+++ b/tests/test_recognition.py
@@ -97,5 +97,12 @@ def test_whisper_chinese(self):
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳")
+ @unittest.skipUnless("DEEPGRAM_API_SECRET" in os.environ, "requires Deepgram API secret to be specified in DEEPGRAM_API_SECRET environment variable")
+ def test_deepgram(self):
+ r = sr.Recognizer()
+ with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
+ self.assertEqual(r.recognize_deepgram(audio, key=os.environ["DEEPGRAM_API_SECRET"]), "123")
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_special_features.py b/tests/test_special_features.py
index f249356e..0d6123a9 100644
--- a/tests/test_special_features.py
+++ b/tests/test_special_features.py
@@ -25,6 +25,12 @@ def assertSameWords(self, tested, reference, msg=None):
if set_tested != set_reference:
raise self.failureException(msg if msg is not None else "%r doesn't consist of the same words as %r" % (tested, reference))
+ @unittest.skipUnless("DEEPGRAM_API_SECRET" in os.environ, "requires Deepgram API secret to be specified in DEEPGRAM_API_SECRET environment variable")
+ def test_deepgram_keywords(self):
+ r = sr.Recognizer()
+ with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
+ self.assertEqual(r.recognize_deepgram(audio, key=os.environ["DEEPGRAM_API_SECRET"], tier='base', keywords=['elephant:1000000']), "elephant elephant elephant")
+
if __name__ == "__main__":
unittest.main()