speech_recognition/__init__.py (101 additions, 0 deletions)

@@ -800,6 +800,107 @@ def stopper(wait_for_stop=True):
        listener_thread.start()
        return stopper


    def get_sphinx_decoder(self, language="en-US", keyword_entries=None, grammar=None):
        """
        Creates a CMU Sphinx ``pocketsphinx.pocketsphinx.Decoder``, configured for the given language, keywords, and grammar, for reuse across multiple calls to ``recognize_sphinx_byDecoder``.

        The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.

        If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.

        Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution on the next run. If ``keyword_entries`` is passed, the contents of ``grammar`` are ignored.

        Returns the configured ``pocketsphinx.pocketsphinx.Decoder`` object.

        Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
        """
        assert isinstance(language, str) or (isinstance(language, tuple) and len(language) == 3), "``language`` must be a string or 3-tuple of Sphinx data file paths of the form ``(acoustic_parameters, language_model, phoneme_dictionary)``"
        assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"

        # import the PocketSphinx speech recognition module
        try:
            from pocketsphinx import pocketsphinx, Jsgf, FsgModel
        except ImportError:
            raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
        except ValueError:
            raise RequestError("bad PocketSphinx installation; try reinstalling PocketSphinx version 0.0.9 or better.")
        if not hasattr(pocketsphinx, "Decoder") or not hasattr(pocketsphinx.Decoder, "default_config"):
            raise RequestError("outdated PocketSphinx installation; ensure you have PocketSphinx version 0.0.9 or better.")

        if isinstance(language, str):  # directory containing language data
            language_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data", language)
            if not os.path.isdir(language_directory):
                raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
            acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
            language_model_file = os.path.join(language_directory, "language-model.lm.bin")
            phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
        else:  # 3-tuple of Sphinx data file paths
            acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
            if not os.path.isdir(acoustic_parameters_directory):
                raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
            if not os.path.isfile(language_model_file):
                raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
            if not os.path.isfile(phoneme_dictionary_file):
                raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))

        # create decoder object
        config = pocketsphinx.Decoder.default_config()
        config.set_string("-hmm", acoustic_parameters_directory)  # set the path of the hidden Markov model (HMM) parameter files
        config.set_string("-lm", language_model_file)
        config.set_string("-dict", phoneme_dictionary_file)
        config.set_string("-logfn", os.devnull)  # disable logging (logging causes unwanted output in terminal)
        decoder = pocketsphinx.Decoder(config)

        # configure the decoder with keywords or a grammar, if given
        if keyword_entries is not None:  # explicitly specified set of keywords
            with PortableNamedTemporaryFile("w") as f:
                # generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5
                # (the mapping below sends sensitivity 0 to a threshold of 1e-110 and sensitivity 1 to 1e-10)
                f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
                f.flush()

                # set up the keyword search (this is inside the context manager so the file isn't deleted until we're done)
                decoder.set_kws("keywords", f.name)
                decoder.set_search("keywords")
        elif grammar is not None:  # a path to a FSG or JSGF grammar
            if not os.path.exists(grammar):
                raise ValueError("Grammar '{0}' does not exist.".format(grammar))
            grammar_path = os.path.abspath(os.path.dirname(grammar))
            grammar_name = os.path.splitext(os.path.basename(grammar))[0]
            fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
            if not os.path.exists(fsg_path):  # create FSG grammar if not available
                jsgf = Jsgf(grammar)
                rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
                fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
                fsg.writefile(fsg_path)
            else:
                fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
            decoder.set_fsg(grammar_name, fsg)
            decoder.set_search(grammar_name)

        return decoder
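
    # A minimal usage sketch for ``get_sphinx_decoder`` (an illustration, not part of this
    # diff; it assumes the bundled en-US model is installed and ``hello.wav`` is a
    # hypothetical recording):
    #
    #     import speech_recognition as sr
    #     r = sr.Recognizer()
    #     decoder = r.get_sphinx_decoder(keyword_entries=[("hello", 0.8)])
    #     with sr.AudioFile("hello.wav") as source:
    #         audio = r.record(source)
    #     print(r.recognize_sphinx_byDecoder(decoder, audio))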

    def recognize_sphinx_byDecoder(self, decoder, audio_data, show_all=False):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using a ``pocketsphinx.pocketsphinx.Decoder`` previously created with ``get_sphinx_decoder``.

        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.

        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible.
        """
        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"

        # obtain audio data
        raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)  # the included language models require audio to be 16-bit mono 16 kHz in little-endian format

        # decode the audio as a single utterance
        decoder.start_utt()  # begin utterance processing
        decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
        decoder.end_utt()  # stop utterance processing

        if show_all: return decoder

        # return results
        hypothesis = decoder.hyp()
        if hypothesis is not None: return hypothesis.hypstr
        raise UnknownValueError()  # no transcriptions available
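
    # Because the decoder is built once and reused, repeated calls skip the model
    # loading that ``recognize_sphinx`` performs on every call - a sketch of reuse
    # across utterances (``clips`` is a hypothetical list of ``AudioData`` instances):
    #
    #     decoder = r.get_sphinx_decoder()
    #     for clip in clips:
    #         print(r.recognize_sphinx_byDecoder(decoder, clip))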



    def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, grammar=None, show_all=False):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.
speech_recognition/threaded_workersV2.py (89 additions, 0 deletions)

@@ -0,0 +1,89 @@
#!/usr/bin/env python3

# NOTE: this example requires PyAudio because it uses the Microphone class

from threading import Thread
try:
    from queue import Queue  # Python 3 import
except ImportError:
    from Queue import Queue  # Python 2 import

import speech_recognition as sr


r = sr.Recognizer()
audio_queue = Queue()



r.energy_threshold = 300 # minimum audio energy to consider for recording
r.dynamic_energy_threshold = True
r.dynamic_energy_adjustment_damping = 0.5
r.dynamic_energy_ratio = 1.5
r.pause_threshold = 0.05 # seconds of non-speaking audio before a phrase is considered complete
r.operation_timeout = None # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout

r.phrase_threshold = 0.1 # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
r.non_speaking_duration = 0.025 # seconds of non-speaking audio to keep on both sides of the recording


myDecoder = r.get_sphinx_decoder(grammar='counting.gram')  # build the Sphinx decoder once, up front, so the worker thread can reuse it
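
# ``counting.gram`` is not part of this diff; it is assumed to be a small JSGF
# grammar along these lines:
#
#     #JSGF V1.0;
#     grammar counting;
#     public <counting> = ( one | two | three )+ ;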

j = 0  # counter used to name the saved WAV files
def recognize_worker():
    # this runs in a background thread
    global j, myDecoder
    while True:
        audio = audio_queue.get()  # retrieve the next audio processing job from the main thread
        if audio is None: break  # stop processing if the main thread is done

        # # alternatively, recognize the received audio data using Google Speech Recognition:
        # try:
        #     # for testing purposes, we're just using the default API key
        #     # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        #     # instead of `r.recognize_google(audio)`
        #     print("Calling Google API")
        #     print("Google Speech Recognition thinks you said " + r.recognize_google(audio))
        # except sr.UnknownValueError:
        #     print("Google Speech Recognition could not understand audio")
        # except sr.RequestError as e:
        #     print("Could not request results from Google Speech Recognition service; {0}".format(e))

        try:
            print("Sphinx recognition for \"one two three\" for counting grammar:")
            # reuse the prebuilt decoder instead of rebuilding it on every call via r.recognize_sphinx(audio, grammar='counting.gram')
            print(r.recognize_sphinx_byDecoder(myDecoder, audio))
        except sr.UnknownValueError:
            print("Sphinx could not understand audio")
        except sr.RequestError as e:
            print("Sphinx error; {0}".format(e))

        with open("microphone-results{0}.wav".format(j), "wb") as f:  # save each captured phrase for later inspection
            f.write(audio.get_wav_data())
        j = j + 1

        audio_queue.task_done()  # mark the audio processing job as completed in the queue


# start a new thread to recognize audio, while this thread focuses on listening
recognize_thread = Thread(target=recognize_worker)
recognize_thread.daemon = True
recognize_thread.start()
with sr.Microphone() as source:
    try:
        while True:  # repeatedly listen for phrases and put the resulting audio on the audio processing job queue
            audio_queue.put(r.listen(source))
    except KeyboardInterrupt:  # allow Ctrl + C to shut down the program
        pass

audio_queue.join()  # block until all current audio processing jobs are done
audio_queue.put(None)  # tell the recognize_thread to stop
recognize_thread.join()  # wait for the recognize_thread to actually stop
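
# For offline testing without a microphone, the same worker can be fed from a
# prerecorded file instead (a sketch; ``test.wav`` is a hypothetical recording):
#
#     with sr.AudioFile("test.wav") as source:
#         audio_queue.put(r.record(source))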