From ecec13edaf89a971a8b496ae3ae659a219148edb Mon Sep 17 00:00:00 2001
From: sekarpdkt
Date: Fri, 30 Nov 2018 16:10:14 +0530
Subject: [PATCH 1/2] Created two functions to decouple decoder configuration
 from decoding

I split `def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, grammar=None, show_all=False):` into two functions: the first returns a configured decoder, and the second performs recognition using the supplied decoder. Because the decoder is built once and then reused, decoding is faster. Tested with `threaded_workersV2.py`.

Note: I have not modified the existing function. I only added two new functions:
1. get_sphinx_decoder
2. recognize_sphinx_byDecoder
---
 speech_recognition/__init__.py | 101 +++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index c104390a..a672d428 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -743,6 +743,107 @@ def stopper(wait_for_stop=True):
         listener_thread.start()
         return stopper
 
+
+    def get_sphinx_decoder(self, language="en-US", keyword_entries=None, grammar=None):
+        """
+        Creates and returns a CMU Sphinx ``pocketsphinx.pocketsphinx.Decoder``, configured for the given ``language``, ``keyword_entries``, and ``grammar``, for use with ``recognize_sphinx_byDecoder``.
+
+        The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.
+
+        If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.
+
+        Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, the content of ``grammar`` will be ignored.
+
+        Returns the configured ``pocketsphinx.pocketsphinx.Decoder`` object.
+
+        Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
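+
+        Example (a minimal sketch; ``r`` is assumed to be a ``Recognizer`` instance and ``audio`` an ``AudioData`` instance)::
+
+            decoder = r.get_sphinx_decoder(language="en-US")
+            text = r.recognize_sphinx_byDecoder(decoder, audio)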
+ """ + assert isinstance(language, str) or (isinstance(language, tuple) and len(language) == 3), "``language`` must be a string or 3-tuple of Sphinx data file paths of the form ``(acoustic_parameters, language_model, phoneme_dictionary)``" + assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1" + + # import the PocketSphinx speech recognition module + try: + from pocketsphinx import pocketsphinx, Jsgf, FsgModel + + except ImportError: + raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.") + except ValueError: + raise RequestError("bad PocketSphinx installation; try reinstalling PocketSphinx version 0.0.9 or better.") + if not hasattr(pocketsphinx, "Decoder") or not hasattr(pocketsphinx.Decoder, "default_config"): + raise RequestError("outdated PocketSphinx installation; ensure you have PocketSphinx version 0.0.9 or better.") + + if isinstance(language, str): # directory containing language data + language_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data", language) + if not os.path.isdir(language_directory): + raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory)) + acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model") + language_model_file = os.path.join(language_directory, "language-model.lm.bin") + phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict") + else: # 3-tuple of Sphinx data file paths + acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language + if not os.path.isdir(acoustic_parameters_directory): + raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory)) + if not os.path.isfile(language_model_file): + raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file)) + if not os.path.isfile(phoneme_dictionary_file): + raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file)) + + # create decoder object + config = pocketsphinx.Decoder.default_config() + config.set_string("-hmm", acoustic_parameters_directory) # set the path of the hidden Markov model (HMM) parameter files + config.set_string("-lm", language_model_file) + config.set_string("-dict", phoneme_dictionary_file) + config.set_string("-logfn", os.devnull) # disable logging (logging causes unwanted output in terminal) + decoder = pocketsphinx.Decoder(config) + + # obtain recognition results + if keyword_entries is not None: # explicitly specified set of keywords + with PortableNamedTemporaryFile("w") as f: + # generate a keywords file - Sphinx documentation recommendeds sensitivities between 1e-50 and 1e-5 + f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries) + f.flush() + + # perform the speech recognition with the keywords file (this is inside the context manager so the file isn;t deleted until we're done) + decoder.set_kws("keywords", f.name) + decoder.set_search("keywords") + + elif grammar is not None: # a path to a FSG or JSGF grammar + if not os.path.exists(grammar): + raise ValueError("Grammar '{0}' does not exist.".format(grammar)) + grammar_path = 
+            grammar_name = os.path.splitext(os.path.basename(grammar))[0]
+            fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
+            if not os.path.exists(fsg_path):  # create FSG grammar if not available
+                jsgf = Jsgf(grammar)
+                rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
+                fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
+                fsg.writefile(fsg_path)
+            else:
+                fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
+            decoder.set_fsg(grammar_name, fsg)
+            decoder.set_search(grammar_name)
+
+        return decoder
+
+    def recognize_sphinx_byDecoder(self, decoder, audio_data, show_all=False):
+        """
+        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using a ``decoder`` previously created by ``get_sphinx_decoder``.
+
+        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.
+
+        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible.
+        """
+        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
+
+        # obtain audio data
+        raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)  # the included language models require audio to be 16-bit mono 16 kHz in little-endian format
+        decoder.start_utt()  # begin utterance processing
+        decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
+        decoder.end_utt()  # stop utterance processing
+
+        if show_all: return decoder
+
+        # return results
+        hypothesis = decoder.hyp()
+        if hypothesis is not None: return hypothesis.hypstr
+        raise UnknownValueError()  # no transcriptions available
+
+
     def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, grammar=None, show_all=False):
         """
         Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.

From b30b6295c4592550f54d3fa3da65e5238d88b2ea Mon Sep 17 00:00:00 2001
From: sekarpdkt
Date: Fri, 30 Nov 2018 16:13:55 +0530
Subject: [PATCH 2/2] Faster decoding for sphinx by decoupling decoder creation
 from decoding

This is a POC for the two new functions.
---
 speech_recognition/threaded_workersV2.py | 89 ++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 speech_recognition/threaded_workersV2.py

diff --git a/speech_recognition/threaded_workersV2.py b/speech_recognition/threaded_workersV2.py
new file mode 100644
index 00000000..11abf4f2
--- /dev/null
+++ b/speech_recognition/threaded_workersV2.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+
+# NOTE: this example requires PyAudio because it uses the Microphone class
+
+from threading import Thread
+try:
+    from queue import Queue  # Python 3 import
+except ImportError:
+    from Queue import Queue  # Python 2 import
+
+import speech_recognition as sr
+
+
+r = sr.Recognizer()
+audio_queue = Queue()
+
+
+r.energy_threshold = 300  # minimum audio energy to consider for recording
+r.dynamic_energy_threshold = True
+r.dynamic_energy_adjustment_damping = 0.5
+r.dynamic_energy_ratio = 1.5
+r.pause_threshold = 0.05  # seconds of non-speaking audio before a phrase is considered complete
+r.operation_timeout = None  # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout
+
+r.phrase_threshold = 0.1  # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
+r.non_speaking_duration = 0.025  # seconds of non-speaking audio to keep on both sides of the recording
+
+
+myDecoder = r.get_sphinx_decoder(grammar='counting.gram')  # build the decoder once, up front
+
+j = 0
+def recognize_worker():
+    # this runs in a background thread
+    global j
+    while True:
+        audio = audio_queue.get()  # retrieve the next audio processing job from the main thread
+        if audio is None: break  # stop processing if the main thread is done
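+        # the triple-quoted string below keeps the original Google recognition example around, but disabled, for this POC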
+        """
+        # received audio data, now we'll recognize it using Google Speech Recognition
+        try:
+            # for testing purposes, we're just using the default API key
+            # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
+            # instead of `r.recognize_google(audio)`
+            print("Calling Google API")
+            print("Google Speech Recognition thinks you said " + r.recognize_google(audio))
+        except sr.UnknownValueError:
+            print("Google Speech Recognition could not understand audio")
+        except sr.RequestError as e:
+            print("Could not request results from Google Speech Recognition service; {0}".format(e))
+        """
+        try:
+            print("Sphinx recognition for \"one two three\" for counting grammar:")
+            # print(r.recognize_sphinx(audio, grammar='counting.gram'))  # old path: reconfigures a decoder on every call
+            print(r.recognize_sphinx_byDecoder(myDecoder, audio))
+        except sr.UnknownValueError:
+            print("Sphinx could not understand audio")
+        except sr.RequestError as e:
+            print("Sphinx error; {0}".format(e))
+
+        with open("microphone-results{0}.wav".format(j), "wb") as f:
+            j += 1
+            # f.write(audio.get_wav_data())  # uncomment to save each phrase to disk
+
+        audio_queue.task_done()  # mark the audio processing job as completed in the queue
+
+
+# start a new thread to recognize audio, while this thread focuses on listening
+recognize_thread = Thread(target=recognize_worker)
+recognize_thread.daemon = True
+recognize_thread.start()
+with sr.Microphone() as source:
+    try:
+        while True:  # repeatedly listen for phrases and put the resulting audio on the audio processing job queue
+            audio_queue.put(r.listen(source))
+    except KeyboardInterrupt:  # allow Ctrl + C to shut down the program
+        pass
+
+audio_queue.join()  # block until all current audio processing jobs are done
+audio_queue.put(None)  # tell the recognize_thread to stop
+recognize_thread.join()  # wait for the recognize_thread to actually stop
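
Usage sketch: a minimal example of the two-step API from PATCH 1/2, assuming the same `counting.gram` grammar file used by the POC script above:

    import speech_recognition as sr

    r = sr.Recognizer()
    decoder = r.get_sphinx_decoder(grammar='counting.gram')  # configure Sphinx once

    with sr.Microphone() as source:
        audio = r.listen(source)  # capture a single phrase

    try:
        # reuse the same decoder for every phrase; only the audio data changes
        print(r.recognize_sphinx_byDecoder(decoder, audio))
    except sr.UnknownValueError:
        print("Sphinx could not understand audio")
    except sr.RequestError as e:
        print("Sphinx error; {0}".format(e))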