#
# This file is part of Dragonfly.
# (c) Copyright 2007, 2008 by Christo Butcher
# Licensed under the LGPL.
#
# Dragonfly is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Dragonfly is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with Dragonfly. If not, see
# <http://www.gnu.org/licenses/>.
#
"""
Engine class for CMU Pocket Sphinx
"""
import contextlib
import locale
import os
import wave
from six import binary_type, text_type, string_types, PY2
from jsgf import RootGrammar, PublicRule, Literal
from sphinxwrapper import PocketSphinx
import dragonfly.engines
from dragonfly.windows.window import Window
from dragonfly.engines.base import (EngineBase, EngineError, MimicFailure,
DelegateTimerManagerInterface,
DictationContainerBase)
from dragonfly.engines.backend_sphinx.compiler import SphinxJSGFCompiler
from dragonfly.engines.backend_sphinx.grammar_wrapper import GrammarWrapper
from dragonfly.engines.backend_sphinx.misc import (EngineConfig,
WaveRecognitionObserver,
get_decoder_config_object)
from dragonfly.engines.backend_sphinx.recobs import SphinxRecObsManager
from dragonfly.engines.backend_sphinx.recording import PyAudioRecorder
from dragonfly.engines.backend_sphinx.timer import SphinxTimerManager
from dragonfly.engines.backend_sphinx.training import (write_training_data,
write_transcript_files)
class UnknownWordError(Exception):
    """ Raised when words are not found in the pronunciation dictionary. """
def _map_to_str(text, encoding=locale.getpreferredencoding()):
# Translate unicode/bytes to whatever str is in this version of
# Python. Decoder methods seem to require str objects.
if not isinstance(text, string_types):
text = str(text)
if PY2 and isinstance(text, text_type):
text = text.encode(encoding)
elif not PY2 and isinstance(text, binary_type):
text = text.decode(encoding)
return text
class SphinxEngine(EngineBase, DelegateTimerManagerInterface):
    """ Dragonfly engine back-end that uses the CMU Pocket Sphinx decoder. """

    # Class-level settings for this back-end.
    _name = "sphinx"
    DictationContainer = DictationContainerBase
def __init__(self):
    """
    Initialise the engine's state without creating a decoder.

    :raises: EngineError if a Pocket Sphinx dependency cannot be imported
    """
    EngineBase.__init__(self)
    DelegateTimerManagerInterface.__init__(self)

    # Fail early with an engine-specific error if a dependency is missing.
    try:
        import sphinxwrapper
        import jsgf
        import pyaudio
    except ImportError:
        raise EngineError(
            "Failed to import Pocket Sphinx engine dependencies."
        )

    # Start with the default configuration. Assigning through the
    # 'config' property validates it; it can be replaced later.
    self._config = None
    self.config = EngineConfig

    # Decoder and utterance state.
    self._decoder = None
    self._audio_buffers = []
    self.compiler = SphinxJSGFCompiler(self)
    self._recognition_observer_manager = SphinxRecObsManager(self)

    # Keyphrase bookkeeping: phrase -> threshold and phrase -> callable.
    self._keyphrase_thresholds = {}
    self._keyphrase_functions = {}

    self._training_session_active = False
    self._default_search_result = None
    self._grammar_count = 0

    # Timer manager used by the delegate timer interface.
    self._timer_manager = SphinxTimerManager(0.02, self)

    # Reserved keyphrase search names and the set of searches known to
    # be valid for grammars.
    self._keyphrase_search_names = ["_key_phrases", "_wake_phrase"]
    self._valid_searches = set()

    # Members used by the recognising loop.
    self._recorder = PyAudioRecorder(self.config)
    self._cancel_recognition_next_time = False
    self._recognising = False
    self._recognition_paused = False
@property
def config(self):
    """
    The module or object holding the engine configuration.

    If the configuration is changed after :meth:`connect` has been
    called, the engine needs to be restarted with :meth:`disconnect`
    followed by :meth:`connect`.

    :returns: config module/object
    """
    current = self._config
    return current
@config.setter
def config(self, value):
    """ Validate *value* with :meth:`validate_config`, then store it. """
    # validate_config() also fills in defaults for missing options.
    self.validate_config(value)
    self._config = value
@classmethod
def validate_config(cls, engine_config):
    """
    Fill in any configuration options missing from *engine_config*.

    A decoder configuration object is created if ``DECODER_CONFIG`` is
    absent. Every other missing option is copied from ``EngineConfig``,
    except that missing phrase options are disabled (empty phrase, zero
    threshold) when the configured language does not start with "en".

    :param engine_config: config module/object to check and update
    """
    if not hasattr(engine_config, "DECODER_CONFIG"):
        engine_config.DECODER_CONFIG = get_decoder_config_object()

    # "LANGUAGE" comes first because the phrase defaults below read it.
    option_names = (
        "LANGUAGE",
        "START_ASLEEP",
        "WAKE_PHRASE",
        "WAKE_PHRASE_THRESHOLD",
        "SLEEP_PHRASE",
        "SLEEP_PHRASE_THRESHOLD",
        "TRAINING_DATA_DIR",
        "TRANSCRIPT_NAME",
        "START_TRAINING_PHRASE",
        "START_TRAINING_PHRASE_THRESHOLD",
        "END_TRAINING_PHRASE",
        "END_TRAINING_PHRASE_THRESHOLD",
        "CHANNELS",
        "RATE",
        "SAMPLE_WIDTH",
        "FRAMES_PER_BUFFER",
    )

    # Set default values where they are missing.
    for name in option_names:
        if not hasattr(engine_config, name):
            fallback = getattr(EngineConfig, name)

            # The built-in phrases are only used by default for English.
            is_phrase_option = "PHRASE" in name
            if (is_phrase_option and
                    not engine_config.LANGUAGE.startswith("en")):
                fallback = "" if name.endswith("PHRASE") else 0.0

            setattr(engine_config, name, fallback)
def connect(self):
    """
    Create and configure the CMU Pocket Sphinx decoder.

    Nothing happens if the engine already has a decoder.
    """
    if self._decoder:
        return

    # Create the decoder from the configured decoder settings.
    self._decoder = PocketSphinx(self._config.DECODER_CONFIG)
    self._valid_searches.add(self._default_search_name)

    # Decoder callbacks which delegate to the engine's callback methods.
    def on_hypothesis(hyp):
        # Remember the default search's result, then pass on its
        # hypothesis string (or None if there is no hypothesis).
        self._default_search_result = hyp
        text = hyp.hypstr if hyp else None
        return self._hypothesis_callback(text, False)

    def on_speech_start():
        # A new utterance invalidates the previous default result.
        self._default_search_result = None
        return self._speech_start_callback(False)

    self._decoder.hypothesis_callback = on_hypothesis
    self._decoder.speech_start_callback = on_speech_start

    def phrase_settings(prefix):
        # Return the (phrase, threshold) pair configured for *prefix*.
        return (getattr(self.config, prefix + "_PHRASE", ""),
                getattr(self.config, prefix + "_PHRASE_THRESHOLD", 0))

    # The wake phrase lives in its own search, so it is set with
    # set_kws_list() directly. All keyphrases are optional, so unknown
    # words are logged rather than raised.
    wake_phrase, wake_threshold = phrase_settings("WAKE")
    if wake_phrase and wake_threshold:
        try:
            self._validate_words(wake_phrase.split(), "keyphrase")
            self._decoder.set_kws_list(
                "_wake_phrase", {wake_phrase: wake_threshold}
            )
        except UnknownWordError as e:
            self._log.error(e)

    # The remaining built-in keyphrases go through set_keyphrase().
    builtin_keyphrases = (
        ("SLEEP", self.pause_recognition),
        ("START_TRAINING", self.start_training_session),
        ("END_TRAINING", self.end_training_session),
    )
    for prefix, handler in builtin_keyphrases:
        phrase, threshold = phrase_settings(prefix)
        if not (phrase and threshold):
            continue
        try:
            self.set_keyphrase(phrase, threshold, handler)
        except UnknownWordError as e:
            self._log.error(e)

    # Hand the current config object to the audio recorder.
    self._recorder.config = self.config

    # Start in sleep mode if requested.
    if self.config.START_ASLEEP:
        self.pause_recognition()
        self._log.warning("Starting in sleep mode as requested.")
def _free_engine_resources(self):
"""
Internal method for freeing the resources used by the engine.
"""
# Stop the audio recorder if it is running.
self._recognising = False
self._recorder.stop()
# Free the decoder and clear audio buffers.
self._decoder = None
self._audio_buffers = []
# Reset other variables
self._cancel_recognition_next_time = False
self._training_session_active = False
self._recognition_paused = False
self._grammar_count = 0
# Clear dictionaries and sets
self._grammar_wrappers.clear()
self._valid_searches.clear()
self._keyphrase_thresholds.clear()
self._keyphrase_functions.clear()
def disconnect(self):
    """
    Release the CMU Sphinx decoder and the other resources it uses.

    In effect this unloads every loaded grammar and key phrase.
    """
    if self.recognising:
        # The decoder is in use: stop the recognising loop and let it
        # free the resources safely once it finishes.
        self._recognising = False
        self._recorder.stop()
    else:
        # Nothing is using the decoder, so free everything right away.
        self._free_engine_resources()
# -----------------------------------------------------------------------
# Multiplexing timer methods.
def create_timer(self, callback, interval, repeating=True):
    """
    Create a timer for *callback* with the given repeat *interval* and
    return it.

    **Note**: Timers will not run unless the engine is recognising
    audio. Normal threads can be used instead with no downsides.
    """
    recognising_now = self.recognising
    if not recognising_now:
        self._log.warning(
            "Timers will not run unless the engine is recognising audio."
        )

    parent = super(SphinxEngine, self)
    return parent.create_timer(callback, interval, repeating)
# -----------------------------------------------------------------------
# Methods for working with grammars.
def check_valid_word(self, word):
    """
    Look *word* up, in lowercase, in the decoder's pronunciation
    dictionary. The engine is connected first if necessary.

    :rtype: bool
    """
    if not self._decoder:
        self.connect()

    lowered = _map_to_str(word).lower()
    found = self._decoder.lookup_word(lowered)
    return bool(found)
def _validate_words(self, words, search_type):
unknown_words = []
# Use 'set' to de-duplicate the 'words' list.
for word in set(words):
if not self.check_valid_word(word):
unknown_words.append(word)
if unknown_words:
# Sort the word list before using it.
unknown_words.sort()
raise UnknownWordError(
"%s used words not found in the pronunciation dictionary: "
"%s" % (search_type, ", ".join(unknown_words))
)
def _build_grammar_wrapper(self, grammar):
    """
    Create a wrapper for *grammar* whose search name is the current
    grammar count, then increment the count.
    """
    index = self._grammar_count
    self._grammar_count = index + 1
    return GrammarWrapper(grammar, self,
                          self._recognition_observer_manager,
                          "%d" % index)
def _set_grammar(self, wrapper, activate, partial=False):
if not wrapper:
return
# Connect to the engine if it isn't connected already.
self.connect()
def activate_search_if_necessary():
if activate:
self._decoder.end_utterance()
self._decoder.active_search = wrapper.search_name
# Check if the wrapper's search name is valid.
# Set the search (again) if necessary.
valid_search = wrapper.search_name in self._valid_searches
if valid_search and not wrapper.set_search:
# wrapper.search_name is a valid search, so return.
activate_search_if_necessary()
return
# Return early if 'partial' is True as an optimisation to avoid
# recompiling grammars for every rule activation/deactivation.
# Also return if the search doesn't need to be set.
if partial or not wrapper.set_search:
return
# Compile and set the jsgf search.
compiled = wrapper.compile_jsgf()
self._log.debug(compiled)
# Nothing further to do; no public rules.
if "public <root> = " not in compiled:
wrapper.set_search = False
return
# Set the JSGF search.
self._decoder.end_utterance()
self._decoder.set_jsgf_string(wrapper.search_name,
_map_to_str(compiled))
activate_search_if_necessary()
# Grammar search has been loaded, so set the wrapper's flag.
wrapper.set_search = False
def _unset_search(self, name):
# Unset a Pocket Sphinx search with the given name.
# Don't unset the default or keyphrase searches.
default_search = self._default_search_name
reserved = [default_search] + self._keyphrase_search_names
if name in reserved:
return
# Unset the Pocket Sphinx search.
if name in self._valid_searches:
# Unset the decoder search.
self._decoder.unset_search(name)
# Remove the search from the valid searches set.
self._valid_searches.remove(name)
# Change to the default search to avoid possible segmentation faults
# from Pocket Sphinx which crash Python.
self._set_default_search()
# TODO Add optional context parameter
def set_keyphrase(self, keyphrase, threshold, func):
    """
    Register a keyphrase for the engine to listen for.

    Key phrases take precedence over grammars as they are processed
    first. They cannot be set for specific contexts (yet).

    :param keyphrase: keyphrase to add.
    :param threshold: keyphrase threshold value to use.
    :param func: function or method to call when the keyphrase is heard.
    :type keyphrase: str
    :type threshold: float
    :type func: callable
    :raises: UnknownWordError
    """
    # Every word must be in the pronunciation dictionary; this raises
    # an UnknownWordError otherwise.
    keyphrase_words = keyphrase.split()
    self._validate_words(keyphrase_words, "keyphrase")

    # Only float thresholds are accepted.
    if not isinstance(threshold, float):
        raise TypeError("threshold must be a float, not %s" % threshold)

    # Record the threshold and the function for this keyphrase.
    thresholds = self._keyphrase_thresholds
    thresholds[keyphrase] = threshold
    self._keyphrase_functions[keyphrase] = func

    # Set the keyphrase search again so it includes the new phrase.
    decoder = self._decoder
    decoder.end_utterance()
    decoder.set_kws_list("_key_phrases", thresholds)
def unset_keyphrase(self, keyphrase):
    """
    Remove a set keyphrase so that the engine no longer listens for it.

    No error is raised if *keyphrase* was never set. If the engine has
    no decoder (it is not connected), only the engine's own records are
    updated.

    :param keyphrase: keyphrase to remove.
    :type keyphrase: str
    """
    # Remove parameters from the relevant dictionaries. Don't raise an
    # error if there is no such keyphrase.
    self._keyphrase_thresholds.pop(keyphrase, None)
    self._keyphrase_functions.pop(keyphrase, None)

    # Without a decoder there is no keyphrase search to refresh, and
    # calling its methods would fail on None.
    if not self._decoder:
        return

    # Set the keyphrase search (again)
    self._decoder.end_utterance()
    self._decoder.set_kws_list("_key_phrases", self._keyphrase_thresholds)
def _set_default_search(self):
# Change the active search to the one used for processing speech as
# it is heard.
swap_to_wake_search = (
self.recognition_paused and self.config.WAKE_PHRASE and
self.config.WAKE_PHRASE_THRESHOLD
)
# Ensure we're not processing.
self._decoder.end_utterance()
if swap_to_wake_search:
self._decoder.active_search = "_wake_phrase"
else:
self._decoder.active_search = self._default_search_name
def _load_grammar(self, grammar):
""" Load the given *grammar* and return a wrapper. """
self._log.debug("Engine %s: loading grammar %s."
% (self, grammar.name))
wrapper = self._build_grammar_wrapper(grammar)
# Attempt to set the grammar search.
try:
self._set_grammar(wrapper, False)
except Exception as e:
self._log.exception("Failed to load grammar %s: %s."
% (grammar, e))
raise EngineError("Failed to load grammar %s: %s."
% (grammar, e))
# Set the grammar wrapper's search name as valid and return the
# wrapper.
self._valid_searches.add(wrapper.search_name)
return wrapper
def _unload_grammar(self, grammar, wrapper):
try:
# Unset the search names for the grammar.
self._unset_search(wrapper.search_name)
except Exception as e:
self._log.exception("Failed to unload grammar %s: %s."
% (grammar, e))
def activate_grammar(self, grammar):
    """ Log the activation of *grammar*; nothing else is done here. """
    message = "Activating grammar %s." % grammar.name
    self._log.debug(message)
def deactivate_grammar(self, grammar):
    """ Log the deactivation of *grammar*; nothing else is done here. """
    message = "Deactivating grammar %s." % grammar.name
    self._log.debug(message)
def activate_rule(self, rule, grammar):
    """
    Enable *rule* in the wrapper for *grammar* and do a partial grammar
    update. Failures are logged and not re-raised.
    """
    self._log.debug("Activating rule %s in grammar %s."
                    % (rule.name, grammar.name))

    wrapper = self._get_grammar_wrapper(grammar)
    if wrapper:
        try:
            wrapper.enable_rule(rule.name)
            self._set_grammar(wrapper, False, True)
        except Exception as e:
            self._log.exception("Failed to activate grammar %s: %s."
                                % (grammar, e))
def deactivate_rule(self, rule, grammar):
    """
    Disable *rule* in the wrapper for *grammar* and do a partial grammar
    update.

    Nothing is done if there is no wrapper for *grammar*. Failures are
    logged and not re-raised.
    """
    self._log.debug("Deactivating rule %s in grammar %s."
                    % (rule.name, grammar.name))
    wrapper = self._get_grammar_wrapper(grammar)
    if not wrapper:
        return
    try:
        wrapper.disable_rule(rule.name)
        # 'partial' is True so the grammar isn't recompiled here.
        self._set_grammar(wrapper, False, True)
    except Exception as e:
        # This is the deactivation path, so say so in the log.
        self._log.exception("Failed to deactivate grammar %s: %s."
                            % (grammar, e))
def update_list(self, lst, grammar):
    """
    Update *lst* in the wrapper for *grammar* and reload the grammar.
    Failures while reloading are logged and not re-raised.
    """
    wrapper = self._get_grammar_wrapper(grammar)
    if wrapper:
        # Unfortunately there is no way to update lists for Pocket
        # Sphinx without reloading the grammar, so the list's JSGF rule
        # is updated and the grammar is reloaded.
        wrapper.update_list(lst)
        try:
            self._set_grammar(wrapper, False)
        except Exception as e:
            self._log.exception("Failed to update list %s: %s."
                                % (lst, e))
def set_exclusiveness(self, grammar, exclusive):
    """
    Store *exclusive* on the wrapper for *grammar*, if there is one.
    """
    wrapper = self._get_grammar_wrapper(grammar)
    if wrapper:
        wrapper.exclusive = exclusive
# -----------------------------------------------------------------------
# Miscellaneous methods.
@property
def recognising(self):
    """
    Whether the engine is currently recognising speech: either the
    recorder is recording or an audio buffer is being processed.

    To stop recognition, use :meth:`disconnect`.

    :rtype: bool
    """
    recording = self._recorder.recording
    return recording if recording else self._recognising
@property
def default_search_result(self):
    """
    The most recent hypothesis object from the default search.

    This does not currently reach recognition observers because it is
    intended to be used for dictation results, which are currently
    disabled. Nevertheless this object can be useful sometimes.

    :returns: Sphinx Hypothesis object | None
    """
    latest = self._default_search_result
    return latest
@property
def _default_search_name(self):
# The name of the Pocket Sphinx search used for processing speech as
# it is heard.
return "_default"
def _get_best_hypothesis(self, hypotheses):
"""
Take a list of speech hypotheses and return the most likely one.
:type hypotheses: iterable
:return: str | None
"""
# Get all distinct, non-null hypotheses.
distinct = tuple([h for h in set(hypotheses) if bool(h)])
if not distinct:
return None
elif len(distinct) == 1:
return distinct[0] # only one choice
# Decide between non-null hypotheses using a Pocket Sphinx search with
# each hypothesis as a grammar rule.
grammar = RootGrammar()
grammar.language_name = self.language
for i, hypothesis in enumerate(distinct):
grammar.add_rule(PublicRule("rule%d" % i, Literal(hypothesis)))
compiled = grammar.compile_grammar()
name = "_temp"
# Store the current search name.
original = self._decoder.active_search
# Note that there is no need to validate words in this case because
# each literal in the _temp grammar came from a Pocket Sphinx
# hypothesis.
self._decoder.end_utterance()
self._decoder.set_jsgf_string(name, _map_to_str(compiled))
self._decoder.active_search = name
# Do the processing.
hyp = self._decoder.batch_process(
self._audio_buffers,
use_callbacks=False
)
result = hyp.hypstr if hyp else None
# Switch back to the previous search.
self._decoder.end_utterance() # just in case
self._decoder.active_search = original
self._decoder.unset_search("_temp")
return result
def _speech_start_callback(self, mimicking):
    """
    Internal callback for the start of speech.

    Calls ``process_begin`` on every grammar wrapper with the
    foreground window's details, trims the buffered audio unless
    *mimicking* is true and notifies recognition observers.

    :param mimicking: whether the speech is mimicked speech.
    """
    # Collect context info from the foreground window.
    foreground = Window.get_foreground()
    context = dict(
        executable=foreground.executable,
        title=foreground.title,
        handle=foreground.handle,
    )

    # Call process_begin for all grammars so that any out of context
    # grammar will not be used. Iterate over a snapshot.
    for wrapper in list(self._grammar_wrappers.values()):
        wrapper.process_begin(**context)

    if not mimicking:
        # Trim excess audio buffers from the start of the list. Keep a
        # maximum 1 second of silence before speech start was detected.
        # This should help increase the performance of batch
        # reprocessing later.
        chunk = self.config.FRAMES_PER_BUFFER
        rate = self.config.RATE
        seconds = 1
        n_buffers = int(rate / chunk * seconds)
        self._audio_buffers = self._audio_buffers[-n_buffers:]

    # Notify observers.
    self._recognition_observer_manager.notify_begin()
def _hypothesis_callback(self, speech, mimicking):
"""
Internal Pocket Sphinx hypothesis callback method. Calls _process_hypothesis
and does post-processing afterwards.
:param speech: speech hypothesis
:type speech: str | None
:param mimicking: whether to treat speech as mimicked speech.
:rtype: bool
"""
# Clear any recorded audio buffers.
self._recorder.clear_buffers()
# Process speech. We should get back a boolean for whether processing
# occurred as well as the final speech hypothesis.
processing_occurred, final_speech = self._process_hypotheses(
speech, mimicking
)
# Notify observers of failure.
results_obj = None # TODO Use PS results object once implemented
if not processing_occurred:
self._recognition_observer_manager.notify_failure(results_obj)
# Write the training data files if necessary.
data_dir = self.config.TRAINING_DATA_DIR
if not mimicking and data_dir and os.path.isdir(data_dir):
# Use the default search's hypothesis if final_speech was nil.
if not final_speech:
final_speech = speech
try:
write_training_data(self.config, self._audio_buffers,
final_speech)
except Exception as e:
self._log.exception("Failed to write training data: %s" % e)
# Clear audio buffer list because utterance processing has finished.
self._audio_buffers = []
# Ensure that the correct search is used.
self._set_default_search()
# Return whether processing occurred in case this method was called
# by mimic.
return processing_occurred
def _process_key_phrases(self, speech, mimicking):
"""
Processing key phrase searches and return the matched keyphrase
(if any).
:type speech: str
:param mimicking: whether to treat speech as mimicked speech.
:rtype: str
"""
# Return if speech is empty/null or if there are no key phrases set.
if not (speech and self._keyphrase_thresholds):
return "" # no matches
if not mimicking:
# Reprocess using the key phrases search
self._decoder.end_utterance()
self._decoder.active_search = "_key_phrases"
hyp = self._decoder.batch_process(self._audio_buffers,
use_callbacks=False)
# Get the hypothesis string.
speech = hyp.hypstr if hyp else ""
# Restore search to the default search.
self._set_default_search()
# Return if no key phrase matched.
if not speech:
return ""
# Handle multiple matching key phrases. This appears to be a
# quirk of how Pocket Sphinx 'kws' searches work. Get the best
# match instead if this is the case.
recognised_phrases = speech.split(" ")
if len(recognised_phrases) > 1:
# Remove trailing space from the last phrase.
recognised_phrases[len(recognised_phrases) - 1].rstrip()
speech = self._get_best_hypothesis(recognised_phrases)
else:
speech = speech.rstrip() # remove trailing whitespace.
# Notify observers if a keyphrase was matched.
results_obj = None # TODO Use PS results object once implemented
result = speech if speech in self._keyphrase_functions else ""
words = tuple(result.split())
if words:
self._recognition_observer_manager.notify_recognition(
words, None, None, results_obj
)
# Call the registered function if there was a match and the function
# is callable.
func = self._keyphrase_functions.get(speech, None)
if callable(func):
try:
func()
except Exception as e:
self._log.exception(
"Exception caught when executing the function for "
"keyphrase '%s': %s" % (speech, e)
)
# Notify observers after calling the keyphrase function.
if words:
self._recognition_observer_manager.notify_post_recognition(
words, None, None, results_obj
)
return result
def _process_hypotheses(self, speech, mimicking):
"""
Internal method to process speech hypotheses. This should only be called
from 'SphinxEngine._hypothesis_callback' because that method does important
post processing.
:param speech: speech
:param mimicking: whether to treat speech as mimicked speech.
:rtype: tuple
"""
# Check key phrases search first.
keyphrase = self._process_key_phrases(speech, mimicking)
if keyphrase:
# Keyphrase search matched.
return True, keyphrase
# Otherwise do grammar processing.
hypotheses = {}
wrappers = self._grammar_wrappers.copy().values()
# Save the LM hypothesis for later.
lm_hypothesis = speech
# Filter out inactive grammars.
wrappers = [w for w in wrappers if w.grammar_is_active]
# Include only exclusive grammars if at least one is active.
exclusive_count = 0
for wrapper in wrappers:
if wrapper.exclusive: exclusive_count += 1
if exclusive_count > 0:
wrappers = [w for w in wrappers if w.exclusive]
# No grammar has been loaded.
if not wrappers:
return False, speech
# Batch process audio buffers for each active grammar. Store each
# hypothesis.
for wrapper in wrappers:
if mimicking:
# Just use 'speech' for everything if mimicking.
hyp = speech
else:
# Switch to the search for this grammar and re-process the
# audio.
self._set_grammar(wrapper, True)
hyp = self._decoder.batch_process(
self._audio_buffers,
use_callbacks=False
)
if hyp:
hyp = hyp.hypstr
# Set the hypothesis in the dictionary.
hypotheses[wrapper.search_name] = hyp
# Get the best hypothesis.
speech = self._get_best_hypothesis(list(hypotheses.values()))
# If we have an hypothesis, filter out irrelevant grammars and
# attempt to process it with the resulting subset. Stop on the
# first grammar that processes the hypothesis.
result = False
decoder_results = None # FIXME Expose P.S. decoder results
if speech:
wrappers_subset = [wrapper for wrapper in wrappers
if hypotheses[wrapper.search_name] == speech]
words_rules = self._get_words_rules(speech.split(), 0)
for wrapper in wrappers_subset:
result = wrapper.process_results(words_rules,
wrapper.grammar.rule_names,
decoder_results)
if result: break
# If no processing has occurred by this point, try to process a
# grammar using the LM hypothesis instead, if there is one.
if not result and lm_hypothesis:
dictation_words = lm_hypothesis.split()
words_rules = self._get_words_rules(dictation_words, 1000000)
for wrapper in wrappers:
result = wrapper.process_results(words_rules,
wrapper.grammar.rule_names,
decoder_results)
if result: break
speech = lm_hypothesis
# Return whether processing occurred, plus the final speech
# hypothesis for post-processing.
return result, speech
def process_buffer(self, buf):
    """
    Feed one audio buffer to the decoder for recognition.

    This method is meant to be called in sequence for multiple audio
    buffers. It will do nothing if :meth:`connect` hasn't been called.

    :param buf: audio buffer
    :type buf: str
    """
    if not self._decoder:
        return

    # Honour a pending cancel_recognition() request first.
    if self._cancel_recognition_next_time:
        self._decoder.end_utterance()
        self._audio_buffers = []
        self._cancel_recognition_next_time = False

    # Buffers are kept so the audio can be reprocessed later using
    # different Pocket Sphinx searches.
    self._audio_buffers.append(buf)

    # Call the timer callback if it is set.
    self.call_timer_callback()

    # Process the audio; the flag is set only while this happens.
    self._recognising = True
    try:
        self._decoder.process_audio(buf)
    finally:
        self._recognising = False
def process_wave_file(self, path):
    """
    Recognise speech from a wave file and return the recognition results.

    This method checks that the wave file is valid. It raises an error
    if the file doesn't exist, if it can't be read or if the WAV header
    values do not match those in the engine configuration.

    If recognition is paused (sleep mode), this method will call
    :meth:`resume_recognition`.

    The wave file must use the same sample width, sample rate and number
    of channels that the acoustic model uses.

    If the file is valid, :meth:`process_buffer` is then used to process
    the audio.

    Multiple utterances are supported.

    Because this is a generator, nothing (including the checks) runs
    until the result is iterated.

    :param path: wave file path
    :raises: IOError | OSError | ValueError
    :returns: recognition results
    :rtype: generator
    """
    if not self._decoder:
        self.connect()

    # This method's implementation has been adapted from the PyAudio
    # play wave example:
    # http://people.csail.mit.edu/hubert/pyaudio/#play-wave-example

    # Check that path is a valid file. The path is included in the
    # message so the caller can see which file was rejected.
    if not os.path.isfile(path):
        raise IOError("'%s' is not a file. Please use a different file "
                      "path." % (path,))

    # Get required audio configuration from the engine config.
    channels, sample_width, rate, chunk = (
        self.config.CHANNELS,
        self.config.SAMPLE_WIDTH,
        self.config.RATE,
        self.config.FRAMES_PER_BUFFER
    )

    # Make sure recognition is not paused.
    if self.recognition_paused:
        self.resume_recognition(notify=False)

    # Open the wave file. Use contextlib to make sure that the file is
    # closed whether errors are raised or not.
    # Also register a custom recognition observer for the duration.
    obs = WaveRecognitionObserver(self)
    with contextlib.closing(wave.open(path, "rb")) as wf, obs as obs:
        # Validate the wave file's header.
        if wf.getnchannels() != channels:
            message = ("WAV file '%s' should use %d channel(s), not %d!"
                       % (path, channels, wf.getnchannels()))
        elif wf.getsampwidth() != sample_width:
            message = ("WAV file '%s' should use sample width %d, not "
                       "%d!" % (path, sample_width, wf.getsampwidth()))
        elif wf.getframerate() != rate:
            message = ("WAV file '%s' should use sample rate %d, not "
                       "%d!" % (path, rate, wf.getframerate()))
        else:
            message = None

        if message:
            raise ValueError(message)

        # Use process_buffer to process each buffer.
        for _ in range(0, int(wf.getnframes() / chunk) + 1):
            data = wf.readframes(chunk)
            if not data:
                break

            self.process_buffer(data)

            # Get the results from the observer and reset it for the
            # next utterance.
            if obs.words:
                yield obs.words
                obs.words = ""

    # Log warnings if speech start or end weren't detected.
    if not obs.complete:
        self._log.warning("Speech start/end wasn't detected in the wave "
                          "file!")
        self._log.warning("Perhaps the Sphinx '-vad_prespeech' value "
                          "should be higher?")
        self._log.warning("Or maybe '-vad_startspeech' or "
                          "'-vad_postspeech' should be lower?")
def _do_recognition(self):
"""
Start recognising from the default recording device until
:meth:`disconnect` is called.
Recognition can be paused and resumed using either the sleep/wake
key phrases or by calling :meth:`pause_recognition` or
:meth:`resume_recognition`.
To configure audio input settings, modify the engine's ``CHANNELS``,
``RATE``, ``SAMPLE_WIDTH`` and/or ``FRAMES_PER_BUFFER``
configuration options.
"""
if not self._decoder:
self.connect()
# Start recognising in a loop.
self._recorder.start()
self._cancel_recognition_next_time = False
while self.recognising:
for buf in self._recorder.get_buffers():
self.process_buffer(buf)
# Free engine resources after recognition has stopped.
self._free_engine_resources()
def mimic(self, words):
    """
    Mimic a recognition of the given *words*.

    If recognition is paused and *words* is the wake phrase, recognition
    is resumed and nothing else is processed.

    :param words: a string or an iterable of word strings
    :raises: TypeError if *words* is neither a string nor iterable
    :raises: MimicFailure if *words* is empty or nothing processed it
    """
    # The *words* argument should be a string or iterable.
    # Words are put into lowercase for consistency.
    if isinstance(words, string_types):
        words = words.lower()
    else:
        # iter() raises TypeError for non-iterables rather than
        # returning a false value, so catch it to report the intended
        # message.
        try:
            word_iterator = iter(words)
        except TypeError:
            raise TypeError("%r is not a string or other iterable object"
                            % (words,))
        words = " ".join([w.lower() for w in word_iterator])

    # Fail on empty input.
    if not words:
        raise MimicFailure("Invalid mimic input %r" % words)

    if self.recognition_paused and words == self.config.WAKE_PHRASE:
        self.resume_recognition()
        return

    # Pretend that Sphinx has started processing speech
    self._speech_start_callback(True)

    # Process the words as if they were spoken
    result = self._hypothesis_callback(words, True)
    if not result:
        raise MimicFailure("No matching rule found for words %s."
                           % words)
def mimic_phrases(self, *phrases):
    """
    Mimic a recognition of each of the given *phrases* in turn.

    This method accepts variable phrases instead of a list of words.

    :raises: MimicFailure if a phrase isn't processed
    """
    # Pretend that Sphinx has started processing speech
    self._speech_start_callback(True)

    # Process phrases as if they were spoken. The wake phrase resumes
    # recognition while paused instead of being processed.
    wake_phrase = self.config.WAKE_PHRASE
    for phrase in phrases:
        if self.recognition_paused and phrase == wake_phrase:
            self.resume_recognition()
        elif not self._hypothesis_callback(phrase, True):
            raise MimicFailure("No matching rule found for words %s."
                               % phrase)
def speak(self, text):
    """ Speak *text* with the speaker from ``dragonfly.engines``. """
    speaker = dragonfly.engines.get_speaker()
    speaker.speak(text)
def _get_language(self):
return self.config.LANGUAGE
def _has_quoted_words_support(self):
return False
# ----------------------------------------------------------------------
# Training-related methods
def write_transcript_files(self, fileids_path, transcription_path):
    """
    Write .fileids and .transcription files for files in the training
    data directory to the specified file paths.

    This method will raise an error if the ``TRAINING_DATA_DIR``
    configuration option is not set to an existing directory.

    :param fileids_path: path to .fileids file to create.
    :param transcription_path: path to .transcription file to create.
    :type fileids_path: str
    :type transcription_path: str
    :raises: IOError | OSError
    """
    # Delegate to the training module's function of the same name,
    # passing it the engine configuration.
    engine_config = self.config
    write_transcript_files(engine_config, fileids_path,
                           transcription_path)
@property
def training_session_active(self):
    """
    Whether a training session is currently in progress.

    :rtype: bool
    """
    active = self._training_session_active
    return active
def start_training_session(self):
    """
    Start the training session. This will stop recognition processing
    until either :meth:`end_training_session` is called or the end
    training keyphrase is heard.
    """
    # Warn if there is nowhere to record the training data.
    data_dir = self.config.TRAINING_DATA_DIR
    have_data_dir = data_dir and os.path.isdir(data_dir)
    if not have_data_dir:
        self._log.warning("Training data will not be recorded; '%s' is "
                          "not a directory" % data_dir)

    # Nothing more to do if a session is already in progress.
    if self._training_session_active:
        return

    self._log.info("Training session has started. No rule "
                   "actions will be processed. ")
    self._log.info("Say '%s' to end the session."
                   % self.config.END_TRAINING_PHRASE)
    self._training_session_active = True
def end_training_session(self):
    """
    End the training session if one is in progress. This will allow
    recognition processing once again.
    """
    if not self._training_session_active:
        return

    for message in ("Ending training session.",
                    "Rule actions will now be processed normally again."):
        self._log.info(message)
    self._training_session_active = False
# ----------------------------------------------------------------------
# Recognition loop control methods
# Stopping recognition loop is done using disconnect()
@property
def recognition_paused(self):
    """
    Whether recognition is paused, i.e. the engine is waiting for the
    wake phrase to be heard or for :meth:`resume_recognition` to be
    called.

    :rtype: bool
    """
    paused = self._recognition_paused
    return paused
def pause_recognition(self):
    """
    Pause recognition until :meth:`resume_recognition` is called or the
    wake keyphrase is spoken.

    This does nothing if the engine has no decoder.
    """
    if not self._decoder:
        return

    self._recognition_paused = True

    # Switch to the wake keyphrase search if a wake keyphrase has been
    # set.
    self._set_default_search()
    if not self.config.WAKE_PHRASE:
        for message in ("No wake phrase has been set.",
                        "Use engine.resume_recognition() to wake up."):
            self._log.warning(message)

    # Temporary decoder callback used while recognition is paused.
    def on_hypothesis(hyp):
        # Clear any recorded audio buffers.
        self._recorder.clear_buffers()
        heard = hyp.hypstr if hyp else None

        # Resume recognition if the wake keyphrase was heard.
        wake_phrase = self.config.WAKE_PHRASE
        if heard and heard.strip() == wake_phrase.strip():
            self.resume_recognition()
        elif self.config.WAKE_PHRASE:
            self._log.debug("Didn't hear %s" % self.config.WAKE_PHRASE)

        # Clear audio buffers
        self._audio_buffers = []

    # Override decoder hypothesis callback.
    self._decoder.hypothesis_callback = on_hypothesis
def resume_recognition(self, notify=True):
    """
    Resume listening for grammar rules and key phrases.

    This does nothing if the engine has no decoder.

    :param notify: whether to notify recognition observers with the
        words of the wake phrase
    """
    if not self._decoder:
        return

    self._recognition_paused = False

    # Notify observers about recognition resume, using the wake
    # phrase's words as the recognised words.
    wake_words = tuple(self.config.WAKE_PHRASE.strip().split())
    results_obj = None  # TODO Use PS results object once implemented
    if wake_words and notify:
        manager = self._recognition_observer_manager
        manager.notify_recognition(wake_words, None, None, results_obj)
        manager.notify_post_recognition(wake_words, None, None,
                                        results_obj)

    # Restore the normal hypothesis callback.
    def on_hypothesis(hyp):
        # Remember the default search's result, then pass on its
        # hypothesis string (or None if there is no hypothesis).
        self._default_search_result = hyp
        text = hyp.hypstr if hyp else None
        return self._hypothesis_callback(text, False)

    self._decoder.hypothesis_callback = on_hypothesis

    # Switch to the default search.
    self._set_default_search()
def cancel_recognition(self):
    """
    Request that any recognition in progress is cancelled before the
    next audio buffer is processed.
    """
    setattr(self, "_cancel_recognition_next_time", True)