Add phrase time limit, update issue template, add operation timeout

azimut2000 · Oct 18, 2016 · d8e1911 · d8e1911
1 parent 50d46b2
commit d8e1911
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 22 deletions.
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
@@ -1,8 +1,8 @@
 Steps to reproduce
 ------------------
 
-1. (How do you make the issue happen?)
-2. (Make sure to go into as much detail as needed to reproduce the issue!)
+1. (How do you make the issue happen? Does it happen every time you try it?)
+2. (Make sure to go into as much detail as needed to reproduce the issue. Posting your code here can help us resolve the problem much faster!)
 3. (If there are any files, like audio recordings, don't forget to include them.)
 
 Expected behaviour
@@ -15,6 +15,10 @@ Actual behaviour
 
 (What happened instead? How is it different from what you expected?)
 
+```
+(If the library threw an exception, paste the full stack trace here)
+```
+
 System information
 ------------------
 

diff --git a/reference/library-reference.rst b/reference/library-reference.rst
@@ -129,6 +129,13 @@ Represents the minimum length of silence (in seconds) that will register as the
 
 Smaller values result in the recognition completing more quickly, but might result in slower speakers being cut off.
 
+``recognizer_instance.operation_timeout = None``
+------------------------------------------------
+
+Represents the timeout (in seconds) for internal operations, such as API requests. Can be changed.
+
+Setting this to a reasonable value ensures that these operations will never block indefinitely, though good values depend on your network speed and the expected length of the audio to recognize.
+
 ``recognizer_instance.record(source, duration = None, offset = None)``
 ----------------------------------------------------------------------
 
@@ -152,7 +159,11 @@ Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``
 
 This is done by waiting until the audio has an energy above ``recognizer_instance.energy_threshold`` (the user has started speaking), and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included.
 
-The ``timeout`` parameter is the maximum number of seconds that it will wait for a phrase to start before giving up and throwing an ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, it will wait indefinitely.
+The ``timeout`` parameter is the maximum number of seconds that this will wait for a phrase to start before giving up and throwing a ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, there will be no wait timeout.
+
+The ``phrase_time_limit`` parameter is the maximum number of seconds that this will allow a phrase to continue before stopping and returning the part of the phrase processed before the time limit was reached. The resulting audio will be the phrase cut off at the time limit. If ``phrase_time_limit`` is ``None``, there will be no phrase time limit.
+
+This operation will always complete within ``timeout + phrase_time_limit`` seconds if both are numbers, either by returning the audio data, or by raising an exception.
 
 ``recognizer_instance.listen_in_background(source, callback)``
 --------------------------------------------------------------
@@ -161,7 +172,7 @@ Spawns a thread to repeatedly record phrases from ``source`` (an ``AudioSource``
 
 Returns a function object that, when called, requests that the background listener thread stop, and waits until it does before returning. The background thread is a daemon and will not stop the program from exiting if there are no other non-daemon threads.
 
-Phrase recognition uses the exact same mechanism as ``recognizer_instance.listen(source)``.
+Phrase recognition uses the exact same mechanism as ``recognizer_instance.listen(source)``. The ``phrase_time_limit`` parameter works in the same way as the ``phrase_time_limit`` parameter for ``recognizer_instance.listen(source)``, as well.
 
 The ``callback`` parameter is a function that should accept two parameters - the ``recognizer_instance``, and an ``AudioData`` instance representing the captured audio. Note that ``callback`` function will be called from a non-main thread.
 

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
@@ -199,7 +199,7 @@ def __enter__(self):
                 try:
                     self.audio_reader = aifc.open(aiff_file, "rb")
                 except aifc.Error:
-                    assert False, "Audio file could not be read as WAV, AIFF, or FLAC; check if file is corrupted"
+                    raise ValueError("Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format")
                 self.little_endian = False # AIFF is a big-endian format
         assert 1 <= self.audio_reader.getnchannels() <= 2, "Audio must be mono or stereo"
         self.SAMPLE_WIDTH = self.audio_reader.getsampwidth()
@@ -397,6 +397,8 @@ def __init__(self):
         self.dynamic_energy_adjustment_damping = 0.15
         self.dynamic_energy_ratio = 1.5
         self.pause_threshold = 0.8 # seconds of non-speaking audio before a phrase is considered complete
+        self.operation_timeout = None # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout
+
         self.phrase_threshold = 0.3 # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
         self.non_speaking_duration = 0.5 # seconds of non-speaking audio to keep on both sides of the recording
 
@@ -460,22 +462,26 @@ def adjust_for_ambient_noise(self, source, duration = 1):
             target_energy = energy * self.dynamic_energy_ratio
             self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)
 
-    def listen(self, source, timeout = None):
+    def listen(self, source, timeout = None, phrase_time_limit = None):
         """
         Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns.
 
         This is done by waiting until the audio has an energy above ``recognizer_instance.energy_threshold`` (the user has started speaking), and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included.
 
-        The ``timeout`` parameter is the maximum number of seconds that it will wait for a phrase to start before giving up and throwing an ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, it will wait indefinitely.
+        The ``timeout`` parameter is the maximum number of seconds that this will wait for a phrase to start before giving up and throwing a ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, there will be no wait timeout.
+
+        The ``phrase_time_limit`` parameter is the maximum number of seconds that this will allow a phrase to continue before stopping and returning the part of the phrase processed before the time limit was reached. The resulting audio will be the phrase cut off at the time limit. If ``phrase_time_limit`` is ``None``, there will be no phrase time limit.
+
+        This operation will always complete within ``timeout + phrase_time_limit`` seconds if both are numbers, either by returning the audio data, or by raising an exception.
         """
         assert isinstance(source, AudioSource), "Source must be an audio source"
         assert source.stream is not None, "Audio source must be entered before listening, see documentation for `AudioSource`; are you using `source` outside of a `with` statement?"
         assert self.pause_threshold >= self.non_speaking_duration >= 0
 
         seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
-        pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer)) # number of buffers of non-speaking audio before the phrase is complete
+        pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer)) # number of buffers of non-speaking audio during a phrase, before the phrase should be considered complete
         phrase_buffer_count = int(math.ceil(self.phrase_threshold / seconds_per_buffer)) # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
-        non_speaking_buffer_count = int(math.ceil(self.non_speaking_duration / seconds_per_buffer)) # maximum number of buffers of non-speaking audio to retain before and after
+        non_speaking_buffer_count = int(math.ceil(self.non_speaking_duration / seconds_per_buffer)) # maximum number of buffers of non-speaking audio to retain before and after a phrase
 
         # read audio input for phrases until there is a phrase that is long enough
         elapsed_time = 0 # number of seconds of audio read
@@ -484,11 +490,11 @@ def listen(self, source, timeout = None):
             frames = collections.deque()
 
             # store audio input until the phrase starts
-
             while True:
+                # handle waiting too long for phrase by raising an exception
                 elapsed_time += seconds_per_buffer
-                if timeout and elapsed_time > timeout: # handle timeout if specified
-                    raise WaitTimeoutError("listening timed out")
+                if timeout and elapsed_time > timeout:
+                    raise WaitTimeoutError("listening timed out while waiting for phrase to start")
 
                 buffer = source.stream.read(source.CHUNK)
                 if len(buffer) == 0: break # reached end of the stream
@@ -508,16 +514,20 @@ def listen(self, source, timeout = None):
 
             # read audio input until the phrase ends
             pause_count, phrase_count = 0, 0
+            phrase_start_time = elapsed_time
             while True:
+                # handle phrase being too long by cutting off the audio
                 elapsed_time += seconds_per_buffer
+                if phrase_time_limit and elapsed_time - phrase_start_time > phrase_time_limit:
+                    break
 
                 buffer = source.stream.read(source.CHUNK)
                 if len(buffer) == 0: break # reached end of the stream
                 frames.append(buffer)
                 phrase_count += 1
 
                 # check if speaking has stopped for longer than the pause threshold on the audio input
-                energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # energy of the audio signal
+                energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # unit energy of the audio signal within the buffer
                 if energy > self.energy_threshold:
                     pause_count = 0
                 else:
@@ -535,13 +545,13 @@ def listen(self, source, timeout = None):
 
         return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
 
-    def listen_in_background(self, source, callback):
+    def listen_in_background(self, source, callback, phrase_time_limit = None):
         """
         Spawns a thread to repeatedly record phrases from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance and call ``callback`` with that ``AudioData`` instance as soon as each phrase is detected.
 
         Returns a function object that, when called, requests that the background listener thread stop, and waits until it does before returning. The background thread is a daemon and will not stop the program from exiting if there are no other non-daemon threads.
 
-        Phrase recognition uses the exact same mechanism as ``recognizer_instance.listen(source)``.
+        Phrase recognition uses the exact same mechanism as ``recognizer_instance.listen(source)``. The ``phrase_time_limit`` parameter works in the same way as the ``phrase_time_limit`` parameter for ``recognizer_instance.listen(source)``, as well.
 
         The ``callback`` parameter is a function that should accept two parameters - the ``recognizer_instance``, and an ``AudioData`` instance representing the captured audio. Note that ``callback`` function will be called from a non-main thread.
         """
@@ -669,7 +679,7 @@ def recognize_google(self, audio_data, key = None, language = "en-US", show_all
 
         # obtain audio transcription results
         try:
-            response = urlopen(request)
+            response = urlopen(request, timeout=self.operation_timeout)
         except HTTPError as e:
             raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e:
@@ -717,7 +727,7 @@ def recognize_wit(self, audio_data, key, show_all = False):
         url = "https://api.wit.ai/speech?v=20160526"
         request = Request(url, data = wav_data, headers = {"Authorization": "Bearer {0}".format(key), "Content-Type": "audio/wav"})
         try:
-            response = urlopen(request)
+            response = urlopen(request, timeout=self.operation_timeout)
         except HTTPError as e:
             raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e:
@@ -770,7 +780,7 @@ def recognize_bing(self, audio_data, key, language = "en-US", show_all = False):
             if allow_caching:
                 start_time = monotonic()
             try:
-                credential_response = urlopen(credential_request)
+                credential_response = urlopen(credential_request, timeout=self.operation_timeout)
             except HTTPError as e:
                 raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
             except URLError as e:
@@ -804,7 +814,7 @@ def recognize_bing(self, audio_data, key, language = "en-US", show_all = False):
             "Content-Type": "audio/wav; samplerate=16000; sourcerate={0}; trustsourcerate=true".format(audio_data.sample_rate),
         })
         try:
-            response = urlopen(request)
+            response = urlopen(request, timeout=self.operation_timeout)
         except HTTPError as e:
             raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e:
@@ -867,7 +877,7 @@ def recognize_api(self, audio_data, client_access_token, language = "en", sessio
             "Content-Type": "multipart/form-data; boundary={0}".format(boundary)
         })
         try:
-            response = urlopen(request)
+            response = urlopen(request, timeout=self.operation_timeout)
         except HTTPError as e:
             raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e:
@@ -920,7 +930,7 @@ def recognize_houndify(self, audio_data, client_id, client_key, show_all = False
             "Hound-Client-Authentication": "{0};{1};{2}".format(client_id, request_time, request_signature)
         })
         try:
-            response = urlopen(request)
+            response = urlopen(request, timeout=self.operation_timeout)
         except HTTPError as e:
             raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e:
@@ -970,7 +980,7 @@ def recognize_ibm(self, audio_data, username, password, language = "en-US", show
             authorization_value = base64.standard_b64encode("{0}:{1}".format(username, password))
         request.add_header("Authorization", "Basic {0}".format(authorization_value))
         try:
-            response = urlopen(request)
+            response = urlopen(request, timeout=self.operation_timeout)
         except HTTPError as e:
             raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6
         except URLError as e: