Merge pull request #239 from GoogleCloudPlatform/speech-streaming

jerjou · jerjou · commit b1873106f235 · 2016-04-07T16:21:40.000-07:00
Add speech api streaming sample.
diff --git a/.travis.yml b/.travis.yml
@@ -16,8 +16,13 @@ env:
   - GOOGLE_CLIENT_SECRETS=${TRAVIS_BUILD_DIR}/testing/resources/client-secrets.json
   - GAE_ROOT=${HOME}/.cache/
   - secure: Orp9Et2TIwCG/Hf59aa0NUDF1pNcwcS4TFulXX175918cFREOzf/cNZNg+Ui585ZRFjbifZdc858tVuCVd8XlxQPXQgp7bwB7nXs3lby3LYg4+HD83Gaz7KOWxRLWVor6IVn8OxeCzwl6fJkdmffsTTO9csC4yZ7izHr+u7hiO4=
+addons:
+  apt:
+    packages:
+    - portaudio19-dev
 before_install:
 - pip install --upgrade pip wheel virtualenv
+  # for speech api sample
 - openssl aes-256-cbc -k "$secrets_password" -in secrets.tar.enc -out secrets.tar -d
 - tar xvf secrets.tar
 install:
diff --git a/nox.py b/nox.py
@@ -86,6 +86,11 @@ def session_tests(session, interpreter, extra_pytest_args=None):
     # allows users to run a particular test instead of all of them.
     for sample in (session.posargs or
                    collect_sample_dirs('.', SESSION_TESTS_BLACKLIST)):
+        # Install additional dependencies if they exist
+        dirname = sample if os.path.isdir(sample) else os.path.dirname(sample)
+        for reqfile in list_files(dirname, 'requirements*.txt'):
+            session.install('-r', reqfile)
+
         session.run(
             'py.test', sample,
             *pytest_args,
diff --git a/speech/api/README.md b/speech/api/README.md
@@ -37,10 +37,36 @@ See the
 [Cloud Platform Auth Guide](https://cloud.google.com/docs/authentication#developer_workflow)
 for more information.
 
+### Install the dependencies
+
+* If you're running the `speechrest.py` sample:
+
+    ```sh
+    $ pip install -r requirements-speechrest.txt
+    ```
+
+* If you're running the `speech_streaming.py` sample:
+
+    ```sh
+    $ pip install -r requirements-speech_streaming.txt
+    ```
+
 ## Run the example
 
-```sh
-$ python speechrest.py resources/audio.raw
-```
+* To run the `speechrest.py` sample:
+
+    ```sh
+    $ python speechrest.py resources/audio.raw
+    ```
+
+    You should see a response with the transcription result.
+
+* To run the `speech_streaming.py` sample:
+
+    ```sh
+    $ python speech_streaming.py
+    ```
 
-You should see a response with the transcription result.
+    The sample will run in a continuous loop, printing the data and metadata
+    it receives from the Speech API, which includes alternative transcriptions
+    of what it hears, and a confidence score. Say "exit" or "quit" to exit the
+    loop.
diff --git a/speech/api/requirements-speech_streaming.txt b/speech/api/requirements-speech_streaming.txt
@@ -0,0 +1,4 @@
+gcloud==0.12.0
+grpcio==0.13.1
+PyAudio==0.2.9
+grpc-google-cloud-speech==1.0.0
diff --git a/speech/api/requirements-speechrest.txt b/speech/api/requirements-speechrest.txt
@@ -0,0 +1 @@
+google-api-python-client==1.5.0
diff --git a/speech/api/resources/quit.raw b/speech/api/resources/quit.raw
diff --git a/speech/api/speech_streaming.py b/speech/api/speech_streaming.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+
+import contextlib
+import threading
+
+from gcloud.credentials import get_credentials
+from google.cloud.speech.v1.cloud_speech_pb2 import *  # noqa
+from google.rpc import code_pb2
+from grpc.beta import implementations
+import pyaudio
+
+# Audio recording parameters
+RATE = 16000  # sampling rate, in samples per second
+CHANNELS = 1  # how many audio channels to record
+CHUNK = RATE // 10  # 100ms
+
+# Keep the request alive for this many seconds
+DEADLINE_SECS = 8 * 60 * 60
+# Scope applied to the application default credentials in make_channel()
+SPEECH_SCOPE = 'https://www.googleapis.com/auth/cloud-platform'
+
+
+def make_channel(host, port):
+    """Creates an SSL channel with auth credentials from the environment."""
+    # In order to make an https call, use an ssl channel with defaults
+    ssl_channel = implementations.ssl_channel_credentials(None, None, None)
+
+    # Grab application default credentials from the environment
+    creds = get_credentials().create_scoped([SPEECH_SCOPE])
+    # Add a plugin to inject the creds into the header
+    auth_header = (
+            'Authorization',
+            'Bearer ' + creds.get_access_token().access_token)
+    auth_plugin = implementations.metadata_call_credentials(
+            lambda _, cb: cb([auth_header], None),
+            name='google_creds')
+
+    # compose the two together for both ssl and google auth
+    composite_channel = implementations.composite_channel_credentials(
+            ssl_channel, auth_plugin)
+
+    return implementations.secure_channel(host, port, composite_channel)
+
+
+@contextlib.contextmanager
+def record_audio(channels, rate, chunk):
+    """Opens a recording stream in a context manager."""
+    audio_interface = pyaudio.PyAudio()
+    audio_stream = audio_interface.open(
+        format=pyaudio.paInt16, channels=channels, rate=rate,
+        input=True, frames_per_buffer=chunk,
+    )
+
+    yield audio_stream
+
+    audio_stream.stop_stream()
+    audio_stream.close()
+    audio_interface.terminate()
+
+
+def request_stream(stop_audio, channels=CHANNELS, rate=RATE, chunk=CHUNK):
+    """Yields `RecognizeRequest`s constructed from a recording audio stream.
+
+    Args:
+        stop_audio: A threading.Event object stops the recording when set.
+        channels: How many audio channels to record.
+        rate: The sampling rate.
+        chunk: Buffer audio into chunks of this size before sending to the api.
+    """
+    with record_audio(channels, rate, chunk) as audio_stream:
+        # The initial request must contain metadata about the stream, so the
+        # server knows how to interpret it.
+        metadata = InitialRecognizeRequest(
+            encoding='LINEAR16', sample_rate=rate)
+        audio_request = AudioRequest(content=audio_stream.read(chunk))
+
+        yield RecognizeRequest(
+            initial_request=metadata,
+            audio_request=audio_request)
+
+        while not stop_audio.is_set():
+            # Subsequent requests can all just have the content
+            audio_request = AudioRequest(content=audio_stream.read(chunk))
+
+            yield RecognizeRequest(audio_request=audio_request)
+
+
+def listen_print_loop(recognize_stream):
+    for resp in recognize_stream:
+        if resp.error.code != code_pb2.OK:
+            raise RuntimeError('Server error: ' + resp.error.message)
+
+        # Display the transcriptions & their alternatives
+        for result in resp.results:
+            print(result.alternatives)
+
+        # Exit recognition if any of the transcribed phrases could be
+        # one of our keywords.
+        if any(alt.confidence > .5 and
+               (alt.transcript.strip() in ('exit', 'quit'))
+               for result in resp.results
+               for alt in result.alternatives):
+            print('Exiting..')
+            return
+
+
+def main():
+    stop_audio = threading.Event()
+    with beta_create_Speech_stub(
+            make_channel('speech.googleapis.com', 443)) as service:
+        try:
+            listen_print_loop(
+                service.Recognize(request_stream(stop_audio), DEADLINE_SECS))
+        finally:
+            # Stop the request stream once we're done with the loop - otherwise
+            # it'll keep going in the thread that the grpc lib makes for it..
+            stop_audio.set()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/speech/api/speech_streaming_test.py b/speech/api/speech_streaming_test.py
@@ -0,0 +1,67 @@
+# Copyright 2016, Google, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import io
+import re
+import sys
+
+import pytest
+
+import speech_streaming
+
+
+class MockAudioStream(object):
+    def __init__(self, audio_filename, trailing_silence_secs=10):
+        self.audio_filename = audio_filename
+        self.silence = io.BytesIO('\0\0' * speech_streaming.RATE *
+                                  trailing_silence_secs)
+
+    def __enter__(self):
+        self.audio_file = open(self.audio_filename)
+        return self
+
+    def __exit__(self, *args):
+        self.audio_file.close()
+
+    def __call__(self, *args):
+        return self
+
+    def read(self, num_frames):
+        # audio is 16-bit samples, whereas python byte is 8-bit
+        num_bytes = 2 * num_frames
+        chunk = self.audio_file.read(num_bytes) or self.silence.read(num_bytes)
+        return chunk
+
+
+def mock_audio_stream(filename):
+    @contextlib.contextmanager
+    def mock_audio_stream(channels, rate, chunk):
+        with open(filename, 'rb') as audio_file:
+            yield audio_file
+
+    return mock_audio_stream
+
+
+@pytest.mark.skipif(
+    sys.version_info >= (3, 0), reason="can't get grpc lib to work in python3")
+def test_main(resource, monkeypatch, capsys):
+    monkeypatch.setattr(
+        speech_streaming, 'record_audio',
+        mock_audio_stream(resource('quit.raw')))
+    monkeypatch.setattr(speech_streaming, 'DEADLINE_SECS', 5)
+
+    speech_streaming.main()
+    out, err = capsys.readouterr()
+
+    assert re.search(r'transcript.*"quit"', out, re.DOTALL | re.I)