diff --git a/docs/index.rst b/docs/index.rst index d5071697864c..cb9a5bde523b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -176,6 +176,7 @@ speech-encoding speech-metadata speech-operation + speech-streaming speech-sample speech-transcript diff --git a/docs/speech-streaming.rst b/docs/speech-streaming.rst new file mode 100644 index 000000000000..eab505b4d06d --- /dev/null +++ b/docs/speech-streaming.rst @@ -0,0 +1,23 @@ +Streaming Speech Response +========================= + +.. automodule:: google.cloud.speech.streaming_response + :members: + :undoc-members: + :show-inheritance: + +Streaming Speech Result +======================= + +.. automodule:: google.cloud.speech.streaming_result + :members: + :undoc-members: + :show-inheritance: + +Streaming Endpointer Type +========================= + +.. automodule:: google.cloud.speech.endpointer_type + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/speech-usage.rst b/docs/speech-usage.rst index e3341051d128..9c57fd4fd5ab 100644 --- a/docs/speech-usage.rst +++ b/docs/speech-usage.rst @@ -51,10 +51,9 @@ See: `Speech Asynchronous Recognize`_ >>> import time >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.LINEAR16, + ... encoding=speech.Encoding.LINEAR16, ... sample_rate=44100) >>> operation = client.async_recognize(sample, max_alternatives=2) >>> retry_count = 100 @@ -82,10 +81,9 @@ Great Britian. .. code-block:: python >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, + ... encoding=speech.Encoding.FLAC, ... 
sample_rate=44100) >>> operation = client.async_recognize(sample, max_alternatives=2) >>> alternatives = client.sync_recognize( @@ -93,8 +91,8 @@ Great Britian. ... language_code='en-GB', max_alternatives=2) >>> for alternative in alternatives: ... print('=' * 20) - ... print('transcript: ' + alternative['transcript']) - ... print('confidence: ' + alternative['confidence']) + ... print('transcript: ' + alternative.transcript) + ... print('confidence: ' + alternative.confidence) ==================== transcript: Hello, this is a test confidence: 0.81 @@ -107,17 +105,16 @@ Example of using the profanity filter. .. code-block:: python >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, + ... encoding=speech.Encoding.FLAC, ... sample_rate=44100) >>> alternatives = client.sync_recognize(sample, max_alternatives=1, ... profanity_filter=True) >>> for alternative in alternatives: ... print('=' * 20) - ... print('transcript: ' + alternative['transcript']) - ... print('confidence: ' + alternative['confidence']) + ... print('transcript: ' + alternative.transcript) + ... print('confidence: ' + alternative.confidence) ==================== transcript: Hello, this is a f****** test confidence: 0.81 @@ -129,21 +126,92 @@ words to the vocabulary of the recognizer. .. code-block:: python >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, + ... encoding=speech.Encoding.FLAC, ... sample_rate=44100) >>> hints = ['hi', 'good afternoon'] >>> alternatives = client.sync_recognize(sample, max_alternatives=2, ... speech_context=hints) >>> for alternative in alternatives: ... print('=' * 20) - ... print('transcript: ' + alternative['transcript']) - ... 
print('confidence: ' + alternative['confidence']) + ... print('transcript: ' + alternative.transcript) + ... print('confidence: ' + alternative.confidence) ==================== transcript: Hello, this is a test confidence: 0.81 + +Streaming Recognition +--------------------- + +The :meth:`~google.cloud.speech.Client.stream_recognize` method converts speech +data to possible text alternatives on the fly. + +.. note:: + Streaming recognition requests are limited to 1 minute of audio. + + See: https://cloud.google.com/speech/limits#content + +.. code-block:: python + + >>> import io + >>> from google.cloud import speech + >>> client = speech.Client() + >>> with io.open('./hello.wav', 'rb') as stream: + ... sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, + ... sample_rate=16000) + ... for response in client.stream_recognize(sample): + ... print(response.transcript) + hello + ... print(response.is_final) + True + + +By setting ``interim_results`` to true, interim results (tentative hypotheses) +may be returned as they become available (these interim results are indicated +with the is_final=false flag). If false or omitted, only is_final=true +result(s) are returned. + +.. code-block:: python + + >>> import io + >>> from google.cloud import speech + >>> client = speech.Client() + >>> with io.open('./hello.wav', 'rb') as stream: + ... sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, + ... sample_rate=16000) + ... for response in client.stream_recognize(sample, + ... interim_results=True): + ... print(response.transcript) + hell + ... print(response.is_final) + False + ... print(response.transcript) + hello + ... print(response.is_final) + True + + +By default the recognizer will perform continuous recognition +(continuing to process audio even if the user pauses speaking) until the client +closes the output stream or when the maximum time limit has been reached.
+ +If you only want to recognize a single utterance you can set + ``single_utterance`` to ``True`` and only one result will be returned. + +See: `Single Utterance`_ + +.. code-block:: python + + >>> with io.open('./hello_pause_goodbye.wav', 'rb') as stream: + >>> sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, + ... sample_rate=16000) + ... stream_container = client.stream_recognize(sample, + ... single_utterance=True) + >>> print(stream_container.get_full_text()) + hello + +.. _Single Utterance: https://cloud.google.com/speech/reference/rpc/google.cloud.speech.v1beta1#streamingrecognitionconfig .. _sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize .. _Speech Asynchronous Recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize diff --git a/scripts/verify_included_modules.py b/scripts/verify_included_modules.py index ed447585e2d5..b35ab99009f9 100644 --- a/scripts/verify_included_modules.py +++ b/scripts/verify_included_modules.py @@ -45,6 +45,7 @@ 'google.cloud.pubsub.__init__', 'google.cloud.resource_manager.__init__', 'google.cloud.speech.__init__', + 'google.cloud.speech.streaming.__init__', 'google.cloud.storage.__init__', 'google.cloud.streaming.__init__', 'google.cloud.streaming.buffered_stream', diff --git a/speech/google/cloud/speech/__init__.py b/speech/google/cloud/speech/__init__.py index ef55810893a7..4a9e4e4f6fc6 100644 --- a/speech/google/cloud/speech/__init__.py +++ b/speech/google/cloud/speech/__init__.py @@ -16,3 +16,4 @@ from google.cloud.speech.client import Client from google.cloud.speech.connection import Connection +from google.cloud.speech.encoding import Encoding diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py index 553927d237cd..0e19e92549c5 100644 --- a/speech/google/cloud/speech/client.py +++ b/speech/google/cloud/speech/client.py @@ -14,15 +14,40 @@ """Basic client for Google Cloud Speech API.""" 
+import os from base64 import b64encode from google.cloud._helpers import _to_bytes from google.cloud._helpers import _bytes_to_unicode from google.cloud import client as client_module +from google.cloud.environment_vars import DISABLE_GRPC from google.cloud.speech.connection import Connection from google.cloud.speech.encoding import Encoding from google.cloud.speech.operation import Operation from google.cloud.speech.sample import Sample +from google.cloud.speech.transcript import Transcript +from google.cloud.speech.streaming_response import StreamingSpeechResponse + +try: + from google.cloud.gapic.speech.v1beta1.speech_api import SpeechApi + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + SpeechContext) + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + RecognitionConfig) + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + RecognitionAudio) + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognitionConfig) + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognizeRequest) +except ImportError: # pragma: NO COVER + _HAVE_GAX = False +else: + _HAVE_GAX = True + + +_DISABLE_GAX = os.getenv(DISABLE_GRPC, False) +_USE_GAX = _HAVE_GAX and not _DISABLE_GAX class Client(client_module.Client): @@ -47,6 +72,7 @@ class Client(client_module.Client): """ _connection_class = Connection + _speech_api = None def async_recognize(self, sample, language_code=None, max_alternatives=None, profanity_filter=None, @@ -104,7 +130,7 @@ def async_recognize(self, sample, language_code=None, return Operation.from_api_repr(self, api_response) @staticmethod - def sample(content=None, source_uri=None, encoding=None, + def sample(content=None, source_uri=None, stream=None, encoding=None, sample_rate=None): """Factory: construct Sample to use when making recognize requests. 
@@ -118,6 +144,9 @@ def sample(content=None, source_uri=None, encoding=None, supported, which must be specified in the following format: ``gs://bucket_name/object_name``. + :type stream: :class:`io.BufferedReader` + :param stream: File like object to read audio data from. + :type encoding: str :param encoding: encoding of audio data sent in all RecognitionAudio messages, can be one of: :attr:`~.Encoding.LINEAR16`, @@ -135,7 +164,7 @@ def sample(content=None, source_uri=None, encoding=None, :rtype: :class:`~google.cloud.speech.sample.Sample` :returns: Instance of ``Sample``. """ - return Sample(content=content, source_uri=source_uri, + return Sample(content=content, source_uri=source_uri, stream=stream, encoding=encoding, sample_rate=sample_rate) def sync_recognize(self, sample, language_code=None, @@ -187,17 +216,144 @@ def sync_recognize(self, sample, language_code=None, * ``confidence``: The confidence in language detection, float between 0 and 1. """ + if _USE_GAX: + config = RecognitionConfig( + encoding=sample.encoding, sample_rate=sample.sample_rate, + language_code=language_code, max_alternatives=max_alternatives, + profanity_filter=profanity_filter, + speech_context=SpeechContext(phrases=speech_context)) - data = _build_request_data(sample, language_code, max_alternatives, - profanity_filter, speech_context) - - api_response = self.connection.api_request( - method='POST', path='speech:syncrecognize', data=data) + audio = RecognitionAudio(content=sample.content, + uri=sample.source_uri) - if len(api_response['results']) == 1: - return api_response['results'][0]['alternatives'] + return self._sync_recognize(config, audio) else: - raise ValueError('result in api should have length 1') + data = _build_request_data(sample, language_code, max_alternatives, + profanity_filter, speech_context) + return self._sync_recognize(data=data) + + def stream_recognize(self, sample, language_code=None, + max_alternatives=None, profanity_filter=None, + speech_context=None, 
single_utterance=False, + interim_results=False): + """Streaming speech recognition. + + .. note:: + Streaming recognition requests are limited to 1 minute of audio. + + See: https://cloud.google.com/speech/limits#content + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. + + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. + + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than maxAlternatives. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1 + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. This can be + used to improve the accuracy for specific words + and phrases. This can also be used to add new + words to the vocabulary of the recognizer. + + :type single_utterance: boolean + :param single_utterance: [Optional] If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + + If true, the recognizer will detect a single + spoken utterance. 
When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. + + :type interim_results: boolean + :param interim_results: [Optional] If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the is_final=false flag). If false or + omitted, only is_final=true result(s) are + returned. + """ + if not _USE_GAX: + raise EnvironmentError('gRPC is required to use this API.') + + requests = _make_request_stream(sample, language_code=language_code, + max_alternatives=max_alternatives, + profanity_filter=profanity_filter, + speech_context=speech_context, + single_utterance=single_utterance, + interim_results=interim_results) + + for response in self.speech_api.streaming_recognize(requests): + if hasattr(response, 'results') or interim_results: + yield StreamingSpeechResponse.from_pb(response) + + @property + def speech_api(self): + """Instance of Speech API. + + :rtype: :class:`google.cloud.gapic.speech.v1beta1.speech_api.SpeechApi` + :returns: Instance of ``SpeechApi``. + """ + if not self._speech_api: + self._speech_api = SpeechApi() + return self._speech_api + + def _sync_recognize(self, config=None, audio=None, data=None): + """Handler for sync_recognize requests with or without GAPIC. + + :type config: :class:`~RecognitionConfig + :param config: Instance of ``RecognitionConfig`` with recognition + settings. + + :type audio: :class:`~RecognitionAudio` + :param audio: Instance of ``RecognitionAudio`` with audio source data. + + :type data: dict + :param data: Mapped configuration paramters for the request. + + :rtype: list of :class:`~transcript.Transcript` + :returns: List of ``Transcript`` with recognition results. 
+ """ + if config and audio and _USE_GAX: + api_response = self.speech_api.sync_recognize(config=config, + audio=audio) + results = api_response.results.pop() + alternatives = results.alternatives + return [Transcript.from_pb(alternative) + for alternative in alternatives] + elif data: + api_response = self.connection.api_request( + method='POST', path='speech:syncrecognize', data=data) + + return [Transcript.from_api_repr(alternative) + for alternative + in api_response['results'][0]['alternatives']] def _build_request_data(sample, language_code=None, max_alternatives=None, @@ -261,3 +417,160 @@ def _build_request_data(sample, language_code=None, max_alternatives=None, } return data + + +def _make_request_stream(sample, language_code=None, max_alternatives=None, + profanity_filter=None, speech_context=None, + single_utterance=None, interim_results=None): + """Generate stream of requests from sample. + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. + + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. + + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than maxAlternatives. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1 + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. 
def _make_streaming_config(sample, language_code,
                           max_alternatives, profanity_filter,
                           speech_context, single_utterance,
                           interim_results):
    """Build the initial (configuration-only) streaming request.

    :type sample: :class:`~google.cloud.speech.sample.Sample`
    :param sample: Instance of ``Sample`` containing audio information.

    :type language_code: str
    :param language_code: (Optional) The language of the supplied audio as
                          BCP-47 language tag. Example: ``'en-GB'``.
                          If omitted, defaults to ``'en-US'``.

    :type max_alternatives: int
    :param max_alternatives: (Optional) Maximum number of recognition
                             hypotheses to be returned. The server may
                             return fewer than maxAlternatives.
                             Valid values are 0-30. A value of 0 or 1
                             will return a maximum of 1. Defaults to 1

    :type profanity_filter: bool
    :param profanity_filter: If True, the server will attempt to filter
                             out profanities, replacing all but the
                             initial character in each filtered word with
                             asterisks, e.g. ``'f***'``. If False or
                             omitted, profanities won't be filtered out.

    :type speech_context: list
    :param speech_context: A list of strings (max 50) containing words and
                           phrases "hints" so that the speech recognition
                           is more likely to recognize them. This can be
                           used to improve the accuracy for specific words
                           and phrases. This can also be used to add new
                           words to the vocabulary of the recognizer.

    :type single_utterance: bool
    :param single_utterance: (Optional) If false or omitted, the recognizer
                             will perform continuous recognition
                             (continuing to process audio even if the user
                             pauses speaking) until the client closes the
                             output stream (gRPC API) or when the maximum
                             time limit has been reached. Multiple
                             SpeechRecognitionResults with the is_final
                             flag set to true may be returned.

                             If true, the recognizer will detect a single
                             spoken utterance. When it detects that the
                             user has paused or stopped speaking, it will
                             return an END_OF_UTTERANCE event and cease
                             recognition. It will return no more than one
                             SpeechRecognitionResult with the is_final flag
                             set to true.

    :type interim_results: bool
    :param interim_results: (Optional) If true, interim results (tentative
                            hypotheses) may be returned as they become
                            available (these interim results are indicated
                            with the is_final=false flag). If false or
                            omitted, only is_final=true result(s) are
                            returned.

    :rtype: :class:`~StreamingRecognizeRequest`
    :returns: Instance of ``StreamingRecognizeRequest`` whose
              ``streaming_config`` is populated and which carries no audio.
    """
    # ``speech_context`` on ``RecognitionConfig`` is a ``SpeechContext``
    # message, not a list of strings: the hints must be wrapped (exactly as
    # ``Client.sync_recognize`` does) or building the config fails as soon
    # as the caller supplies any hint.
    context = SpeechContext(phrases=speech_context or [])

    config = RecognitionConfig(
        encoding=sample.encoding, sample_rate=sample.sample_rate,
        language_code=language_code, max_alternatives=max_alternatives,
        profanity_filter=profanity_filter, speech_context=context)

    streaming_config = StreamingRecognitionConfig(
        config=config, single_utterance=single_utterance,
        interim_results=interim_results)

    return StreamingRecognizeRequest(streaming_config=streaming_config)
+ :type stream: :class:`io.BufferedReader` + :param stream: File like object to read audio data from. + :type encoding: str :param encoding: encoding of audio data sent in all RecognitionAudio messages, can be one of: :attr:`~.Encoding.LINEAR16`, @@ -47,16 +50,15 @@ class Sample(object): default_encoding = Encoding.FLAC default_sample_rate = 16000 - def __init__(self, content=None, source_uri=None, + def __init__(self, content=None, source_uri=None, stream=None, encoding=None, sample_rate=None): - - no_source = content is None and source_uri is None - both_source = content is not None and source_uri is not None - if no_source or both_source: - raise ValueError('Supply one of \'content\' or \'source_uri\'') + if (content, source_uri, stream).count(None) != 2: + raise ValueError('Supply only one of \'content\', \'source_uri\'' + ' or stream.') self._content = content self._source_uri = source_uri + self._stream = stream if sample_rate is not None and not 8000 <= sample_rate <= 48000: raise ValueError('The value of sample_rate must be between 8000' @@ -68,6 +70,15 @@ def __init__(self, content=None, source_uri=None, else: raise ValueError('Invalid encoding: %s' % (encoding,)) + @property + def chunk_size(self): + """Chunk size to send over GRPC. ~100ms + + :rtype: int + :returns: Optimized chunk size. + """ + return int(self.sample_rate / 10) + @property def source_uri(self): """Google Cloud Storage URI of audio source. @@ -77,6 +88,15 @@ def source_uri(self): """ return self._source_uri + @property + def stream(self): + """Stream of audio data. + + :rtype: :class:`io.BufferedReader` + :returns: File like object to read audio data from. + """ + return self._stream + @property def content(self): """Bytes of audio content. 
class StreamingSpeechResponse(object):
    """Representation of a Speech API protobuf streaming response.

    :type error: :class:`google.grpc.Status`
    :param error: Instance of ``Status``

    :type endpointer_type: int
    :param endpointer_type: Numeric endpointer event; see
                            :class:`EndpointerType`. It is stored as the
                            event's name, or ``None`` if it is not a known
                            value.

    :type results: list of
                   :class:`google.cloud.speech.v1beta1.StreamingRecognitionResult`
    :param results: List of protobuf ``StreamingRecognitionResult``.

    :type result_index: int
    :param result_index: Index for specific result set. Used for updating with
                         ``interim_results``.
    """
    def __init__(self, error=None, endpointer_type=None, results=None,
                 result_index=None):
        self._error = error
        # Keep the readable event name rather than the raw enum number.
        self._endpointer_type = EndpointerType.reverse_map.get(endpointer_type)
        self._result_index = result_index
        self._results = [StreamingSpeechResult.from_pb(result)
                         for result in results or []]

    @classmethod
    def from_pb(cls, pb_response):
        """Factory: construct a ``StreamingSpeechResponse`` from protobuf.

        :type pb_response:
            :class:`google.cloud.speech.v1beta1.StreamingRecognizeResponse`
        :param pb_response: Instance of protobuf
                            ``StreamingRecognizeResponse``.

        :rtype: :class:`~StreamingSpeechResponse`
        :returns: Instance of ``StreamingSpeechResponse``.
        """
        return cls(error=pb_response.error,
                   endpointer_type=pb_response.endpointer_type,
                   results=pb_response.results,
                   result_index=pb_response.result_index)

    @property
    def confidence(self):
        """Confidence score for recognized speech.

        :rtype: float
        :returns: Confidence score [0.0-1.0] of the first alternative of the
                  first result, or ``0.0`` when there is none.
        """
        if self.results and self.results[0].alternatives:
            return self.results[0].alternatives[0].confidence
        return 0.0

    @property
    def endpointer_type(self):
        """Endpointer indicating the state of the speech detection.

        :rtype: str
        :returns: Name of the :class:`EndpointerType` event, or ``None`` if
                  the response carried no known event.
        """
        return self._endpointer_type

    @property
    def error(self):
        """Error status reported with this response.

        :rtype: :class:`google.grpc.Status`
        :returns: The ``error`` value this response was constructed with.
        """
        return self._error

    @property
    def is_final(self):
        """Whether the first result of this response is final.

        :rtype: bool
        :returns: True if the result has completed its processing; False for
                  an interim result or when there are no results.
        """
        if self.results:
            return bool(self.results[0].is_final)
        return False

    @property
    def result_index(self):
        """Result index associated with this response.

        :rtype: int
        :returns: Result index of this response.
        """
        return self._result_index

    @property
    def results(self):
        """List of results for this response.

        :rtype: list of :class:`~result.StreamingSpeechResult`
        :returns: List of ``StreamingSpeechResult`` in this response.
        """
        return self._results

    @property
    def transcript(self):
        """Get most likely transcript from response.

        :rtype: str
        :returns: Transcript text of the first alternative of the first
                  result, or ``''`` when there is none.
        """
        if self.results and self.results[0].alternatives:
            return self.results[0].alternatives[0].transcript
        return ''


class EndpointerType(object):
    """Endpointer type for tracking state of Speech API detection.

    See:
    https://cloud.google.com/speech/reference/rpc/\
    google.cloud.speech.v1beta1#endpointertype
    """
    ENDPOINTER_EVENT_UNSPECIFIED = 0
    START_OF_SPEECH = 1
    END_OF_SPEECH = 2
    END_OF_AUDIO = 3
    END_OF_UTTERANCE = 4

    # Numeric value -> event name, used to present readable event names.
    reverse_map = {
        0: 'ENDPOINTER_EVENT_UNSPECIFIED',
        1: 'START_OF_SPEECH',
        2: 'END_OF_SPEECH',
        3: 'END_OF_AUDIO',
        4: 'END_OF_UTTERANCE'
    }
+ +"""Representation of Speech GAPIC API result.""" + +from google.cloud.speech.transcript import Transcript + + +class StreamingSpeechResult(object): + """Factory: contruct streaming speech result. + + :type alternatives: + :class:`google.cloud.speech.v1beta1.SpeechRecognitionAlternative` + :param alternatives: List of ``SpeechRecognitionAlternative``. + + :type is_final: bool + :param is_final: Indicates if the transcription is complete. + + :type stability: float + :param stability: An estimate of the probability that the recognizer will + not change its guess about this interim result. + """ + + def __init__(self, alternatives, is_final, stability): + self._alternatives = [Transcript.from_pb(alternative) + for alternative in alternatives] + self._is_final = is_final + self._stability = stability + + @classmethod + def from_pb(cls, pb_response): + """Factory: construct StreamingSpeechResult from protobuf response. + + :type pb_response: + :class:`google.cloud.speech.v1beta1.StreamingRecognitionResult` + :param pb_response: Instance of ``StreamingRecognitionResult``. + + :rtype: :class:`~result.StreamingSpeechResult` + :returns: Instance of ``StreamingSpeechResult``. + """ + alternatives = pb_response.alternatives + is_final = pb_response.is_final + stability = pb_response.stability + return cls(alternatives, is_final, stability) + + @property + def alternatives(self): + """List of alternative transcripts. + + :rtype: list of :class:`~google.cloud.speech.transcript.Transcript` + :returns: List of ``Transcript`` objects. + """ + return self._alternatives + + @property + def is_final(self): + """Represents an interim result that may change. + + :rtype: bool + :returns: True if the result has completed it's processing. 
+ """ + return bool(self._is_final) diff --git a/speech/google/cloud/speech/transcript.py b/speech/google/cloud/speech/transcript.py index bbe915396c5c..d9fa104962fb 100644 --- a/speech/google/cloud/speech/transcript.py +++ b/speech/google/cloud/speech/transcript.py @@ -16,14 +16,43 @@ class Transcript(object): - """Representation of Speech Transcripts + """Representation of Speech Transcripts. - :type result: dict - :param result: Dictionary of transcript and confidence of recognition. + :type transcript: str + :param transcript: String of transcribed data. + + :type confidence: float + :param confidence: The confidence estimate between 0.0 and 1.0. """ - def __init__(self, result): - self._transcript = result.get('transcript') - self._confidence = result.get('confidence') + def __init__(self, transcript, confidence): + self._transcript = transcript + self._confidence = confidence + + @classmethod + def from_api_repr(cls, transcript): + """Factory: construct ``Transcript`` from JSON response. + + :type transcript: :class:`~SpeechRecognitionAlternative` + :param transcript: Instance of ``SpeechRecognitionAlternative`` + from protobuf. + + :rtype: :class:`~Transcript` + :returns: Instance of ``Transcript``. + """ + return cls(transcript['transcript'], transcript['confidence']) + + @classmethod + def from_pb(cls, transcript): + """Factory: construct ``Transcript`` from protobuf response. + + :type transcript: :class:`~SpeechRecognitionAlternative` + :param transcript: Instance of ``SpeechRecognitionAlternative`` + from protobuf. + + :rtype: :class:`~Transcript` + :returns: Instance of ``Transcript``. 
+ """ + return cls(transcript.transcript, transcript.confidence) @property def transcript(self): diff --git a/speech/setup.py b/speech/setup.py index c02aeaad3e9d..c7504e1beac4 100644 --- a/speech/setup.py +++ b/speech/setup.py @@ -51,6 +51,7 @@ REQUIREMENTS = [ 'google-cloud-core >= 0.20.0', + 'gapic-google-cloud-speech-v1beta1 >= 0.11.1' ] setup( diff --git a/speech/unit_tests/test_client.py b/speech/unit_tests/test_client.py index 5972a0014eb3..795fbec54490 100644 --- a/speech/unit_tests/test_client.py +++ b/speech/unit_tests/test_client.py @@ -60,13 +60,30 @@ def test_create_sample_from_client(self): self.assertEqual(content_sample.sample_rate, self.SAMPLE_RATE) self.assertEqual(content_sample.encoding, Encoding.FLAC) + def test__sync_recognize_wo_gapic(self): + from google.cloud.speech import client as MUT + from google.cloud._testing import _Monkey + creds = _Credentials() + client = self._makeOne(credentials=creds) + client.connection = _Connection() + client._speech_api = _MockGAPICSpeechAPI() + client._speech_api._responses = [] + + with _Monkey(MUT, _USE_GAX=False): + data = {} + results = client._sync_recognize(data=data) + self.assertIsNone(results) + def test_sync_recognize_content_with_optional_parameters(self): from base64 import b64encode from google.cloud._helpers import _to_bytes from google.cloud._helpers import _bytes_to_unicode + from google.cloud._testing import _Monkey + from google.cloud.speech import client as MUT from google.cloud.speech.encoding import Encoding from google.cloud.speech.sample import Sample + from google.cloud.speech.transcript import Transcript from unit_tests._fixtures import SYNC_RECOGNIZE_RESPONSE _AUDIO_CONTENT = _to_bytes(self.AUDIO_CONTENT) _B64_AUDIO_CONTENT = _bytes_to_unicode(b64encode(_AUDIO_CONTENT)) @@ -96,11 +113,12 @@ def test_sync_recognize_content_with_optional_parameters(self): sample = Sample(content=self.AUDIO_CONTENT, encoding=encoding, sample_rate=self.SAMPLE_RATE) - response = 
client.sync_recognize(sample, - language_code='EN', - max_alternatives=2, - profanity_filter=True, - speech_context=self.HINTS) + with _Monkey(MUT, _USE_GAX=False): + response = client.sync_recognize(sample, + language_code='EN', + max_alternatives=2, + profanity_filter=True, + speech_context=self.HINTS) self.assertEqual(len(client.connection._requested), 1) req = client.connection._requested[0] @@ -108,13 +126,17 @@ def test_sync_recognize_content_with_optional_parameters(self): self.assertEqual(req['data'], REQUEST) self.assertEqual(req['method'], 'POST') self.assertEqual(req['path'], 'speech:syncrecognize') - - expected = SYNC_RECOGNIZE_RESPONSE['results'][0]['alternatives'] - self.assertEqual(response, expected) + alternative = SYNC_RECOGNIZE_RESPONSE['results'][0]['alternatives'][0] + expected = [Transcript.from_api_repr(alternative)] + self.assertEqual(response[0].transcript, expected[0].transcript) + self.assertEqual(response[0].confidence, expected[0].confidence) def test_sync_recognize_source_uri_without_optional_parameters(self): + from google.cloud._testing import _Monkey + from google.cloud.speech import client as MUT from google.cloud.speech.encoding import Encoding from google.cloud.speech.sample import Sample + from google.cloud.speech.transcript import Transcript from unit_tests._fixtures import SYNC_RECOGNIZE_RESPONSE RETURNED = SYNC_RECOGNIZE_RESPONSE @@ -135,7 +157,8 @@ def test_sync_recognize_source_uri_without_optional_parameters(self): sample = Sample(source_uri=self.AUDIO_SOURCE_URI, encoding=encoding, sample_rate=self.SAMPLE_RATE) - response = client.sync_recognize(sample) + with _Monkey(MUT, _USE_GAX=False): + response = client.sync_recognize(sample) self.assertEqual(len(client.connection._requested), 1) req = client.connection._requested[0] @@ -144,10 +167,14 @@ def test_sync_recognize_source_uri_without_optional_parameters(self): self.assertEqual(req['method'], 'POST') self.assertEqual(req['path'], 'speech:syncrecognize') - expected = 
SYNC_RECOGNIZE_RESPONSE['results'][0]['alternatives'] - self.assertEqual(response, expected) + expected = [Transcript.from_api_repr( + SYNC_RECOGNIZE_RESPONSE['results'][0]['alternatives'][0])] + self.assertEqual(response[0].transcript, expected[0].transcript) + self.assertEqual(response[0].confidence, expected[0].confidence) def test_sync_recognize_with_empty_results(self): + from google.cloud._testing import _Monkey + from google.cloud.speech import client as MUT from google.cloud.speech.encoding import Encoding from google.cloud.speech.sample import Sample from unit_tests._fixtures import SYNC_RECOGNIZE_EMPTY_RESPONSE @@ -156,11 +183,32 @@ def test_sync_recognize_with_empty_results(self): client = self._makeOne(credentials=credentials) client.connection = _Connection(SYNC_RECOGNIZE_EMPTY_RESPONSE) - with self.assertRaises(ValueError): - sample = Sample(source_uri=self.AUDIO_SOURCE_URI, - encoding=Encoding.FLAC, - sample_rate=self.SAMPLE_RATE) - client.sync_recognize(sample) + with self.assertRaises(IndexError): + with _Monkey(MUT, _USE_GAX=False): + sample = Sample(source_uri=self.AUDIO_SOURCE_URI, + encoding=Encoding.FLAC, + sample_rate=self.SAMPLE_RATE) + client.sync_recognize(sample) + + def test_sync_recognize_with_gapic(self): + from google.cloud.speech import client as MUT + from google.cloud.speech import Encoding + from google.cloud._testing import _Monkey + creds = _Credentials() + client = self._makeOne(credentials=creds) + client.connection = _Connection() + + client._speech_api = _MockGAPICSpeechAPI() + client._speech_api._responses = [] + + with _Monkey(MUT, _USE_GAX=True, RecognitionConfig=_RecognitionConfig, + RecognitionAudio=_RecognitionAudio): + sample = client.sample(source_uri=self.AUDIO_SOURCE_URI, + encoding=Encoding.FLAC, + sample_rate=self.SAMPLE_RATE) + results = client.sync_recognize(sample) + self.assertEqual(results[0].transcript, 'testing 1 2 3') + self.assertEqual(results[0].confidence, 0.95234356) def 
test_async_supported_encodings(self): from google.cloud.speech.encoding import Encoding @@ -195,6 +243,114 @@ def test_async_recognize(self): self.assertFalse(operation.complete) self.assertIsNone(operation.metadata) + def test_streaming_depends_on_gax(self): + from google.cloud.speech import client as MUT + from google.cloud._testing import _Monkey + creds = _Credentials() + client = self._makeOne(credentials=creds) + client.connection = _Connection() + + with _Monkey(MUT, _USE_GAX=False): + with self.assertRaises(EnvironmentError): + next(client.stream_recognize({})) + + def test_set_speech_api(self): + from google.cloud.speech import client as MUT + from google.cloud._testing import _Monkey + creds = _Credentials() + client = self._makeOne(credentials=creds) + client.connection = _Connection() + + with _Monkey(MUT, SpeechApi=_MockGAPICSpeechAPI): + client._speech_api = None + speech_api = client.speech_api + self.assertIsInstance(speech_api, _MockGAPICSpeechAPI) + + def test_streaming_with_empty_response(self): + from io import BytesIO + from google.cloud.speech.encoding import Encoding + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client._speech_api = _MockGAPICSpeechAPI() + client._speech_api._responses = [] + + sample = client.sample(stream=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + results = client.stream_recognize(sample) + with self.assertRaises(StopIteration): + next(results) + + def test_stream_recognize(self): + from io import BytesIO + from google.cloud.speech.encoding import Encoding + from google.cloud.speech.streaming_response import ( + StreamingSpeechResponse) + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client._speech_api = _MockGAPICSpeechAPI() + + sample = client.sample(stream=stream, 
+ encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + results = client.stream_recognize(sample) + + self.assertIsInstance(next(results), StreamingSpeechResponse) + requests = [] + for req in client.speech_api._requests: + requests.append(req) + self.assertEqual(len(requests), 2) + + +class _RecognitionConfig(object): + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + +class _RecognitionAudio(object): + def __init__(self, content, uri): + self.content = content + self.uri = uri + + +class _MockGAPICAlternative(object): + transcript = 'testing 1 2 3' + confidence = 0.95234356 + + +class _MockGAPICSyncResult(object): + alternatives = [_MockGAPICAlternative()] + + +class _MockGAPICSpeechResponse(object): + error = None + endpointer_type = None + results = [] + result_index = 0 + + +class _MockGAPICSpeechAPI(object): + _requests = None + _responses = [None, _MockGAPICSpeechResponse()] + + def streaming_recognize(self, requests): + self._requests = requests + return self._responses + + def sync_recognize(self, config, audio): + self.config = config + self.audio = audio + mock_response = _MockGAPICSpeechResponse() + mock_response.results = [_MockGAPICSyncResult()] + return mock_response + class _Credentials(object): diff --git a/speech/unit_tests/test_request.py b/speech/unit_tests/test_request.py new file mode 100644 index 000000000000..b536c661aecb --- /dev/null +++ b/speech/unit_tests/test_request.py @@ -0,0 +1,50 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + + +class TestStreamingSpeechRequestHelpers(unittest.TestCase): + def test_make_request_stream(self): + from io import BytesIO + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognizeRequest) + from google.cloud.speech.client import _make_request_stream + from google.cloud.speech.sample import Sample + + stream = BytesIO(b'g' * 1702) # Something bigger than a chunk. + sample = Sample(stream=stream, encoding='LINEAR16') + + request_count = 0 + for req in _make_request_stream(sample): + request_count += 1 + self.assertIsInstance(req, StreamingRecognizeRequest) + self.assertEqual(request_count, 3) + + def test_make_request_stream_short(self): + from io import BytesIO + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognizeRequest) + from google.cloud.speech.client import _make_request_stream + from google.cloud.speech.sample import Sample + + stream = BytesIO(b'g' * (1599 * 4)) # Something bigger than a chunk. + sample = Sample(stream=stream, encoding='LINEAR16') + + request_count = 0 + for req in _make_request_stream(sample): + request_count += 1 + self.assertIsInstance(req, StreamingRecognizeRequest) + + self.assertEqual(request_count, 5) diff --git a/speech/unit_tests/test_response.py b/speech/unit_tests/test_response.py new file mode 100644 index 000000000000..413aa6b72871 --- /dev/null +++ b/speech/unit_tests/test_response.py @@ -0,0 +1,60 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + + +class TestStreamingSpeechResponse(unittest.TestCase): + def _getTargetClass(self): + from google.cloud.speech.streaming_response import ( + StreamingSpeechResponse) + return StreamingSpeechResponse + + def _makeOne(self, *args, **kw): + return self._getTargetClass()(*args, **kw) + + def test_ctor(self): + response = self._makeOne({}, 'END_OF_UTTERANCE', [], 0) + self.assertEqual(response.result_index, 0) + self.assertEqual(response.confidence, 0.0) + self.assertEqual(response.endpointer_type, None) + self.assertEqual(response.results, []) + self.assertEqual(response.transcript, '') + self.assertFalse(response.is_final) + + def test_from_pb(self): + response = self._makeOne() + res = response.from_pb(_MockSpeechPBResponse) + self.assertFalse(res.is_final) + self.assertEqual(res.endpointer_type, 'END_OF_AUDIO') + self.assertEqual(res.transcript, 'hello there!') + self.assertEqual(res.confidence, 0.9704365) + + +class _MockSpeechPBAlternative(object): + transcript = 'hello there!' 
+ confidence = 0.9704365 + + +class _MockSpeechPBResult(object): + alternatives = [_MockSpeechPBAlternative()] + is_final = False + stability = 0.0 + + +class _MockSpeechPBResponse(object): + error = {} + endpointer_type = 3 + result_index = 0 + results = [_MockSpeechPBResult, _MockSpeechPBResult] diff --git a/speech/unit_tests/test_transcript.py b/speech/unit_tests/test_transcript.py index b585d6e7429c..6cbf038546b4 100644 --- a/speech/unit_tests/test_transcript.py +++ b/speech/unit_tests/test_transcript.py @@ -26,7 +26,8 @@ def _makeOne(self, *args, **kwargs): def test_ctor(self): from unit_tests._fixtures import OPERATION_COMPLETE_RESPONSE as DATA TRANSCRIPT_DATA = DATA['response']['results'][0]['alternatives'][0] - transcript = self._makeOne(TRANSCRIPT_DATA) + transcript = self._makeOne(TRANSCRIPT_DATA['transcript'], + TRANSCRIPT_DATA['confidence']) self.assertEqual('how old is the Brooklyn Bridge', transcript.transcript) self.assertEqual(0.98267895, transcript.confidence)