syntax = "proto3";

package google.cloud.speech.v1beta1;

option java_multiple_files = true;
option java_outer_classname = "SpeechProto";
option java_package = "com.google.cloud.speech.v1beta1";

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/rpc/status.proto";

// Service that implements the Google Cloud Speech API.
service Speech {
  // Performs synchronous speech recognition: receive results after all audio
  // has been sent and processed.
  rpc SyncRecognize(SyncRecognizeRequest) returns (SyncRecognizeResponse) {
    option (google.api.http) =
        { post: "/v1beta1/speech:syncrecognize" body: "*" };
  }

  // Performs asynchronous speech recognition: receive results via the
  // google.longrunning.Operations interface. `Operation.response` returns
  // `AsyncRecognizeResponse`.
  rpc AsyncRecognize(AsyncRecognizeRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) =
        { post: "/v1beta1/speech:asyncrecognize" body: "*" };
  }

  // Performs bidirectional streaming speech recognition: receive results
  // while sending audio. This method is only available via the gRPC API
  // (not REST).
  rpc StreamingRecognize(stream StreamingRecognizeRequest)
      returns (stream StreamingRecognizeResponse);
}

// `SyncRecognizeRequest` is the top-level message sent by the client for
// the `SyncRecognize` method.
message SyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}
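
// For illustration only, a minimal `SyncRecognizeRequest` might look like the
// following in proto text format (a sketch; the Cloud Storage object is
// hypothetical):
//
//   config {
//     encoding: LINEAR16
//     sample_rate: 16000
//     language_code: "en-US"
//   }
//   audio {
//     uri: "gs://bucket_name/object_name"
//   }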

// `AsyncRecognizeRequest` is the top-level message sent by the client for
// the `AsyncRecognize` method.
message AsyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// `StreamingRecognizeRequest` is the top-level message sent by the client for
// the `StreamingRecognize` method. Multiple `StreamingRecognizeRequest`
// messages are sent. The first message must contain a `streaming_config`
// message and must not contain `audio` data. All subsequent messages must
// contain `audio` data and must not contain a `streaming_config` message.
message StreamingRecognizeRequest {
  oneof streaming_request {
    // The `streaming_config` message provides information to the recognizer
    // that specifies how to process the request.
    //
    // The first `StreamingRecognizeRequest` message must contain a
    // `streaming_config` message.
    StreamingRecognitionConfig streaming_config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `StreamingRecognizeRequest` messages. The first
    // `StreamingRecognizeRequest` message must not contain `audio_content`
    // data, and all subsequent `StreamingRecognizeRequest` messages must
    // contain `audio_content` data. The audio bytes must be encoded as
    // specified in `RecognitionConfig`. Note: as with all bytes fields, proto
    // buffers use a pure binary representation (not base64).
    bytes audio_content = 2 [ctype = CORD];
  }
}
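
// For illustration only, a well-formed request stream might consist of the
// following `StreamingRecognizeRequest` messages in proto text format (a
// sketch; the audio chunk placeholders stand for raw bytes):
//
//   1. streaming_config {
//        config { encoding: LINEAR16 sample_rate: 16000 }
//      }
//   2. audio_content: "<audio chunk 1>"
//   3. audio_content: "<audio chunk 2>"
//   ... and so on until all audio has been sent.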

// The `StreamingRecognitionConfig` message provides information to the
// recognizer that specifies how to process the request.
message StreamingRecognitionConfig {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Optional] If `false` or omitted, the recognizer will perform continuous
  // recognition (continuing to process audio even if the user pauses
  // speaking) until the client closes the output stream (gRPC API) or until
  // the maximum time limit has been reached. Multiple
  // `StreamingRecognitionResult`s with the `is_final` flag set to `true` may
  // be returned.
  //
  // If `true`, the recognizer will detect a single spoken utterance. When it
  // detects that the user has paused or stopped speaking, it will return an
  // `END_OF_UTTERANCE` event and cease recognition. It will return no more
  // than one `StreamingRecognitionResult` with the `is_final` flag set to
  // `true`.
  bool single_utterance = 2;

  // [Optional] If `true`, interim results (tentative hypotheses) may be
  // returned as they become available (these interim results are indicated
  // with the `is_final=false` flag).
  // If `false` or omitted, only `is_final=true` result(s) are returned.
  bool interim_results = 3;
}
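
// For illustration only, a `StreamingRecognitionConfig` suited to a short
// voice-command interaction might look like this in proto text format (a
// sketch; values are illustrative):
//
//   config { encoding: LINEAR16 sample_rate: 16000 language_code: "en-US" }
//   single_utterance: true
//   interim_results: true
//
// With these settings, the stream may return tentative `is_final=false`
// hypotheses while the user speaks, at most one `is_final=true` result, and
// an `END_OF_UTTERANCE` event once the user pauses.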

// The `RecognitionConfig` message provides information to the recognizer
// that specifies how to process the request.
message RecognitionConfig {
  // Audio encoding of the data sent in the audio message. All encodings
  // support only 1 channel (mono) audio. Only `FLAC` includes a header that
  // describes the bytes of audio that follow the header. The other encodings
  // are raw audio bytes with no header.
  //
  // For best results, the audio source should be captured and transmitted
  // using a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy
  // may be reduced if lossy codecs (such as AMR, AMR_WB, and MULAW) are used
  // to capture or transmit the audio, particularly if background noise is
  // present.
  enum AudioEncoding {
    // Not specified. Will return [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples.
    // This is the only encoding that may be used by `AsyncRecognize`.
    LINEAR16 = 1;

    // This is the recommended encoding for `SyncRecognize` and
    // `StreamingRecognize` because it uses lossless compression; therefore
    // recognition accuracy is not compromised by a lossy codec.
    //
    // The FLAC (Free Lossless Audio Codec) stream encoding is specified at:
    // http://flac.sourceforge.net/documentation.html.
    // Only 16-bit samples are supported.
    // Not all fields in STREAMINFO are supported.
    FLAC = 2;

    // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
    MULAW = 3;

    // Adaptive Multi-Rate Narrowband codec. `sample_rate` must be 8000 Hz.
    AMR = 4;

    // Adaptive Multi-Rate Wideband codec. `sample_rate` must be 16000 Hz.
    AMR_WB = 5;
  }

  // [Required] Encoding of audio data sent in all `RecognitionAudio` messages.
  AudioEncoding encoding = 1;

  // [Required] Sample rate in Hertz of the audio data sent in all
  // `RecognitionAudio` messages. Valid values are 8000-48000.
  // 16000 is optimal. For best results, set the sampling rate of the audio
  // source to 16000 Hz. If that's not possible, use the native sample rate of
  // the audio source (instead of re-sampling).
  int32 sample_rate = 2;

  // [Optional] The language of the supplied audio as a BCP-47 language tag.
  // Example: "en-GB" https://www.rfc-editor.org/rfc/bcp/bcp47.txt
  // If omitted, defaults to "en-US". See
  // [Language Support](/speech/docs/best-practices#language_support) for
  // a list of the currently supported language codes.
  string language_code = 3;

  // [Optional] Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative`
  // messages within each `SpeechRecognitionResult`.
  // The server may return fewer than `max_alternatives`.
  // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
  // `1`. If omitted, defaults to `1`.
  int32 max_alternatives = 4;

  // [Optional] If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered
  // word with asterisks, e.g. "f***". If set to `false` or omitted,
  // profanities won't be filtered out.
  bool profanity_filter = 5;

  // [Optional] A means to provide context to assist the speech recognition.
  SpeechContext speech_context = 6;
}
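
// For illustration only, a fully populated `RecognitionConfig` might look
// like this in proto text format (a sketch; the phrase hints are
// hypothetical):
//
//   encoding: FLAC
//   sample_rate: 16000
//   language_code: "en-GB"
//   max_alternatives: 3
//   profanity_filter: true
//   speech_context { phrases: "weather forecast" phrases: "Gloucester Road" }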

// Provides "hints" to the speech recognizer to favor specific words and
// phrases in the results.
message SpeechContext {
  // [Optional] A list of up to 50 phrases of up to 100 characters each that
  // provides word and phrase "hints" to the speech recognizer so that it is
  // more likely to recognize them.
  repeated string phrases = 1;
}

// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][].
message RecognitionAudio {
  oneof audio_source {
    // The audio data bytes encoded as specified in `RecognitionConfig`.
    // Note: as with all bytes fields, proto buffers use a pure binary
    // representation, whereas JSON representations use base64.
    bytes content = 1 [ctype = CORD];

    // URI that points to a file that contains audio data bytes as specified
    // in `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
    // supported, which must be specified in the following format:
    // `gs://bucket_name/object_name` (other URI formats return
    // [google.rpc.Code.INVALID_ARGUMENT][]). For more information, see
    // [Request URIs](/storage/docs/reference-uris).
    string uri = 2;
  }
}
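
// For illustration only, the two mutually exclusive ways to populate a
// `RecognitionAudio` message (a sketch; exactly one field may be set):
//
//   content: "<binary audio bytes>"      # audio supplied inline
//
// or:
//
//   uri: "gs://bucket_name/object_name"  # audio stored in Google Cloud Storage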

// `SyncRecognizeResponse` is the only message returned to the client by
// `SyncRecognize`. It contains the result as zero or more sequential
// `SpeechRecognitionResult` messages.
message SyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// `AsyncRecognizeResponse` is the only message returned to the client by
// `AsyncRecognize`. It contains the result as zero or more sequential
// `SpeechRecognitionResult` messages.
message AsyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// `StreamingRecognizeResponse` is the only message returned to the client by
// `StreamingRecognize`. It contains the result as zero or more sequential
// `StreamingRecognitionResult` messages.
message StreamingRecognizeResponse {
  // Indicates the type of endpointer event.
  enum EndpointerType {
    // No endpointer event specified.
    ENDPOINTER_EVENT_UNSPECIFIED = 0;

    // Speech has been detected in the audio stream.
    START_OF_SPEECH = 1;

    // Speech has ceased to be detected in the audio stream.
    END_OF_SPEECH = 2;

    // The end of the audio stream has been reached, and it is being
    // processed.
    END_OF_AUDIO = 3;

    // This event is only sent when `single_utterance` is `true`. It indicates
    // that the server has detected the end of the user's speech utterance and
    // expects no additional speech. Therefore, the server will not process
    // additional audio. The client should stop sending additional audio data.
    END_OF_UTTERANCE = 4;
  }

  // [Output-only] If set, returns a [google.rpc.Status][] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // [Output-only] This repeated list contains zero or more results that
  // correspond to consecutive portions of the audio currently being
  // processed. It contains zero or one `is_final=true` result (the newly
  // settled portion), followed by zero or more `is_final=false` results.
  repeated StreamingRecognitionResult results = 2;

  // [Output-only] Indicates the lowest index in the `results` array that has
  // changed. The repeated `StreamingRecognitionResult` results overwrite past
  // results at this index and higher.
  int32 result_index = 3;

  // [Output-only] Indicates the type of endpointer event.
  EndpointerType endpointer_type = 4;
}
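
// For illustration only, one possible series of `StreamingRecognizeResponse`
// messages returned while a user says "hello world" (a sketch; transcripts,
// stability values, and confidences are hypothetical):
//
//   1. endpointer_type: START_OF_SPEECH
//   2. results { alternatives { transcript: "hell" } stability: 0.3 }
//      result_index: 0
//   3. results { alternatives { transcript: "hello" } stability: 0.8 }
//      result_index: 0
//   4. results { alternatives { transcript: "hello world" confidence: 0.92 }
//                is_final: true }
//      result_index: 0
//   5. endpointer_type: END_OF_SPEECH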

// A speech recognition result corresponding to a portion of the audio that is
// currently being processed.
// TODO(gshires): add a comment describing the various repeated interim and
// alternative results fields.
message StreamingRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;

  // [Output-only] If `false`, this `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, this is the final time the
  // speech service will return this particular `StreamingRecognitionResult`;
  // the recognizer will not return any further hypotheses for this portion of
  // the transcript and corresponding audio.
  bool is_final = 2;

  // [Output-only] An estimate of the probability that the recognizer will not
  // change its guess about this interim result. Values range from 0.0
  // (completely unstable) to 1.0 (completely stable). Note that this is not
  // the same as `confidence`, which estimates the probability that a
  // recognition result is correct.
  // This field is only provided for interim results (`is_final=false`).
  // The default of 0.0 is a sentinel value indicating stability was not set.
  float stability = 3;
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // [Output-only] Transcript text representing the words that the user spoke.
  string transcript = 1;

  // [Output-only] The confidence estimate between 0.0 and 1.0. A higher
  // number means the system is more confident that the recognition is
  // correct. This field is typically provided only for the top hypothesis,
  // and only for `is_final=true` results.
  // The default of 0.0 is a sentinel value indicating confidence was not set.
  float confidence = 2;
}
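
// For illustration only, a `SyncRecognizeResponse` carrying two alternative
// hypotheses for a single portion of audio might look like this in proto
// text format (a sketch; transcripts and the confidence are hypothetical):
//
//   results {
//     alternatives { transcript: "how old is the Brooklyn Bridge"
//                    confidence: 0.98 }
//     alternatives { transcript: "how old is the Brooklyn bridge" }
//   }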