// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.speech.v1beta1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1beta1;speech";
option java_multiple_files = true;
option java_outer_classname = "SpeechProto";
option java_package = "com.google.cloud.speech.v1beta1";


// Service that implements Google Cloud Speech API.
service Speech {
  // Performs synchronous speech recognition: receive results after all audio
  // has been sent and processed.
  rpc SyncRecognize(SyncRecognizeRequest) returns (SyncRecognizeResponse) {
    option (google.api.http) = { post: "/v1beta1/speech:syncrecognize" body: "*" };
  }

  // Performs asynchronous speech recognition: receive results via the
  // [google.longrunning.Operations]
  // (/speech/reference/rest/v1beta1/operations#Operation)
  // interface. Returns either an
  // `Operation.error` or an `Operation.response` which contains
  // an `AsyncRecognizeResponse` message.
  rpc AsyncRecognize(AsyncRecognizeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = { post: "/v1beta1/speech:asyncrecognize" body: "*" };
  }

  // Performs bidirectional streaming speech recognition: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse);
}

// The top-level message sent by the client for the `SyncRecognize` method.
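// For illustration only, a minimal request in protocol buffer text format
// might look like the following (the field values below are placeholders,
// not recommendations):
//
//     config {
//       encoding: FLAC
//       sample_rate: 16000
//       language_code: "en-US"
//     }
//     audio {
//       uri: "gs://bucket_name/object_name"
//     }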
message SyncRecognizeRequest {
  // *Required* Provides information to the recognizer that specifies how to
  // process the request.
  RecognitionConfig config = 1;

  // *Required* The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// The top-level message sent by the client for the `AsyncRecognize` method.
message AsyncRecognizeRequest {
  // *Required* Provides information to the recognizer that specifies how to
  // process the request.
  RecognitionConfig config = 1;

  // *Required* The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// The top-level message sent by the client for the `StreamingRecognize` method.
// Multiple `StreamingRecognizeRequest` messages are sent. The first message
// must contain a `streaming_config` message and must not contain `audio` data.
// All subsequent messages must contain `audio` data and must not contain a
// `streaming_config` message.
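// For illustration only, a client might send the following sequence of
// messages (shown in protocol buffer text format; the configuration values
// and audio bytes are placeholders):
//
// 1. streaming_config {
//      config { encoding: LINEAR16 sample_rate: 16000 language_code: "en-US" }
//      interim_results: true
//    }
//
// 2. audio_content: "<first chunk of raw LINEAR16 bytes>"
//
// 3. audio_content: "<next chunk of raw LINEAR16 bytes>"
//
// ... and so on, until the client half-closes the stream.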
message StreamingRecognizeRequest {
  // The streaming request, which is either a streaming config or audio content.
  oneof streaming_request {
    // Provides information to the recognizer that specifies how to process the
    // request. The first `StreamingRecognizeRequest` message must contain a
    // `streaming_config` message.
    StreamingRecognitionConfig streaming_config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are sent
    // in sequential `StreamingRecognizeRequest` messages. The first
    // `StreamingRecognizeRequest` message must not contain `audio_content` data
    // and all subsequent `StreamingRecognizeRequest` messages must contain
    // `audio_content` data. The audio bytes must be encoded as specified in
    // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
    // pure binary representation (not base64). See
    // [audio limits](https://cloud.google.com/speech/limits#content).
    bytes audio_content = 2;
  }
}

// Provides information to the recognizer that specifies how to process the
// request.
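// For illustration only, a configuration for a short voice-command style
// interaction might look like the following (placeholder values):
//
//     config { encoding: LINEAR16 sample_rate: 16000 }
//     single_utterance: true
//     interim_results: false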
message StreamingRecognitionConfig {
  // *Required* Provides information to the recognizer that specifies how to
  // process the request.
  RecognitionConfig config = 1;

  // *Optional* If `false` or omitted, the recognizer will perform continuous
  // recognition (continuing to wait for and process audio even if the user
  // pauses speaking) until the client closes the input stream (gRPC API) or
  // until the maximum time limit has been reached. May return multiple
  // `StreamingRecognitionResult`s with the `is_final` flag set to `true`.
  //
  // If `true`, the recognizer will detect a single spoken utterance. When it
  // detects that the user has paused or stopped speaking, it will return an
  // `END_OF_UTTERANCE` event and cease recognition. It will return no more than
  // one `StreamingRecognitionResult` with the `is_final` flag set to `true`.
  bool single_utterance = 2;

  // *Optional* If `true`, interim results (tentative hypotheses) may be
  // returned as they become available (these interim results are indicated with
  // the `is_final=false` flag).
  // If `false` or omitted, only `is_final=true` result(s) are returned.
  bool interim_results = 3;
}

// Provides information to the recognizer that specifies how to process the
// request.
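// For illustration only, a more fully populated configuration might look like
// the following in protocol buffer text format (the values are placeholders,
// not recommendations):
//
//     encoding: FLAC
//     sample_rate: 16000
//     language_code: "en-GB"
//     max_alternatives: 2
//     profanity_filter: true
//     speech_context {
//       phrases: "weather forecast"
//       phrases: "temperature"
//     }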
message RecognitionConfig {
  // Audio encoding of the data sent in the audio message. All encodings support
  // only 1 channel (mono) audio. Only `FLAC` includes a header that describes
  // the bytes of audio that follow the header. The other encodings are raw
  // audio bytes with no header.
  //
  // For best results, the audio source should be captured and transmitted using
  // a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy may be
  // reduced if lossy codecs (such as AMR, AMR_WB and MULAW) are used to capture
  // or transmit the audio, particularly if background noise is present.
  enum AudioEncoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This is the only encoding that may be used by `AsyncRecognize`.
    LINEAR16 = 1;

    // This is the recommended encoding for `SyncRecognize` and
    // `StreamingRecognize` because it uses lossless compression; therefore
    // recognition accuracy is not compromised by a lossy codec.
    //
    // The stream FLAC (Free Lossless Audio Codec) encoding is specified at:
    // http://flac.sourceforge.net/documentation.html.
    // 16-bit and 24-bit samples are supported.
    // Not all fields in STREAMINFO are supported.
    FLAC = 2;

    // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
    MULAW = 3;

    // Adaptive Multi-Rate Narrowband codec. `sample_rate` must be 8000 Hz.
    AMR = 4;

    // Adaptive Multi-Rate Wideband codec. `sample_rate` must be 16000 Hz.
    AMR_WB = 5;
  }

  // *Required* Encoding of audio data sent in all `RecognitionAudio` messages.
  AudioEncoding encoding = 1;

  // *Required* Sample rate in Hertz of the audio data sent in all
  // `RecognitionAudio` messages. Valid values are: 8000-48000.
  // 16000 is optimal. For best results, set the sampling rate of the audio
  // source to 16000 Hz. If that's not possible, use the native sample rate of
  // the audio source (instead of re-sampling).
  int32 sample_rate = 2;

  // *Optional* The language of the supplied audio as a BCP-47 language tag.
  // Example: "en-GB"  https://www.rfc-editor.org/rfc/bcp/bcp47.txt
  // If omitted, defaults to "en-US". See
  // [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 3;

  // *Optional* Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechRecognitionResult`.
  // The server may return fewer than `max_alternatives`.
  // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
  // one. If omitted, will return a maximum of one.
  int32 max_alternatives = 4;

  // *Optional* If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool profanity_filter = 5;

  // *Optional* A means to provide context to assist the speech recognition.
  SpeechContext speech_context = 6;
}

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // *Optional* A list of strings containing words and phrases "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1;
}

// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
// [audio limits](https://cloud.google.com/speech/limits#content).
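// For illustration only, the two mutually exclusive forms look like the
// following (placeholder values); exactly one of them may be set:
//
//     audio { content: "<raw audio bytes encoded per RecognitionConfig>" }
//
//     audio { uri: "gs://bucket_name/object_name" }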
message RecognitionAudio {
  // The audio source, which is either inline content or a Google Cloud Storage URI.
  oneof audio_source {
    // The audio data bytes encoded as specified in
    // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
    // pure binary representation, whereas JSON representations use base64.
    bytes content = 1;

    // URI that points to a file that contains audio data bytes as specified in
    // `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
    // supported, which must be specified in the following format:
    // `gs://bucket_name/object_name` (other URI formats return
    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
    // [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
    string uri = 2;
  }
}

// The only message returned to the client by the `SyncRecognize` method. It
// contains the result as zero or more sequential `SpeechRecognitionResult`
// messages.
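// For illustration only, a response might look like the following in protocol
// buffer text format (the transcript and confidence values are placeholders):
//
//     results {
//       alternatives {
//         transcript: "how old is the Brooklyn Bridge"
//         confidence: 0.98
//       }
//     }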
message SyncRecognizeResponse {
  // *Output-only* Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// The only message returned to the client by `AsyncRecognize`. It contains the
// result as zero or more sequential `SpeechRecognitionResult` messages. It is
// included in the `result.response` field of the `Operation` returned by the
// `GetOperation` call of the `google::longrunning::Operations` service.
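// For illustration only, a completed `Operation` retrieved via `GetOperation`
// might carry content such as the following (shown schematically with
// placeholder values; in the actual `Operation` the `metadata` and `response`
// fields are packed as `google.protobuf.Any`):
//
//     done: true
//     metadata: AsyncRecognizeMetadata { progress_percent: 100 }
//     response: AsyncRecognizeResponse {
//       results {
//         alternatives { transcript: "to be or not to be" confidence: 0.92 }
//       }
//     }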
message AsyncRecognizeResponse {
  // *Output-only* Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// Describes the progress of a long-running `AsyncRecognize` call. It is
// included in the `metadata` field of the `Operation` returned by the
// `GetOperation` call of the `google::longrunning::Operations` service.
message AsyncRecognizeMetadata {
  // Approximate percentage of audio processed thus far. Guaranteed to be 100
  // when the audio is fully processed and the results are available.
  int32 progress_percent = 1;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 2;

  // Time of the most recent processing update.
  google.protobuf.Timestamp last_update_time = 3;
}

// `StreamingRecognizeResponse` is the only message returned to the client by
// `StreamingRecognize`. A series of one or more `StreamingRecognizeResponse`
// messages are streamed back to the client.
//
// Here's an example of a series of ten `StreamingRecognizeResponse`s that might
// be returned while processing audio:
//
// 1. endpointer_type: START_OF_SPEECH
//
// 2. results { alternatives { transcript: "tube" } stability: 0.01 }
//    result_index: 0
//
// 3. results { alternatives { transcript: "to be a" } stability: 0.01 }
//    result_index: 0
//
// 4. results { alternatives { transcript: "to be" } stability: 0.9 }
//    results { alternatives { transcript: " or not to be" } stability: 0.01 }
//    result_index: 0
//
// 5. results { alternatives { transcript: "to be or not to be"
//                             confidence: 0.92 }
//              alternatives { transcript: "to bee or not to bee" }
//              is_final: true }
//    result_index: 0
//
// 6. results { alternatives { transcript: " that's" } stability: 0.01 }
//    result_index: 1
//
// 7. results { alternatives { transcript: " that is" } stability: 0.9 }
//    results { alternatives { transcript: " the question" } stability: 0.01 }
//    result_index: 1
//
// 8. endpointer_type: END_OF_SPEECH
//
// 9. results { alternatives { transcript: " that is the question"
//                             confidence: 0.98 }
//              alternatives { transcript: " that was the question" }
//              is_final: true }
//    result_index: 1
//
// 10. endpointer_type: END_OF_AUDIO
//
// Notes:
//
// - Only two of the above responses (#5 and #9) contain final results; they
//   are indicated by `is_final: true`. Concatenating these together generates
//   the full transcript: "to be or not to be that is the question".
//
// - The others contain interim `results`. #4 and #7 contain two interim
//   `results`: the first portion has a high stability and is less likely to
//   change, while the second portion has a low stability and is very likely to
//   change. A UI designer might choose to show only high-stability `results`.
//
// - The specific `stability` and `confidence` values shown above are only for
//   illustrative purposes. Actual values may vary.
//
// - The `result_index` indicates the portion of audio that has had final
//   results returned, and is no longer being processed. For example, the
//   `results` in #6 and later correspond to the portion of audio after
//   "to be or not to be".
message StreamingRecognizeResponse {
  // Indicates the type of endpointer event.
  enum EndpointerType {
    // No endpointer event specified.
    ENDPOINTER_EVENT_UNSPECIFIED = 0;

    // Speech has been detected in the audio stream, and the service is
    // beginning to process it.
    START_OF_SPEECH = 1;

    // Speech has ceased to be detected in the audio stream. (For example, the
    // user may have paused after speaking.) If `single_utterance` is `false`,
    // the service will continue to process audio, and if subsequent speech is
    // detected, will send another START_OF_SPEECH event.
    END_OF_SPEECH = 2;

    // This event is sent after the client has half-closed the input stream of
    // the gRPC connection and the server has received all of the audio. (The
    // server may still be processing the audio and may subsequently return
    // additional results.)
    END_OF_AUDIO = 3;

    // This event is only sent when `single_utterance` is `true`. It indicates
    // that the server has detected the end of the user's speech utterance and
    // expects no additional speech. Therefore, the server will not process
    // additional audio (although it may subsequently return additional
    // results). The client should stop sending additional audio data,
    // half-close the gRPC connection, and wait for any additional results
    // until the server closes the gRPC connection.
    END_OF_UTTERANCE = 4;
  }

  // *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // *Output-only* This repeated list contains zero or more results that
  // correspond to consecutive portions of the audio currently being processed.
  // It contains zero or one `is_final=true` result (the newly settled portion),
  // followed by zero or more `is_final=false` results.
  repeated StreamingRecognitionResult results = 2;

  // *Output-only* Indicates the lowest index in the `results` array that has
  // changed. The repeated `StreamingRecognitionResult` results overwrite past
  // results at this index and higher.
  int32 result_index = 3;

  // *Output-only* Indicates the type of endpointer event.
  EndpointerType endpointer_type = 4;
}

// A streaming speech recognition result corresponding to a portion of the audio
// that is currently being processed.
message StreamingRecognitionResult {
  // *Output-only* May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;

  // *Output-only* If `false`, this `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, this is the final time the
  // speech service will return this particular `StreamingRecognitionResult`;
  // the recognizer will not return any further hypotheses for this portion of
  // the transcript and corresponding audio.
  bool is_final = 2;

  // *Output-only* An estimate of the likelihood that the recognizer will not
  // change its guess about this interim result. Values range from 0.0
  // (completely unstable) to 1.0 (completely stable).
  // This field is only provided for interim results (`is_final=false`).
  // The default of 0.0 is a sentinel value indicating `stability` was not set.
  float stability = 3;
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
  // *Output-only* May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // *Output-only* Transcript text representing the words that the user spoke.
  string transcript = 1;

  // *Output-only* The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is typically provided only for the top hypothesis, and
  // only for `is_final=true` results. Clients should not rely on the
  // `confidence` field as it is not guaranteed to be accurate, or even set, in
  // any of the results.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2;
}