// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.speech.v1beta1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1beta1;speech";
option java_multiple_files = true;
option java_outer_classname = "SpeechProto";
option java_package = "com.google.cloud.speech.v1beta1";


// Service that implements Google Cloud Speech API.
service Speech {
  // Performs synchronous speech recognition: receive results after all audio
  // has been sent and processed.
  rpc SyncRecognize(SyncRecognizeRequest) returns (SyncRecognizeResponse) {
    option (google.api.http) = { post: "/v1beta1/speech:syncrecognize" body: "*" };
  }

  // Performs asynchronous speech recognition: receive results via the
  // [google.longrunning.Operations]
  // (/speech/reference/rest/v1beta1/operations#Operation)
  // interface. Returns either an
  // `Operation.error` or an `Operation.response` which contains
  // an `AsyncRecognizeResponse` message.
  rpc AsyncRecognize(AsyncRecognizeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = { post: "/v1beta1/speech:asyncrecognize" body: "*" };
  }

  // Performs bidirectional streaming speech recognition: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse);
}

// The top-level message sent by the client for the `SyncRecognize` method.
message SyncRecognizeRequest {
  // *Required* Provides information to the recognizer that specifies how to
  // process the request.
  RecognitionConfig config = 1;

  // *Required* The audio data to be recognized.
  RecognitionAudio audio = 2;
}
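
// An illustrative `SyncRecognizeRequest` in protobuf text format. The values
// below are hypothetical and only show how `config` and `audio` fit together;
// see `RecognitionConfig` and `RecognitionAudio` for valid ranges and formats.
//
//   config {
//     encoding: FLAC
//     sample_rate: 16000
//     language_code: "en-US"
//   }
//   audio {
//     uri: "gs://bucket_name/object_name"
//   }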

// The top-level message sent by the client for the `AsyncRecognize` method.
message AsyncRecognizeRequest {
  // *Required* Provides information to the recognizer that specifies how to
  // process the request.
  RecognitionConfig config = 1;

  // *Required* The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// The top-level message sent by the client for the `StreamingRecognize` method.
// Multiple `StreamingRecognizeRequest` messages are sent. The first message
// must contain a `streaming_config` message and must not contain `audio` data.
// All subsequent messages must contain `audio` data and must not contain a
// `streaming_config` message.
message StreamingRecognizeRequest {
  // The streaming request, which is either a streaming config or audio content.
  oneof streaming_request {
    // Provides information to the recognizer that specifies how to process the
    // request. The first `StreamingRecognizeRequest` message must contain a
    // `streaming_config` message.
    StreamingRecognitionConfig streaming_config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are sent
    // in sequential `StreamingRecognizeRequest` messages. The first
    // `StreamingRecognizeRequest` message must not contain `audio_content` data
    // and all subsequent `StreamingRecognizeRequest` messages must contain
    // `audio_content` data. The audio bytes must be encoded as specified in
    // `RecognitionConfig`. Note: as with all bytes fields, protocol buffers use
    // a pure binary representation (not base64). See
    // [audio limits](https://cloud.google.com/speech/limits#content).
    bytes audio_content = 2;
  }
}

// Provides information to the recognizer that specifies how to process the
// request.
message StreamingRecognitionConfig {
  // *Required* Provides information to the recognizer that specifies how to
  // process the request.
  RecognitionConfig config = 1;

  // *Optional* If `false` or omitted, the recognizer will perform continuous
  // recognition (continuing to wait for and process audio even if the user
  // pauses speaking) until the client closes the input stream (gRPC API) or
  // until the maximum time limit has been reached. May return multiple
  // `StreamingRecognitionResult`s with the `is_final` flag set to `true`.
  //
  // If `true`, the recognizer will detect a single spoken utterance. When it
  // detects that the user has paused or stopped speaking, it will return an
  // `END_OF_UTTERANCE` event and cease recognition. It will return no more than
  // one `StreamingRecognitionResult` with the `is_final` flag set to `true`.
  bool single_utterance = 2;

  // *Optional* If `true`, interim results (tentative hypotheses) may be
  // returned as they become available (these interim results are indicated with
  // the `is_final=false` flag).
  // If `false` or omitted, only `is_final=true` result(s) are returned.
  bool interim_results = 3;
}
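
// An illustrative first `StreamingRecognizeRequest` in protobuf text format,
// carrying only `streaming_config`; subsequent requests on the same stream
// would carry only `audio_content` chunks. The values below are hypothetical.
//
//   streaming_config {
//     config {
//       encoding: LINEAR16
//       sample_rate: 16000
//       language_code: "en-US"
//     }
//     interim_results: true
//   }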

// Provides information to the recognizer that specifies how to process the
// request.
message RecognitionConfig {
  // Audio encoding of the data sent in the audio message. All encodings support
  // only 1 channel (mono) audio. Only `FLAC` includes a header that describes
  // the bytes of audio that follow the header. The other encodings are raw
  // audio bytes with no header.
  //
  // For best results, the audio source should be captured and transmitted using
  // a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy may be
  // reduced if lossy codecs (such as AMR, AMR_WB and MULAW) are used to capture
  // or transmit the audio, particularly if background noise is present.
  enum AudioEncoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This is the only encoding that may be used by `AsyncRecognize`.
    LINEAR16 = 1;

    // This is the recommended encoding for `SyncRecognize` and
    // `StreamingRecognize` because it uses lossless compression; therefore
    // recognition accuracy is not compromised by a lossy codec.
    //
    // The stream FLAC (Free Lossless Audio Codec) encoding is specified at:
    // http://flac.sourceforge.net/documentation.html.
    // 16-bit and 24-bit samples are supported.
    // Not all fields in STREAMINFO are supported.
    FLAC = 2;

    // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
    MULAW = 3;

    // Adaptive Multi-Rate Narrowband codec. `sample_rate` must be 8000 Hz.
    AMR = 4;

    // Adaptive Multi-Rate Wideband codec. `sample_rate` must be 16000 Hz.
    AMR_WB = 5;
  }

  // *Required* Encoding of audio data sent in all `RecognitionAudio` messages.
  AudioEncoding encoding = 1;

  // *Required* Sample rate in Hertz of the audio data sent in all
  // `RecognitionAudio` messages. Valid values are: 8000-48000.
  // 16000 is optimal. For best results, set the sampling rate of the audio
  // source to 16000 Hz. If that's not possible, use the native sample rate of
  // the audio source (instead of re-sampling).
  int32 sample_rate = 2;

  // *Optional* The language of the supplied audio as a BCP-47 language tag.
  // Example: "en-GB" https://www.rfc-editor.org/rfc/bcp/bcp47.txt
  // If omitted, defaults to "en-US". See
  // [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 3;

  // *Optional* Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechRecognitionResult`.
  // The server may return fewer than `max_alternatives`.
  // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
  // one. If omitted, will return a maximum of one.
  int32 max_alternatives = 4;

  // *Optional* If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool profanity_filter = 5;

  // *Optional* A means to provide context to assist the speech recognition.
  SpeechContext speech_context = 6;
}

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // *Optional* A list of strings containing word and phrase "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1;
}
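
// An illustrative `SpeechContext` in protobuf text format, as it would appear
// in the `speech_context` field of a `RecognitionConfig`. The hint phrases are
// hypothetical and only demonstrate the shape of the message.
//
//   speech_context {
//     phrases: "weather forecast"
//     phrases: "set an alarm"
//   }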

// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
// [audio limits](https://cloud.google.com/speech/limits#content).
message RecognitionAudio {
  // The audio source, which is either inline content or a Google Cloud Storage URI.
  oneof audio_source {
    // The audio data bytes encoded as specified in
    // `RecognitionConfig`. Note: as with all bytes fields, protocol buffers use
    // a pure binary representation, whereas JSON representations use base64.
    bytes content = 1;

    // URI that points to a file that contains audio data bytes as specified in
    // `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
    // supported, which must be specified in the following format:
    // `gs://bucket_name/object_name` (other URI formats return
    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
    // [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
    string uri = 2;
  }
}

// The only message returned to the client by the `SyncRecognize` method. It
// contains the result as zero or more sequential `SpeechRecognitionResult`
// messages.
message SyncRecognizeResponse {
  // *Output-only* Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// The only message returned to the client by `AsyncRecognize`. It contains the
// result as zero or more sequential `SpeechRecognitionResult` messages. It is
// included in the `result.response` field of the `Operation` returned by the
// `GetOperation` call of the `google::longrunning::Operations` service.
message AsyncRecognizeResponse {
  // *Output-only* Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// Describes the progress of a long-running `AsyncRecognize` call. It is
// included in the `metadata` field of the `Operation` returned by the
// `GetOperation` call of the `google::longrunning::Operations` service.
message AsyncRecognizeMetadata {
  // Approximate percentage of audio processed thus far. Guaranteed to be 100
  // when the audio is fully processed and the results are available.
  int32 progress_percent = 1;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 2;

  // Time of the most recent processing update.
  google.protobuf.Timestamp last_update_time = 3;
}
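
// An illustrative `SyncRecognizeResponse` (or, equivalently, the
// `AsyncRecognizeResponse` carried in `Operation.response`) in protobuf text
// format. The transcript and confidence values are hypothetical.
//
//   results {
//     alternatives {
//       transcript: "to be or not to be"
//       confidence: 0.92
//     }
//   }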

// `StreamingRecognizeResponse` is the only message returned to the client by
// `StreamingRecognize`. A series of one or more `StreamingRecognizeResponse`
// messages are streamed back to the client.
//
// Here's an example of a series of ten `StreamingRecognizeResponse`s that might
// be returned while processing audio:
//
// 1. endpointer_type: START_OF_SPEECH
//
// 2. results { alternatives { transcript: "tube" } stability: 0.01 }
//    result_index: 0
//
// 3. results { alternatives { transcript: "to be a" } stability: 0.01 }
//    result_index: 0
//
// 4. results { alternatives { transcript: "to be" } stability: 0.9 }
//    results { alternatives { transcript: " or not to be" } stability: 0.01 }
//    result_index: 0
//
// 5. results { alternatives { transcript: "to be or not to be"
//                             confidence: 0.92 }
//              alternatives { transcript: "to bee or not to bee" }
//              is_final: true }
//    result_index: 0
//
// 6. results { alternatives { transcript: " that's" } stability: 0.01 }
//    result_index: 1
//
// 7. results { alternatives { transcript: " that is" } stability: 0.9 }
//    results { alternatives { transcript: " the question" } stability: 0.01 }
//    result_index: 1
//
// 8. endpointer_type: END_OF_SPEECH
//
// 9. results { alternatives { transcript: " that is the question"
//                             confidence: 0.98 }
//              alternatives { transcript: " that was the question" }
//              is_final: true }
//    result_index: 1
//
// 10. endpointer_type: END_OF_AUDIO
//
// Notes:
//
// - Only two of the above responses (#5 and #9) contain final results; they are
//   indicated by `is_final: true`. Concatenating these together generates the
//   full transcript: "to be or not to be that is the question".
//
// - The others contain interim `results`. #4 and #7 contain two interim
//   `results`: the first portion has a high stability and is less likely to
//   change; the second portion has a low stability and is very likely to
//   change. A UI designer might choose to show only high stability `results`.
//
// - The specific `stability` and `confidence` values shown above are only for
//   illustrative purposes. Actual values may vary.
//
// - The `result_index` indicates the portion of audio that has had final
//   results returned, and is no longer being processed. For example, the
//   `results` in #6 and later correspond to the portion of audio after
//   "to be or not to be".
message StreamingRecognizeResponse {
  // Indicates the type of endpointer event.
  enum EndpointerType {
    // No endpointer event specified.
    ENDPOINTER_EVENT_UNSPECIFIED = 0;

    // Speech has been detected in the audio stream, and the service is
    // beginning to process it.
    START_OF_SPEECH = 1;

    // Speech has ceased to be detected in the audio stream. (For example, the
    // user may have paused after speaking.) If `single_utterance` is `false`,
    // the service will continue to process audio, and if subsequent speech is
    // detected, will send another START_OF_SPEECH event.
    END_OF_SPEECH = 2;

    // This event is sent after the client has half-closed the input stream of
    // the gRPC connection and the server has received all of the audio. (The
    // server may still be processing the audio and may subsequently return
    // additional results.)
    END_OF_AUDIO = 3;

    // This event is only sent when `single_utterance` is `true`. It indicates
    // that the server has detected the end of the user's speech utterance and
    // expects no additional speech. Therefore, the server will not process
    // additional audio (although it may subsequently return additional
    // results). The client should stop sending additional audio data,
    // half-close the gRPC connection, and wait for any additional results
    // until the server closes the gRPC connection.
    END_OF_UTTERANCE = 4;
  }

  // *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // *Output-only* This repeated list contains zero or more results that
  // correspond to consecutive portions of the audio currently being processed.
  // It contains zero or one `is_final=true` result (the newly settled portion),
  // followed by zero or more `is_final=false` results.
  repeated StreamingRecognitionResult results = 2;

  // *Output-only* Indicates the lowest index in the `results` array that has
  // changed. The repeated `StreamingRecognitionResult` results overwrite past
  // results at this index and higher.
  int32 result_index = 3;

  // *Output-only* Indicates the type of endpointer event.
  EndpointerType endpointer_type = 4;
}

// A streaming speech recognition result corresponding to a portion of the audio
// that is currently being processed.
message StreamingRecognitionResult {
  // *Output-only* May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;

  // *Output-only* If `false`, this `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, this is the final time the
  // speech service will return this particular `StreamingRecognitionResult`;
  // the recognizer will not return any further hypotheses for this portion of
  // the transcript and corresponding audio.
  bool is_final = 2;

  // *Output-only* An estimate of the likelihood that the recognizer will not
  // change its guess about this interim result. Values range from 0.0
  // (completely unstable) to 1.0 (completely stable).
  // This field is only provided for interim results (`is_final=false`).
  // The default of 0.0 is a sentinel value indicating `stability` was not set.
  float stability = 3;
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
  // *Output-only* May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // *Output-only* Transcript text representing the words that the user spoke.
  string transcript = 1;

  // *Output-only* The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is typically provided only for the top hypothesis, and
  // only for `is_final=true` results. Clients should not rely on the
  // `confidence` field as it is not guaranteed to be accurate, or even set, in
  // any of the results.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2;
}