// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.mediatranslation.v1beta1;

import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/rpc/status.proto";

option cc_enable_arenas = true;
option csharp_namespace = "Google.Cloud.MediaTranslation.V1Beta1";
option go_package = "cloud.google.com/go/mediatranslation/apiv1beta1/mediatranslationpb;mediatranslationpb";
option java_multiple_files = true;
option java_outer_classname = "MediaTranslationProto";
option java_package = "com.google.cloud.mediatranslation.v1beta1";
option php_namespace = "Google\\Cloud\\MediaTranslation\\V1beta1";
option ruby_package = "Google::Cloud::MediaTranslation::V1beta1";

// Provides translation from/to media types.
service SpeechTranslationService {
  option (google.api.default_host) = "mediatranslation.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs bidirectional streaming speech translation: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingTranslateSpeech(stream StreamingTranslateSpeechRequest)
      returns (stream StreamingTranslateSpeechResponse) {}
}

// Provides information to the speech translation that specifies how to process
// the request.
message TranslateSpeechConfig {
  // Required. Encoding of audio data.
  // Supported formats:
  //
  // - `linear16`
  //
  //   Uncompressed 16-bit signed little-endian samples (Linear PCM).
  //
  // - `flac`
  //
  //   `flac` (Free Lossless Audio Codec) is the recommended encoding
  //   because it is lossless, so recognition is not compromised, and it
  //   requires only about half the bandwidth of `linear16`.
  //
  // - `mulaw`
  //
  //   8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  //
  // - `amr`
  //
  //   Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  //
  // - `amr-wb`
  //
  //   Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  //
  // - `ogg-opus`
  //
  //   Opus-encoded audio frames in an [Ogg](https://wikipedia.org/wiki/Ogg)
  //   container. `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000,
  //   or 48000.
  //
  // - `mp3`
  //
  //   MP3 audio. Supports all standard MP3 bitrates (which range from 32 to
  //   320 kbps). When using this encoding, `sample_rate_hertz` must match the
  //   sample rate of the file being used.
  string audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Source language code (BCP-47) of the input audio.
  string source_language_code = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. Target language code (BCP-47) of the output.
  string target_language_code = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. Sample rate in Hertz of the audio data. Valid values are
  // 8000 through 48000; 16000 is optimal. For best results, set the sampling
  // rate of the audio source to 16000 Hz. If that's not possible, use the
  // native sample rate of the audio source (instead of re-sampling).
  int32 sample_rate_hertz = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. `google-provided-model/video` and
  // `google-provided-model/enhanced-phone-call` are premium models.
  // `google-provided-model/phone-call` is not a premium model.
  string model = 5 [(google.api.field_behavior) = OPTIONAL];
}
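
// For illustration only (not part of the API surface): a hypothetical
// `TranslateSpeechConfig` in protobuf text format, assuming 16 kHz Linear PCM
// audio translated from English to Spanish. The language codes and values
// below are example assumptions, not an exhaustive or authoritative list.
//
//   audio_encoding: "linear16"
//   source_language_code: "en-US"
//   target_language_code: "es-ES"
//   sample_rate_hertz: 16000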
// Config used for streaming translation.
message StreamingTranslateSpeechConfig {
  // Required. The common config for all the following audio contents.
  TranslateSpeechConfig audio_config = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. If `false` or omitted, the system performs
  // continuous translation (continuing to wait for and process audio even if
  // the user pauses speaking) until the client closes the input stream (gRPC
  // API) or until the maximum time limit has been reached. May return multiple
  // `StreamingTranslateSpeechResult`s with the `is_final` flag set to `true`.
  //
  // If `true`, the speech translator will detect a single spoken utterance.
  // When it detects that the user has paused or stopped speaking, it will
  // return an `END_OF_SINGLE_UTTERANCE` event and cease translation.
  // When the client receives the `END_OF_SINGLE_UTTERANCE` event, it should
  // stop sending requests but keep receiving the remaining responses until
  // the stream is terminated. To construct the complete sentence in a
  // streaming way, override the previous partial result when its `is_final`
  // was `false`, or append to it when its `is_final` was `true`.
  bool single_utterance = 2 [(google.api.field_behavior) = OPTIONAL];
}

// The top-level message sent by the client for the `StreamingTranslateSpeech`
// method. Multiple `StreamingTranslateSpeechRequest` messages are sent. The
// first message must contain a `streaming_config` message and must not contain
// `audio_content` data. All subsequent messages must contain `audio_content`
// data and must not contain a `streaming_config` message.
message StreamingTranslateSpeechRequest {
  // The streaming request, which is either a streaming config or content.
  oneof streaming_request {
    // Provides information to the recognizer that specifies how to process the
    // request. The first `StreamingTranslateSpeechRequest` message must contain
    // a `streaming_config` message.
    StreamingTranslateSpeechConfig streaming_config = 1;

    // The audio data to be translated. Sequential chunks of audio data are sent
    // in sequential `StreamingTranslateSpeechRequest` messages. The first
    // `StreamingTranslateSpeechRequest` message must not contain
    // `audio_content` data, and all subsequent `StreamingTranslateSpeechRequest`
    // messages must contain `audio_content` data. The audio bytes must be
    // encoded as specified in `StreamingTranslateSpeechConfig`. Note: as with
    // all bytes fields, protocol buffers use a pure binary representation (not
    // base64).
    bytes audio_content = 2;
  }
}
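
// For illustration only: a sketch of the request sequence described above,
// assuming a config like the earlier example. The first request carries only
// `streaming_config`; every subsequent request carries only `audio_content`.
//
//   request 1: { streaming_config { audio_config { ... } single_utterance: true } }
//   request 2: { audio_content: <first chunk of raw audio bytes> }
//   request 3: { audio_content: <next chunk of raw audio bytes> }
//   ...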
// A streaming speech translation result corresponding to a portion of the
// audio that is currently being processed.
message StreamingTranslateSpeechResult {
  // Text translation result.
  message TextTranslationResult {
    // Output only. The translated sentence.
    string translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. If `false`, this `StreamingTranslateSpeechResult` represents
    // an interim result that may change. If `true`, this is the final time the
    // translation service will return this particular
    // `StreamingTranslateSpeechResult`; the streaming translator will not
    // return any further hypotheses for this portion of the transcript and the
    // corresponding audio.
    bool is_final = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Translation result.
  oneof result {
    // Text translation result.
    TextTranslationResult text_translation_result = 1;
  }
}

// A streaming speech translation response corresponding to a portion of the
// audio that is currently being processed.
message StreamingTranslateSpeechResponse {
  // Indicates the type of speech event.
  enum SpeechEventType {
    // No speech event specified.
    SPEECH_EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). When the client receives the
    // `END_OF_SINGLE_UTTERANCE` event, it should stop sending requests but
    // keep receiving the remaining responses until the stream is terminated.
    // To construct the complete sentence in a streaming way, override the
    // previous partial result when its `is_final` was `false`, or append to it
    // when its `is_final` was `true` (see the sketch at the end of this file).
    // This event is only sent if `single_utterance` was set to `true`, and is
    // not used otherwise.
    END_OF_SINGLE_UTTERANCE = 1;
  }

  // Output only. If set, returns a [google.rpc.Status][google.rpc.Status]
  // message that specifies the error for the operation.
  google.rpc.Status error = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The translation result that is currently being processed
  // (`is_final` may be `true` or `false`).
  StreamingTranslateSpeechResult result = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Indicates the type of speech event.
  SpeechEventType speech_event_type = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}
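
// For illustration only: a pseudocode sketch (as a comment, not part of this
// proto) of the override/append rule for assembling a running translation from
// streamed responses. The variable names below are assumptions for the example.
//
//   finalized = ""   // text committed by responses with is_final == true
//   pending = ""     // latest interim hypothesis; each new interim overrides it
//   for each response on the stream:
//     r = response.result.text_translation_result
//     if r.is_final:
//       finalized += r.translation   // append: this portion will not change
//       pending = ""
//     else:
//       pending = r.translation      // override the previous interim result
//   display(finalized + pending)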