// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.mediatranslation.v1beta1;

import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/rpc/status.proto";

option cc_enable_arenas = true;
option csharp_namespace = "Google.Cloud.MediaTranslation.V1Beta1";
option go_package = "cloud.google.com/go/mediatranslation/apiv1beta1/mediatranslationpb;mediatranslationpb";
option java_multiple_files = true;
option java_outer_classname = "MediaTranslationProto";
option java_package = "com.google.cloud.mediatranslation.v1beta1";
option php_namespace = "Google\\Cloud\\MediaTranslation\\V1beta1";
option ruby_package = "Google::Cloud::MediaTranslation::V1beta1";

// Provides translation from/to media types.
service SpeechTranslationService {
  option (google.api.default_host) = "mediatranslation.googleapis.com";
  option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";

  // Performs bidirectional streaming speech translation: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
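  //
  // A minimal end-to-end sketch (Python) follows. It assumes the generated
  // client library (`google.cloud.mediatranslation_v1beta1`) and an
  // `audio_chunks` iterable of raw byte strings; both are illustrative, not
  // part of this definition.
  //
  //     from google.cloud import mediatranslation_v1beta1 as mt
  //
  //     client = mt.SpeechTranslationServiceClient()
  //
  //     def requests():
  //         # First request: config only, no audio content.
  //         yield mt.StreamingTranslateSpeechRequest(
  //             streaming_config=mt.StreamingTranslateSpeechConfig(
  //                 audio_config=mt.TranslateSpeechConfig(
  //                     audio_encoding="linear16",
  //                     source_language_code="en-US",
  //                     target_language_code="fr-FR",
  //                     sample_rate_hertz=16000)))
  //         # Every subsequent request: audio content only.
  //         for chunk in audio_chunks:
  //             yield mt.StreamingTranslateSpeechRequest(audio_content=chunk)
  //
  //     for response in client.streaming_translate_speech(requests=requests()):
  //         result = response.result.text_translation_result
  //         print(result.translation)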
  rpc StreamingTranslateSpeech(stream StreamingTranslateSpeechRequest)
      returns (stream StreamingTranslateSpeechResponse) {
  }
}

// Provides information to the speech translation that specifies how to process
// the request.
message TranslateSpeechConfig {
  // Required. Encoding of audio data.
  // Supported formats:
  //
  // - `linear16`
  //
  //   Uncompressed 16-bit signed little-endian samples (Linear PCM).
  //
  // - `flac`
  //
  //   `flac` (Free Lossless Audio Codec) is the recommended encoding
  //   because it is lossless--therefore recognition is not compromised--and
  //   requires only about half the bandwidth of `linear16`.
  //
  // - `mulaw`
  //
  //   8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  //
  // - `amr`
  //
  //   Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  //
  // - `amr-wb`
  //
  //   Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  //
  // - `ogg-opus`
  //
  //   Opus encoded audio frames in an [Ogg](https://wikipedia.org/wiki/Ogg)
  //   container. `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000,
  //   or 48000.
  //
  // - `mp3`
  //
  //   MP3 audio. All standard MP3 bitrates are supported (32-320 kbps). When
  //   using this encoding, `sample_rate_hertz` must match the sample rate of
  //   the file being used.
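  //
  // As a minimal illustration of the encoding/sample-rate coupling above
  // (Python; the construction mirrors this config, but the field values are
  // illustrative):
  //
  //     config = TranslateSpeechConfig(
  //         audio_encoding="amr",
  //         source_language_code="es-ES",
  //         target_language_code="en-US",
  //         sample_rate_hertz=8000)  # `amr` requires exactly 8000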
  string audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Source language code (BCP-47) of the input audio.
  string source_language_code = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. Target language code (BCP-47) of the output.
  string target_language_code = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. Sample rate in Hertz of the audio data. Valid values are:
  // 8000-48000. 16000 is optimal. For best results, set the sampling rate of
  // the audio source to 16000 Hz. If that's not possible, use the native sample
  // rate of the audio source (instead of re-sampling).
  int32 sample_rate_hertz = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. `google-provided-model/video` and
  // `google-provided-model/enhanced-phone-call` are premium models.
  // `google-provided-model/phone-call` is not a premium model.
  string model = 5 [(google.api.field_behavior) = OPTIONAL];
}

// Config used for streaming translation.
message StreamingTranslateSpeechConfig {
  // Required. The common config for all the following audio contents.
  TranslateSpeechConfig audio_config = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. If `false` or omitted, the system performs
  // continuous translation (continuing to wait for and process audio even if
  // the user pauses speaking) until the client closes the input stream (gRPC
  // API) or until the maximum time limit has been reached. May return multiple
  // `StreamingTranslateSpeechResult`s with the `is_final` flag set to `true`.
  //
  // If `true`, the speech translator will detect a single spoken utterance.
  // When it detects that the user has paused or stopped speaking, it will
  // return an `END_OF_SINGLE_UTTERANCE` event and cease translation.
  // When the client receives the `END_OF_SINGLE_UTTERANCE` event, it should
  // stop sending requests but keep receiving the remaining responses until
  // the stream is terminated. To construct the complete sentence in a
  // streaming fashion, the client should overwrite the previous partial
  // result when the preceding response's `is_final` was `false`, and append
  // to it when it was `true`, as sketched below.
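  //
  // A minimal accumulation sketch (Python; `responses` is assumed to be the
  // stream returned by `StreamingTranslateSpeech`, and all names are
  // illustrative):
  //
  //     committed = ""
  //     for response in responses:
  //         result = response.result.text_translation_result
  //         # An interim hypothesis overwrites the uncommitted tail; a final
  //         # one is appended to the committed text.
  //         display = committed + result.translation
  //         if result.is_final:
  //             committed = display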
  bool single_utterance = 2 [(google.api.field_behavior) = OPTIONAL];
}

// The top-level message sent by the client for the `StreamingTranslateSpeech`
// method. Multiple `StreamingTranslateSpeechRequest` messages are sent. The
// first message must contain a `streaming_config` message and must not contain
// `audio_content` data. All subsequent messages must contain `audio_content`
// data and must not contain a `streaming_config` message.
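//
// For illustration, the audio portion of such a stream might be produced from
// a raw audio file as follows (Python; the path, chunk size, and helper name
// are assumptions, not part of this definition):
//
//     def audio_requests(path, chunk_size=4096):
//         # Raw bytes go straight into `audio_content`; no base64 encoding.
//         with open(path, "rb") as f:
//             while chunk := f.read(chunk_size):
//                 yield StreamingTranslateSpeechRequest(audio_content=chunk)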
message StreamingTranslateSpeechRequest {
  // The streaming request, which is either a streaming config or content.
  oneof streaming_request {
    // Provides information to the recognizer that specifies how to process the
    // request. The first `StreamingTranslateSpeechRequest` message must contain
    // a `streaming_config` message.
    StreamingTranslateSpeechConfig streaming_config = 1;

    // The audio data to be translated. Sequential chunks of audio data are sent
    // in sequential `StreamingTranslateSpeechRequest` messages. The first
    // `StreamingTranslateSpeechRequest` message must not contain
    // `audio_content` data and all subsequent `StreamingTranslateSpeechRequest`
    // messages must contain `audio_content` data. The audio bytes must be
    // encoded as specified in `StreamingTranslateSpeechConfig`. Note: as with
    // all bytes fields, protocol buffers use a pure binary representation (not
    // base64).
    bytes audio_content = 2;
  }
}

// A streaming speech translation result corresponding to a portion of the audio
// that is currently being processed.
message StreamingTranslateSpeechResult {
  // Text translation result.
  message TextTranslationResult {
    // Output only. The translated sentence.
    string translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. If `false`, this `StreamingTranslateSpeechResult` represents
    // an interim result that may change. If `true`, this is the final time the
    // translation service will return this particular
    // `StreamingTranslateSpeechResult`; the streaming translator will not
    // return any further hypotheses for this portion of the transcript and the
    // corresponding audio.
    bool is_final = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Translation result.
  oneof result {
    // Text translation result.
    TextTranslationResult text_translation_result = 1;
  }
}

// A streaming speech translation response corresponding to a portion of
// the audio currently processed.
message StreamingTranslateSpeechResponse {
  // Indicates the type of speech event.
  enum SpeechEventType {
    // No speech event specified.
    SPEECH_EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). When the client receives the
    // `END_OF_SINGLE_UTTERANCE` event, it should stop sending requests but
    // keep receiving the remaining responses until the stream is terminated,
    // as sketched below. To construct the complete sentence in a streaming
    // fashion, the client should overwrite the previous partial result when
    // the preceding response's `is_final` was `false`, and append when it was
    // `true` (see the sketch under `single_utterance`). This event is only
    // sent if `single_utterance` was set to `true`, and is not used otherwise.
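    //
    // A sketch of this client behavior (Python; the stop signal, the
    // pre-built `config_request`, and `audio_chunks` are illustrative):
    //
    //     import threading
    //
    //     stop_sending = threading.Event()
    //
    //     def requests():
    //         yield config_request  # assumed to be built elsewhere
    //         for chunk in audio_chunks:
    //             if stop_sending.is_set():
    //                 return  # stop sending; responses are still consumed
    //             yield StreamingTranslateSpeechRequest(audio_content=chunk)
    //
    //     for response in client.streaming_translate_speech(requests=requests()):
    //         event = StreamingTranslateSpeechResponse.SpeechEventType
    //         if response.speech_event_type == event.END_OF_SINGLE_UTTERANCE:
    //             stop_sending.set()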
    END_OF_SINGLE_UTTERANCE = 1;
  }

  // Output only. If set, returns a [google.rpc.Status][google.rpc.Status] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The translation result that is currently being processed
  // (`is_final` may be either `true` or `false`).
  StreamingTranslateSpeechResult result = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Indicates the type of speech event.
  SpeechEventType speech_event_type = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}