// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.assistant.embedded.v1alpha2;

import "google/api/annotations.proto";
import "google/type/latlng.proto";

option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha2;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha2";
option objc_class_prefix = "ASTSDK";

// Service that implements the Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded Assistant Service.
  // Each call performs one round-trip, sending an audio request to the service
  // and receiving the audio response. Uses bidirectional streaming to receive
  // results, such as the `END_OF_UTTERANCE` event, while sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of several
  // streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the Assistant
  // responds *What do you want to add?*. The sequence of streamed requests and
  // responses in the first gRPC call could be:
  //
  // *   AssistRequest.config
  // *   AssistRequest.audio_in
  // *   AssistRequest.audio_in
  // *   AssistRequest.audio_in
  // *   AssistRequest.audio_in
  // *   AssistResponse.event_type.END_OF_UTTERANCE
  // *   AssistResponse.speech_results.transcript "add to my shopping list"
  // *   AssistResponse.dialog_state_out.microphone_mode.DIALOG_FOLLOW_ON
  // *   AssistResponse.audio_out
  // *   AssistResponse.audio_out
  // *   AssistResponse.audio_out
  //
  // The user then says *bagels* and the Assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another gRPC
  // call to the `Assist` method, again with streamed requests and
  // responses, such as:
  //
  // *   AssistRequest.config
  // *   AssistRequest.audio_in
  // *   AssistRequest.audio_in
  // *   AssistRequest.audio_in
  // *   AssistResponse.event_type.END_OF_UTTERANCE
  // *   AssistResponse.dialog_state_out.microphone_mode.CLOSE_MICROPHONE
  // *   AssistResponse.audio_out
  // *   AssistResponse.audio_out
  // *   AssistResponse.audio_out
  // *   AssistResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // `AssistResponse.audio_out` messages will always contain sequential portions
  // of audio.
  //
  // (A non-normative Python client sketch follows this service definition.)
  rpc Assist(stream AssistRequest) returns (stream AssistResponse);
}
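
// Illustrative only, not part of the API surface: a minimal Python sketch of
// one `Assist` round-trip, assuming stubs generated from this file (for
// example, the `embedded_assistant_pb2` / `embedded_assistant_pb2_grpc`
// modules shipped with the `google-assistant-grpc` package). `config` is an
// `AssistConfig` (see the sketch after that message below) and `audio_chunks`
// is any iterable of audio byte chunks captured in roughly real time.
//
//   import grpc
//   from google.assistant.embedded.v1alpha2 import (
//       embedded_assistant_pb2, embedded_assistant_pb2_grpc)
//
//   def assist_once(channel, config, audio_chunks):
//       """One round-trip: a config-only request, then audio-only requests."""
//       stub = embedded_assistant_pb2_grpc.EmbeddedAssistantStub(channel)
//
//       def requests():
//           # The first message carries only `config`; all later ones only `audio_in`.
//           yield embedded_assistant_pb2.AssistRequest(config=config)
//           for chunk in audio_chunks:
//               yield embedded_assistant_pb2.AssistRequest(audio_in=chunk)
//
//       # The returned iterator yields `AssistResponse` messages as they stream in.
//       return list(stub.Assist(requests()))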

// The top-level message sent by the client. Clients must send at least two, and
// typically many, `AssistRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message AssistRequest {
  // Exactly one of these fields must be specified in each `AssistRequest`.
  oneof type {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `AssistRequest` message must contain a `config` message.
    AssistConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are sent
    // in sequential `AssistRequest` messages. The first `AssistRequest`
    // message must not contain `audio_in` data and all subsequent
    // `AssistRequest` messages must contain `audio_in` data. The audio bytes
    // must be encoded as specified in `AudioInConfig`.
    // Audio must be sent in approximately real time (16000 samples per second).
    // An error will be returned if audio is sent significantly faster or
    // slower.
    bytes audio_in = 2;
  }
}

// The top-level message received by the client. A series of one or more
// `AssistResponse` messages is streamed back to the client.
message AssistResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional results
    // until the server closes the gRPC connection.
    END_OF_UTTERANCE = 1;
  }

  // *Output-only* Indicates the type of event.
  EventType event_type = 1;

  // *Output-only* The audio containing the Assistant's response to the query.
  AudioOut audio_out = 3;

  // *Output-only* Contains the Assistant's visual response to the query.
  ScreenOut screen_out = 4;

  // *Output-only* Contains the action triggered by the query with the
  // appropriate payloads and semantic parsing.
  DeviceAction device_action = 6;

  // *Output-only* This repeated list contains zero or more speech recognition
  // results that correspond to consecutive portions of the audio currently
  // being processed, ordered from the portion corresponding to the earliest
  // (and most stable) audio to the portion corresponding to the most
  // recent audio. The strings can be concatenated to view the full
  // in-progress response. When the speech recognition completes, this list
  // will contain one item with `stability` of `1.0`.
  repeated SpeechRecognitionResult speech_results = 2;

  // *Output-only* Contains output related to the user's query.
  DialogStateOut dialog_state_out = 5;

  // *Output-only* Debugging info for the developer. Only returned if the
  // request set `return_debug_info` to true.
  DebugInfo debug_info = 8;
}
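
// Illustrative only: one way a client might consume the `AssistResponse`
// stream, assuming the same generated Python stubs as the sketch after the
// service definition above. `state_store` is a hypothetical dictionary the
// client uses to persist state between calls.
//
//   def handle_responses(responses, state_store):
//       audio = bytearray()
//       for resp in responses:
//           if resp.event_type == embedded_assistant_pb2.AssistResponse.END_OF_UTTERANCE:
//               pass  # stop capturing audio; the server will not process any more
//           if resp.speech_results:
//               # Concatenating the transcripts gives the in-progress recognition.
//               print(" ".join(r.transcript for r in resp.speech_results))
//           if resp.dialog_state_out.conversation_state:
//               state_store["conversation_state"] = resp.dialog_state_out.conversation_state
//           audio.extend(resp.audio_out.audio_data)
//       return bytes(audio)  # playable in the format requested in AudioOutConfig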

// Debug info for the developer. Only returned if the request set
// `return_debug_info` to true.
message DebugInfo {
  // The original JSON response from an Actions on Google agent to the Google
  // server. See
  // https://developers.google.com/actions/reference/rest/Shared.Types/AppResponse.
  // It will only be populated if the request maker owns the AoG project and the
  // AoG project is in preview mode.
  string aog_agent_to_assistant_json = 1;
}

// Specifies how to process the `AssistRequest` messages.
message AssistConfig {
  oneof type {
    // Specifies how to process the subsequent incoming audio. Required if
    // [AssistRequest.audio_in][google.assistant.embedded.v1alpha2.AssistRequest.audio_in]
    // bytes will be provided in subsequent requests.
    AudioInConfig audio_in_config = 1;

    // The text input to be sent to the Assistant. This can be populated from a
    // text interface if audio input is not available.
    string text_query = 6;
  }

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Optional* Specifies the desired format to use when the server returns a
  // visual screen response.
  ScreenOutConfig screen_out_config = 8;

  // *Required* Represents the current dialog state.
  DialogStateIn dialog_state_in = 3;

  // Device configuration that uniquely identifies a specific device.
  DeviceConfig device_config = 4;

  // *Optional* Debugging parameters for the whole `Assist` RPC.
  DebugConfig debug_config = 5;
}
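
// Illustrative only: a hedged sketch of building the `config` message for the
// first `AssistRequest`, assuming the generated Python stubs used in the
// sketches above. The device identifiers are placeholders that must come from
// device registration.
//
//   def make_config(conversation_state=None):
//       dialog_state = embedded_assistant_pb2.DialogStateIn(language_code="en-US")
//       if conversation_state:  # omit entirely on the very first request
//           dialog_state.conversation_state = conversation_state
//       return embedded_assistant_pb2.AssistConfig(
//           audio_in_config=embedded_assistant_pb2.AudioInConfig(
//               encoding=embedded_assistant_pb2.AudioInConfig.LINEAR16,
//               sample_rate_hertz=16000),
//           audio_out_config=embedded_assistant_pb2.AudioOutConfig(
//               encoding=embedded_assistant_pb2.AudioOutConfig.MP3,
//               sample_rate_hertz=16000,
//               volume_percentage=50),
//           dialog_state_in=dialog_state,
//           device_config=embedded_assistant_pb2.DeviceConfig(
//               device_id="my-device-id",               # placeholder
//               device_model_id="my-device-model-id"))  # placeholder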

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant SDK
// [best
// practices](https://developers.google.com/assistant/sdk/guides/service/python/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono).
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
    // Codec) is the recommended encoding because it is
    // lossless--therefore recognition is not compromised--and
    // requires only about half the bandwidth of `LINEAR16`. This encoding
    // includes the `FLAC` stream header followed by audio data. It supports
    // 16-bit and 24-bit samples; however, not all fields in `STREAMINFO` are
    // supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all `audio_in`
  // messages. Valid values are 16000-24000; 16000 is optimal.
  // For best results, set the sampling rate of the audio source to 16000 Hz.
  // If that's not possible, use the native sample rate of the audio source
  // (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings are
  // raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an ogg container. The result will be a
    // file which can be played natively on Android and in some browsers (such
    // as Chrome). The quality of the encoding is considerably higher than MP3
    // while using the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are: 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}

// Specifies the desired format for the server to use when it returns a
// `screen_out` response.
message ScreenOutConfig {
  // Possible modes for visual screen-output on the device.
  enum ScreenMode {
    // No video mode specified.
    // The Assistant may respond as if in `OFF` mode.
    SCREEN_MODE_UNSPECIFIED = 0;

    // Screen is off (or has brightness or other settings set so low it is
    // not visible). The Assistant will typically not return a screen response
    // in this mode.
    OFF = 1;

    // The Assistant will typically return a partial-screen response in this
    // mode.
    PLAYING = 3;
  }

  // Current visual screen-mode for the device while issuing the query.
  ScreenMode screen_mode = 1;
}

// Provides information about the current dialog state.
message DialogStateIn {
  // *Required* This field must always be set to the
  // [DialogStateOut.conversation_state][google.assistant.embedded.v1alpha2.DialogStateOut.conversation_state]
  // value that was returned in the prior `Assist` RPC. It should only be
  // omitted (field not set) if there was no prior `Assist` RPC because this is
  // the first `Assist` RPC made by this device after it was first set up or
  // after a factory-default reset.
  bytes conversation_state = 1;

  // *Required* Language of the request in
  // [IETF BCP 47 syntax](https://tools.ietf.org/html/bcp47) (for example,
  // "en-US"). See [Language
  // Support](https://developers.google.com/assistant/sdk/reference/rpc/languages)
  // for more information. If you have selected a language for this `device_id`
  // using the
  // [Settings](https://developers.google.com/assistant/sdk/reference/assistant-app/assistant-settings)
  // menu in your phone's Google Assistant app, that selection will override
  // this value.
  string language_code = 2;

  // *Optional* Location of the device where the query originated.
  DeviceLocation device_location = 5;

  // *Optional* If true, the server will treat the request as a new conversation
  // and not use state from the prior request. Set this field to true when the
  // conversation should be restarted, such as after a device reboot, or after a
  // significant lapse of time since the prior query.
  bool is_new_conversation = 7;
}
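
// Illustrative only: one way a client might persist `conversation_state`
// between `Assist` RPCs and across reboots, as required above. `STATE_FILE`
// is a hypothetical location; delete it on a factory-default reset instead of
// carrying the old state forward.
//
//   from pathlib import Path
//
//   STATE_FILE = Path("~/.config/assistant/conversation_state").expanduser()
//
//   def load_conversation_state():
//       return STATE_FILE.read_bytes() if STATE_FILE.exists() else None
//
//   def save_conversation_state(state: bytes):
//       STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
//       STATE_FILE.write_bytes(state)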

// *Required* Fields that identify the device to the Assistant.
//
// See also:
//
// *   [Register a Device - REST
// API](https://developers.google.com/assistant/sdk/reference/device-registration/register-device-manual)
// *   [Device Model and Instance
// Schemas](https://developers.google.com/assistant/sdk/reference/device-registration/model-and-instance-schemas)
// *   [Device
// Proto](https://developers.google.com/assistant/sdk/reference/rpc/google.assistant.devices.v1alpha2#device)
message DeviceConfig {
  // *Required* Unique identifier for the device. The id length must be 128
  // characters or less. Example: DBCDW098234. This MUST match the device_id
  // returned from device registration. This device_id is used to match against
  // the user's registered devices to look up the supported traits and
  // capabilities of this device. This information should not change across
  // device reboots. However, it should not be saved across
  // factory-default resets.
  string device_id = 1;

  // *Required* Unique identifier for the device model. The combination of
  // device_model_id and device_id must have been previously associated through
  // device registration.
  string device_model_id = 3;
}

// The audio containing the Assistant's response to the query. Sequential chunks
// of audio data are received in sequential `AssistResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the Assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `AssistResponse` messages.
  bytes audio_data = 1;
}

// The Assistant's visual output response to the query. Enabled by
// `screen_out_config`.
message ScreenOut {
  // Possible formats of the screen data.
  enum Format {
    // No format specified.
    FORMAT_UNSPECIFIED = 0;

    // Data will contain a fully-formed HTML5 layout encoded in UTF-8, e.g.
    // `<html><body><div>...</div></body></html>`. It is intended to be rendered
    // along with the audio response. Note that the HTML5 doctype should be
    // included in the actual HTML data.
    HTML = 1;
  }

  // *Output-only* The format of the provided screen data.
  Format format = 1;

  // *Output-only* The raw screen data to be displayed as the result of the
  // Assistant query.
  bytes data = 2;
}

// The response returned to the device if the user has triggered a Device
// Action. For example, a device which supports the query *Turn on the light*
// would receive a `DeviceAction` with a JSON payload containing the semantics
// of the request.
message DeviceAction {
  // JSON containing the device command response generated from the triggered
  // Device Action grammar. The format is given by the
  // `action.devices.EXECUTE` intent for a given
  // [trait](https://developers.google.com/assistant/sdk/reference/traits/).
  string device_request_json = 1;
}
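
// Illustrative only: a hedged sketch of dispatching on `device_request_json`.
// The nested JSON shape assumed here ("inputs" -> "payload" -> "commands" ->
// "execution") follows the `action.devices.EXECUTE` intent; consult the trait
// documentation linked above for the authoritative format.
//
//   import json
//
//   def handle_device_action(device_request_json, handlers):
//       request = json.loads(device_request_json)
//       for inp in request.get("inputs", []):
//           if inp.get("intent") != "action.devices.EXECUTE":
//               continue
//           for command in inp["payload"]["commands"]:
//               for execution in command["execution"]:
//                   # e.g. handlers["action.devices.commands.OnOff"](params)
//                   handler = handlers.get(execution["command"])
//                   if handler:
//                       handler(execution.get("params", {}))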

// The estimated transcription of a phrase the user has spoken. This could be
// a single segment or the full guess of the user's spoken query.
message SpeechRecognitionResult {
  // *Output-only* Transcript text representing the words that the user spoke.
  string transcript = 1;

  // *Output-only* An estimate of the likelihood that the Assistant will not
  // change its guess about this result. Values range from 0.0 (completely
  // unstable) to 1.0 (completely stable and final). The default of 0.0 is a
  // sentinel value indicating `stability` was not set.
  float stability = 2;
}

// The dialog state resulting from the user's query. Multiple such messages
// may be received.
message DialogStateOut {
  // Possible states of the microphone after an `Assist` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Assist` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* Supplemental display text from the Assistant. This could be
  // the same as the speech spoken in `AssistResponse.audio_out` or it could
  // be some additional information which aids the user's understanding.
  string supplemental_display_text = 1;

  // *Output-only* State information for the subsequent `Assist` RPC. This
  // value should be saved in the client and returned in the
  // [`DialogStateIn.conversation_state`](#dialogstatein) field with the next
  // `Assist` RPC. (The client does not need to interpret or otherwise use this
  // value.) This information should be saved across device reboots. However,
  // this value should be cleared (not saved in the client) during a
  // factory-default reset.
  bytes conversation_state = 2;

  // *Output-only* Specifies the mode of the microphone after this `Assist`
  // RPC is processed.
  MicrophoneMode microphone_mode = 3;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as *Increase the volume*
  // or *Set volume level 4* was recognized, in which case the value will be
  // between 1 and 100 (corresponding to the new volume level of 1% to 100%).
  // Typically, a client should use this volume level when playing the
  // `audio_out` data, and retain this value as the current volume level and
  // supply it in the `AudioOutConfig` of the next `AssistRequest`. (Some
  // clients may also implement other ways to allow the current volume level to
  // be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 4;
}
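
// Illustrative only: acting on a received `DialogStateOut`, assuming the
// generated Python stubs used in the sketches above. `client_state` is a
// hypothetical dictionary holding the values to use for the next request.
//
//   def apply_dialog_state(dialog_state_out, client_state):
//       client_state["conversation_state"] = dialog_state_out.conversation_state
//       if dialog_state_out.volume_percentage:  # 0 means "no change"
//           client_state["volume_percentage"] = dialog_state_out.volume_percentage
//       follow_on = (dialog_state_out.microphone_mode ==
//                    embedded_assistant_pb2.DialogStateOut.DIALOG_FOLLOW_ON)
//       return follow_on  # if True, start a new `Assist` call after playback ends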

// Debugging parameters for the current request.
message DebugConfig {
  // When this field is set to true, the `debug_info` field in `AssistResponse`
  // may be populated. However, it will significantly increase the latency of
  // responses. Do not set this field to true in production code.
  bool return_debug_info = 6;
}

// There are three sources of locations. They are used with this precedence:
//
// 1. This `DeviceLocation`, which is primarily used for mobile devices with
//    GPS.
// 2. Location specified by the user during device setup; this is per-user,
//    per-device. This location is used if `DeviceLocation` is not specified.
// 3. Inferred location based on IP address. This is used only if neither of the
//    above is specified.
message DeviceLocation {
  oneof type {
    // Latitude and longitude of the device.
    google.type.LatLng coordinates = 1;
  }
}

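// Illustrative only: populating `DeviceLocation` from a GPS fix, assuming the
// generated Python stubs used in the sketches above plus the
// `google.type.latlng_pb2` module from googleapis-common-protos. The
// coordinates are placeholders.
//
//   from google.type import latlng_pb2
//
//   location = embedded_assistant_pb2.DeviceLocation(
//       coordinates=latlng_pb2.LatLng(latitude=37.422, longitude=-122.084))
//   dialog_state_in = embedded_assistant_pb2.DialogStateIn(
//       language_code="en-US", device_location=location)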