// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.assistant.embedded.v1alpha2;

import "google/api/annotations.proto";
import "google/type/latlng.proto";

option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha2;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha2";
option objc_class_prefix = "ASTSDK";

// Service that implements the Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded Assistant
  // Service. Each call performs one round-trip, sending an audio request to
  // the service and receiving the audio response. Uses bidirectional
  // streaming to receive results, such as the `END_OF_UTTERANCE` event, while
  // sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of
  // several streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the Assistant
  // responds *What do you want to add?*. The sequence of streamed requests
  // and responses in the first gRPC call could be:
  //
  // * AssistRequest.config
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistResponse.event_type.END_OF_UTTERANCE
  // * AssistResponse.speech_results.transcript "add to my shopping list"
  // * AssistResponse.dialog_state_out.microphone_mode.DIALOG_FOLLOW_ON
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  //
  // The user then says *bagels* and the Assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another
  // gRPC call to the `Assist` method, again with streamed requests and
  // responses, such as:
  //
  // * AssistRequest.config
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistResponse.event_type.END_OF_UTTERANCE
  // * AssistResponse.dialog_state_out.microphone_mode.CLOSE_MICROPHONE
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // `AssistResponse.audio_out` messages will always contain sequential
  // portions of audio.
  rpc Assist(stream AssistRequest) returns (stream AssistResponse);
}
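
// Example: a minimal client-side sketch of one `Assist` round-trip, given
// here as an illustrative comment only (it is not part of this proto
// definition). It assumes the Python bindings generated from this file, as
// shipped in the `google-assistant-grpc` package; `authorized_channel` and
// `mic_chunks()` are hypothetical placeholders for an authenticated gRPC
// channel and a microphone audio source.
//
//   from google.assistant.embedded.v1alpha2 import (
//       embedded_assistant_pb2, embedded_assistant_pb2_grpc)
//
//   def requests(assist_config, chunks):
//       # The first message carries only `config`; every later message
//       # carries only `audio_in`, paced at roughly real time.
//       yield embedded_assistant_pb2.AssistRequest(config=assist_config)
//       for chunk in chunks:
//           yield embedded_assistant_pb2.AssistRequest(audio_in=chunk)
//
//   stub = embedded_assistant_pb2_grpc.EmbeddedAssistantStub(authorized_channel)
//   for resp in stub.Assist(requests(assist_config, mic_chunks())):
//       ...  # see the response-handling sketch after `DebugInfo` below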

// The top-level message sent by the client. Clients must send at least two,
// and typically numerous, `AssistRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message AssistRequest {
  // Exactly one of these fields must be specified in each `AssistRequest`.
  oneof type {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `AssistRequest` message must contain a `config` message.
    AssistConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `AssistRequest` messages. The first `AssistRequest`
    // message must not contain `audio_in` data and all subsequent
    // `AssistRequest` messages must contain `audio_in` data. The audio bytes
    // must be encoded as specified in `AudioInConfig`.
    // Audio must be sent in approximately real time (16000 samples per
    // second). An error will be returned if audio is sent significantly
    // faster or slower.
    bytes audio_in = 2;
  }
}

// The top-level message received by the client. A series of one or more
// `AssistResponse` messages is streamed back to the client.
message AssistResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the
    // user's speech utterance and expects no additional speech. Therefore,
    // the server will not process additional audio (although it may
    // subsequently return additional results). The client should stop
    // sending additional audio data, half-close the gRPC connection, and
    // wait for any additional results until the server closes the gRPC
    // connection.
    END_OF_UTTERANCE = 1;
  }

  // *Output-only* Indicates the type of event.
  EventType event_type = 1;

  // *Output-only* The audio containing the Assistant's response to the query.
  AudioOut audio_out = 3;

  // *Output-only* Contains the Assistant's visual response to the query.
  ScreenOut screen_out = 4;

  // *Output-only* Contains the action triggered by the query with the
  // appropriate payloads and semantic parsing.
  DeviceAction device_action = 6;

  // *Output-only* This repeated list contains zero or more speech
  // recognition results that correspond to consecutive portions of the audio
  // currently being processed, from the portion corresponding to the
  // earliest (and most stable) audio to the portion corresponding to the
  // most recent audio. The strings can be concatenated to view the full
  // in-progress response. When the speech recognition completes, this list
  // will contain one item with `stability` of `1.0`.
  repeated SpeechRecognitionResult speech_results = 2;

  // *Output-only* Contains output related to the user's query.
  DialogStateOut dialog_state_out = 5;

  // *Output-only* Debugging info for the developer. Only returned if the
  // request set `return_debug_info` to true.
  DebugInfo debug_info = 8;
}

// Debug info for the developer. Only returned if the request set
// `return_debug_info` to true.
message DebugInfo {
  // The original JSON response from an Action-on-Google agent to the Google
  // server. See
  // https://developers.google.com/actions/reference/rest/Shared.Types/AppResponse.
  // It will only be populated if the request maker owns the AoG project and
  // the AoG project is in preview mode.
  string aog_agent_to_assistant_json = 1;
}
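
// Example: a sketch of consuming the `AssistResponse` stream (illustrative
// comment only, not part of this proto definition). It assumes the Python
// bindings from `google-assistant-grpc`; `stop_recording()` and
// `play_audio()` are hypothetical helpers.
//
//   for resp in stub.Assist(requests(assist_config, mic_chunks())):
//       if resp.event_type == embedded_assistant_pb2.AssistResponse.END_OF_UTTERANCE:
//           stop_recording()  # half-close by letting the request iterator end
//       if resp.speech_results:
//           # Concatenate the partial transcripts for display.
//           print(' '.join(r.transcript for r in resp.speech_results))
//       if resp.audio_out.audio_data:
//           play_audio(resp.audio_out.audio_data)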

// Specifies how to process the `AssistRequest` messages.
message AssistConfig {
  oneof type {
    // Specifies how to process the subsequent incoming audio. Required if
    // [AssistRequest.audio_in][google.assistant.embedded.v1alpha2.AssistRequest.audio_in]
    // bytes will be provided in subsequent requests.
    AudioInConfig audio_in_config = 1;

    // The text input to be sent to the Assistant. This can be populated from
    // a text interface if audio input is not available.
    string text_query = 6;
  }

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Optional* Specifies the desired format to use when the server returns a
  // visual screen response.
  ScreenOutConfig screen_out_config = 8;

  // *Required* Represents the current dialog state.
  DialogStateIn dialog_state_in = 3;

  // Device configuration that uniquely identifies a specific device.
  DeviceConfig device_config = 4;

  // *Optional* Debugging parameters for the whole `Assist` RPC.
  DebugConfig debug_config = 5;
}

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant SDK
// [best
// practices](https://developers.google.com/assistant/sdk/guides/service/python/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono).
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless
    // Audio Codec) is the recommended encoding because it is lossless (so
    // recognition is not compromised) and requires only about half the
    // bandwidth of `LINEAR16`. This encoding includes the `FLAC` stream
    // header followed by audio data. It supports 16-bit and 24-bit samples;
    // however, not all fields in `STREAMINFO` are supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all
  // `audio_in` messages. Valid values range from 16000 to 24000, but 16000
  // is optimal. For best results, set the sampling rate of the audio source
  // to 16000 Hz. If that's not possible, use the native sample rate of the
  // audio source (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}
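
// Example: a sketch of an `AssistConfig` using the recommended audio
// settings (illustrative comment only, not part of this proto definition).
// It assumes the Python bindings from `google-assistant-grpc`; the device
// identifiers and `saved_conversation_state` are placeholders.
//
//   assist_config = embedded_assistant_pb2.AssistConfig(
//       audio_in_config=embedded_assistant_pb2.AudioInConfig(
//           encoding=embedded_assistant_pb2.AudioInConfig.LINEAR16,
//           sample_rate_hertz=16000),
//       audio_out_config=embedded_assistant_pb2.AudioOutConfig(
//           encoding=embedded_assistant_pb2.AudioOutConfig.LINEAR16,
//           sample_rate_hertz=16000,
//           volume_percentage=50),
//       dialog_state_in=embedded_assistant_pb2.DialogStateIn(
//           language_code='en-US',
//           conversation_state=saved_conversation_state),
//       device_config=embedded_assistant_pb2.DeviceConfig(
//           device_id='my-device-id',
//           device_model_id='my-model-id'))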

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings
  // are raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an Ogg container. The result is a file
    // which can be played natively on Android and in some browsers (such as
    // Chrome). The quality of the encoding is considerably higher than MP3
    // at the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}

// Specifies the desired format for the server to use when it returns a
// `screen_out` response.
message ScreenOutConfig {
  // Possible modes for visual screen output on the device.
  enum ScreenMode {
    // No video mode specified.
    // The Assistant may respond as if in `OFF` mode.
    SCREEN_MODE_UNSPECIFIED = 0;

    // Screen is off (or has brightness or other settings set so low it is
    // not visible). The Assistant will typically not return a screen
    // response in this mode.
    OFF = 1;

    // The Assistant will typically return a partial-screen response in this
    // mode.
    PLAYING = 3;
  }

  // Current visual screen mode for the device while issuing the query.
  ScreenMode screen_mode = 1;
}

// Provides information about the current dialog state.
message DialogStateIn {
  // *Required* This field must always be set to the
  // [DialogStateOut.conversation_state][google.assistant.embedded.v1alpha2.DialogStateOut.conversation_state]
  // value that was returned in the prior `Assist` RPC. It should only be
  // omitted (field not set) if there was no prior `Assist` RPC because this
  // is the first `Assist` RPC made by this device after it was first set up
  // and/or after a factory-default reset.
  bytes conversation_state = 1;

  // *Required* Language of the request in
  // [IETF BCP 47 syntax](https://tools.ietf.org/html/bcp47) (for example,
  // "en-US"). See [Language
  // Support](https://developers.google.com/assistant/sdk/reference/rpc/languages)
  // for more information. If you have selected a language for this
  // `device_id` using the
  // [Settings](https://developers.google.com/assistant/sdk/reference/assistant-app/assistant-settings)
  // menu in your phone's Google Assistant app, that selection will override
  // this value.
  string language_code = 2;

  // *Optional* Location of the device where the query originated.
  DeviceLocation device_location = 5;

  // *Optional* If true, the server will treat the request as a new
  // conversation and not use state from the prior request. Set this field to
  // true when the conversation should be restarted, such as after a device
  // reboot, or after a significant lapse of time since the prior query.
  bool is_new_conversation = 7;
}
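
// Example: a sketch of carrying `conversation_state` across turns
// (illustrative comment only, not part of this proto definition). Each turn
// copies the state returned in `DialogStateOut` into the next turn's
// `DialogStateIn`; `saved_conversation_state` is a hypothetical variable the
// client persists across reboots (but clears on factory reset).
//
//   saved_conversation_state = b''
//   for resp in stub.Assist(requests(assist_config, mic_chunks())):
//       if resp.dialog_state_out.conversation_state:
//           saved_conversation_state = resp.dialog_state_out.conversation_state
//   # The next turn's DialogStateIn echoes the saved bytes:
//   dialog_state_in = embedded_assistant_pb2.DialogStateIn(
//       language_code='en-US',
//       conversation_state=saved_conversation_state)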

// *Required* Fields that identify the device to the Assistant.
//
// See also:
//
// * [Register a Device - REST
// API](https://developers.google.com/assistant/sdk/reference/device-registration/register-device-manual)
// * [Device Model and Instance
// Schemas](https://developers.google.com/assistant/sdk/reference/device-registration/model-and-instance-schemas)
// * [Device
// Proto](https://developers.google.com/assistant/sdk/reference/rpc/google.assistant.devices.v1alpha2#device)
message DeviceConfig {
  // *Required* Unique identifier for the device. The id length must be 128
  // characters or less. Example: DBCDW098234. This MUST match the device_id
  // returned from device registration. This device_id is used to match
  // against the user's registered devices to look up the supported traits
  // and capabilities of this device. This information should not change
  // across device reboots. However, it should not be saved across
  // factory-default resets.
  string device_id = 1;

  // *Required* Unique identifier for the device model. The combination of
  // device_model_id and device_id must have been previously associated
  // through device registration.
  string device_model_id = 3;
}

// The audio containing the Assistant's response to the query. Sequential
// chunks of audio data are received in sequential `AssistResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the Assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `AssistResponse` messages.
  bytes audio_data = 1;
}

// The Assistant's visual output response to the query. Enabled by
// `screen_out_config`.
message ScreenOut {
  // Possible formats of the screen data.
  enum Format {
    // No format specified.
    FORMAT_UNSPECIFIED = 0;

    // Data will contain a fully-formed HTML5 layout encoded in UTF-8, e.g.
    // `<html><body><div>...</div></body></html>`. It is intended to be
    // rendered along with the audio response. Note that the HTML5 doctype
    // should be included in the actual HTML data.
    HTML = 1;
  }

  // *Output-only* The format of the provided screen data.
  Format format = 1;

  // *Output-only* The raw screen data to be displayed as the result of the
  // Assistant query.
  bytes data = 2;
}

// The response returned to the device if the user has triggered a Device
// Action. For example, a device which supports the query *Turn on the light*
// would receive a `DeviceAction` with a JSON payload containing the semantics
// of the request.
message DeviceAction {
  // JSON containing the device command response generated from the triggered
  // Device Action grammar. The format is given by the
  // `action.devices.EXECUTE` intent for a given
  // [trait](https://developers.google.com/assistant/sdk/reference/traits/).
  string device_request_json = 1;
}
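
// Example: a sketch of dispatching a `DeviceAction` payload (illustrative
// comment only, not part of this proto definition). The payload follows the
// `action.devices.EXECUTE` intent shape; the `inputs[0].payload.commands`
// path shown here is taken from that intent, and `turn_light_on()` is a
// hypothetical handler.
//
//   import json
//
//   if resp.device_action.device_request_json:
//       request = json.loads(resp.device_action.device_request_json)
//       for command in request['inputs'][0]['payload']['commands']:
//           for execution in command['execution']:
//               if execution['command'] == 'action.devices.commands.OnOff':
//                   turn_light_on(execution['params']['on'])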

// The estimated transcription of a phrase the user has spoken. This could be
// a single segment or the full guess of the user's spoken query.
message SpeechRecognitionResult {
  // *Output-only* Transcript text representing the words that the user spoke.
  string transcript = 1;

  // *Output-only* An estimate of the likelihood that the Assistant will not
  // change its guess about this result. Values range from 0.0 (completely
  // unstable) to 1.0 (completely stable and final). The default of 0.0 is a
  // sentinel value indicating `stability` was not set.
  float stability = 2;
}

// The dialog state resulting from the user's query. Multiple such messages
// may be received.
message DialogStateOut {
  // Possible states of the microphone after an `Assist` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Assist` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* Supplemental display text from the Assistant. This could
  // be the same as the speech spoken in `AssistResponse.audio_out` or it
  // could be some additional information which aids the user's understanding.
  string supplemental_display_text = 1;

  // *Output-only* State information for the subsequent `Assist` RPC. This
  // value should be saved in the client and returned in the
  // [`DialogStateIn.conversation_state`](#dialogstatein) field with the next
  // `Assist` RPC. (The client does not need to interpret or otherwise use
  // this value.) This information should be saved across device reboots.
  // However, this value should be cleared (not saved in the client) during a
  // factory-default reset.
  bytes conversation_state = 2;

  // *Output-only* Specifies the mode of the microphone after this `Assist`
  // RPC is processed.
  MicrophoneMode microphone_mode = 3;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as *Increase the
  // volume* or *Set volume level 4* was recognized, in which case the value
  // will be between 1 and 100 (corresponding to the new volume level of 1%
  // to 100%). Typically, a client should use this volume level when playing
  // the `audio_out` data, retain this value as the current volume level, and
  // supply it in the `AudioOutConfig` of the next `AssistRequest`. (Some
  // clients may also implement other ways to allow the current volume level
  // to be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 4;
}

// Debugging parameters for the current request.
message DebugConfig {
  // When this field is set to true, the `debug_info` field in
  // `AssistResponse` may be populated. However, it will significantly
  // increase the latency of responses. Do not set this field to true in
  // production code.
  bool return_debug_info = 6;
}

// There are three sources of locations. They are used with this precedence:
//
// 1. This `DeviceLocation`, which is primarily used for mobile devices with
//    GPS.
// 2. Location specified by the user during device setup; this is per-user,
//    per-device. This location is used if `DeviceLocation` is not specified.
// 3. Inferred location based on IP address. This is used only if neither of
//    the above are specified.
message DeviceLocation {
  oneof type {
    // Latitude and longitude of device.
    google.type.LatLng coordinates = 1;
  }
}
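
// Example: a sketch of the follow-on microphone loop driven by
// `DialogStateOut` (illustrative comment only, not part of this proto
// definition). It assumes the Python bindings from `google-assistant-grpc`;
// `one_assist_turn()` is a hypothetical wrapper around a single `Assist` RPC
// that returns the final `DialogStateOut` for that turn.
//
//   volume = 50
//   keep_listening = True
//   while keep_listening:
//       dialog_state_out = one_assist_turn(volume)
//       if dialog_state_out.volume_percentage:
//           # Retain the new level and echo it in the next AudioOutConfig.
//           volume = dialog_state_out.volume_percentage
//       keep_listening = (dialog_state_out.microphone_mode ==
//                         embedded_assistant_pb2.DialogStateOut.DIALOG_FOLLOW_ON)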