// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.assistant.embedded.v1alpha2;

import "google/api/annotations.proto";
import "google/type/latlng.proto";

option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha2;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha2";
option objc_class_prefix = "ASTSDK";

// Service that implements the Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded Assistant Service.
  // Each call performs one round-trip, sending an audio request to the service
  // and receiving the audio response. Uses bidirectional streaming to receive
  // results, such as the `END_OF_UTTERANCE` event, while sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of several
  // streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the Assistant
  // responds *What do you want to add?*. The sequence of streamed requests and
  // responses in the first gRPC call could be:
  //
  // * AssistRequest.config
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistResponse.event_type.END_OF_UTTERANCE
  // * AssistResponse.speech_results.transcript "add to my shopping list"
  // * AssistResponse.dialog_state_out.microphone_mode.DIALOG_FOLLOW_ON
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  //
  // The user then says *bagels* and the Assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another gRPC
  // connection call to the `Assist` method, again with streamed requests and
  // responses, such as:
  //
  // * AssistRequest.config
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistResponse.event_type.END_OF_UTTERANCE
  // * AssistResponse.dialog_state_out.microphone_mode.CLOSE_MICROPHONE
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // `AssistResponse.audio_out` messages will always contain sequential portions
  // of audio.
  rpc Assist(stream AssistRequest) returns (stream AssistResponse);
}
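
// A minimal client sketch of the exchange described above, assuming the Python
// stubs generated from this file (imported here using the
// `google.assistant.embedded.v1alpha2` package layout) and an
// already-authorized gRPC channel; it illustrates the streaming pattern, not
// the official SDK client.
//
//   from google.assistant.embedded.v1alpha2 import (
//       embedded_assistant_pb2 as assistant,
//       embedded_assistant_pb2_grpc as assistant_grpc)
//
//   def assist_once(channel, audio_chunks, conversation_state,
//                   device_id, device_model_id):
//       """One round trip: stream config + audio up, collect the reply down."""
//       stub = assistant_grpc.EmbeddedAssistantStub(channel)
//
//       def requests():
//           # The first message carries only `config`; later ones only `audio_in`.
//           yield assistant.AssistRequest(config=assistant.AssistConfig(
//               audio_in_config=assistant.AudioInConfig(
//                   encoding=assistant.AudioInConfig.LINEAR16,
//                   sample_rate_hertz=16000),
//               audio_out_config=assistant.AudioOutConfig(
//                   encoding=assistant.AudioOutConfig.LINEAR16,
//                   sample_rate_hertz=16000, volume_percentage=50),
//               dialog_state_in=assistant.DialogStateIn(
//                   conversation_state=conversation_state,
//                   language_code='en-US'),
//               device_config=assistant.DeviceConfig(
//                   device_id=device_id, device_model_id=device_model_id)))
//           for chunk in audio_chunks:  # ~real-time 16 kHz mono LINEAR16 bytes
//               yield assistant.AssistRequest(audio_in=chunk)
//
//       reply_audio = b''
//       for resp in stub.Assist(requests()):
//           if resp.event_type == assistant.AssistResponse.END_OF_UTTERANCE:
//               pass  # a real client stops capturing microphone audio here
//           reply_audio += resp.audio_out.audio_data
//           if resp.dialog_state_out.conversation_state:
//               conversation_state = resp.dialog_state_out.conversation_state
//       return reply_audio, conversation_state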

// The top-level message sent by the client. Clients must send at least two
// `AssistRequest` messages and typically send many more. The first message
// must contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message AssistRequest {
  // Exactly one of these fields must be specified in each `AssistRequest`.
  oneof type {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `AssistRequest` message must contain a `config` message.
    AssistConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are sent
    // in sequential `AssistRequest` messages. The first `AssistRequest`
    // message must not contain `audio_in` data and all subsequent
    // `AssistRequest` messages must contain `audio_in` data. The audio bytes
    // must be encoded as specified in `AudioInConfig`.
    // Audio must be sent in approximately real time (16000 samples per second).
    // An error will be returned if audio is sent significantly faster or
    // slower.
    bytes audio_in = 2;
  }
}

// The top-level message received by the client. A series of one or more
// `AssistResponse` messages is streamed back to the client.
message AssistResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional results
    // until the server closes the gRPC connection.
    END_OF_UTTERANCE = 1;
  }

  // *Output-only* Indicates the type of event.
  EventType event_type = 1;

  // *Output-only* The audio containing the Assistant's response to the query.
  AudioOut audio_out = 3;

  // *Output-only* Contains the Assistant's visual response to the query.
  ScreenOut screen_out = 4;

  // *Output-only* Contains the action triggered by the query with the
  // appropriate payloads and semantic parsing.
  DeviceAction device_action = 6;

  // *Output-only* This repeated list contains zero or more speech recognition
  // results that correspond to consecutive portions of the audio currently
  // being processed, starting with the portion corresponding to the earliest
  // audio (the most stable portion) and ending with the portion corresponding
  // to the most recent audio. The strings can be concatenated to view the full
  // in-progress response. When the speech recognition completes, this list
  // will contain one item with `stability` of `1.0`.
  repeated SpeechRecognitionResult speech_results = 2;

  // *Output-only* Contains output related to the user's query.
  DialogStateOut dialog_state_out = 5;

  // *Output-only* Debugging info for the developer. Only returned if the
  // request set `return_debug_info` to true.
  DebugInfo debug_info = 8;
}

// Debug info for the developer. Only returned if the request set
// `return_debug_info` to true.
message DebugInfo {
  // The original JSON response from an Actions on Google agent to the Google
  // server. See
  // https://developers.google.com/actions/reference/rest/Shared.Types/AppResponse.
  // It will only be populated if the request maker owns the AoG project and the
  // AoG project is in preview mode.
  string aog_agent_to_assistant_json = 1;
}

// Specifies how to process the `AssistRequest` messages.
message AssistConfig {
  oneof type {
    // Specifies how to process the subsequent incoming audio. Required if
    // [AssistRequest.audio_in][google.assistant.embedded.v1alpha2.AssistRequest.audio_in]
    // bytes will be provided in subsequent requests.
    AudioInConfig audio_in_config = 1;

    // The text input to be sent to the Assistant. This can be populated from a
    // text interface if audio input is not available.
    string text_query = 6;
  }

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Optional* Specifies the desired format to use when the server returns a
  // visual screen response.
  ScreenOutConfig screen_out_config = 8;

  // *Required* Represents the current dialog state.
  DialogStateIn dialog_state_in = 3;

  // Device configuration that uniquely identifies a specific device.
  DeviceConfig device_config = 4;

  // *Optional* Debugging parameters for the whole `Assist` RPC.
  DebugConfig debug_config = 5;
}
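
// A text-only sketch of the `type` oneof above: `text_query` takes the place
// of `audio_in_config`, so (as in the SDK's text samples) a single
// config-bearing request is enough and no `audio_in` messages follow. The
// `stub` comes from the earlier sketch; language and device values are
// placeholders.
//
//   config = assistant.AssistConfig(
//       text_query='what time is it',
//       audio_out_config=assistant.AudioOutConfig(
//           encoding=assistant.AudioOutConfig.MP3,
//           sample_rate_hertz=16000, volume_percentage=50),
//       dialog_state_in=assistant.DialogStateIn(language_code='en-US'),
//       device_config=assistant.DeviceConfig(
//           device_id='my-device-id', device_model_id='my-model-id'))
//   for resp in stub.Assist(iter([assistant.AssistRequest(config=config)])):
//       print(resp.dialog_state_out.supplemental_display_text)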

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant SDK
// [best
// practices](https://developers.google.com/assistant/sdk/guides/service/python/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono).
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
    // Codec) is the recommended encoding because it is lossless (so
    // recognition is not compromised) and requires only about half the
    // bandwidth of `LINEAR16`. This encoding includes the `FLAC` stream
    // header followed by audio data. It supports 16-bit and 24-bit samples;
    // however, not all fields in `STREAMINFO` are supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all `audio_in`
  // messages. Valid values are 16000-24000, but 16000 is optimal.
  // For best results, set the sampling rate of the audio source to 16000 Hz.
  // If that's not possible, use the native sample rate of the audio source
  // (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings are
  // raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an Ogg container. The result will be a
    // file which can be played natively on Android and in some browsers (such
    // as Chrome). The quality of the encoding is considerably higher than MP3
    // while using the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}

// Specifies the desired format for the server to use when it returns a
// `screen_out` response.
message ScreenOutConfig {
  // Possible modes for visual screen output on the device.
  enum ScreenMode {
    // No video mode specified.
    // The Assistant may respond as if in `OFF` mode.
    SCREEN_MODE_UNSPECIFIED = 0;

    // Screen is off (or has brightness or other settings set so low it is
    // not visible). The Assistant will typically not return a screen response
    // in this mode.
    OFF = 1;

    // The Assistant will typically return a partial-screen response in this
    // mode.
    PLAYING = 3;
  }

  // Current visual screen mode for the device while issuing the query.
  ScreenMode screen_mode = 1;
}

// Provides information about the current dialog state.
message DialogStateIn {
  // *Required* This field must always be set to the
  // [DialogStateOut.conversation_state][google.assistant.embedded.v1alpha2.DialogStateOut.conversation_state]
  // value that was returned in the prior `Assist` RPC. It should only be
  // omitted (field not set) if there was no prior `Assist` RPC because this is
  // the first `Assist` RPC made by this device after it was first set up or
  // after a factory-default reset.
  bytes conversation_state = 1;

  // *Required* Language of the request in
  // [IETF BCP 47 syntax](https://tools.ietf.org/html/bcp47) (for example,
  // "en-US"). See [Language
  // Support](https://developers.google.com/assistant/sdk/reference/rpc/languages)
  // for more information. If you have selected a language for this `device_id`
  // using the
  // [Settings](https://developers.google.com/assistant/sdk/reference/assistant-app/assistant-settings)
  // menu in your phone's Google Assistant app, that selection will override
  // this value.
  string language_code = 2;

  // *Optional* Location of the device where the query originated.
  DeviceLocation device_location = 5;

  // *Optional* If true, the server will treat the request as a new conversation
  // and not use state from the prior request. Set this field to true when the
  // conversation should be restarted, such as after a device reboot, or after a
  // significant lapse of time since the prior query.
  bool is_new_conversation = 7;
}

// *Required* Fields that identify the device to the Assistant.
//
// See also:
//
// * [Register a Device - REST
//   API](https://developers.google.com/assistant/sdk/reference/device-registration/register-device-manual)
// * [Device Model and Instance
//   Schemas](https://developers.google.com/assistant/sdk/reference/device-registration/model-and-instance-schemas)
// * [Device
//   Proto](https://developers.google.com/assistant/sdk/reference/rpc/google.assistant.devices.v1alpha2#device)
message DeviceConfig {
  // *Required* Unique identifier for the device. The id length must be 128
  // characters or less. Example: DBCDW098234. This MUST match the device_id
  // returned from device registration. This device_id is used to match against
  // the user's registered devices to look up the supported traits and
  // capabilities of this device. This information should not change across
  // device reboots. However, it should not be saved across
  // factory-default resets.
  string device_id = 1;

  // *Required* Unique identifier for the device model. The combination of
  // device_model_id and device_id must have been previously associated through
  // device registration.
  string device_model_id = 3;
}

// The audio containing the Assistant's response to the query. Sequential chunks
// of audio data are received in sequential `AssistResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the Assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `AssistResponse` messages.
  bytes audio_data = 1;
}

// The Assistant's visual output response to the query. Enabled by
// `screen_out_config`.
message ScreenOut {
  // Possible formats of the screen data.
  enum Format {
    // No format specified.
    FORMAT_UNSPECIFIED = 0;

    // Data will contain a fully-formed HTML5 layout encoded in UTF-8, e.g.
    // `<html><body><div>...</div></body></html>`. It is intended to be rendered
    // along with the audio response. Note that the HTML5 doctype should be
    // included in the actual HTML data.
    HTML = 1;
  }

  // *Output-only* The format of the provided screen data.
  Format format = 1;

  // *Output-only* The raw screen data to be displayed as the result of the
  // Assistant query.
  bytes data = 2;
}
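
// A small sketch of consuming `ScreenOut`, continuing the Python example above
// (the output filename is a placeholder): if `screen_out_config` asked for a
// visual response, dump the HTML payload to a file for inspection.
//
//   if resp.screen_out.format == assistant.ScreenOut.HTML:
//       with open('assistant_response.html', 'wb') as f:
//           f.write(resp.screen_out.data)  # UTF-8 encoded HTML5 document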

// The response returned to the device if the user has triggered a Device
// Action. For example, a device that supports the query *Turn on the light*
// would receive a `DeviceAction` with a JSON payload containing the semantics
// of the request.
message DeviceAction {
  // JSON containing the device command response generated from the triggered
  // Device Action grammar. The format is given by the
  // `action.devices.EXECUTE` intent for a given
  // [trait](https://developers.google.com/assistant/sdk/reference/traits/).
  string device_request_json = 1;
}
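
// A hedged sketch of handling `device_request_json` in the same Python client:
// the keys below follow the `action.devices.EXECUTE` intent shape referenced
// above, and `set_light` is a hypothetical device handler.
//
//   import json
//
//   if resp.device_action.device_request_json:
//       request = json.loads(resp.device_action.device_request_json)
//       for command in request['inputs'][0]['payload']['commands']:
//           for execution in command['execution']:
//               if execution['command'] == 'action.devices.commands.OnOff':
//                   set_light(execution['params']['on'])  # hypothetical handler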

// The estimated transcription of a phrase the user has spoken. This could be
// a single segment or the full guess of the user's spoken query.
message SpeechRecognitionResult {
  // *Output-only* Transcript text representing the words that the user spoke.
  string transcript = 1;

  // *Output-only* An estimate of the likelihood that the Assistant will not
  // change its guess about this result. Values range from 0.0 (completely
  // unstable) to 1.0 (completely stable and final). The default of 0.0 is a
  // sentinel value indicating `stability` was not set.
  float stability = 2;
}
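
// Per the field comment in `AssistResponse.speech_results`, the repeated
// results can be concatenated to show the in-progress transcript, e.g. for
// live display while the user is still speaking (sketch, continuing the
// Python example):
//
//   interim = ''.join(r.transcript for r in resp.speech_results)
//   print('User said (so far):', interim)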

// The dialog state resulting from the user's query. Multiple such messages
// may be received.
message DialogStateOut {
  // Possible states of the microphone after an `Assist` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Assist` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* Supplemental display text from the Assistant. This could be
  // the same as the speech spoken in `AssistResponse.audio_out` or it could
  // be some additional information which aids the user's understanding.
  string supplemental_display_text = 1;

  // *Output-only* State information for the subsequent `Assist` RPC. This
  // value should be saved in the client and returned in the
  // [`DialogStateIn.conversation_state`](#dialogstatein) field with the next
  // `Assist` RPC. (The client does not need to interpret or otherwise use this
  // value.) This information should be saved across device reboots. However,
  // this value should be cleared (not saved in the client) during a
  // factory-default reset.
  bytes conversation_state = 2;

  // *Output-only* Specifies the mode of the microphone after this `Assist`
  // RPC is processed.
  MicrophoneMode microphone_mode = 3;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as *Increase the volume*
  // or *Set volume level 4* was recognized, in which case the value will be
  // between 1 and 100 (corresponding to the new volume level of 1% to 100%).
  // Typically, a client should use this volume level when playing the
  // `audio_out` data, and retain this value as the current volume level and
  // supply it in the `AudioOutConfig` of the next `AssistRequest`. (Some
  // clients may also implement other ways to allow the current volume level to
  // be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 4;
}
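
// One way to act on `DialogStateOut` across turns, continuing the Python
// sketch (the storage and playback helpers named here are placeholders):
// keep `conversation_state` for the next `DialogStateIn`, track volume
// changes, and re-open the microphone on `DIALOG_FOLLOW_ON`.
//
//   state = resp.dialog_state_out
//   if state.conversation_state:
//       save_conversation_state(state.conversation_state)  # keep across reboots
//   if state.volume_percentage:  # 0 means "no change"
//       current_volume = state.volume_percentage
//   if state.microphone_mode == assistant.DialogStateOut.DIALOG_FOLLOW_ON:
//       reopen_microphone_after_playback()  # hypothetical helper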

// Debugging parameters for the current request.
message DebugConfig {
  // When this field is set to true, the `debug_info` field in `AssistResponse`
  // may be populated. However, it will significantly increase the latency of
  // responses. Do not set this field to true in production code.
  bool return_debug_info = 6;
}

// There are three sources of locations. They are used with this precedence:
//
// 1. This `DeviceLocation`, which is primarily used for mobile devices with
//    GPS.
// 2. Location specified by the user during device setup; this is per-user,
//    per-device. This location is used if `DeviceLocation` is not specified.
// 3. Inferred location based on IP address. This is used only if neither of the
//    above are specified.
message DeviceLocation {
  oneof type {
    // Latitude and longitude of the device.
    google.type.LatLng coordinates = 1;
  }
}