embedded_assistant.proto

// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.assistant.embedded.v1alpha1;

import "google/api/annotations.proto";
import "google/rpc/status.proto";

option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha1;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha1";

// Service that implements the Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded assistant service.
  // Each call performs one round-trip, sending an audio request to the service
  // and receiving the audio response. Uses bidirectional streaming to receive
  // results, such as the `END_OF_UTTERANCE` event, while sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of several
  // streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the assistant
  // responds *What do you want to add?*. The sequence of streamed requests and
  // responses in the first gRPC call could be:
  //
  // * ConverseRequest.config
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseResponse.event_type.END_OF_UTTERANCE
  // * ConverseResponse.result.microphone_mode.DIALOG_FOLLOW_ON
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  //
  // The user then says *bagels* and the assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another
  // gRPC call to the `Converse` method, again with streamed requests and
  // responses, such as:
  //
  // * ConverseRequest.config
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseResponse.event_type.END_OF_UTTERANCE
  // * ConverseResponse.result.microphone_mode.CLOSE_MICROPHONE
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // ConverseResponse.audio_out messages will always contain sequential
  // portions of audio.
  rpc Converse(stream ConverseRequest) returns (stream ConverseResponse);
}
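
// For illustration only, a minimal client loop tying conversation rounds
// together via `microphone_mode` (a sketch, not part of the API; `stub`,
// `make_config`, `record_audio`, and `gen_requests` are assumed helpers --
// see the request-generator sketch near `ConverseRequest` below):
//
//   # Python-style pseudocode
//   follow_on = True
//   while follow_on:
//       responses = stub.Converse(gen_requests(make_config(), record_audio()))
//       follow_on = False
//       for resp in responses:
//           if resp.HasField("result"):
//               follow_on = (resp.result.microphone_mode ==
//                            ConverseResult.DIALOG_FOLLOW_ON)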

// Specifies how to process the `ConverseRequest` messages.
message ConverseConfig {
  // *Required* Specifies how to process the subsequent incoming audio.
  AudioInConfig audio_in_config = 1;

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Required* Represents the current dialog state.
  ConverseState converse_state = 3;
}
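
// An illustrative `ConverseConfig` in proto text format (the values are
// assumptions chosen from the ranges documented in the messages below):
//
//   audio_in_config {
//     encoding: LINEAR16
//     sample_rate_hertz: 16000
//   }
//   audio_out_config {
//     encoding: OPUS_IN_OGG
//     sample_rate_hertz: 16000
//     volume_percentage: 50
//   }
//   converse_state {
//     # bytes returned by the prior ConverseResponse; omit on the first
//     # round of a new conversation
//     conversation_state: "..."
//   }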

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant SDK
// [best
// practices](https://developers.google.com/assistant/sdk/develop/grpc/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono). The only language supported is "en-US".
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
    // Codec) is the recommended encoding because it is
    // lossless--therefore recognition is not compromised--and
    // requires only about half the bandwidth of `LINEAR16`. This encoding
    // includes the `FLAC` stream header followed by audio data. It supports
    // 16-bit and 24-bit samples; however, not all fields in `STREAMINFO` are
    // supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all `audio_in`
  // messages. Valid values are from 16000 to 24000, but 16000 is optimal.
  // For best results, set the sampling rate of the audio source to 16000 Hz.
  // If that's not possible, use the native sample rate of the audio source
  // (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}
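
// A worked example of the bandwidth involved (an illustration, not an API
// requirement): with `LINEAR16` at 16000 Hz, one second of mono audio is
// 16000 samples x 2 bytes = 32000 bytes, so a 3200-byte `audio_in` chunk
// covers roughly 100 ms of speech.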

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings
  // are raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an Ogg container. The result is a file
    // that can be played natively on Android and in some browsers (such as
    // Chrome). The quality of the encoding is considerably higher than MP3
    // at the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}
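
// For illustration (a sketch, not part of the API): because `MP3` and
// `OPUS_IN_OGG` payloads carry their own headers, the sequential
// `audio_out.audio_data` chunks can simply be concatenated into a playable
// file. In Python-style pseudocode, with `responses` assumed to be the
// stream returned by `Converse`:
//
//   with open("reply.ogg", "wb") as f:
//       for resp in responses:
//           if resp.HasField("audio_out"):
//               f.write(resp.audio_out.audio_data)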

// Provides information about the current dialog state.
message ConverseState {
  // *Required* The `conversation_state` value returned in the prior
  // `ConverseResponse`. Omit (do not set the field) if there was no prior
  // `ConverseResponse`. If there was a prior `ConverseResponse`, do not omit
  // this field; doing so will end that conversation (and this new request will
  // start a new conversation).
  bytes conversation_state = 1;
}
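
// Illustrative round-trip of the state bytes (a sketch; variable names are
// assumptions):
//
//   # Save the state delivered with each result...
//   if resp.HasField("result") and resp.result.conversation_state:
//       saved_state = resp.result.conversation_state
//   # ...and echo it back in the next round's config.
//   config.converse_state.conversation_state = saved_state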

// The audio containing the assistant's response to the query. Sequential
// chunks of audio data are received in sequential `ConverseResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `ConverseResponse` messages.
  bytes audio_data = 1;
}

// The semantic result for the user's spoken query.
message ConverseResult {
  // Possible states of the microphone after a `Converse` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Converse` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* The recognized transcript of what the user said.
  string spoken_request_text = 1;

  // *Output-only* The text of the assistant's spoken response. This is only
  // returned for an IFTTT action.
  string spoken_response_text = 2;

  // *Output-only* State information for the subsequent `ConverseRequest`. This
  // value should be saved in the client and returned in the
  // `conversation_state` with the next `ConverseRequest`. (The client does not
  // need to interpret or otherwise use this value.) There is no need to save
  // this information across device restarts.
  bytes conversation_state = 3;

  // *Output-only* Specifies the mode of the microphone after this `Converse`
  // RPC is processed.
  MicrophoneMode microphone_mode = 4;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as "Increase the
  // volume" or "Set volume level 4" was recognized, in which case the value
  // will be between 1 and 100 (corresponding to the new volume level of 1% to
  // 100%). Typically, a client should use this volume level when playing the
  // `audio_out` data, retain this value as the current volume level, and
  // supply it in the `AudioOutConfig` of the next `ConverseRequest`. (Some
  // clients may also implement other ways to allow the current volume level
  // to be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 5;
}
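
// Illustrative volume handling (a sketch; `current_volume` is an assumed
// client-side variable):
//
//   if resp.result.volume_percentage:          # 0 means "no change"
//       current_volume = resp.result.volume_percentage
//   # Echo it back in the next round:
//   config.audio_out_config.volume_percentage = current_volume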

// The top-level message sent by the client. Clients must send at least two,
// and typically many more, `ConverseRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message ConverseRequest {
  // Exactly one of these fields must be specified in each `ConverseRequest`.
  oneof converse_request {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `ConverseRequest` message must contain a `config` message.
    ConverseConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `ConverseRequest` messages. The first
    // `ConverseRequest` message must not contain `audio_in` data and all
    // subsequent `ConverseRequest` messages must contain `audio_in` data. The
    // audio bytes must be encoded as specified in `AudioInConfig`.
    // Audio must be sent in approximately real time (16000 samples per
    // second). An error will be returned if audio is sent significantly
    // faster or slower.
    bytes audio_in = 2;
  }
}
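
// An illustrative request generator (a sketch; the pacing and chunk sizes are
// assumptions derived from the real-time requirement above; assumes
// `import time`):
//
//   def gen_requests(config, chunks):
//       yield ConverseRequest(config=config)     # first message: config only
//       for chunk in chunks:                     # then: audio_in only
//           yield ConverseRequest(audio_in=chunk)
//           time.sleep(len(chunk) / 32000.0)     # ~real time at 16 kHz LINEAR16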

// The top-level message received by the client. A series of one or more
// `ConverseResponse` messages is streamed back to the client.
message ConverseResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional
    // results until the server closes the gRPC connection.
    END_OF_UTTERANCE = 1;
  }

  // Exactly one of these fields will be populated in each `ConverseResponse`.
  oneof converse_response {
    // *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status]
    // message that specifies the error for the operation. If an error occurs
    // during processing, this message will be set and there will be no
    // further messages sent.
    google.rpc.Status error = 1;

    // *Output-only* Indicates the type of event.
    EventType event_type = 2;

    // *Output-only* The audio containing the assistant's response to the
    // query.
    AudioOut audio_out = 3;

    // *Output-only* The semantic result for the user's spoken query.
    ConverseResult result = 5;
  }
}
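
// An illustrative response-handling loop (a sketch, not part of the API;
// `stop_recording` and `play_audio` are assumed client-side helpers):
//
//   responses = stub.Converse(gen_requests(config, chunks))
//   for resp in responses:
//       which = resp.WhichOneof("converse_response")
//       if which == "error":
//           break                                # no further messages follow
//       elif which == "event_type":              # e.g. END_OF_UTTERANCE
//           stop_recording()                     # half-close; keep reading
//       elif which == "audio_out":
//           play_audio(resp.audio_out.audio_data)
//       elif which == "result":
//           saved_state = resp.result.conversation_state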