// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.assistant.embedded.v1alpha2;

import "google/api/annotations.proto";
import "google/type/latlng.proto";

option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha2;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha2";
option objc_class_prefix = "ASTSDK";

// Service that implements the Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded Assistant
  // service. Each call performs one round-trip, sending an audio request to
  // the service and receiving the audio response. Uses bidirectional
  // streaming to receive results, such as the `END_OF_UTTERANCE` event,
  // while sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of
  // several streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the Assistant
  // responds *What do you want to add?*. The sequence of streamed requests
  // and responses in the first gRPC connection could be:
  //
  // * AssistRequest.config
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistResponse.event_type.END_OF_UTTERANCE
  // * AssistResponse.speech_results.transcript "add to my shopping list"
  // * AssistResponse.dialog_state_out.microphone_mode.DIALOG_FOLLOW_ON
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  //
  // The user then says *bagels* and the Assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent over another
  // gRPC connection with another call to the `Assist` method, again with
  // streamed requests and responses, such as:
  //
  // * AssistRequest.config
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistResponse.event_type.END_OF_UTTERANCE
  // * AssistResponse.dialog_state_out.microphone_mode.CLOSE_MICROPHONE
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // `AssistResponse.audio_out` messages will always contain sequential
  // portions of audio.
  rpc Assist(stream AssistRequest) returns (stream AssistResponse);
}
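
// A minimal sketch of driving this RPC from Python, assuming the stubs
// generated from this proto (published for Python as `google-assistant-grpc`)
// and an already-authorized gRPC channel; channel and credential setup are
// elided:
//
//   from google.assistant.embedded.v1alpha2 import (
//       embedded_assistant_pb2 as pb2,
//       embedded_assistant_pb2_grpc as pb2_grpc,
//   )
//
//   def assist(channel, requests):
//       # `requests` is an iterator of AssistRequest messages: one config
//       # message first, then audio chunks (see AssistRequest below).
//       stub = pb2_grpc.EmbeddedAssistantStub(channel)
//       for response in stub.Assist(requests):
//           yield response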

// The top-level message sent by the client. Clients must send at least two,
// and typically many more, `AssistRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message AssistRequest {
  // Exactly one of these fields must be specified in each `AssistRequest`.
  oneof type {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `AssistRequest` message must contain a `config` message.
    AssistConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `AssistRequest` messages. The first `AssistRequest`
    // message must not contain `audio_in` data and all subsequent
    // `AssistRequest` messages must contain `audio_in` data. The audio bytes
    // must be encoded as specified in `AudioInConfig`.
    // Audio must be sent in approximately real time (16000 samples per
    // second). An error will be returned if audio is sent significantly
    // faster or slower.
    bytes audio_in = 2;
  }
}
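
// A sketch of a conforming request stream in Python (generated stubs assumed,
// as above): the first message carries only `config`, and every later message
// carries only `audio_in`:
//
//   def gen_requests(config, audio_chunks):
//       yield pb2.AssistRequest(config=config)   # first: config, no audio
//       for chunk in audio_chunks:               # then: audio only
//           yield pb2.AssistRequest(audio_in=chunk)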

// The top-level message received by the client. A series of one or more
// `AssistResponse` messages is streamed back to the client.
message AssistResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the
    // user's speech utterance and expects no additional speech. Therefore,
    // the server will not process additional audio (although it may
    // subsequently return additional results). The client should stop
    // sending additional audio data, half-close the gRPC connection, and
    // wait for any additional results until the server closes the gRPC
    // connection.
    END_OF_UTTERANCE = 1;
  }

  // *Output-only* Indicates the type of event.
  EventType event_type = 1;

  // *Output-only* The audio containing the Assistant's response to the query.
  AudioOut audio_out = 3;

  // *Output-only* Contains the Assistant's visual response to the query.
  ScreenOut screen_out = 4;

  // *Output-only* Contains the action triggered by the query with the
  // appropriate payloads and semantic parsing.
  DeviceAction device_action = 6;

  // *Output-only* This repeated list contains zero or more speech
  // recognition results that correspond to consecutive portions of the audio
  // currently being processed, from the portion corresponding to the
  // earliest (and most stable) audio to the portion corresponding to the
  // most recent audio. The strings can be concatenated to view the full
  // in-progress response. When the speech recognition completes, this list
  // will contain one item with `stability` of `1.0`.
  repeated SpeechRecognitionResult speech_results = 2;

  // *Output-only* Contains output related to the user's query.
  DialogStateOut dialog_state_out = 5;

  // *Output-only* Debugging info for the developer. Only returned if the
  // request set `return_debug_info` to true.
  DebugInfo debug_info = 8;
}
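
// A sketch of consuming the response stream, continuing the Python sketches
// above (`stop_recording` is a hypothetical client-side helper):
//
//   audio_buf = bytearray()
//   for resp in stub.Assist(gen_requests(config, chunks)):
//       if resp.event_type == pb2.AssistResponse.END_OF_UTTERANCE:
//           stop_recording()  # server wants no more audio; half-close next
//       for result in resp.speech_results:
//           print(result.transcript, result.stability)
//       audio_buf.extend(resp.audio_out.audio_data)  # empty when absent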

// Debug info for the developer. Only returned if the request set
// `return_debug_info` to true.
message DebugInfo {
  // The original JSON response from an Action-on-Google agent to the Google
  // server. See
  // https://developers.google.com/actions/reference/rest/Shared.Types/AppResponse.
  // It will only be populated if the request maker owns the AoG project and
  // the AoG project is in preview mode.
  string aog_agent_to_assistant_json = 1;
}

// Specifies how to process the `AssistRequest` messages.
message AssistConfig {
  oneof type {
    // Specifies how to process the subsequent incoming audio. Required if
    // [AssistRequest.audio_in][google.assistant.embedded.v1alpha2.AssistRequest.audio_in]
    // bytes will be provided in subsequent requests.
    AudioInConfig audio_in_config = 1;

    // The text input to be sent to the Assistant. This can be populated from
    // a text interface if audio input is not available.
    string text_query = 6;
  }

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Optional* Specifies the desired format to use when the server returns a
  // visual screen response.
  ScreenOutConfig screen_out_config = 8;

  // *Required* Represents the current dialog state.
  DialogStateIn dialog_state_in = 3;

  // Device configuration that uniquely identifies a specific device.
  DeviceConfig device_config = 4;

  // *Optional* Debugging parameters for the whole `Assist` RPC.
  DebugConfig debug_config = 5;
}
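
// A sketch of an `AssistConfig` for a text query in Python (generated stubs
// assumed); the device identifiers are placeholders that must come from
// device registration:
//
//   config = pb2.AssistConfig(
//       text_query="what time is it",
//       audio_out_config=pb2.AudioOutConfig(
//           encoding=pb2.AudioOutConfig.LINEAR16,
//           sample_rate_hertz=16000,
//           volume_percentage=50,
//       ),
//       dialog_state_in=pb2.DialogStateIn(language_code="en-US"),
//       device_config=pb2.DeviceConfig(
//           device_id="my-device-id",           # placeholder
//           device_model_id="my-device-model",  # placeholder
//       ),
//   )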

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant
// SDK [best
// practices](https://developers.google.com/assistant/sdk/guides/service/python/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono).
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless
    // Audio Codec) is the recommended encoding because it is lossless (so
    // recognition is not compromised) and requires only about half the
    // bandwidth of `LINEAR16`. This encoding includes the `FLAC` stream
    // header followed by audio data. It supports 16-bit and 24-bit samples;
    // however, not all fields in `STREAMINFO` are supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all
  // `audio_in` messages. Valid values are from 16000 to 24000, but 16000 is
  // optimal. For best results, set the sampling rate of the audio source to
  // 16000 Hz. If that's not possible, use the native sample rate of the
  // audio source (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}
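
// At 16000 Hz LINEAR16 mono, the stream carries 16000 samples/s * 2 bytes =
// 32000 bytes per second, so a 100 ms chunk is 3200 bytes. A sketch of
// real-time pacing in Python (stdlib only; the chunk size is an assumption):
//
//   import time
//
//   def paced_chunks(raw_pcm, chunk_bytes=3200, chunk_seconds=0.1):
//       # Yield one chunk roughly every 100 ms so the server receives
//       # audio at approximately the real-time rate it requires.
//       for i in range(0, len(raw_pcm), chunk_bytes):
//           yield raw_pcm[i:i + chunk_bytes]
//           time.sleep(chunk_seconds)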

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings
  // are raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an Ogg container. The result is a file
    // which can be played natively on Android and in some browsers (such as
    // Chrome). The quality of the encoding is considerably higher than MP3
    // at the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are from 16000 to 24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}
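
// Because LINEAR16 `audio_out` bytes carry no header, a client that wants a
// playable file must add one. A sketch using only the Python stdlib:
//
//   import wave
//
//   def save_wav(pcm_bytes, path="reply.wav", rate=16000):
//       with wave.open(path, "wb") as f:
//           f.setnchannels(1)     # Assistant audio is mono
//           f.setsampwidth(2)     # 16-bit samples
//           f.setframerate(rate)  # must match the requested sample_rate_hertz
//           f.writeframes(pcm_bytes)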

// Specifies the desired format for the server to use when it returns a
// `screen_out` response.
message ScreenOutConfig {
  // Possible modes for visual screen output on the device.
  enum ScreenMode {
    // No video mode specified.
    // The Assistant may respond as if in `OFF` mode.
    SCREEN_MODE_UNSPECIFIED = 0;

    // Screen is off (or has brightness or other settings set so low it is
    // not visible). The Assistant will typically not return a screen
    // response in this mode.
    OFF = 1;

    // The Assistant will typically return a partial-screen response in this
    // mode.
    PLAYING = 3;
  }

  // Current visual screen mode for the device while issuing the query.
  ScreenMode screen_mode = 1;
}

// Provides information about the current dialog state.
message DialogStateIn {
  // *Required* This field must always be set to the
  // [DialogStateOut.conversation_state][google.assistant.embedded.v1alpha2.DialogStateOut.conversation_state]
  // value that was returned in the prior `Assist` RPC. It should only be
  // omitted (field not set) if there was no prior `Assist` RPC because this
  // is the first `Assist` RPC made by this device after initial setup or a
  // factory-default reset.
  bytes conversation_state = 1;

  // *Required* Language of the request in
  // [IETF BCP 47 syntax](https://tools.ietf.org/html/bcp47) (for example,
  // "en-US"). See [Language
  // Support](https://developers.google.com/assistant/sdk/reference/rpc/languages)
  // for more information. If you have selected a language for this
  // `device_id` using the
  // [Settings](https://developers.google.com/assistant/sdk/reference/assistant-app/assistant-settings)
  // menu in your phone's Google Assistant app, that selection will override
  // this value.
  string language_code = 2;

  // *Optional* Location of the device where the query originated.
  DeviceLocation device_location = 5;

  // *Optional* If true, the server will treat the request as a new
  // conversation and not use state from the prior request. Set this field to
  // true when the conversation should be restarted, such as after a device
  // reboot or after a significant lapse of time since the prior query.
  bool is_new_conversation = 7;
}
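
// A sketch of persisting the opaque `conversation_state` bytes between turns
// in Python (the file path is illustrative):
//
//   import os
//
//   STATE_PATH = "conversation_state.bin"
//
//   def load_state():
//       # First RPC after setup or a factory reset: leave the field unset.
//       if not os.path.exists(STATE_PATH):
//           return b""
//       with open(STATE_PATH, "rb") as f:
//           return f.read()
//
//   def save_state(state_bytes):
//       with open(STATE_PATH, "wb") as f:
//           f.write(state_bytes)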

// *Required* Fields that identify the device to the Assistant.
//
// See also:
//
// * [Register a Device - REST
//   API](https://developers.google.com/assistant/sdk/reference/device-registration/register-device-manual)
// * [Device Model and Instance
//   Schemas](https://developers.google.com/assistant/sdk/reference/device-registration/model-and-instance-schemas)
// * [Device
//   Proto](https://developers.google.com/assistant/sdk/reference/rpc/google.assistant.devices.v1alpha2#device)
message DeviceConfig {
  // *Required* Unique identifier for the device. The id length must be 128
  // characters or less. Example: DBCDW098234. This MUST match the device_id
  // returned from device registration. This device_id is used to match
  // against the user's registered devices to look up the supported traits
  // and capabilities of this device. This information should not change
  // across device reboots. However, it should not be saved across
  // factory-default resets.
  string device_id = 1;

  // *Required* Unique identifier for the device model. The combination of
  // device_model_id and device_id must have been previously associated
  // through device registration.
  string device_model_id = 3;
}

// The audio containing the Assistant's response to the query. Sequential
// chunks of audio data are received in sequential `AssistResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the Assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `AssistResponse` messages.
  bytes audio_data = 1;
}

// The Assistant's visual output response to the query. Enabled by
// `screen_out_config`.
message ScreenOut {
  // Possible formats of the screen data.
  enum Format {
    // No format specified.
    FORMAT_UNSPECIFIED = 0;

    // Data will contain a fully-formed HTML5 layout encoded in UTF-8, e.g.
    // `<html><body><div>...</div></body></html>`. It is intended to be
    // rendered along with the audio response. Note that the HTML5 doctype
    // should be included in the actual HTML data.
    HTML = 1;
  }

  // *Output-only* The format of the provided screen data.
  Format format = 1;

  // *Output-only* The raw screen data to be displayed as the result of the
  // Assistant query.
  bytes data = 2;
}
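
// A sketch of surfacing an HTML `screen_out` payload by writing it to a file
// that a browser can open (generated stubs assumed; the path is illustrative):
//
//   def show_screen_out(screen_out, path="screen_out.html"):
//       if screen_out.format == pb2.ScreenOut.HTML:
//           with open(path, "wb") as f:
//               f.write(screen_out.data)  # UTF-8 encoded HTML5 document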

// The response returned to the device if the user has triggered a Device
// Action. For example, a device which supports the query *Turn on the light*
// would receive a `DeviceAction` with a JSON payload containing the
// semantics of the request.
message DeviceAction {
  // JSON containing the device command response generated from the triggered
  // Device Action grammar. The format is given by the
  // `action.devices.EXECUTE` intent for a given
  // [trait](https://developers.google.com/assistant/sdk/reference/traits/).
  string device_request_json = 1;
}
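
// A sketch of decoding that payload in Python (stdlib only); the key paths
// below follow the general shape of the `action.devices.EXECUTE` intent and
// may need adjusting per trait:
//
//   import json
//
//   def handle_device_action(device_request_json):
//       request = json.loads(device_request_json)
//       for inp in request.get("inputs", []):
//           if inp.get("intent") != "action.devices.EXECUTE":
//               continue
//           for command in inp["payload"]["commands"]:
//               for execution in command["execution"]:
//                   print(execution["command"], execution.get("params"))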

// The estimated transcription of a phrase the user has spoken. This could be
// a single segment or the full guess of the user's spoken query.
message SpeechRecognitionResult {
  // *Output-only* Transcript text representing the words that the user spoke.
  string transcript = 1;

  // *Output-only* An estimate of the likelihood that the Assistant will not
  // change its guess about this result. Values range from 0.0 (completely
  // unstable) to 1.0 (completely stable and final). The default of 0.0 is a
  // sentinel value indicating `stability` was not set.
  float stability = 2;
}
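
// A sketch of rendering the in-progress transcript by concatenating the
// repeated results in order (joining on a space is an assumption about
// segment boundaries):
//
//   def full_transcript(speech_results):
//       return " ".join(r.transcript for r in speech_results)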

// The dialog state resulting from the user's query. Multiple of these
// messages may be received.
message DialogStateOut {
  // Possible states of the microphone after an `Assist` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Assist` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* Supplemental display text from the Assistant. This could
  // be the same as the speech spoken in `AssistResponse.audio_out` or it
  // could be some additional information which aids the user's
  // understanding.
  string supplemental_display_text = 1;

  // *Output-only* State information for the subsequent `Assist` RPC. This
  // value should be saved in the client and returned in the
  // [`DialogStateIn.conversation_state`](#dialogstatein) field with the next
  // `Assist` RPC. (The client does not need to interpret or otherwise use
  // this value.) This information should be saved across device reboots.
  // However, this value should be cleared (not saved in the client) during a
  // factory-default reset.
  bytes conversation_state = 2;

  // *Output-only* Specifies the mode of the microphone after this `Assist`
  // RPC is processed.
  MicrophoneMode microphone_mode = 3;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as *Increase the
  // volume* or *Set volume level 4* was recognized, in which case the value
  // will be between 1 and 100 (corresponding to the new volume level of 1%
  // to 100%). Typically, a client should use this volume level when playing
  // the `audio_out` data, retain this value as the current volume level, and
  // supply it in the `AudioOutConfig` of the next `AssistRequest`. (Some
  // clients may also implement other ways to allow the current volume level
  // to be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 4;
}
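
// A sketch of acting on this message at the end of a turn in Python
// (generated stubs assumed; `save_state` is the persistence helper sketched
// under DialogStateIn, and `volume` is client-side state):
//
//   def end_of_turn(dialog_state_out, volume):
//       save_state(dialog_state_out.conversation_state)
//       if dialog_state_out.volume_percentage:  # 0 means "no change"
//           volume = dialog_state_out.volume_percentage
//       follow_on = (dialog_state_out.microphone_mode
//                    == pb2.DialogStateOut.DIALOG_FOLLOW_ON)
//       # The caller re-opens the microphone after playback if follow_on.
//       return volume, follow_on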

// Debugging parameters for the current request.
message DebugConfig {
  // When this field is set to true, the `debug_info` field in
  // `AssistResponse` may be populated. However, it will significantly
  // increase the latency of responses. Do not set this field to true in
  // production code.
  bool return_debug_info = 6;
}

// There are three sources of locations. They are used with this precedence:
//
// 1. This `DeviceLocation`, which is primarily used for mobile devices with
//    GPS.
// 2. Location specified by the user during device setup; this is per-user,
//    per-device. This location is used if `DeviceLocation` is not specified.
// 3. Inferred location based on IP address. This is used only if neither of
//    the above are specified.
message DeviceLocation {
  oneof type {
    // Latitude and longitude of the device.
    google.type.LatLng coordinates = 1;
  }
}
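
// A sketch of supplying device coordinates in Python (generated stubs
// assumed; `latlng_pb2` comes from the googleapis common protos, and the
// coordinates are placeholders):
//
//   from google.type import latlng_pb2
//
//   device_location = pb2.DeviceLocation(
//       coordinates=latlng_pb2.LatLng(latitude=37.422, longitude=-122.084),
//   )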