media_translation.proto
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.mediatranslation.v1alpha1;

import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/rpc/status.proto";

option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/cloud/mediatranslation/v1alpha1;mediatranslation";
option java_package = "com.google.cloud.mediatranslation.v1alpha1";

// Provides translation from/to media types.
service SpeechTranslationService {
  option (google.api.default_host) = "mediatranslation.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs bidirectional streaming speech translation: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingTranslateSpeech(stream StreamingTranslateSpeechRequest)
      returns (stream StreamingTranslateSpeechResponse) {}
}
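
As a rough illustration of how a client reaches this service, here is a minimal Python sketch. It assumes stubs generated from this file by protoc's Python and gRPC plugins (media_translation_pb2 / media_translation_pb2_grpc are protoc's standard module names for media_translation.proto); the google-auth channel setup is an assumption, using the OAuth scope declared on the service.

from google.auth import default as google_auth_default
from google.auth.transport.grpc import secure_authorized_channel
from google.auth.transport.requests import Request

import media_translation_pb2_grpc

# Credentials scoped to the OAuth scope declared on the service above.
credentials, _ = google_auth_default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"])

# Secure channel to the default host declared on the service above.
channel = secure_authorized_channel(
    credentials, Request(), "mediatranslation.googleapis.com:443")
stub = media_translation_pb2_grpc.SpeechTranslationServiceStub(channel)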
// Provides information to the speech translation that specifies how to process
// the request.
message TranslateSpeechConfig {
  // Required. Encoding of audio data.
  // Supported formats:
  //
  // - `linear16`
  //
  //   Uncompressed 16-bit signed little-endian samples (Linear PCM).
  //
  // - `flac`
  //
  //   `flac` (Free Lossless Audio Codec) is the recommended encoding
  //   because it is lossless--therefore recognition is not compromised--and
  //   requires only about half the bandwidth of `linear16`.
  //
  // - `mulaw`
  //
  //   8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  //
  // - `amr`
  //
  //   Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  //
  // - `amr-wb`
  //
  //   Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  //
  // - `ogg-opus`
  //
  //   Opus encoded audio frames in an Ogg container
  //   ([OggOpus](https://wiki.xiph.org/OggOpus)).
  //   `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
  //
  // - `mp3`
  //
  //   MP3 audio. Supports all standard MP3 bitrates (which range from 32 to
  //   320 kbps). When using this encoding, `sample_rate_hertz` has to match
  //   the sample rate of the file being used.
  string audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Source language code (BCP-47) of the input audio.
  string source_language_code = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. Target language code (BCP-47) of the output.
  string target_language_code = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. A list of up to 3 additional language codes (BCP-47), listing
  // possible alternative languages of the supplied audio. If alternative
  // source languages are listed, the result will be translated from the most
  // likely language detected in the audio (including the main
  // source_language_code), and the translated result will include the
  // language code of the detected language.
  // Note:
  // 1. If a provided alternative_source_language_code is not supported by the
  // current API version, that language code is skipped.
  // 2. If the user provides only one eligible entry in
  // alternative_source_language_codes, the translation happens between
  // source_language_code and that alternative language, and
  // target_language_code is ignored. This is useful in conversation mode.
  repeated string alternative_source_language_codes = 6
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Sample rate in Hertz of the audio data. Valid values are:
  // 8000-48000. 16000 is optimal. For best results, set the sampling rate of
  // the audio source to 16000 Hz. If that's not possible, use the native
  // sample rate of the audio source (instead of re-sampling).
  int32 sample_rate_hertz = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional.
  string model = 5 [(google.api.field_behavior) = OPTIONAL];
}
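
To make the field semantics concrete, here is a sketch of building this message from Python (generated module name media_translation_pb2 assumed as before; the encoding, language codes, and sample rate are example values drawn from the field comments, not requirements):

import media_translation_pb2

audio_config = media_translation_pb2.TranslateSpeechConfig(
    audio_encoding="linear16",     # uncompressed 16-bit signed LE PCM
    source_language_code="en-US",  # BCP-47 code of the input audio
    target_language_code="es-ES",  # BCP-47 code of the output
    sample_rate_hertz=16000,       # 16000 Hz is optimal per the comment above
)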
// Config used for streaming translation.
message StreamingTranslateSpeechConfig {
  // Required. The common config for all the following audio contents.
  TranslateSpeechConfig audio_config = 1
      [(google.api.field_behavior) = REQUIRED];

  // Optional. If `false` or omitted, the system performs
  // continuous translation (continuing to wait for and process audio even if
  // the user pauses speaking) until the client closes the input stream (gRPC
  // API) or until the maximum time limit has been reached. May return multiple
  // `StreamingTranslateSpeechResult`s with the `is_final` flag set to `true`.
  //
  // If `true`, the speech translator will detect a single spoken utterance.
  // When it detects that the user has paused or stopped speaking, it will
  // return an `END_OF_SINGLE_UTTERANCE` event and cease translation.
  // When the client receives the `END_OF_SINGLE_UTTERANCE` event, it should
  // stop sending requests but keep receiving the remaining responses until
  // the stream is terminated. To construct the complete sentence in a
  // streaming way, one should override (if `is_final` of the previous
  // response is `false`) or append (if `is_final` of the previous response
  // is `true`).
  bool single_utterance = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Stability control for the media translation text. The value
  // should be one of "LOW", "MEDIUM", or "HIGH". It applies to text and
  // text_and_audio translation only.
  // Audio translation mode supports only "HIGH" stability;
  // low/medium stability mode will throw an argument error.
  // The default empty string is treated as "HIGH" in audio translation mode
  // and as "LOW" in the other translation modes.
  // Note that stability and speed are a trade-off:
  // 1. "LOW": The translation service starts translating right after
  // receiving a recognition response. This is the fastest mode.
  // 2. "MEDIUM": The translation service checks whether a recognition
  // response is stable enough, and only translates recognition responses
  // that are unlikely to change later.
  // 3. "HIGH": The translation service waits for more stable recognition
  // responses before starting to translate. Also, subsequent recognition
  // responses cannot modify previous recognition responses, so quality may
  // be impacted in some situations. "HIGH" stability will generate "final"
  // responses more frequently.
  string stability = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Translation mode; the value should be one of "text", "audio",
  // or "text_and_audio". The default empty string is treated as "text".
  // 1. "text": The response will be a text translation. A text translation
  // has an "is_final" field; its detailed definition can be found in
  // `TextTranslationResult`.
  // 2. "audio": The response will be an audio translation. An audio
  // translation does not have an "is_final" field, which means each audio
  // translation response is stable and will not be changed by a later
  // response. Translation mode "audio" can only be used with "high"
  // stability mode.
  // 3. "text_and_audio": The response will have a text translation; when
  // "is_final" is true, the corresponding audio translation is also output.
  // When "is_final" is false, the audio_translation field will be empty.
  string translation_mode = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If disable_interim_results is true, only "final" responses are
  // returned; otherwise, all responses are returned. The default value is
  // false. disable_interim_results can only be set to true with "high"
  // stability mode.
  bool disable_interim_results = 5 [(google.api.field_behavior) = OPTIONAL];
}
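
Continuing the sketch, the streaming config wraps the audio config built above. The values chosen here are illustrative, not recommendations; note the constraint above that "audio" translation mode requires "HIGH" stability.

import media_translation_pb2

streaming_config = media_translation_pb2.StreamingTranslateSpeechConfig(
    audio_config=audio_config,  # TranslateSpeechConfig from the sketch above
    single_utterance=False,     # translate continuously until stream closes
    stability="LOW",            # fastest, least stable (text mode only)
    translation_mode="text",    # text responses carrying is_final flags
)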
// The top-level message sent by the client for the `StreamingTranslateSpeech`
// method. Multiple `StreamingTranslateSpeechRequest` messages are sent. The
// first message must contain a `streaming_config` message and must not contain
// `audio_content` data. All subsequent messages must contain `audio_content`
// data and must not contain a `streaming_config` message.
message StreamingTranslateSpeechRequest {
  // The streaming request, which is either a streaming config or content.
  oneof streaming_request {
    // Provides information to the recognizer that specifies how to process
    // the request. The first `StreamingTranslateSpeechRequest` message must
    // contain a `streaming_config` message.
    StreamingTranslateSpeechConfig streaming_config = 1;

    // The audio data to be translated. Sequential chunks of audio data are
    // sent in sequential `StreamingTranslateSpeechRequest` messages. The
    // first `StreamingTranslateSpeechRequest` message must not contain
    // `audio_content` data and all subsequent `StreamingTranslateSpeechRequest`
    // messages must contain `audio_content` data. The audio bytes must be
    // encoded as specified in `StreamingTranslateSpeechConfig`. Note: as with
    // all bytes fields, proto buffers use a pure binary representation (not
    // base64).
    bytes audio_content = 2;
  }
}
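
The first-message rule above maps naturally onto a Python generator: one config-only request, then audio-only requests. The chunk size is an arbitrary choice for this sketch.

import media_translation_pb2

CHUNK_SIZE = 4096  # arbitrary; any reasonable chunking works

def request_stream(streaming_config, audio_file):
    # First message: streaming_config only, no audio_content.
    yield media_translation_pb2.StreamingTranslateSpeechRequest(
        streaming_config=streaming_config)
    # All subsequent messages: audio_content only (raw bytes, not base64).
    while True:
        chunk = audio_file.read(CHUNK_SIZE)
        if not chunk:
            break
        yield media_translation_pb2.StreamingTranslateSpeechRequest(
            audio_content=chunk)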
// A streaming speech translation result corresponding to a portion of the
// audio that is currently being processed.
message StreamingTranslateSpeechResult {
  // Text translation result.
  message TextTranslationResult {
    // Output only. The translated sentence.
    string translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. If `false`, this `StreamingTranslateSpeechResult`
    // represents an interim result that may change. If `true`, this is the
    // final time the translation service will return this particular
    // `StreamingTranslateSpeechResult`; the streaming translator will not
    // return any further hypotheses for this portion of the transcript and
    // corresponding audio.
    bool is_final = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Audio translation result.
  message AudioTranslationResult {
    // Output only. The translated audio.
    bytes audio_translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Text translation result.
  TextTranslationResult text_translation_result = 1;

  // Audio translation result.
  AudioTranslationResult audio_translation_result = 2;

  // Output only. The recognition result in the original language. This field
  // is debug-only and will be set to an empty string if not available. It is
  // an implementation detail and will not be backward compatible.
  string recognition_result = 3 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only.
  string detected_source_language_code = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];
}
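
The override-or-append rule described under `single_utterance` can be written as a small helper over these results. This is a sketch for text translation mode only, not library code:

def assemble_text(results):
    # results: iterable of StreamingTranslateSpeechResult messages with
    # text_translation_result set, in the order they were received.
    finalized, interim = [], ""
    for result in results:
        text = result.text_translation_result
        if text.is_final:
            finalized.append(text.translation)  # append: this part is fixed
            interim = ""
        else:
            interim = text.translation  # override the previous interim part
    return " ".join(finalized + ([interim] if interim else []))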
// A streaming speech translation response corresponding to a portion of
// the audio currently processed.
message StreamingTranslateSpeechResponse {
  // Indicates the type of speech event.
  enum SpeechEventType {
    // No speech event specified.
    SPEECH_EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the
    // user's speech utterance and expects no additional speech. Therefore,
    // the server will not process additional audio (although it may
    // subsequently return additional results). When the client receives the
    // `END_OF_SINGLE_UTTERANCE` event, it should stop sending requests but
    // keep receiving the remaining responses until the stream is terminated.
    // To construct the complete sentence in a streaming way, one should
    // override (if `is_final` of the previous response is `false`) or append
    // (if `is_final` of the previous response is `true`). This event is only
    // sent if `single_utterance` was set to `true`, and is not used
    // otherwise.
    END_OF_SINGLE_UTTERANCE = 1;
  }

  // Output only. If set, returns a [google.rpc.Status][google.rpc.Status]
  // message that specifies the error for the operation.
  google.rpc.Status error = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The translation result that is currently being processed.
  // (For text translation, `is_final` can be `true` or `false`. For audio
  // translation, there is no `is_final` field, which means each audio
  // response is stable and will not be changed later. For text_and_audio,
  // the text translation still has an `is_final` field, but the
  // corresponding audio is only output when `is_final` is true.)
  StreamingTranslateSpeechResult result = 2
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Indicates the type of speech event.
  SpeechEventType speech_event_type = 3
      [(google.api.field_behavior) = OUTPUT_ONLY];
}
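
Putting the sketch together, an end-to-end read loop might look like the following; the stub, streaming_config, and request_stream come from the earlier fragments, and the audio file name is hypothetical.

import media_translation_pb2

responses = stub.StreamingTranslateSpeech(
    request_stream(streaming_config, open("audio.raw", "rb")))

for response in responses:
    # A populated error field carries a google.rpc.Status for the operation.
    if response.HasField("error"):
        raise RuntimeError("translation failed: " + response.error.message)
    # On END_OF_SINGLE_UTTERANCE, stop sending audio but keep reading
    # responses until the server terminates the stream.
    if (response.speech_event_type ==
            media_translation_pb2.StreamingTranslateSpeechResponse
            .SpeechEventType.END_OF_SINGLE_UTTERANCE):
        print("end of utterance detected")
    text = response.result.text_translation_result
    print(("FINAL: " if text.is_final else "interim: ") + text.translation)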