// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.mediatranslation.v1beta1;

import "google/api/field_behavior.proto";
import "google/rpc/status.proto";
import "google/api/client.proto";

option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/cloud/mediatranslation/v1beta1;mediatranslation";
option java_multiple_files = true;
option java_outer_classname = "MediaTranslationProto";
option java_package = "com.google.cloud.mediatranslation.v1beta1";
option csharp_namespace = "Google.Cloud.MediaTranslation.V1Beta1";
option ruby_package = "Google::Cloud::MediaTranslation::V1beta1";
option php_namespace = "Google\\Cloud\\MediaTranslation\\V1beta1";

// Provides translation from/to media types.
service SpeechTranslationService {
  option (google.api.default_host) = "mediatranslation.googleapis.com";
  option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";

  // Performs bidirectional streaming speech translation: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingTranslateSpeech(stream StreamingTranslateSpeechRequest)
      returns (stream StreamingTranslateSpeechResponse) {}
}
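
// Usage sketch (illustrative, not part of this file): the call above is
// gRPC-only, so clients need a channel and a generated stub. The snippet
// below assumes the standard protoc/grpcio-tools Python output for this file
// (media_translation_pb2_grpc); real calls also need OAuth call credentials
// for the cloud-platform scope, omitted here.
//
//   import grpc
//   import media_translation_pb2_grpc  # assumed generated module name
//
//   channel = grpc.secure_channel(
//       "mediatranslation.googleapis.com:443", grpc.ssl_channel_credentials())
//   stub = media_translation_pb2_grpc.SpeechTranslationServiceStub(channel)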

// Provides information to the speech translation that specifies how to process
// the request.
message TranslateSpeechConfig {
  // Required. Encoding of audio data.
  // Supported formats:
  //
  // - `linear16`
  //
  //   Uncompressed 16-bit signed little-endian samples (Linear PCM).
  //
  // - `flac`
  //
  //   `flac` (Free Lossless Audio Codec) is the recommended encoding
  //   because it is lossless--therefore recognition is not compromised--and
  //   requires only about half the bandwidth of `linear16`.
  //
  // - `mulaw`
  //
  //   8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  //
  // - `amr`
  //
  //   Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  //
  // - `amr-wb`
  //
  //   Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  //
  // - `ogg-opus`
  //
  //   Opus encoded audio frames in an [Ogg](https://wikipedia.org/wiki/Ogg)
  //   container. `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000,
  //   or 48000.
  //
  // - `mp3`
  //
  //   MP3 audio. Supports all standard MP3 bitrates (which range from 32 to
  //   320 kbps). When using this encoding, `sample_rate_hertz` has to match
  //   the sample rate of the file being used.
  string audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Source language code (BCP-47) of the input audio.
  string source_language_code = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. Target language code (BCP-47) of the output.
  string target_language_code = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. Sample rate in Hertz of the audio data. Valid values are
  // 8000-48000; 16000 is optimal. For best results, set the sampling rate of
  // the audio source to 16000 Hz. If that's not possible, use the native
  // sample rate of the audio source (instead of re-sampling).
  int32 sample_rate_hertz = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. `google-provided-model/video` and
  // `google-provided-model/enhanced-phone-call` are premium models.
  // `google-provided-model/phone-call` is not a premium model.
  string model = 5 [(google.api.field_behavior) = OPTIONAL];
}

// Config used for streaming translation.
message StreamingTranslateSpeechConfig {
  // Required. The common config for all the following audio contents.
  TranslateSpeechConfig audio_config = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. If `false` or omitted, the system performs
  // continuous translation (continuing to wait for and process audio even if
  // the user pauses speaking) until the client closes the input stream (gRPC
  // API) or until the maximum time limit has been reached. May return multiple
  // `StreamingTranslateSpeechResult`s with the `is_final` flag set to `true`.
  //
  // If `true`, the speech translator will detect a single spoken utterance.
  // When it detects that the user has paused or stopped speaking, it will
  // return an `END_OF_SINGLE_UTTERANCE` event and cease translation.
  // When the client receives an `END_OF_SINGLE_UTTERANCE` event, it should
  // stop sending requests but keep receiving the remaining responses until
  // the stream is terminated. To construct the complete sentence in a
  // streaming way, override the previous result if its `is_final` was
  // `false`, or append to it if its `is_final` was `true`.
  bool single_utterance = 2 [(google.api.field_behavior) = OPTIONAL];
}
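
// Usage sketch (illustrative, not part of this file): building the two config
// messages above with the assumed protoc-generated Python module
// (media_translation_pb2). The encoding and language codes are example values;
// check the product documentation for supported language pairs.
//
//   import media_translation_pb2  # assumed generated module name
//
//   streaming_config = media_translation_pb2.StreamingTranslateSpeechConfig(
//       audio_config=media_translation_pb2.TranslateSpeechConfig(
//           audio_encoding="linear16",
//           source_language_code="en-US",
//           target_language_code="es-ES",
//           sample_rate_hertz=16000,
//       ),
//       # True: stop after the first detected utterance (END_OF_SINGLE_UTTERANCE).
//       # False/omitted: keep translating until the client half-closes the stream.
//       single_utterance=True,
//   )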

// The top-level message sent by the client for the `StreamingTranslateSpeech`
// method. Multiple `StreamingTranslateSpeechRequest` messages are sent. The
// first message must contain a `streaming_config` message and must not contain
// `audio_content` data. All subsequent messages must contain `audio_content`
// data and must not contain a `streaming_config` message.
message StreamingTranslateSpeechRequest {
  // The streaming request, which is either a streaming config or content.
  oneof streaming_request {
    // Provides information to the recognizer that specifies how to process the
    // request. The first `StreamingTranslateSpeechRequest` message must contain
    // a `streaming_config` message.
    StreamingTranslateSpeechConfig streaming_config = 1;

    // The audio data to be translated. Sequential chunks of audio data are sent
    // in sequential `StreamingTranslateSpeechRequest` messages. The first
    // `StreamingTranslateSpeechRequest` message must not contain
    // `audio_content` data and all subsequent `StreamingTranslateSpeechRequest`
    // messages must contain `audio_content` data. The audio bytes must be
    // encoded as specified in `StreamingTranslateSpeechConfig`. Note: as with
    // all bytes fields, protocol buffers use a pure binary representation
    // (not base64).
    bytes audio_content = 2;
  }
}
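
// Usage sketch (illustrative, not part of this file): a request generator that
// follows the ordering rule above -- one config-only message first, then
// audio-only messages. `streaming_config` and `audio_chunks` are assumed to be
// supplied by the caller; module name as in the earlier sketches.
//
//   import media_translation_pb2  # assumed generated module name
//
//   def request_stream(streaming_config, audio_chunks):
//       """Yields the config first, then one request per raw-bytes audio chunk,
//       already encoded as declared in the config (no base64)."""
//       yield media_translation_pb2.StreamingTranslateSpeechRequest(
//           streaming_config=streaming_config)
//       for chunk in audio_chunks:
//           yield media_translation_pb2.StreamingTranslateSpeechRequest(
//               audio_content=chunk)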

// A streaming speech translation result corresponding to a portion of the audio
// that is currently being processed.
message StreamingTranslateSpeechResult {
  // Text translation result.
  message TextTranslationResult {
    // Output only. The translated sentence.
    string translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. If `false`, this `StreamingTranslateSpeechResult` represents
    // an interim result that may change. If `true`, this is the final time the
    // translation service will return this particular
    // `StreamingTranslateSpeechResult`; the streaming translator will not
    // return any further hypotheses for this portion of the transcript and
    // corresponding audio.
    bool is_final = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Translation result.
  oneof result {
    // Text translation result.
    TextTranslationResult text_translation_result = 1;
  }
}

// A streaming speech translation response corresponding to a portion of
// the audio currently processed.
message StreamingTranslateSpeechResponse {
  // Indicates the type of speech event.
  enum SpeechEventType {
    // No speech event specified.
    SPEECH_EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). When the client receives an `END_OF_SINGLE_UTTERANCE`
    // event, it should stop sending requests but keep receiving the remaining
    // responses until the stream is terminated. To construct the complete
    // sentence in a streaming way, override the previous result if its
    // `is_final` was `false`, or append to it if its `is_final` was `true`.
    // This event is only sent if `single_utterance` was set to `true`, and is
    // not used otherwise.
    END_OF_SINGLE_UTTERANCE = 1;
  }

  // Output only. If set, returns a [google.rpc.Status][google.rpc.Status] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The translation result that is currently being processed
  // (`is_final` could be `true` or `false`).
  StreamingTranslateSpeechResult result = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Indicates the type of speech event.
  SpeechEventType speech_event_type = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}
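
// Usage sketch (illustrative, not part of this file): consuming the response
// stream according to the comments above -- check `error`, then override or
// append depending on `is_final`, and stop sending (but keep receiving) after
// `END_OF_SINGLE_UTTERANCE`. `stub`, `streaming_config`, `audio_chunks`, and
// `request_stream` are the assumed names from the earlier sketches.
//
//   import media_translation_pb2  # assumed generated module name
//
//   committed = ""                      # segments whose is_final was true
//   for response in stub.StreamingTranslateSpeech(
//           request_stream(streaming_config, audio_chunks)):
//       if response.error.code:         # google.rpc.Status; 0 means OK
//           raise RuntimeError(response.error.message)
//       if response.HasField("result"):
//           result = response.result.text_translation_result
//           if result.is_final:
//               committed += result.translation           # append: segment is settled
//           else:
//               display = committed + result.translation  # override: may still change
//       if (response.speech_event_type ==
//               media_translation_pb2.StreamingTranslateSpeechResponse
//               .END_OF_SINGLE_UTTERANCE):
//           # Stop sending further audio here (e.g. signal the generator), but
//           # keep iterating to drain the remaining responses.
//           pass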