video_intelligence.proto

// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.videointelligence.v1p1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P1Beta1";
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1p1beta1;videointelligence";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p1beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p1beta1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1p1beta1";

// Service that implements the Google Cloud Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p1beta1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}
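
// Illustrative only (not part of the original file): a minimal sketch of
// calling the REST mapping above with curl, assuming `$TOKEN` holds an
// OAuth 2.0 access token with the cloud-platform scope and that
// `gs://my-bucket/video.mp4` is a hypothetical input video.
//
//   curl -X POST \
//     -H "Authorization: Bearer $TOKEN" \
//     -H "Content-Type: application/json" \
//     "https://videointelligence.googleapis.com/v1p1beta1/videos:annotate" \
//     -d '{"inputUri": "gs://my-bucket/video.mp4", "features": ["LABEL_DETECTION"]}'
//
// The call returns a `google.longrunning.Operation`, which can be polled
// until `done` is true; see `AnnotateVideoProgress` and
// `AnnotateVideoResponse` below.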

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints). A video URI
  // may include wildcards in `object-id`, and thus identify multiple videos.
  // Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` should be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via `input_uri`.
  // If set, `input_uri` should be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
  // is specified, a region will be determined based on video file location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
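
// Illustrative only (not in the original file): a sketch of the JSON form of
// an `AnnotateVideoRequest`, using the proto3 JSON mapping (field names in
// camelCase, enum values as strings). The bucket and object names are
// hypothetical.
//
//   {
//     "inputUri": "gs://my-bucket/videos/*.mp4",
//     "features": ["LABEL_DETECTION", "SHOT_CHANGE_DETECTION"],
//     "outputUri": "gs://my-bucket/output/results.json",
//     "locationId": "us-east1"
//   }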

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e. non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;
}
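
// Illustrative only (not in the original file): a sketch of a `VideoContext`
// that enables both shot- and frame-level labels for a stationary-camera
// video, written in the proto3 JSON mapping.
//
//   {
//     "labelDetectionConfig": {
//       "labelDetectionMode": "SHOT_AND_FRAME_MODE",
//       "stationaryCamera": true,
//       "model": "builtin/stable"
//     }
//   }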

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}
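
// Illustrative only (not in the original file): in the proto3 JSON mapping,
// `google.protobuf.Duration` is encoded as a string of seconds with an "s"
// suffix, so a segment covering 10.5s through 25s of the video would be
// written as follows (the values themselves are hypothetical):
//
//   { "startTimeOffset": "10.5s", "endTimeOffset": "25s" }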

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g. `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // For example, when the label is `Terrier`, the category is likely `dog`. In
  // some cases there may be more than one category, e.g. `Terrier` could also
  // be a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}
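
// Illustrative only (not in the original file): a sketch of one
// `LabelAnnotation` as it might appear in the JSON output (entity IDs,
// time offsets, and confidences are hypothetical).
//
//   {
//     "entity": { "entityId": "/m/0bt9lr", "description": "dog", "languageCode": "en-US" },
//     "categoryEntities": [
//       { "entityId": "/m/068hy", "description": "pet", "languageCode": "en-US" }
//     ],
//     "segments": [{
//       "segment": { "startTimeOffset": "0s", "endTimeOffset": "14.8s" },
//       "confidence": 0.92
//     }]
//   }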

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of the pornography content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Output only. Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Label annotations on video level or user specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;

  // Output only. If set, indicates an error. Note that for a single
  // `AnnotateVideoRequest` some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Output only. Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Output only. Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Output only. Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Output only. Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}
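
// Illustrative only (not in the original file): a sketch of a
// `google.longrunning.Operation` as it might look while annotation is still
// in progress, with `metadata` carrying an `AnnotateVideoProgress`. The
// operation name, URIs, and timestamps are hypothetical.
//
//   {
//     "name": "projects/my-project/locations/us-east1/operations/123456",
//     "metadata": {
//       "@type": "type.googleapis.com/google.cloud.videointelligence.v1p1beta1.AnnotateVideoProgress",
//       "annotationProgress": [{
//         "inputUri": "gs://my-bucket/video.mp4",
//         "progressPercent": 42,
//         "startTime": "2019-01-01T00:00:00Z",
//         "updateTime": "2019-01-01T00:01:30Z"
//       }]
//     },
//     "done": false
//   }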

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // Required. The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
  // return a maximum of one. If omitted, will return a maximum of one.
  int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default 'false' value
  // does not add punctuation to result hypotheses. NOTE: "This is currently
  // offered as an experimental service, complimentary to all users. In the
  // future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. For file formats, such as MXF or MKV, supporting multiple audio
  // tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];
}
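
// Illustrative only (not in the original file): a sketch of a `VideoContext`
// carrying a speech transcription config, in the proto3 JSON mapping. The
// phrase hints are hypothetical.
//
//   {
//     "speechTranscriptionConfig": {
//       "languageCode": "en-US",
//       "maxAlternatives": 2,
//       "enableAutomaticPunctuation": true,
//       "speechContexts": [{ "phrases": ["Cloud Video Intelligence"] }]
//     }
//   }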

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // Optional. A list of strings containing words and phrases "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum specified
  // in `max_alternatives`). These alternatives are ordered in terms of
  // accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Output only. Transcript text representing the words that the user spoke.
  string transcript = 1;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A list of word-specific information for each recognized word.
  repeated WordInfo words = 3;
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Output only. Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Output only. Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // Output only. The word corresponding to this set of information.
  string word = 3;
}
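
// Illustrative only (not in the original file): a sketch of one
// `SpeechTranscription` in the JSON output, with a single alternative and
// word-level timing (all values hypothetical).
//
//   {
//     "alternatives": [{
//       "transcript": "welcome to the demo",
//       "confidence": 0.87,
//       "words": [
//         { "startTime": "1.2s", "endTime": "1.5s", "word": "welcome" },
//         { "startTime": "1.5s", "endTime": "1.6s", "word": "to" }
//       ]
//     }]
//   }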

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}