// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.videointelligence.v1p3beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P3Beta1";
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1p3beta1;videointelligence";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p3beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p3beta1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1p3beta1";

// Service that implements the Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p3beta1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}

// Service that implements the streaming Video Intelligence API.
service StreamingVideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs video annotation with bidirectional streaming: emitting results
  // while sending video/audio bytes.
  // This method is only available via the gRPC API (not REST).
  rpc StreamingAnnotateVideo(stream StreamingAnnotateVideoRequest)
      returns (stream StreamingAnnotateVideoResponse) {}
}

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported. URIs must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints). To identify
  // multiple videos, a video URI may include wildcards in the `object-id`.
  // Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` must be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via the `input_uri`.
  // If set, `input_uri` must be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported. These must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions are: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no
  // region is specified, the region will be determined based on video file
  // location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
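
// Example (illustrative only): an `AnnotateVideoRequest` in protocol buffer
// text format. The bucket and object names are hypothetical; only one of
// `input_uri` and `input_content` may be set, and `features` is required.
//
//   input_uri: "gs://my-bucket/my-video.mp4"
//   features: LABEL_DETECTION
//   features: SHOT_CHANGE_DETECTION
//   output_uri: "gs://my-bucket/results/my-video.json"
//   location_id: "us-east1"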

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for FACE_DETECTION.
  FaceDetectionConfig face_detection_config = 5;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;

  // Config for TEXT_DETECTION.
  TextDetectionConfig text_detection_config = 8;

  // Config for PERSON_DETECTION.
  PersonDetectionConfig person_detection_config = 11;

  // Config for OBJECT_TRACKING.
  ObjectTrackingConfig object_tracking_config = 13;
}
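
// Example (illustrative only): a `VideoContext` in text format that restricts
// annotation to a single segment and tunes label detection. The time range
// and settings shown are arbitrary examples.
//
//   segments {
//     start_time_offset { seconds: 0 }
//     end_time_offset { seconds: 60 }
//   }
//   label_detection_config {
//     label_detection_mode: SHOT_AND_FRAME_MODE
//     stationary_camera: true
//     model: "builtin/stable"
//   }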

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e., non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;

  // The confidence threshold used to filter labels from frame-level
  // detection. If not set, it defaults to 0.4. The valid range for this
  // threshold is [0.1, 0.9]; any value outside this range is clipped.
  // Note: For best results, keep the default threshold. The default may be
  // updated every time a new model is released.
  float frame_confidence_threshold = 4;

  // The confidence threshold used to filter labels from video-level and
  // shot-level detections. If not set, it defaults to 0.3. The valid range
  // for this threshold is [0.1, 0.9]; any value outside this range is
  // clipped.
  // Note: For best results, keep the default threshold. The default may be
  // updated every time a new model is released.
  float video_confidence_threshold = 5;
}
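
// Example (illustrative only): a `LabelDetectionConfig` that tightens
// frame-level filtering. The threshold values are arbitrary examples within
// the documented [0.1, 0.9] range; values outside that range are clipped.
//
//   label_detection_mode: FRAME_MODE
//   frame_confidence_threshold: 0.6
//   video_confidence_threshold: 0.3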

// Streaming video annotation feature.
enum StreamingFeature {
  // Unspecified.
  STREAMING_FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  STREAMING_LABEL_DETECTION = 1;

  // Shot change detection.
  STREAMING_SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  STREAMING_EXPLICIT_CONTENT_DETECTION = 3;

  // Object detection and tracking.
  STREAMING_OBJECT_TRACKING = 4;

  // Action recognition based on AutoML model.
  STREAMING_AUTOML_ACTION_RECOGNITION = 23;

  // Video classification based on AutoML model.
  STREAMING_AUTOML_CLASSIFICATION = 21;

  // Object detection and tracking based on AutoML model.
  STREAMING_AUTOML_OBJECT_TRACKING = 22;
}

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Human face detection.
  FACE_DETECTION = 4;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;

  // OCR text detection and tracking.
  TEXT_DETECTION = 7;

  // Object detection and tracking.
  OBJECT_TRACKING = 9;

  // Logo detection, tracking, and recognition.
  LOGO_RECOGNITION = 12;

  // Celebrity recognition.
  CELEBRITY_RECOGNITION = 13;

  // Person detection.
  PERSON_DETECTION = 14;
}

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for OBJECT_TRACKING.
message ObjectTrackingConfig {
  // Model to use for object tracking.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for FACE_DETECTION.
message FaceDetectionConfig {
  // Model to use for face detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;

  // Whether bounding boxes are included in the face annotation output.
  bool include_bounding_boxes = 2;

  // Whether to enable face attribute detection, such as glasses,
  // dark_glasses, mouth_open, etc. Ignored if 'include_bounding_boxes' is
  // set to false.
  bool include_attributes = 5;
}
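
// Example (illustrative only): a `FaceDetectionConfig` in text format. Per
// the field comments above, `include_attributes` only takes effect when
// `include_bounding_boxes` is true.
//
//   model: "builtin/stable"
//   include_bounding_boxes: true
//   include_attributes: true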

// Config for PERSON_DETECTION.
message PersonDetectionConfig {
  // Whether bounding boxes are included in the person detection annotation
  // output.
  bool include_bounding_boxes = 1;

  // Whether to enable pose landmark detection. Ignored if
  // 'include_bounding_boxes' is set to false.
  bool include_pose_landmarks = 2;

  // Whether to enable person attribute detection, such as clothing color
  // (black, blue, etc.), type (coat, dress, etc.), pattern (plain, floral,
  // etc.), hair, etc.
  // Ignored if 'include_bounding_boxes' is set to false.
  bool include_attributes = 3;
}

// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // Language hints can be specified if the language to be detected is known
  // a priori; they can increase detection accuracy. Each language hint must
  // be a language code in BCP-47 format.
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;

  // Model to use for text detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 2;
}
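
// Example (illustrative only): a `TextDetectionConfig` with language hints.
// The BCP-47 codes shown are arbitrary examples.
//
//   language_hints: "en-US"
//   language_hints: "fr-FR"
//   model: "builtin/stable"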

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}
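
// Example (illustrative only): a `VideoSegment` covering 12.5s through 45s of
// a video, with the `google.protobuf.Duration` offsets written in text format.
//
//   start_time_offset { seconds: 12 nanos: 500000000 }
//   end_time_offset { seconds: 45 }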

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g., `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // For example, when the label is `Terrier`, the category is likely `dog`.
  // In some cases there might be more than one category; e.g., `Terrier`
  // could also be a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of pornographic content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;

  // Top Y coordinate.
  float top = 2;

  // Right X coordinate.
  float right = 3;

  // Bottom Y coordinate.
  float bottom = 4;
}

// For tracking-related features.
// An object at time_offset with attributes, and located with
// normalized_bounding_box.
message TimestampedObject {
  // Normalized bounding box in a frame, where the object is located.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this object.
  google.protobuf.Duration time_offset = 2;

  // Optional. The attributes of the object in the bounding box.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The detected landmarks.
  repeated DetectedLandmark landmarks = 4
      [(google.api.field_behavior) = OPTIONAL];
}

// A track of an object instance.
message Track {
  // Video segment of a track.
  VideoSegment segment = 1;

  // The object with timestamp and attributes per frame in the track.
  repeated TimestampedObject timestamped_objects = 2;

  // Optional. Attributes at the track level.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The confidence score of the tracked object.
  float confidence = 4 [(google.api.field_behavior) = OPTIONAL];
}

// A generic detected attribute represented by name in string format.
message DetectedAttribute {
  // The name of the attribute, for example, glasses, dark_glasses, mouth_open.
  // A full list of supported type names will be provided in the documentation.
  string name = 1;

  // Detected attribute confidence. Range [0, 1].
  float confidence = 2;

  // Text value of the detection result. For example, the value for "HairColor"
  // can be "black", "blonde", etc.
  string value = 3;
}

// Celebrity definition.
message Celebrity {
  // The resource name of the celebrity. It has the format
  // `video-intelligence/kg-mid`, indicating a celebrity from the preloaded
  // gallery. `kg-mid` is the celebrity's unique ID in the Google Knowledge
  // Graph.
  string name = 1;

  // The celebrity name.
  string display_name = 2;

  // Textual description of additional information about the celebrity, if
  // applicable.
  string description = 3;
}

// The annotation result of a celebrity face track. The RecognizedCelebrity
// field could be empty if the face track does not have any matched
// celebrities.
message CelebrityTrack {
  // The recognized celebrity with confidence score.
  message RecognizedCelebrity {
    // The recognized celebrity.
    Celebrity celebrity = 1;

    // Recognition confidence. Range [0, 1].
    float confidence = 2;
  }

  // Top N match of the celebrities for the face in this track.
  repeated RecognizedCelebrity celebrities = 1;

  // A track of a person's face.
  Track face_track = 3;
}

// Celebrity recognition annotation per video.
message CelebrityRecognitionAnnotation {
  // The tracks detected from the input video, including recognized celebrities
  // and other detected faces in the video.
  repeated CelebrityTrack celebrity_tracks = 1;
}

// A generic detected landmark represented by name in string format and a 2D
// location.
message DetectedLandmark {
  // The name of this landmark, for example, left_hand, right_shoulder.
  string name = 1;

  // The 2D point of the detected landmark using the normalized image
  // coordinate system. The normalized coordinates have the range from 0 to 1.
  NormalizedVertex point = 2;

  // The confidence score of the detected landmark. Range [0, 1].
  float confidence = 3;
}

// Face detection annotation.
message FaceDetectionAnnotation {
  // The face tracks with attributes.
  repeated Track tracks = 3;

  // The thumbnail of a person's face.
  bytes thumbnail = 4;
}

// Person detection annotation per video.
message PersonDetectionAnnotation {
  // The detected tracks of a person.
  repeated Track tracks = 1;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Video segment on which the annotation is run.
  VideoSegment segment = 10;

  // Topical label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Presence label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label. Compared to the
  // existing topical `segment_label_annotations`, this field presents more
  // fine-grained, segment-level labels detected in video content and is made
  // available only when the client sets `LabelDetectionConfig.model` to
  // "builtin/latest" in the request.
  repeated LabelAnnotation segment_presence_label_annotations = 23;

  // Topical label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Presence label annotations on shot level. There is exactly one element for
  // each unique label. Compared to the existing topical
  // `shot_label_annotations`, this field presents more fine-grained, shot-level
  // labels detected in video content and is made available only when the client
  // sets `LabelDetectionConfig.model` to "builtin/latest" in the request.
  repeated LabelAnnotation shot_presence_label_annotations = 24;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Face detection annotations.
  repeated FaceDetectionAnnotation face_detection_annotations = 13;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;

  // OCR text detection and tracking.
  // Annotations for the list of detected text snippets. Each snippet has a
  // list of associated frame information.
  repeated TextAnnotation text_annotations = 12;

  // Annotations for the list of objects detected and tracked in the video.
  repeated ObjectTrackingAnnotation object_annotations = 14;

  // Annotations for the list of logos detected, tracked, and recognized in
  // the video.
  repeated LogoRecognitionAnnotation logo_recognition_annotations = 19;

  // Person detection annotations.
  repeated PersonDetectionAnnotation person_detection_annotations = 20;

  // Celebrity recognition annotations.
  CelebrityRecognitionAnnotation celebrity_recognition_annotations = 21;

  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;

  // Specifies which feature is being tracked if the request contains more than
  // one feature.
  Feature feature = 5;

  // Specifies which segment is being tracked if the request contains more than
  // one segment.
  VideoSegment segment = 6;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // Required. The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
  // return a maximum of one. If omitted, will return a maximum of one.
  int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default 'false' value
  // does not add punctuation to result hypotheses. NOTE: "This is currently
  // offered as an experimental service, complimentary to all users. In the
  // future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. For file formats, such as MXF or MKV, that support multiple
  // audio tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', enables speaker detection for each recognized word in
  // the top alternative of the recognition result using a speaker_tag provided
  // in the WordInfo.
  // Note: When this is true, we send all the words from the beginning of the
  // audio for the top alternative in every consecutive response.
  // This is done in order to improve our speaker tags as our models learn to
  // identify the speakers in the conversation over time.
  bool enable_speaker_diarization = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set, specifies the estimated number of speakers in the
  // conversation. If not set, defaults to '2'. Ignored unless
  // enable_speaker_diarization is set to true.
  int32 diarization_speaker_count = 8 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If `true`, the top result includes a list of words and the
  // confidence for those words. If `false`, no word-level confidence
  // information is returned. The default is `false`.
  bool enable_word_confidence = 9 [(google.api.field_behavior) = OPTIONAL];
}
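
// Example (illustrative only): a `SpeechTranscriptionConfig` enabling
// diarization and word-level confidence. Only `language_code` is required;
// the other values, including the phrase hint, are arbitrary examples.
//
//   language_code: "en-US"
//   max_alternatives: 2
//   enable_automatic_punctuation: true
//   enable_speaker_diarization: true
//   diarization_speaker_count: 2
//   enable_word_confidence: true
//   speech_contexts { phrases: "Video Intelligence" }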

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // Optional. A list of strings containing word and phrase "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum specified
  // in `max_alternatives`). These alternatives are ordered in terms of
  // accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;

  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
  // language tag of the language in this result. This language code was
  // detected to have the most likelihood of being spoken in the audio.
  string language_code = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Transcript text representing the words that the user spoke.
  string transcript = 1;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A list of word-specific information for each recognized word.
  // Note: When `enable_speaker_diarization` is set to true, you will see all
  // the words from the beginning of the audio.
  repeated WordInfo words = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // The word corresponding to this set of information.
  string word = 3;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 4 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A distinct integer value is assigned for every speaker within
  // the audio. This field specifies which one of those speakers was detected to
  // have spoken this word. Values range from 1 up to diarization_speaker_count,
  // and are only set if speaker diarization is enabled.
  int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;

  // Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text (that might not be aligned with the
// axes). Contains the list of corner points in clockwise order starting from
// the top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's rotated 180 degrees clockwise around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be
// less than 0, or greater than 1, due to trigonometric calculations for the
// location of the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}

// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text. It is calculated as the highest
  // confidence over all frames where OCR detected text appears.
  float confidence = 2;

  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}

// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
  // The detected text.
  string text = 1;

  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;
}

// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // The timestamp of the frame in microseconds.
  google.protobuf.Duration time_offset = 2;
}

// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Different representation of tracking info in non-streaming batch
  // and streaming modes.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a unique identifiable integer track_id so that
    // the customers can correlate the results of the ongoing
    // ObjectTrackingAnnotation of the same track_id over time.
    int64 track_id = 5;
  }

  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;

  // Object category's labeling confidence of this track.
  float confidence = 4;

  // Information corresponding to all frames where this object track appears.
  // Non-streaming batch mode: it may be one or multiple ObjectTrackingFrame
  // messages in frames.
  // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
  repeated ObjectTrackingFrame frames = 2;
}

// Annotation corresponding to one detected, tracked, and recognized logo class.
message LogoRecognitionAnnotation {
  // Entity category information to specify the logo class that all the logo
  // tracks within this LogoRecognitionAnnotation are recognized as.
  Entity entity = 1;

  // All logo tracks where the recognized logo appears. Each track corresponds
  // to one logo instance appearing in consecutive frames.
  repeated Track tracks = 2;

  // All video segments where the recognized logo appears. There might be
  // multiple instances of the same logo class appearing in one VideoSegment.
  repeated VideoSegment segments = 3;
}

// The top-level message sent by the client for the `StreamingAnnotateVideo`
// method. Multiple `StreamingAnnotateVideoRequest` messages are sent.
// The first message must only contain a `StreamingVideoConfig` message.
// All subsequent messages must only contain `input_content` data.
message StreamingAnnotateVideoRequest {
  // *Required* The streaming request, which is either a streaming config or
  // video content.
  oneof streaming_request {
    // Provides information to the annotator, specifying how to process the
    // request. The first `StreamingAnnotateVideoRequest` message must only
    // contain a `video_config` message.
    StreamingVideoConfig video_config = 1;

    // The video data to be annotated. Chunks of video data are sequentially
    // sent in `StreamingAnnotateVideoRequest` messages. Except for the initial
    // `StreamingAnnotateVideoRequest` message containing only
    // `video_config`, all subsequent `StreamingAnnotateVideoRequest`
    // messages must only contain the `input_content` field.
    // Note: as with all bytes fields, protocol buffers use a pure binary
    // representation (not base64).
    bytes input_content = 2;
  }
}
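
// Example (illustrative only): the request sequence for
// `StreamingAnnotateVideo`, sketched in text format. The first message
// carries only `video_config`; each later message carries only a chunk of
// raw video bytes (shown here as a placeholder).
//
//   # message 1
//   video_config {
//     feature: STREAMING_LABEL_DETECTION
//     label_detection_config { stationary_camera: false }
//   }
//
//   # messages 2..N
//   input_content: "<binary video chunk>"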

// Provides information to the annotator that specifies how to process the
// request.
message StreamingVideoConfig {
  // Config for requested annotation feature.
  oneof streaming_config {
    // Config for STREAMING_SHOT_CHANGE_DETECTION.
    StreamingShotChangeDetectionConfig shot_change_detection_config = 2;

    // Config for STREAMING_LABEL_DETECTION.
    StreamingLabelDetectionConfig label_detection_config = 3;

    // Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
    StreamingExplicitContentDetectionConfig explicit_content_detection_config =
        4;

    // Config for STREAMING_OBJECT_TRACKING.
    StreamingObjectTrackingConfig object_tracking_config = 5;

    // Config for STREAMING_AUTOML_ACTION_RECOGNITION.
    StreamingAutomlActionRecognitionConfig automl_action_recognition_config =
        23;

    // Config for STREAMING_AUTOML_CLASSIFICATION.
    StreamingAutomlClassificationConfig automl_classification_config = 21;

    // Config for STREAMING_AUTOML_OBJECT_TRACKING.
    StreamingAutomlObjectTrackingConfig automl_object_tracking_config = 22;
  }

  // Requested annotation feature.
  StreamingFeature feature = 1;

  // Streaming storage option. By default: storage is disabled.
  StreamingStorageConfig storage_config = 30;
}
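
// Example (illustrative only): a `StreamingVideoConfig` pairing the AutoML
// classification feature with its config. The project number, location, and
// model ID are placeholders.
//
//   feature: STREAMING_AUTOML_CLASSIFICATION
//   automl_classification_config {
//     model_name: "projects/123456789/locations/us-central1/models/abc123"
//   }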

// `StreamingAnnotateVideoResponse` is the only message returned to the client
// by `StreamingAnnotateVideo`. A series of zero or more
// `StreamingAnnotateVideoResponse` messages are streamed back to the client.
message StreamingAnnotateVideoResponse {
  // If set, returns a [google.rpc.Status][google.rpc.Status] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // Streaming annotation results.
  StreamingVideoAnnotationResults annotation_results = 2;

  // Google Cloud Storage (GCS) URI that stores annotation results of one
  // streaming session in JSON format.
  // It is the annotation_result_storage_directory
  // from the request followed by '/cloud_project_number-session_id'.
  string annotation_results_uri = 3;
}

// Streaming annotation results corresponding to a portion of the video
// that is currently being processed.
message StreamingVideoAnnotationResults {
  // Shot annotation results. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 1;

  // Label annotation results.
  repeated LabelAnnotation label_annotations = 2;

  // Explicit content annotation results.
  ExplicitContentAnnotation explicit_annotation = 3;

  // Object tracking results.
  repeated ObjectTrackingAnnotation object_annotations = 4;
}

// Config for STREAMING_SHOT_CHANGE_DETECTION.
message StreamingShotChangeDetectionConfig {}

// Config for STREAMING_LABEL_DETECTION.
message StreamingLabelDetectionConfig {
  // Whether the video has been captured from a stationary (i.e. non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Default: false.
  bool stationary_camera = 1;
}

// Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
message StreamingExplicitContentDetectionConfig {}

// Config for STREAMING_OBJECT_TRACKING.
message StreamingObjectTrackingConfig {}

// Config for STREAMING_AUTOML_ACTION_RECOGNITION.
message StreamingAutomlActionRecognitionConfig {
  // Resource name of AutoML model.
  // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for STREAMING_AUTOML_CLASSIFICATION.
message StreamingAutomlClassificationConfig {
  // Resource name of AutoML model.
  // Format:
  // `projects/{project_number}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for STREAMING_AUTOML_OBJECT_TRACKING.
message StreamingAutomlObjectTrackingConfig {
  // Resource name of AutoML model.
  // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for streaming storage option.
message StreamingStorageConfig {
  // Enable streaming storage. Default: false.
  bool enable_storage_annotation_result = 1;

  // Cloud Storage URI to store all annotation results for one client. The
  // client should specify this field as the top-level storage directory.
  // Annotation results of different sessions are written to different
  // sub-directories denoted by project_name and session_id. All
  // sub-directories are generated automatically and made accessible to the
  // client in the response proto. URIs must be specified in the following
  // format: `gs://bucket-id/object-id`, where `bucket-id` is a valid Cloud
  // Storage bucket created by the client with its permissions configured
  // properly, and `object-id` can be an arbitrary string that makes sense to
  // the client. Other URI formats will return an error and cause a Cloud
  // Storage write failure.
  string annotation_result_storage_directory = 3;
}
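
// Example (illustrative only): a `StreamingStorageConfig` in text format. The
// Cloud Storage directory is hypothetical and must point at a bucket the
// caller controls, as described in the field comments above.
//
//   enable_storage_annotation_result: true
//   annotation_result_storage_directory: "gs://my-bucket/annotation-results"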