// video_intelligence.proto

// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.videointelligence.v1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1";
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1;videointelligence";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1";

// Service that implements the Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}
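
// Illustrative example (not part of the API definition): given the
// `google.api.http` binding above, the same RPC can be invoked over REST with
// a POST to the annotate endpoint. The bucket and object names below are
// hypothetical placeholders, and the JSON field names follow the standard
// proto3 JSON mapping (lowerCamelCase):
//
//   POST https://videointelligence.googleapis.com/v1/videos:annotate
//   {
//     "inputUri": "gs://example-bucket/example-video.mp4",
//     "features": ["LABEL_DETECTION", "SHOT_CHANGE_DETECTION"]
//   }
//
// The call returns a long-running `Operation`; poll it through the
// `google.longrunning.Operations` interface to obtain the
// `AnnotateVideoResponse`.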

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported. URIs must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints). To identify
  // multiple videos, a video URI may include wildcards in the `object-id`.
  // Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` must be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via the `input_uri`.
  // If set, `input_uri` must be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported. These must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions are: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no
  // region is specified, the region will be determined based on video file
  // location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
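
// Illustrative example (not part of the API definition): a minimal request in
// protobuf text format, assuming a hypothetical Cloud Storage bucket. Exactly
// one of `input_uri` and `input_content` is set:
//
//   input_uri: "gs://example-bucket/example-video.mp4"
//   features: LABEL_DETECTION
//   features: SPEECH_TRANSCRIPTION
//   output_uri: "gs://example-bucket/annotations/example-video.json"
//   location_id: "us-east1"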

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for FACE_DETECTION.
  FaceDetectionConfig face_detection_config = 5;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;

  // Config for TEXT_DETECTION.
  TextDetectionConfig text_detection_config = 8;

  // Config for PERSON_DETECTION.
  PersonDetectionConfig person_detection_config = 11;

  // Config for OBJECT_TRACKING.
  ObjectTrackingConfig object_tracking_config = 13;
}
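
// Illustrative example (not part of the API definition): a `VideoContext`
// restricting annotation to the first minute of the video and tuning label
// detection, in protobuf text format (values are hypothetical):
//
//   segments {
//     start_time_offset { seconds: 0 }
//     end_time_offset { seconds: 60 }
//   }
//   label_detection_config {
//     label_detection_mode: SHOT_AND_FRAME_MODE
//     model: "builtin/latest"
//   }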

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Human face detection.
  FACE_DETECTION = 4;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;

  // OCR text detection and tracking.
  TEXT_DETECTION = 7;

  // Object detection and tracking.
  OBJECT_TRACKING = 9;

  // Logo detection, tracking, and recognition.
  LOGO_RECOGNITION = 12;

  // Person detection.
  PERSON_DETECTION = 14;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e., non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;

  // The confidence threshold used to filter labels from frame-level
  // detection. If not set, it is set to 0.4 by default. The valid range for
  // this threshold is [0.1, 0.9]. Any value set outside of this range will be
  // clipped.
  // Note: For best results, use the default threshold. We will update the
  // default threshold every time we release a new model.
  float frame_confidence_threshold = 4;

  // The confidence threshold used to filter labels from video-level and
  // shot-level detections. If not set, it is set to 0.3 by default. The valid
  // range for this threshold is [0.1, 0.9]. Any value set outside of this
  // range will be clipped.
  // Note: For best results, use the default threshold. We will update the
  // default threshold every time we release a new model.
  float video_confidence_threshold = 5;
}
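
// Illustrative example (not part of the API definition): a label detection
// config for footage from a fixed camera with a stricter frame-level
// threshold, in protobuf text format. A value outside [0.1, 0.9], e.g. 0.95,
// would be clipped to 0.9:
//
//   label_detection_mode: SHOT_AND_FRAME_MODE
//   stationary_camera: true
//   frame_confidence_threshold: 0.6
//   video_confidence_threshold: 0.3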

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for OBJECT_TRACKING.
message ObjectTrackingConfig {
  // Model to use for object tracking.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for FACE_DETECTION.
message FaceDetectionConfig {
  // Model to use for face detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;

  // Whether bounding boxes are included in the face annotation output.
  bool include_bounding_boxes = 2;

  // Whether to enable face attributes detection, such as glasses,
  // dark_glasses, mouth_open, etc. Ignored if 'include_bounding_boxes' is set
  // to false.
  bool include_attributes = 5;
}

// Config for PERSON_DETECTION.
message PersonDetectionConfig {
  // Whether bounding boxes are included in the person detection annotation
  // output.
  bool include_bounding_boxes = 1;

  // Whether to enable pose landmarks detection. Ignored if
  // 'include_bounding_boxes' is set to false.
  bool include_pose_landmarks = 2;

  // Whether to enable person attributes detection, such as cloth color (black,
  // blue, etc), type (coat, dress, etc), pattern (plain, floral, etc), hair,
  // etc.
  // Ignored if 'include_bounding_boxes' is set to false.
  bool include_attributes = 3;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // Language hints can be specified if the language to be detected is known a
  // priori. They can increase the accuracy of the detection. Each language
  // hint must be a language code in BCP-47 format.
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;

  // Model to use for text detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 2;
}
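
// Illustrative example (not part of the API definition): a text detection
// config hinting at English and Japanese text, in protobuf text format:
//
//   language_hints: "en"
//   language_hints: "ja"
//   model: "builtin/latest"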

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}
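
// Illustrative example (not part of the API definition): a segment covering
// 00:00:15.500 through 00:01:30 of the video, in protobuf text format
// (`google.protobuf.Duration` carries seconds and nanos):
//
//   start_time_offset { seconds: 15 nanos: 500000000 }
//   end_time_offset { seconds: 90 }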

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g., `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // For example, when the label is `Terrier`, the category is likely `dog`.
  // In some cases there might be more than one category, e.g., `Terrier` could
  // also be a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;

  // Feature version.
  string version = 5;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of pornographic content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;

  // Feature version.
  string version = 2;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;

  // Top Y coordinate.
  float top = 2;

  // Right X coordinate.
  float right = 3;

  // Bottom Y coordinate.
  float bottom = 4;
}
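
// Illustrative note (not part of the API definition): because the coordinates
// are normalized to [0, 1], converting a box back to pixels multiplies by the
// original frame dimensions. For a hypothetical 1920x1080 frame:
//
//   left_px   = left   * 1920
//   top_px    = top    * 1080
//   right_px  = right  * 1920
//   bottom_px = bottom * 1080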

// Face detection annotation.
message FaceDetectionAnnotation {
  // The face tracks with attributes.
  repeated Track tracks = 3;

  // The thumbnail of a person's face.
  bytes thumbnail = 4;

  // Feature version.
  string version = 5;
}

// Person detection annotation per video.
message PersonDetectionAnnotation {
  // The detected tracks of a person.
  repeated Track tracks = 1;

  // Feature version.
  string version = 2;
}

// Video segment level annotation results for face detection.
message FaceSegment {
  // Video segment where a face was detected.
  VideoSegment segment = 1;
}

// Deprecated. No effect.
message FaceFrame {
  option deprecated = true;

  // Normalized bounding boxes in a frame.
  // There can be more than one box if the same face is detected in multiple
  // locations within the current frame.
  repeated NormalizedBoundingBox normalized_bounding_boxes = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this location.
  google.protobuf.Duration time_offset = 2;
}

// Deprecated. No effect.
message FaceAnnotation {
  option deprecated = true;

  // Thumbnail of a representative face view (in JPEG format).
  bytes thumbnail = 1;

  // All video segments where a face was detected.
  repeated FaceSegment segments = 2;

  // All video frames where a face was detected.
  repeated FaceFrame frames = 3;
}

// For tracking related features.
// An object at time_offset with attributes, and located with
// normalized_bounding_box.
message TimestampedObject {
  // Normalized bounding box in a frame, where the object is located.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this object.
  google.protobuf.Duration time_offset = 2;

  // Optional. The attributes of the object in the bounding box.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The detected landmarks.
  repeated DetectedLandmark landmarks = 4
      [(google.api.field_behavior) = OPTIONAL];
}

// A track of an object instance.
message Track {
  // Video segment of a track.
  VideoSegment segment = 1;

  // The object with timestamp and attributes per frame in the track.
  repeated TimestampedObject timestamped_objects = 2;

  // Optional. Attributes at the track level.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The confidence score of the tracked object.
  float confidence = 4 [(google.api.field_behavior) = OPTIONAL];
}

// A generic detected attribute represented by name in string format.
message DetectedAttribute {
  // The name of the attribute, for example, glasses, dark_glasses, mouth_open.
  // A full list of supported type names will be provided in the document.
  string name = 1;

  // Detected attribute confidence. Range [0, 1].
  float confidence = 2;

  // Text value of the detection result. For example, the value for "HairColor"
  // can be "black", "blonde", etc.
  string value = 3;
}

// A generic detected landmark represented by name in string format and a 2D
// location.
message DetectedLandmark {
  // The name of this landmark, for example, left_hand, right_shoulder.
  string name = 1;

  // The 2D point of the detected landmark using the normalized image
  // coordinate system. The normalized coordinates have the range from 0 to 1.
  NormalizedVertex point = 2;

  // The confidence score of the detected landmark. Range [0, 1].
  float confidence = 3;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Video segment on which the annotation is run.
  VideoSegment segment = 10;

  // Topical label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Presence label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label. Compared to the
  // existing topical `segment_label_annotations`, this field presents more
  // fine-grained, segment-level labels detected in video content and is made
  // available only when the client sets `LabelDetectionConfig.model` to
  // "builtin/latest" in the request.
  repeated LabelAnnotation segment_presence_label_annotations = 23;

  // Topical label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Presence label annotations on shot level. There is exactly one element for
  // each unique label. Compared to the existing topical
  // `shot_label_annotations`, this field presents more fine-grained, shot-level
  // labels detected in video content and is made available only when the client
  // sets `LabelDetectionConfig.model` to "builtin/latest" in the request.
  repeated LabelAnnotation shot_presence_label_annotations = 24;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Deprecated. Please use `face_detection_annotations` instead.
  repeated FaceAnnotation face_annotations = 5 [deprecated = true];

  // Face detection annotations.
  repeated FaceDetectionAnnotation face_detection_annotations = 13;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;

  // OCR text detection and tracking.
  // Annotations for a list of detected text snippets. Each will have a list of
  // frame information associated with it.
  repeated TextAnnotation text_annotations = 12;

  // Annotations for a list of objects detected and tracked in the video.
  repeated ObjectTrackingAnnotation object_annotations = 14;

  // Annotations for a list of logos detected, tracked, and recognized in the
  // video.
  repeated LogoRecognitionAnnotation logo_recognition_annotations = 19;

  // Person detection annotations.
  repeated PersonDetectionAnnotation person_detection_annotations = 20;

  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;

  // Specifies which feature is being tracked if the request contains more than
  // one feature.
  Feature feature = 5;

  // Specifies which segment is being tracked if the request contains more than
  // one segment.
  VideoSegment segment = 6;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // Required. The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
  // return a maximum of one. If omitted, will return a maximum of one.
  int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default 'false' value
  // does not add punctuation to result hypotheses. NOTE: "This is currently
  // offered as an experimental service, complimentary to all users. In the
  // future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. For file formats, such as MXF or MKV, supporting multiple audio
  // tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', enables speaker detection for each recognized word in
  // the top alternative of the recognition result using a speaker_tag provided
  // in the WordInfo.
  // Note: When this is true, we send all the words from the beginning of the
  // audio for the top alternative in every consecutive response.
  // This is done in order to improve our speaker tags as our models learn to
  // identify the speakers in the conversation over time.
  bool enable_speaker_diarization = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set, specifies the estimated number of speakers in the
  // conversation. If not set, defaults to '2'. Ignored unless
  // enable_speaker_diarization is set to true.
  int32 diarization_speaker_count = 8 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If `true`, the top result includes a list of words and the
  // confidence for those words. If `false`, no word-level confidence
  // information is returned. The default is `false`.
  bool enable_word_confidence = 9 [(google.api.field_behavior) = OPTIONAL];
}
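
// Illustrative example (not part of the API definition): a speech
// transcription config with diarization and a phrase hint, in protobuf text
// format (values are hypothetical):
//
//   language_code: "en-US"
//   enable_automatic_punctuation: true
//   enable_speaker_diarization: true
//   diarization_speaker_count: 2
//   speech_contexts {
//     phrases: "Cloud Video Intelligence"
//   }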

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // Optional. A list of strings containing words and phrases "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum specified
  // in `max_alternatives`). These alternatives are ordered in terms of
  // accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;

  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
  // language tag of the language in this result. This language code was
  // detected as the most likely language spoken in the audio.
  string language_code = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Transcript text representing the words that the user spoke.
  string transcript = 1;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A list of word-specific information for each recognized word.
  // Note: When `enable_speaker_diarization` is set to true, you will see all
  // the words from the beginning of the audio.
  repeated WordInfo words = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // The word corresponding to this set of information.
  string word = 3;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 4 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A distinct integer value is assigned for every speaker within
  // the audio. This field specifies which one of those speakers was detected to
  // have spoken this word. Value ranges from 1 up to diarization_speaker_count,
  // and is only set if speaker diarization is enabled.
  int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}
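
// Illustrative example (not part of the API definition): a single `WordInfo`
// as it might appear in a diarized result, in protobuf text format (values
// are hypothetical):
//
//   start_time { seconds: 1 nanos: 200000000 }
//   end_time { seconds: 1 nanos: 700000000 }
//   word: "hello"
//   confidence: 0.92
//   speaker_tag: 1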

// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;

  // Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains list of the corner points in clockwise order starting from
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's clockwise rotated 180 degrees around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be less
// than 0, or greater than 1 due to trigonometric calculations for location of
// the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}

// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text. It is calculated as the highest
  // over all frames where OCR detected text appears.
  float confidence = 2;

  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}

// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
  // The detected text.
  string text = 1;

  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;

  // Feature version.
  string version = 3;
}

// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // The timestamp of the frame in microseconds.
  google.protobuf.Duration time_offset = 2;
}

// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Different representation of tracking info in non-streaming batch
  // and streaming modes.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a unique identifiable integer track_id so that
    // the customers can correlate the results of the ongoing
    // ObjectTrackingAnnotation of the same track_id over time.
    int64 track_id = 5;
  }

  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;

  // Object category's labeling confidence of this track.
  float confidence = 4;

  // Information corresponding to all frames where this object track appears.
  // Non-streaming batch mode: it may be one or multiple ObjectTrackingFrame
  // messages in frames.
  // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
  repeated ObjectTrackingFrame frames = 2;

  // Feature version.
  string version = 6;
}
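
// Illustrative example (not part of the API definition): the two shapes the
// `track_info` oneof can take, in protobuf text format (values are
// hypothetical). Non-streaming batch mode carries a segment:
//
//   entity { description: "bicycle" language_code: "en-US" }
//   confidence: 0.87
//   segment {
//     start_time_offset { seconds: 12 }
//     end_time_offset { seconds: 18 }
//   }
//
// Streaming mode carries a track_id instead of a segment:
//
//   entity { description: "bicycle" language_code: "en-US" }
//   confidence: 0.87
//   track_id: 42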

// Annotation corresponding to one detected, tracked and recognized logo class.
message LogoRecognitionAnnotation {
  // Entity category information to specify the logo class that all the logo
  // tracks within this LogoRecognitionAnnotation are recognized as.
  Entity entity = 1;

  // All logo tracks where the recognized logo appears. Each track corresponds
  // to one logo instance appearing in consecutive frames.
  repeated Track tracks = 2;

  // All video segments where the recognized logo appears. There might be
  // multiple instances of the same logo class appearing in one VideoSegment.
  repeated VideoSegment segments = 3;
}