image_annotator.proto

// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.vision.v1p1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/cloud/vision/v1p1beta1/geometry.proto";
import "google/cloud/vision/v1p1beta1/text_annotation.proto";
import "google/cloud/vision/v1p1beta1/web_detection.proto";
import "google/rpc/status.proto";
import "google/type/color.proto";
import "google/type/latlng.proto";

option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/cloud/vision/v1p1beta1;vision";
option java_multiple_files = true;
option java_outer_classname = "ImageAnnotatorProto";
option java_package = "com.google.cloud.vision.v1p1beta1";

// Service that performs Google Cloud Vision API detection tasks over client
// images, such as face, landmark, logo, label, and text detection. The
// ImageAnnotator service returns detected entities from the images.
service ImageAnnotator {
  option (google.api.default_host) = "vision.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform,"
      "https://www.googleapis.com/auth/cloud-vision";

  // Run image detection and annotation for a batch of images.
  rpc BatchAnnotateImages(BatchAnnotateImagesRequest)
      returns (BatchAnnotateImagesResponse) {
    option (google.api.http) = {
      post: "/v1p1beta1/images:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "requests";
  }
}
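
// Illustrative only: a minimal BatchAnnotateImagesRequest body in the proto3
// JSON mapping, as POSTed to /v1p1beta1/images:annotate per the
// google.api.http option above. The bucket, object, and feature choices are
// hypothetical examples:
//
//   {
//     "requests": [
//       {
//         "image": { "source": { "imageUri": "gs://example-bucket/photo.jpg" } },
//         "features": [ { "type": "LABEL_DETECTION", "maxResults": 5 } ]
//       }
//     ]
//   }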

// Users describe the type of Google Cloud Vision API tasks to perform over
// images by using *Feature*s. Each Feature indicates a type of image
// detection task to perform. Features encode the Cloud Vision API
// vertical to operate on and the number of top-scoring results to return.
message Feature {
  // Type of image feature.
  enum Type {
    // Unspecified feature type.
    TYPE_UNSPECIFIED = 0;
    // Run face detection.
    FACE_DETECTION = 1;
    // Run landmark detection.
    LANDMARK_DETECTION = 2;
    // Run logo detection.
    LOGO_DETECTION = 3;
    // Run label detection.
    LABEL_DETECTION = 4;
    // Run OCR.
    TEXT_DETECTION = 5;
    // Run dense text document OCR. Takes precedence when both
    // DOCUMENT_TEXT_DETECTION and TEXT_DETECTION are present.
    DOCUMENT_TEXT_DETECTION = 11;
    // Run computer vision models to compute image safe-search properties.
    SAFE_SEARCH_DETECTION = 6;
    // Compute a set of image properties, such as the image's dominant colors.
    IMAGE_PROPERTIES = 7;
    // Run crop hints.
    CROP_HINTS = 9;
    // Run web detection.
    WEB_DETECTION = 10;
  }

  // The feature type.
  Type type = 1;

  // Maximum number of results of this type.
  int32 max_results = 2;

  // Model to use for the feature.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest". `DOCUMENT_TEXT_DETECTION` and `TEXT_DETECTION` also
  // support "builtin/weekly" for the bleeding edge release updated weekly.
  string model = 3;
}
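
// Illustrative only: a Feature in the proto3 JSON mapping requesting
// document OCR with the weekly model (the type, count, and model here are a
// hypothetical example, not a recommendation):
//
//   { "type": "DOCUMENT_TEXT_DETECTION", "maxResults": 1, "model": "builtin/weekly" }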

// External image source (Google Cloud Storage image location).
message ImageSource {
  // NOTE: For new code `image_uri` below is preferred.
  // Google Cloud Storage image URI, which must be in the following form:
  // `gs://bucket_name/object_name` (for details, see
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris)).
  // NOTE: Cloud Storage object versioning is not supported.
  string gcs_image_uri = 1;

  // Image URI which supports:
  // 1) Google Cloud Storage image URI, which must be in the following form:
  //    `gs://bucket_name/object_name` (for details, see
  //    [Google Cloud Storage Request
  //    URIs](https://cloud.google.com/storage/docs/reference-uris)).
  //    NOTE: Cloud Storage object versioning is not supported.
  // 2) Publicly accessible image HTTP/HTTPS URL.
  // This is preferred over the legacy `gcs_image_uri` above. When both
  // `gcs_image_uri` and `image_uri` are specified, `image_uri` takes
  // precedence.
  string image_uri = 2;
}

// Client image to perform Google Cloud Vision API tasks over.
message Image {
  // Image content, represented as a stream of bytes.
  // Note: as with all `bytes` fields, protocol buffers use a pure binary
  // representation, whereas JSON representations use base64.
  bytes content = 1;

  // Google Cloud Storage image location. If both `content` and `source`
  // are provided for an image, `content` takes precedence and is
  // used to perform the image annotation request.
  ImageSource source = 2;
}
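
// Illustrative only: in the proto3 JSON mapping, `content` carries the image
// bytes base64-encoded. A hypothetical inline image (payload truncated for
// readability):
//
//   { "content": "iVBORw0KGgoAAAANSUhEUg..." }
//
// versus the same image referenced from Cloud Storage; if both were set,
// `content` would win per the field comment above:
//
//   { "source": { "imageUri": "gs://example-bucket/photo.png" } }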

// A face annotation object contains the results of face detection.
message FaceAnnotation {
  // A face-specific landmark (for example, a face feature).
  message Landmark {
    // Face landmark (feature) type.
    // Left and right are defined from the vantage of the viewer of the image
    // without considering mirror projections typical of photos. So, `LEFT_EYE`,
    // typically, is the person's right eye.
    enum Type {
      // Unknown face landmark detected. Should not be filled.
      UNKNOWN_LANDMARK = 0;
      // Left eye.
      LEFT_EYE = 1;
      // Right eye.
      RIGHT_EYE = 2;
      // Left of left eyebrow.
      LEFT_OF_LEFT_EYEBROW = 3;
      // Right of left eyebrow.
      RIGHT_OF_LEFT_EYEBROW = 4;
      // Left of right eyebrow.
      LEFT_OF_RIGHT_EYEBROW = 5;
      // Right of right eyebrow.
      RIGHT_OF_RIGHT_EYEBROW = 6;
      // Midpoint between eyes.
      MIDPOINT_BETWEEN_EYES = 7;
      // Nose tip.
      NOSE_TIP = 8;
      // Upper lip.
      UPPER_LIP = 9;
      // Lower lip.
      LOWER_LIP = 10;
      // Mouth left.
      MOUTH_LEFT = 11;
      // Mouth right.
      MOUTH_RIGHT = 12;
      // Mouth center.
      MOUTH_CENTER = 13;
      // Nose, bottom right.
      NOSE_BOTTOM_RIGHT = 14;
      // Nose, bottom left.
      NOSE_BOTTOM_LEFT = 15;
      // Nose, bottom center.
      NOSE_BOTTOM_CENTER = 16;
      // Left eye, top boundary.
      LEFT_EYE_TOP_BOUNDARY = 17;
      // Left eye, right corner.
      LEFT_EYE_RIGHT_CORNER = 18;
      // Left eye, bottom boundary.
      LEFT_EYE_BOTTOM_BOUNDARY = 19;
      // Left eye, left corner.
      LEFT_EYE_LEFT_CORNER = 20;
      // Right eye, top boundary.
      RIGHT_EYE_TOP_BOUNDARY = 21;
      // Right eye, right corner.
      RIGHT_EYE_RIGHT_CORNER = 22;
      // Right eye, bottom boundary.
      RIGHT_EYE_BOTTOM_BOUNDARY = 23;
      // Right eye, left corner.
      RIGHT_EYE_LEFT_CORNER = 24;
      // Left eyebrow, upper midpoint.
      LEFT_EYEBROW_UPPER_MIDPOINT = 25;
      // Right eyebrow, upper midpoint.
      RIGHT_EYEBROW_UPPER_MIDPOINT = 26;
      // Left ear tragion.
      LEFT_EAR_TRAGION = 27;
      // Right ear tragion.
      RIGHT_EAR_TRAGION = 28;
      // Left eye pupil.
      LEFT_EYE_PUPIL = 29;
      // Right eye pupil.
      RIGHT_EYE_PUPIL = 30;
      // Forehead glabella.
      FOREHEAD_GLABELLA = 31;
      // Chin gnathion.
      CHIN_GNATHION = 32;
      // Chin left gonion.
      CHIN_LEFT_GONION = 33;
      // Chin right gonion.
      CHIN_RIGHT_GONION = 34;
    }

    // Face landmark type.
    Type type = 3;

    // Face landmark position.
    Position position = 4;
  }

  // The bounding polygon around the face. The coordinates of the bounding box
  // are in the original image's scale, as returned in `ImageParams`.
  // The bounding box is computed to "frame" the face in accordance with human
  // expectations. It is based on the landmarker results.
  // Note that one or more x and/or y coordinates may not be generated in the
  // `BoundingPoly` (the polygon will be unbounded) if only a partial face
  // appears in the image to be annotated.
  BoundingPoly bounding_poly = 1;

  // The `fd_bounding_poly` bounding polygon is tighter than the
  // `boundingPoly`, and encloses only the skin part of the face. Typically, it
  // is used to eliminate the face from any image analysis that detects the
  // "amount of skin" visible in an image. It is not based on the
  // landmarker results, only on the initial face detection, hence
  // the <code>fd</code> (face detection) prefix.
  BoundingPoly fd_bounding_poly = 2;

  // Detected face landmarks.
  repeated Landmark landmarks = 3;

  // Roll angle, which indicates the amount of clockwise/anti-clockwise rotation
  // of the face relative to the image vertical about the axis perpendicular to
  // the face. Range [-180,180].
  float roll_angle = 4;

  // Yaw angle, which indicates the leftward/rightward angle that the face is
  // pointing relative to the vertical plane perpendicular to the image. Range
  // [-180,180].
  float pan_angle = 5;

  // Pitch angle, which indicates the upwards/downwards angle that the face is
  // pointing relative to the image's horizontal plane. Range [-180,180].
  float tilt_angle = 6;

  // Detection confidence. Range [0, 1].
  float detection_confidence = 7;

  // Face landmarking confidence. Range [0, 1].
  float landmarking_confidence = 8;

  // Joy likelihood.
  Likelihood joy_likelihood = 9;

  // Sorrow likelihood.
  Likelihood sorrow_likelihood = 10;

  // Anger likelihood.
  Likelihood anger_likelihood = 11;

  // Surprise likelihood.
  Likelihood surprise_likelihood = 12;

  // Under-exposed likelihood.
  Likelihood under_exposed_likelihood = 13;

  // Blurred likelihood.
  Likelihood blurred_likelihood = 14;

  // Headwear likelihood.
  Likelihood headwear_likelihood = 15;
}
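
// Illustrative only: a fragment of one FaceAnnotation in the proto3 JSON
// mapping; every angle, confidence, and likelihood below is a made-up
// example:
//
//   {
//     "rollAngle": -3.8,
//     "panAngle": 15.2,
//     "tiltAngle": 1.1,
//     "detectionConfidence": 0.97,
//     "joyLikelihood": "VERY_LIKELY",
//     "headwearLikelihood": "VERY_UNLIKELY"
//   }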

// Detected entity location information.
message LocationInfo {
  // lat/long location coordinates.
  google.type.LatLng lat_lng = 1;
}

// A `Property` consists of a user-supplied name/value pair.
message Property {
  // Name of the property.
  string name = 1;

  // Value of the property.
  string value = 2;

  // Value of numeric properties.
  uint64 uint64_value = 3;
}

// Set of detected entity features.
message EntityAnnotation {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string mid = 1;

  // The language code for the locale in which the entity textual
  // `description` is expressed.
  string locale = 2;

  // Entity textual description, expressed in its `locale` language.
  string description = 3;

  // Overall score of the result. Range [0, 1].
  float score = 4;

  // The accuracy of the entity detection in an image.
  // For example, for an image in which the "Eiffel Tower" entity is detected,
  // this field represents the confidence that there is a tower in the query
  // image. Range [0, 1].
  float confidence = 5;

  // The relevancy of the ICA (Image Content Annotation) label to the
  // image. For example, the relevancy of "tower" is likely higher to an image
  // containing the detected "Eiffel Tower" than to an image containing a
  // detected distant towering building, even though the confidence that
  // there is a tower in each image may be the same. Range [0, 1].
  float topicality = 6;

  // Image region to which this entity belongs. Not produced
  // for `LABEL_DETECTION` features.
  BoundingPoly bounding_poly = 7;

  // The location information for the detected entity. Multiple
  // `LocationInfo` elements can be present because one location may
  // indicate the location of the scene in the image, and another location
  // may indicate the location of the place where the image was taken.
  // Location information is usually present for landmarks.
  repeated LocationInfo locations = 8;

  // Some entities may have optional user-supplied `Property` (name/value)
  // fields, such as a score or string that qualifies the entity.
  repeated Property properties = 9;
}

// Set of features pertaining to the image, computed by computer vision
// methods over safe-search verticals (for example, adult, spoof, medical,
// violence).
message SafeSearchAnnotation {
  // Represents the adult content likelihood for the image. Adult content may
  // contain elements such as nudity, pornographic images or cartoons, or
  // sexual activities.
  Likelihood adult = 1;

  // Spoof likelihood. The likelihood that a modification
  // was made to the image's canonical version to make it appear
  // funny or offensive.
  Likelihood spoof = 2;

  // Likelihood that this is a medical image.
  Likelihood medical = 3;

  // Likelihood that this image contains violent content.
  Likelihood violence = 4;

  // Likelihood that the request image contains racy content. Racy content may
  // include (but is not limited to) skimpy or sheer clothing, strategically
  // covered nudity, lewd or provocative poses, or close-ups of sensitive
  // body areas.
  Likelihood racy = 9;
}
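
// Illustrative only: one possible SafeSearchAnnotation in the proto3 JSON
// mapping; the bucket assignments are hypothetical:
//
//   { "adult": "VERY_UNLIKELY", "spoof": "UNLIKELY", "medical": "UNLIKELY",
//     "violence": "POSSIBLE", "racy": "UNLIKELY" }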

// Rectangle determined by min and max `LatLng` pairs.
message LatLongRect {
  // Min lat/long pair.
  google.type.LatLng min_lat_lng = 1;

  // Max lat/long pair.
  google.type.LatLng max_lat_lng = 2;
}

// Color information consists of RGB channels, score, and the fraction of
// the image that the color occupies in the image.
message ColorInfo {
  // RGB components of the color.
  google.type.Color color = 1;

  // Image-specific score for this color. Value in range [0, 1].
  float score = 2;

  // The fraction of pixels the color occupies in the image.
  // Value in range [0, 1].
  float pixel_fraction = 3;
}

// Set of dominant colors and their corresponding scores.
message DominantColorsAnnotation {
  // RGB color values with their score and pixel fraction.
  repeated ColorInfo colors = 1;
}

// Stores image properties, such as dominant colors.
message ImageProperties {
  // If present, dominant colors completed successfully.
  DominantColorsAnnotation dominant_colors = 1;
}
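
// Illustrative only: a dominant-colors fragment of an ImageProperties
// result in the proto3 JSON mapping; the color components, score, and pixel
// fraction are all hypothetical values:
//
//   {
//     "dominantColors": {
//       "colors": [
//         { "color": { "red": 212, "green": 188, "blue": 160 },
//           "score": 0.41, "pixelFraction": 0.18 }
//       ]
//     }
//   }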

// Single crop hint that is used to generate a new crop when serving an image.
message CropHint {
  // The bounding polygon for the crop region. The coordinates of the bounding
  // box are in the original image's scale, as returned in `ImageParams`.
  BoundingPoly bounding_poly = 1;

  // Confidence of this being a salient region. Range [0, 1].
  float confidence = 2;

  // Fraction of importance of this salient region with respect to the original
  // image.
  float importance_fraction = 3;
}

// Set of crop hints that are used to generate new crops when serving images.
message CropHintsAnnotation {
  // Crop hint results.
  repeated CropHint crop_hints = 1;
}

// Parameters for crop hints annotation request.
message CropHintsParams {
  // Aspect ratios in floats, representing the ratio of the width to the height
  // of the image. For example, if the desired aspect ratio is 4/3, the
  // corresponding float value should be 1.33333. If not specified, the
  // best possible crop is returned. The number of provided aspect ratios is
  // limited to a maximum of 16; any aspect ratios provided after the 16th are
  // ignored.
  repeated float aspect_ratios = 1;
}
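
// Illustrative only: requesting 4:3 and 16:9 crop hints in the proto3 JSON
// mapping, following the width/height convention above (4/3 ≈ 1.33333,
// 16/9 ≈ 1.77778); the choice of ratios is hypothetical:
//
//   { "aspectRatios": [1.33333, 1.77778] }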

// Parameters for web detection request.
message WebDetectionParams {
  // Whether to include results derived from the geo information in the image.
  bool include_geo_results = 2;
}

// Parameters for text detections. This is used to control TEXT_DETECTION and
// DOCUMENT_TEXT_DETECTION features.
message TextDetectionParams {
  // By default, the Cloud Vision API only includes a confidence score for
  // DOCUMENT_TEXT_DETECTION results. Set this flag to true to include a
  // confidence score for TEXT_DETECTION as well.
  bool enable_text_detection_confidence_score = 9;

  // A list of advanced OCR options to fine-tune OCR behavior.
  repeated string advanced_ocr_options = 11;
}
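
// Illustrative only: opting in to TEXT_DETECTION confidence scores via the
// proto3 JSON mapping of TextDetectionParams:
//
//   { "enableTextDetectionConfidenceScore": true }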

// Image context and/or feature-specific parameters.
message ImageContext {
  // lat/long rectangle that specifies the location of the image.
  LatLongRect lat_long_rect = 1;

  // List of languages to use for TEXT_DETECTION. In most cases, an empty value
  // yields the best results since it enables automatic language detection. For
  // languages based on the Latin alphabet, setting `language_hints` is not
  // needed. In rare cases, when the language of the text in the image is known,
  // setting a hint will help get better results (although it will be a
  // significant hindrance if the hint is wrong). Text detection returns an
  // error if one or more of the specified languages is not one of the
  // [supported languages](https://cloud.google.com/vision/docs/languages).
  repeated string language_hints = 2;

  // Parameters for crop hints annotation request.
  CropHintsParams crop_hints_params = 4;

  // Parameters for web detection.
  WebDetectionParams web_detection_params = 6;

  // Parameters for text detection and document text detection.
  TextDetectionParams text_detection_params = 12;
}
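
// Illustrative only: an ImageContext combining an OCR language hint with
// crop-hint parameters in the proto3 JSON mapping; the hint and ratio are
// hypothetical:
//
//   {
//     "languageHints": ["fr"],
//     "cropHintsParams": { "aspectRatios": [1.33333] }
//   }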

// Request for performing Google Cloud Vision API tasks over a user-provided
// image, with user-requested features.
message AnnotateImageRequest {
  // The image to be processed.
  Image image = 1;

  // Requested features.
  repeated Feature features = 2;

  // Additional context that may accompany the image.
  ImageContext image_context = 3;
}

// Response to an image annotation request.
message AnnotateImageResponse {
  // If present, face detection has completed successfully.
  repeated FaceAnnotation face_annotations = 1;

  // If present, landmark detection has completed successfully.
  repeated EntityAnnotation landmark_annotations = 2;

  // If present, logo detection has completed successfully.
  repeated EntityAnnotation logo_annotations = 3;

  // If present, label detection has completed successfully.
  repeated EntityAnnotation label_annotations = 4;

  // If present, text (OCR) detection has completed successfully.
  repeated EntityAnnotation text_annotations = 5;

  // If present, text (OCR) detection or document (OCR) text detection has
  // completed successfully.
  // This annotation provides the structural hierarchy for the OCR detected
  // text.
  TextAnnotation full_text_annotation = 12;

  // If present, safe-search annotation has completed successfully.
  SafeSearchAnnotation safe_search_annotation = 6;

  // If present, image properties were extracted successfully.
  ImageProperties image_properties_annotation = 8;

  // If present, crop hints have completed successfully.
  CropHintsAnnotation crop_hints_annotation = 11;

  // If present, web detection has completed successfully.
  WebDetection web_detection = 13;

  // If set, represents the error message for the operation.
  // Note that filled-in image annotations are guaranteed to be
  // correct, even when `error` is set.
  google.rpc.Status error = 9;
}
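
// Illustrative only: a fragment of an AnnotateImageResponse to a
// LABEL_DETECTION request in the proto3 JSON mapping; the entity values are
// hypothetical (the `mid` merely follows the Knowledge Graph ID format):
//
//   {
//     "labelAnnotations": [
//       { "mid": "/m/01yrx", "description": "Cat", "score": 0.98, "topicality": 0.98 }
//     ]
//   }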

// Multiple image annotation requests are batched into a single service call.
message BatchAnnotateImagesRequest {
  // Required. Individual image annotation requests for this batch.
  repeated AnnotateImageRequest requests = 1
      [(google.api.field_behavior) = REQUIRED];
}

// Response to a batch image annotation request.
message BatchAnnotateImagesResponse {
  // Individual responses to image annotation requests within the batch.
  repeated AnnotateImageResponse responses = 1;
}

// A bucketized representation of likelihood, which is intended to give clients
// highly stable results across model upgrades.
enum Likelihood {
  // Unknown likelihood.
  UNKNOWN = 0;
  // It is very unlikely that the image belongs to the specified vertical.
  VERY_UNLIKELY = 1;
  // It is unlikely that the image belongs to the specified vertical.
  UNLIKELY = 2;
  // It is possible that the image belongs to the specified vertical.
  POSSIBLE = 3;
  // It is likely that the image belongs to the specified vertical.
  LIKELY = 4;
  // It is very likely that the image belongs to the specified vertical.
  VERY_LIKELY = 5;
}