12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820 |
- // Copyright 2022 Google LLC
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- syntax = "proto3";
- package google.cloud.documentai.v1beta3;
- import "google/api/field_behavior.proto";
- import "google/cloud/documentai/v1beta3/barcode.proto";
- import "google/cloud/documentai/v1beta3/geometry.proto";
- import "google/protobuf/timestamp.proto";
- import "google/rpc/status.proto";
- import "google/type/color.proto";
- import "google/type/date.proto";
- import "google/type/datetime.proto";
- import "google/type/money.proto";
- import "google/type/postal_address.proto";
- option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
- option go_package = "google.golang.org/genproto/googleapis/cloud/documentai/v1beta3;documentai";
- option java_multiple_files = true;
- option java_outer_classname = "DocumentProto";
- option java_package = "com.google.cloud.documentai.v1beta3";
- option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
- option ruby_package = "Google::Cloud::DocumentAI::V1beta3";
- // Document represents the canonical document resource in Document AI. It is an
- // interchange format that provides insights into documents and allows for
- // collaboration between users and Document AI to iterate and optimize for
- // quality.
- message Document {
- // For a large document, sharding may be performed to produce several
- // document shards. Each document shard carries this message to identify
- // which shard it is.
- message ShardInfo {
- // The 0-based index of this shard.
- int64 shard_index = 1;
- // Total number of shards the overall document was split into.
- int64 shard_count = 2;
- // The offset of the first character of this shard's [Document.text][google.cloud.documentai.v1beta3.Document.text] within the
- // overall document's global text.
- int64 text_offset = 3;
- }
- // Annotation for common text style attributes. This adheres to CSS
- // conventions as much as possible.
- message Style {
- // A font size expressed as a magnitude plus a unit.
- message FontSize {
- // Numeric magnitude of the font size, measured in `unit`.
- float size = 1;
- // Unit for the font size. Follows CSS naming (`in`, `px`, `pt`, etc.).
- string unit = 2;
- }
- // Text anchor indexing into the [Document.text][google.cloud.documentai.v1beta3.Document.text].
- TextAnchor text_anchor = 1;
- // Text color.
- google.type.Color color = 2;
- // Text background color.
- google.type.Color background_color = 3;
- // Font weight. Possible values are `normal`, `bold`, `bolder`, and `lighter`.
- // https://www.w3schools.com/cssref/pr_font_weight.asp
- string font_weight = 4;
- // Text style. Possible values are `normal`, `italic`, and `oblique`.
- // https://www.w3schools.com/cssref/pr_font_font-style.asp
- string text_style = 5;
- // Text decoration. Follows the CSS standard:
- // `<text-decoration-line> <text-decoration-color> <text-decoration-style>`
- // https://www.w3schools.com/cssref/pr_text_text-decoration.asp
- string text_decoration = 6;
- // Font size.
- FontSize font_size = 7;
- // Font family, such as `Arial` or `Times New Roman`.
- // https://www.w3schools.com/cssref/pr_font_font-family.asp
- string font_family = 8;
- }
- // A page in a [Document][google.cloud.documentai.v1beta3.Document].
- message Page {
- // Physical dimension of the page, measured in `unit`.
- message Dimension {
- // Page width, in `unit`.
- float width = 1;
- // Page height, in `unit`.
- float height = 2;
- // Unit in which `width` and `height` are expressed.
- string unit = 3;
- }
- // Rendered image contents for this page.
- message Image {
- // Raw byte content of the image.
- bytes content = 1;
- // Encoding MIME type for the image.
- string mime_type = 2;
- // Width of the image in pixels.
- int32 width = 3;
- // Height of the image in pixels.
- int32 height = 4;
- }
- // Representation of a transformation matrix, intended to be compatible
- // with, and used with, the OpenCV format for image manipulation.
- message Matrix {
- // Number of rows in the matrix.
- int32 rows = 1;
- // Number of columns in the matrix.
- int32 cols = 2;
- // This encodes information about what data type the matrix uses.
- // For example, 0 (CV_8U) is an unsigned 8-bit image. For the full list
- // of OpenCV primitive data types, please refer to
- // https://docs.opencv.org/4.3.0/d1/d1b/group__core__hal__interface.html
- int32 type = 3;
- // The matrix data.
- bytes data = 4;
- }
- // Visual element describing a layout unit on a page.
- message Layout {
- // Detected human reading orientation.
- enum Orientation {
- // Unspecified orientation.
- ORIENTATION_UNSPECIFIED = 0;
- // Orientation is aligned with page up.
- PAGE_UP = 1;
- // Orientation is aligned with page right.
- // Turn the head 90 degrees clockwise from upright to read.
- PAGE_RIGHT = 2;
- // Orientation is aligned with page down.
- // Turn the head 180 degrees from upright to read.
- PAGE_DOWN = 3;
- // Orientation is aligned with page left.
- // Turn the head 90 degrees counterclockwise from upright to read.
- PAGE_LEFT = 4;
- }
- // Text anchor indexing into the [Document.text][google.cloud.documentai.v1beta3.Document.text].
- TextAnchor text_anchor = 1;
- // Confidence of the current [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] within the context of the object this
- // layout is for. For example, the confidence can be for a single token, a
- // table, a visual element, etc., depending on context. Range `[0, 1]`.
- float confidence = 2;
- // The bounding polygon for the [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout].
- BoundingPoly bounding_poly = 3;
- // Detected orientation for the [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout].
- Orientation orientation = 4;
- }
- // A block has a set of lines (collected into paragraphs) that have a
- // common line-spacing and orientation.
- message Block {
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for [Block][google.cloud.documentai.v1beta3.Document.Page.Block].
- Layout layout = 1;
- // A list of detected languages together with confidence.
- repeated DetectedLanguage detected_languages = 2;
- // The history of this annotation.
- Provenance provenance = 3 [deprecated = true];
- }
- // A collection of lines that a human would perceive as a paragraph.
- message Paragraph {
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for [Paragraph][google.cloud.documentai.v1beta3.Document.Page.Paragraph].
- Layout layout = 1;
- // A list of detected languages together with confidence.
- repeated DetectedLanguage detected_languages = 2;
- // The history of this annotation.
- Provenance provenance = 3 [deprecated = true];
- }
- // A collection of tokens that a human would perceive as a line.
- // Does not cross column boundaries; can be horizontal, vertical, etc.
- message Line {
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for [Line][google.cloud.documentai.v1beta3.Document.Page.Line].
- Layout layout = 1;
- // A list of detected languages together with confidence.
- repeated DetectedLanguage detected_languages = 2;
- // The history of this annotation.
- Provenance provenance = 3 [deprecated = true];
- }
- // A detected token.
- message Token {
- // Detected break at the end of a [Token][google.cloud.documentai.v1beta3.Document.Page.Token].
- message DetectedBreak {
- // Enum to denote the type of break found.
- enum Type {
- // Unspecified break type.
- TYPE_UNSPECIFIED = 0;
- // A single whitespace.
- SPACE = 1;
- // A wider whitespace.
- WIDE_SPACE = 2;
- // A hyphen that indicates that a token has been split across lines.
- HYPHEN = 3;
- }
- // Detected break type.
- Type type = 1;
- }
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for [Token][google.cloud.documentai.v1beta3.Document.Page.Token].
- Layout layout = 1;
- // Detected break at the end of a [Token][google.cloud.documentai.v1beta3.Document.Page.Token].
- DetectedBreak detected_break = 2;
- // A list of detected languages together with confidence.
- repeated DetectedLanguage detected_languages = 3;
- // The history of this annotation.
- Provenance provenance = 4 [deprecated = true];
- }
- // A detected symbol.
- message Symbol {
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for [Symbol][google.cloud.documentai.v1beta3.Document.Page.Symbol].
- Layout layout = 1;
- // A list of detected languages together with confidence.
- repeated DetectedLanguage detected_languages = 2;
- }
- // Detected non-text visual elements on the page, such as checkboxes and
- // signatures.
- message VisualElement {
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for [VisualElement][google.cloud.documentai.v1beta3.Document.Page.VisualElement].
- Layout layout = 1;
- // Type of the [VisualElement][google.cloud.documentai.v1beta3.Document.Page.VisualElement].
- string type = 2;
- // A list of detected languages together with confidence.
- repeated DetectedLanguage detected_languages = 3;
- }
- // A table representation similar to HTML table structure.
- message Table {
- // A row of table cells.
- message TableRow {
- // Cells that make up this row.
- repeated TableCell cells = 1;
- }
- // A cell representation inside the table.
- message TableCell {
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for [TableCell][google.cloud.documentai.v1beta3.Document.Page.Table.TableCell].
- Layout layout = 1;
- // How many rows this cell spans.
- int32 row_span = 2;
- // How many columns this cell spans.
- int32 col_span = 3;
- // A list of detected languages together with confidence.
- repeated DetectedLanguage detected_languages = 4;
- }
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for [Table][google.cloud.documentai.v1beta3.Document.Page.Table].
- Layout layout = 1;
- // Header rows of the table.
- repeated TableRow header_rows = 2;
- // Body rows of the table.
- repeated TableRow body_rows = 3;
- // A list of detected languages together with confidence.
- repeated DetectedLanguage detected_languages = 4;
- // The history of this table. NOTE(review): unlike `provenance` on Block, Paragraph, Line and Token, this field is not marked deprecated; confirm that is intended.
- Provenance provenance = 5;
- }
- // A form field detected on the page.
- message FormField {
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for the [FormField][google.cloud.documentai.v1beta3.Document.Page.FormField] name. For example, `Address`, `Email`,
- // `Grand total`, `Phone number`, etc.
- Layout field_name = 1;
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for the [FormField][google.cloud.documentai.v1beta3.Document.Page.FormField] value.
- Layout field_value = 2;
- // A list of detected languages for name together with confidence.
- repeated DetectedLanguage name_detected_languages = 3;
- // A list of detected languages for value together with confidence.
- repeated DetectedLanguage value_detected_languages = 4;
- // If the value is non-textual, this field represents the type. Current
- // valid values are:
- // - blank (this indicates that `field_value` is normal text)
- // - `unfilled_checkbox`
- // - `filled_checkbox`
- string value_type = 5;
- // Created for Labeling UI to export key text.
- // If corrections were made to the text identified by the
- // `field_name.text_anchor`, this field will contain the correction.
- string corrected_key_text = 6;
- // Created for Labeling UI to export value text.
- // If corrections were made to the text identified by the
- // `field_value.text_anchor`, this field will contain the correction.
- string corrected_value_text = 7;
- // The history of this annotation. NOTE(review): unlike `provenance` on Block, Paragraph, Line and Token, this field is not marked deprecated; confirm that is intended.
- Provenance provenance = 8;
- }
- // A detected barcode.
- message DetectedBarcode {
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for [DetectedBarcode][google.cloud.documentai.v1beta3.Document.Page.DetectedBarcode].
- Layout layout = 1;
- // Detailed barcode information of the [DetectedBarcode][google.cloud.documentai.v1beta3.Document.Page.DetectedBarcode].
- Barcode barcode = 2;
- }
- // Detected language for a structural component.
- message DetectedLanguage {
- // The BCP-47 language code, such as `en-US` or `sr-Latn`. For more
- // information, see
- // https://www.unicode.org/reports/tr35/#Unicode_locale_identifier.
- string language_code = 1;
- // Confidence of detected language. Range `[0, 1]`.
- float confidence = 2;
- }
- // Image quality scores for the page image.
- message ImageQualityScores {
- // A quality defect detected in the page image.
- message DetectedDefect {
- // Name of the defect type. Supported values are:
- //
- // - `quality/defect_blurry`
- // - `quality/defect_noisy`
- // - `quality/defect_dark`
- // - `quality/defect_faint`
- // - `quality/defect_text_too_small`
- // - `quality/defect_document_cutoff`
- // - `quality/defect_text_cutoff`
- // - `quality/defect_glare`
- string type = 1;
- // Confidence of detected defect. Range `[0, 1]` where 1 indicates
- // strong confidence that the defect exists.
- float confidence = 2;
- }
- // The overall quality score. Range `[0, 1]` where 1 is perfect quality.
- float quality_score = 1;
- // A list of detected defects.
- repeated DetectedDefect detected_defects = 2;
- }
- // 1-based index for current [Page][google.cloud.documentai.v1beta3.Document.Page] in a parent [Document][google.cloud.documentai.v1beta3.Document].
- // Useful when a page is taken out of a [Document][google.cloud.documentai.v1beta3.Document] for individual
- // processing.
- int32 page_number = 1;
- // Rendered image for this page. This image is preprocessed to remove any
- // skew, rotation, and distortions such that the annotation bounding boxes
- // can be upright and axis-aligned.
- Image image = 13;
- // Transformation matrices that were applied to the original document image
- // to produce [Page.image][google.cloud.documentai.v1beta3.Document.Page.image].
- repeated Matrix transforms = 14;
- // Physical dimension of the page.
- Dimension dimension = 2;
- // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for the page.
- Layout layout = 3;
- // A list of detected languages together with confidence.
- repeated DetectedLanguage detected_languages = 4;
- // A list of visually detected text blocks on the page.
- // A block has a set of lines (collected into paragraphs) that have a common
- // line-spacing and orientation.
- repeated Block blocks = 5;
- // A list of visually detected text paragraphs on the page.
- // A collection of lines that a human would perceive as a paragraph.
- repeated Paragraph paragraphs = 6;
- // A list of visually detected text lines on the page.
- // A collection of tokens that a human would perceive as a line.
- repeated Line lines = 7;
- // A list of visually detected tokens on the page.
- repeated Token tokens = 8;
- // A list of detected non-text visual elements on the page, such as
- // checkboxes and signatures.
- repeated VisualElement visual_elements = 9;
- // A list of visually detected tables on the page.
- repeated Table tables = 10;
- // A list of visually detected form fields on the page.
- repeated FormField form_fields = 11;
- // A list of visually detected symbols on the page.
- repeated Symbol symbols = 12;
- // A list of detected barcodes.
- repeated DetectedBarcode detected_barcodes = 15;
- // Image quality scores.
- ImageQualityScores image_quality_scores = 17;
- // The history of this page.
- Provenance provenance = 16 [deprecated = true];
- }
- // An entity that could be a phrase in the text or a property that belongs to
- // the document. It is a known entity type, such as a person, an organization,
- // or location.
- message Entity {
- // Parsed and normalized entity value.
- message NormalizedValue {
- // An optional structured entity value.
- // Must match the entity type defined in the schema, if
- // known. If this field is present, the `text` field could also be
- // populated.
- oneof structured_value {
- // Money value. See also:
- // https://github.com/googleapis/googleapis/blob/master/google/type/money.proto
- google.type.Money money_value = 2;
- // Date value. Includes year, month, day. See also:
- // https://github.com/googleapis/googleapis/blob/master/google/type/date.proto
- google.type.Date date_value = 3;
- // DateTime value. Includes date, time, and timezone. See also:
- // https://github.com/googleapis/googleapis/blob/master/google/type/datetime.proto
- google.type.DateTime datetime_value = 4;
- // Postal address. See also:
- // https://github.com/googleapis/googleapis/blob/master/google/type/postal_address.proto
- google.type.PostalAddress address_value = 5;
- // Boolean value. Can be used for entities with binary values, or for
- // checkboxes.
- bool boolean_value = 6;
- // Integer value.
- int32 integer_value = 7;
- // Float value.
- float float_value = 8;
- }
- // Optional. An optional field to store a normalized string.
- // For some entity types, one of the respective `structured_value` fields may
- // also be populated. Also, not all the types of `structured_value` will be
- // normalized. For example, some processors may not generate float
- // or int normalized text by default.
- //
- // Below are sample formats mapped to structured values.
- //
- // - Money/Currency type (`money_value`) is in the ISO 4217 text format.
- // - Date type (`date_value`) is in the ISO 8601 text format.
- // - Datetime type (`datetime_value`) is in the ISO 8601 text format.
- string text = 1 [(google.api.field_behavior) = OPTIONAL];
- }
- // Optional. Provenance of the entity.
- // Text anchor indexing into the [Document.text][google.cloud.documentai.v1beta3.Document.text].
- TextAnchor text_anchor = 1 [(google.api.field_behavior) = OPTIONAL];
- // Required. Entity type from a schema, e.g. `Address`.
- string type = 2 [(google.api.field_behavior) = REQUIRED];
- // Optional. Text value of the entity, e.g. `1600 Amphitheatre Pkwy`.
- string mention_text = 3 [(google.api.field_behavior) = OPTIONAL];
- // Optional. Deprecated. Use `id` field instead. NOTE(review): documented as deprecated but not marked `[deprecated = true]`; confirm whether the option should be added.
- string mention_id = 4 [(google.api.field_behavior) = OPTIONAL];
- // Optional. Confidence of detected Schema entity. Range `[0, 1]`.
- float confidence = 5 [(google.api.field_behavior) = OPTIONAL];
- // Optional. Represents the provenance of this entity with respect to the
- // location on the page where it was found.
- PageAnchor page_anchor = 6 [(google.api.field_behavior) = OPTIONAL];
- // Optional. Canonical id. This will be a unique value in the entity list
- // for this document.
- string id = 7 [(google.api.field_behavior) = OPTIONAL];
- // Optional. Normalized entity value. Absent if the extracted value could not be
- // converted or the type (e.g. address) is not supported for certain
- // parsers. This field is also only populated for certain supported document
- // types.
- NormalizedValue normalized_value = 9 [(google.api.field_behavior) = OPTIONAL];
- // Optional. Entities can be nested to form a hierarchical data structure representing
- // the content in the document.
- repeated Entity properties = 10 [(google.api.field_behavior) = OPTIONAL];
- // Optional. The history of this annotation.
- Provenance provenance = 11 [(google.api.field_behavior) = OPTIONAL];
- // Optional. Whether the entity will be redacted for de-identification purposes.
- bool redacted = 12 [(google.api.field_behavior) = OPTIONAL];
- }
- // Relationship between [Entities][google.cloud.documentai.v1beta3.Document.Entity].
- message EntityRelation {
- // Id of the subject entity (presumably an `Entity.id` value; TODO confirm).
- string subject_id = 1;
- // Id of the object entity (presumably an `Entity.id` value; TODO confirm).
- string object_id = 2;
- // Description of the relationship between the subject and the object.
- string relation = 3;
- }
- // Text reference indexing into the [Document.text][google.cloud.documentai.v1beta3.Document.text].
- message TextAnchor {
- // A text segment in the [Document.text][google.cloud.documentai.v1beta3.Document.text]. The indices may be out of bounds,
- // which indicates that the text extends into another document shard for
- // large sharded documents. See [ShardInfo.text_offset][google.cloud.documentai.v1beta3.Document.ShardInfo.text_offset].
- message TextSegment {
- // [TextSegment][google.cloud.documentai.v1beta3.Document.TextAnchor.TextSegment] start UTF-8 char index in the [Document.text][google.cloud.documentai.v1beta3.Document.text].
- int64 start_index = 1;
- // [TextSegment][google.cloud.documentai.v1beta3.Document.TextAnchor.TextSegment] half-open end UTF-8 char index in the
- // [Document.text][google.cloud.documentai.v1beta3.Document.text].
- int64 end_index = 2;
- }
- // The text segments from the [Document.text][google.cloud.documentai.v1beta3.Document.text].
- repeated TextSegment text_segments = 1;
- // Contains the content of the text span so that users do
- // not have to look it up via `text_segments`. It is always
- // populated for formFields.
- string content = 2;
- }
- // Referencing the visual context of the entity in the [Document.pages][google.cloud.documentai.v1beta3.Document.pages].
- // Page anchors can be cross-page, consist of multiple bounding polygons and
- // optionally reference specific layout element types.
- message PageAnchor {
- // Represents a weak reference to a page element within a document.
- message PageRef {
- // The type of layout that is being referenced.
- enum LayoutType {
- // Layout Unspecified.
- LAYOUT_TYPE_UNSPECIFIED = 0;
- // References a [Page.blocks][google.cloud.documentai.v1beta3.Document.Page.blocks] element.
- BLOCK = 1;
- // References a [Page.paragraphs][google.cloud.documentai.v1beta3.Document.Page.paragraphs] element.
- PARAGRAPH = 2;
- // References a [Page.lines][google.cloud.documentai.v1beta3.Document.Page.lines] element.
- LINE = 3;
- // References a [Page.tokens][google.cloud.documentai.v1beta3.Document.Page.tokens] element.
- TOKEN = 4;
- // References a [Page.visual_elements][google.cloud.documentai.v1beta3.Document.Page.visual_elements] element.
- VISUAL_ELEMENT = 5;
- // References a [Page.tables][google.cloud.documentai.v1beta3.Document.Page.tables] element.
- TABLE = 6;
- // References a [Page.form_fields][google.cloud.documentai.v1beta3.Document.Page.form_fields] element.
- FORM_FIELD = 7;
- }
- // Required. Index into the [Document.pages][google.cloud.documentai.v1beta3.Document.pages] element, for example using
- // [Document.pages][page_refs.page] to locate the related page element.
- // This field is skipped when its value is the default 0. See
- // https://developers.google.com/protocol-buffers/docs/proto3#json.
- int64 page = 1 [(google.api.field_behavior) = REQUIRED];
- // Optional. The type of the layout element that is being referenced, if any.
- LayoutType layout_type = 2 [(google.api.field_behavior) = OPTIONAL];
- // Optional. Deprecated. Use [PageRef.bounding_poly][google.cloud.documentai.v1beta3.Document.PageAnchor.PageRef.bounding_poly] instead.
- string layout_id = 3 [
- deprecated = true,
- (google.api.field_behavior) = OPTIONAL
- ];
- // Optional. Identifies the bounding polygon of a layout element on the page.
- BoundingPoly bounding_poly = 4 [(google.api.field_behavior) = OPTIONAL];
- // Optional. Confidence of detected page element, if applicable. Range `[0, 1]`.
- float confidence = 5 [(google.api.field_behavior) = OPTIONAL];
- }
- // One or more references to visual page elements.
- repeated PageRef page_refs = 1;
- }
- // Structure to identify provenance relationships between annotations in
- // different revisions.
- message Provenance {
- // The parent element the current element is based on. Used for
- // referencing/aligning, removal and replacement operations.
- message Parent {
- // The index into the current revision's `parent_ids` list.
- int32 revision = 1;
- // The index of the parent item in the corresponding item list (e.g. list
- // of entities, properties within entities, etc.) in the parent revision.
- int32 index = 3;
- // The id of the parent provenance.
- int32 id = 2 [deprecated = true];
- }
- // If a processor or agent does an explicit operation on existing elements.
- enum OperationType {
- // Operation type unspecified. If no operation is specified, a provenance
- // entry is simply used to match against a `parent`.
- OPERATION_TYPE_UNSPECIFIED = 0;
- // Add an element.
- ADD = 1;
- // Remove an element identified by `parent`.
- REMOVE = 2;
- // Replace an element identified by `parent`.
- REPLACE = 3;
- // Request human review for the element identified by `parent`.
- EVAL_REQUESTED = 4;
- // Element is reviewed and approved at human review; confidence will be
- // set to 1.0.
- EVAL_APPROVED = 5;
- // Element is skipped in the validation process.
- EVAL_SKIPPED = 6;
- }
- // The index of the revision that produced this element.
- int32 revision = 1;
- // The id of this operation. Needs to be unique within the scope of the
- // revision.
- int32 id = 2 [deprecated = true];
- // References to the original elements that are replaced.
- repeated Parent parents = 3;
- // The type of provenance operation.
- OperationType type = 4;
- }
- // Contains past or forward revisions of this document.
- message Revision {
- // Human Review information of the document.
- message HumanReview {
- // Human review state, e.g. `requested`, `succeeded`, `rejected`.
- string state = 1;
- // A message providing more details about the current state of processing.
- // For example, the rejection reason when the state is `rejected`.
- string state_message = 2;
- }
- // Who or what made the change.
- oneof source {
- // If the change was made by a person, specify the name or id of that
- // person.
- string agent = 4;
- // If the annotation was made by a processor, identify the processor by its
- // resource name.
- string processor = 5;
- }
- // Id of the revision. Unique within the context of the document.
- string id = 1;
- // The revisions that this revision is based on. This can include one or
- // more parents (when documents are merged). This field represents the
- // index into the `revisions` field.
- repeated int32 parent = 2 [deprecated = true];
- // The revisions that this revision is based on. Must include all the ids
- // that have anything to do with this revision, e.g. there are
- // `provenance.parent.revision` fields that index into this field.
- repeated string parent_ids = 7;
- // The time that the revision was created.
- google.protobuf.Timestamp create_time = 3;
- // Human Review information of this revision.
- HumanReview human_review = 6;
- }
- // This message is used for text changes, also known as OCR corrections.
- message TextChange {
- // Provenance of the correction.
- // Text anchor indexing into the [Document.text][google.cloud.documentai.v1beta3.Document.text]. There can only be a
- // single `TextAnchor.text_segments` element. If the start and
- // end indices of the text segment are the same, the text change is inserted
- // before that index.
- TextAnchor text_anchor = 1;
- // The text that replaces the text identified in the `text_anchor`.
- string changed_text = 2;
- // The history of this annotation.
- repeated Provenance provenance = 3 [deprecated = true];
- }
- // Original source document from the user.
- oneof source {
- // Optional. Currently supports Google Cloud Storage URI of the form
- // `gs://bucket_name/object_name`. Object versioning is not supported.
- // See [Google Cloud Storage Request
- // URIs](https://cloud.google.com/storage/docs/reference-uris) for more
- // info.
- string uri = 1 [(google.api.field_behavior) = OPTIONAL];
- // Optional. Inline document content, represented as a stream of bytes.
- // Note: As with all `bytes` fields, protocol buffers use a pure binary
- // representation, whereas JSON representations use base64.
- bytes content = 2 [(google.api.field_behavior) = OPTIONAL];
- }
- // An IANA published MIME type (also referred to as media type). For more
- // information, see
- // https://www.iana.org/assignments/media-types/media-types.xhtml.
- string mime_type = 3;
- // Optional. UTF-8 encoded text in reading order from the document.
- string text = 4 [(google.api.field_behavior) = OPTIONAL];
- // Placeholder. Styles for the [Document.text][google.cloud.documentai.v1beta3.Document.text].
- repeated Style text_styles = 5;
- // Visual page layout for the [Document][google.cloud.documentai.v1beta3.Document].
- repeated Page pages = 6;
- // A list of entities detected on [Document.text][google.cloud.documentai.v1beta3.Document.text]. For document shards,
- // entities in this list may cross shard boundaries.
- repeated Entity entities = 7;
- // Placeholder. Relationship among [Document.entities][google.cloud.documentai.v1beta3.Document.entities].
- repeated EntityRelation entity_relations = 8;
- // Placeholder. A list of text corrections made to [Document.text][google.cloud.documentai.v1beta3.Document.text]. This
- // is usually used for annotating corrections to OCR mistakes. Text changes
- // for a given revision may not overlap with each other.
- repeated TextChange text_changes = 14;
- // Information about the sharding if this document is a shard of a larger
- // document. If the document is not sharded, this message is not specified.
- ShardInfo shard_info = 9;
- // Any error that occurred while processing this document.
- google.rpc.Status error = 10;
- // Placeholder. Revision history of this document.
- repeated Revision revisions = 13;
- }
- }
|