// Source: /aosp_15_r20/external/googleapis/google/cloud/documentai/v1beta3/document_schema.proto
// (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

// Language-specific package and namespace settings for generated code.
option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentAiDocumentSchema";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Metadata for document summarization.
message SummaryOptions {
  // Available lengths for the summary.
  enum Length {
    // Default value.
    LENGTH_UNSPECIFIED = 0;

    // A brief summary of one or two sentences.
    BRIEF = 1;

    // A paragraph-length summary.
    MODERATE = 2;

    // The longest option available.
    COMPREHENSIVE = 3;
  }

  // Available output formats for the summary.
  enum Format {
    // Default value.
    FORMAT_UNSPECIFIED = 0;

    // Format the output in paragraphs.
    PARAGRAPH = 1;

    // Format the output in bullets.
    BULLETS = 2;
  }

  // How long the summary should be.
  Length length = 1;

  // The format the summary should be in.
  Format format = 2;
}

// Metadata for how this field value is extracted.
message FieldExtractionMetadata {
  // Summary options configuration.
  SummaryOptions summary_options = 2;
}

// Metadata about a property.
message PropertyMetadata {
  // Whether the property should be considered "inactive".
  bool inactive = 3;

  // Field extraction metadata on the property.
  FieldExtractionMetadata field_extraction_metadata = 9;
}

// Metadata about an entity type.
message EntityTypeMetadata {
  // Whether the entity type should be considered inactive.
  bool inactive = 5;
}

// The schema defines the output of the processed document by a processor.
message DocumentSchema {
  // EntityType is the wrapper of a label of the corresponding model with
  // detailed attributes and limitations for entity-based processors. Multiple
  // types can also compose a dependency tree to represent nested types.
  message EntityType {
    // Defines a list of enum values.
    message EnumValues {
      // The individual values that this enum values type can include.
      repeated string values = 1;
    }

    // Defines properties that can be part of the entity type.
    message Property {
      // Types of occurrences of the entity type in the document. This
      // represents the number of instances, not mentions, of an entity.
      // For example, a bank statement might only have one
      // `account_number`, but this account number can be mentioned in several
      // places on the document. In this case, the `account_number` is
      // considered a `REQUIRED_ONCE` entity type. If, on the other hand, we
      // expect a bank statement to contain the status of multiple different
      // accounts for the customers, the occurrence type is set to
      // `REQUIRED_MULTIPLE`.
      enum OccurrenceType {
        // Unspecified occurrence type.
        OCCURRENCE_TYPE_UNSPECIFIED = 0;

        // There will be zero or one instance of this entity type. The same
        // entity instance may be mentioned multiple times.
        OPTIONAL_ONCE = 1;

        // The entity type will appear zero or multiple times.
        OPTIONAL_MULTIPLE = 2;

        // The entity type will appear exactly once. The same entity instance
        // may be mentioned multiple times.
        REQUIRED_ONCE = 3;

        // The entity type will appear once or more times.
        REQUIRED_MULTIPLE = 4;
      }

      // The name of the property. Follows the same guidelines as the
      // `EntityType.name` field.
      string name = 1;

      // User defined name for the property.
      string display_name = 6;

      // A reference to the value type of the property. This type is subject
      // to the same conventions as the `EntityType.base_types` field.
      string value_type = 2;

      // Occurrence type limits the number of instances an entity type appears
      // in the document.
      OccurrenceType occurrence_type = 3;

      // Any additional metadata about the property can be added here.
      PropertyMetadata property_metadata = 5;
    }

    // Where the possible values of this entity type come from.
    oneof value_source {
      // If specified, lists all the possible values for this entity. This
      // should not be more than a handful of values. If the number of values
      // is >10 or could change frequently use the `EntityType.value_ontology`
      // field and specify a list of all possible values in a value ontology
      // file.
      //
      // NOTE(review): no `value_ontology` field is declared on `EntityType`
      // in this file; confirm that the reference above is still current.
      EnumValues enum_values = 14;
    }

    // User defined name for the type.
    string display_name = 13;

    // Name of the type. It must be unique within the schema file and
    // cannot be a "Common Type". The following naming conventions are used:
    //
    // - Use `snake_casing`.
    // - Name matching is case-sensitive.
    // - Maximum 64 characters.
    // - Must start with a letter.
    // - Allowed characters: ASCII characters `[a-z0-9_-]`. (For backward
    //   compatibility internal infrastructure and tooling can handle any ASCII
    //   character.)
    // - The `/` is sometimes used to denote a property of a type. For example
    //   `line_item/amount`. This convention is deprecated, but will still be
    //   honored for backward compatibility.
    string name = 1;

    // The entity type that this type is derived from. For now, one and only
    // one should be set.
    repeated string base_types = 2;

    // Describes the nested structure, or composition, of an entity.
    repeated Property properties = 6;

    // Metadata for the entity type.
    EntityTypeMetadata entity_type_metadata = 11;
  }

  // Metadata for global schema behavior.
  message Metadata {
    // If true, a `document` entity type can be applied to a subdocument
    // (splitting). Otherwise, it can only be applied to the entire document
    // (classification).
    bool document_splitter = 1;

    // If true, on a given page, there can be multiple `document` annotations
    // covering it.
    bool document_allow_multiple_labels = 2;

    // If set, all the nested entities must be prefixed with the parents.
    bool prefixed_naming_on_properties = 6;

    // If set, we will skip the naming format validation in the schema. So the
    // string values in `DocumentSchema.EntityType.name` and
    // `DocumentSchema.EntityType.Property.name` will not be checked.
    bool skip_naming_validation = 7;
  }

  // Display name to show to users.
  string display_name = 1;

  // Description of the schema.
  string description = 2;

  // Entity types of the schema.
  repeated EntityType entity_types = 3;

  // Metadata of the schema.
  Metadata metadata = 4;
}