xref: /aosp_15_r20/external/icing/icing/schema/schema-util.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_SCHEMA_SCHEMA_UTIL_H_
16 #define ICING_SCHEMA_SCHEMA_UTIL_H_
17 
18 #include <cstdint>
19 #include <string>
20 #include <string_view>
21 #include <unordered_map>
22 #include <unordered_set>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/feature-flags.h"
27 #include "icing/proto/schema.pb.h"
28 
29 namespace icing {
30 namespace lib {
31 
32 class SchemaUtil {
33  public:
34   using TypeConfigMap =
35       std::unordered_map<std::string, const SchemaTypeConfigProto>;
36 
37   // A data structure that stores the relationships between schema types. The
38   // keys in TypeRelationMap are schema types, and the values are sets of schema
39   // types that are directly or indirectly related to the key.
40   template <typename T>
41   using TypeRelationMap =
42       std::unordered_map<std::string_view,
43                          std::unordered_map<std::string_view, T>>;
44 
45   // If A -> B is indicated in the map, then type A must be built before
46   // building type B, which implies one of the following situations.
47   //
48   // 1. B has a property of type A.
49   // 2. A is a parent type of B via polymorphism.
50   //
51   // For the first case, this map will also include all PropertyConfigProto
52   // (with DOCUMENT data_type) pointers which *directly* connects type A and B.
53   // IOW, this vector of PropertyConfigProto* are "direct edges" connecting A
54   // and B directly. It will be an empty vector if A and B are not "directly"
55   // connected, but instead via another intermediate level of schema type. For
56   // example, the actual dependency is A -> C -> B, so there will be A -> C and
57   // C -> B with valid PropertyConfigProto* respectively in this map, but we
58   // will also expand transitive dependents: add A -> B into dependent map with
59   // empty vector of "edges".
60   using DependentMap = TypeRelationMap<std::vector<const PropertyConfigProto*>>;
61 
62   // If A -> B is indicated in the map, then type A is a parent type of B,
63   // directly or indirectly. If directly, the bool value in the map will be
64   // true, otherwise false.
65   //
66   // Note that all relationships contained in this map are also entries in the
67   // DependentMap, i.e. if B inherits from A, then there will be a mapping from
68   // A to B in both this map and the DependentMap.
69   using InheritanceMap = TypeRelationMap<bool>;
70 
71   struct SchemaDelta {
72     // Which schema types were present in the old schema, but were deleted from
73     // the new schema.
74     std::unordered_set<std::string> schema_types_deleted;
75 
76     // Which schema types had their SchemaTypeConfigProto changed in a way that
77     // could invalidate existing Documents of that schema type.
78     std::unordered_set<std::string> schema_types_incompatible;
79 
80     // Schema types that were added in the new schema. Represented by the
81     // `schema_type` field in the SchemaTypeConfigProto.
82     std::unordered_set<std::string> schema_types_new;
83 
84     // Schema types that were changed in a way that was backwards compatible and
85     // didn't invalidate the index. Represented by the `schema_type` field in
86     // the SchemaTypeConfigProto.
87     std::unordered_set<std::string> schema_types_changed_fully_compatible;
88 
89     // Schema types that were changed in a way that was backwards compatible,
90     // but invalidated the index. Represented by the `schema_type` field in the
91     // SchemaTypeConfigProto.
92     std::unordered_set<std::string> schema_types_index_incompatible;
93 
94     // Schema types that were changed in a way that was backwards compatible,
95     // but invalidated the joinable cache. Represented by the `schema_type`
96     // field in the SchemaTypeConfigProto.
97     std::unordered_set<std::string> schema_types_join_incompatible;
98 
99     // Schema types that were changed in a way that was backwards compatible,
100     // but inconsistent with the old schema so that the scorable property cache
101     // needs to be re-generated.
102     std::unordered_set<std::string> schema_types_scorable_property_inconsistent;
103 
104     bool operator==(const SchemaDelta& other) const {
105       return schema_types_deleted == other.schema_types_deleted &&
106              schema_types_incompatible == other.schema_types_incompatible &&
107              schema_types_new == other.schema_types_new &&
108              schema_types_changed_fully_compatible ==
109                  other.schema_types_changed_fully_compatible &&
110              schema_types_index_incompatible ==
111                  other.schema_types_index_incompatible &&
112              schema_types_join_incompatible ==
113                  other.schema_types_join_incompatible &&
114              schema_types_scorable_property_inconsistent ==
115                  other.schema_types_scorable_property_inconsistent;
116     }
117   };
118 
119   struct ParsedPropertyConfigs {
120     // Mapping of property name to PropertyConfigProto
121     std::unordered_map<std::string_view, const PropertyConfigProto*>
122         property_config_map;
123 
124     // Properties that have an indexing config
125     std::unordered_set<std::string_view> indexed_properties;
126 
127     // Properties that were REQUIRED
128     std::unordered_set<std::string_view> required_properties;
129 
130     // Properties that have joinable config
131     std::unordered_set<std::string_view> joinable_properties;
132 
133     // Properties that have DataType::DOCUMENT
134     std::unordered_set<std::string_view> nested_document_properties;
135   };
136 
137   // This function validates:
138   //   1. SchemaTypeConfigProto.schema_type's must be unique
139   //   2. Properties within one SchemaTypeConfigProto must be unique
140   //   3. SchemaTypeConfigProtos.schema_type must be non-empty
141   //   4. PropertyConfigProtos.property_name must be non-empty
142   //   5. PropertyConfigProtos.property_name's must be unique within one
143   //      SchemaTypeConfigProto
144   //   6. PropertyConfigProtos.data_type cannot be UNKNOWN
145   //   7. PropertyConfigProtos.data_type of DOCUMENT must also have a
146   //      schema_type
147   //   8. PropertyConfigProtos.cardinality cannot be UNKNOWN
148   //   9. PropertyConfigProtos.schema_type's must correspond to a
149   //      SchemaTypeConfigProto.schema_type
150   //  10. Property names can only be alphanumeric.
151   //  11. Any STRING data types have a valid string_indexing_config
152   //  12. PropertyConfigProtos.joinable_config must be valid. See
153   //      ValidateJoinableConfig for more details.
154   //  13. Any PropertyConfigProtos with nested DOCUMENT data type must not have
155   //      REPEATED cardinality if they reference a schema type containing
156   //      joinable property.
157   //  14. The schema definition cannot have invalid cycles. A cycle is invalid
158   //      if:
159   //      a. SchemaTypeConfigProto.parent_type definitions form an inheritance
160   //         cycle.
161   //      b. The schema's property definitions have schema_types that form a
162   //         cycle, and all properties on the cycle declare
163   //         DocumentIndexingConfig.index_nested_properties=true.
164   //      c. The schema's property definitions have schema_types that form a
165   //         cycle, and the cycle leads to an invalid joinable property config.
166   //         This is the case if:
167   //           i. Any type node in the cycle itself has a joinable proprty
168   //              (property whose joinable config is not NONE), OR
169   //          ii. Any type node in the cycle has a nested-type (direct or
170   //              indirect) with a joinable property.
171   //  15. For DOCUMENT data types, if
172   //      DocumentIndexingConfig.indexable_nested_properties_list is non-empty,
173   //      DocumentIndexingConfig.index_nested_properties must be false.
174   //  16. Validate the PropertyConfigProtos.scorable_type:
175   //        - It can only be set to ENABLED for the following data types:
176   //            a. Int64
177   //            b. Double
178   //            c. Boolean
179   //        - Documment type can't be explicitly set to DISABLED OR
180   //          ENABLED. It is implicitly considered scorable if any of its or its
181   //          dependency's property is scorable.
182   //
183   // Returns:
184   //   On success, a dependent map from each types to their dependent types
185   //   that depend on it directly or indirectly.
186   //   ALREADY_EXISTS for case 1 and 2
187   //   INVALID_ARGUMENT for 3-15
188   static libtextclassifier3::StatusOr<DependentMap> Validate(
189       const SchemaProto& schema, const FeatureFlags& feature_flags,
190       bool allow_circular_schema_definitions);
191 
192   // Builds a transitive inheritance map.
193   //
194   // Ex. Suppose we have a schema with four types A, B, C and D, and we have the
195   // following direct inheritance relation.
196   //
197   // A -> B (A is the parent type of B)
198   // B -> C (B is the parent type of C)
199   // C -> D (C is the parent type of D)
200   //
201   // Then, the transitive inheritance map for this schema would be:
202   //
203   // A -> B, C, D
204   // B -> C, D
205   // C -> D
206   //
207   // RETURNS:
208   //   On success, a transitive inheritance map of all types in the schema.
209   //   INVALID_ARGUMENT if the inheritance graph contains a cycle.
210   static libtextclassifier3::StatusOr<SchemaUtil::InheritanceMap>
211   BuildTransitiveInheritanceGraph(const SchemaProto& schema);
212 
213   // Creates a mapping of schema type -> schema type config proto. The
214   // type_config_map is cleared, and then each schema-type_config_proto pair is
215   // placed in the given type_config_map parameter.
216   static void BuildTypeConfigMap(const SchemaProto& schema,
217                                  TypeConfigMap* type_config_map);
218 
219   // Parses the given type_config and returns a struct of easily-parseable
220   // information about the properties.
221   static ParsedPropertyConfigs ParsePropertyConfigs(
222       const SchemaTypeConfigProto& type_config);
223 
224   // Computes the delta between the old and new schema. There are a few
225   // differences that'll be reported:
226   //   1. The derived index would be incompatible. This is held in
227   //      `SchemaDelta.index_incompatible`.
228   //   2. Some schema types existed in the old schema, but have been deleted
229   //      from the new schema. This is held in
230   //      `SchemaDelta.schema_types_deleted`
231   //   3. A schema type's new definition would mean any existing data of the old
232   //      definition is now incompatible.
233   //   4. The derived join index would be incompatible. This is held in
234   //      `SchemaDelta.join_incompatible`.
235   //   5. The scorable properties of two schema are inconsistent. This is held
236   //      in `SchemaDelta.schema_types_scorable_property_inconsistent`.
237   //
238   // For case 1, the two schemas would result in an incompatible index if:
239   //   1.1. The new SchemaProto has a different set of indexed properties than
240   //        the old SchemaProto.
241   //
242   // For case 3, the two schemas would result in incompatible data if:
243   //   3.1. A SchemaTypeConfig exists in the old SchemaProto, but is not in the
244   //        new SchemaProto
245   //   3.2. A property exists in the old SchemaTypeConfig, but is not in the new
246   //        SchemaTypeConfig
247   //   3.3. A property in the new SchemaTypeConfig and has a REQUIRED
248   //        PropertyConfigProto.cardinality, but is not in the old
249   //        SchemaTypeConfig
250   //   3.4. A property is in both the old and new SchemaTypeConfig, but its
251   //        PropertyConfigProto.data_type is different
252   //   3.5. A property is in both the old and new SchemaTypeConfig, but its
253   //        PropertyConfigProto.schema_type is different
254   //   3.6. A property is in both the old and new SchemaTypeConfig, but its new
255   //        PropertyConfigProto.cardinality is more restrictive. Restrictive
256   //        scale defined as:
257   //          LEAST <REPEATED - OPTIONAL - REQUIRED> MOST
258   //
259   // For case 4, the two schemas would result in an incompatible join if:
260   //   4.1. A SchematypeConfig exists in the new SchemaProto that has a
261   //        different set of joinable properties than it did in the old
262   //        SchemaProto.
263   //
264   // For case 5, a schema type is considered to have inconsistent scorable
265   // properties if it is present in both the old and new schemas, and that:
266   //   5.1. The schema type contains different sets of scorable properties in
267   //        the old and new schemas. It could be that:
268   //          a. The type contains scorable properties in the new schema, but
269   //             not in the old schema.
270   //          b. The type contains scorable properties in the old schema, but
271   //             not in the new schema.
272   //          c. The type contains scorable properties in both the old and new
273   //             schemas, but the set of properties are different.
274   //   5.2. The type has dependency on the types that are considered to have
275   //        inconsistent scorable properties, based on the new schema's
276   //        dependent map.
277   //
278   // A property is defined by the combination of the
279   // SchemaTypeConfig.schema_type and the PropertyConfigProto.property_name.
280   //
281   // Returns a SchemaDelta that captures the aforementioned differences.
282   static const SchemaDelta ComputeCompatibilityDelta(
283       const SchemaProto& old_schema, const SchemaProto& new_schema,
284       const DependentMap& new_schema_dependent_map,
285       const FeatureFlags& feature_flags);
286 
287   // Validates the 'property_name' field.
288   //   1. Can't be an empty string
289   //   2. Can only contain alphanumeric characters
290   //
291   // NOTE: schema_type is only used for logging. It is not necessary to populate
292   // it.
293   //
294   // RETURNS:
295   //   - OK if property_name is valid
296   //   - INVALID_ARGUMENT if property name is empty or contains an
297   //     non-alphabetic character.
298   static libtextclassifier3::Status ValidatePropertyName(
299       std::string_view property_name, std::string_view schema_type = "");
300 
301   static bool IsIndexedProperty(const PropertyConfigProto& property_config);
302 
303  private:
304   // Validates the 'schema_type' field
305   //
306   // Returns:
307   //   INVALID_ARGUMENT if 'schema_type' is an empty string.
308   //   OK on success
309   static libtextclassifier3::Status ValidateSchemaType(
310       std::string_view schema_type);
311 
312   // Validates the 'data_type' field.
313   //
314   // Returns:
315   //   INVALID_ARGUMENT if it's UNKNOWN
316   //   OK on success
317   static libtextclassifier3::Status ValidateDataType(
318       PropertyConfigProto::DataType::Code data_type,
319       std::string_view schema_type, std::string_view property_name);
320 
321   // Validates the 'cardinality' field.
322   //
323   // Returns:
324   //   INVALID_ARGUMENT if it's UNKNOWN
325   //   OK on success
326   static libtextclassifier3::Status ValidateCardinality(
327       PropertyConfigProto::Cardinality::Code cardinality,
328       std::string_view schema_type, std::string_view property_name);
329 
330   // Validates the scorable_type of the given |property_config_proto|.
331   //
332   // Returns:
333   //   INVALID_ARGUMENT if any scorable_type is found to be set incorrectly.
334   //   OK on success
335   static libtextclassifier3::Status ValidateScorableType(
336       std::string_view schema_type,
337       const PropertyConfigProto& property_config_proto);
338 
339   // Checks that the 'string_indexing_config' satisfies the following rules:
340   //   1. Only STRING data types can be indexed
341   //   2. An indexed property must have a valid tokenizer type
342   //
343   // Returns:
344   //   INVALID_ARGUMENT if any of the rules are not followed
345   //   OK on success
346   static libtextclassifier3::Status ValidateStringIndexingConfig(
347       const StringIndexingConfig& config,
348       PropertyConfigProto::DataType::Code data_type,
349       std::string_view schema_type, std::string_view property_name);
350 
351   // Checks that the 'joinable_config' satisfies the following rules:
352   //   1. If the data type matches joinable value type
353   //      a. Only STRING data types can use QUALIFIED_ID joinable value type
354   //   2. Only QUALIFIED_ID joinable value type can have delete propagation
355   //      enabled
356   //   3. Any joinable property should have non-REPEATED cardinality
357   //
358   // Returns:
359   //   INVALID_ARGUMENT if any of the rules are not followed
360   //   OK on success
361   static libtextclassifier3::Status ValidateJoinableConfig(
362       const JoinableConfig& config,
363       PropertyConfigProto::DataType::Code data_type,
364       PropertyConfigProto::Cardinality::Code cardinality,
365       std::string_view schema_type, std::string_view property_name,
366       const FeatureFlags& feature_flags);
367 
368   // Checks that the 'document_indexing_config' satisfies the following rule:
369   //    1. If indexable_nested_properties is non-empty, index_nested_properties
370   //       must be set to false.
371   //
372   // Returns:
373   //   INVALID_ARGUMENT if any of the rules are not followed
374   //   OK on success
375   static libtextclassifier3::Status ValidateDocumentIndexingConfig(
376       const DocumentIndexingConfig& config, std::string_view schema_type,
377       std::string_view property_name);
378 
379   // Returns if 'parent_type' is a direct or indirect parent of 'child_type'.
380   static bool IsParent(const SchemaUtil::InheritanceMap& inheritance_map,
381                        std::string_view parent_type,
382                        std::string_view child_type);
383 
384   // Returns if 'child_property_config' in a child type can override
385   // 'parent_property_config' in the parent type.
386   //
387   // Let's assign 'child_property_config' a type T1 and 'parent_property_config'
388   // a type T2 that captures information for their data_type, schema_type and
389   // cardinalities, so that 'child_property_config' can override
390   // 'parent_property_config' if and only if T1 <: T2, i.e. T1 is a subtype of
391   // T2.
392   //
393   // Below are the rules for inferring subtype relations.
394   // - T <: T for every type T.
395   // - If U extends T, then U <: T.
396   // - For every type T1, T2 and T3, if T1 <: T2 and T2 <: T3, then T1 <: T3.
397   // - Optional<T> <: Repeated<T> for every type T.
398   // - Required<T> <: Optional<T> for every type T.
399   // - If T1 <: T2, then
400   //   - Required<T1> <: Required<T2>
401   //   - Optional<T1> <: Optional<T2>
402   //   - Repeated<T1> <: Repeated<T2>
403   //
404   // We assume the Closed World Assumption (CWA), i.e. if T1 <: T2 cannot be
405   // deduced from the above rules, then T1 is not a subtype of T2.
406   static bool IsInheritedPropertyCompatible(
407       const SchemaUtil::InheritanceMap& inheritance_map,
408       const PropertyConfigProto& child_property_config,
409       const PropertyConfigProto& parent_property_config);
410 
411   // Verifies that every child type's property set has included all compatible
412   // properties from parent types, based on the following rule:
413   //
414   // - If a property "prop" of type T is in the parent, then the child type must
415   //   also have "prop" that is of type U, such that U <: T, i.e. U is a subtype
416   //   of T.
417   //
418   // RETURNS:
419   //   Ok on validation success
420   //   INVALID_ARGUMENT if an exception that violates the above validation rule
421   //     is found.
422   static libtextclassifier3::Status ValidateInheritedProperties(
423       const SchemaProto& schema);
424 };
425 
426 }  // namespace lib
427 }  // namespace icing
428 
429 #endif  // ICING_SCHEMA_SCHEMA_UTIL_H_
430