1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_SCHEMA_SCHEMA_UTIL_H_ 16 #define ICING_SCHEMA_SCHEMA_UTIL_H_ 17 18 #include <cstdint> 19 #include <string> 20 #include <string_view> 21 #include <unordered_map> 22 #include <unordered_set> 23 24 #include "icing/text_classifier/lib3/utils/base/status.h" 25 #include "icing/text_classifier/lib3/utils/base/statusor.h" 26 #include "icing/feature-flags.h" 27 #include "icing/proto/schema.pb.h" 28 29 namespace icing { 30 namespace lib { 31 32 class SchemaUtil { 33 public: 34 using TypeConfigMap = 35 std::unordered_map<std::string, const SchemaTypeConfigProto>; 36 37 // A data structure that stores the relationships between schema types. The 38 // keys in TypeRelationMap are schema types, and the values are sets of schema 39 // types that are directly or indirectly related to the key. 40 template <typename T> 41 using TypeRelationMap = 42 std::unordered_map<std::string_view, 43 std::unordered_map<std::string_view, T>>; 44 45 // If A -> B is indicated in the map, then type A must be built before 46 // building type B, which implies one of the following situations. 47 // 48 // 1. B has a property of type A. 49 // 2. A is a parent type of B via polymorphism. 50 // 51 // For the first case, this map will also include all PropertyConfigProto 52 // (with DOCUMENT data_type) pointers which *directly* connects type A and B. 53 // IOW, this vector of PropertyConfigProto* are "direct edges" connecting A 54 // and B directly. It will be an empty vector if A and B are not "directly" 55 // connected, but instead via another intermediate level of schema type. For 56 // example, the actual dependency is A -> C -> B, so there will be A -> C and 57 // C -> B with valid PropertyConfigProto* respectively in this map, but we 58 // will also expand transitive dependents: add A -> B into dependent map with 59 // empty vector of "edges". 60 using DependentMap = TypeRelationMap<std::vector<const PropertyConfigProto*>>; 61 62 // If A -> B is indicated in the map, then type A is a parent type of B, 63 // directly or indirectly. If directly, the bool value in the map will be 64 // true, otherwise false. 65 // 66 // Note that all relationships contained in this map are also entries in the 67 // DependentMap, i.e. if B inherits from A, then there will be a mapping from 68 // A to B in both this map and the DependentMap. 69 using InheritanceMap = TypeRelationMap<bool>; 70 71 struct SchemaDelta { 72 // Which schema types were present in the old schema, but were deleted from 73 // the new schema. 74 std::unordered_set<std::string> schema_types_deleted; 75 76 // Which schema types had their SchemaTypeConfigProto changed in a way that 77 // could invalidate existing Documents of that schema type. 78 std::unordered_set<std::string> schema_types_incompatible; 79 80 // Schema types that were added in the new schema. Represented by the 81 // `schema_type` field in the SchemaTypeConfigProto. 82 std::unordered_set<std::string> schema_types_new; 83 84 // Schema types that were changed in a way that was backwards compatible and 85 // didn't invalidate the index. Represented by the `schema_type` field in 86 // the SchemaTypeConfigProto. 87 std::unordered_set<std::string> schema_types_changed_fully_compatible; 88 89 // Schema types that were changed in a way that was backwards compatible, 90 // but invalidated the index. Represented by the `schema_type` field in the 91 // SchemaTypeConfigProto. 92 std::unordered_set<std::string> schema_types_index_incompatible; 93 94 // Schema types that were changed in a way that was backwards compatible, 95 // but invalidated the joinable cache. Represented by the `schema_type` 96 // field in the SchemaTypeConfigProto. 97 std::unordered_set<std::string> schema_types_join_incompatible; 98 99 // Schema types that were changed in a way that was backwards compatible, 100 // but inconsistent with the old schema so that the scorable property cache 101 // needs to be re-generated. 102 std::unordered_set<std::string> schema_types_scorable_property_inconsistent; 103 104 bool operator==(const SchemaDelta& other) const { 105 return schema_types_deleted == other.schema_types_deleted && 106 schema_types_incompatible == other.schema_types_incompatible && 107 schema_types_new == other.schema_types_new && 108 schema_types_changed_fully_compatible == 109 other.schema_types_changed_fully_compatible && 110 schema_types_index_incompatible == 111 other.schema_types_index_incompatible && 112 schema_types_join_incompatible == 113 other.schema_types_join_incompatible && 114 schema_types_scorable_property_inconsistent == 115 other.schema_types_scorable_property_inconsistent; 116 } 117 }; 118 119 struct ParsedPropertyConfigs { 120 // Mapping of property name to PropertyConfigProto 121 std::unordered_map<std::string_view, const PropertyConfigProto*> 122 property_config_map; 123 124 // Properties that have an indexing config 125 std::unordered_set<std::string_view> indexed_properties; 126 127 // Properties that were REQUIRED 128 std::unordered_set<std::string_view> required_properties; 129 130 // Properties that have joinable config 131 std::unordered_set<std::string_view> joinable_properties; 132 133 // Properties that have DataType::DOCUMENT 134 std::unordered_set<std::string_view> nested_document_properties; 135 }; 136 137 // This function validates: 138 // 1. SchemaTypeConfigProto.schema_type's must be unique 139 // 2. Properties within one SchemaTypeConfigProto must be unique 140 // 3. SchemaTypeConfigProtos.schema_type must be non-empty 141 // 4. PropertyConfigProtos.property_name must be non-empty 142 // 5. PropertyConfigProtos.property_name's must be unique within one 143 // SchemaTypeConfigProto 144 // 6. PropertyConfigProtos.data_type cannot be UNKNOWN 145 // 7. PropertyConfigProtos.data_type of DOCUMENT must also have a 146 // schema_type 147 // 8. PropertyConfigProtos.cardinality cannot be UNKNOWN 148 // 9. PropertyConfigProtos.schema_type's must correspond to a 149 // SchemaTypeConfigProto.schema_type 150 // 10. Property names can only be alphanumeric. 151 // 11. Any STRING data types have a valid string_indexing_config 152 // 12. PropertyConfigProtos.joinable_config must be valid. See 153 // ValidateJoinableConfig for more details. 154 // 13. Any PropertyConfigProtos with nested DOCUMENT data type must not have 155 // REPEATED cardinality if they reference a schema type containing 156 // joinable property. 157 // 14. The schema definition cannot have invalid cycles. A cycle is invalid 158 // if: 159 // a. SchemaTypeConfigProto.parent_type definitions form an inheritance 160 // cycle. 161 // b. The schema's property definitions have schema_types that form a 162 // cycle, and all properties on the cycle declare 163 // DocumentIndexingConfig.index_nested_properties=true. 164 // c. The schema's property definitions have schema_types that form a 165 // cycle, and the cycle leads to an invalid joinable property config. 166 // This is the case if: 167 // i. Any type node in the cycle itself has a joinable proprty 168 // (property whose joinable config is not NONE), OR 169 // ii. Any type node in the cycle has a nested-type (direct or 170 // indirect) with a joinable property. 171 // 15. For DOCUMENT data types, if 172 // DocumentIndexingConfig.indexable_nested_properties_list is non-empty, 173 // DocumentIndexingConfig.index_nested_properties must be false. 174 // 16. Validate the PropertyConfigProtos.scorable_type: 175 // - It can only be set to ENABLED for the following data types: 176 // a. Int64 177 // b. Double 178 // c. Boolean 179 // - Documment type can't be explicitly set to DISABLED OR 180 // ENABLED. It is implicitly considered scorable if any of its or its 181 // dependency's property is scorable. 182 // 183 // Returns: 184 // On success, a dependent map from each types to their dependent types 185 // that depend on it directly or indirectly. 186 // ALREADY_EXISTS for case 1 and 2 187 // INVALID_ARGUMENT for 3-15 188 static libtextclassifier3::StatusOr<DependentMap> Validate( 189 const SchemaProto& schema, const FeatureFlags& feature_flags, 190 bool allow_circular_schema_definitions); 191 192 // Builds a transitive inheritance map. 193 // 194 // Ex. Suppose we have a schema with four types A, B, C and D, and we have the 195 // following direct inheritance relation. 196 // 197 // A -> B (A is the parent type of B) 198 // B -> C (B is the parent type of C) 199 // C -> D (C is the parent type of D) 200 // 201 // Then, the transitive inheritance map for this schema would be: 202 // 203 // A -> B, C, D 204 // B -> C, D 205 // C -> D 206 // 207 // RETURNS: 208 // On success, a transitive inheritance map of all types in the schema. 209 // INVALID_ARGUMENT if the inheritance graph contains a cycle. 210 static libtextclassifier3::StatusOr<SchemaUtil::InheritanceMap> 211 BuildTransitiveInheritanceGraph(const SchemaProto& schema); 212 213 // Creates a mapping of schema type -> schema type config proto. The 214 // type_config_map is cleared, and then each schema-type_config_proto pair is 215 // placed in the given type_config_map parameter. 216 static void BuildTypeConfigMap(const SchemaProto& schema, 217 TypeConfigMap* type_config_map); 218 219 // Parses the given type_config and returns a struct of easily-parseable 220 // information about the properties. 221 static ParsedPropertyConfigs ParsePropertyConfigs( 222 const SchemaTypeConfigProto& type_config); 223 224 // Computes the delta between the old and new schema. There are a few 225 // differences that'll be reported: 226 // 1. The derived index would be incompatible. This is held in 227 // `SchemaDelta.index_incompatible`. 228 // 2. Some schema types existed in the old schema, but have been deleted 229 // from the new schema. This is held in 230 // `SchemaDelta.schema_types_deleted` 231 // 3. A schema type's new definition would mean any existing data of the old 232 // definition is now incompatible. 233 // 4. The derived join index would be incompatible. This is held in 234 // `SchemaDelta.join_incompatible`. 235 // 5. The scorable properties of two schema are inconsistent. This is held 236 // in `SchemaDelta.schema_types_scorable_property_inconsistent`. 237 // 238 // For case 1, the two schemas would result in an incompatible index if: 239 // 1.1. The new SchemaProto has a different set of indexed properties than 240 // the old SchemaProto. 241 // 242 // For case 3, the two schemas would result in incompatible data if: 243 // 3.1. A SchemaTypeConfig exists in the old SchemaProto, but is not in the 244 // new SchemaProto 245 // 3.2. A property exists in the old SchemaTypeConfig, but is not in the new 246 // SchemaTypeConfig 247 // 3.3. A property in the new SchemaTypeConfig and has a REQUIRED 248 // PropertyConfigProto.cardinality, but is not in the old 249 // SchemaTypeConfig 250 // 3.4. A property is in both the old and new SchemaTypeConfig, but its 251 // PropertyConfigProto.data_type is different 252 // 3.5. A property is in both the old and new SchemaTypeConfig, but its 253 // PropertyConfigProto.schema_type is different 254 // 3.6. A property is in both the old and new SchemaTypeConfig, but its new 255 // PropertyConfigProto.cardinality is more restrictive. Restrictive 256 // scale defined as: 257 // LEAST <REPEATED - OPTIONAL - REQUIRED> MOST 258 // 259 // For case 4, the two schemas would result in an incompatible join if: 260 // 4.1. A SchematypeConfig exists in the new SchemaProto that has a 261 // different set of joinable properties than it did in the old 262 // SchemaProto. 263 // 264 // For case 5, a schema type is considered to have inconsistent scorable 265 // properties if it is present in both the old and new schemas, and that: 266 // 5.1. The schema type contains different sets of scorable properties in 267 // the old and new schemas. It could be that: 268 // a. The type contains scorable properties in the new schema, but 269 // not in the old schema. 270 // b. The type contains scorable properties in the old schema, but 271 // not in the new schema. 272 // c. The type contains scorable properties in both the old and new 273 // schemas, but the set of properties are different. 274 // 5.2. The type has dependency on the types that are considered to have 275 // inconsistent scorable properties, based on the new schema's 276 // dependent map. 277 // 278 // A property is defined by the combination of the 279 // SchemaTypeConfig.schema_type and the PropertyConfigProto.property_name. 280 // 281 // Returns a SchemaDelta that captures the aforementioned differences. 282 static const SchemaDelta ComputeCompatibilityDelta( 283 const SchemaProto& old_schema, const SchemaProto& new_schema, 284 const DependentMap& new_schema_dependent_map, 285 const FeatureFlags& feature_flags); 286 287 // Validates the 'property_name' field. 288 // 1. Can't be an empty string 289 // 2. Can only contain alphanumeric characters 290 // 291 // NOTE: schema_type is only used for logging. It is not necessary to populate 292 // it. 293 // 294 // RETURNS: 295 // - OK if property_name is valid 296 // - INVALID_ARGUMENT if property name is empty or contains an 297 // non-alphabetic character. 298 static libtextclassifier3::Status ValidatePropertyName( 299 std::string_view property_name, std::string_view schema_type = ""); 300 301 static bool IsIndexedProperty(const PropertyConfigProto& property_config); 302 303 private: 304 // Validates the 'schema_type' field 305 // 306 // Returns: 307 // INVALID_ARGUMENT if 'schema_type' is an empty string. 308 // OK on success 309 static libtextclassifier3::Status ValidateSchemaType( 310 std::string_view schema_type); 311 312 // Validates the 'data_type' field. 313 // 314 // Returns: 315 // INVALID_ARGUMENT if it's UNKNOWN 316 // OK on success 317 static libtextclassifier3::Status ValidateDataType( 318 PropertyConfigProto::DataType::Code data_type, 319 std::string_view schema_type, std::string_view property_name); 320 321 // Validates the 'cardinality' field. 322 // 323 // Returns: 324 // INVALID_ARGUMENT if it's UNKNOWN 325 // OK on success 326 static libtextclassifier3::Status ValidateCardinality( 327 PropertyConfigProto::Cardinality::Code cardinality, 328 std::string_view schema_type, std::string_view property_name); 329 330 // Validates the scorable_type of the given |property_config_proto|. 331 // 332 // Returns: 333 // INVALID_ARGUMENT if any scorable_type is found to be set incorrectly. 334 // OK on success 335 static libtextclassifier3::Status ValidateScorableType( 336 std::string_view schema_type, 337 const PropertyConfigProto& property_config_proto); 338 339 // Checks that the 'string_indexing_config' satisfies the following rules: 340 // 1. Only STRING data types can be indexed 341 // 2. An indexed property must have a valid tokenizer type 342 // 343 // Returns: 344 // INVALID_ARGUMENT if any of the rules are not followed 345 // OK on success 346 static libtextclassifier3::Status ValidateStringIndexingConfig( 347 const StringIndexingConfig& config, 348 PropertyConfigProto::DataType::Code data_type, 349 std::string_view schema_type, std::string_view property_name); 350 351 // Checks that the 'joinable_config' satisfies the following rules: 352 // 1. If the data type matches joinable value type 353 // a. Only STRING data types can use QUALIFIED_ID joinable value type 354 // 2. Only QUALIFIED_ID joinable value type can have delete propagation 355 // enabled 356 // 3. Any joinable property should have non-REPEATED cardinality 357 // 358 // Returns: 359 // INVALID_ARGUMENT if any of the rules are not followed 360 // OK on success 361 static libtextclassifier3::Status ValidateJoinableConfig( 362 const JoinableConfig& config, 363 PropertyConfigProto::DataType::Code data_type, 364 PropertyConfigProto::Cardinality::Code cardinality, 365 std::string_view schema_type, std::string_view property_name, 366 const FeatureFlags& feature_flags); 367 368 // Checks that the 'document_indexing_config' satisfies the following rule: 369 // 1. If indexable_nested_properties is non-empty, index_nested_properties 370 // must be set to false. 371 // 372 // Returns: 373 // INVALID_ARGUMENT if any of the rules are not followed 374 // OK on success 375 static libtextclassifier3::Status ValidateDocumentIndexingConfig( 376 const DocumentIndexingConfig& config, std::string_view schema_type, 377 std::string_view property_name); 378 379 // Returns if 'parent_type' is a direct or indirect parent of 'child_type'. 380 static bool IsParent(const SchemaUtil::InheritanceMap& inheritance_map, 381 std::string_view parent_type, 382 std::string_view child_type); 383 384 // Returns if 'child_property_config' in a child type can override 385 // 'parent_property_config' in the parent type. 386 // 387 // Let's assign 'child_property_config' a type T1 and 'parent_property_config' 388 // a type T2 that captures information for their data_type, schema_type and 389 // cardinalities, so that 'child_property_config' can override 390 // 'parent_property_config' if and only if T1 <: T2, i.e. T1 is a subtype of 391 // T2. 392 // 393 // Below are the rules for inferring subtype relations. 394 // - T <: T for every type T. 395 // - If U extends T, then U <: T. 396 // - For every type T1, T2 and T3, if T1 <: T2 and T2 <: T3, then T1 <: T3. 397 // - Optional<T> <: Repeated<T> for every type T. 398 // - Required<T> <: Optional<T> for every type T. 399 // - If T1 <: T2, then 400 // - Required<T1> <: Required<T2> 401 // - Optional<T1> <: Optional<T2> 402 // - Repeated<T1> <: Repeated<T2> 403 // 404 // We assume the Closed World Assumption (CWA), i.e. if T1 <: T2 cannot be 405 // deduced from the above rules, then T1 is not a subtype of T2. 406 static bool IsInheritedPropertyCompatible( 407 const SchemaUtil::InheritanceMap& inheritance_map, 408 const PropertyConfigProto& child_property_config, 409 const PropertyConfigProto& parent_property_config); 410 411 // Verifies that every child type's property set has included all compatible 412 // properties from parent types, based on the following rule: 413 // 414 // - If a property "prop" of type T is in the parent, then the child type must 415 // also have "prop" that is of type U, such that U <: T, i.e. U is a subtype 416 // of T. 417 // 418 // RETURNS: 419 // Ok on validation success 420 // INVALID_ARGUMENT if an exception that violates the above validation rule 421 // is found. 422 static libtextclassifier3::Status ValidateInheritedProperties( 423 const SchemaProto& schema); 424 }; 425 426 } // namespace lib 427 } // namespace icing 428 429 #endif // ICING_SCHEMA_SCHEMA_UTIL_H_ 430