1 // Copyright (C) 2020 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_STORE_TOKENIZED_DOCUMENT_H_ 16 #define ICING_STORE_TOKENIZED_DOCUMENT_H_ 17 18 #include <cstdint> 19 #include <string_view> 20 #include <utility> 21 #include <vector> 22 23 #include "icing/text_classifier/lib3/utils/base/statusor.h" 24 #include "icing/proto/document.pb.h" 25 #include "icing/schema/joinable-property.h" 26 #include "icing/schema/schema-store.h" 27 #include "icing/schema/section.h" 28 #include "icing/tokenization/language-segmenter.h" 29 30 namespace icing { 31 namespace lib { 32 33 struct TokenizedSection { 34 SectionMetadata metadata; 35 std::vector<std::string_view> token_sequence; 36 TokenizedSectionTokenizedSection37 TokenizedSection(SectionMetadata&& metadata_in, 38 std::vector<std::string_view>&& token_sequence_in) 39 : metadata(std::move(metadata_in)), 40 token_sequence(std::move(token_sequence_in)) {} 41 }; 42 43 class TokenizedDocument { 44 public: 45 static libtextclassifier3::StatusOr<TokenizedDocument> Create( 46 const SchemaStore* schema_store, 47 const LanguageSegmenter* language_segmenter, DocumentProto document); 48 document()49 const DocumentProto& document() const { return document_; } 50 num_string_tokens()51 int32_t num_string_tokens() const { 52 int32_t num_string_tokens = 0; 53 for (const TokenizedSection& section : tokenized_string_sections_) { 54 num_string_tokens += section.token_sequence.size(); 55 } 56 return num_string_tokens; 57 } 58 tokenized_string_sections()59 const std::vector<TokenizedSection>& tokenized_string_sections() const { 60 return tokenized_string_sections_; 61 } 62 integer_sections()63 const std::vector<Section<int64_t>>& integer_sections() const { 64 return integer_sections_; 65 } 66 vector_sections()67 const std::vector<Section<PropertyProto::VectorProto>>& vector_sections() 68 const { 69 return vector_sections_; 70 } 71 72 const std::vector<JoinableProperty<std::string_view>>& qualified_id_join_properties()73 qualified_id_join_properties() const { 74 return joinable_property_group_.qualified_id_properties; 75 } 76 77 private: 78 // Use TokenizedDocument::Create() to instantiate. TokenizedDocument(DocumentProto && document,std::vector<TokenizedSection> && tokenized_string_sections,std::vector<Section<int64_t>> && integer_sections,std::vector<Section<PropertyProto::VectorProto>> && vector_sections,JoinablePropertyGroup && joinable_property_group)79 explicit TokenizedDocument( 80 DocumentProto&& document, 81 std::vector<TokenizedSection>&& tokenized_string_sections, 82 std::vector<Section<int64_t>>&& integer_sections, 83 std::vector<Section<PropertyProto::VectorProto>>&& vector_sections, 84 JoinablePropertyGroup&& joinable_property_group) 85 : document_(std::move(document)), 86 tokenized_string_sections_(std::move(tokenized_string_sections)), 87 integer_sections_(std::move(integer_sections)), 88 vector_sections_(std::move(vector_sections)), 89 joinable_property_group_(std::move(joinable_property_group)) {} 90 91 DocumentProto document_; 92 std::vector<TokenizedSection> tokenized_string_sections_; 93 std::vector<Section<int64_t>> integer_sections_; 94 std::vector<Section<PropertyProto::VectorProto>> vector_sections_; 95 JoinablePropertyGroup joinable_property_group_; 96 }; 97 98 } // namespace lib 99 } // namespace icing 100 101 #endif // ICING_STORE_TOKENIZED_DOCUMENT_H_ 102