// Copyright (C) 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*8b6cd535SAndroid Build Coastguard Worker 15*8b6cd535SAndroid Build Coastguard Worker #ifndef ICING_STORE_TOKENIZED_DOCUMENT_H_ 16*8b6cd535SAndroid Build Coastguard Worker #define ICING_STORE_TOKENIZED_DOCUMENT_H_ 17*8b6cd535SAndroid Build Coastguard Worker 18*8b6cd535SAndroid Build Coastguard Worker #include <cstdint> 19*8b6cd535SAndroid Build Coastguard Worker #include <string_view> 20*8b6cd535SAndroid Build Coastguard Worker #include <utility> 21*8b6cd535SAndroid Build Coastguard Worker #include <vector> 22*8b6cd535SAndroid Build Coastguard Worker 23*8b6cd535SAndroid Build Coastguard Worker #include "icing/text_classifier/lib3/utils/base/statusor.h" 24*8b6cd535SAndroid Build Coastguard Worker #include "icing/proto/document.pb.h" 25*8b6cd535SAndroid Build Coastguard Worker #include "icing/schema/joinable-property.h" 26*8b6cd535SAndroid Build Coastguard Worker #include "icing/schema/schema-store.h" 27*8b6cd535SAndroid Build Coastguard Worker #include "icing/schema/section.h" 28*8b6cd535SAndroid Build Coastguard Worker #include "icing/tokenization/language-segmenter.h" 29*8b6cd535SAndroid Build Coastguard Worker 30*8b6cd535SAndroid Build Coastguard Worker namespace icing { 31*8b6cd535SAndroid Build Coastguard Worker namespace lib { 32*8b6cd535SAndroid Build Coastguard Worker 33*8b6cd535SAndroid Build Coastguard Worker struct TokenizedSection { 34*8b6cd535SAndroid Build Coastguard Worker SectionMetadata metadata; 35*8b6cd535SAndroid Build Coastguard Worker std::vector<std::string_view> token_sequence; 36*8b6cd535SAndroid Build Coastguard Worker TokenizedSectionTokenizedSection37*8b6cd535SAndroid Build Coastguard Worker TokenizedSection(SectionMetadata&& metadata_in, 38*8b6cd535SAndroid Build Coastguard Worker std::vector<std::string_view>&& token_sequence_in) 39*8b6cd535SAndroid Build Coastguard Worker : metadata(std::move(metadata_in)), 40*8b6cd535SAndroid Build Coastguard Worker token_sequence(std::move(token_sequence_in)) {} 41*8b6cd535SAndroid Build Coastguard 
Worker }; 42*8b6cd535SAndroid Build Coastguard Worker 43*8b6cd535SAndroid Build Coastguard Worker class TokenizedDocument { 44*8b6cd535SAndroid Build Coastguard Worker public: 45*8b6cd535SAndroid Build Coastguard Worker static libtextclassifier3::StatusOr<TokenizedDocument> Create( 46*8b6cd535SAndroid Build Coastguard Worker const SchemaStore* schema_store, 47*8b6cd535SAndroid Build Coastguard Worker const LanguageSegmenter* language_segmenter, DocumentProto document); 48*8b6cd535SAndroid Build Coastguard Worker document()49*8b6cd535SAndroid Build Coastguard Worker const DocumentProto& document() const { return document_; } 50*8b6cd535SAndroid Build Coastguard Worker num_string_tokens()51*8b6cd535SAndroid Build Coastguard Worker int32_t num_string_tokens() const { 52*8b6cd535SAndroid Build Coastguard Worker int32_t num_string_tokens = 0; 53*8b6cd535SAndroid Build Coastguard Worker for (const TokenizedSection& section : tokenized_string_sections_) { 54*8b6cd535SAndroid Build Coastguard Worker num_string_tokens += section.token_sequence.size(); 55*8b6cd535SAndroid Build Coastguard Worker } 56*8b6cd535SAndroid Build Coastguard Worker return num_string_tokens; 57*8b6cd535SAndroid Build Coastguard Worker } 58*8b6cd535SAndroid Build Coastguard Worker tokenized_string_sections()59*8b6cd535SAndroid Build Coastguard Worker const std::vector<TokenizedSection>& tokenized_string_sections() const { 60*8b6cd535SAndroid Build Coastguard Worker return tokenized_string_sections_; 61*8b6cd535SAndroid Build Coastguard Worker } 62*8b6cd535SAndroid Build Coastguard Worker integer_sections()63*8b6cd535SAndroid Build Coastguard Worker const std::vector<Section<int64_t>>& integer_sections() const { 64*8b6cd535SAndroid Build Coastguard Worker return integer_sections_; 65*8b6cd535SAndroid Build Coastguard Worker } 66*8b6cd535SAndroid Build Coastguard Worker vector_sections()67*8b6cd535SAndroid Build Coastguard Worker const std::vector<Section<PropertyProto::VectorProto>>& vector_sections() 
68*8b6cd535SAndroid Build Coastguard Worker const { 69*8b6cd535SAndroid Build Coastguard Worker return vector_sections_; 70*8b6cd535SAndroid Build Coastguard Worker } 71*8b6cd535SAndroid Build Coastguard Worker 72*8b6cd535SAndroid Build Coastguard Worker const std::vector<JoinableProperty<std::string_view>>& qualified_id_join_properties()73*8b6cd535SAndroid Build Coastguard Worker qualified_id_join_properties() const { 74*8b6cd535SAndroid Build Coastguard Worker return joinable_property_group_.qualified_id_properties; 75*8b6cd535SAndroid Build Coastguard Worker } 76*8b6cd535SAndroid Build Coastguard Worker 77*8b6cd535SAndroid Build Coastguard Worker private: 78*8b6cd535SAndroid Build Coastguard Worker // Use TokenizedDocument::Create() to instantiate. TokenizedDocument(DocumentProto && document,std::vector<TokenizedSection> && tokenized_string_sections,std::vector<Section<int64_t>> && integer_sections,std::vector<Section<PropertyProto::VectorProto>> && vector_sections,JoinablePropertyGroup && joinable_property_group)79*8b6cd535SAndroid Build Coastguard Worker explicit TokenizedDocument( 80*8b6cd535SAndroid Build Coastguard Worker DocumentProto&& document, 81*8b6cd535SAndroid Build Coastguard Worker std::vector<TokenizedSection>&& tokenized_string_sections, 82*8b6cd535SAndroid Build Coastguard Worker std::vector<Section<int64_t>>&& integer_sections, 83*8b6cd535SAndroid Build Coastguard Worker std::vector<Section<PropertyProto::VectorProto>>&& vector_sections, 84*8b6cd535SAndroid Build Coastguard Worker JoinablePropertyGroup&& joinable_property_group) 85*8b6cd535SAndroid Build Coastguard Worker : document_(std::move(document)), 86*8b6cd535SAndroid Build Coastguard Worker tokenized_string_sections_(std::move(tokenized_string_sections)), 87*8b6cd535SAndroid Build Coastguard Worker integer_sections_(std::move(integer_sections)), 88*8b6cd535SAndroid Build Coastguard Worker vector_sections_(std::move(vector_sections)), 89*8b6cd535SAndroid Build Coastguard Worker 
joinable_property_group_(std::move(joinable_property_group)) {} 90*8b6cd535SAndroid Build Coastguard Worker 91*8b6cd535SAndroid Build Coastguard Worker DocumentProto document_; 92*8b6cd535SAndroid Build Coastguard Worker std::vector<TokenizedSection> tokenized_string_sections_; 93*8b6cd535SAndroid Build Coastguard Worker std::vector<Section<int64_t>> integer_sections_; 94*8b6cd535SAndroid Build Coastguard Worker std::vector<Section<PropertyProto::VectorProto>> vector_sections_; 95*8b6cd535SAndroid Build Coastguard Worker JoinablePropertyGroup joinable_property_group_; 96*8b6cd535SAndroid Build Coastguard Worker }; 97*8b6cd535SAndroid Build Coastguard Worker 98*8b6cd535SAndroid Build Coastguard Worker } // namespace lib 99*8b6cd535SAndroid Build Coastguard Worker } // namespace icing 100*8b6cd535SAndroid Build Coastguard Worker 101*8b6cd535SAndroid Build Coastguard Worker #endif // ICING_STORE_TOKENIZED_DOCUMENT_H_ 102