// xref: /aosp_15_r20/external/icing/icing/util/tokenized-document.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
// Copyright (C) 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 
15 #ifndef ICING_STORE_TOKENIZED_DOCUMENT_H_
16 #define ICING_STORE_TOKENIZED_DOCUMENT_H_
17 
18 #include <cstdint>
19 #include <string_view>
20 #include <utility>
21 #include <vector>
22 
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/proto/document.pb.h"
25 #include "icing/schema/joinable-property.h"
26 #include "icing/schema/schema-store.h"
27 #include "icing/schema/section.h"
28 #include "icing/tokenization/language-segmenter.h"
29 
30 namespace icing {
31 namespace lib {
32 
33 struct TokenizedSection {
34   SectionMetadata metadata;
35   std::vector<std::string_view> token_sequence;
36 
TokenizedSectionTokenizedSection37   TokenizedSection(SectionMetadata&& metadata_in,
38                    std::vector<std::string_view>&& token_sequence_in)
39       : metadata(std::move(metadata_in)),
40         token_sequence(std::move(token_sequence_in)) {}
41 };
42 
43 class TokenizedDocument {
44  public:
45   static libtextclassifier3::StatusOr<TokenizedDocument> Create(
46       const SchemaStore* schema_store,
47       const LanguageSegmenter* language_segmenter, DocumentProto document);
48 
document()49   const DocumentProto& document() const { return document_; }
50 
num_string_tokens()51   int32_t num_string_tokens() const {
52     int32_t num_string_tokens = 0;
53     for (const TokenizedSection& section : tokenized_string_sections_) {
54       num_string_tokens += section.token_sequence.size();
55     }
56     return num_string_tokens;
57   }
58 
tokenized_string_sections()59   const std::vector<TokenizedSection>& tokenized_string_sections() const {
60     return tokenized_string_sections_;
61   }
62 
integer_sections()63   const std::vector<Section<int64_t>>& integer_sections() const {
64     return integer_sections_;
65   }
66 
vector_sections()67   const std::vector<Section<PropertyProto::VectorProto>>& vector_sections()
68       const {
69     return vector_sections_;
70   }
71 
72   const std::vector<JoinableProperty<std::string_view>>&
qualified_id_join_properties()73   qualified_id_join_properties() const {
74     return joinable_property_group_.qualified_id_properties;
75   }
76 
77  private:
78   // Use TokenizedDocument::Create() to instantiate.
TokenizedDocument(DocumentProto && document,std::vector<TokenizedSection> && tokenized_string_sections,std::vector<Section<int64_t>> && integer_sections,std::vector<Section<PropertyProto::VectorProto>> && vector_sections,JoinablePropertyGroup && joinable_property_group)79   explicit TokenizedDocument(
80       DocumentProto&& document,
81       std::vector<TokenizedSection>&& tokenized_string_sections,
82       std::vector<Section<int64_t>>&& integer_sections,
83       std::vector<Section<PropertyProto::VectorProto>>&& vector_sections,
84       JoinablePropertyGroup&& joinable_property_group)
85       : document_(std::move(document)),
86         tokenized_string_sections_(std::move(tokenized_string_sections)),
87         integer_sections_(std::move(integer_sections)),
88         vector_sections_(std::move(vector_sections)),
89         joinable_property_group_(std::move(joinable_property_group)) {}
90 
91   DocumentProto document_;
92   std::vector<TokenizedSection> tokenized_string_sections_;
93   std::vector<Section<int64_t>> integer_sections_;
94   std::vector<Section<PropertyProto::VectorProto>> vector_sections_;
95   JoinablePropertyGroup joinable_property_group_;
96 };
97 
98 }  // namespace lib
99 }  // namespace icing
100 
101 #endif  // ICING_STORE_TOKENIZED_DOCUMENT_H_
102