xref: /aosp_15_r20/external/icing/icing/util/tokenized-document.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1*8b6cd535SAndroid Build Coastguard Worker // Copyright (C) 2020 Google LLC
2*8b6cd535SAndroid Build Coastguard Worker //
3*8b6cd535SAndroid Build Coastguard Worker // Licensed under the Apache License, Version 2.0 (the "License");
4*8b6cd535SAndroid Build Coastguard Worker // you may not use this file except in compliance with the License.
5*8b6cd535SAndroid Build Coastguard Worker // You may obtain a copy of the License at
6*8b6cd535SAndroid Build Coastguard Worker //
7*8b6cd535SAndroid Build Coastguard Worker //      http://www.apache.org/licenses/LICENSE-2.0
8*8b6cd535SAndroid Build Coastguard Worker //
9*8b6cd535SAndroid Build Coastguard Worker // Unless required by applicable law or agreed to in writing, software
10*8b6cd535SAndroid Build Coastguard Worker // distributed under the License is distributed on an "AS IS" BASIS,
11*8b6cd535SAndroid Build Coastguard Worker // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*8b6cd535SAndroid Build Coastguard Worker // See the License for the specific language governing permissions and
13*8b6cd535SAndroid Build Coastguard Worker // limitations under the License.
14*8b6cd535SAndroid Build Coastguard Worker 
15*8b6cd535SAndroid Build Coastguard Worker #ifndef ICING_STORE_TOKENIZED_DOCUMENT_H_
16*8b6cd535SAndroid Build Coastguard Worker #define ICING_STORE_TOKENIZED_DOCUMENT_H_
17*8b6cd535SAndroid Build Coastguard Worker 
18*8b6cd535SAndroid Build Coastguard Worker #include <cstdint>
19*8b6cd535SAndroid Build Coastguard Worker #include <string_view>
20*8b6cd535SAndroid Build Coastguard Worker #include <utility>
21*8b6cd535SAndroid Build Coastguard Worker #include <vector>
22*8b6cd535SAndroid Build Coastguard Worker 
23*8b6cd535SAndroid Build Coastguard Worker #include "icing/text_classifier/lib3/utils/base/statusor.h"
24*8b6cd535SAndroid Build Coastguard Worker #include "icing/proto/document.pb.h"
25*8b6cd535SAndroid Build Coastguard Worker #include "icing/schema/joinable-property.h"
26*8b6cd535SAndroid Build Coastguard Worker #include "icing/schema/schema-store.h"
27*8b6cd535SAndroid Build Coastguard Worker #include "icing/schema/section.h"
28*8b6cd535SAndroid Build Coastguard Worker #include "icing/tokenization/language-segmenter.h"
29*8b6cd535SAndroid Build Coastguard Worker 
30*8b6cd535SAndroid Build Coastguard Worker namespace icing {
31*8b6cd535SAndroid Build Coastguard Worker namespace lib {
32*8b6cd535SAndroid Build Coastguard Worker 
33*8b6cd535SAndroid Build Coastguard Worker struct TokenizedSection {
34*8b6cd535SAndroid Build Coastguard Worker   SectionMetadata metadata;
35*8b6cd535SAndroid Build Coastguard Worker   std::vector<std::string_view> token_sequence;
36*8b6cd535SAndroid Build Coastguard Worker 
TokenizedSectionTokenizedSection37*8b6cd535SAndroid Build Coastguard Worker   TokenizedSection(SectionMetadata&& metadata_in,
38*8b6cd535SAndroid Build Coastguard Worker                    std::vector<std::string_view>&& token_sequence_in)
39*8b6cd535SAndroid Build Coastguard Worker       : metadata(std::move(metadata_in)),
40*8b6cd535SAndroid Build Coastguard Worker         token_sequence(std::move(token_sequence_in)) {}
41*8b6cd535SAndroid Build Coastguard Worker };
42*8b6cd535SAndroid Build Coastguard Worker 
43*8b6cd535SAndroid Build Coastguard Worker class TokenizedDocument {
44*8b6cd535SAndroid Build Coastguard Worker  public:
45*8b6cd535SAndroid Build Coastguard Worker   static libtextclassifier3::StatusOr<TokenizedDocument> Create(
46*8b6cd535SAndroid Build Coastguard Worker       const SchemaStore* schema_store,
47*8b6cd535SAndroid Build Coastguard Worker       const LanguageSegmenter* language_segmenter, DocumentProto document);
48*8b6cd535SAndroid Build Coastguard Worker 
document()49*8b6cd535SAndroid Build Coastguard Worker   const DocumentProto& document() const { return document_; }
50*8b6cd535SAndroid Build Coastguard Worker 
num_string_tokens()51*8b6cd535SAndroid Build Coastguard Worker   int32_t num_string_tokens() const {
52*8b6cd535SAndroid Build Coastguard Worker     int32_t num_string_tokens = 0;
53*8b6cd535SAndroid Build Coastguard Worker     for (const TokenizedSection& section : tokenized_string_sections_) {
54*8b6cd535SAndroid Build Coastguard Worker       num_string_tokens += section.token_sequence.size();
55*8b6cd535SAndroid Build Coastguard Worker     }
56*8b6cd535SAndroid Build Coastguard Worker     return num_string_tokens;
57*8b6cd535SAndroid Build Coastguard Worker   }
58*8b6cd535SAndroid Build Coastguard Worker 
tokenized_string_sections()59*8b6cd535SAndroid Build Coastguard Worker   const std::vector<TokenizedSection>& tokenized_string_sections() const {
60*8b6cd535SAndroid Build Coastguard Worker     return tokenized_string_sections_;
61*8b6cd535SAndroid Build Coastguard Worker   }
62*8b6cd535SAndroid Build Coastguard Worker 
integer_sections()63*8b6cd535SAndroid Build Coastguard Worker   const std::vector<Section<int64_t>>& integer_sections() const {
64*8b6cd535SAndroid Build Coastguard Worker     return integer_sections_;
65*8b6cd535SAndroid Build Coastguard Worker   }
66*8b6cd535SAndroid Build Coastguard Worker 
vector_sections()67*8b6cd535SAndroid Build Coastguard Worker   const std::vector<Section<PropertyProto::VectorProto>>& vector_sections()
68*8b6cd535SAndroid Build Coastguard Worker       const {
69*8b6cd535SAndroid Build Coastguard Worker     return vector_sections_;
70*8b6cd535SAndroid Build Coastguard Worker   }
71*8b6cd535SAndroid Build Coastguard Worker 
72*8b6cd535SAndroid Build Coastguard Worker   const std::vector<JoinableProperty<std::string_view>>&
qualified_id_join_properties()73*8b6cd535SAndroid Build Coastguard Worker   qualified_id_join_properties() const {
74*8b6cd535SAndroid Build Coastguard Worker     return joinable_property_group_.qualified_id_properties;
75*8b6cd535SAndroid Build Coastguard Worker   }
76*8b6cd535SAndroid Build Coastguard Worker 
77*8b6cd535SAndroid Build Coastguard Worker  private:
78*8b6cd535SAndroid Build Coastguard Worker   // Use TokenizedDocument::Create() to instantiate.
TokenizedDocument(DocumentProto && document,std::vector<TokenizedSection> && tokenized_string_sections,std::vector<Section<int64_t>> && integer_sections,std::vector<Section<PropertyProto::VectorProto>> && vector_sections,JoinablePropertyGroup && joinable_property_group)79*8b6cd535SAndroid Build Coastguard Worker   explicit TokenizedDocument(
80*8b6cd535SAndroid Build Coastguard Worker       DocumentProto&& document,
81*8b6cd535SAndroid Build Coastguard Worker       std::vector<TokenizedSection>&& tokenized_string_sections,
82*8b6cd535SAndroid Build Coastguard Worker       std::vector<Section<int64_t>>&& integer_sections,
83*8b6cd535SAndroid Build Coastguard Worker       std::vector<Section<PropertyProto::VectorProto>>&& vector_sections,
84*8b6cd535SAndroid Build Coastguard Worker       JoinablePropertyGroup&& joinable_property_group)
85*8b6cd535SAndroid Build Coastguard Worker       : document_(std::move(document)),
86*8b6cd535SAndroid Build Coastguard Worker         tokenized_string_sections_(std::move(tokenized_string_sections)),
87*8b6cd535SAndroid Build Coastguard Worker         integer_sections_(std::move(integer_sections)),
88*8b6cd535SAndroid Build Coastguard Worker         vector_sections_(std::move(vector_sections)),
89*8b6cd535SAndroid Build Coastguard Worker         joinable_property_group_(std::move(joinable_property_group)) {}
90*8b6cd535SAndroid Build Coastguard Worker 
91*8b6cd535SAndroid Build Coastguard Worker   DocumentProto document_;
92*8b6cd535SAndroid Build Coastguard Worker   std::vector<TokenizedSection> tokenized_string_sections_;
93*8b6cd535SAndroid Build Coastguard Worker   std::vector<Section<int64_t>> integer_sections_;
94*8b6cd535SAndroid Build Coastguard Worker   std::vector<Section<PropertyProto::VectorProto>> vector_sections_;
95*8b6cd535SAndroid Build Coastguard Worker   JoinablePropertyGroup joinable_property_group_;
96*8b6cd535SAndroid Build Coastguard Worker };
97*8b6cd535SAndroid Build Coastguard Worker 
98*8b6cd535SAndroid Build Coastguard Worker }  // namespace lib
99*8b6cd535SAndroid Build Coastguard Worker }  // namespace icing
100*8b6cd535SAndroid Build Coastguard Worker 
101*8b6cd535SAndroid Build Coastguard Worker #endif  // ICING_STORE_TOKENIZED_DOCUMENT_H_
102