// xref: /aosp_15_r20/external/libtextclassifier/native/annotator/duration/duration.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
19*993b0882SAndroid Build Coastguard Worker 
20*993b0882SAndroid Build Coastguard Worker #include <string>
21*993b0882SAndroid Build Coastguard Worker #include <unordered_map>
22*993b0882SAndroid Build Coastguard Worker #include <unordered_set>
23*993b0882SAndroid Build Coastguard Worker #include <vector>
24*993b0882SAndroid Build Coastguard Worker 
25*993b0882SAndroid Build Coastguard Worker #include "annotator/feature-processor.h"
26*993b0882SAndroid Build Coastguard Worker #include "annotator/model_generated.h"
27*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h"
28*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
29*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib.h"
30*993b0882SAndroid Build Coastguard Worker 
31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
32*993b0882SAndroid Build Coastguard Worker 
33*993b0882SAndroid Build Coastguard Worker namespace internal {
// Unit of time that a single duration component is expressed in.
// UNKNOWN (-1) is the only negative value; the remaining units are numbered
// from the largest (WEEK = 0) to the smallest (SECOND = 4).
enum class DurationUnit {
  UNKNOWN = -1,
  WEEK = 0,
  DAY = 1,
  HOUR = 2,
  MINUTE = 3,
  SECOND = 4

  // NOTE: If we want to add MONTH and YEAR we'll have to think of different
  // parsing format, because MONTH and YEAR don't have a fixed number of
  // milliseconds, unlike week/day/hour/minute/second. We ignore the daylight
  // savings time and assume the day is always 24 hours.
};
47*993b0882SAndroid Build Coastguard Worker 
48*993b0882SAndroid Build Coastguard Worker // Prepares the mapping between token values and duration unit types.
49*993b0882SAndroid Build Coastguard Worker std::unordered_map<std::string, internal::DurationUnit>
50*993b0882SAndroid Build Coastguard Worker BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions* options,
51*993b0882SAndroid Build Coastguard Worker                                 const UniLib* unilib);
52*993b0882SAndroid Build Coastguard Worker 
53*993b0882SAndroid Build Coastguard Worker // Creates a set of strings from a flatbuffer string vector.
54*993b0882SAndroid Build Coastguard Worker std::unordered_set<std::string> BuildStringSet(
55*993b0882SAndroid Build Coastguard Worker     const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*
56*993b0882SAndroid Build Coastguard Worker         strings,
57*993b0882SAndroid Build Coastguard Worker     const UniLib* unilib);
58*993b0882SAndroid Build Coastguard Worker 
59*993b0882SAndroid Build Coastguard Worker // Creates a set of ints from a flatbuffer int vector.
60*993b0882SAndroid Build Coastguard Worker std::unordered_set<int32> BuildInt32Set(const flatbuffers::Vector<int32>* ints);
61*993b0882SAndroid Build Coastguard Worker 
62*993b0882SAndroid Build Coastguard Worker }  // namespace internal
63*993b0882SAndroid Build Coastguard Worker 
64*993b0882SAndroid Build Coastguard Worker // Annotator of duration expressions like "3 minutes 30 seconds".
65*993b0882SAndroid Build Coastguard Worker class DurationAnnotator {
66*993b0882SAndroid Build Coastguard Worker  public:
DurationAnnotator(const DurationAnnotatorOptions * options,const FeatureProcessor * feature_processor,const UniLib * unilib)67*993b0882SAndroid Build Coastguard Worker   explicit DurationAnnotator(const DurationAnnotatorOptions* options,
68*993b0882SAndroid Build Coastguard Worker                              const FeatureProcessor* feature_processor,
69*993b0882SAndroid Build Coastguard Worker                              const UniLib* unilib)
70*993b0882SAndroid Build Coastguard Worker       : options_(options),
71*993b0882SAndroid Build Coastguard Worker         feature_processor_(feature_processor),
72*993b0882SAndroid Build Coastguard Worker         unilib_(unilib),
73*993b0882SAndroid Build Coastguard Worker         token_value_to_duration_unit_(
74*993b0882SAndroid Build Coastguard Worker             internal::BuildTokenToDurationUnitMapping(options, unilib)),
75*993b0882SAndroid Build Coastguard Worker         filler_expressions_(
76*993b0882SAndroid Build Coastguard Worker             internal::BuildStringSet(options->filler_expressions(), unilib)),
77*993b0882SAndroid Build Coastguard Worker         half_expressions_(
78*993b0882SAndroid Build Coastguard Worker             internal::BuildStringSet(options->half_expressions(), unilib)),
79*993b0882SAndroid Build Coastguard Worker         sub_token_separator_codepoints_(internal::BuildInt32Set(
80*993b0882SAndroid Build Coastguard Worker             options->sub_token_separator_codepoints())) {}
81*993b0882SAndroid Build Coastguard Worker 
82*993b0882SAndroid Build Coastguard Worker   // Classifies given text, and if it is a duration, it passes the result in
83*993b0882SAndroid Build Coastguard Worker   // 'classification_result' and returns true, otherwise returns false.
84*993b0882SAndroid Build Coastguard Worker   bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
85*993b0882SAndroid Build Coastguard Worker                     AnnotationUsecase annotation_usecase,
86*993b0882SAndroid Build Coastguard Worker                     ClassificationResult* classification_result) const;
87*993b0882SAndroid Build Coastguard Worker 
88*993b0882SAndroid Build Coastguard Worker   // Finds all duration instances in the input text.
89*993b0882SAndroid Build Coastguard Worker   bool FindAll(const UnicodeText& context, const std::vector<Token>& tokens,
90*993b0882SAndroid Build Coastguard Worker                AnnotationUsecase annotation_usecase, ModeFlag mode,
91*993b0882SAndroid Build Coastguard Worker                std::vector<AnnotatedSpan>* results) const;
92*993b0882SAndroid Build Coastguard Worker 
93*993b0882SAndroid Build Coastguard Worker  private:
94*993b0882SAndroid Build Coastguard Worker   // Represents a component of duration parsed from text (e.g. "3 hours" from
95*993b0882SAndroid Build Coastguard Worker   // the expression "3 hours and 20 minutes").
96*993b0882SAndroid Build Coastguard Worker   struct ParsedDurationAtom {
97*993b0882SAndroid Build Coastguard Worker     // Unit of the duration.
98*993b0882SAndroid Build Coastguard Worker     internal::DurationUnit unit = internal::DurationUnit::UNKNOWN;
99*993b0882SAndroid Build Coastguard Worker 
100*993b0882SAndroid Build Coastguard Worker     // Quantity of the duration unit.
101*993b0882SAndroid Build Coastguard Worker     double value = 0;
102*993b0882SAndroid Build Coastguard Worker 
103*993b0882SAndroid Build Coastguard Worker     // True, if half an unit was specified (either in addition, or exclusively).
104*993b0882SAndroid Build Coastguard Worker     // E.g. "hour and a half".
105*993b0882SAndroid Build Coastguard Worker     // NOTE: Quarter, three-quarters etc. is not supported.
106*993b0882SAndroid Build Coastguard Worker     bool plus_half = false;
107*993b0882SAndroid Build Coastguard Worker 
HalfParsedDurationAtom108*993b0882SAndroid Build Coastguard Worker     static ParsedDurationAtom Half() {
109*993b0882SAndroid Build Coastguard Worker       ParsedDurationAtom result;
110*993b0882SAndroid Build Coastguard Worker       result.plus_half = true;
111*993b0882SAndroid Build Coastguard Worker       return result;
112*993b0882SAndroid Build Coastguard Worker     }
113*993b0882SAndroid Build Coastguard Worker   };
114*993b0882SAndroid Build Coastguard Worker 
115*993b0882SAndroid Build Coastguard Worker   // Starts consuming tokens and returns the index past the last consumed token.
116*993b0882SAndroid Build Coastguard Worker   int FindDurationStartingAt(const UnicodeText& context,
117*993b0882SAndroid Build Coastguard Worker                              const std::vector<Token>& tokens,
118*993b0882SAndroid Build Coastguard Worker                              int start_token_index,
119*993b0882SAndroid Build Coastguard Worker                              AnnotatedSpan* result) const;
120*993b0882SAndroid Build Coastguard Worker 
121*993b0882SAndroid Build Coastguard Worker   bool ParseQuantityToken(const Token& token, ParsedDurationAtom* value) const;
122*993b0882SAndroid Build Coastguard Worker   bool ParseDurationUnitToken(const Token& token,
123*993b0882SAndroid Build Coastguard Worker                               internal::DurationUnit* duration_unit) const;
124*993b0882SAndroid Build Coastguard Worker   bool ParseQuantityDurationUnitToken(const Token& token,
125*993b0882SAndroid Build Coastguard Worker                                       ParsedDurationAtom* value) const;
126*993b0882SAndroid Build Coastguard Worker   bool ParseFillerToken(const Token& token) const;
127*993b0882SAndroid Build Coastguard Worker 
128*993b0882SAndroid Build Coastguard Worker   int64 ParsedDurationAtomsToMillis(
129*993b0882SAndroid Build Coastguard Worker       const std::vector<ParsedDurationAtom>& atoms) const;
130*993b0882SAndroid Build Coastguard Worker 
131*993b0882SAndroid Build Coastguard Worker   const DurationAnnotatorOptions* options_;
132*993b0882SAndroid Build Coastguard Worker   const FeatureProcessor* feature_processor_;
133*993b0882SAndroid Build Coastguard Worker   const UniLib* unilib_;
134*993b0882SAndroid Build Coastguard Worker   const std::unordered_map<std::string, internal::DurationUnit>
135*993b0882SAndroid Build Coastguard Worker       token_value_to_duration_unit_;
136*993b0882SAndroid Build Coastguard Worker   const std::unordered_set<std::string> filler_expressions_;
137*993b0882SAndroid Build Coastguard Worker   const std::unordered_set<std::string> half_expressions_;
138*993b0882SAndroid Build Coastguard Worker   const std::unordered_set<int32> sub_token_separator_codepoints_;
139*993b0882SAndroid Build Coastguard Worker };
140*993b0882SAndroid Build Coastguard Worker 
141*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
142*993b0882SAndroid Build Coastguard Worker 
143*993b0882SAndroid Build Coastguard Worker #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
144