/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ 18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include <string> 21*993b0882SAndroid Build Coastguard Worker #include <unordered_map> 22*993b0882SAndroid Build Coastguard Worker #include <unordered_set> 23*993b0882SAndroid Build Coastguard Worker #include <vector> 24*993b0882SAndroid Build Coastguard Worker 25*993b0882SAndroid Build Coastguard Worker #include "annotator/feature-processor.h" 26*993b0882SAndroid Build Coastguard Worker #include "annotator/model_generated.h" 27*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h" 28*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h" 29*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib.h" 30*993b0882SAndroid Build Coastguard Worker 31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 32*993b0882SAndroid Build Coastguard Worker 33*993b0882SAndroid Build Coastguard Worker namespace internal { 34*993b0882SAndroid Build Coastguard Worker enum class DurationUnit { 35*993b0882SAndroid Build Coastguard Worker UNKNOWN = -1, 36*993b0882SAndroid Build Coastguard Worker WEEK = 0, 37*993b0882SAndroid Build Coastguard Worker DAY = 1, 38*993b0882SAndroid Build Coastguard Worker HOUR = 2, 39*993b0882SAndroid Build Coastguard Worker MINUTE = 3, 40*993b0882SAndroid Build Coastguard Worker SECOND = 4 41*993b0882SAndroid Build Coastguard Worker 42*993b0882SAndroid Build Coastguard Worker // NOTE: If we want to add MONTH and YEAR we'll have to think of different 43*993b0882SAndroid Build Coastguard Worker // parsing format, because MONTH and YEAR don't have a fixed number of 44*993b0882SAndroid Build Coastguard Worker // 
milliseconds, unlike week/day/hour/minute/second. We ignore the daylight 45*993b0882SAndroid Build Coastguard Worker // savings time and assume the day is always 24 hours. 46*993b0882SAndroid Build Coastguard Worker }; 47*993b0882SAndroid Build Coastguard Worker 48*993b0882SAndroid Build Coastguard Worker // Prepares the mapping between token values and duration unit types. 49*993b0882SAndroid Build Coastguard Worker std::unordered_map<std::string, internal::DurationUnit> 50*993b0882SAndroid Build Coastguard Worker BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions* options, 51*993b0882SAndroid Build Coastguard Worker const UniLib* unilib); 52*993b0882SAndroid Build Coastguard Worker 53*993b0882SAndroid Build Coastguard Worker // Creates a set of strings from a flatbuffer string vector. 54*993b0882SAndroid Build Coastguard Worker std::unordered_set<std::string> BuildStringSet( 55*993b0882SAndroid Build Coastguard Worker const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>* 56*993b0882SAndroid Build Coastguard Worker strings, 57*993b0882SAndroid Build Coastguard Worker const UniLib* unilib); 58*993b0882SAndroid Build Coastguard Worker 59*993b0882SAndroid Build Coastguard Worker // Creates a set of ints from a flatbuffer int vector. 60*993b0882SAndroid Build Coastguard Worker std::unordered_set<int32> BuildInt32Set(const flatbuffers::Vector<int32>* ints); 61*993b0882SAndroid Build Coastguard Worker 62*993b0882SAndroid Build Coastguard Worker } // namespace internal 63*993b0882SAndroid Build Coastguard Worker 64*993b0882SAndroid Build Coastguard Worker // Annotator of duration expressions like "3 minutes 30 seconds". 
65*993b0882SAndroid Build Coastguard Worker class DurationAnnotator { 66*993b0882SAndroid Build Coastguard Worker public: DurationAnnotator(const DurationAnnotatorOptions * options,const FeatureProcessor * feature_processor,const UniLib * unilib)67*993b0882SAndroid Build Coastguard Worker explicit DurationAnnotator(const DurationAnnotatorOptions* options, 68*993b0882SAndroid Build Coastguard Worker const FeatureProcessor* feature_processor, 69*993b0882SAndroid Build Coastguard Worker const UniLib* unilib) 70*993b0882SAndroid Build Coastguard Worker : options_(options), 71*993b0882SAndroid Build Coastguard Worker feature_processor_(feature_processor), 72*993b0882SAndroid Build Coastguard Worker unilib_(unilib), 73*993b0882SAndroid Build Coastguard Worker token_value_to_duration_unit_( 74*993b0882SAndroid Build Coastguard Worker internal::BuildTokenToDurationUnitMapping(options, unilib)), 75*993b0882SAndroid Build Coastguard Worker filler_expressions_( 76*993b0882SAndroid Build Coastguard Worker internal::BuildStringSet(options->filler_expressions(), unilib)), 77*993b0882SAndroid Build Coastguard Worker half_expressions_( 78*993b0882SAndroid Build Coastguard Worker internal::BuildStringSet(options->half_expressions(), unilib)), 79*993b0882SAndroid Build Coastguard Worker sub_token_separator_codepoints_(internal::BuildInt32Set( 80*993b0882SAndroid Build Coastguard Worker options->sub_token_separator_codepoints())) {} 81*993b0882SAndroid Build Coastguard Worker 82*993b0882SAndroid Build Coastguard Worker // Classifies given text, and if it is a duration, it passes the result in 83*993b0882SAndroid Build Coastguard Worker // 'classification_result' and returns true, otherwise returns false. 
84*993b0882SAndroid Build Coastguard Worker bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices, 85*993b0882SAndroid Build Coastguard Worker AnnotationUsecase annotation_usecase, 86*993b0882SAndroid Build Coastguard Worker ClassificationResult* classification_result) const; 87*993b0882SAndroid Build Coastguard Worker 88*993b0882SAndroid Build Coastguard Worker // Finds all duration instances in the input text. 89*993b0882SAndroid Build Coastguard Worker bool FindAll(const UnicodeText& context, const std::vector<Token>& tokens, 90*993b0882SAndroid Build Coastguard Worker AnnotationUsecase annotation_usecase, ModeFlag mode, 91*993b0882SAndroid Build Coastguard Worker std::vector<AnnotatedSpan>* results) const; 92*993b0882SAndroid Build Coastguard Worker 93*993b0882SAndroid Build Coastguard Worker private: 94*993b0882SAndroid Build Coastguard Worker // Represents a component of duration parsed from text (e.g. "3 hours" from 95*993b0882SAndroid Build Coastguard Worker // the expression "3 hours and 20 minutes"). 96*993b0882SAndroid Build Coastguard Worker struct ParsedDurationAtom { 97*993b0882SAndroid Build Coastguard Worker // Unit of the duration. 98*993b0882SAndroid Build Coastguard Worker internal::DurationUnit unit = internal::DurationUnit::UNKNOWN; 99*993b0882SAndroid Build Coastguard Worker 100*993b0882SAndroid Build Coastguard Worker // Quantity of the duration unit. 101*993b0882SAndroid Build Coastguard Worker double value = 0; 102*993b0882SAndroid Build Coastguard Worker 103*993b0882SAndroid Build Coastguard Worker // True, if half an unit was specified (either in addition, or exclusively). 104*993b0882SAndroid Build Coastguard Worker // E.g. "hour and a half". 105*993b0882SAndroid Build Coastguard Worker // NOTE: Quarter, three-quarters etc. is not supported. 
106*993b0882SAndroid Build Coastguard Worker bool plus_half = false; 107*993b0882SAndroid Build Coastguard Worker HalfParsedDurationAtom108*993b0882SAndroid Build Coastguard Worker static ParsedDurationAtom Half() { 109*993b0882SAndroid Build Coastguard Worker ParsedDurationAtom result; 110*993b0882SAndroid Build Coastguard Worker result.plus_half = true; 111*993b0882SAndroid Build Coastguard Worker return result; 112*993b0882SAndroid Build Coastguard Worker } 113*993b0882SAndroid Build Coastguard Worker }; 114*993b0882SAndroid Build Coastguard Worker 115*993b0882SAndroid Build Coastguard Worker // Starts consuming tokens and returns the index past the last consumed token. 116*993b0882SAndroid Build Coastguard Worker int FindDurationStartingAt(const UnicodeText& context, 117*993b0882SAndroid Build Coastguard Worker const std::vector<Token>& tokens, 118*993b0882SAndroid Build Coastguard Worker int start_token_index, 119*993b0882SAndroid Build Coastguard Worker AnnotatedSpan* result) const; 120*993b0882SAndroid Build Coastguard Worker 121*993b0882SAndroid Build Coastguard Worker bool ParseQuantityToken(const Token& token, ParsedDurationAtom* value) const; 122*993b0882SAndroid Build Coastguard Worker bool ParseDurationUnitToken(const Token& token, 123*993b0882SAndroid Build Coastguard Worker internal::DurationUnit* duration_unit) const; 124*993b0882SAndroid Build Coastguard Worker bool ParseQuantityDurationUnitToken(const Token& token, 125*993b0882SAndroid Build Coastguard Worker ParsedDurationAtom* value) const; 126*993b0882SAndroid Build Coastguard Worker bool ParseFillerToken(const Token& token) const; 127*993b0882SAndroid Build Coastguard Worker 128*993b0882SAndroid Build Coastguard Worker int64 ParsedDurationAtomsToMillis( 129*993b0882SAndroid Build Coastguard Worker const std::vector<ParsedDurationAtom>& atoms) const; 130*993b0882SAndroid Build Coastguard Worker 131*993b0882SAndroid Build Coastguard Worker const DurationAnnotatorOptions* options_; 
132*993b0882SAndroid Build Coastguard Worker const FeatureProcessor* feature_processor_; 133*993b0882SAndroid Build Coastguard Worker const UniLib* unilib_; 134*993b0882SAndroid Build Coastguard Worker const std::unordered_map<std::string, internal::DurationUnit> 135*993b0882SAndroid Build Coastguard Worker token_value_to_duration_unit_; 136*993b0882SAndroid Build Coastguard Worker const std::unordered_set<std::string> filler_expressions_; 137*993b0882SAndroid Build Coastguard Worker const std::unordered_set<std::string> half_expressions_; 138*993b0882SAndroid Build Coastguard Worker const std::unordered_set<int32> sub_token_separator_codepoints_; 139*993b0882SAndroid Build Coastguard Worker }; 140*993b0882SAndroid Build Coastguard Worker 141*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3 142*993b0882SAndroid Build Coastguard Worker 143*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ 144