1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker // A token based context-free grammar matcher. 18*993b0882SAndroid Build Coastguard Worker // 19*993b0882SAndroid Build Coastguard Worker // A parser passes token to the matcher: literal terminal strings and token 20*993b0882SAndroid Build Coastguard Worker // types. 21*993b0882SAndroid Build Coastguard Worker // The parser passes each token along with the [begin, end) position range 22*993b0882SAndroid Build Coastguard Worker // in which it occurs. So for an input string "Groundhog February 2, 2007", the 23*993b0882SAndroid Build Coastguard Worker // parser would tell the matcher that: 24*993b0882SAndroid Build Coastguard Worker // 25*993b0882SAndroid Build Coastguard Worker // "Groundhog" occurs at [0, 9) 26*993b0882SAndroid Build Coastguard Worker // "February" occurs at [9, 18) 27*993b0882SAndroid Build Coastguard Worker // <digits> occurs at [18, 20) 28*993b0882SAndroid Build Coastguard Worker // "," occurs at [20, 21) 29*993b0882SAndroid Build Coastguard Worker // <digits> occurs at [21, 26) 30*993b0882SAndroid Build Coastguard Worker // 31*993b0882SAndroid Build Coastguard Worker // Multiple overlapping symbols can be passed. 32*993b0882SAndroid Build Coastguard Worker // The only constraint on symbol order is that they have to be passed in 33*993b0882SAndroid Build Coastguard Worker // left-to-right order, strictly speaking, their "end" positions must be 34*993b0882SAndroid Build Coastguard Worker // nondecreasing. This constraint allows a more efficient matching algorithm. 35*993b0882SAndroid Build Coastguard Worker // The "begin" positions can be in any order. 36*993b0882SAndroid Build Coastguard Worker 37*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ 38*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ 39*993b0882SAndroid Build Coastguard Worker 40*993b0882SAndroid Build Coastguard Worker #include <array> 41*993b0882SAndroid Build Coastguard Worker #include <functional> 42*993b0882SAndroid Build Coastguard Worker #include <vector> 43*993b0882SAndroid Build Coastguard Worker 44*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h" 45*993b0882SAndroid Build Coastguard Worker #include "utils/base/arena.h" 46*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/parsing/chart.h" 47*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/parsing/derivation.h" 48*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/parsing/parse-tree.h" 49*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/rules_generated.h" 50*993b0882SAndroid Build Coastguard Worker #include "utils/strings/stringpiece.h" 51*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib.h" 52*993b0882SAndroid Build Coastguard Worker 53*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3::grammar { 54*993b0882SAndroid Build Coastguard Worker 55*993b0882SAndroid Build Coastguard Worker class Matcher { 56*993b0882SAndroid Build Coastguard Worker public: Matcher(const UniLib * unilib,const RulesSet * rules,const std::vector<const RulesSet_::Rules * > rules_shards,UnsafeArena * arena)57*993b0882SAndroid Build Coastguard Worker explicit Matcher(const UniLib* unilib, const RulesSet* rules, 58*993b0882SAndroid Build Coastguard Worker const std::vector<const RulesSet_::Rules*> rules_shards, 59*993b0882SAndroid Build Coastguard Worker UnsafeArena* arena) 60*993b0882SAndroid Build Coastguard Worker : unilib_(*unilib), 61*993b0882SAndroid Build Coastguard Worker arena_(arena), 62*993b0882SAndroid Build Coastguard Worker last_end_(std::numeric_limits<int>().lowest()), 63*993b0882SAndroid Build Coastguard Worker rules_(rules), 64*993b0882SAndroid Build Coastguard Worker rules_shards_(rules_shards), 65*993b0882SAndroid Build Coastguard Worker pending_items_(nullptr), 66*993b0882SAndroid Build Coastguard Worker pending_exclusion_items_(nullptr) { 67*993b0882SAndroid Build Coastguard Worker TC3_CHECK_NE(rules, nullptr); 68*993b0882SAndroid Build Coastguard Worker } 69*993b0882SAndroid Build Coastguard Worker Matcher(const UniLib * unilib,const RulesSet * rules,UnsafeArena * arena)70*993b0882SAndroid Build Coastguard Worker explicit Matcher(const UniLib* unilib, const RulesSet* rules, 71*993b0882SAndroid Build Coastguard Worker UnsafeArena* arena) 72*993b0882SAndroid Build Coastguard Worker : Matcher(unilib, rules, {}, arena) { 73*993b0882SAndroid Build Coastguard Worker rules_shards_.reserve(rules->rules()->size()); 74*993b0882SAndroid Build Coastguard Worker rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(), 75*993b0882SAndroid Build Coastguard Worker rules->rules()->end()); 76*993b0882SAndroid Build Coastguard Worker } 77*993b0882SAndroid Build Coastguard Worker 78*993b0882SAndroid Build Coastguard Worker // Finish the matching. 79*993b0882SAndroid Build Coastguard Worker void Finish(); 80*993b0882SAndroid Build Coastguard Worker 81*993b0882SAndroid Build Coastguard Worker // Tells the matcher that the given terminal was found occupying position 82*993b0882SAndroid Build Coastguard Worker // range [begin, end) in the input. 83*993b0882SAndroid Build Coastguard Worker // The matcher may invoke callback functions before returning, if this 84*993b0882SAndroid Build Coastguard Worker // terminal triggers any new matches for rules in the grammar. 85*993b0882SAndroid Build Coastguard Worker // Calls to AddTerminal() and AddParseTree() must be in left-to-right order, 86*993b0882SAndroid Build Coastguard Worker // that is, the sequence of `end` values must be non-decreasing. 87*993b0882SAndroid Build Coastguard Worker void AddTerminal(const CodepointSpan codepoint_span, const int match_offset, 88*993b0882SAndroid Build Coastguard Worker StringPiece terminal); AddTerminal(const CodepointIndex begin,const CodepointIndex end,StringPiece terminal)89*993b0882SAndroid Build Coastguard Worker void AddTerminal(const CodepointIndex begin, const CodepointIndex end, 90*993b0882SAndroid Build Coastguard Worker StringPiece terminal) { 91*993b0882SAndroid Build Coastguard Worker AddTerminal(CodepointSpan{begin, end}, begin, terminal); 92*993b0882SAndroid Build Coastguard Worker } 93*993b0882SAndroid Build Coastguard Worker 94*993b0882SAndroid Build Coastguard Worker // Adds predefined parse tree. 95*993b0882SAndroid Build Coastguard Worker void AddParseTree(ParseTree* parse_tree); 96*993b0882SAndroid Build Coastguard Worker chart()97*993b0882SAndroid Build Coastguard Worker const Chart<> chart() const { return chart_; } 98*993b0882SAndroid Build Coastguard Worker 99*993b0882SAndroid Build Coastguard Worker private: 100*993b0882SAndroid Build Coastguard Worker // Process matches from lhs set. 101*993b0882SAndroid Build Coastguard Worker void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset, 102*993b0882SAndroid Build Coastguard Worker const int whitespace_gap, 103*993b0882SAndroid Build Coastguard Worker const std::function<void(ParseTree*)>& initializer_fn, 104*993b0882SAndroid Build Coastguard Worker const RulesSet_::LhsSet* lhs_set); 105*993b0882SAndroid Build Coastguard Worker 106*993b0882SAndroid Build Coastguard Worker // Queues a newly created match item. 107*993b0882SAndroid Build Coastguard Worker void QueueForProcessing(ParseTree* item); 108*993b0882SAndroid Build Coastguard Worker 109*993b0882SAndroid Build Coastguard Worker // Queues a match item for later post checking of the exclusion condition. 110*993b0882SAndroid Build Coastguard Worker // For exclusions we need to check that the `item->excluded_nonterminal` 111*993b0882SAndroid Build Coastguard Worker // doesn't match the same span. As we cannot know which matches have already 112*993b0882SAndroid Build Coastguard Worker // been added, we queue the item for later post checking - once all matches 113*993b0882SAndroid Build Coastguard Worker // up to `item->codepoint_span.second` have been added. 114*993b0882SAndroid Build Coastguard Worker void QueueForPostCheck(ExclusionNode* item); 115*993b0882SAndroid Build Coastguard Worker 116*993b0882SAndroid Build Coastguard Worker // Adds pending items to the chart, possibly generating new matches as a 117*993b0882SAndroid Build Coastguard Worker // result. 118*993b0882SAndroid Build Coastguard Worker void ProcessPendingSet(); 119*993b0882SAndroid Build Coastguard Worker 120*993b0882SAndroid Build Coastguard Worker // Checks all pending exclusion matches that their exclusion condition is 121*993b0882SAndroid Build Coastguard Worker // fulfilled. 122*993b0882SAndroid Build Coastguard Worker void ProcessPendingExclusionMatches(); 123*993b0882SAndroid Build Coastguard Worker 124*993b0882SAndroid Build Coastguard Worker UniLib unilib_; 125*993b0882SAndroid Build Coastguard Worker 126*993b0882SAndroid Build Coastguard Worker // Memory arena for match allocation. 127*993b0882SAndroid Build Coastguard Worker UnsafeArena* arena_; 128*993b0882SAndroid Build Coastguard Worker 129*993b0882SAndroid Build Coastguard Worker // The end position of the most recent match or terminal, for sanity 130*993b0882SAndroid Build Coastguard Worker // checking. 131*993b0882SAndroid Build Coastguard Worker int last_end_; 132*993b0882SAndroid Build Coastguard Worker 133*993b0882SAndroid Build Coastguard Worker // Rules. 134*993b0882SAndroid Build Coastguard Worker const RulesSet* rules_; 135*993b0882SAndroid Build Coastguard Worker // The active rule shards. 136*993b0882SAndroid Build Coastguard Worker std::vector<const RulesSet_::Rules*> rules_shards_; 137*993b0882SAndroid Build Coastguard Worker 138*993b0882SAndroid Build Coastguard Worker // The set of items pending to be added to the chart as a singly-linked list. 139*993b0882SAndroid Build Coastguard Worker ParseTree* pending_items_; 140*993b0882SAndroid Build Coastguard Worker 141*993b0882SAndroid Build Coastguard Worker // The set of items pending to be post-checked as a singly-linked list. 142*993b0882SAndroid Build Coastguard Worker ExclusionNode* pending_exclusion_items_; 143*993b0882SAndroid Build Coastguard Worker 144*993b0882SAndroid Build Coastguard Worker // The chart data structure: a hashtable containing all matches, indexed by 145*993b0882SAndroid Build Coastguard Worker // their end positions. 146*993b0882SAndroid Build Coastguard Worker Chart<> chart_; 147*993b0882SAndroid Build Coastguard Worker }; 148*993b0882SAndroid Build Coastguard Worker 149*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3::grammar 150*993b0882SAndroid Build Coastguard Worker 151*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ 152