xref: /aosp_15_r20/external/libtextclassifier/native/utils/grammar/parsing/matcher.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker // A token based context-free grammar matcher.
18*993b0882SAndroid Build Coastguard Worker //
19*993b0882SAndroid Build Coastguard Worker // A parser passes token to the matcher: literal terminal strings and token
20*993b0882SAndroid Build Coastguard Worker // types.
21*993b0882SAndroid Build Coastguard Worker // The parser passes each token along with the [begin, end) position range
22*993b0882SAndroid Build Coastguard Worker // in which it occurs.  So for an input string "Groundhog February 2, 2007", the
23*993b0882SAndroid Build Coastguard Worker // parser would tell the matcher that:
24*993b0882SAndroid Build Coastguard Worker //
25*993b0882SAndroid Build Coastguard Worker //     "Groundhog" occurs at [0, 9)
26*993b0882SAndroid Build Coastguard Worker //     "February" occurs at [9, 18)
27*993b0882SAndroid Build Coastguard Worker //     <digits> occurs at [18, 20)
28*993b0882SAndroid Build Coastguard Worker //     "," occurs at [20, 21)
29*993b0882SAndroid Build Coastguard Worker //     <digits> occurs at [21, 26)
30*993b0882SAndroid Build Coastguard Worker //
31*993b0882SAndroid Build Coastguard Worker // Multiple overlapping symbols can be passed.
32*993b0882SAndroid Build Coastguard Worker // The only constraint on symbol order is that they have to be passed in
33*993b0882SAndroid Build Coastguard Worker // left-to-right order, strictly speaking, their "end" positions must be
34*993b0882SAndroid Build Coastguard Worker // nondecreasing. This constraint allows a more efficient matching algorithm.
35*993b0882SAndroid Build Coastguard Worker // The "begin" positions can be in any order.
36*993b0882SAndroid Build Coastguard Worker 
37*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_
38*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_
39*993b0882SAndroid Build Coastguard Worker 
40*993b0882SAndroid Build Coastguard Worker #include <array>
41*993b0882SAndroid Build Coastguard Worker #include <functional>
42*993b0882SAndroid Build Coastguard Worker #include <vector>
43*993b0882SAndroid Build Coastguard Worker 
44*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h"
45*993b0882SAndroid Build Coastguard Worker #include "utils/base/arena.h"
46*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/parsing/chart.h"
47*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/parsing/derivation.h"
48*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/parsing/parse-tree.h"
49*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/rules_generated.h"
50*993b0882SAndroid Build Coastguard Worker #include "utils/strings/stringpiece.h"
51*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib.h"
52*993b0882SAndroid Build Coastguard Worker 
53*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3::grammar {
54*993b0882SAndroid Build Coastguard Worker 
55*993b0882SAndroid Build Coastguard Worker class Matcher {
56*993b0882SAndroid Build Coastguard Worker  public:
Matcher(const UniLib * unilib,const RulesSet * rules,const std::vector<const RulesSet_::Rules * > rules_shards,UnsafeArena * arena)57*993b0882SAndroid Build Coastguard Worker   explicit Matcher(const UniLib* unilib, const RulesSet* rules,
58*993b0882SAndroid Build Coastguard Worker                    const std::vector<const RulesSet_::Rules*> rules_shards,
59*993b0882SAndroid Build Coastguard Worker                    UnsafeArena* arena)
60*993b0882SAndroid Build Coastguard Worker       : unilib_(*unilib),
61*993b0882SAndroid Build Coastguard Worker         arena_(arena),
62*993b0882SAndroid Build Coastguard Worker         last_end_(std::numeric_limits<int>().lowest()),
63*993b0882SAndroid Build Coastguard Worker         rules_(rules),
64*993b0882SAndroid Build Coastguard Worker         rules_shards_(rules_shards),
65*993b0882SAndroid Build Coastguard Worker         pending_items_(nullptr),
66*993b0882SAndroid Build Coastguard Worker         pending_exclusion_items_(nullptr) {
67*993b0882SAndroid Build Coastguard Worker     TC3_CHECK_NE(rules, nullptr);
68*993b0882SAndroid Build Coastguard Worker   }
69*993b0882SAndroid Build Coastguard Worker 
Matcher(const UniLib * unilib,const RulesSet * rules,UnsafeArena * arena)70*993b0882SAndroid Build Coastguard Worker   explicit Matcher(const UniLib* unilib, const RulesSet* rules,
71*993b0882SAndroid Build Coastguard Worker                    UnsafeArena* arena)
72*993b0882SAndroid Build Coastguard Worker       : Matcher(unilib, rules, {}, arena) {
73*993b0882SAndroid Build Coastguard Worker     rules_shards_.reserve(rules->rules()->size());
74*993b0882SAndroid Build Coastguard Worker     rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(),
75*993b0882SAndroid Build Coastguard Worker                          rules->rules()->end());
76*993b0882SAndroid Build Coastguard Worker   }
77*993b0882SAndroid Build Coastguard Worker 
78*993b0882SAndroid Build Coastguard Worker   // Finish the matching.
79*993b0882SAndroid Build Coastguard Worker   void Finish();
80*993b0882SAndroid Build Coastguard Worker 
81*993b0882SAndroid Build Coastguard Worker   // Tells the matcher that the given terminal was found occupying position
82*993b0882SAndroid Build Coastguard Worker   // range [begin, end) in the input.
83*993b0882SAndroid Build Coastguard Worker   // The matcher may invoke callback functions before returning, if this
84*993b0882SAndroid Build Coastguard Worker   // terminal triggers any new matches for rules in the grammar.
85*993b0882SAndroid Build Coastguard Worker   // Calls to AddTerminal() and AddParseTree() must be in left-to-right order,
86*993b0882SAndroid Build Coastguard Worker   // that is, the sequence of `end` values must be non-decreasing.
87*993b0882SAndroid Build Coastguard Worker   void AddTerminal(const CodepointSpan codepoint_span, const int match_offset,
88*993b0882SAndroid Build Coastguard Worker                    StringPiece terminal);
AddTerminal(const CodepointIndex begin,const CodepointIndex end,StringPiece terminal)89*993b0882SAndroid Build Coastguard Worker   void AddTerminal(const CodepointIndex begin, const CodepointIndex end,
90*993b0882SAndroid Build Coastguard Worker                    StringPiece terminal) {
91*993b0882SAndroid Build Coastguard Worker     AddTerminal(CodepointSpan{begin, end}, begin, terminal);
92*993b0882SAndroid Build Coastguard Worker   }
93*993b0882SAndroid Build Coastguard Worker 
94*993b0882SAndroid Build Coastguard Worker   // Adds predefined parse tree.
95*993b0882SAndroid Build Coastguard Worker   void AddParseTree(ParseTree* parse_tree);
96*993b0882SAndroid Build Coastguard Worker 
chart()97*993b0882SAndroid Build Coastguard Worker   const Chart<> chart() const { return chart_; }
98*993b0882SAndroid Build Coastguard Worker 
99*993b0882SAndroid Build Coastguard Worker  private:
100*993b0882SAndroid Build Coastguard Worker   // Process matches from lhs set.
101*993b0882SAndroid Build Coastguard Worker   void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset,
102*993b0882SAndroid Build Coastguard Worker                      const int whitespace_gap,
103*993b0882SAndroid Build Coastguard Worker                      const std::function<void(ParseTree*)>& initializer_fn,
104*993b0882SAndroid Build Coastguard Worker                      const RulesSet_::LhsSet* lhs_set);
105*993b0882SAndroid Build Coastguard Worker 
106*993b0882SAndroid Build Coastguard Worker   // Queues a newly created match item.
107*993b0882SAndroid Build Coastguard Worker   void QueueForProcessing(ParseTree* item);
108*993b0882SAndroid Build Coastguard Worker 
109*993b0882SAndroid Build Coastguard Worker   // Queues a match item for later post checking of the exclusion condition.
110*993b0882SAndroid Build Coastguard Worker   // For exclusions we need to check that the `item->excluded_nonterminal`
111*993b0882SAndroid Build Coastguard Worker   // doesn't match the same span. As we cannot know which matches have already
112*993b0882SAndroid Build Coastguard Worker   // been added, we queue the item for later post checking - once all matches
113*993b0882SAndroid Build Coastguard Worker   // up to `item->codepoint_span.second` have been added.
114*993b0882SAndroid Build Coastguard Worker   void QueueForPostCheck(ExclusionNode* item);
115*993b0882SAndroid Build Coastguard Worker 
116*993b0882SAndroid Build Coastguard Worker   // Adds pending items to the chart, possibly generating new matches as a
117*993b0882SAndroid Build Coastguard Worker   // result.
118*993b0882SAndroid Build Coastguard Worker   void ProcessPendingSet();
119*993b0882SAndroid Build Coastguard Worker 
120*993b0882SAndroid Build Coastguard Worker   // Checks all pending exclusion matches that their exclusion condition is
121*993b0882SAndroid Build Coastguard Worker   // fulfilled.
122*993b0882SAndroid Build Coastguard Worker   void ProcessPendingExclusionMatches();
123*993b0882SAndroid Build Coastguard Worker 
124*993b0882SAndroid Build Coastguard Worker   UniLib unilib_;
125*993b0882SAndroid Build Coastguard Worker 
126*993b0882SAndroid Build Coastguard Worker   // Memory arena for match allocation.
127*993b0882SAndroid Build Coastguard Worker   UnsafeArena* arena_;
128*993b0882SAndroid Build Coastguard Worker 
129*993b0882SAndroid Build Coastguard Worker   // The end position of the most recent match or terminal, for sanity
130*993b0882SAndroid Build Coastguard Worker   // checking.
131*993b0882SAndroid Build Coastguard Worker   int last_end_;
132*993b0882SAndroid Build Coastguard Worker 
133*993b0882SAndroid Build Coastguard Worker   // Rules.
134*993b0882SAndroid Build Coastguard Worker   const RulesSet* rules_;
135*993b0882SAndroid Build Coastguard Worker   // The active rule shards.
136*993b0882SAndroid Build Coastguard Worker   std::vector<const RulesSet_::Rules*> rules_shards_;
137*993b0882SAndroid Build Coastguard Worker 
138*993b0882SAndroid Build Coastguard Worker   // The set of items pending to be added to the chart as a singly-linked list.
139*993b0882SAndroid Build Coastguard Worker   ParseTree* pending_items_;
140*993b0882SAndroid Build Coastguard Worker 
141*993b0882SAndroid Build Coastguard Worker   // The set of items pending to be post-checked as a singly-linked list.
142*993b0882SAndroid Build Coastguard Worker   ExclusionNode* pending_exclusion_items_;
143*993b0882SAndroid Build Coastguard Worker 
144*993b0882SAndroid Build Coastguard Worker   // The chart data structure: a hashtable containing all matches, indexed by
145*993b0882SAndroid Build Coastguard Worker   // their end positions.
146*993b0882SAndroid Build Coastguard Worker   Chart<> chart_;
147*993b0882SAndroid Build Coastguard Worker };
148*993b0882SAndroid Build Coastguard Worker 
149*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3::grammar
150*993b0882SAndroid Build Coastguard Worker 
151*993b0882SAndroid Build Coastguard Worker #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_
152