xref: /aosp_15_r20/external/libtextclassifier/native/utils/grammar/parsing/parse-tree.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
19*993b0882SAndroid Build Coastguard Worker 
20*993b0882SAndroid Build Coastguard Worker #include <functional>
21*993b0882SAndroid Build Coastguard Worker #include <vector>
22*993b0882SAndroid Build Coastguard Worker 
23*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h"
24*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/semantics/expression_generated.h"
25*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/types.h"
26*993b0882SAndroid Build Coastguard Worker #include "utils/strings/stringpiece.h"
27*993b0882SAndroid Build Coastguard Worker 
28*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3::grammar {
29*993b0882SAndroid Build Coastguard Worker 
30*993b0882SAndroid Build Coastguard Worker // Represents a parse tree for a match that was found for a nonterminal.
31*993b0882SAndroid Build Coastguard Worker struct ParseTree {
32*993b0882SAndroid Build Coastguard Worker   enum class Type : int8 {
33*993b0882SAndroid Build Coastguard Worker     // Default, untyped match.
34*993b0882SAndroid Build Coastguard Worker     kDefault = 0,
35*993b0882SAndroid Build Coastguard Worker 
36*993b0882SAndroid Build Coastguard Worker     // An assertion match (see: AssertionNode).
37*993b0882SAndroid Build Coastguard Worker     kAssertion = 1,
38*993b0882SAndroid Build Coastguard Worker 
39*993b0882SAndroid Build Coastguard Worker     // A value mapping match (see: MappingNode).
40*993b0882SAndroid Build Coastguard Worker     kMapping = 2,
41*993b0882SAndroid Build Coastguard Worker 
42*993b0882SAndroid Build Coastguard Worker     // An exclusion match (see: ExclusionNode).
43*993b0882SAndroid Build Coastguard Worker     kExclusion = 3,
44*993b0882SAndroid Build Coastguard Worker 
45*993b0882SAndroid Build Coastguard Worker     // A match for an annotation (see: AnnotationNode).
46*993b0882SAndroid Build Coastguard Worker     kAnnotation = 4,
47*993b0882SAndroid Build Coastguard Worker 
48*993b0882SAndroid Build Coastguard Worker     // A match for a semantic annotation (see: SemanticExpressionNode).
49*993b0882SAndroid Build Coastguard Worker     kExpression = 5,
50*993b0882SAndroid Build Coastguard Worker   };
51*993b0882SAndroid Build Coastguard Worker 
52*993b0882SAndroid Build Coastguard Worker   explicit ParseTree() = default;
ParseTreeParseTree53*993b0882SAndroid Build Coastguard Worker   explicit ParseTree(const Nonterm lhs, const CodepointSpan& codepoint_span,
54*993b0882SAndroid Build Coastguard Worker                      const int match_offset, const Type type)
55*993b0882SAndroid Build Coastguard Worker       : lhs(lhs),
56*993b0882SAndroid Build Coastguard Worker         type(type),
57*993b0882SAndroid Build Coastguard Worker         codepoint_span(codepoint_span),
58*993b0882SAndroid Build Coastguard Worker         match_offset(match_offset) {}
59*993b0882SAndroid Build Coastguard Worker 
60*993b0882SAndroid Build Coastguard Worker   // For binary rule matches:  rhs1 != NULL and rhs2 != NULL
61*993b0882SAndroid Build Coastguard Worker   //      unary rule matches:  rhs1 == NULL and rhs2 != NULL
62*993b0882SAndroid Build Coastguard Worker   //   terminal rule matches:  rhs1 != NULL and rhs2 == NULL
63*993b0882SAndroid Build Coastguard Worker   //           custom leaves:  rhs1 == NULL and rhs2 == NULL
IsInteriorNodeParseTree64*993b0882SAndroid Build Coastguard Worker   bool IsInteriorNode() const { return rhs2 != nullptr; }
IsLeafParseTree65*993b0882SAndroid Build Coastguard Worker   bool IsLeaf() const { return !rhs2; }
66*993b0882SAndroid Build Coastguard Worker 
IsBinaryRuleParseTree67*993b0882SAndroid Build Coastguard Worker   bool IsBinaryRule() const { return rhs1 && rhs2; }
IsUnaryRuleParseTree68*993b0882SAndroid Build Coastguard Worker   bool IsUnaryRule() const { return !rhs1 && rhs2; }
IsTerminalRuleParseTree69*993b0882SAndroid Build Coastguard Worker   bool IsTerminalRule() const { return rhs1 && !rhs2; }
HasLeadingWhitespaceParseTree70*993b0882SAndroid Build Coastguard Worker   bool HasLeadingWhitespace() const {
71*993b0882SAndroid Build Coastguard Worker     return codepoint_span.first != match_offset;
72*993b0882SAndroid Build Coastguard Worker   }
73*993b0882SAndroid Build Coastguard Worker 
unary_rule_rhsParseTree74*993b0882SAndroid Build Coastguard Worker   const ParseTree* unary_rule_rhs() const { return rhs2; }
75*993b0882SAndroid Build Coastguard Worker 
76*993b0882SAndroid Build Coastguard Worker   // Used in singly-linked queue of matches for processing.
77*993b0882SAndroid Build Coastguard Worker   ParseTree* next = nullptr;
78*993b0882SAndroid Build Coastguard Worker 
79*993b0882SAndroid Build Coastguard Worker   // Nonterminal we found a match for.
80*993b0882SAndroid Build Coastguard Worker   Nonterm lhs = kUnassignedNonterm;
81*993b0882SAndroid Build Coastguard Worker 
82*993b0882SAndroid Build Coastguard Worker   // Type of the match.
83*993b0882SAndroid Build Coastguard Worker   Type type = Type::kDefault;
84*993b0882SAndroid Build Coastguard Worker 
85*993b0882SAndroid Build Coastguard Worker   // The span in codepoints.
86*993b0882SAndroid Build Coastguard Worker   CodepointSpan codepoint_span;
87*993b0882SAndroid Build Coastguard Worker 
88*993b0882SAndroid Build Coastguard Worker   // The begin codepoint offset used during matching.
89*993b0882SAndroid Build Coastguard Worker   // This is usually including any prefix whitespace.
90*993b0882SAndroid Build Coastguard Worker   int match_offset;
91*993b0882SAndroid Build Coastguard Worker 
92*993b0882SAndroid Build Coastguard Worker   union {
93*993b0882SAndroid Build Coastguard Worker     // The first sub match for binary rules.
94*993b0882SAndroid Build Coastguard Worker     const ParseTree* rhs1 = nullptr;
95*993b0882SAndroid Build Coastguard Worker 
96*993b0882SAndroid Build Coastguard Worker     // The terminal, for terminal rules.
97*993b0882SAndroid Build Coastguard Worker     const char* terminal;
98*993b0882SAndroid Build Coastguard Worker   };
99*993b0882SAndroid Build Coastguard Worker   // First or second sub-match for interior nodes.
100*993b0882SAndroid Build Coastguard Worker   const ParseTree* rhs2 = nullptr;
101*993b0882SAndroid Build Coastguard Worker };
102*993b0882SAndroid Build Coastguard Worker 
103*993b0882SAndroid Build Coastguard Worker // Node type to keep track of associated values.
104*993b0882SAndroid Build Coastguard Worker struct MappingNode : public ParseTree {
MappingNodeMappingNode105*993b0882SAndroid Build Coastguard Worker   explicit MappingNode(const Nonterm arg_lhs,
106*993b0882SAndroid Build Coastguard Worker                        const CodepointSpan arg_codepoint_span,
107*993b0882SAndroid Build Coastguard Worker                        const int arg_match_offset, const int64 arg_value)
108*993b0882SAndroid Build Coastguard Worker       : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
109*993b0882SAndroid Build Coastguard Worker                   Type::kMapping),
110*993b0882SAndroid Build Coastguard Worker         id(arg_value) {}
111*993b0882SAndroid Build Coastguard Worker   // The associated id or value.
112*993b0882SAndroid Build Coastguard Worker   int64 id;
113*993b0882SAndroid Build Coastguard Worker };
114*993b0882SAndroid Build Coastguard Worker 
115*993b0882SAndroid Build Coastguard Worker // Node type to keep track of assertions.
116*993b0882SAndroid Build Coastguard Worker struct AssertionNode : public ParseTree {
AssertionNodeAssertionNode117*993b0882SAndroid Build Coastguard Worker   explicit AssertionNode(const Nonterm arg_lhs,
118*993b0882SAndroid Build Coastguard Worker                          const CodepointSpan arg_codepoint_span,
119*993b0882SAndroid Build Coastguard Worker                          const int arg_match_offset, const bool arg_negative)
120*993b0882SAndroid Build Coastguard Worker       : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
121*993b0882SAndroid Build Coastguard Worker                   Type::kAssertion),
122*993b0882SAndroid Build Coastguard Worker         negative(arg_negative) {}
123*993b0882SAndroid Build Coastguard Worker   // If true, the assertion is negative and will be valid if the input doesn't
124*993b0882SAndroid Build Coastguard Worker   // match.
125*993b0882SAndroid Build Coastguard Worker   bool negative;
126*993b0882SAndroid Build Coastguard Worker };
127*993b0882SAndroid Build Coastguard Worker 
128*993b0882SAndroid Build Coastguard Worker // Node type to define exclusions.
129*993b0882SAndroid Build Coastguard Worker struct ExclusionNode : public ParseTree {
ExclusionNodeExclusionNode130*993b0882SAndroid Build Coastguard Worker   explicit ExclusionNode(const Nonterm arg_lhs,
131*993b0882SAndroid Build Coastguard Worker                          const CodepointSpan arg_codepoint_span,
132*993b0882SAndroid Build Coastguard Worker                          const int arg_match_offset,
133*993b0882SAndroid Build Coastguard Worker                          const Nonterm arg_exclusion_nonterm)
134*993b0882SAndroid Build Coastguard Worker       : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
135*993b0882SAndroid Build Coastguard Worker                   Type::kExclusion),
136*993b0882SAndroid Build Coastguard Worker         exclusion_nonterm(arg_exclusion_nonterm) {}
137*993b0882SAndroid Build Coastguard Worker   // The nonterminal that denotes matches to exclude from a successful match.
138*993b0882SAndroid Build Coastguard Worker   // So the match is only valid if there is no match of `exclusion_nonterm`
139*993b0882SAndroid Build Coastguard Worker   // spanning the same text range.
140*993b0882SAndroid Build Coastguard Worker   Nonterm exclusion_nonterm;
141*993b0882SAndroid Build Coastguard Worker };
142*993b0882SAndroid Build Coastguard Worker 
143*993b0882SAndroid Build Coastguard Worker // Match to represent an annotator annotated span in the grammar.
144*993b0882SAndroid Build Coastguard Worker struct AnnotationNode : public ParseTree {
AnnotationNodeAnnotationNode145*993b0882SAndroid Build Coastguard Worker   explicit AnnotationNode(const Nonterm arg_lhs,
146*993b0882SAndroid Build Coastguard Worker                           const CodepointSpan arg_codepoint_span,
147*993b0882SAndroid Build Coastguard Worker                           const int arg_match_offset,
148*993b0882SAndroid Build Coastguard Worker                           const ClassificationResult* arg_annotation)
149*993b0882SAndroid Build Coastguard Worker       : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
150*993b0882SAndroid Build Coastguard Worker                   Type::kAnnotation),
151*993b0882SAndroid Build Coastguard Worker         annotation(arg_annotation) {}
152*993b0882SAndroid Build Coastguard Worker   const ClassificationResult* annotation;
153*993b0882SAndroid Build Coastguard Worker };
154*993b0882SAndroid Build Coastguard Worker 
155*993b0882SAndroid Build Coastguard Worker // Node type to represent an associated semantic expression.
156*993b0882SAndroid Build Coastguard Worker struct SemanticExpressionNode : public ParseTree {
SemanticExpressionNodeSemanticExpressionNode157*993b0882SAndroid Build Coastguard Worker   explicit SemanticExpressionNode(const Nonterm arg_lhs,
158*993b0882SAndroid Build Coastguard Worker                                   const CodepointSpan arg_codepoint_span,
159*993b0882SAndroid Build Coastguard Worker                                   const int arg_match_offset,
160*993b0882SAndroid Build Coastguard Worker                                   const SemanticExpression* arg_expression)
161*993b0882SAndroid Build Coastguard Worker       : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
162*993b0882SAndroid Build Coastguard Worker                   Type::kExpression),
163*993b0882SAndroid Build Coastguard Worker         expression(arg_expression) {}
164*993b0882SAndroid Build Coastguard Worker   const SemanticExpression* expression;
165*993b0882SAndroid Build Coastguard Worker };
166*993b0882SAndroid Build Coastguard Worker 
167*993b0882SAndroid Build Coastguard Worker // Utility functions for parse tree traversal.
168*993b0882SAndroid Build Coastguard Worker 
169*993b0882SAndroid Build Coastguard Worker // Does a preorder traversal, calling `node_fn` on each node.
170*993b0882SAndroid Build Coastguard Worker // `node_fn` is expected to return whether to continue expanding a node.
171*993b0882SAndroid Build Coastguard Worker void Traverse(const ParseTree* root,
172*993b0882SAndroid Build Coastguard Worker               const std::function<bool(const ParseTree*)>& node_fn);
173*993b0882SAndroid Build Coastguard Worker 
174*993b0882SAndroid Build Coastguard Worker // Does a preorder traversal, selecting all nodes where `pred_fn` returns true.
175*993b0882SAndroid Build Coastguard Worker std::vector<const ParseTree*> SelectAll(
176*993b0882SAndroid Build Coastguard Worker     const ParseTree* root,
177*993b0882SAndroid Build Coastguard Worker     const std::function<bool(const ParseTree*)>& pred_fn);
178*993b0882SAndroid Build Coastguard Worker 
179*993b0882SAndroid Build Coastguard Worker // Retrieves all nodes of a given type.
180*993b0882SAndroid Build Coastguard Worker template <typename T>
SelectAllOfType(const ParseTree * root,const ParseTree::Type type)181*993b0882SAndroid Build Coastguard Worker const std::vector<const T*> SelectAllOfType(const ParseTree* root,
182*993b0882SAndroid Build Coastguard Worker                                             const ParseTree::Type type) {
183*993b0882SAndroid Build Coastguard Worker   std::vector<const T*> result;
184*993b0882SAndroid Build Coastguard Worker   Traverse(root, [&result, type](const ParseTree* node) {
185*993b0882SAndroid Build Coastguard Worker     if (node->type == type) {
186*993b0882SAndroid Build Coastguard Worker       result.push_back(static_cast<const T*>(node));
187*993b0882SAndroid Build Coastguard Worker     }
188*993b0882SAndroid Build Coastguard Worker     return true;
189*993b0882SAndroid Build Coastguard Worker   });
190*993b0882SAndroid Build Coastguard Worker   return result;
191*993b0882SAndroid Build Coastguard Worker }
192*993b0882SAndroid Build Coastguard Worker 
193*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3::grammar
194*993b0882SAndroid Build Coastguard Worker 
195*993b0882SAndroid Build Coastguard Worker #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
196