1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker *
4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker *
8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker *
10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker */
16*993b0882SAndroid Build Coastguard Worker
17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
19*993b0882SAndroid Build Coastguard Worker
20*993b0882SAndroid Build Coastguard Worker #include <functional>
21*993b0882SAndroid Build Coastguard Worker #include <vector>
22*993b0882SAndroid Build Coastguard Worker
23*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h"
24*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/semantics/expression_generated.h"
25*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/types.h"
26*993b0882SAndroid Build Coastguard Worker #include "utils/strings/stringpiece.h"
27*993b0882SAndroid Build Coastguard Worker
28*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3::grammar {
29*993b0882SAndroid Build Coastguard Worker
30*993b0882SAndroid Build Coastguard Worker // Represents a parse tree for a match that was found for a nonterminal.
31*993b0882SAndroid Build Coastguard Worker struct ParseTree {
32*993b0882SAndroid Build Coastguard Worker enum class Type : int8 {
33*993b0882SAndroid Build Coastguard Worker // Default, untyped match.
34*993b0882SAndroid Build Coastguard Worker kDefault = 0,
35*993b0882SAndroid Build Coastguard Worker
36*993b0882SAndroid Build Coastguard Worker // An assertion match (see: AssertionNode).
37*993b0882SAndroid Build Coastguard Worker kAssertion = 1,
38*993b0882SAndroid Build Coastguard Worker
39*993b0882SAndroid Build Coastguard Worker // A value mapping match (see: MappingNode).
40*993b0882SAndroid Build Coastguard Worker kMapping = 2,
41*993b0882SAndroid Build Coastguard Worker
42*993b0882SAndroid Build Coastguard Worker // An exclusion match (see: ExclusionNode).
43*993b0882SAndroid Build Coastguard Worker kExclusion = 3,
44*993b0882SAndroid Build Coastguard Worker
45*993b0882SAndroid Build Coastguard Worker // A match for an annotation (see: AnnotationNode).
46*993b0882SAndroid Build Coastguard Worker kAnnotation = 4,
47*993b0882SAndroid Build Coastguard Worker
48*993b0882SAndroid Build Coastguard Worker // A match for a semantic annotation (see: SemanticExpressionNode).
49*993b0882SAndroid Build Coastguard Worker kExpression = 5,
50*993b0882SAndroid Build Coastguard Worker };
51*993b0882SAndroid Build Coastguard Worker
52*993b0882SAndroid Build Coastguard Worker explicit ParseTree() = default;
ParseTreeParseTree53*993b0882SAndroid Build Coastguard Worker explicit ParseTree(const Nonterm lhs, const CodepointSpan& codepoint_span,
54*993b0882SAndroid Build Coastguard Worker const int match_offset, const Type type)
55*993b0882SAndroid Build Coastguard Worker : lhs(lhs),
56*993b0882SAndroid Build Coastguard Worker type(type),
57*993b0882SAndroid Build Coastguard Worker codepoint_span(codepoint_span),
58*993b0882SAndroid Build Coastguard Worker match_offset(match_offset) {}
59*993b0882SAndroid Build Coastguard Worker
60*993b0882SAndroid Build Coastguard Worker // For binary rule matches: rhs1 != NULL and rhs2 != NULL
61*993b0882SAndroid Build Coastguard Worker // unary rule matches: rhs1 == NULL and rhs2 != NULL
62*993b0882SAndroid Build Coastguard Worker // terminal rule matches: rhs1 != NULL and rhs2 == NULL
63*993b0882SAndroid Build Coastguard Worker // custom leaves: rhs1 == NULL and rhs2 == NULL
IsInteriorNodeParseTree64*993b0882SAndroid Build Coastguard Worker bool IsInteriorNode() const { return rhs2 != nullptr; }
IsLeafParseTree65*993b0882SAndroid Build Coastguard Worker bool IsLeaf() const { return !rhs2; }
66*993b0882SAndroid Build Coastguard Worker
IsBinaryRuleParseTree67*993b0882SAndroid Build Coastguard Worker bool IsBinaryRule() const { return rhs1 && rhs2; }
IsUnaryRuleParseTree68*993b0882SAndroid Build Coastguard Worker bool IsUnaryRule() const { return !rhs1 && rhs2; }
IsTerminalRuleParseTree69*993b0882SAndroid Build Coastguard Worker bool IsTerminalRule() const { return rhs1 && !rhs2; }
HasLeadingWhitespaceParseTree70*993b0882SAndroid Build Coastguard Worker bool HasLeadingWhitespace() const {
71*993b0882SAndroid Build Coastguard Worker return codepoint_span.first != match_offset;
72*993b0882SAndroid Build Coastguard Worker }
73*993b0882SAndroid Build Coastguard Worker
unary_rule_rhsParseTree74*993b0882SAndroid Build Coastguard Worker const ParseTree* unary_rule_rhs() const { return rhs2; }
75*993b0882SAndroid Build Coastguard Worker
76*993b0882SAndroid Build Coastguard Worker // Used in singly-linked queue of matches for processing.
77*993b0882SAndroid Build Coastguard Worker ParseTree* next = nullptr;
78*993b0882SAndroid Build Coastguard Worker
79*993b0882SAndroid Build Coastguard Worker // Nonterminal we found a match for.
80*993b0882SAndroid Build Coastguard Worker Nonterm lhs = kUnassignedNonterm;
81*993b0882SAndroid Build Coastguard Worker
82*993b0882SAndroid Build Coastguard Worker // Type of the match.
83*993b0882SAndroid Build Coastguard Worker Type type = Type::kDefault;
84*993b0882SAndroid Build Coastguard Worker
85*993b0882SAndroid Build Coastguard Worker // The span in codepoints.
86*993b0882SAndroid Build Coastguard Worker CodepointSpan codepoint_span;
87*993b0882SAndroid Build Coastguard Worker
88*993b0882SAndroid Build Coastguard Worker // The begin codepoint offset used during matching.
89*993b0882SAndroid Build Coastguard Worker // This is usually including any prefix whitespace.
90*993b0882SAndroid Build Coastguard Worker int match_offset;
91*993b0882SAndroid Build Coastguard Worker
92*993b0882SAndroid Build Coastguard Worker union {
93*993b0882SAndroid Build Coastguard Worker // The first sub match for binary rules.
94*993b0882SAndroid Build Coastguard Worker const ParseTree* rhs1 = nullptr;
95*993b0882SAndroid Build Coastguard Worker
96*993b0882SAndroid Build Coastguard Worker // The terminal, for terminal rules.
97*993b0882SAndroid Build Coastguard Worker const char* terminal;
98*993b0882SAndroid Build Coastguard Worker };
99*993b0882SAndroid Build Coastguard Worker // First or second sub-match for interior nodes.
100*993b0882SAndroid Build Coastguard Worker const ParseTree* rhs2 = nullptr;
101*993b0882SAndroid Build Coastguard Worker };
102*993b0882SAndroid Build Coastguard Worker
103*993b0882SAndroid Build Coastguard Worker // Node type to keep track of associated values.
104*993b0882SAndroid Build Coastguard Worker struct MappingNode : public ParseTree {
MappingNodeMappingNode105*993b0882SAndroid Build Coastguard Worker explicit MappingNode(const Nonterm arg_lhs,
106*993b0882SAndroid Build Coastguard Worker const CodepointSpan arg_codepoint_span,
107*993b0882SAndroid Build Coastguard Worker const int arg_match_offset, const int64 arg_value)
108*993b0882SAndroid Build Coastguard Worker : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
109*993b0882SAndroid Build Coastguard Worker Type::kMapping),
110*993b0882SAndroid Build Coastguard Worker id(arg_value) {}
111*993b0882SAndroid Build Coastguard Worker // The associated id or value.
112*993b0882SAndroid Build Coastguard Worker int64 id;
113*993b0882SAndroid Build Coastguard Worker };
114*993b0882SAndroid Build Coastguard Worker
115*993b0882SAndroid Build Coastguard Worker // Node type to keep track of assertions.
116*993b0882SAndroid Build Coastguard Worker struct AssertionNode : public ParseTree {
AssertionNodeAssertionNode117*993b0882SAndroid Build Coastguard Worker explicit AssertionNode(const Nonterm arg_lhs,
118*993b0882SAndroid Build Coastguard Worker const CodepointSpan arg_codepoint_span,
119*993b0882SAndroid Build Coastguard Worker const int arg_match_offset, const bool arg_negative)
120*993b0882SAndroid Build Coastguard Worker : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
121*993b0882SAndroid Build Coastguard Worker Type::kAssertion),
122*993b0882SAndroid Build Coastguard Worker negative(arg_negative) {}
123*993b0882SAndroid Build Coastguard Worker // If true, the assertion is negative and will be valid if the input doesn't
124*993b0882SAndroid Build Coastguard Worker // match.
125*993b0882SAndroid Build Coastguard Worker bool negative;
126*993b0882SAndroid Build Coastguard Worker };
127*993b0882SAndroid Build Coastguard Worker
128*993b0882SAndroid Build Coastguard Worker // Node type to define exclusions.
129*993b0882SAndroid Build Coastguard Worker struct ExclusionNode : public ParseTree {
ExclusionNodeExclusionNode130*993b0882SAndroid Build Coastguard Worker explicit ExclusionNode(const Nonterm arg_lhs,
131*993b0882SAndroid Build Coastguard Worker const CodepointSpan arg_codepoint_span,
132*993b0882SAndroid Build Coastguard Worker const int arg_match_offset,
133*993b0882SAndroid Build Coastguard Worker const Nonterm arg_exclusion_nonterm)
134*993b0882SAndroid Build Coastguard Worker : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
135*993b0882SAndroid Build Coastguard Worker Type::kExclusion),
136*993b0882SAndroid Build Coastguard Worker exclusion_nonterm(arg_exclusion_nonterm) {}
137*993b0882SAndroid Build Coastguard Worker // The nonterminal that denotes matches to exclude from a successful match.
138*993b0882SAndroid Build Coastguard Worker // So the match is only valid if there is no match of `exclusion_nonterm`
139*993b0882SAndroid Build Coastguard Worker // spanning the same text range.
140*993b0882SAndroid Build Coastguard Worker Nonterm exclusion_nonterm;
141*993b0882SAndroid Build Coastguard Worker };
142*993b0882SAndroid Build Coastguard Worker
143*993b0882SAndroid Build Coastguard Worker // Match to represent an annotator annotated span in the grammar.
144*993b0882SAndroid Build Coastguard Worker struct AnnotationNode : public ParseTree {
AnnotationNodeAnnotationNode145*993b0882SAndroid Build Coastguard Worker explicit AnnotationNode(const Nonterm arg_lhs,
146*993b0882SAndroid Build Coastguard Worker const CodepointSpan arg_codepoint_span,
147*993b0882SAndroid Build Coastguard Worker const int arg_match_offset,
148*993b0882SAndroid Build Coastguard Worker const ClassificationResult* arg_annotation)
149*993b0882SAndroid Build Coastguard Worker : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
150*993b0882SAndroid Build Coastguard Worker Type::kAnnotation),
151*993b0882SAndroid Build Coastguard Worker annotation(arg_annotation) {}
152*993b0882SAndroid Build Coastguard Worker const ClassificationResult* annotation;
153*993b0882SAndroid Build Coastguard Worker };
154*993b0882SAndroid Build Coastguard Worker
155*993b0882SAndroid Build Coastguard Worker // Node type to represent an associated semantic expression.
156*993b0882SAndroid Build Coastguard Worker struct SemanticExpressionNode : public ParseTree {
SemanticExpressionNodeSemanticExpressionNode157*993b0882SAndroid Build Coastguard Worker explicit SemanticExpressionNode(const Nonterm arg_lhs,
158*993b0882SAndroid Build Coastguard Worker const CodepointSpan arg_codepoint_span,
159*993b0882SAndroid Build Coastguard Worker const int arg_match_offset,
160*993b0882SAndroid Build Coastguard Worker const SemanticExpression* arg_expression)
161*993b0882SAndroid Build Coastguard Worker : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
162*993b0882SAndroid Build Coastguard Worker Type::kExpression),
163*993b0882SAndroid Build Coastguard Worker expression(arg_expression) {}
164*993b0882SAndroid Build Coastguard Worker const SemanticExpression* expression;
165*993b0882SAndroid Build Coastguard Worker };
166*993b0882SAndroid Build Coastguard Worker
167*993b0882SAndroid Build Coastguard Worker // Utility functions for parse tree traversal.
168*993b0882SAndroid Build Coastguard Worker
169*993b0882SAndroid Build Coastguard Worker // Does a preorder traversal, calling `node_fn` on each node.
170*993b0882SAndroid Build Coastguard Worker // `node_fn` is expected to return whether to continue expanding a node.
171*993b0882SAndroid Build Coastguard Worker void Traverse(const ParseTree* root,
172*993b0882SAndroid Build Coastguard Worker const std::function<bool(const ParseTree*)>& node_fn);
173*993b0882SAndroid Build Coastguard Worker
174*993b0882SAndroid Build Coastguard Worker // Does a preorder traversal, selecting all nodes where `pred_fn` returns true.
175*993b0882SAndroid Build Coastguard Worker std::vector<const ParseTree*> SelectAll(
176*993b0882SAndroid Build Coastguard Worker const ParseTree* root,
177*993b0882SAndroid Build Coastguard Worker const std::function<bool(const ParseTree*)>& pred_fn);
178*993b0882SAndroid Build Coastguard Worker
179*993b0882SAndroid Build Coastguard Worker // Retrieves all nodes of a given type.
180*993b0882SAndroid Build Coastguard Worker template <typename T>
SelectAllOfType(const ParseTree * root,const ParseTree::Type type)181*993b0882SAndroid Build Coastguard Worker const std::vector<const T*> SelectAllOfType(const ParseTree* root,
182*993b0882SAndroid Build Coastguard Worker const ParseTree::Type type) {
183*993b0882SAndroid Build Coastguard Worker std::vector<const T*> result;
184*993b0882SAndroid Build Coastguard Worker Traverse(root, [&result, type](const ParseTree* node) {
185*993b0882SAndroid Build Coastguard Worker if (node->type == type) {
186*993b0882SAndroid Build Coastguard Worker result.push_back(static_cast<const T*>(node));
187*993b0882SAndroid Build Coastguard Worker }
188*993b0882SAndroid Build Coastguard Worker return true;
189*993b0882SAndroid Build Coastguard Worker });
190*993b0882SAndroid Build Coastguard Worker return result;
191*993b0882SAndroid Build Coastguard Worker }
192*993b0882SAndroid Build Coastguard Worker
193*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3::grammar
194*993b0882SAndroid Build Coastguard Worker
195*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
196