1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 2002-2016 International Business Machines Corporation *
6 * and others. All rights reserved. *
7 ***************************************************************************
8 */
9
10 //
11 // File: rbbinode.cpp
12 //
13 // Implementation of class RBBINode, which represents a node in the
14 // tree generated when parsing the Rules Based Break Iterator rules.
15 //
16 // This "Class" is actually closer to a struct.
17 // Code using it is expected to directly access fields much of the time.
18 //
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_BREAK_ITERATION
23
24 #include "unicode/unistr.h"
25 #include "unicode/uniset.h"
26 #include "unicode/uchar.h"
27 #include "unicode/parsepos.h"
28
29 #include "cstr.h"
30 #include "uvector.h"
31
32 #include "rbbirb.h"
33 #include "rbbinode.h"
34
35 #include "uassert.h"
36
37
38 U_NAMESPACE_BEGIN
39
40 #ifdef RBBI_DEBUG
41 static int gLastSerial = 0;
42 #endif
43
44
45 //-------------------------------------------------------------------------
46 //
47 // Constructor. Just set the fields to reasonable default values.
48 //
49 //-------------------------------------------------------------------------
RBBINode(NodeType t)50 RBBINode::RBBINode(NodeType t) : UMemory() {
51 #ifdef RBBI_DEBUG
52 fSerialNum = ++gLastSerial;
53 #endif
54 fType = t;
55 fParent = nullptr;
56 fLeftChild = nullptr;
57 fRightChild = nullptr;
58 fInputSet = nullptr;
59 fFirstPos = 0;
60 fLastPos = 0;
61 fNullable = false;
62 fLookAheadEnd = false;
63 fRuleRoot = false;
64 fChainIn = false;
65 fVal = 0;
66 fPrecedence = precZero;
67
68 UErrorCode status = U_ZERO_ERROR;
69 fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
70 fLastPosSet = new UVector(status);
71 fFollowPos = new UVector(status);
72 if (t==opCat) {fPrecedence = precOpCat;}
73 else if (t==opOr) {fPrecedence = precOpOr;}
74 else if (t==opStart) {fPrecedence = precStart;}
75 else if (t==opLParen) {fPrecedence = precLParen;}
76
77 }
78
79
RBBINode(const RBBINode & other)80 RBBINode::RBBINode(const RBBINode &other) : UMemory(other) {
81 #ifdef RBBI_DEBUG
82 fSerialNum = ++gLastSerial;
83 #endif
84 fType = other.fType;
85 fParent = nullptr;
86 fLeftChild = nullptr;
87 fRightChild = nullptr;
88 fInputSet = other.fInputSet;
89 fPrecedence = other.fPrecedence;
90 fText = other.fText;
91 fFirstPos = other.fFirstPos;
92 fLastPos = other.fLastPos;
93 fNullable = other.fNullable;
94 fVal = other.fVal;
95 fRuleRoot = false;
96 fChainIn = other.fChainIn;
97 UErrorCode status = U_ZERO_ERROR;
98 fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
99 fLastPosSet = new UVector(status);
100 fFollowPos = new UVector(status);
101 }
102
103
104 //-------------------------------------------------------------------------
105 //
106 // Destructor. Deletes both this node AND any child nodes,
107 // except in the case of variable reference nodes. For
108 // these, the l. child points back to the definition, which
109 // is common for all references to the variable, meaning
110 // it can't be deleted here.
111 //
112 //-------------------------------------------------------------------------
~RBBINode()113 RBBINode::~RBBINode() {
114 // printf("deleting node %8x serial %4d\n", this, this->fSerialNum);
115 delete fInputSet;
116 fInputSet = nullptr;
117
118 switch (this->fType) {
119 case varRef:
120 case setRef:
121 // for these node types, multiple instances point to the same "children"
122 // Storage ownership of children handled elsewhere. Don't delete here.
123 break;
124
125 default:
126 // Avoid using a recursive implementation because of stack overflow problems.
127 // See bug ICU-22584.
128 // delete fLeftChild;
129 NRDeleteNode(fLeftChild);
130 fLeftChild = nullptr;
131 // delete fRightChild;
132 NRDeleteNode(fRightChild);
133 fRightChild = nullptr;
134 }
135
136 delete fFirstPosSet;
137 delete fLastPosSet;
138 delete fFollowPos;
139 }
140
141 /**
142 * Non-recursive delete of a node + its children. Used from the node destructor
143 * instead of the more obvious recursive implementation to avoid problems with
144 * stack overflow with some perverse test rule data (from fuzzing).
145 */
NRDeleteNode(RBBINode * node)146 void RBBINode::NRDeleteNode(RBBINode *node) {
147 if (node == nullptr) {
148 return;
149 }
150
151 RBBINode *stopNode = node->fParent;
152 RBBINode *nextNode = node;
153 while (nextNode != stopNode && nextNode != nullptr) {
154 RBBINode *currentNode = nextNode;
155
156 if ((currentNode->fLeftChild == nullptr && currentNode->fRightChild == nullptr) ||
157 currentNode->fType == varRef || // varRef and setRef nodes do not
158 currentNode->fType == setRef) { // own their children nodes.
159 // CurrentNode is effectively a leaf node; it's safe to go ahead and delete it.
160 nextNode = currentNode->fParent;
161 if (nextNode) {
162 if (nextNode->fLeftChild == currentNode) {
163 nextNode->fLeftChild = nullptr;
164 } else if (nextNode->fRightChild == currentNode) {
165 nextNode->fRightChild = nullptr;
166 }
167 }
168 delete currentNode;
169 } else if (currentNode->fLeftChild) {
170 nextNode = currentNode->fLeftChild;
171 if (nextNode->fParent == nullptr) {
172 nextNode->fParent = currentNode;
173 // fParent isn't always set; do it now if not.
174 }
175 U_ASSERT(nextNode->fParent == currentNode);
176 } else if (currentNode->fRightChild) {
177 nextNode = currentNode->fRightChild;
178 if (nextNode->fParent == nullptr) {
179 nextNode->fParent = currentNode;
180 // fParent isn't always set; do it now if not.
181 }
182 U_ASSERT(nextNode->fParent == currentNode);
183 }
184 }
185 }
186
187 //-------------------------------------------------------------------------
188 //
189 // cloneTree Make a copy of the subtree rooted at this node.
190 // Discard any variable references encountered along the way,
191 // and replace with copies of the variable's definitions.
192 // Used to replicate the expression underneath variable
193 // references in preparation for generating the DFA tables.
194 //
195 //-------------------------------------------------------------------------
cloneTree()196 RBBINode *RBBINode::cloneTree() {
197 RBBINode *n;
198
199 if (fType == RBBINode::varRef) {
200 // If the current node is a variable reference, skip over it
201 // and clone the definition of the variable instead.
202 n = fLeftChild->cloneTree();
203 } else if (fType == RBBINode::uset) {
204 n = this;
205 } else {
206 n = new RBBINode(*this);
207 // Check for null pointer.
208 if (n != nullptr) {
209 if (fLeftChild != nullptr) {
210 n->fLeftChild = fLeftChild->cloneTree();
211 n->fLeftChild->fParent = n;
212 }
213 if (fRightChild != nullptr) {
214 n->fRightChild = fRightChild->cloneTree();
215 n->fRightChild->fParent = n;
216 }
217 }
218 }
219 return n;
220 }
221
222
223
224 //-------------------------------------------------------------------------
225 //
226 // flattenVariables Walk a parse tree, replacing any variable
227 // references with a copy of the variable's definition.
228 // Aside from variables, the tree is not changed.
229 //
230 // Return the root of the tree. If the root was not a variable
231 // reference, it remains unchanged - the root we started with
232 // is the root we return. If, however, the root was a variable
233 // reference, the root of the newly cloned replacement tree will
234 // be returned, and the original tree deleted.
235 //
236 // This function works by recursively walking the tree
237 // without doing anything until a variable reference is
238 // found, then calling cloneTree() at that point. Any
239 // nested references are handled by cloneTree(), not here.
240 //
241 //-------------------------------------------------------------------------
242 constexpr int kRecursiveDepthLimit = 3500;
flattenVariables(UErrorCode & status,int depth)243 RBBINode *RBBINode::flattenVariables(UErrorCode& status, int depth) {
244 if (U_FAILURE(status)) {
245 return this;
246 }
247 // If the depth of the stack is too deep, we return U_INPUT_TOO_LONG_ERROR
248 // to avoid stack overflow crash.
249 if (depth > kRecursiveDepthLimit) {
250 status = U_INPUT_TOO_LONG_ERROR;
251 return this;
252 }
253 if (fType == varRef) {
254 RBBINode *retNode = fLeftChild->cloneTree();
255 if (retNode != nullptr) {
256 retNode->fRuleRoot = this->fRuleRoot;
257 retNode->fChainIn = this->fChainIn;
258 }
259 delete this; // TODO: undefined behavior. Fix.
260 return retNode;
261 }
262
263 if (fLeftChild != nullptr) {
264 fLeftChild = fLeftChild->flattenVariables(status, depth+1);
265 fLeftChild->fParent = this;
266 }
267 if (fRightChild != nullptr) {
268 fRightChild = fRightChild->flattenVariables(status, depth+1);
269 fRightChild->fParent = this;
270 }
271 return this;
272 }
273
274
275 //-------------------------------------------------------------------------
276 //
277 // flattenSets Walk the parse tree, replacing any nodes of type setRef
278 // with a copy of the expression tree for the set. A set's
279 // equivalent expression tree is precomputed and saved as
280 // the left child of the uset node.
281 //
282 //-------------------------------------------------------------------------
flattenSets()283 void RBBINode::flattenSets() {
284 U_ASSERT(fType != setRef);
285
286 if (fLeftChild != nullptr) {
287 if (fLeftChild->fType==setRef) {
288 RBBINode *setRefNode = fLeftChild;
289 RBBINode *usetNode = setRefNode->fLeftChild;
290 RBBINode *replTree = usetNode->fLeftChild;
291 fLeftChild = replTree->cloneTree();
292 fLeftChild->fParent = this;
293 delete setRefNode;
294 } else {
295 fLeftChild->flattenSets();
296 }
297 }
298
299 if (fRightChild != nullptr) {
300 if (fRightChild->fType==setRef) {
301 RBBINode *setRefNode = fRightChild;
302 RBBINode *usetNode = setRefNode->fLeftChild;
303 RBBINode *replTree = usetNode->fLeftChild;
304 fRightChild = replTree->cloneTree();
305 fRightChild->fParent = this;
306 delete setRefNode;
307 } else {
308 fRightChild->flattenSets();
309 }
310 }
311 }
312
313
314
315 //-------------------------------------------------------------------------
316 //
317 // findNodes() Locate all the nodes of the specified type, starting
318 // at the specified root.
319 //
320 //-------------------------------------------------------------------------
findNodes(UVector * dest,RBBINode::NodeType kind,UErrorCode & status)321 void RBBINode::findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status) {
322 /* test for buffer overflows */
323 if (U_FAILURE(status)) {
324 return;
325 }
326 U_ASSERT(!dest->hasDeleter());
327 if (fType == kind) {
328 dest->addElement(this, status);
329 }
330 if (fLeftChild != nullptr) {
331 fLeftChild->findNodes(dest, kind, status);
332 }
333 if (fRightChild != nullptr) {
334 fRightChild->findNodes(dest, kind, status);
335 }
336 }
337
338
339 //-------------------------------------------------------------------------
340 //
341 // print. Print out a single node, for debugging.
342 //
343 //-------------------------------------------------------------------------
344 #ifdef RBBI_DEBUG
345
serial(const RBBINode * node)346 static int32_t serial(const RBBINode *node) {
347 return (node == nullptr? -1 : node->fSerialNum);
348 }
349
350
printNode(const RBBINode * node)351 void RBBINode::printNode(const RBBINode *node) {
352 static const char * const nodeTypeNames[] = {
353 "setRef",
354 "uset",
355 "varRef",
356 "leafChar",
357 "lookAhead",
358 "tag",
359 "endMark",
360 "opStart",
361 "opCat",
362 "opOr",
363 "opStar",
364 "opPlus",
365 "opQuestion",
366 "opBreak",
367 "opReverse",
368 "opLParen"
369 };
370
371 if (node==nullptr) {
372 RBBIDebugPrintf("%10p", (void *)node);
373 } else {
374 RBBIDebugPrintf("%10p %5d %12s %c%c %5d %5d %5d %6d %d ",
375 (void *)node, node->fSerialNum, nodeTypeNames[node->fType],
376 node->fRuleRoot?'R':' ', node->fChainIn?'C':' ',
377 serial(node->fLeftChild), serial(node->fRightChild), serial(node->fParent),
378 node->fFirstPos, node->fVal);
379 if (node->fType == varRef) {
380 RBBI_DEBUG_printUnicodeString(node->fText);
381 }
382 }
383 RBBIDebugPrintf("\n");
384 }
385 #endif
386
387
388 #ifdef RBBI_DEBUG
RBBI_DEBUG_printUnicodeString(const UnicodeString & s,int minWidth)389 U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth) {
390 RBBIDebugPrintf("%*s", minWidth, CStr(s)());
391 }
392 #endif
393
394
395 //-------------------------------------------------------------------------
396 //
397 // print. Print out the tree of nodes rooted at "this"
398 //
399 //-------------------------------------------------------------------------
400 #ifdef RBBI_DEBUG
printNodeHeader()401 void RBBINode::printNodeHeader() {
402 RBBIDebugPrintf(" Address serial type LeftChild RightChild Parent position value\n");
403 }
404
printTree(const RBBINode * node,UBool printHeading)405 void RBBINode::printTree(const RBBINode *node, UBool printHeading) {
406 if (printHeading) {
407 printNodeHeader();
408 }
409 printNode(node);
410 if (node != nullptr) {
411 // Only dump the definition under a variable reference if asked to.
412 // Unconditionally dump children of all other node types.
413 if (node->fType != varRef) {
414 if (node->fLeftChild != nullptr) {
415 printTree(node->fLeftChild, false);
416 }
417
418 if (node->fRightChild != nullptr) {
419 printTree(node->fRightChild, false);
420 }
421 }
422 }
423 }
424 #endif
425
426
427
428 U_NAMESPACE_END
429
430 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
431