1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 // rbbitblb.cpp
11 //
12
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbiscan.h"
22 #include "rbbisetb.h"
23 #include "rbbidata.h"
24 #include "cstring.h"
25 #include "uassert.h"
26 #include "uvectr32.h"
27 #include "cmemory.h"
28
29 U_NAMESPACE_BEGIN
30
31 const int32_t kMaxStateFor8BitsTable = 255;
32
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode,UErrorCode & status)33 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
34 fRB(rb),
35 fTree(*rootNode),
36 fStatus(&status),
37 fDStates(nullptr),
38 fSafeTable(nullptr) {
39 if (U_FAILURE(status)) {
40 return;
41 }
42 // fDStates is UVector<RBBIStateDescriptor *>
43 fDStates = new UVector(status);
44 if (U_SUCCESS(status) && fDStates == nullptr ) {
45 status = U_MEMORY_ALLOCATION_ERROR;
46 }
47 }
48
49
50
~RBBITableBuilder()51 RBBITableBuilder::~RBBITableBuilder() {
52 int i;
53 for (i=0; i<fDStates->size(); i++) {
54 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
55 }
56 delete fDStates;
57 delete fSafeTable;
58 delete fLookAheadRuleMap;
59 }
60
61
62 //-----------------------------------------------------------------------------
63 //
64 // RBBITableBuilder::buildForwardTable - This is the main function for building
65 // the DFA state transition table from the RBBI rules parse tree.
66 //
67 //-----------------------------------------------------------------------------
buildForwardTable()68 void RBBITableBuilder::buildForwardTable() {
69
70 if (U_FAILURE(*fStatus)) {
71 return;
72 }
73
74 // If there were no rules, just return. This situation can easily arise
75 // for the reverse rules.
76 if (fTree==nullptr) {
77 return;
78 }
79
80 //
81 // Walk through the tree, replacing any references to $variables with a copy of the
82 // parse tree for the substitution expression.
83 //
84 fTree = fTree->flattenVariables();
85 #ifdef RBBI_DEBUG
86 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
87 RBBIDebugPuts("\nParse tree after flattening variable references.");
88 RBBINode::printTree(fTree, true);
89 }
90 #endif
91
92 //
93 // If the rules contained any references to {bof}
94 // add a {bof} <cat> <former root of tree> to the
95 // tree. Means that all matches must start out with the
96 // {bof} fake character.
97 //
98 if (fRB->fSetBuilder->sawBOF()) {
99 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
100 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
101 // Delete and exit if memory allocation failed.
102 if (bofTop == nullptr || bofLeaf == nullptr) {
103 *fStatus = U_MEMORY_ALLOCATION_ERROR;
104 delete bofTop;
105 delete bofLeaf;
106 return;
107 }
108 bofTop->fLeftChild = bofLeaf;
109 bofTop->fRightChild = fTree;
110 bofLeaf->fParent = bofTop;
111 bofLeaf->fVal = 2; // Reserved value for {bof}.
112 fTree = bofTop;
113 }
114
115 //
116 // Add a unique right-end marker to the expression.
117 // Appears as a cat-node, left child being the original tree,
118 // right child being the end marker.
119 //
120 RBBINode *cn = new RBBINode(RBBINode::opCat);
121 // Exit if memory allocation failed.
122 if (cn == nullptr) {
123 *fStatus = U_MEMORY_ALLOCATION_ERROR;
124 return;
125 }
126 cn->fLeftChild = fTree;
127 fTree->fParent = cn;
128 RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
129 // Delete and exit if memory allocation failed.
130 if (cn->fRightChild == nullptr) {
131 *fStatus = U_MEMORY_ALLOCATION_ERROR;
132 delete cn;
133 return;
134 }
135 cn->fRightChild->fParent = cn;
136 fTree = cn;
137
138 //
139 // Replace all references to UnicodeSets with the tree for the equivalent
140 // expression.
141 //
142 fTree->flattenSets();
143 #ifdef RBBI_DEBUG
144 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
145 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
146 RBBINode::printTree(fTree, true);
147 }
148 #endif
149
150
151 //
152 // calculate the functions nullable, firstpos, lastpos and followpos on
153 // nodes in the parse tree.
154 // See the algorithm description in Aho.
155 // Understanding how this works by looking at the code alone will be
156 // nearly impossible.
157 //
158 calcNullable(fTree);
159 calcFirstPos(fTree);
160 calcLastPos(fTree);
161 calcFollowPos(fTree);
162 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
163 RBBIDebugPuts("\n");
164 printPosSets(fTree);
165 }
166
167 //
168 // For "chained" rules, modify the followPos sets
169 //
170 if (fRB->fChainRules) {
171 calcChainedFollowPos(fTree, endMarkerNode);
172 }
173
174 //
175 // BOF (start of input) test fixup.
176 //
177 if (fRB->fSetBuilder->sawBOF()) {
178 bofFixup();
179 }
180
181 //
182 // Build the DFA state transition tables.
183 //
184 buildStateTable();
185 mapLookAheadRules();
186 flagAcceptingStates();
187 flagLookAheadStates();
188 flagTaggedStates();
189
190 //
191 // Update the global table of rule status {tag} values
192 // The rule builder has a global vector of status values that are common
193 // for all tables. Merge the ones from this table into the global set.
194 //
195 mergeRuleStatusVals();
196 }
197
198
199
200 //-----------------------------------------------------------------------------
201 //
202 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
203 //
204 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)205 void RBBITableBuilder::calcNullable(RBBINode *n) {
206 if (n == nullptr) {
207 return;
208 }
209 if (n->fType == RBBINode::setRef ||
210 n->fType == RBBINode::endMark ) {
211 // These are non-empty leaf node types.
212 n->fNullable = false;
213 return;
214 }
215
216 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
217 // Lookahead marker node. It's a leaf, so no recursion on children.
218 // It's nullable because it does not match any literal text from the input stream.
219 n->fNullable = true;
220 return;
221 }
222
223
224 // The node is not a leaf.
225 // Calculate nullable on its children.
226 calcNullable(n->fLeftChild);
227 calcNullable(n->fRightChild);
228
229 // Apply functions from table 3.40 in Aho
230 if (n->fType == RBBINode::opOr) {
231 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
232 }
233 else if (n->fType == RBBINode::opCat) {
234 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
235 }
236 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
237 n->fNullable = true;
238 }
239 else {
240 n->fNullable = false;
241 }
242 }
243
244
245
246
247 //-----------------------------------------------------------------------------
248 //
249 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
250 //
251 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)252 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
253 if (n == nullptr) {
254 return;
255 }
256 if (n->fType == RBBINode::leafChar ||
257 n->fType == RBBINode::endMark ||
258 n->fType == RBBINode::lookAhead ||
259 n->fType == RBBINode::tag) {
260 // These are non-empty leaf node types.
261 // Note: In order to maintain the sort invariant on the set,
262 // this function should only be called on a node whose set is
263 // empty to start with.
264 n->fFirstPosSet->addElement(n, *fStatus);
265 return;
266 }
267
268 // The node is not a leaf.
269 // Calculate firstPos on its children.
270 calcFirstPos(n->fLeftChild);
271 calcFirstPos(n->fRightChild);
272
273 // Apply functions from table 3.40 in Aho
274 if (n->fType == RBBINode::opOr) {
275 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277 }
278 else if (n->fType == RBBINode::opCat) {
279 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
280 if (n->fLeftChild->fNullable) {
281 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
282 }
283 }
284 else if (n->fType == RBBINode::opStar ||
285 n->fType == RBBINode::opQuestion ||
286 n->fType == RBBINode::opPlus) {
287 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
288 }
289 }
290
291
292
293 //-----------------------------------------------------------------------------
294 //
295 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
296 //
297 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)298 void RBBITableBuilder::calcLastPos(RBBINode *n) {
299 if (n == nullptr) {
300 return;
301 }
302 if (n->fType == RBBINode::leafChar ||
303 n->fType == RBBINode::endMark ||
304 n->fType == RBBINode::lookAhead ||
305 n->fType == RBBINode::tag) {
306 // These are non-empty leaf node types.
307 // Note: In order to maintain the sort invariant on the set,
308 // this function should only be called on a node whose set is
309 // empty to start with.
310 n->fLastPosSet->addElement(n, *fStatus);
311 return;
312 }
313
314 // The node is not a leaf.
315 // Calculate lastPos on its children.
316 calcLastPos(n->fLeftChild);
317 calcLastPos(n->fRightChild);
318
319 // Apply functions from table 3.40 in Aho
320 if (n->fType == RBBINode::opOr) {
321 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
322 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
323 }
324 else if (n->fType == RBBINode::opCat) {
325 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
326 if (n->fRightChild->fNullable) {
327 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
328 }
329 }
330 else if (n->fType == RBBINode::opStar ||
331 n->fType == RBBINode::opQuestion ||
332 n->fType == RBBINode::opPlus) {
333 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
334 }
335 }
336
337
338
339 //-----------------------------------------------------------------------------
340 //
341 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
342 //
343 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)344 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
345 if (n == nullptr ||
346 n->fType == RBBINode::leafChar ||
347 n->fType == RBBINode::endMark) {
348 return;
349 }
350
351 calcFollowPos(n->fLeftChild);
352 calcFollowPos(n->fRightChild);
353
354 // Aho rule #1
355 if (n->fType == RBBINode::opCat) {
356 RBBINode *i; // is 'i' in Aho's description
357 uint32_t ix;
358
359 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
360
361 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
362 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
363 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
364 }
365 }
366
367 // Aho rule #2
368 if (n->fType == RBBINode::opStar ||
369 n->fType == RBBINode::opPlus) {
370 RBBINode *i; // again, n and i are the names from Aho's description.
371 uint32_t ix;
372
373 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
374 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
375 setAdd(i->fFollowPos, n->fFirstPosSet);
376 }
377 }
378
379
380
381 }
382
383 //-----------------------------------------------------------------------------
384 //
385 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
386 // as roots of a rule to a destination vector.
387 //
388 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)389 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
390 if (node == nullptr || U_FAILURE(*fStatus)) {
391 return;
392 }
393 U_ASSERT(!dest->hasDeleter());
394 if (node->fRuleRoot) {
395 dest->addElement(node, *fStatus);
396 // Note: rules cannot nest. If we found a rule start node,
397 // no child node can also be a start node.
398 return;
399 }
400 addRuleRootNodes(dest, node->fLeftChild);
401 addRuleRootNodes(dest, node->fRightChild);
402 }
403
404 //-----------------------------------------------------------------------------
405 //
406 // calcChainedFollowPos. Modify the previously calculated followPos sets
407 // to implement rule chaining. NOT described by Aho
408 //
409 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree,RBBINode * endMarkNode)410 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
411
412 UVector leafNodes(*fStatus);
413 if (U_FAILURE(*fStatus)) {
414 return;
415 }
416
417 // get a list all leaf nodes
418 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
419 if (U_FAILURE(*fStatus)) {
420 return;
421 }
422
423 // Collect all leaf nodes that can start matches for rules
424 // with inbound chaining enabled, which is the union of the
425 // firstPosition sets from each of the rule root nodes.
426
427 UVector ruleRootNodes(*fStatus);
428 addRuleRootNodes(&ruleRootNodes, tree);
429
430 UVector matchStartNodes(*fStatus);
431 for (int j=0; j<ruleRootNodes.size(); ++j) {
432 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
433 if (node->fChainIn) {
434 setAdd(&matchStartNodes, node->fFirstPosSet);
435 }
436 }
437 if (U_FAILURE(*fStatus)) {
438 return;
439 }
440
441 int32_t endNodeIx;
442 int32_t startNodeIx;
443
444 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
445 RBBINode *endNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
446
447 // Identify leaf nodes that correspond to overall rule match positions.
448 // These include the endMarkNode in their followPos sets.
449 //
450 // Note: do not consider other end marker nodes, those that are added to
451 // look-ahead rules. These can't chain; a match immediately stops
452 // further matching. This leaves exactly one end marker node, the one
453 // at the end of the complete tree.
454
455 if (!endNode->fFollowPos->contains(endMarkNode)) {
456 continue;
457 }
458
459 // We've got a node that can end a match.
460
461 // Now iterate over the nodes that can start a match, looking for ones
462 // with the same char class as our ending node.
463 RBBINode *startNode;
464 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
465 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
466 if (startNode->fType != RBBINode::leafChar) {
467 continue;
468 }
469
470 if (endNode->fVal == startNode->fVal) {
471 // The end val (character class) of one possible match is the
472 // same as the start of another.
473
474 // Add all nodes from the followPos of the start node to the
475 // followPos set of the end node, which will have the effect of
476 // letting matches transition from a match state at endNode
477 // to the second char of a match starting with startNode.
478 setAdd(endNode->fFollowPos, startNode->fFollowPos);
479 }
480 }
481 }
482 }
483
484
485 //-----------------------------------------------------------------------------
486 //
487 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
488 // Do an swizzle similar to chaining, modifying the followPos set of
489 // the bofNode to include the followPos nodes from other {bot} nodes
490 // scattered through the tree.
491 //
492 // This function has much in common with calcChainedFollowPos().
493 //
494 //-----------------------------------------------------------------------------
bofFixup()495 void RBBITableBuilder::bofFixup() {
496
497 if (U_FAILURE(*fStatus)) {
498 return;
499 }
500
501 // The parse tree looks like this ...
502 // fTree root ---> <cat>
503 // / \ .
504 // <cat> <#end node>
505 // / \ .
506 // <bofNode> rest
507 // of tree
508 //
509 // We will be adding things to the followPos set of the <bofNode>
510 //
511 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
512 U_ASSERT(bofNode->fType == RBBINode::leafChar);
513 U_ASSERT(bofNode->fVal == 2);
514
515 // Get all nodes that can be the start a match of the user-written rules
516 // (excluding the fake bofNode)
517 // We want the nodes that can start a match in the
518 // part labeled "rest of tree"
519 //
520 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
521
522 RBBINode *startNode;
523 int startNodeIx;
524 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
525 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
526 if (startNode->fType != RBBINode::leafChar) {
527 continue;
528 }
529
530 if (startNode->fVal == bofNode->fVal) {
531 // We found a leaf node corresponding to a {bof} that was
532 // explicitly written into a rule.
533 // Add everything from the followPos set of this node to the
534 // followPos set of the fake bofNode at the start of the tree.
535 //
536 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
537 }
538 }
539 }
540
541 //-----------------------------------------------------------------------------
542 //
543 // buildStateTable() Determine the set of runtime DFA states and the
544 // transition tables for these states, by the algorithm
545 // of fig. 3.44 in Aho.
546 //
547 // Most of the comments are quotes of Aho's psuedo-code.
548 //
549 //-----------------------------------------------------------------------------
buildStateTable()550 void RBBITableBuilder::buildStateTable() {
551 if (U_FAILURE(*fStatus)) {
552 return;
553 }
554 RBBIStateDescriptor *failState;
555 // Set it to nullptr to avoid uninitialized warning
556 RBBIStateDescriptor *initialState = nullptr;
557 //
558 // Add a dummy state 0 - the stop state. Not from Aho.
559 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
560 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
561 if (failState == nullptr) {
562 *fStatus = U_MEMORY_ALLOCATION_ERROR;
563 goto ExitBuildSTdeleteall;
564 }
565 failState->fPositions = new UVector(*fStatus);
566 if (failState->fPositions == nullptr) {
567 *fStatus = U_MEMORY_ALLOCATION_ERROR;
568 }
569 if (failState->fPositions == nullptr || U_FAILURE(*fStatus)) {
570 goto ExitBuildSTdeleteall;
571 }
572 fDStates->addElement(failState, *fStatus);
573 if (U_FAILURE(*fStatus)) {
574 goto ExitBuildSTdeleteall;
575 }
576
577 // initially, the only unmarked state in Dstates is firstpos(root),
578 // where toot is the root of the syntax tree for (r)#;
579 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
580 if (initialState == nullptr) {
581 *fStatus = U_MEMORY_ALLOCATION_ERROR;
582 }
583 if (U_FAILURE(*fStatus)) {
584 goto ExitBuildSTdeleteall;
585 }
586 initialState->fPositions = new UVector(*fStatus);
587 if (initialState->fPositions == nullptr) {
588 *fStatus = U_MEMORY_ALLOCATION_ERROR;
589 }
590 if (U_FAILURE(*fStatus)) {
591 goto ExitBuildSTdeleteall;
592 }
593 setAdd(initialState->fPositions, fTree->fFirstPosSet);
594 fDStates->addElement(initialState, *fStatus);
595 if (U_FAILURE(*fStatus)) {
596 goto ExitBuildSTdeleteall;
597 }
598
599 // while there is an unmarked state T in Dstates do begin
600 for (;;) {
601 RBBIStateDescriptor *T = nullptr;
602 int32_t tx;
603 for (tx=1; tx<fDStates->size(); tx++) {
604 RBBIStateDescriptor *temp;
605 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
606 if (temp->fMarked == false) {
607 T = temp;
608 break;
609 }
610 }
611 if (T == nullptr) {
612 break;
613 }
614
615 // mark T;
616 T->fMarked = true;
617
618 // for each input symbol a do begin
619 int32_t a;
620 for (a = 1; a<=lastInputSymbol; a++) {
621 // let U be the set of positions that are in followpos(p)
622 // for some position p in T
623 // such that the symbol at position p is a;
624 UVector *U = nullptr;
625 RBBINode *p;
626 int32_t px;
627 for (px=0; px<T->fPositions->size(); px++) {
628 p = (RBBINode *)T->fPositions->elementAt(px);
629 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
630 if (U == nullptr) {
631 U = new UVector(*fStatus);
632 if (U == nullptr) {
633 *fStatus = U_MEMORY_ALLOCATION_ERROR;
634 goto ExitBuildSTdeleteall;
635 }
636 }
637 setAdd(U, p->fFollowPos);
638 }
639 }
640
641 // if U is not empty and not in DStates then
642 int32_t ux = 0;
643 UBool UinDstates = false;
644 if (U != nullptr) {
645 U_ASSERT(U->size() > 0);
646 int ix;
647 for (ix=0; ix<fDStates->size(); ix++) {
648 RBBIStateDescriptor *temp2;
649 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
650 if (setEquals(U, temp2->fPositions)) {
651 delete U;
652 U = temp2->fPositions;
653 ux = ix;
654 UinDstates = true;
655 break;
656 }
657 }
658
659 // Add U as an unmarked state to Dstates
660 if (!UinDstates)
661 {
662 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
663 if (newState == nullptr) {
664 *fStatus = U_MEMORY_ALLOCATION_ERROR;
665 }
666 if (U_FAILURE(*fStatus)) {
667 goto ExitBuildSTdeleteall;
668 }
669 newState->fPositions = U;
670 fDStates->addElement(newState, *fStatus);
671 if (U_FAILURE(*fStatus)) {
672 return;
673 }
674 ux = fDStates->size()-1;
675 }
676
677 // Dtran[T, a] := U;
678 T->fDtran->setElementAt(ux, a);
679 }
680 }
681 }
682 return;
683 // delete local pointers only if error occurred.
684 ExitBuildSTdeleteall:
685 delete initialState;
686 delete failState;
687 }
688
689
690 /**
691 * mapLookAheadRules
692 *
693 */
mapLookAheadRules()694 void RBBITableBuilder::mapLookAheadRules() {
695 fLookAheadRuleMap = new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
696 if (fLookAheadRuleMap == nullptr) {
697 *fStatus = U_MEMORY_ALLOCATION_ERROR;
698 }
699 if (U_FAILURE(*fStatus)) {
700 return;
701 }
702 fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
703
704 for (int32_t n=0; n<fDStates->size(); n++) {
705 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
706 int32_t laSlotForState = 0;
707
708 // Establish the look-ahead slot for this state, if the state covers
709 // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
710
711 // If any of the look-ahead nodes already have a slot assigned, use it,
712 // otherwise assign a new one.
713
714 bool sawLookAheadNode = false;
715 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
716 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
717 if (node->fType != RBBINode::NodeType::lookAhead) {
718 continue;
719 }
720 sawLookAheadNode = true;
721 int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
722 U_ASSERT(ruleNum < fLookAheadRuleMap->size());
723 U_ASSERT(ruleNum > 0);
724 int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
725 if (laSlot != 0) {
726 if (laSlotForState == 0) {
727 laSlotForState = laSlot;
728 } else {
729 // TODO: figure out if this can fail, change to setting an error code if so.
730 U_ASSERT(laSlot == laSlotForState);
731 }
732 }
733 }
734 if (!sawLookAheadNode) {
735 continue;
736 }
737
738 if (laSlotForState == 0) {
739 laSlotForState = ++fLASlotsInUse;
740 }
741
742 // For each look ahead node covered by this state,
743 // set the mapping from the node's rule number to the look ahead slot.
744 // There can be multiple nodes/rule numbers going to the same la slot.
745
746 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
747 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
748 if (node->fType != RBBINode::NodeType::lookAhead) {
749 continue;
750 }
751 int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
752 int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
753 (void)existingVal;
754 U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
755 fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
756 }
757 }
758
759 }
760
761 //-----------------------------------------------------------------------------
762 //
763 // flagAcceptingStates Identify accepting states.
764 // First get a list of all of the end marker nodes.
765 // Then, for each state s,
766 // if s contains one of the end marker nodes in its list of tree positions then
767 // s is an accepting state.
768 //
769 //-----------------------------------------------------------------------------
flagAcceptingStates()770 void RBBITableBuilder::flagAcceptingStates() {
771 if (U_FAILURE(*fStatus)) {
772 return;
773 }
774 UVector endMarkerNodes(*fStatus);
775 RBBINode *endMarker;
776 int32_t i;
777 int32_t n;
778
779 if (U_FAILURE(*fStatus)) {
780 return;
781 }
782
783 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
784 if (U_FAILURE(*fStatus)) {
785 return;
786 }
787
788 for (i=0; i<endMarkerNodes.size(); i++) {
789 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
790 for (n=0; n<fDStates->size(); n++) {
791 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
792 if (sd->fPositions->indexOf(endMarker) >= 0) {
793 // Any non-zero value for fAccepting means this is an accepting node.
794 // The value is what will be returned to the user as the break status.
795 // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
796
797 if (sd->fAccepting==0) {
798 // State hasn't been marked as accepting yet. Do it now.
799 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
800 if (sd->fAccepting == 0) {
801 sd->fAccepting = ACCEPTING_UNCONDITIONAL;
802 }
803 }
804 if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) {
805 // Both lookahead and non-lookahead accepting for this state.
806 // Favor the look-ahead, because a look-ahead match needs to
807 // immediately stop the run-time engine. First match, not longest.
808 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
809 }
810 // implicit else:
811 // if sd->fAccepting already had a value other than 0 or 1, leave it be.
812 }
813 }
814 }
815 }
816
817
818 //-----------------------------------------------------------------------------
819 //
820 // flagLookAheadStates Very similar to flagAcceptingStates, above.
821 //
822 //-----------------------------------------------------------------------------
flagLookAheadStates()823 void RBBITableBuilder::flagLookAheadStates() {
824 if (U_FAILURE(*fStatus)) {
825 return;
826 }
827 UVector lookAheadNodes(*fStatus);
828 RBBINode *lookAheadNode;
829 int32_t i;
830 int32_t n;
831
832 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
833 if (U_FAILURE(*fStatus)) {
834 return;
835 }
836 for (i=0; i<lookAheadNodes.size(); i++) {
837 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
838 U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
839
840 for (n=0; n<fDStates->size(); n++) {
841 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
842 int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
843 if (positionsIdx >= 0) {
844 U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
845 uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
846 U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
847 // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
848 // printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n",
849 // __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
850 // }
851 sd->fLookAhead = lookaheadSlot;
852 }
853 }
854 }
855 }
856
857
858
859
860 //-----------------------------------------------------------------------------
861 //
862 // flagTaggedStates
863 //
864 //-----------------------------------------------------------------------------
flagTaggedStates()865 void RBBITableBuilder::flagTaggedStates() {
866 if (U_FAILURE(*fStatus)) {
867 return;
868 }
869 UVector tagNodes(*fStatus);
870 RBBINode *tagNode;
871 int32_t i;
872 int32_t n;
873
874 if (U_FAILURE(*fStatus)) {
875 return;
876 }
877 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
878 if (U_FAILURE(*fStatus)) {
879 return;
880 }
881 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
882 tagNode = (RBBINode *)tagNodes.elementAt(i);
883
884 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
885 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
886 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
887 sortedAdd(&sd->fTagVals, tagNode->fVal);
888 }
889 }
890 }
891 }
892
893
894
895
896 //-----------------------------------------------------------------------------
897 //
898 // mergeRuleStatusVals
899 //
900 // Update the global table of rule status {tag} values
901 // The rule builder has a global vector of status values that are common
902 // for all tables. Merge the ones from this table into the global set.
903 //
904 //-----------------------------------------------------------------------------
mergeRuleStatusVals()905 void RBBITableBuilder::mergeRuleStatusVals() {
906 //
907 // The basic outline of what happens here is this...
908 //
909 // for each state in this state table
910 // if the status tag list for this state is in the global statuses list
911 // record where and
912 // continue with the next state
913 // else
914 // add the tag list for this state to the global list.
915 //
916 int i;
917 int n;
918
919 // Pre-set a single tag of {0} into the table.
920 // We will need this as a default, for rule sets with no explicit tagging.
921 if (fRB->fRuleStatusVals->size() == 0) {
922 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
923 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
924 }
925
926 // For each state
927 for (n=0; n<fDStates->size(); n++) {
928 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
929 UVector *thisStatesTagValues = sd->fTagVals;
930 if (thisStatesTagValues == nullptr) {
931 // No tag values are explicitly associated with this state.
932 // Set the default tag value.
933 sd->fTagsIdx = 0;
934 continue;
935 }
936
937 // There are tag(s) associated with this state.
938 // fTagsIdx will be the index into the global tag list for this state's tag values.
939 // Initial value of -1 flags that we haven't got it set yet.
940 sd->fTagsIdx = -1;
941 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
942 int32_t nextTagGroupStart = 0;
943
944 // Loop runs once per group of tags in the global list
945 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
946 thisTagGroupStart = nextTagGroupStart;
947 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
948 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
949 // The number of tags for this state is different from
950 // the number of tags in this group from the global list.
951 // Continue with the next group from the global list.
952 continue;
953 }
954 // The lengths match, go ahead and compare the actual tag values
955 // between this state and the group from the global list.
956 for (i=0; i<thisStatesTagValues->size(); i++) {
957 if (thisStatesTagValues->elementAti(i) !=
958 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
959 // Mismatch.
960 break;
961 }
962 }
963
964 if (i == thisStatesTagValues->size()) {
965 // We found a set of tag values in the global list that match
966 // those for this state. Use them.
967 sd->fTagsIdx = thisTagGroupStart;
968 break;
969 }
970 }
971
972 if (sd->fTagsIdx == -1) {
973 // No suitable entry in the global tag list already. Add one
974 sd->fTagsIdx = fRB->fRuleStatusVals->size();
975 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
976 for (i=0; i<thisStatesTagValues->size(); i++) {
977 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
978 }
979 }
980 }
981 }
982
983
984
985
986
987
988
989 //-----------------------------------------------------------------------------
990 //
991 // sortedAdd Add a value to a vector of sorted values (ints).
992 // Do not replicate entries; if the value is already there, do not
993 // add a second one.
994 // Lazily create the vector if it does not already exist.
995 //
996 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)997 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
998 int32_t i;
999
1000 if (*vector == nullptr) {
1001 *vector = new UVector(*fStatus);
1002 }
1003 if (*vector == nullptr || U_FAILURE(*fStatus)) {
1004 return;
1005 }
1006 UVector *vec = *vector;
1007 int32_t vSize = vec->size();
1008 for (i=0; i<vSize; i++) {
1009 int32_t valAtI = vec->elementAti(i);
1010 if (valAtI == val) {
1011 // The value is already in the vector. Don't add it again.
1012 return;
1013 }
1014 if (valAtI > val) {
1015 break;
1016 }
1017 }
1018 vec->insertElementAt(val, i, *fStatus);
1019 }
1020
1021
1022
1023 //-----------------------------------------------------------------------------
1024 //
1025 // setAdd Set operation on UVector
1026 // dest = dest union source
1027 // Elements may only appear once and must be sorted.
1028 //
1029 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)1030 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
1031 U_ASSERT(!dest->hasDeleter());
1032 U_ASSERT(!source->hasDeleter());
1033 int32_t destOriginalSize = dest->size();
1034 int32_t sourceSize = source->size();
1035 int32_t di = 0;
1036 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
1037 void **destPtr, **sourcePtr;
1038 void **destLim, **sourceLim;
1039
1040 if (destOriginalSize > destArray.getCapacity()) {
1041 if (destArray.resize(destOriginalSize) == nullptr) {
1042 return;
1043 }
1044 }
1045 destPtr = destArray.getAlias();
1046 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
1047
1048 if (sourceSize > sourceArray.getCapacity()) {
1049 if (sourceArray.resize(sourceSize) == nullptr) {
1050 return;
1051 }
1052 }
1053 sourcePtr = sourceArray.getAlias();
1054 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
1055
1056 // Avoid multiple "get element" calls by getting the contents into arrays
1057 (void) dest->toArray(destPtr);
1058 (void) source->toArray(sourcePtr);
1059
1060 dest->setSize(sourceSize+destOriginalSize, *fStatus);
1061 if (U_FAILURE(*fStatus)) {
1062 return;
1063 }
1064
1065 while (sourcePtr < sourceLim && destPtr < destLim) {
1066 if (*destPtr == *sourcePtr) {
1067 dest->setElementAt(*sourcePtr++, di++);
1068 destPtr++;
1069 }
1070 // This check is required for machines with segmented memory, like i5/OS.
1071 // Direct pointer comparison is not recommended.
1072 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1073 dest->setElementAt(*destPtr++, di++);
1074 }
1075 else { /* *sourcePtr < *destPtr */
1076 dest->setElementAt(*sourcePtr++, di++);
1077 }
1078 }
1079
1080 // At most one of these two cleanup loops will execute
1081 while (destPtr < destLim) {
1082 dest->setElementAt(*destPtr++, di++);
1083 }
1084 while (sourcePtr < sourceLim) {
1085 dest->setElementAt(*sourcePtr++, di++);
1086 }
1087
1088 dest->setSize(di, *fStatus);
1089 }
1090
1091
1092
1093 //-----------------------------------------------------------------------------
1094 //
1095 // setEqual Set operation on UVector.
1096 // Compare for equality.
1097 // Elements must be sorted.
1098 //
1099 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1100 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1101 return a->equals(*b);
1102 }
1103
1104
1105 //-----------------------------------------------------------------------------
1106 //
1107 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1108 // for each node in the tree.
1109 //
1110 //-----------------------------------------------------------------------------
1111 #ifdef RBBI_DEBUG
printPosSets(RBBINode * n)1112 void RBBITableBuilder::printPosSets(RBBINode *n) {
1113 if (n==nullptr) {
1114 return;
1115 }
1116 printf("\n");
1117 RBBINode::printNodeHeader();
1118 RBBINode::printNode(n);
1119 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"true":"false");
1120
1121 RBBIDebugPrintf(" firstpos: ");
1122 printSet(n->fFirstPosSet);
1123
1124 RBBIDebugPrintf(" lastpos: ");
1125 printSet(n->fLastPosSet);
1126
1127 RBBIDebugPrintf(" followpos: ");
1128 printSet(n->fFollowPos);
1129
1130 printPosSets(n->fLeftChild);
1131 printPosSets(n->fRightChild);
1132 }
1133 #endif
1134
1135 //
1136 // findDuplCharClassFrom()
1137 //
findDuplCharClassFrom(IntPair * categories)1138 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1139 int32_t numStates = fDStates->size();
1140 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1141
1142 for (; categories->first < numCols-1; categories->first++) {
1143 // Note: dictionary & non-dictionary columns cannot be merged.
1144 // The limitSecond value prevents considering mixed pairs.
1145 // Dictionary categories are >= DictCategoriesStart.
1146 // Non dict categories are < DictCategoriesStart.
1147 int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
1148 fRB->fSetBuilder->getDictCategoriesStart() : numCols;
1149 for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
1150 // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1151 uint16_t table_base = 0;
1152 uint16_t table_dupl = 1;
1153 for (int32_t state=0; state<numStates; state++) {
1154 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1155 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1156 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1157 if (table_base != table_dupl) {
1158 break;
1159 }
1160 }
1161 if (table_base == table_dupl) {
1162 return true;
1163 }
1164 }
1165 }
1166 return false;
1167 }
1168
1169
1170 //
1171 // removeColumn()
1172 //
removeColumn(int32_t column)1173 void RBBITableBuilder::removeColumn(int32_t column) {
1174 int32_t numStates = fDStates->size();
1175 for (int32_t state=0; state<numStates; state++) {
1176 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1177 U_ASSERT(column < sd->fDtran->size());
1178 sd->fDtran->removeElementAt(column);
1179 }
1180 }
1181
1182 /*
1183 * findDuplicateState
1184 */
findDuplicateState(IntPair * states)1185 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1186 int32_t numStates = fDStates->size();
1187 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1188
1189 for (; states->first<numStates-1; states->first++) {
1190 RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1191 for (states->second=states->first+1; states->second<numStates; states->second++) {
1192 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1193 if (firstSD->fAccepting != duplSD->fAccepting ||
1194 firstSD->fLookAhead != duplSD->fLookAhead ||
1195 firstSD->fTagsIdx != duplSD->fTagsIdx) {
1196 continue;
1197 }
1198 bool rowsMatch = true;
1199 for (int32_t col=0; col < numCols; ++col) {
1200 int32_t firstVal = firstSD->fDtran->elementAti(col);
1201 int32_t duplVal = duplSD->fDtran->elementAti(col);
1202 if (!((firstVal == duplVal) ||
1203 ((firstVal == states->first || firstVal == states->second) &&
1204 (duplVal == states->first || duplVal == states->second)))) {
1205 rowsMatch = false;
1206 break;
1207 }
1208 }
1209 if (rowsMatch) {
1210 return true;
1211 }
1212 }
1213 }
1214 return false;
1215 }
1216
1217
findDuplicateSafeState(IntPair * states)1218 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1219 int32_t numStates = fSafeTable->size();
1220
1221 for (; states->first<numStates-1; states->first++) {
1222 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1223 for (states->second=states->first+1; states->second<numStates; states->second++) {
1224 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1225 bool rowsMatch = true;
1226 int32_t numCols = firstRow->length();
1227 for (int32_t col=0; col < numCols; ++col) {
1228 int32_t firstVal = firstRow->charAt(col);
1229 int32_t duplVal = duplRow->charAt(col);
1230 if (!((firstVal == duplVal) ||
1231 ((firstVal == states->first || firstVal == states->second) &&
1232 (duplVal == states->first || duplVal == states->second)))) {
1233 rowsMatch = false;
1234 break;
1235 }
1236 }
1237 if (rowsMatch) {
1238 return true;
1239 }
1240 }
1241 }
1242 return false;
1243 }
1244
1245
removeState(IntPair duplStates)1246 void RBBITableBuilder::removeState(IntPair duplStates) {
1247 const int32_t keepState = duplStates.first;
1248 const int32_t duplState = duplStates.second;
1249 U_ASSERT(keepState < duplState);
1250 U_ASSERT(duplState < fDStates->size());
1251
1252 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1253 fDStates->removeElementAt(duplState);
1254 delete duplSD;
1255
1256 int32_t numStates = fDStates->size();
1257 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1258 for (int32_t state=0; state<numStates; ++state) {
1259 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1260 for (int32_t col=0; col<numCols; col++) {
1261 int32_t existingVal = sd->fDtran->elementAti(col);
1262 int32_t newVal = existingVal;
1263 if (existingVal == duplState) {
1264 newVal = keepState;
1265 } else if (existingVal > duplState) {
1266 newVal = existingVal - 1;
1267 }
1268 sd->fDtran->setElementAt(newVal, col);
1269 }
1270 }
1271 }
1272
removeSafeState(IntPair duplStates)1273 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1274 const int32_t keepState = duplStates.first;
1275 const int32_t duplState = duplStates.second;
1276 U_ASSERT(keepState < duplState);
1277 U_ASSERT(duplState < fSafeTable->size());
1278
1279 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
1280 // and will auto-delete the removed element.
1281 int32_t numStates = fSafeTable->size();
1282 for (int32_t state=0; state<numStates; ++state) {
1283 UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1284 int32_t numCols = sd->length();
1285 for (int32_t col=0; col<numCols; col++) {
1286 int32_t existingVal = sd->charAt(col);
1287 int32_t newVal = existingVal;
1288 if (existingVal == duplState) {
1289 newVal = keepState;
1290 } else if (existingVal > duplState) {
1291 newVal = existingVal - 1;
1292 }
1293 sd->setCharAt(col, static_cast<char16_t>(newVal));
1294 }
1295 }
1296 }
1297
1298
1299 /*
1300 * RemoveDuplicateStates
1301 */
removeDuplicateStates()1302 int32_t RBBITableBuilder::removeDuplicateStates() {
1303 IntPair dupls = {3, 0};
1304 int32_t numStatesRemoved = 0;
1305
1306 while (findDuplicateState(&dupls)) {
1307 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1308 removeState(dupls);
1309 ++numStatesRemoved;
1310 }
1311 return numStatesRemoved;
1312 }
1313
1314
1315 //-----------------------------------------------------------------------------
1316 //
1317 // getTableSize() Calculate the size of the runtime form of this
1318 // state transition table.
1319 //
1320 //-----------------------------------------------------------------------------
getTableSize() const1321 int32_t RBBITableBuilder::getTableSize() const {
1322 int32_t size = 0;
1323 int32_t numRows;
1324 int32_t numCols;
1325 int32_t rowSize;
1326
1327 if (fTree == nullptr) {
1328 return 0;
1329 }
1330
1331 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1332
1333 numRows = fDStates->size();
1334 numCols = fRB->fSetBuilder->getNumCharCategories();
1335
1336 if (use8BitsForTable()) {
1337 rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1338 } else {
1339 rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1340 }
1341 size += numRows * rowSize;
1342 return size;
1343 }
1344
use8BitsForTable() const1345 bool RBBITableBuilder::use8BitsForTable() const {
1346 return fDStates->size() <= kMaxStateFor8BitsTable;
1347 }
1348
1349 //-----------------------------------------------------------------------------
1350 //
1351 // exportTable() export the state transition table in the format required
1352 // by the runtime engine. getTableSize() bytes of memory
1353 // must be available at the output address "where".
1354 //
1355 //-----------------------------------------------------------------------------
exportTable(void * where)1356 void RBBITableBuilder::exportTable(void *where) {
1357 RBBIStateTable *table = (RBBIStateTable *)where;
1358 uint32_t state;
1359 int col;
1360
1361 if (U_FAILURE(*fStatus) || fTree == nullptr) {
1362 return;
1363 }
1364
1365 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1366 if (catCount > 0x7fff ||
1367 fDStates->size() > 0x7fff) {
1368 *fStatus = U_BRK_INTERNAL_ERROR;
1369 return;
1370 }
1371
1372 table->fNumStates = fDStates->size();
1373 table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
1374 table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
1375 table->fFlags = 0;
1376 if (use8BitsForTable()) {
1377 table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1378 table->fFlags |= RBBI_8BITS_ROWS;
1379 } else {
1380 table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1381 }
1382 if (fRB->fLookAheadHardBreak) {
1383 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1384 }
1385 if (fRB->fSetBuilder->sawBOF()) {
1386 table->fFlags |= RBBI_BOF_REQUIRED;
1387 }
1388
1389 for (state=0; state<table->fNumStates; state++) {
1390 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1391 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1392 if (use8BitsForTable()) {
1393 U_ASSERT (sd->fAccepting <= 255);
1394 U_ASSERT (sd->fLookAhead <= 255);
1395 U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255);
1396 RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1397 r8->fAccepting = sd->fAccepting;
1398 r8->fLookAhead = sd->fLookAhead;
1399 r8->fTagsIdx = sd->fTagsIdx;
1400 for (col=0; col<catCount; col++) {
1401 U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
1402 r8->fNextState[col] = sd->fDtran->elementAti(col);
1403 }
1404 } else {
1405 U_ASSERT (sd->fAccepting <= 0xffff);
1406 U_ASSERT (sd->fLookAhead <= 0xffff);
1407 U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff);
1408 row->r16.fAccepting = sd->fAccepting;
1409 row->r16.fLookAhead = sd->fLookAhead;
1410 row->r16.fTagsIdx = sd->fTagsIdx;
1411 for (col=0; col<catCount; col++) {
1412 row->r16.fNextState[col] = sd->fDtran->elementAti(col);
1413 }
1414 }
1415 }
1416 }
1417
1418
1419 /**
1420 * Synthesize a safe state table from the main state table.
1421 */
buildSafeReverseTable(UErrorCode & status)1422 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1423 // The safe table creation has three steps:
1424
1425 // 1. Identify pairs of character classes that are "safe." Safe means that boundaries
1426 // following the pair do not depend on context or state before the pair. To test
1427 // whether a pair is safe, run it through the main forward state table, starting
1428 // from each state. If the the final state is the same, no matter what the starting state,
1429 // the pair is safe.
1430 //
1431 // 2. Build a state table that recognizes the safe pairs. It's similar to their
1432 // forward table, with a column for each input character [class], and a row for
1433 // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1434 // create an additional state for each input character category; being in
1435 // one of these states means that the character has been seen, and is potentially
1436 // the first of a pair. In each of these rows, the entry for the second character
1437 // of a safe pair is set to the stop state (0), indicating that a match was found.
1438 // All other table entries are set to the state corresponding the current input
1439 // character, allowing that character to be the of a start following pair.
1440 //
1441 // Because the safe rules are to be run in reverse, moving backwards in the text,
1442 // the first and second pair categories are swapped when building the table.
1443 //
1444 // 3. Compress the table. There are typically many rows (states) that are
1445 // equivalent - that have zeroes (match completed) in the same columns -
1446 // and can be folded together.
1447
1448 // Each safe pair is stored as two UChars in the safePair string.
1449 UnicodeString safePairs;
1450
1451 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1452 int32_t numStates = fDStates->size();
1453
1454 for (int32_t c1=0; c1<numCharClasses; ++c1) {
1455 for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1456 int32_t wantedEndState = -1;
1457 int32_t endState = 0;
1458 for (int32_t startState = 1; startState < numStates; ++startState) {
1459 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1460 int32_t s2 = startStateD->fDtran->elementAti(c1);
1461 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1462 endState = s2StateD->fDtran->elementAti(c2);
1463 if (wantedEndState < 0) {
1464 wantedEndState = endState;
1465 } else {
1466 if (wantedEndState != endState) {
1467 break;
1468 }
1469 }
1470 }
1471 if (wantedEndState == endState) {
1472 safePairs.append((char16_t)c1);
1473 safePairs.append((char16_t)c2);
1474 // printf("(%d, %d) ", c1, c2);
1475 }
1476 }
1477 // printf("\n");
1478 }
1479
1480 // Populate the initial safe table.
1481 // The table as a whole is UVector<UnicodeString>
1482 // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1483 // Row 0 is the stop state.
1484 // Row 1 is the start state.
1485 // Row 2 and beyond are other states, initially one per char class, but
1486 // after initial construction, many of the states will be combined, compacting the table.
1487 // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1488 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1489
1490 U_ASSERT(fSafeTable == nullptr);
1491 LocalPointer<UVector> lpSafeTable(
1492 new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status), status);
1493 if (U_FAILURE(status)) {
1494 return;
1495 }
1496 fSafeTable = lpSafeTable.orphan();
1497 for (int32_t row=0; row<numCharClasses + 2; ++row) {
1498 LocalPointer<UnicodeString> lpString(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1499 fSafeTable->adoptElement(lpString.orphan(), status);
1500 }
1501 if (U_FAILURE(status)) {
1502 return;
1503 }
1504
1505 // From the start state, each input char class transitions to the state for that input.
1506 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1507 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1508 // Note: +2 for the start & stop state.
1509 startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1510 }
1511
1512 // Initially make every other state table row look like the start state row,
1513 for (int32_t row=2; row<numCharClasses+2; ++row) {
1514 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1515 rowState = startState; // UnicodeString assignment, copies contents.
1516 }
1517
1518 // Run through the safe pairs, set the next state to zero when pair has been seen.
1519 // Zero being the stop state, meaning we found a safe point.
1520 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1521 int32_t c1 = safePairs.charAt(pairIdx);
1522 int32_t c2 = safePairs.charAt(pairIdx + 1);
1523
1524 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1525 rowState.setCharAt(c1, 0);
1526 }
1527
1528 // Remove duplicate or redundant rows from the table.
1529 IntPair states = {1, 0};
1530 while (findDuplicateSafeState(&states)) {
1531 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1532 removeSafeState(states);
1533 }
1534 }
1535
1536
1537 //-----------------------------------------------------------------------------
1538 //
1539 // getSafeTableSize() Calculate the size of the runtime form of this
1540 // safe state table.
1541 //
1542 //-----------------------------------------------------------------------------
getSafeTableSize() const1543 int32_t RBBITableBuilder::getSafeTableSize() const {
1544 int32_t size = 0;
1545 int32_t numRows;
1546 int32_t numCols;
1547 int32_t rowSize;
1548
1549 if (fSafeTable == nullptr) {
1550 return 0;
1551 }
1552
1553 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1554
1555 numRows = fSafeTable->size();
1556 numCols = fRB->fSetBuilder->getNumCharCategories();
1557
1558 if (use8BitsForSafeTable()) {
1559 rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1560 } else {
1561 rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1562 }
1563 size += numRows * rowSize;
1564 return size;
1565 }
1566
use8BitsForSafeTable() const1567 bool RBBITableBuilder::use8BitsForSafeTable() const {
1568 return fSafeTable->size() <= kMaxStateFor8BitsTable;
1569 }
1570
1571 //-----------------------------------------------------------------------------
1572 //
1573 // exportSafeTable() export the state transition table in the format required
1574 // by the runtime engine. getTableSize() bytes of memory
1575 // must be available at the output address "where".
1576 //
1577 //-----------------------------------------------------------------------------
exportSafeTable(void * where)1578 void RBBITableBuilder::exportSafeTable(void *where) {
1579 RBBIStateTable *table = (RBBIStateTable *)where;
1580 uint32_t state;
1581 int col;
1582
1583 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1584 return;
1585 }
1586
1587 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1588 if (catCount > 0x7fff ||
1589 fSafeTable->size() > 0x7fff) {
1590 *fStatus = U_BRK_INTERNAL_ERROR;
1591 return;
1592 }
1593
1594 table->fNumStates = fSafeTable->size();
1595 table->fFlags = 0;
1596 if (use8BitsForSafeTable()) {
1597 table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1598 table->fFlags |= RBBI_8BITS_ROWS;
1599 } else {
1600 table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1601 }
1602
1603 for (state=0; state<table->fNumStates; state++) {
1604 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1605 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1606 if (use8BitsForSafeTable()) {
1607 RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1608 r8->fAccepting = 0;
1609 r8->fLookAhead = 0;
1610 r8->fTagsIdx = 0;
1611 for (col=0; col<catCount; col++) {
1612 U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
1613 r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col));
1614 }
1615 } else {
1616 row->r16.fAccepting = 0;
1617 row->r16.fLookAhead = 0;
1618 row->r16.fTagsIdx = 0;
1619 for (col=0; col<catCount; col++) {
1620 row->r16.fNextState[col] = rowString->charAt(col);
1621 }
1622 }
1623 }
1624 }
1625
1626
1627
1628
1629 //-----------------------------------------------------------------------------
1630 //
1631 // printSet Debug function. Print the contents of a UVector
1632 //
1633 //-----------------------------------------------------------------------------
1634 #ifdef RBBI_DEBUG
printSet(UVector * s)1635 void RBBITableBuilder::printSet(UVector *s) {
1636 int32_t i;
1637 for (i=0; i<s->size(); i++) {
1638 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1639 RBBIDebugPrintf("%5d", v==nullptr? -1 : v->fSerialNum);
1640 }
1641 RBBIDebugPrintf("\n");
1642 }
1643 #endif
1644
1645
1646 //-----------------------------------------------------------------------------
1647 //
1648 // printStates Debug Function. Dump the fully constructed state transition table.
1649 //
1650 //-----------------------------------------------------------------------------
1651 #ifdef RBBI_DEBUG
printStates()1652 void RBBITableBuilder::printStates() {
1653 int c; // input "character"
1654 int n; // state number
1655
1656 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1657 RBBIDebugPrintf(" | Acc LA Tag");
1658 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1659 RBBIDebugPrintf(" %3d", c);
1660 }
1661 RBBIDebugPrintf("\n");
1662 RBBIDebugPrintf(" |---------------");
1663 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1664 RBBIDebugPrintf("----");
1665 }
1666 RBBIDebugPrintf("\n");
1667
1668 for (n=0; n<fDStates->size(); n++) {
1669 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1670 RBBIDebugPrintf(" %3d | " , n);
1671 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1672 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1673 RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
1674 }
1675 RBBIDebugPrintf("\n");
1676 }
1677 RBBIDebugPrintf("\n\n");
1678 }
1679 #endif
1680
1681
1682 //-----------------------------------------------------------------------------
1683 //
1684 // printSafeTable Debug Function. Dump the fully constructed safe table.
1685 //
1686 //-----------------------------------------------------------------------------
1687 #ifdef RBBI_DEBUG
printReverseTable()1688 void RBBITableBuilder::printReverseTable() {
1689 int c; // input "character"
1690 int n; // state number
1691
1692 RBBIDebugPrintf(" Safe Reverse Table \n");
1693 if (fSafeTable == nullptr) {
1694 RBBIDebugPrintf(" --- nullptr ---\n");
1695 return;
1696 }
1697 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1698 RBBIDebugPrintf(" | Acc LA Tag");
1699 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1700 RBBIDebugPrintf(" %2d", c);
1701 }
1702 RBBIDebugPrintf("\n");
1703 RBBIDebugPrintf(" |---------------");
1704 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1705 RBBIDebugPrintf("---");
1706 }
1707 RBBIDebugPrintf("\n");
1708
1709 for (n=0; n<fSafeTable->size(); n++) {
1710 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1711 RBBIDebugPrintf(" %3d | " , n);
1712 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
1713 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1714 RBBIDebugPrintf(" %2d", rowString->charAt(c));
1715 }
1716 RBBIDebugPrintf("\n");
1717 }
1718 RBBIDebugPrintf("\n\n");
1719 }
1720 #endif
1721
1722
1723
1724 //-----------------------------------------------------------------------------
1725 //
1726 // printRuleStatusTable Debug Function. Dump the common rule status table
1727 //
1728 //-----------------------------------------------------------------------------
1729 #ifdef RBBI_DEBUG
printRuleStatusTable()1730 void RBBITableBuilder::printRuleStatusTable() {
1731 int32_t thisRecord = 0;
1732 int32_t nextRecord = 0;
1733 int i;
1734 UVector *tbl = fRB->fRuleStatusVals;
1735
1736 RBBIDebugPrintf("index | tags \n");
1737 RBBIDebugPrintf("-------------------\n");
1738
1739 while (nextRecord < tbl->size()) {
1740 thisRecord = nextRecord;
1741 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1742 RBBIDebugPrintf("%4d ", thisRecord);
1743 for (i=thisRecord+1; i<nextRecord; i++) {
1744 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1745 }
1746 RBBIDebugPrintf("\n");
1747 }
1748 RBBIDebugPrintf("\n\n");
1749 }
1750 #endif
1751
1752
1753 //-----------------------------------------------------------------------------
1754 //
1755 // RBBIStateDescriptor Methods. This is a very struct-like class
1756 // Most access is directly to the fields.
1757 //
1758 //-----------------------------------------------------------------------------
1759
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1760 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1761 fMarked = false;
1762 fAccepting = 0;
1763 fLookAhead = 0;
1764 fTagsIdx = 0;
1765 fTagVals = nullptr;
1766 fPositions = nullptr;
1767 fDtran = nullptr;
1768
1769 fDtran = new UVector32(lastInputSymbol+1, *fStatus);
1770 if (U_FAILURE(*fStatus)) {
1771 return;
1772 }
1773 if (fDtran == nullptr) {
1774 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1775 return;
1776 }
1777 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
1778 // It is indexed by input symbols, and will
1779 // hold the next state number for each
1780 // symbol.
1781 }
1782
1783
~RBBIStateDescriptor()1784 RBBIStateDescriptor::~RBBIStateDescriptor() {
1785 delete fPositions;
1786 delete fDtran;
1787 delete fTagVals;
1788 fPositions = nullptr;
1789 fDtran = nullptr;
1790 fTagVals = nullptr;
1791 }
1792
1793 U_NAMESPACE_END
1794
1795 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1796