xref: /aosp_15_r20/external/cronet/third_party/icu/source/common/rbbitblb.cpp (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (c) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 //  rbbitblb.cpp
11 //
12 
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbiscan.h"
22 #include "rbbisetb.h"
23 #include "rbbidata.h"
24 #include "cstring.h"
25 #include "uassert.h"
26 #include "uvectr32.h"
27 #include "cmemory.h"
28 
29 U_NAMESPACE_BEGIN
30 
31 const int32_t kMaxStateFor8BitsTable = 255;
32 
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode,UErrorCode & status)33 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
34         fRB(rb),
35         fTree(*rootNode),
36         fStatus(&status),
37         fDStates(nullptr),
38         fSafeTable(nullptr) {
39     if (U_FAILURE(status)) {
40         return;
41     }
42     // fDStates is UVector<RBBIStateDescriptor *>
43     fDStates = new UVector(status);
44     if (U_SUCCESS(status) && fDStates == nullptr ) {
45         status = U_MEMORY_ALLOCATION_ERROR;
46     }
47 }
48 
49 
50 
~RBBITableBuilder()51 RBBITableBuilder::~RBBITableBuilder() {
52     int i;
53     for (i=0; i<fDStates->size(); i++) {
54         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
55     }
56     delete fDStates;
57     delete fSafeTable;
58     delete fLookAheadRuleMap;
59 }
60 
61 
62 //-----------------------------------------------------------------------------
63 //
64 //   RBBITableBuilder::buildForwardTable  -  This is the main function for building
65 //                               the DFA state transition table from the RBBI rules parse tree.
66 //
67 //-----------------------------------------------------------------------------
buildForwardTable()68 void  RBBITableBuilder::buildForwardTable() {
69 
70     if (U_FAILURE(*fStatus)) {
71         return;
72     }
73 
74     // If there were no rules, just return.  This situation can easily arise
75     //   for the reverse rules.
76     if (fTree==nullptr) {
77         return;
78     }
79 
80     //
81     // Walk through the tree, replacing any references to $variables with a copy of the
82     //   parse tree for the substitution expression.
83     //
84     fTree = fTree->flattenVariables();
85 #ifdef RBBI_DEBUG
86     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
87         RBBIDebugPuts("\nParse tree after flattening variable references.");
88         RBBINode::printTree(fTree, true);
89     }
90 #endif
91 
92     //
93     // If the rules contained any references to {bof}
94     //   add a {bof} <cat> <former root of tree> to the
95     //   tree.  Means that all matches must start out with the
96     //   {bof} fake character.
97     //
98     if (fRB->fSetBuilder->sawBOF()) {
99         RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
100         RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
101         // Delete and exit if memory allocation failed.
102         if (bofTop == nullptr || bofLeaf == nullptr) {
103             *fStatus = U_MEMORY_ALLOCATION_ERROR;
104             delete bofTop;
105             delete bofLeaf;
106             return;
107         }
108         bofTop->fLeftChild  = bofLeaf;
109         bofTop->fRightChild = fTree;
110         bofLeaf->fParent    = bofTop;
111         bofLeaf->fVal       = 2;      // Reserved value for {bof}.
112         fTree               = bofTop;
113     }
114 
115     //
116     // Add a unique right-end marker to the expression.
117     //   Appears as a cat-node, left child being the original tree,
118     //   right child being the end marker.
119     //
120     RBBINode *cn = new RBBINode(RBBINode::opCat);
121     // Exit if memory allocation failed.
122     if (cn == nullptr) {
123         *fStatus = U_MEMORY_ALLOCATION_ERROR;
124         return;
125     }
126     cn->fLeftChild = fTree;
127     fTree->fParent = cn;
128     RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
129     // Delete and exit if memory allocation failed.
130     if (cn->fRightChild == nullptr) {
131         *fStatus = U_MEMORY_ALLOCATION_ERROR;
132         delete cn;
133         return;
134     }
135     cn->fRightChild->fParent = cn;
136     fTree = cn;
137 
138     //
139     //  Replace all references to UnicodeSets with the tree for the equivalent
140     //      expression.
141     //
142     fTree->flattenSets();
143 #ifdef RBBI_DEBUG
144     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
145         RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
146         RBBINode::printTree(fTree, true);
147     }
148 #endif
149 
150 
151     //
152     // calculate the functions nullable, firstpos, lastpos and followpos on
153     // nodes in the parse tree.
154     //    See the algorithm description in Aho.
155     //    Understanding how this works by looking at the code alone will be
156     //       nearly impossible.
157     //
158     calcNullable(fTree);
159     calcFirstPos(fTree);
160     calcLastPos(fTree);
161     calcFollowPos(fTree);
162     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
163         RBBIDebugPuts("\n");
164         printPosSets(fTree);
165     }
166 
167     //
168     //  For "chained" rules, modify the followPos sets
169     //
170     if (fRB->fChainRules) {
171         calcChainedFollowPos(fTree, endMarkerNode);
172     }
173 
174     //
175     //  BOF (start of input) test fixup.
176     //
177     if (fRB->fSetBuilder->sawBOF()) {
178         bofFixup();
179     }
180 
181     //
182     // Build the DFA state transition tables.
183     //
184     buildStateTable();
185     mapLookAheadRules();
186     flagAcceptingStates();
187     flagLookAheadStates();
188     flagTaggedStates();
189 
190     //
191     // Update the global table of rule status {tag} values
192     // The rule builder has a global vector of status values that are common
193     //    for all tables.  Merge the ones from this table into the global set.
194     //
195     mergeRuleStatusVals();
196 }
197 
198 
199 
200 //-----------------------------------------------------------------------------
201 //
202 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
203 //
204 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)205 void RBBITableBuilder::calcNullable(RBBINode *n) {
206     if (n == nullptr) {
207         return;
208     }
209     if (n->fType == RBBINode::setRef ||
210         n->fType == RBBINode::endMark ) {
211         // These are non-empty leaf node types.
212         n->fNullable = false;
213         return;
214     }
215 
216     if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
217         // Lookahead marker node.  It's a leaf, so no recursion on children.
218         // It's nullable because it does not match any literal text from the input stream.
219         n->fNullable = true;
220         return;
221     }
222 
223 
224     // The node is not a leaf.
225     //  Calculate nullable on its children.
226     calcNullable(n->fLeftChild);
227     calcNullable(n->fRightChild);
228 
229     // Apply functions from table 3.40 in Aho
230     if (n->fType == RBBINode::opOr) {
231         n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
232     }
233     else if (n->fType == RBBINode::opCat) {
234         n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
235     }
236     else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
237         n->fNullable = true;
238     }
239     else {
240         n->fNullable = false;
241     }
242 }
243 
244 
245 
246 
247 //-----------------------------------------------------------------------------
248 //
249 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
250 //
251 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)252 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
253     if (n == nullptr) {
254         return;
255     }
256     if (n->fType == RBBINode::leafChar  ||
257         n->fType == RBBINode::endMark   ||
258         n->fType == RBBINode::lookAhead ||
259         n->fType == RBBINode::tag) {
260         // These are non-empty leaf node types.
261         // Note: In order to maintain the sort invariant on the set,
262         // this function should only be called on a node whose set is
263         // empty to start with.
264         n->fFirstPosSet->addElement(n, *fStatus);
265         return;
266     }
267 
268     // The node is not a leaf.
269     //  Calculate firstPos on its children.
270     calcFirstPos(n->fLeftChild);
271     calcFirstPos(n->fRightChild);
272 
273     // Apply functions from table 3.40 in Aho
274     if (n->fType == RBBINode::opOr) {
275         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
276         setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277     }
278     else if (n->fType == RBBINode::opCat) {
279         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
280         if (n->fLeftChild->fNullable) {
281             setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
282         }
283     }
284     else if (n->fType == RBBINode::opStar ||
285              n->fType == RBBINode::opQuestion ||
286              n->fType == RBBINode::opPlus) {
287         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
288     }
289 }
290 
291 
292 
293 //-----------------------------------------------------------------------------
294 //
295 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
296 //
297 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)298 void RBBITableBuilder::calcLastPos(RBBINode *n) {
299     if (n == nullptr) {
300         return;
301     }
302     if (n->fType == RBBINode::leafChar  ||
303         n->fType == RBBINode::endMark   ||
304         n->fType == RBBINode::lookAhead ||
305         n->fType == RBBINode::tag) {
306         // These are non-empty leaf node types.
307         // Note: In order to maintain the sort invariant on the set,
308         // this function should only be called on a node whose set is
309         // empty to start with.
310         n->fLastPosSet->addElement(n, *fStatus);
311         return;
312     }
313 
314     // The node is not a leaf.
315     //  Calculate lastPos on its children.
316     calcLastPos(n->fLeftChild);
317     calcLastPos(n->fRightChild);
318 
319     // Apply functions from table 3.40 in Aho
320     if (n->fType == RBBINode::opOr) {
321         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
322         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
323     }
324     else if (n->fType == RBBINode::opCat) {
325         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
326         if (n->fRightChild->fNullable) {
327             setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
328         }
329     }
330     else if (n->fType == RBBINode::opStar     ||
331              n->fType == RBBINode::opQuestion ||
332              n->fType == RBBINode::opPlus) {
333         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
334     }
335 }
336 
337 
338 
339 //-----------------------------------------------------------------------------
340 //
341 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
342 //
343 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)344 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
345     if (n == nullptr ||
346         n->fType == RBBINode::leafChar ||
347         n->fType == RBBINode::endMark) {
348         return;
349     }
350 
351     calcFollowPos(n->fLeftChild);
352     calcFollowPos(n->fRightChild);
353 
354     // Aho rule #1
355     if (n->fType == RBBINode::opCat) {
356         RBBINode *i;   // is 'i' in Aho's description
357         uint32_t     ix;
358 
359         UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
360 
361         for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
362             i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
363             setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
364         }
365     }
366 
367     // Aho rule #2
368     if (n->fType == RBBINode::opStar ||
369         n->fType == RBBINode::opPlus) {
370         RBBINode   *i;  // again, n and i are the names from Aho's description.
371         uint32_t    ix;
372 
373         for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
374             i = (RBBINode *)n->fLastPosSet->elementAt(ix);
375             setAdd(i->fFollowPos, n->fFirstPosSet);
376         }
377     }
378 
379 
380 
381 }
382 
383 //-----------------------------------------------------------------------------
384 //
385 //    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
386 //                        as roots of a rule to a destination vector.
387 //
388 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)389 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
390     if (node == nullptr || U_FAILURE(*fStatus)) {
391         return;
392     }
393     U_ASSERT(!dest->hasDeleter());
394     if (node->fRuleRoot) {
395         dest->addElement(node, *fStatus);
396         // Note: rules cannot nest. If we found a rule start node,
397         //       no child node can also be a start node.
398         return;
399     }
400     addRuleRootNodes(dest, node->fLeftChild);
401     addRuleRootNodes(dest, node->fRightChild);
402 }
403 
404 //-----------------------------------------------------------------------------
405 //
406 //   calcChainedFollowPos.    Modify the previously calculated followPos sets
407 //                            to implement rule chaining.  NOT described by Aho
408 //
409 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree,RBBINode * endMarkNode)410 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
411 
412     UVector         leafNodes(*fStatus);
413     if (U_FAILURE(*fStatus)) {
414         return;
415     }
416 
417     // get a list all leaf nodes
418     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
419     if (U_FAILURE(*fStatus)) {
420         return;
421     }
422 
423     // Collect all leaf nodes that can start matches for rules
424     // with inbound chaining enabled, which is the union of the
425     // firstPosition sets from each of the rule root nodes.
426 
427     UVector ruleRootNodes(*fStatus);
428     addRuleRootNodes(&ruleRootNodes, tree);
429 
430     UVector matchStartNodes(*fStatus);
431     for (int j=0; j<ruleRootNodes.size(); ++j) {
432         RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
433         if (node->fChainIn) {
434             setAdd(&matchStartNodes, node->fFirstPosSet);
435         }
436     }
437     if (U_FAILURE(*fStatus)) {
438         return;
439     }
440 
441     int32_t  endNodeIx;
442     int32_t  startNodeIx;
443 
444     for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
445         RBBINode *endNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
446 
447         // Identify leaf nodes that correspond to overall rule match positions.
448         // These include the endMarkNode in their followPos sets.
449         //
450         // Note: do not consider other end marker nodes, those that are added to
451         //       look-ahead rules. These can't chain; a match immediately stops
452         //       further matching. This leaves exactly one end marker node, the one
453         //       at the end of the complete tree.
454 
455         if (!endNode->fFollowPos->contains(endMarkNode)) {
456             continue;
457         }
458 
459         // We've got a node that can end a match.
460 
461         // Now iterate over the nodes that can start a match, looking for ones
462         //   with the same char class as our ending node.
463         RBBINode *startNode;
464         for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
465             startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
466             if (startNode->fType != RBBINode::leafChar) {
467                 continue;
468             }
469 
470             if (endNode->fVal == startNode->fVal) {
471                 // The end val (character class) of one possible match is the
472                 //   same as the start of another.
473 
474                 // Add all nodes from the followPos of the start node to the
475                 //  followPos set of the end node, which will have the effect of
476                 //  letting matches transition from a match state at endNode
477                 //  to the second char of a match starting with startNode.
478                 setAdd(endNode->fFollowPos, startNode->fFollowPos);
479             }
480         }
481     }
482 }
483 
484 
485 //-----------------------------------------------------------------------------
486 //
487 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
488 //                Do an swizzle similar to chaining, modifying the followPos set of
489 //                the bofNode to include the followPos nodes from other {bot} nodes
490 //                scattered through the tree.
491 //
492 //                This function has much in common with calcChainedFollowPos().
493 //
494 //-----------------------------------------------------------------------------
bofFixup()495 void RBBITableBuilder::bofFixup() {
496 
497     if (U_FAILURE(*fStatus)) {
498         return;
499     }
500 
501     //   The parse tree looks like this ...
502     //         fTree root  --->       <cat>
503     //                               /     \       .
504     //                            <cat>   <#end node>
505     //                           /     \  .
506     //                     <bofNode>   rest
507     //                               of tree
508     //
509     //    We will be adding things to the followPos set of the <bofNode>
510     //
511     RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
512     U_ASSERT(bofNode->fType == RBBINode::leafChar);
513     U_ASSERT(bofNode->fVal == 2);
514 
515     // Get all nodes that can be the start a match of the user-written rules
516     //  (excluding the fake bofNode)
517     //  We want the nodes that can start a match in the
518     //     part labeled "rest of tree"
519     //
520     UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
521 
522     RBBINode *startNode;
523     int       startNodeIx;
524     for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
525         startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
526         if (startNode->fType != RBBINode::leafChar) {
527             continue;
528         }
529 
530         if (startNode->fVal == bofNode->fVal) {
531             //  We found a leaf node corresponding to a {bof} that was
532             //    explicitly written into a rule.
533             //  Add everything from the followPos set of this node to the
534             //    followPos set of the fake bofNode at the start of the tree.
535             //
536             setAdd(bofNode->fFollowPos, startNode->fFollowPos);
537         }
538     }
539 }
540 
541 //-----------------------------------------------------------------------------
542 //
543 //   buildStateTable()    Determine the set of runtime DFA states and the
544 //                        transition tables for these states, by the algorithm
545 //                        of fig. 3.44 in Aho.
546 //
547 //                        Most of the comments are quotes of Aho's psuedo-code.
548 //
549 //-----------------------------------------------------------------------------
buildStateTable()550 void RBBITableBuilder::buildStateTable() {
551     if (U_FAILURE(*fStatus)) {
552         return;
553     }
554     RBBIStateDescriptor *failState;
555     // Set it to nullptr to avoid uninitialized warning
556     RBBIStateDescriptor *initialState = nullptr;
557     //
558     // Add a dummy state 0 - the stop state.  Not from Aho.
559     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
560     failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
561     if (failState == nullptr) {
562         *fStatus = U_MEMORY_ALLOCATION_ERROR;
563         goto ExitBuildSTdeleteall;
564     }
565     failState->fPositions = new UVector(*fStatus);
566     if (failState->fPositions == nullptr) {
567         *fStatus = U_MEMORY_ALLOCATION_ERROR;
568     }
569     if (failState->fPositions == nullptr || U_FAILURE(*fStatus)) {
570         goto ExitBuildSTdeleteall;
571     }
572     fDStates->addElement(failState, *fStatus);
573     if (U_FAILURE(*fStatus)) {
574         goto ExitBuildSTdeleteall;
575     }
576 
577     // initially, the only unmarked state in Dstates is firstpos(root),
578     //       where toot is the root of the syntax tree for (r)#;
579     initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
580     if (initialState == nullptr) {
581         *fStatus = U_MEMORY_ALLOCATION_ERROR;
582     }
583     if (U_FAILURE(*fStatus)) {
584         goto ExitBuildSTdeleteall;
585     }
586     initialState->fPositions = new UVector(*fStatus);
587     if (initialState->fPositions == nullptr) {
588         *fStatus = U_MEMORY_ALLOCATION_ERROR;
589     }
590     if (U_FAILURE(*fStatus)) {
591         goto ExitBuildSTdeleteall;
592     }
593     setAdd(initialState->fPositions, fTree->fFirstPosSet);
594     fDStates->addElement(initialState, *fStatus);
595     if (U_FAILURE(*fStatus)) {
596         goto ExitBuildSTdeleteall;
597     }
598 
599     // while there is an unmarked state T in Dstates do begin
600     for (;;) {
601         RBBIStateDescriptor *T = nullptr;
602         int32_t              tx;
603         for (tx=1; tx<fDStates->size(); tx++) {
604             RBBIStateDescriptor *temp;
605             temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
606             if (temp->fMarked == false) {
607                 T = temp;
608                 break;
609             }
610         }
611         if (T == nullptr) {
612             break;
613         }
614 
615         // mark T;
616         T->fMarked = true;
617 
618         // for each input symbol a do begin
619         int32_t  a;
620         for (a = 1; a<=lastInputSymbol; a++) {
621             // let U be the set of positions that are in followpos(p)
622             //    for some position p in T
623             //    such that the symbol at position p is a;
624             UVector    *U = nullptr;
625             RBBINode   *p;
626             int32_t     px;
627             for (px=0; px<T->fPositions->size(); px++) {
628                 p = (RBBINode *)T->fPositions->elementAt(px);
629                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
630                     if (U == nullptr) {
631                         U = new UVector(*fStatus);
632                         if (U == nullptr) {
633                         	*fStatus = U_MEMORY_ALLOCATION_ERROR;
634                         	goto ExitBuildSTdeleteall;
635                         }
636                     }
637                     setAdd(U, p->fFollowPos);
638                 }
639             }
640 
641             // if U is not empty and not in DStates then
642             int32_t  ux = 0;
643             UBool    UinDstates = false;
644             if (U != nullptr) {
645                 U_ASSERT(U->size() > 0);
646                 int  ix;
647                 for (ix=0; ix<fDStates->size(); ix++) {
648                     RBBIStateDescriptor *temp2;
649                     temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
650                     if (setEquals(U, temp2->fPositions)) {
651                         delete U;
652                         U  = temp2->fPositions;
653                         ux = ix;
654                         UinDstates = true;
655                         break;
656                     }
657                 }
658 
659                 // Add U as an unmarked state to Dstates
660                 if (!UinDstates)
661                 {
662                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
663                     if (newState == nullptr) {
664                     	*fStatus = U_MEMORY_ALLOCATION_ERROR;
665                     }
666                     if (U_FAILURE(*fStatus)) {
667                         goto ExitBuildSTdeleteall;
668                     }
669                     newState->fPositions = U;
670                     fDStates->addElement(newState, *fStatus);
671                     if (U_FAILURE(*fStatus)) {
672                         return;
673                     }
674                     ux = fDStates->size()-1;
675                 }
676 
677                 // Dtran[T, a] := U;
678                 T->fDtran->setElementAt(ux, a);
679             }
680         }
681     }
682     return;
683     // delete local pointers only if error occurred.
684 ExitBuildSTdeleteall:
685     delete initialState;
686     delete failState;
687 }
688 
689 
690 /**
691  * mapLookAheadRules
692  *
693  */
mapLookAheadRules()694 void RBBITableBuilder::mapLookAheadRules() {
695     fLookAheadRuleMap =  new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
696     if (fLookAheadRuleMap == nullptr) {
697         *fStatus = U_MEMORY_ALLOCATION_ERROR;
698     }
699     if (U_FAILURE(*fStatus)) {
700         return;
701     }
702     fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
703 
704     for (int32_t n=0; n<fDStates->size(); n++) {
705         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
706         int32_t laSlotForState = 0;
707 
708         // Establish the look-ahead slot for this state, if the state covers
709         // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
710 
711         // If any of the look-ahead nodes already have a slot assigned, use it,
712         // otherwise assign a new one.
713 
714         bool sawLookAheadNode = false;
715         for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
716             RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
717             if (node->fType != RBBINode::NodeType::lookAhead) {
718                 continue;
719             }
720             sawLookAheadNode = true;
721             int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
722             U_ASSERT(ruleNum < fLookAheadRuleMap->size());
723             U_ASSERT(ruleNum > 0);
724             int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
725             if (laSlot != 0) {
726                 if (laSlotForState == 0) {
727                     laSlotForState = laSlot;
728                 } else {
729                     // TODO: figure out if this can fail, change to setting an error code if so.
730                     U_ASSERT(laSlot == laSlotForState);
731                 }
732             }
733         }
734         if (!sawLookAheadNode) {
735             continue;
736         }
737 
738         if (laSlotForState == 0) {
739             laSlotForState = ++fLASlotsInUse;
740         }
741 
742         // For each look ahead node covered by this state,
743         // set the mapping from the node's rule number to the look ahead slot.
744         // There can be multiple nodes/rule numbers going to the same la slot.
745 
746         for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
747             RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
748             if (node->fType != RBBINode::NodeType::lookAhead) {
749                 continue;
750             }
751             int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
752             int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
753             (void)existingVal;
754             U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
755             fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
756         }
757     }
758 
759 }
760 
761 //-----------------------------------------------------------------------------
762 //
763 //   flagAcceptingStates    Identify accepting states.
764 //                          First get a list of all of the end marker nodes.
765 //                          Then, for each state s,
766 //                              if s contains one of the end marker nodes in its list of tree positions then
767 //                                  s is an accepting state.
768 //
769 //-----------------------------------------------------------------------------
flagAcceptingStates()770 void     RBBITableBuilder::flagAcceptingStates() {
771     if (U_FAILURE(*fStatus)) {
772         return;
773     }
774     UVector     endMarkerNodes(*fStatus);
775     RBBINode    *endMarker;
776     int32_t     i;
777     int32_t     n;
778 
779     if (U_FAILURE(*fStatus)) {
780         return;
781     }
782 
783     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
784     if (U_FAILURE(*fStatus)) {
785         return;
786     }
787 
788     for (i=0; i<endMarkerNodes.size(); i++) {
789         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
790         for (n=0; n<fDStates->size(); n++) {
791             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
792             if (sd->fPositions->indexOf(endMarker) >= 0) {
793                 // Any non-zero value for fAccepting means this is an accepting node.
794                 // The value is what will be returned to the user as the break status.
795                 // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
796 
797                 if (sd->fAccepting==0) {
798                     // State hasn't been marked as accepting yet.  Do it now.
799                     sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
800                     if (sd->fAccepting == 0) {
801                         sd->fAccepting = ACCEPTING_UNCONDITIONAL;
802                     }
803                 }
804                 if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) {
805                     // Both lookahead and non-lookahead accepting for this state.
806                     // Favor the look-ahead, because a look-ahead match needs to
807                     // immediately stop the run-time engine. First match, not longest.
808                     sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
809                 }
810                 // implicit else:
811                 // if sd->fAccepting already had a value other than 0 or 1, leave it be.
812             }
813         }
814     }
815 }
816 
817 
818 //-----------------------------------------------------------------------------
819 //
820 //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
821 //
822 //-----------------------------------------------------------------------------
flagLookAheadStates()823 void     RBBITableBuilder::flagLookAheadStates() {
824     if (U_FAILURE(*fStatus)) {
825         return;
826     }
827     UVector     lookAheadNodes(*fStatus);
828     RBBINode    *lookAheadNode;
829     int32_t     i;
830     int32_t     n;
831 
832     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
833     if (U_FAILURE(*fStatus)) {
834         return;
835     }
836     for (i=0; i<lookAheadNodes.size(); i++) {
837         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
838         U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
839 
840         for (n=0; n<fDStates->size(); n++) {
841             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
842             int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
843             if (positionsIdx >= 0) {
844                 U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
845                 uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
846                 U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
847                 // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
848                 //     printf("%s:%d Bingo. sd->fLookAhead:%d   lookaheadSlot:%d\n",
849                 //            __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
850                 // }
851                 sd->fLookAhead = lookaheadSlot;
852             }
853         }
854     }
855 }
856 
857 
858 
859 
860 //-----------------------------------------------------------------------------
861 //
862 //    flagTaggedStates
863 //
864 //-----------------------------------------------------------------------------
flagTaggedStates()865 void     RBBITableBuilder::flagTaggedStates() {
866     if (U_FAILURE(*fStatus)) {
867         return;
868     }
869     UVector     tagNodes(*fStatus);
870     RBBINode    *tagNode;
871     int32_t     i;
872     int32_t     n;
873 
874     if (U_FAILURE(*fStatus)) {
875         return;
876     }
877     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
878     if (U_FAILURE(*fStatus)) {
879         return;
880     }
881     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
882         tagNode = (RBBINode *)tagNodes.elementAt(i);
883 
884         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
885             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
886             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
887                 sortedAdd(&sd->fTagVals, tagNode->fVal);
888             }
889         }
890     }
891 }
892 
893 
894 
895 
896 //-----------------------------------------------------------------------------
897 //
898 //  mergeRuleStatusVals
899 //
900 //      Update the global table of rule status {tag} values
901 //      The rule builder has a global vector of status values that are common
902 //      for all tables.  Merge the ones from this table into the global set.
903 //
904 //-----------------------------------------------------------------------------
mergeRuleStatusVals()905 void  RBBITableBuilder::mergeRuleStatusVals() {
906     //
907     //  The basic outline of what happens here is this...
908     //
909     //    for each state in this state table
910     //       if the status tag list for this state is in the global statuses list
911     //           record where and
912     //           continue with the next state
913     //       else
914     //           add the tag list for this state to the global list.
915     //
916     int i;
917     int n;
918 
919     // Pre-set a single tag of {0} into the table.
920     //   We will need this as a default, for rule sets with no explicit tagging.
921     if (fRB->fRuleStatusVals->size() == 0) {
922         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
923         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
924     }
925 
926     //    For each state
927     for (n=0; n<fDStates->size(); n++) {
928         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
929         UVector *thisStatesTagValues = sd->fTagVals;
930         if (thisStatesTagValues == nullptr) {
931             // No tag values are explicitly associated with this state.
932             //   Set the default tag value.
933             sd->fTagsIdx = 0;
934             continue;
935         }
936 
937         // There are tag(s) associated with this state.
938         //   fTagsIdx will be the index into the global tag list for this state's tag values.
939         //   Initial value of -1 flags that we haven't got it set yet.
940         sd->fTagsIdx = -1;
941         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
942         int32_t  nextTagGroupStart = 0;
943 
944         // Loop runs once per group of tags in the global list
945         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
946             thisTagGroupStart = nextTagGroupStart;
947             nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
948             if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
949                 // The number of tags for this state is different from
950                 //    the number of tags in this group from the global list.
951                 //    Continue with the next group from the global list.
952                 continue;
953             }
954             // The lengths match, go ahead and compare the actual tag values
955             //    between this state and the group from the global list.
956             for (i=0; i<thisStatesTagValues->size(); i++) {
957                 if (thisStatesTagValues->elementAti(i) !=
958                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
959                     // Mismatch.
960                     break;
961                 }
962             }
963 
964             if (i == thisStatesTagValues->size()) {
965                 // We found a set of tag values in the global list that match
966                 //   those for this state.  Use them.
967                 sd->fTagsIdx = thisTagGroupStart;
968                 break;
969             }
970         }
971 
972         if (sd->fTagsIdx == -1) {
973             // No suitable entry in the global tag list already.  Add one
974             sd->fTagsIdx = fRB->fRuleStatusVals->size();
975             fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
976             for (i=0; i<thisStatesTagValues->size(); i++) {
977                 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
978             }
979         }
980     }
981 }
982 
983 
984 
985 
986 
987 
988 
989 //-----------------------------------------------------------------------------
990 //
991 //  sortedAdd  Add a value to a vector of sorted values (ints).
992 //             Do not replicate entries; if the value is already there, do not
993 //                add a second one.
994 //             Lazily create the vector if it does not already exist.
995 //
996 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)997 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
998     int32_t i;
999 
1000     if (*vector == nullptr) {
1001         *vector = new UVector(*fStatus);
1002     }
1003     if (*vector == nullptr || U_FAILURE(*fStatus)) {
1004         return;
1005     }
1006     UVector *vec = *vector;
1007     int32_t  vSize = vec->size();
1008     for (i=0; i<vSize; i++) {
1009         int32_t valAtI = vec->elementAti(i);
1010         if (valAtI == val) {
1011             // The value is already in the vector.  Don't add it again.
1012             return;
1013         }
1014         if (valAtI > val) {
1015             break;
1016         }
1017     }
1018     vec->insertElementAt(val, i, *fStatus);
1019 }
1020 
1021 
1022 
1023 //-----------------------------------------------------------------------------
1024 //
1025 //  setAdd     Set operation on UVector
1026 //             dest = dest union source
1027 //             Elements may only appear once and must be sorted.
1028 //
1029 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)1030 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
1031     U_ASSERT(!dest->hasDeleter());
1032     U_ASSERT(!source->hasDeleter());
1033     int32_t destOriginalSize = dest->size();
1034     int32_t sourceSize       = source->size();
1035     int32_t di           = 0;
1036     MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
1037     void **destPtr, **sourcePtr;
1038     void **destLim, **sourceLim;
1039 
1040     if (destOriginalSize > destArray.getCapacity()) {
1041         if (destArray.resize(destOriginalSize) == nullptr) {
1042             return;
1043         }
1044     }
1045     destPtr = destArray.getAlias();
1046     destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
1047 
1048     if (sourceSize > sourceArray.getCapacity()) {
1049         if (sourceArray.resize(sourceSize) == nullptr) {
1050             return;
1051         }
1052     }
1053     sourcePtr = sourceArray.getAlias();
1054     sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
1055 
1056     // Avoid multiple "get element" calls by getting the contents into arrays
1057     (void) dest->toArray(destPtr);
1058     (void) source->toArray(sourcePtr);
1059 
1060     dest->setSize(sourceSize+destOriginalSize, *fStatus);
1061     if (U_FAILURE(*fStatus)) {
1062         return;
1063     }
1064 
1065     while (sourcePtr < sourceLim && destPtr < destLim) {
1066         if (*destPtr == *sourcePtr) {
1067             dest->setElementAt(*sourcePtr++, di++);
1068             destPtr++;
1069         }
1070         // This check is required for machines with segmented memory, like i5/OS.
1071         // Direct pointer comparison is not recommended.
1072         else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1073             dest->setElementAt(*destPtr++, di++);
1074         }
1075         else { /* *sourcePtr < *destPtr */
1076             dest->setElementAt(*sourcePtr++, di++);
1077         }
1078     }
1079 
1080     // At most one of these two cleanup loops will execute
1081     while (destPtr < destLim) {
1082         dest->setElementAt(*destPtr++, di++);
1083     }
1084     while (sourcePtr < sourceLim) {
1085         dest->setElementAt(*sourcePtr++, di++);
1086     }
1087 
1088     dest->setSize(di, *fStatus);
1089 }
1090 
1091 
1092 
1093 //-----------------------------------------------------------------------------
1094 //
1095 //  setEqual    Set operation on UVector.
1096 //              Compare for equality.
1097 //              Elements must be sorted.
1098 //
1099 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1100 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1101     return a->equals(*b);
1102 }
1103 
1104 
1105 //-----------------------------------------------------------------------------
1106 //
1107 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1108 //                 for each node in the tree.
1109 //
1110 //-----------------------------------------------------------------------------
1111 #ifdef RBBI_DEBUG
printPosSets(RBBINode * n)1112 void RBBITableBuilder::printPosSets(RBBINode *n) {
1113     if (n==nullptr) {
1114         return;
1115     }
1116     printf("\n");
1117     RBBINode::printNodeHeader();
1118     RBBINode::printNode(n);
1119     RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"true":"false");
1120 
1121     RBBIDebugPrintf("         firstpos:  ");
1122     printSet(n->fFirstPosSet);
1123 
1124     RBBIDebugPrintf("         lastpos:   ");
1125     printSet(n->fLastPosSet);
1126 
1127     RBBIDebugPrintf("         followpos: ");
1128     printSet(n->fFollowPos);
1129 
1130     printPosSets(n->fLeftChild);
1131     printPosSets(n->fRightChild);
1132 }
1133 #endif
1134 
1135 //
1136 //    findDuplCharClassFrom()
1137 //
findDuplCharClassFrom(IntPair * categories)1138 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1139     int32_t numStates = fDStates->size();
1140     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1141 
1142     for (; categories->first < numCols-1; categories->first++) {
1143         // Note: dictionary & non-dictionary columns cannot be merged.
1144         //       The limitSecond value prevents considering mixed pairs.
1145         //       Dictionary categories are >= DictCategoriesStart.
1146         //       Non dict categories are   <  DictCategoriesStart.
1147         int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
1148             fRB->fSetBuilder->getDictCategoriesStart() : numCols;
1149         for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
1150             // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1151             uint16_t table_base = 0;
1152             uint16_t table_dupl = 1;
1153             for (int32_t state=0; state<numStates; state++) {
1154                 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1155                 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1156                 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1157                 if (table_base != table_dupl) {
1158                     break;
1159                 }
1160             }
1161             if (table_base == table_dupl) {
1162                 return true;
1163             }
1164         }
1165     }
1166     return false;
1167 }
1168 
1169 
1170 //
1171 //    removeColumn()
1172 //
removeColumn(int32_t column)1173 void RBBITableBuilder::removeColumn(int32_t column) {
1174     int32_t numStates = fDStates->size();
1175     for (int32_t state=0; state<numStates; state++) {
1176         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1177         U_ASSERT(column < sd->fDtran->size());
1178         sd->fDtran->removeElementAt(column);
1179     }
1180 }
1181 
1182 /*
1183  * findDuplicateState
1184  */
findDuplicateState(IntPair * states)1185 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1186     int32_t numStates = fDStates->size();
1187     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1188 
1189     for (; states->first<numStates-1; states->first++) {
1190         RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1191         for (states->second=states->first+1; states->second<numStates; states->second++) {
1192             RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1193             if (firstSD->fAccepting != duplSD->fAccepting ||
1194                 firstSD->fLookAhead != duplSD->fLookAhead ||
1195                 firstSD->fTagsIdx   != duplSD->fTagsIdx) {
1196                 continue;
1197             }
1198             bool rowsMatch = true;
1199             for (int32_t col=0; col < numCols; ++col) {
1200                 int32_t firstVal = firstSD->fDtran->elementAti(col);
1201                 int32_t duplVal = duplSD->fDtran->elementAti(col);
1202                 if (!((firstVal == duplVal) ||
1203                         ((firstVal == states->first || firstVal == states->second) &&
1204                         (duplVal  == states->first || duplVal  == states->second)))) {
1205                     rowsMatch = false;
1206                     break;
1207                 }
1208             }
1209             if (rowsMatch) {
1210                 return true;
1211             }
1212         }
1213     }
1214     return false;
1215 }
1216 
1217 
findDuplicateSafeState(IntPair * states)1218 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1219     int32_t numStates = fSafeTable->size();
1220 
1221     for (; states->first<numStates-1; states->first++) {
1222         UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1223         for (states->second=states->first+1; states->second<numStates; states->second++) {
1224             UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1225             bool rowsMatch = true;
1226             int32_t numCols = firstRow->length();
1227             for (int32_t col=0; col < numCols; ++col) {
1228                 int32_t firstVal = firstRow->charAt(col);
1229                 int32_t duplVal = duplRow->charAt(col);
1230                 if (!((firstVal == duplVal) ||
1231                         ((firstVal == states->first || firstVal == states->second) &&
1232                         (duplVal  == states->first || duplVal  == states->second)))) {
1233                     rowsMatch = false;
1234                     break;
1235                 }
1236             }
1237             if (rowsMatch) {
1238                 return true;
1239             }
1240         }
1241     }
1242     return false;
1243 }
1244 
1245 
removeState(IntPair duplStates)1246 void RBBITableBuilder::removeState(IntPair duplStates) {
1247     const int32_t keepState = duplStates.first;
1248     const int32_t duplState = duplStates.second;
1249     U_ASSERT(keepState < duplState);
1250     U_ASSERT(duplState < fDStates->size());
1251 
1252     RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1253     fDStates->removeElementAt(duplState);
1254     delete duplSD;
1255 
1256     int32_t numStates = fDStates->size();
1257     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1258     for (int32_t state=0; state<numStates; ++state) {
1259         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1260         for (int32_t col=0; col<numCols; col++) {
1261             int32_t existingVal = sd->fDtran->elementAti(col);
1262             int32_t newVal = existingVal;
1263             if (existingVal == duplState) {
1264                 newVal = keepState;
1265             } else if (existingVal > duplState) {
1266                 newVal = existingVal - 1;
1267             }
1268             sd->fDtran->setElementAt(newVal, col);
1269         }
1270     }
1271 }
1272 
removeSafeState(IntPair duplStates)1273 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1274     const int32_t keepState = duplStates.first;
1275     const int32_t duplState = duplStates.second;
1276     U_ASSERT(keepState < duplState);
1277     U_ASSERT(duplState < fSafeTable->size());
1278 
1279     fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
1280                                               // and will auto-delete the removed element.
1281     int32_t numStates = fSafeTable->size();
1282     for (int32_t state=0; state<numStates; ++state) {
1283         UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1284         int32_t numCols = sd->length();
1285         for (int32_t col=0; col<numCols; col++) {
1286             int32_t existingVal = sd->charAt(col);
1287             int32_t newVal = existingVal;
1288             if (existingVal == duplState) {
1289                 newVal = keepState;
1290             } else if (existingVal > duplState) {
1291                 newVal = existingVal - 1;
1292             }
1293             sd->setCharAt(col, static_cast<char16_t>(newVal));
1294         }
1295     }
1296 }
1297 
1298 
1299 /*
1300  * RemoveDuplicateStates
1301  */
removeDuplicateStates()1302 int32_t RBBITableBuilder::removeDuplicateStates() {
1303     IntPair dupls = {3, 0};
1304     int32_t numStatesRemoved = 0;
1305 
1306     while (findDuplicateState(&dupls)) {
1307         // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1308         removeState(dupls);
1309         ++numStatesRemoved;
1310     }
1311     return numStatesRemoved;
1312 }
1313 
1314 
1315 //-----------------------------------------------------------------------------
1316 //
1317 //   getTableSize()    Calculate the size of the runtime form of this
1318 //                     state transition table.
1319 //
1320 //-----------------------------------------------------------------------------
getTableSize() const1321 int32_t  RBBITableBuilder::getTableSize() const {
1322     int32_t    size = 0;
1323     int32_t    numRows;
1324     int32_t    numCols;
1325     int32_t    rowSize;
1326 
1327     if (fTree == nullptr) {
1328         return 0;
1329     }
1330 
1331     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1332 
1333     numRows = fDStates->size();
1334     numCols = fRB->fSetBuilder->getNumCharCategories();
1335 
1336     if (use8BitsForTable()) {
1337         rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1338     } else {
1339         rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1340     }
1341     size   += numRows * rowSize;
1342     return size;
1343 }
1344 
use8BitsForTable() const1345 bool RBBITableBuilder::use8BitsForTable() const {
1346     return fDStates->size() <= kMaxStateFor8BitsTable;
1347 }
1348 
1349 //-----------------------------------------------------------------------------
1350 //
1351 //   exportTable()    export the state transition table in the format required
1352 //                    by the runtime engine.  getTableSize() bytes of memory
1353 //                    must be available at the output address "where".
1354 //
1355 //-----------------------------------------------------------------------------
exportTable(void * where)1356 void RBBITableBuilder::exportTable(void *where) {
1357     RBBIStateTable    *table = (RBBIStateTable *)where;
1358     uint32_t           state;
1359     int                col;
1360 
1361     if (U_FAILURE(*fStatus) || fTree == nullptr) {
1362         return;
1363     }
1364 
1365     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1366     if (catCount > 0x7fff ||
1367         fDStates->size() > 0x7fff) {
1368         *fStatus = U_BRK_INTERNAL_ERROR;
1369         return;
1370     }
1371 
1372     table->fNumStates = fDStates->size();
1373     table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
1374     table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
1375     table->fFlags     = 0;
1376     if (use8BitsForTable()) {
1377         table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1378         table->fFlags  |= RBBI_8BITS_ROWS;
1379     } else {
1380         table->fRowLen    = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1381     }
1382     if (fRB->fLookAheadHardBreak) {
1383         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1384     }
1385     if (fRB->fSetBuilder->sawBOF()) {
1386         table->fFlags  |= RBBI_BOF_REQUIRED;
1387     }
1388 
1389     for (state=0; state<table->fNumStates; state++) {
1390         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1391         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1392         if (use8BitsForTable()) {
1393             U_ASSERT (sd->fAccepting <= 255);
1394             U_ASSERT (sd->fLookAhead <= 255);
1395             U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255);
1396             RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1397             r8->fAccepting = sd->fAccepting;
1398             r8->fLookAhead = sd->fLookAhead;
1399             r8->fTagsIdx   = sd->fTagsIdx;
1400             for (col=0; col<catCount; col++) {
1401                 U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
1402                 r8->fNextState[col] = sd->fDtran->elementAti(col);
1403             }
1404         } else {
1405             U_ASSERT (sd->fAccepting <= 0xffff);
1406             U_ASSERT (sd->fLookAhead <= 0xffff);
1407             U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff);
1408             row->r16.fAccepting = sd->fAccepting;
1409             row->r16.fLookAhead = sd->fLookAhead;
1410             row->r16.fTagsIdx   = sd->fTagsIdx;
1411             for (col=0; col<catCount; col++) {
1412                 row->r16.fNextState[col] = sd->fDtran->elementAti(col);
1413             }
1414         }
1415     }
1416 }
1417 
1418 
1419 /**
1420  *   Synthesize a safe state table from the main state table.
1421  */
buildSafeReverseTable(UErrorCode & status)1422 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1423     // The safe table creation has three steps:
1424 
1425     // 1. Identify pairs of character classes that are "safe." Safe means that boundaries
1426     // following the pair do not depend on context or state before the pair. To test
1427     // whether a pair is safe, run it through the main forward state table, starting
1428     // from each state. If the the final state is the same, no matter what the starting state,
1429     // the pair is safe.
1430     //
1431     // 2. Build a state table that recognizes the safe pairs. It's similar to their
1432     // forward table, with a column for each input character [class], and a row for
1433     // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1434     // create an additional state for each input character category; being in
1435     // one of these states means that the character has been seen, and is potentially
1436     // the first of a pair. In each of these rows, the entry for the second character
1437     // of a safe pair is set to the stop state (0), indicating that a match was found.
1438     // All other table entries are set to the state corresponding the current input
1439     // character, allowing that character to be the of a start following pair.
1440     //
1441     // Because the safe rules are to be run in reverse, moving backwards in the text,
1442     // the first and second pair categories are swapped when building the table.
1443     //
1444     // 3. Compress the table. There are typically many rows (states) that are
1445     // equivalent - that have zeroes (match completed) in the same columns -
1446     // and can be folded together.
1447 
1448     // Each safe pair is stored as two UChars in the safePair string.
1449     UnicodeString safePairs;
1450 
1451     int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1452     int32_t numStates = fDStates->size();
1453 
1454     for (int32_t c1=0; c1<numCharClasses; ++c1) {
1455         for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1456             int32_t wantedEndState = -1;
1457             int32_t endState = 0;
1458             for (int32_t startState = 1; startState < numStates; ++startState) {
1459                 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1460                 int32_t s2 = startStateD->fDtran->elementAti(c1);
1461                 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1462                 endState = s2StateD->fDtran->elementAti(c2);
1463                 if (wantedEndState < 0) {
1464                     wantedEndState = endState;
1465                 } else {
1466                     if (wantedEndState != endState) {
1467                         break;
1468                     }
1469                 }
1470             }
1471             if (wantedEndState == endState) {
1472                 safePairs.append((char16_t)c1);
1473                 safePairs.append((char16_t)c2);
1474                 // printf("(%d, %d) ", c1, c2);
1475             }
1476         }
1477         // printf("\n");
1478     }
1479 
1480     // Populate the initial safe table.
1481     // The table as a whole is UVector<UnicodeString>
1482     // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1483     // Row 0 is the stop state.
1484     // Row 1 is the start state.
1485     // Row 2 and beyond are other states, initially one per char class, but
1486     //   after initial construction, many of the states will be combined, compacting the table.
1487     // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1488     // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1489 
1490     U_ASSERT(fSafeTable == nullptr);
1491     LocalPointer<UVector> lpSafeTable(
1492         new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status), status);
1493     if (U_FAILURE(status)) {
1494         return;
1495     }
1496     fSafeTable = lpSafeTable.orphan();
1497     for (int32_t row=0; row<numCharClasses + 2; ++row) {
1498         LocalPointer<UnicodeString> lpString(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1499         fSafeTable->adoptElement(lpString.orphan(), status);
1500     }
1501     if (U_FAILURE(status)) {
1502         return;
1503     }
1504 
1505     // From the start state, each input char class transitions to the state for that input.
1506     UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1507     for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1508         // Note: +2 for the start & stop state.
1509         startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1510     }
1511 
1512     // Initially make every other state table row look like the start state row,
1513     for (int32_t row=2; row<numCharClasses+2; ++row) {
1514         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1515         rowState = startState;   // UnicodeString assignment, copies contents.
1516     }
1517 
1518     // Run through the safe pairs, set the next state to zero when pair has been seen.
1519     // Zero being the stop state, meaning we found a safe point.
1520     for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1521         int32_t c1 = safePairs.charAt(pairIdx);
1522         int32_t c2 = safePairs.charAt(pairIdx + 1);
1523 
1524         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1525         rowState.setCharAt(c1, 0);
1526     }
1527 
1528     // Remove duplicate or redundant rows from the table.
1529     IntPair states = {1, 0};
1530     while (findDuplicateSafeState(&states)) {
1531         // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1532         removeSafeState(states);
1533     }
1534 }
1535 
1536 
1537 //-----------------------------------------------------------------------------
1538 //
1539 //   getSafeTableSize()    Calculate the size of the runtime form of this
1540 //                         safe state table.
1541 //
1542 //-----------------------------------------------------------------------------
getSafeTableSize() const1543 int32_t  RBBITableBuilder::getSafeTableSize() const {
1544     int32_t    size = 0;
1545     int32_t    numRows;
1546     int32_t    numCols;
1547     int32_t    rowSize;
1548 
1549     if (fSafeTable == nullptr) {
1550         return 0;
1551     }
1552 
1553     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1554 
1555     numRows = fSafeTable->size();
1556     numCols = fRB->fSetBuilder->getNumCharCategories();
1557 
1558     if (use8BitsForSafeTable()) {
1559         rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1560     } else {
1561         rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1562     }
1563     size   += numRows * rowSize;
1564     return size;
1565 }
1566 
use8BitsForSafeTable() const1567 bool RBBITableBuilder::use8BitsForSafeTable() const {
1568     return fSafeTable->size() <= kMaxStateFor8BitsTable;
1569 }
1570 
1571 //-----------------------------------------------------------------------------
1572 //
1573 //   exportSafeTable()   export the state transition table in the format required
1574 //                       by the runtime engine.  getTableSize() bytes of memory
1575 //                       must be available at the output address "where".
1576 //
1577 //-----------------------------------------------------------------------------
exportSafeTable(void * where)1578 void RBBITableBuilder::exportSafeTable(void *where) {
1579     RBBIStateTable    *table = (RBBIStateTable *)where;
1580     uint32_t           state;
1581     int                col;
1582 
1583     if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1584         return;
1585     }
1586 
1587     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1588     if (catCount > 0x7fff ||
1589             fSafeTable->size() > 0x7fff) {
1590         *fStatus = U_BRK_INTERNAL_ERROR;
1591         return;
1592     }
1593 
1594     table->fNumStates = fSafeTable->size();
1595     table->fFlags     = 0;
1596     if (use8BitsForSafeTable()) {
1597         table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1598         table->fFlags  |= RBBI_8BITS_ROWS;
1599     } else {
1600         table->fRowLen    = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1601     }
1602 
1603     for (state=0; state<table->fNumStates; state++) {
1604         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1605         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1606         if (use8BitsForSafeTable()) {
1607             RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1608             r8->fAccepting = 0;
1609             r8->fLookAhead = 0;
1610             r8->fTagsIdx    = 0;
1611             for (col=0; col<catCount; col++) {
1612                 U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
1613                 r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col));
1614             }
1615         } else {
1616             row->r16.fAccepting = 0;
1617             row->r16.fLookAhead = 0;
1618             row->r16.fTagsIdx    = 0;
1619             for (col=0; col<catCount; col++) {
1620                 row->r16.fNextState[col] = rowString->charAt(col);
1621             }
1622         }
1623     }
1624 }
1625 
1626 
1627 
1628 
1629 //-----------------------------------------------------------------------------
1630 //
1631 //   printSet    Debug function.   Print the contents of a UVector
1632 //
1633 //-----------------------------------------------------------------------------
1634 #ifdef RBBI_DEBUG
printSet(UVector * s)1635 void RBBITableBuilder::printSet(UVector *s) {
1636     int32_t  i;
1637     for (i=0; i<s->size(); i++) {
1638         const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1639         RBBIDebugPrintf("%5d", v==nullptr? -1 : v->fSerialNum);
1640     }
1641     RBBIDebugPrintf("\n");
1642 }
1643 #endif
1644 
1645 
1646 //-----------------------------------------------------------------------------
1647 //
1648 //   printStates    Debug Function.  Dump the fully constructed state transition table.
1649 //
1650 //-----------------------------------------------------------------------------
1651 #ifdef RBBI_DEBUG
printStates()1652 void RBBITableBuilder::printStates() {
1653     int     c;    // input "character"
1654     int     n;    // state number
1655 
1656     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1657     RBBIDebugPrintf("      | Acc  LA    Tag");
1658     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1659         RBBIDebugPrintf(" %3d", c);
1660     }
1661     RBBIDebugPrintf("\n");
1662     RBBIDebugPrintf("      |---------------");
1663     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1664         RBBIDebugPrintf("----");
1665     }
1666     RBBIDebugPrintf("\n");
1667 
1668     for (n=0; n<fDStates->size(); n++) {
1669         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1670         RBBIDebugPrintf("  %3d | " , n);
1671         RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1672         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1673             RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
1674         }
1675         RBBIDebugPrintf("\n");
1676     }
1677     RBBIDebugPrintf("\n\n");
1678 }
1679 #endif
1680 
1681 
1682 //-----------------------------------------------------------------------------
1683 //
1684 //   printSafeTable    Debug Function.  Dump the fully constructed safe table.
1685 //
1686 //-----------------------------------------------------------------------------
1687 #ifdef RBBI_DEBUG
printReverseTable()1688 void RBBITableBuilder::printReverseTable() {
1689     int     c;    // input "character"
1690     int     n;    // state number
1691 
1692     RBBIDebugPrintf("    Safe Reverse Table \n");
1693     if (fSafeTable == nullptr) {
1694         RBBIDebugPrintf("   --- nullptr ---\n");
1695         return;
1696     }
1697     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1698     RBBIDebugPrintf("      | Acc  LA    Tag");
1699     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1700         RBBIDebugPrintf(" %2d", c);
1701     }
1702     RBBIDebugPrintf("\n");
1703     RBBIDebugPrintf("      |---------------");
1704     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1705         RBBIDebugPrintf("---");
1706     }
1707     RBBIDebugPrintf("\n");
1708 
1709     for (n=0; n<fSafeTable->size(); n++) {
1710         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1711         RBBIDebugPrintf("  %3d | " , n);
1712         RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
1713         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1714             RBBIDebugPrintf(" %2d", rowString->charAt(c));
1715         }
1716         RBBIDebugPrintf("\n");
1717     }
1718     RBBIDebugPrintf("\n\n");
1719 }
1720 #endif
1721 
1722 
1723 
1724 //-----------------------------------------------------------------------------
1725 //
1726 //   printRuleStatusTable    Debug Function.  Dump the common rule status table
1727 //
1728 //-----------------------------------------------------------------------------
1729 #ifdef RBBI_DEBUG
printRuleStatusTable()1730 void RBBITableBuilder::printRuleStatusTable() {
1731     int32_t  thisRecord = 0;
1732     int32_t  nextRecord = 0;
1733     int      i;
1734     UVector  *tbl = fRB->fRuleStatusVals;
1735 
1736     RBBIDebugPrintf("index |  tags \n");
1737     RBBIDebugPrintf("-------------------\n");
1738 
1739     while (nextRecord < tbl->size()) {
1740         thisRecord = nextRecord;
1741         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1742         RBBIDebugPrintf("%4d   ", thisRecord);
1743         for (i=thisRecord+1; i<nextRecord; i++) {
1744             RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1745         }
1746         RBBIDebugPrintf("\n");
1747     }
1748     RBBIDebugPrintf("\n\n");
1749 }
1750 #endif
1751 
1752 
1753 //-----------------------------------------------------------------------------
1754 //
1755 //   RBBIStateDescriptor     Methods.  This is a very struct-like class
1756 //                           Most access is directly to the fields.
1757 //
1758 //-----------------------------------------------------------------------------
1759 
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1760 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1761     fMarked    = false;
1762     fAccepting = 0;
1763     fLookAhead = 0;
1764     fTagsIdx   = 0;
1765     fTagVals   = nullptr;
1766     fPositions = nullptr;
1767     fDtran     = nullptr;
1768 
1769     fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
1770     if (U_FAILURE(*fStatus)) {
1771         return;
1772     }
1773     if (fDtran == nullptr) {
1774         *fStatus = U_MEMORY_ALLOCATION_ERROR;
1775         return;
1776     }
1777     fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
1778                                            //   It is indexed by input symbols, and will
1779                                            //   hold  the next state number for each
1780                                            //   symbol.
1781 }
1782 
1783 
~RBBIStateDescriptor()1784 RBBIStateDescriptor::~RBBIStateDescriptor() {
1785     delete       fPositions;
1786     delete       fDtran;
1787     delete       fTagVals;
1788     fPositions = nullptr;
1789     fDtran     = nullptr;
1790     fTagVals   = nullptr;
1791 }
1792 
1793 U_NAMESPACE_END
1794 
1795 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1796