xref: /aosp_15_r20/external/cronet/third_party/icu/source/common/rbbi_cache.cpp (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // file: rbbi_cache.cpp
5 
6 #include "unicode/utypes.h"
7 
8 #if !UCONFIG_NO_BREAK_ITERATION
9 
10 #include "unicode/ubrk.h"
11 #include "unicode/rbbi.h"
12 
13 #include "rbbi_cache.h"
14 
15 #include "brkeng.h"
16 #include "cmemory.h"
17 #include "rbbidata.h"
18 #include "rbbirb.h"
19 #include "uassert.h"
20 #include "uvectr32.h"
21 
22 U_NAMESPACE_BEGIN
23 
24 /*
25  * DictionaryCache implementation
26  */
27 
DictionaryCache(RuleBasedBreakIterator * bi,UErrorCode & status)28 RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
29         fBI(bi), fBreaks(status), fPositionInCache(-1),
30         fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
31 }
32 
~DictionaryCache()33 RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
34 }
35 
reset()36 void RuleBasedBreakIterator::DictionaryCache::reset() {
37     fPositionInCache = -1;
38     fStart = 0;
39     fLimit = 0;
40     fFirstRuleStatusIndex = 0;
41     fOtherRuleStatusIndex = 0;
42     fBreaks.removeAllElements();
43 }
44 
following(int32_t fromPos,int32_t * result,int32_t * statusIndex)45 UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
46     if (fromPos >= fLimit || fromPos < fStart) {
47         fPositionInCache = -1;
48         return false;
49     }
50 
51     // Sequential iteration, move from previous boundary to the following
52 
53     int32_t r = 0;
54     if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
55         ++fPositionInCache;
56         if (fPositionInCache >= fBreaks.size()) {
57             fPositionInCache = -1;
58             return false;
59         }
60         r = fBreaks.elementAti(fPositionInCache);
61         U_ASSERT(r > fromPos);
62         *result = r;
63         *statusIndex = fOtherRuleStatusIndex;
64         return true;
65     }
66 
67     // Random indexing. Linear search for the boundary following the given position.
68 
69     for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
70         r= fBreaks.elementAti(fPositionInCache);
71         if (r > fromPos) {
72             *result = r;
73             *statusIndex = fOtherRuleStatusIndex;
74             return true;
75         }
76     }
77     UPRV_UNREACHABLE_EXIT;
78 }
79 
80 
preceding(int32_t fromPos,int32_t * result,int32_t * statusIndex)81 UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
82     if (fromPos <= fStart || fromPos > fLimit) {
83         fPositionInCache = -1;
84         return false;
85     }
86 
87     if (fromPos == fLimit) {
88         fPositionInCache = fBreaks.size() - 1;
89         if (fPositionInCache >= 0) {
90             U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
91         }
92     }
93 
94     int32_t r;
95     if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
96         --fPositionInCache;
97         r = fBreaks.elementAti(fPositionInCache);
98         U_ASSERT(r < fromPos);
99         *result = r;
100         *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
101         return true;
102     }
103 
104     if (fPositionInCache == 0) {
105         fPositionInCache = -1;
106         return false;
107     }
108 
109     for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
110         r = fBreaks.elementAti(fPositionInCache);
111         if (r < fromPos) {
112             *result = r;
113             *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
114             return true;
115         }
116     }
117     UPRV_UNREACHABLE_EXIT;
118 }
119 
populateDictionary(int32_t startPos,int32_t endPos,int32_t firstRuleStatus,int32_t otherRuleStatus)120 void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
121                                        int32_t firstRuleStatus, int32_t otherRuleStatus) {
122     if ((endPos - startPos) <= 1) {
123         return;
124     }
125 
126     reset();
127     fFirstRuleStatusIndex = firstRuleStatus;
128     fOtherRuleStatusIndex = otherRuleStatus;
129 
130     int32_t rangeStart = startPos;
131     int32_t rangeEnd = endPos;
132 
133     uint16_t    category;
134     int32_t     current;
135     UErrorCode  status = U_ZERO_ERROR;
136     int32_t     foundBreakCount = 0;
137     UText      *text = &fBI->fText;
138 
139     // Loop through the text, looking for ranges of dictionary characters.
140     // For each span, find the appropriate break engine, and ask it to find
141     // any breaks within the span.
142 
143     utext_setNativeIndex(text, rangeStart);
144     UChar32     c = utext_current32(text);
145     category = ucptrie_get(fBI->fData->fTrie, c);
146     uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;
147 
148     while(U_SUCCESS(status)) {
149         while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
150                 && (category < dictStart)) {
151             utext_next32(text);           // TODO: cleaner loop structure.
152             c = utext_current32(text);
153             category = ucptrie_get(fBI->fData->fTrie, c);
154         }
155         if (current >= rangeEnd) {
156             break;
157         }
158 
159         // We now have a dictionary character. Get the appropriate language object
160         // to deal with it.
161         const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
162             c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
163 
164         // Ask the language object if there are any breaks. It will add them to the cache and
165         // leave the text pointer on the other side of its range, ready to search for the next one.
166         if (lbe != nullptr) {
167             foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
168         }
169 
170         // Reload the loop variables for the next go-round
171         c = utext_current32(text);
172         category = ucptrie_get(fBI->fData->fTrie, c);
173     }
174 
175     // If we found breaks, ensure that the first and last entries are
176     // the original starting and ending position. And initialize the
177     // cache iteration position to the first entry.
178 
179     // printf("foundBreakCount = %d\n", foundBreakCount);
180     if (foundBreakCount > 0) {
181         U_ASSERT(foundBreakCount == fBreaks.size());
182         if (startPos < fBreaks.elementAti(0)) {
183             // The dictionary did not place a boundary at the start of the segment of text.
184             // Add one now. This should not commonly happen, but it would be easy for interactions
185             // of the rules for dictionary segments and the break engine implementations to
186             // inadvertently cause it. Cover it here, just in case.
187             fBreaks.insertElementAt(startPos, 0, status);
188         }
189         if (endPos > fBreaks.peeki()) {
190             fBreaks.push(endPos, status);
191         }
192         fPositionInCache = 0;
193         // Note: Dictionary matching may extend beyond the original limit.
194         fStart = fBreaks.elementAti(0);
195         fLimit = fBreaks.peeki();
196     } else {
197         // there were no language-based breaks, even though the segment contained
198         // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
199         // for this range will fail, and the calling code will fall back to the rule based boundaries.
200     }
201 }
202 
203 
204 /*
205  *   BreakCache implementation
206  */
207 
BreakCache(RuleBasedBreakIterator * bi,UErrorCode & status)208 RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
209         fBI(bi), fSideBuffer(status) {
210     reset();
211 }
212 
213 
~BreakCache()214 RuleBasedBreakIterator::BreakCache::~BreakCache() {
215 }
216 
217 
reset(int32_t pos,int32_t ruleStatus)218 void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
219     fStartBufIdx = 0;
220     fEndBufIdx = 0;
221     fTextIdx = pos;
222     fBufIdx = 0;
223     fBoundaries[0] = pos;
224     fStatuses[0] = (uint16_t)ruleStatus;
225 }
226 
227 
current()228 int32_t  RuleBasedBreakIterator::BreakCache::current() {
229     fBI->fPosition = fTextIdx;
230     fBI->fRuleStatusIndex = fStatuses[fBufIdx];
231     fBI->fDone = false;
232     return fTextIdx;
233 }
234 
235 
following(int32_t startPos,UErrorCode & status)236 void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
237     if (U_FAILURE(status)) {
238         return;
239     }
240     if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
241         // startPos is in the cache. Do a next() from that position.
242         // TODO: an awkward set of interactions with bi->fDone
243         //       seek() does not clear it; it can't because of interactions with populateNear().
244         //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
245         //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
246         fBI->fDone = false;
247         next();
248     }
249     return;
250 }
251 
252 
preceding(int32_t startPos,UErrorCode & status)253 void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
254     if (U_FAILURE(status)) {
255         return;
256     }
257     if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
258         if (startPos == fTextIdx) {
259             previous(status);
260         } else {
261             // seek() leaves the BreakCache positioned at the preceding boundary
262             //        if the requested position is between two boundaries.
263             // current() pushes the BreakCache position out to the BreakIterator itself.
264             U_ASSERT(startPos > fTextIdx);
265             current();
266         }
267     }
268     return;
269 }
270 
271 
272 /*
273  * Out-of-line code for BreakCache::next().
274  * Cache does not already contain the boundary
275  */
nextOL()276 void RuleBasedBreakIterator::BreakCache::nextOL() {
277     fBI->fDone = !populateFollowing();
278     fBI->fPosition = fTextIdx;
279     fBI->fRuleStatusIndex = fStatuses[fBufIdx];
280     return;
281 }
282 
283 
previous(UErrorCode & status)284 void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
285     if (U_FAILURE(status)) {
286         return;
287     }
288     int32_t initialBufIdx = fBufIdx;
289     if (fBufIdx == fStartBufIdx) {
290         // At start of cache. Prepend to it.
291         populatePreceding(status);
292     } else {
293         // Cache already holds the next boundary
294         fBufIdx = modChunkSize(fBufIdx - 1);
295         fTextIdx = fBoundaries[fBufIdx];
296     }
297     fBI->fDone = (fBufIdx == initialBufIdx);
298     fBI->fPosition = fTextIdx;
299     fBI->fRuleStatusIndex = fStatuses[fBufIdx];
300     return;
301 }
302 
303 
seek(int32_t pos)304 UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
305     if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
306         return false;
307     }
308     if (pos == fBoundaries[fStartBufIdx]) {
309         // Common case: seek(0), from BreakIterator::first()
310         fBufIdx = fStartBufIdx;
311         fTextIdx = fBoundaries[fBufIdx];
312         return true;
313     }
314     if (pos == fBoundaries[fEndBufIdx]) {
315         fBufIdx = fEndBufIdx;
316         fTextIdx = fBoundaries[fBufIdx];
317         return true;
318     }
319 
320     int32_t min = fStartBufIdx;
321     int32_t max = fEndBufIdx;
322     while (min != max) {
323         int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
324         probe = modChunkSize(probe);
325         if (fBoundaries[probe] > pos) {
326             max = probe;
327         } else {
328             min = modChunkSize(probe + 1);
329         }
330     }
331     U_ASSERT(fBoundaries[max] > pos);
332     fBufIdx = modChunkSize(max - 1);
333     fTextIdx = fBoundaries[fBufIdx];
334     U_ASSERT(fTextIdx <= pos);
335     return true;
336 }
337 
338 
populateNear(int32_t position,UErrorCode & status)339 UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
340     if (U_FAILURE(status)) {
341         return false;
342     }
343     U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
344 
345     // Add boundaries to the cache near the specified position.
346     // The given position need not be a boundary itself.
347     // The input position must be within the range of the text, and
348     // on a code point boundary.
349     // If the requested position is a break boundary, leave the iteration
350     // position on it.
351     // If the requested position is not a boundary, leave the iteration
352     // position on the preceding boundary and include both the
353     // preceding and following boundaries in the cache.
354     // Additional boundaries, either preceding or following, may be added
355     // to the cache as a side effect.
356 
357     // If the requested position is not near already cached positions, clear the existing cache,
358     // find a near-by boundary and begin new cache contents there.
359 
360     // Threshold for a text position to be considered near to existing cache contents.
361     // TODO: See issue ICU-22024 "perf tuning of Cache needed."
362     //       This value is subject to change. See the ticket for more details.
363     static constexpr int32_t CACHE_NEAR = 15;
364 
365     int32_t aBoundary = -1;
366     int32_t ruleStatusIndex = 0;
367     bool retainCache = false;
368     if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
369         // Requested position is near the existing cache. Retain it.
370         retainCache = true;
371     } else if (position <= CACHE_NEAR) {
372         // Requested position is near the start of the text. Fill cache from start, skipping
373         // the need to find a safe point.
374         retainCache = false;
375         aBoundary = 0;
376     } else {
377         // Requested position is not near the existing cache.
378         // Find a safe point to refill the cache from.
379         int32_t backupPos = fBI->handleSafePrevious(position);
380 
381         if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
382             // The requested position is beyond the end of the existing cache, but the
383             // reverse rules produced a position near or before the cached region.
384             // Retain the existing cache, and fill from the end of it.
385             retainCache = true;
386         } else if (backupPos < CACHE_NEAR) {
387             // The safe reverse rules moved us to near the start of text.
388             // Take that (index 0) as the backup boundary, avoiding the complication
389             // (in the following block) of moving forward from the safe point to a known boundary.
390             //
391             // Retain the cache if it begins not too far from the requested position.
392             aBoundary = 0;
393             retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
394         } else {
395             // The safe reverse rules produced a position that is neither near the existing
396             // cache, nor near the start of text.
397             // Advance to the boundary following.
398             // There is a complication: the safe reverse rules identify pairs of code points
399             // that are safe. If advancing from the safe point moves forwards by less than
400             // two code points, we need to advance one more time to ensure that the boundary
401             // is good, including a correct rules status value.
402             retainCache = false;
403             fBI->fPosition = backupPos;
404             aBoundary = fBI->handleNext();
405             if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
406                 // +4 is a quick test for possibly having advanced only one codepoint.
407                 // Four being the length of the longest potential code point, a supplementary in UTF-8
408                 utext_setNativeIndex(&fBI->fText, aBoundary);
409                 if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
410                     // The initial handleNext() only advanced by a single code point. Go again.
411                     aBoundary = fBI->handleNext();   // Safe rules identify safe pairs.
412                 }
413             }
414             if (aBoundary == UBRK_DONE) {
415                 // Note (Andy Heninger): I don't think this condition can occur, but it's hard
416                 // to prove that it can't. We ran off the end of the string looking a boundary
417                 // following a safe point; choose the end of the string as that boundary.
418                 aBoundary = utext_nativeLength(&fBI->fText);
419             }
420             ruleStatusIndex = fBI->fRuleStatusIndex;
421         }
422     }
423 
424     if (!retainCache) {
425         U_ASSERT(aBoundary != -1);
426         reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
427     }
428 
429     // Fill in boundaries between existing cache content and the new requested position.
430 
431     if (fBoundaries[fEndBufIdx] < position) {
432         // The last position in the cache precedes the requested position.
433         // Add following position(s) to the cache.
434         while (fBoundaries[fEndBufIdx] < position) {
435             if (!populateFollowing()) {
436                 UPRV_UNREACHABLE_EXIT;
437             }
438         }
439         fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
440         fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
441         while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
442             previous(status);
443         }
444         return true;
445     }
446 
447     if (fBoundaries[fStartBufIdx] > position) {
448         // The first position in the cache is beyond the requested position.
449         // back up more until we get a boundary <= the requested position.
450         while (fBoundaries[fStartBufIdx] > position) {
451             populatePreceding(status);
452         }
453         fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
454         fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
455         while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
456             next();
457         }
458         if (fTextIdx > position) {
459             // If position is not itself a boundary, the next() loop above will overshoot.
460             // Back up one, leaving cache position at the boundary preceding the requested position.
461             previous(status);
462         }
463         return true;
464     }
465 
466     U_ASSERT(fTextIdx == position);
467     return true;
468 }
469 
470 
471 
populateFollowing()472 UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
473     int32_t fromPosition = fBoundaries[fEndBufIdx];
474     int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx];
475     int32_t pos = 0;
476     int32_t ruleStatusIdx = 0;
477 
478     if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
479         addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
480         return true;
481     }
482 
483     fBI->fPosition = fromPosition;
484     pos = fBI->handleNext();
485     if (pos == UBRK_DONE) {
486         return false;
487     }
488 
489     ruleStatusIdx = fBI->fRuleStatusIndex;
490     if (fBI->fDictionaryCharCount > 0) {
491         // The text segment obtained from the rules includes dictionary characters.
492         // Subdivide it, with subdivided results going into the dictionary cache.
493         fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
494         if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
495             addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
496             return true;
497             // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
498             //       But be careful with interactions with populateNear().
499         }
500     }
501 
502     // Rule based segment did not include dictionary characters.
503     // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
504     //    meaning that we didn't take the return, above.
505     // Add its end point to the cache.
506     addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
507 
508     // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
509     //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
510     //
511     for (int count=0; count<6; ++count) {
512         pos = fBI->handleNext();
513         if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
514             break;
515         }
516         addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition);
517     }
518 
519     return true;
520 }
521 
522 
populatePreceding(UErrorCode & status)523 UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
524     if (U_FAILURE(status)) {
525         return false;
526     }
527 
528     int32_t fromPosition = fBoundaries[fStartBufIdx];
529     if (fromPosition == 0) {
530         return false;
531     }
532 
533     int32_t position = 0;
534     int32_t positionStatusIdx = 0;
535 
536     if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
537         addPreceding(position, positionStatusIdx, UpdateCachePosition);
538         return true;
539     }
540 
541     int32_t backupPosition = fromPosition;
542 
543     // Find a boundary somewhere preceding the first already-cached boundary
544     do {
545         backupPosition = backupPosition - 30;
546         if (backupPosition <= 0) {
547             backupPosition = 0;
548         } else {
549             backupPosition = fBI->handleSafePrevious(backupPosition);
550         }
551         if (backupPosition == UBRK_DONE || backupPosition == 0) {
552             position = 0;
553             positionStatusIdx = 0;
554         } else {
555             // Advance to the boundary following the backup position.
556             // There is a complication: the safe reverse rules identify pairs of code points
557             // that are safe. If advancing from the safe point moves forwards by less than
558             // two code points, we need to advance one more time to ensure that the boundary
559             // is good, including a correct rules status value.
560             //
561             fBI->fPosition = backupPosition;
562             position = fBI->handleNext();
563             if (position <= backupPosition + 4) {
564                 // +4 is a quick test for possibly having advanced only one codepoint.
565                 // Four being the length of the longest potential code point, a supplementary in UTF-8
566                 utext_setNativeIndex(&fBI->fText, position);
567                 if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
568                     // The initial handleNext() only advanced by a single code point. Go again.
569                     position = fBI->handleNext();   // Safe rules identify safe pairs.
570                 }
571             }
572             positionStatusIdx = fBI->fRuleStatusIndex;
573         }
574     } while (position >= fromPosition);
575 
576     // Find boundaries between the one we just located and the first already-cached boundary
577     // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
578 
579     fSideBuffer.removeAllElements();
580     fSideBuffer.addElement(position, status);
581     fSideBuffer.addElement(positionStatusIdx, status);
582 
583     do {
584         int32_t prevPosition = fBI->fPosition = position;
585         int32_t prevStatusIdx = positionStatusIdx;
586         position = fBI->handleNext();
587         positionStatusIdx = fBI->fRuleStatusIndex;
588         if (position == UBRK_DONE) {
589             break;
590         }
591 
592         UBool segmentHandledByDictionary = false;
593         if (fBI->fDictionaryCharCount != 0) {
594             // Segment from the rules includes dictionary characters.
595             // Subdivide it, with subdivided results going into the dictionary cache.
596             int32_t dictSegEndPosition = position;
597             fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
598             while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
599                 segmentHandledByDictionary = true;
600                 U_ASSERT(position > prevPosition);
601                 if (position >= fromPosition) {
602                     break;
603                 }
604                 U_ASSERT(position <= dictSegEndPosition);
605                 fSideBuffer.addElement(position, status);
606                 fSideBuffer.addElement(positionStatusIdx, status);
607                 prevPosition = position;
608             }
609             U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
610         }
611 
612         if (!segmentHandledByDictionary && position < fromPosition) {
613             fSideBuffer.addElement(position, status);
614             fSideBuffer.addElement(positionStatusIdx, status);
615         }
616     } while (position < fromPosition);
617 
618     // Move boundaries from the side buffer to the main circular buffer.
619     UBool success = false;
620     if (!fSideBuffer.isEmpty()) {
621         positionStatusIdx = fSideBuffer.popi();
622         position = fSideBuffer.popi();
623         addPreceding(position, positionStatusIdx, UpdateCachePosition);
624         success = true;
625     }
626 
627     while (!fSideBuffer.isEmpty()) {
628         positionStatusIdx = fSideBuffer.popi();
629         position = fSideBuffer.popi();
630         if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
631             // No space in circular buffer to hold a new preceding result while
632             // also retaining the current cache (iteration) position.
633             // Bailing out is safe; the cache will refill again if needed.
634             break;
635         }
636     }
637 
638     return success;
639 }
640 
641 
addFollowing(int32_t position,int32_t ruleStatusIdx,UpdatePositionValues update)642 void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
643     U_ASSERT(position > fBoundaries[fEndBufIdx]);
644     U_ASSERT(ruleStatusIdx <= UINT16_MAX);
645     int32_t nextIdx = modChunkSize(fEndBufIdx + 1);
646     if (nextIdx == fStartBufIdx) {
647         fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
648     }
649     fBoundaries[nextIdx] = position;
650     fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
651     fEndBufIdx = nextIdx;
652     if (update == UpdateCachePosition) {
653         // Set current position to the newly added boundary.
654         fBufIdx = nextIdx;
655         fTextIdx = position;
656     } else {
657         // Retaining the original cache position.
658         // Check if the added boundary wraps around the buffer, and would over-write the original position.
659         // It's the responsibility of callers of this function to not add too many.
660         U_ASSERT(nextIdx != fBufIdx);
661     }
662 }
663 
addPreceding(int32_t position,int32_t ruleStatusIdx,UpdatePositionValues update)664 bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
665     U_ASSERT(position < fBoundaries[fStartBufIdx]);
666     U_ASSERT(ruleStatusIdx <= UINT16_MAX);
667     int32_t nextIdx = modChunkSize(fStartBufIdx - 1);
668     if (nextIdx == fEndBufIdx) {
669         if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
670             // Failure. The insertion of the new boundary would claim the buffer position that is the
671             // current iteration position. And we also want to retain the current iteration position.
672             // (The buffer is already completely full of entries that precede the iteration position.)
673             return false;
674         }
675         fEndBufIdx = modChunkSize(fEndBufIdx - 1);
676     }
677     fBoundaries[nextIdx] = position;
678     fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
679     fStartBufIdx = nextIdx;
680     if (update == UpdateCachePosition) {
681         fBufIdx = nextIdx;
682         fTextIdx = position;
683     }
684     return true;
685 }
686 
687 
dumpCache()688 void RuleBasedBreakIterator::BreakCache::dumpCache() {
689 #ifdef RBBI_DEBUG
690     RBBIDebugPrintf("fTextIdx:%d   fBufIdx:%d\n", fTextIdx, fBufIdx);
691     for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) {
692         RBBIDebugPrintf("%d  %d\n", i, fBoundaries[i]);
693         if (i == fEndBufIdx) {
694             break;
695         }
696     }
697 #endif
698 }
699 
700 U_NAMESPACE_END
701 
702 #endif // #if !UCONFIG_NO_BREAK_ITERATION
703