1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // file: rbbi_cache.cpp
5
6 #include "unicode/utypes.h"
7
8 #if !UCONFIG_NO_BREAK_ITERATION
9
10 #include "unicode/ubrk.h"
11 #include "unicode/rbbi.h"
12
13 #include "rbbi_cache.h"
14
15 #include "brkeng.h"
16 #include "cmemory.h"
17 #include "rbbidata.h"
18 #include "rbbirb.h"
19 #include "uassert.h"
20 #include "uvectr32.h"
21
22 U_NAMESPACE_BEGIN
23
24 /*
25 * DictionaryCache implementation
26 */
27
DictionaryCache(RuleBasedBreakIterator * bi,UErrorCode & status)28 RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
29 fBI(bi), fBreaks(status), fPositionInCache(-1),
30 fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
31 }
32
~DictionaryCache()33 RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
34 }
35
reset()36 void RuleBasedBreakIterator::DictionaryCache::reset() {
37 fPositionInCache = -1;
38 fStart = 0;
39 fLimit = 0;
40 fFirstRuleStatusIndex = 0;
41 fOtherRuleStatusIndex = 0;
42 fBreaks.removeAllElements();
43 }
44
following(int32_t fromPos,int32_t * result,int32_t * statusIndex)45 UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
46 if (fromPos >= fLimit || fromPos < fStart) {
47 fPositionInCache = -1;
48 return false;
49 }
50
51 // Sequential iteration, move from previous boundary to the following
52
53 int32_t r = 0;
54 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
55 ++fPositionInCache;
56 if (fPositionInCache >= fBreaks.size()) {
57 fPositionInCache = -1;
58 return false;
59 }
60 r = fBreaks.elementAti(fPositionInCache);
61 U_ASSERT(r > fromPos);
62 *result = r;
63 *statusIndex = fOtherRuleStatusIndex;
64 return true;
65 }
66
67 // Random indexing. Linear search for the boundary following the given position.
68
69 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
70 r= fBreaks.elementAti(fPositionInCache);
71 if (r > fromPos) {
72 *result = r;
73 *statusIndex = fOtherRuleStatusIndex;
74 return true;
75 }
76 }
77 UPRV_UNREACHABLE_EXIT;
78 }
79
80
preceding(int32_t fromPos,int32_t * result,int32_t * statusIndex)81 UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
82 if (fromPos <= fStart || fromPos > fLimit) {
83 fPositionInCache = -1;
84 return false;
85 }
86
87 if (fromPos == fLimit) {
88 fPositionInCache = fBreaks.size() - 1;
89 if (fPositionInCache >= 0) {
90 U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
91 }
92 }
93
94 int32_t r;
95 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
96 --fPositionInCache;
97 r = fBreaks.elementAti(fPositionInCache);
98 U_ASSERT(r < fromPos);
99 *result = r;
100 *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
101 return true;
102 }
103
104 if (fPositionInCache == 0) {
105 fPositionInCache = -1;
106 return false;
107 }
108
109 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
110 r = fBreaks.elementAti(fPositionInCache);
111 if (r < fromPos) {
112 *result = r;
113 *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
114 return true;
115 }
116 }
117 UPRV_UNREACHABLE_EXIT;
118 }
119
populateDictionary(int32_t startPos,int32_t endPos,int32_t firstRuleStatus,int32_t otherRuleStatus)120 void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
121 int32_t firstRuleStatus, int32_t otherRuleStatus) {
122 if ((endPos - startPos) <= 1) {
123 return;
124 }
125
126 reset();
127 fFirstRuleStatusIndex = firstRuleStatus;
128 fOtherRuleStatusIndex = otherRuleStatus;
129
130 int32_t rangeStart = startPos;
131 int32_t rangeEnd = endPos;
132
133 uint16_t category;
134 int32_t current;
135 UErrorCode status = U_ZERO_ERROR;
136 int32_t foundBreakCount = 0;
137 UText *text = &fBI->fText;
138
139 // Loop through the text, looking for ranges of dictionary characters.
140 // For each span, find the appropriate break engine, and ask it to find
141 // any breaks within the span.
142
143 utext_setNativeIndex(text, rangeStart);
144 UChar32 c = utext_current32(text);
145 category = ucptrie_get(fBI->fData->fTrie, c);
146 uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;
147
148 while(U_SUCCESS(status)) {
149 while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
150 && (category < dictStart)) {
151 utext_next32(text); // TODO: cleaner loop structure.
152 c = utext_current32(text);
153 category = ucptrie_get(fBI->fData->fTrie, c);
154 }
155 if (current >= rangeEnd) {
156 break;
157 }
158
159 // We now have a dictionary character. Get the appropriate language object
160 // to deal with it.
161 const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
162 c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
163
164 // Ask the language object if there are any breaks. It will add them to the cache and
165 // leave the text pointer on the other side of its range, ready to search for the next one.
166 if (lbe != nullptr) {
167 foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
168 }
169
170 // Reload the loop variables for the next go-round
171 c = utext_current32(text);
172 category = ucptrie_get(fBI->fData->fTrie, c);
173 }
174
175 // If we found breaks, ensure that the first and last entries are
176 // the original starting and ending position. And initialize the
177 // cache iteration position to the first entry.
178
179 // printf("foundBreakCount = %d\n", foundBreakCount);
180 if (foundBreakCount > 0) {
181 U_ASSERT(foundBreakCount == fBreaks.size());
182 if (startPos < fBreaks.elementAti(0)) {
183 // The dictionary did not place a boundary at the start of the segment of text.
184 // Add one now. This should not commonly happen, but it would be easy for interactions
185 // of the rules for dictionary segments and the break engine implementations to
186 // inadvertently cause it. Cover it here, just in case.
187 fBreaks.insertElementAt(startPos, 0, status);
188 }
189 if (endPos > fBreaks.peeki()) {
190 fBreaks.push(endPos, status);
191 }
192 fPositionInCache = 0;
193 // Note: Dictionary matching may extend beyond the original limit.
194 fStart = fBreaks.elementAti(0);
195 fLimit = fBreaks.peeki();
196 } else {
197 // there were no language-based breaks, even though the segment contained
198 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
199 // for this range will fail, and the calling code will fall back to the rule based boundaries.
200 }
201 }
202
203
204 /*
205 * BreakCache implementation
206 */
207
BreakCache(RuleBasedBreakIterator * bi,UErrorCode & status)208 RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
209 fBI(bi), fSideBuffer(status) {
210 reset();
211 }
212
213
~BreakCache()214 RuleBasedBreakIterator::BreakCache::~BreakCache() {
215 }
216
217
reset(int32_t pos,int32_t ruleStatus)218 void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
219 fStartBufIdx = 0;
220 fEndBufIdx = 0;
221 fTextIdx = pos;
222 fBufIdx = 0;
223 fBoundaries[0] = pos;
224 fStatuses[0] = (uint16_t)ruleStatus;
225 }
226
227
current()228 int32_t RuleBasedBreakIterator::BreakCache::current() {
229 fBI->fPosition = fTextIdx;
230 fBI->fRuleStatusIndex = fStatuses[fBufIdx];
231 fBI->fDone = false;
232 return fTextIdx;
233 }
234
235
following(int32_t startPos,UErrorCode & status)236 void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
237 if (U_FAILURE(status)) {
238 return;
239 }
240 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
241 // startPos is in the cache. Do a next() from that position.
242 // TODO: an awkward set of interactions with bi->fDone
243 // seek() does not clear it; it can't because of interactions with populateNear().
244 // next() does not clear it in the fast-path case, where everything matters. Maybe it should.
245 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
246 fBI->fDone = false;
247 next();
248 }
249 return;
250 }
251
252
preceding(int32_t startPos,UErrorCode & status)253 void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
254 if (U_FAILURE(status)) {
255 return;
256 }
257 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
258 if (startPos == fTextIdx) {
259 previous(status);
260 } else {
261 // seek() leaves the BreakCache positioned at the preceding boundary
262 // if the requested position is between two boundaries.
263 // current() pushes the BreakCache position out to the BreakIterator itself.
264 U_ASSERT(startPos > fTextIdx);
265 current();
266 }
267 }
268 return;
269 }
270
271
272 /*
273 * Out-of-line code for BreakCache::next().
274 * Cache does not already contain the boundary
275 */
nextOL()276 void RuleBasedBreakIterator::BreakCache::nextOL() {
277 fBI->fDone = !populateFollowing();
278 fBI->fPosition = fTextIdx;
279 fBI->fRuleStatusIndex = fStatuses[fBufIdx];
280 return;
281 }
282
283
previous(UErrorCode & status)284 void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
285 if (U_FAILURE(status)) {
286 return;
287 }
288 int32_t initialBufIdx = fBufIdx;
289 if (fBufIdx == fStartBufIdx) {
290 // At start of cache. Prepend to it.
291 populatePreceding(status);
292 } else {
293 // Cache already holds the next boundary
294 fBufIdx = modChunkSize(fBufIdx - 1);
295 fTextIdx = fBoundaries[fBufIdx];
296 }
297 fBI->fDone = (fBufIdx == initialBufIdx);
298 fBI->fPosition = fTextIdx;
299 fBI->fRuleStatusIndex = fStatuses[fBufIdx];
300 return;
301 }
302
303
seek(int32_t pos)304 UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
305 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
306 return false;
307 }
308 if (pos == fBoundaries[fStartBufIdx]) {
309 // Common case: seek(0), from BreakIterator::first()
310 fBufIdx = fStartBufIdx;
311 fTextIdx = fBoundaries[fBufIdx];
312 return true;
313 }
314 if (pos == fBoundaries[fEndBufIdx]) {
315 fBufIdx = fEndBufIdx;
316 fTextIdx = fBoundaries[fBufIdx];
317 return true;
318 }
319
320 int32_t min = fStartBufIdx;
321 int32_t max = fEndBufIdx;
322 while (min != max) {
323 int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
324 probe = modChunkSize(probe);
325 if (fBoundaries[probe] > pos) {
326 max = probe;
327 } else {
328 min = modChunkSize(probe + 1);
329 }
330 }
331 U_ASSERT(fBoundaries[max] > pos);
332 fBufIdx = modChunkSize(max - 1);
333 fTextIdx = fBoundaries[fBufIdx];
334 U_ASSERT(fTextIdx <= pos);
335 return true;
336 }
337
338
populateNear(int32_t position,UErrorCode & status)339 UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
340 if (U_FAILURE(status)) {
341 return false;
342 }
343 U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
344
345 // Add boundaries to the cache near the specified position.
346 // The given position need not be a boundary itself.
347 // The input position must be within the range of the text, and
348 // on a code point boundary.
349 // If the requested position is a break boundary, leave the iteration
350 // position on it.
351 // If the requested position is not a boundary, leave the iteration
352 // position on the preceding boundary and include both the
353 // preceding and following boundaries in the cache.
354 // Additional boundaries, either preceding or following, may be added
355 // to the cache as a side effect.
356
357 // If the requested position is not near already cached positions, clear the existing cache,
358 // find a near-by boundary and begin new cache contents there.
359
360 // Threshold for a text position to be considered near to existing cache contents.
361 // TODO: See issue ICU-22024 "perf tuning of Cache needed."
362 // This value is subject to change. See the ticket for more details.
363 static constexpr int32_t CACHE_NEAR = 15;
364
365 int32_t aBoundary = -1;
366 int32_t ruleStatusIndex = 0;
367 bool retainCache = false;
368 if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
369 // Requested position is near the existing cache. Retain it.
370 retainCache = true;
371 } else if (position <= CACHE_NEAR) {
372 // Requested position is near the start of the text. Fill cache from start, skipping
373 // the need to find a safe point.
374 retainCache = false;
375 aBoundary = 0;
376 } else {
377 // Requested position is not near the existing cache.
378 // Find a safe point to refill the cache from.
379 int32_t backupPos = fBI->handleSafePrevious(position);
380
381 if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
382 // The requested position is beyond the end of the existing cache, but the
383 // reverse rules produced a position near or before the cached region.
384 // Retain the existing cache, and fill from the end of it.
385 retainCache = true;
386 } else if (backupPos < CACHE_NEAR) {
387 // The safe reverse rules moved us to near the start of text.
388 // Take that (index 0) as the backup boundary, avoiding the complication
389 // (in the following block) of moving forward from the safe point to a known boundary.
390 //
391 // Retain the cache if it begins not too far from the requested position.
392 aBoundary = 0;
393 retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
394 } else {
395 // The safe reverse rules produced a position that is neither near the existing
396 // cache, nor near the start of text.
397 // Advance to the boundary following.
398 // There is a complication: the safe reverse rules identify pairs of code points
399 // that are safe. If advancing from the safe point moves forwards by less than
400 // two code points, we need to advance one more time to ensure that the boundary
401 // is good, including a correct rules status value.
402 retainCache = false;
403 fBI->fPosition = backupPos;
404 aBoundary = fBI->handleNext();
405 if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
406 // +4 is a quick test for possibly having advanced only one codepoint.
407 // Four being the length of the longest potential code point, a supplementary in UTF-8
408 utext_setNativeIndex(&fBI->fText, aBoundary);
409 if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
410 // The initial handleNext() only advanced by a single code point. Go again.
411 aBoundary = fBI->handleNext(); // Safe rules identify safe pairs.
412 }
413 }
414 if (aBoundary == UBRK_DONE) {
415 // Note (Andy Heninger): I don't think this condition can occur, but it's hard
416 // to prove that it can't. We ran off the end of the string looking a boundary
417 // following a safe point; choose the end of the string as that boundary.
418 aBoundary = utext_nativeLength(&fBI->fText);
419 }
420 ruleStatusIndex = fBI->fRuleStatusIndex;
421 }
422 }
423
424 if (!retainCache) {
425 U_ASSERT(aBoundary != -1);
426 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
427 }
428
429 // Fill in boundaries between existing cache content and the new requested position.
430
431 if (fBoundaries[fEndBufIdx] < position) {
432 // The last position in the cache precedes the requested position.
433 // Add following position(s) to the cache.
434 while (fBoundaries[fEndBufIdx] < position) {
435 if (!populateFollowing()) {
436 UPRV_UNREACHABLE_EXIT;
437 }
438 }
439 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer.
440 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries.
441 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos.
442 previous(status);
443 }
444 return true;
445 }
446
447 if (fBoundaries[fStartBufIdx] > position) {
448 // The first position in the cache is beyond the requested position.
449 // back up more until we get a boundary <= the requested position.
450 while (fBoundaries[fStartBufIdx] > position) {
451 populatePreceding(status);
452 }
453 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer.
454 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries.
455 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos.
456 next();
457 }
458 if (fTextIdx > position) {
459 // If position is not itself a boundary, the next() loop above will overshoot.
460 // Back up one, leaving cache position at the boundary preceding the requested position.
461 previous(status);
462 }
463 return true;
464 }
465
466 U_ASSERT(fTextIdx == position);
467 return true;
468 }
469
470
471
populateFollowing()472 UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
473 int32_t fromPosition = fBoundaries[fEndBufIdx];
474 int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx];
475 int32_t pos = 0;
476 int32_t ruleStatusIdx = 0;
477
478 if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
479 addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
480 return true;
481 }
482
483 fBI->fPosition = fromPosition;
484 pos = fBI->handleNext();
485 if (pos == UBRK_DONE) {
486 return false;
487 }
488
489 ruleStatusIdx = fBI->fRuleStatusIndex;
490 if (fBI->fDictionaryCharCount > 0) {
491 // The text segment obtained from the rules includes dictionary characters.
492 // Subdivide it, with subdivided results going into the dictionary cache.
493 fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
494 if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
495 addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
496 return true;
497 // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
498 // But be careful with interactions with populateNear().
499 }
500 }
501
502 // Rule based segment did not include dictionary characters.
503 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
504 // meaning that we didn't take the return, above.
505 // Add its end point to the cache.
506 addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
507
508 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
509 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
510 //
511 for (int count=0; count<6; ++count) {
512 pos = fBI->handleNext();
513 if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
514 break;
515 }
516 addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition);
517 }
518
519 return true;
520 }
521
522
populatePreceding(UErrorCode & status)523 UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
524 if (U_FAILURE(status)) {
525 return false;
526 }
527
528 int32_t fromPosition = fBoundaries[fStartBufIdx];
529 if (fromPosition == 0) {
530 return false;
531 }
532
533 int32_t position = 0;
534 int32_t positionStatusIdx = 0;
535
536 if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
537 addPreceding(position, positionStatusIdx, UpdateCachePosition);
538 return true;
539 }
540
541 int32_t backupPosition = fromPosition;
542
543 // Find a boundary somewhere preceding the first already-cached boundary
544 do {
545 backupPosition = backupPosition - 30;
546 if (backupPosition <= 0) {
547 backupPosition = 0;
548 } else {
549 backupPosition = fBI->handleSafePrevious(backupPosition);
550 }
551 if (backupPosition == UBRK_DONE || backupPosition == 0) {
552 position = 0;
553 positionStatusIdx = 0;
554 } else {
555 // Advance to the boundary following the backup position.
556 // There is a complication: the safe reverse rules identify pairs of code points
557 // that are safe. If advancing from the safe point moves forwards by less than
558 // two code points, we need to advance one more time to ensure that the boundary
559 // is good, including a correct rules status value.
560 //
561 fBI->fPosition = backupPosition;
562 position = fBI->handleNext();
563 if (position <= backupPosition + 4) {
564 // +4 is a quick test for possibly having advanced only one codepoint.
565 // Four being the length of the longest potential code point, a supplementary in UTF-8
566 utext_setNativeIndex(&fBI->fText, position);
567 if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
568 // The initial handleNext() only advanced by a single code point. Go again.
569 position = fBI->handleNext(); // Safe rules identify safe pairs.
570 }
571 }
572 positionStatusIdx = fBI->fRuleStatusIndex;
573 }
574 } while (position >= fromPosition);
575
576 // Find boundaries between the one we just located and the first already-cached boundary
577 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
578
579 fSideBuffer.removeAllElements();
580 fSideBuffer.addElement(position, status);
581 fSideBuffer.addElement(positionStatusIdx, status);
582
583 do {
584 int32_t prevPosition = fBI->fPosition = position;
585 int32_t prevStatusIdx = positionStatusIdx;
586 position = fBI->handleNext();
587 positionStatusIdx = fBI->fRuleStatusIndex;
588 if (position == UBRK_DONE) {
589 break;
590 }
591
592 UBool segmentHandledByDictionary = false;
593 if (fBI->fDictionaryCharCount != 0) {
594 // Segment from the rules includes dictionary characters.
595 // Subdivide it, with subdivided results going into the dictionary cache.
596 int32_t dictSegEndPosition = position;
597 fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
598 while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
599 segmentHandledByDictionary = true;
600 U_ASSERT(position > prevPosition);
601 if (position >= fromPosition) {
602 break;
603 }
604 U_ASSERT(position <= dictSegEndPosition);
605 fSideBuffer.addElement(position, status);
606 fSideBuffer.addElement(positionStatusIdx, status);
607 prevPosition = position;
608 }
609 U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
610 }
611
612 if (!segmentHandledByDictionary && position < fromPosition) {
613 fSideBuffer.addElement(position, status);
614 fSideBuffer.addElement(positionStatusIdx, status);
615 }
616 } while (position < fromPosition);
617
618 // Move boundaries from the side buffer to the main circular buffer.
619 UBool success = false;
620 if (!fSideBuffer.isEmpty()) {
621 positionStatusIdx = fSideBuffer.popi();
622 position = fSideBuffer.popi();
623 addPreceding(position, positionStatusIdx, UpdateCachePosition);
624 success = true;
625 }
626
627 while (!fSideBuffer.isEmpty()) {
628 positionStatusIdx = fSideBuffer.popi();
629 position = fSideBuffer.popi();
630 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
631 // No space in circular buffer to hold a new preceding result while
632 // also retaining the current cache (iteration) position.
633 // Bailing out is safe; the cache will refill again if needed.
634 break;
635 }
636 }
637
638 return success;
639 }
640
641
addFollowing(int32_t position,int32_t ruleStatusIdx,UpdatePositionValues update)642 void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
643 U_ASSERT(position > fBoundaries[fEndBufIdx]);
644 U_ASSERT(ruleStatusIdx <= UINT16_MAX);
645 int32_t nextIdx = modChunkSize(fEndBufIdx + 1);
646 if (nextIdx == fStartBufIdx) {
647 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1.
648 }
649 fBoundaries[nextIdx] = position;
650 fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
651 fEndBufIdx = nextIdx;
652 if (update == UpdateCachePosition) {
653 // Set current position to the newly added boundary.
654 fBufIdx = nextIdx;
655 fTextIdx = position;
656 } else {
657 // Retaining the original cache position.
658 // Check if the added boundary wraps around the buffer, and would over-write the original position.
659 // It's the responsibility of callers of this function to not add too many.
660 U_ASSERT(nextIdx != fBufIdx);
661 }
662 }
663
addPreceding(int32_t position,int32_t ruleStatusIdx,UpdatePositionValues update)664 bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
665 U_ASSERT(position < fBoundaries[fStartBufIdx]);
666 U_ASSERT(ruleStatusIdx <= UINT16_MAX);
667 int32_t nextIdx = modChunkSize(fStartBufIdx - 1);
668 if (nextIdx == fEndBufIdx) {
669 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
670 // Failure. The insertion of the new boundary would claim the buffer position that is the
671 // current iteration position. And we also want to retain the current iteration position.
672 // (The buffer is already completely full of entries that precede the iteration position.)
673 return false;
674 }
675 fEndBufIdx = modChunkSize(fEndBufIdx - 1);
676 }
677 fBoundaries[nextIdx] = position;
678 fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
679 fStartBufIdx = nextIdx;
680 if (update == UpdateCachePosition) {
681 fBufIdx = nextIdx;
682 fTextIdx = position;
683 }
684 return true;
685 }
686
687
dumpCache()688 void RuleBasedBreakIterator::BreakCache::dumpCache() {
689 #ifdef RBBI_DEBUG
690 RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx);
691 for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) {
692 RBBIDebugPrintf("%d %d\n", i, fBoundaries[i]);
693 if (i == fEndBufIdx) {
694 break;
695 }
696 }
697 #endif
698 }
699
700 U_NAMESPACE_END
701
702 #endif // #if !UCONFIG_NO_BREAK_ITERATION
703