xref: /aosp_15_r20/external/icu/icu4c/source/common/unisetspan.cpp (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker ******************************************************************************
5*0e209d39SAndroid Build Coastguard Worker *
6*0e209d39SAndroid Build Coastguard Worker *   Copyright (C) 2007-2012, International Business Machines
7*0e209d39SAndroid Build Coastguard Worker *   Corporation and others.  All Rights Reserved.
8*0e209d39SAndroid Build Coastguard Worker *
9*0e209d39SAndroid Build Coastguard Worker ******************************************************************************
10*0e209d39SAndroid Build Coastguard Worker *   file name:  unisetspan.cpp
11*0e209d39SAndroid Build Coastguard Worker *   encoding:   UTF-8
12*0e209d39SAndroid Build Coastguard Worker *   tab size:   8 (not used)
13*0e209d39SAndroid Build Coastguard Worker *   indentation:4
14*0e209d39SAndroid Build Coastguard Worker *
15*0e209d39SAndroid Build Coastguard Worker *   created on: 2007mar01
16*0e209d39SAndroid Build Coastguard Worker *   created by: Markus W. Scherer
17*0e209d39SAndroid Build Coastguard Worker */
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
20*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h"
21*0e209d39SAndroid Build Coastguard Worker #include "unicode/ustring.h"
22*0e209d39SAndroid Build Coastguard Worker #include "unicode/utf8.h"
23*0e209d39SAndroid Build Coastguard Worker #include "unicode/utf16.h"
24*0e209d39SAndroid Build Coastguard Worker #include "cmemory.h"
25*0e209d39SAndroid Build Coastguard Worker #include "uvector.h"
26*0e209d39SAndroid Build Coastguard Worker #include "unisetspan.h"
27*0e209d39SAndroid Build Coastguard Worker 
28*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
29*0e209d39SAndroid Build Coastguard Worker 
30*0e209d39SAndroid Build Coastguard Worker /*
31*0e209d39SAndroid Build Coastguard Worker  * List of offsets from the current position from where to try matching
32*0e209d39SAndroid Build Coastguard Worker  * a code point or a string.
33*0e209d39SAndroid Build Coastguard Worker  * Store offsets rather than indexes to simplify the code and use the same list
34*0e209d39SAndroid Build Coastguard Worker  * for both increments (in span()) and decrements (in spanBack()).
35*0e209d39SAndroid Build Coastguard Worker  *
36*0e209d39SAndroid Build Coastguard Worker  * Assumption: The maximum offset is limited, and the offsets that are stored
37*0e209d39SAndroid Build Coastguard Worker  * at any one time are relatively dense, that is, there are normally no gaps of
38*0e209d39SAndroid Build Coastguard Worker  * hundreds or thousands of offset values.
39*0e209d39SAndroid Build Coastguard Worker  *
40*0e209d39SAndroid Build Coastguard Worker  * The implementation uses a circular buffer of byte flags,
41*0e209d39SAndroid Build Coastguard Worker  * each indicating whether the corresponding offset is in the list.
42*0e209d39SAndroid Build Coastguard Worker  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43*0e209d39SAndroid Build Coastguard Worker  * physically moving part of the list.
44*0e209d39SAndroid Build Coastguard Worker  *
45*0e209d39SAndroid Build Coastguard Worker  * Note: In principle, the caller should setMaxLength() to the maximum of the
46*0e209d39SAndroid Build Coastguard Worker  * max string length and U16_LENGTH/U8_LENGTH to account for
47*0e209d39SAndroid Build Coastguard Worker  * "long" single code points.
48*0e209d39SAndroid Build Coastguard Worker  * However, this implementation uses at least a staticList with more than
49*0e209d39SAndroid Build Coastguard Worker  * U8_LENGTH entries anyway.
50*0e209d39SAndroid Build Coastguard Worker  *
51*0e209d39SAndroid Build Coastguard Worker  * Note: If maxLength were guaranteed to be no more than 32 or 64,
52*0e209d39SAndroid Build Coastguard Worker  * the list could be stored as bit flags in a single integer.
53*0e209d39SAndroid Build Coastguard Worker  * Rather than handling a circular buffer with a start list index,
54*0e209d39SAndroid Build Coastguard Worker  * the integer would simply be shifted when lower offsets are removed.
55*0e209d39SAndroid Build Coastguard Worker  * UnicodeSet does not have a limit on the lengths of strings.
56*0e209d39SAndroid Build Coastguard Worker  */
57*0e209d39SAndroid Build Coastguard Worker class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
58*0e209d39SAndroid Build Coastguard Worker public:
OffsetList()59*0e209d39SAndroid Build Coastguard Worker     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60*0e209d39SAndroid Build Coastguard Worker 
~OffsetList()61*0e209d39SAndroid Build Coastguard Worker     ~OffsetList() {
62*0e209d39SAndroid Build Coastguard Worker         if(list!=staticList) {
63*0e209d39SAndroid Build Coastguard Worker             uprv_free(list);
64*0e209d39SAndroid Build Coastguard Worker         }
65*0e209d39SAndroid Build Coastguard Worker     }
66*0e209d39SAndroid Build Coastguard Worker 
67*0e209d39SAndroid Build Coastguard Worker     // Call exactly once if the list is to be used.
setMaxLength(int32_t maxLength)68*0e209d39SAndroid Build Coastguard Worker     void setMaxLength(int32_t maxLength) {
69*0e209d39SAndroid Build Coastguard Worker         if(maxLength<=(int32_t)sizeof(staticList)) {
70*0e209d39SAndroid Build Coastguard Worker             capacity=(int32_t)sizeof(staticList);
71*0e209d39SAndroid Build Coastguard Worker         } else {
72*0e209d39SAndroid Build Coastguard Worker             UBool *l=(UBool *)uprv_malloc(maxLength);
73*0e209d39SAndroid Build Coastguard Worker             if(l!=nullptr) {
74*0e209d39SAndroid Build Coastguard Worker                 list=l;
75*0e209d39SAndroid Build Coastguard Worker                 capacity=maxLength;
76*0e209d39SAndroid Build Coastguard Worker             }
77*0e209d39SAndroid Build Coastguard Worker         }
78*0e209d39SAndroid Build Coastguard Worker         uprv_memset(list, 0, capacity);
79*0e209d39SAndroid Build Coastguard Worker     }
80*0e209d39SAndroid Build Coastguard Worker 
clear()81*0e209d39SAndroid Build Coastguard Worker     void clear() {
82*0e209d39SAndroid Build Coastguard Worker         uprv_memset(list, 0, capacity);
83*0e209d39SAndroid Build Coastguard Worker         start=length=0;
84*0e209d39SAndroid Build Coastguard Worker     }
85*0e209d39SAndroid Build Coastguard Worker 
isEmpty() const86*0e209d39SAndroid Build Coastguard Worker     UBool isEmpty() const {
87*0e209d39SAndroid Build Coastguard Worker         return (UBool)(length==0);
88*0e209d39SAndroid Build Coastguard Worker     }
89*0e209d39SAndroid Build Coastguard Worker 
90*0e209d39SAndroid Build Coastguard Worker     // Reduce all stored offsets by delta, used when the current position
91*0e209d39SAndroid Build Coastguard Worker     // moves by delta.
92*0e209d39SAndroid Build Coastguard Worker     // There must not be any offsets lower than delta.
93*0e209d39SAndroid Build Coastguard Worker     // If there is an offset equal to delta, it is removed.
94*0e209d39SAndroid Build Coastguard Worker     // delta=[1..maxLength]
shift(int32_t delta)95*0e209d39SAndroid Build Coastguard Worker     void shift(int32_t delta) {
96*0e209d39SAndroid Build Coastguard Worker         int32_t i=start+delta;
97*0e209d39SAndroid Build Coastguard Worker         if(i>=capacity) {
98*0e209d39SAndroid Build Coastguard Worker             i-=capacity;
99*0e209d39SAndroid Build Coastguard Worker         }
100*0e209d39SAndroid Build Coastguard Worker         if(list[i]) {
101*0e209d39SAndroid Build Coastguard Worker             list[i]=false;
102*0e209d39SAndroid Build Coastguard Worker             --length;
103*0e209d39SAndroid Build Coastguard Worker         }
104*0e209d39SAndroid Build Coastguard Worker         start=i;
105*0e209d39SAndroid Build Coastguard Worker     }
106*0e209d39SAndroid Build Coastguard Worker 
107*0e209d39SAndroid Build Coastguard Worker     // Add an offset. The list must not contain it yet.
108*0e209d39SAndroid Build Coastguard Worker     // offset=[1..maxLength]
addOffset(int32_t offset)109*0e209d39SAndroid Build Coastguard Worker     void addOffset(int32_t offset) {
110*0e209d39SAndroid Build Coastguard Worker         int32_t i=start+offset;
111*0e209d39SAndroid Build Coastguard Worker         if(i>=capacity) {
112*0e209d39SAndroid Build Coastguard Worker             i-=capacity;
113*0e209d39SAndroid Build Coastguard Worker         }
114*0e209d39SAndroid Build Coastguard Worker         list[i]=true;
115*0e209d39SAndroid Build Coastguard Worker         ++length;
116*0e209d39SAndroid Build Coastguard Worker     }
117*0e209d39SAndroid Build Coastguard Worker 
118*0e209d39SAndroid Build Coastguard Worker     // offset=[1..maxLength]
containsOffset(int32_t offset) const119*0e209d39SAndroid Build Coastguard Worker     UBool containsOffset(int32_t offset) const {
120*0e209d39SAndroid Build Coastguard Worker         int32_t i=start+offset;
121*0e209d39SAndroid Build Coastguard Worker         if(i>=capacity) {
122*0e209d39SAndroid Build Coastguard Worker             i-=capacity;
123*0e209d39SAndroid Build Coastguard Worker         }
124*0e209d39SAndroid Build Coastguard Worker         return list[i];
125*0e209d39SAndroid Build Coastguard Worker     }
126*0e209d39SAndroid Build Coastguard Worker 
127*0e209d39SAndroid Build Coastguard Worker     // Find the lowest stored offset from a non-empty list, remove it,
128*0e209d39SAndroid Build Coastguard Worker     // and reduce all other offsets by this minimum.
129*0e209d39SAndroid Build Coastguard Worker     // Returns [1..maxLength].
popMinimum()130*0e209d39SAndroid Build Coastguard Worker     int32_t popMinimum() {
131*0e209d39SAndroid Build Coastguard Worker         // Look for the next offset in list[start+1..capacity-1].
132*0e209d39SAndroid Build Coastguard Worker         int32_t i=start, result;
133*0e209d39SAndroid Build Coastguard Worker         while(++i<capacity) {
134*0e209d39SAndroid Build Coastguard Worker             if(list[i]) {
135*0e209d39SAndroid Build Coastguard Worker                 list[i]=false;
136*0e209d39SAndroid Build Coastguard Worker                 --length;
137*0e209d39SAndroid Build Coastguard Worker                 result=i-start;
138*0e209d39SAndroid Build Coastguard Worker                 start=i;
139*0e209d39SAndroid Build Coastguard Worker                 return result;
140*0e209d39SAndroid Build Coastguard Worker             }
141*0e209d39SAndroid Build Coastguard Worker         }
142*0e209d39SAndroid Build Coastguard Worker         // i==capacity
143*0e209d39SAndroid Build Coastguard Worker 
144*0e209d39SAndroid Build Coastguard Worker         // Wrap around and look for the next offset in list[0..start].
145*0e209d39SAndroid Build Coastguard Worker         // Since the list is not empty, there will be one.
146*0e209d39SAndroid Build Coastguard Worker         result=capacity-start;
147*0e209d39SAndroid Build Coastguard Worker         i=0;
148*0e209d39SAndroid Build Coastguard Worker         while(!list[i]) {
149*0e209d39SAndroid Build Coastguard Worker             ++i;
150*0e209d39SAndroid Build Coastguard Worker         }
151*0e209d39SAndroid Build Coastguard Worker         list[i]=false;
152*0e209d39SAndroid Build Coastguard Worker         --length;
153*0e209d39SAndroid Build Coastguard Worker         start=i;
154*0e209d39SAndroid Build Coastguard Worker         return result+=i;
155*0e209d39SAndroid Build Coastguard Worker     }
156*0e209d39SAndroid Build Coastguard Worker 
157*0e209d39SAndroid Build Coastguard Worker private:
158*0e209d39SAndroid Build Coastguard Worker     UBool *list;
159*0e209d39SAndroid Build Coastguard Worker     int32_t capacity;
160*0e209d39SAndroid Build Coastguard Worker     int32_t length;
161*0e209d39SAndroid Build Coastguard Worker     int32_t start;
162*0e209d39SAndroid Build Coastguard Worker 
163*0e209d39SAndroid Build Coastguard Worker     UBool staticList[16];
164*0e209d39SAndroid Build Coastguard Worker };
165*0e209d39SAndroid Build Coastguard Worker 
166*0e209d39SAndroid Build Coastguard Worker // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167*0e209d39SAndroid Build Coastguard Worker static int32_t
getUTF8Length(const char16_t * s,int32_t length)168*0e209d39SAndroid Build Coastguard Worker getUTF8Length(const char16_t *s, int32_t length) {
169*0e209d39SAndroid Build Coastguard Worker     UErrorCode errorCode=U_ZERO_ERROR;
170*0e209d39SAndroid Build Coastguard Worker     int32_t length8=0;
171*0e209d39SAndroid Build Coastguard Worker     u_strToUTF8(nullptr, 0, &length8, s, length, &errorCode);
172*0e209d39SAndroid Build Coastguard Worker     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173*0e209d39SAndroid Build Coastguard Worker         return length8;
174*0e209d39SAndroid Build Coastguard Worker     } else {
175*0e209d39SAndroid Build Coastguard Worker         // The string contains an unpaired surrogate.
176*0e209d39SAndroid Build Coastguard Worker         // Ignore this string.
177*0e209d39SAndroid Build Coastguard Worker         return 0;
178*0e209d39SAndroid Build Coastguard Worker     }
179*0e209d39SAndroid Build Coastguard Worker }
180*0e209d39SAndroid Build Coastguard Worker 
181*0e209d39SAndroid Build Coastguard Worker // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182*0e209d39SAndroid Build Coastguard Worker static int32_t
appendUTF8(const char16_t * s,int32_t length,uint8_t * t,int32_t capacity)183*0e209d39SAndroid Build Coastguard Worker appendUTF8(const char16_t *s, int32_t length, uint8_t *t, int32_t capacity) {
184*0e209d39SAndroid Build Coastguard Worker     UErrorCode errorCode=U_ZERO_ERROR;
185*0e209d39SAndroid Build Coastguard Worker     int32_t length8=0;
186*0e209d39SAndroid Build Coastguard Worker     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
187*0e209d39SAndroid Build Coastguard Worker     if(U_SUCCESS(errorCode)) {
188*0e209d39SAndroid Build Coastguard Worker         return length8;
189*0e209d39SAndroid Build Coastguard Worker     } else {
190*0e209d39SAndroid Build Coastguard Worker         // The string contains an unpaired surrogate.
191*0e209d39SAndroid Build Coastguard Worker         // Ignore this string.
192*0e209d39SAndroid Build Coastguard Worker         return 0;
193*0e209d39SAndroid Build Coastguard Worker     }
194*0e209d39SAndroid Build Coastguard Worker }
195*0e209d39SAndroid Build Coastguard Worker 
196*0e209d39SAndroid Build Coastguard Worker static inline uint8_t
makeSpanLengthByte(int32_t spanLength)197*0e209d39SAndroid Build Coastguard Worker makeSpanLengthByte(int32_t spanLength) {
198*0e209d39SAndroid Build Coastguard Worker     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199*0e209d39SAndroid Build Coastguard Worker     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200*0e209d39SAndroid Build Coastguard Worker }
201*0e209d39SAndroid Build Coastguard Worker 
202*0e209d39SAndroid Build Coastguard Worker // Construct for all variants of span(), or only for any one variant.
203*0e209d39SAndroid Build Coastguard Worker // Initialize as little as possible, for single use.
UnicodeSetStringSpan(const UnicodeSet & set,const UVector & setStrings,uint32_t which)204*0e209d39SAndroid Build Coastguard Worker UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205*0e209d39SAndroid Build Coastguard Worker                                            const UVector &setStrings,
206*0e209d39SAndroid Build Coastguard Worker                                            uint32_t which)
207*0e209d39SAndroid Build Coastguard Worker         : spanSet(0, 0x10ffff), pSpanNotSet(nullptr), strings(setStrings),
208*0e209d39SAndroid Build Coastguard Worker           utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
209*0e209d39SAndroid Build Coastguard Worker           utf8Length(0),
210*0e209d39SAndroid Build Coastguard Worker           maxLength16(0), maxLength8(0),
211*0e209d39SAndroid Build Coastguard Worker           all((UBool)(which==ALL)) {
212*0e209d39SAndroid Build Coastguard Worker     spanSet.retainAll(set);
213*0e209d39SAndroid Build Coastguard Worker     if(which&NOT_CONTAINED) {
214*0e209d39SAndroid Build Coastguard Worker         // Default to the same sets.
215*0e209d39SAndroid Build Coastguard Worker         // addToSpanNotSet() will create a separate set if necessary.
216*0e209d39SAndroid Build Coastguard Worker         pSpanNotSet=&spanSet;
217*0e209d39SAndroid Build Coastguard Worker     }
218*0e209d39SAndroid Build Coastguard Worker 
219*0e209d39SAndroid Build Coastguard Worker     // Determine if the strings even need to be taken into account at all for span() etc.
220*0e209d39SAndroid Build Coastguard Worker     // If any string is relevant, then all strings need to be used for
221*0e209d39SAndroid Build Coastguard Worker     // span(longest match) but only the relevant ones for span(while contained).
222*0e209d39SAndroid Build Coastguard Worker     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223*0e209d39SAndroid Build Coastguard Worker     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224*0e209d39SAndroid Build Coastguard Worker     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225*0e209d39SAndroid Build Coastguard Worker     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226*0e209d39SAndroid Build Coastguard Worker     int32_t stringsLength=strings.size();
227*0e209d39SAndroid Build Coastguard Worker 
228*0e209d39SAndroid Build Coastguard Worker     int32_t i, spanLength;
229*0e209d39SAndroid Build Coastguard Worker     UBool someRelevant=false;
230*0e209d39SAndroid Build Coastguard Worker     for(i=0; i<stringsLength; ++i) {
231*0e209d39SAndroid Build Coastguard Worker         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
232*0e209d39SAndroid Build Coastguard Worker         const char16_t *s16=string.getBuffer();
233*0e209d39SAndroid Build Coastguard Worker         int32_t length16=string.length();
234*0e209d39SAndroid Build Coastguard Worker         if (length16==0) {
235*0e209d39SAndroid Build Coastguard Worker             continue;  // skip the empty string
236*0e209d39SAndroid Build Coastguard Worker         }
237*0e209d39SAndroid Build Coastguard Worker         UBool thisRelevant;
238*0e209d39SAndroid Build Coastguard Worker         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
239*0e209d39SAndroid Build Coastguard Worker         if(spanLength<length16) {  // Relevant string.
240*0e209d39SAndroid Build Coastguard Worker             someRelevant=thisRelevant=true;
241*0e209d39SAndroid Build Coastguard Worker         } else {
242*0e209d39SAndroid Build Coastguard Worker             thisRelevant=false;
243*0e209d39SAndroid Build Coastguard Worker         }
244*0e209d39SAndroid Build Coastguard Worker         if((which&UTF16) && length16>maxLength16) {
245*0e209d39SAndroid Build Coastguard Worker             maxLength16=length16;
246*0e209d39SAndroid Build Coastguard Worker         }
247*0e209d39SAndroid Build Coastguard Worker         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
248*0e209d39SAndroid Build Coastguard Worker             int32_t length8=getUTF8Length(s16, length16);
249*0e209d39SAndroid Build Coastguard Worker             utf8Length+=length8;
250*0e209d39SAndroid Build Coastguard Worker             if(length8>maxLength8) {
251*0e209d39SAndroid Build Coastguard Worker                 maxLength8=length8;
252*0e209d39SAndroid Build Coastguard Worker             }
253*0e209d39SAndroid Build Coastguard Worker         }
254*0e209d39SAndroid Build Coastguard Worker     }
255*0e209d39SAndroid Build Coastguard Worker     if(!someRelevant) {
256*0e209d39SAndroid Build Coastguard Worker         maxLength16=maxLength8=0;
257*0e209d39SAndroid Build Coastguard Worker         return;
258*0e209d39SAndroid Build Coastguard Worker     }
259*0e209d39SAndroid Build Coastguard Worker 
260*0e209d39SAndroid Build Coastguard Worker     // Freeze after checking for the need to use strings at all because freezing
261*0e209d39SAndroid Build Coastguard Worker     // a set takes some time and memory which are wasted if there are no relevant strings.
262*0e209d39SAndroid Build Coastguard Worker     if(all) {
263*0e209d39SAndroid Build Coastguard Worker         spanSet.freeze();
264*0e209d39SAndroid Build Coastguard Worker     }
265*0e209d39SAndroid Build Coastguard Worker 
266*0e209d39SAndroid Build Coastguard Worker     uint8_t *spanBackLengths;
267*0e209d39SAndroid Build Coastguard Worker     uint8_t *spanUTF8Lengths;
268*0e209d39SAndroid Build Coastguard Worker     uint8_t *spanBackUTF8Lengths;
269*0e209d39SAndroid Build Coastguard Worker 
270*0e209d39SAndroid Build Coastguard Worker     // Allocate a block of meta data.
271*0e209d39SAndroid Build Coastguard Worker     int32_t allocSize;
272*0e209d39SAndroid Build Coastguard Worker     if(all) {
273*0e209d39SAndroid Build Coastguard Worker         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
274*0e209d39SAndroid Build Coastguard Worker         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
275*0e209d39SAndroid Build Coastguard Worker     } else {
276*0e209d39SAndroid Build Coastguard Worker         allocSize=stringsLength;  // One set of span lengths.
277*0e209d39SAndroid Build Coastguard Worker         if(which&UTF8) {
278*0e209d39SAndroid Build Coastguard Worker             // UTF-8 lengths and UTF-8 strings.
279*0e209d39SAndroid Build Coastguard Worker             allocSize+=stringsLength*4+utf8Length;
280*0e209d39SAndroid Build Coastguard Worker         }
281*0e209d39SAndroid Build Coastguard Worker     }
282*0e209d39SAndroid Build Coastguard Worker     if(allocSize<=(int32_t)sizeof(staticLengths)) {
283*0e209d39SAndroid Build Coastguard Worker         utf8Lengths=staticLengths;
284*0e209d39SAndroid Build Coastguard Worker     } else {
285*0e209d39SAndroid Build Coastguard Worker         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
286*0e209d39SAndroid Build Coastguard Worker         if(utf8Lengths==nullptr) {
287*0e209d39SAndroid Build Coastguard Worker             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return false.
288*0e209d39SAndroid Build Coastguard Worker             return;  // Out of memory.
289*0e209d39SAndroid Build Coastguard Worker         }
290*0e209d39SAndroid Build Coastguard Worker     }
291*0e209d39SAndroid Build Coastguard Worker 
292*0e209d39SAndroid Build Coastguard Worker     if(all) {
293*0e209d39SAndroid Build Coastguard Worker         // Store span lengths for all span() variants.
294*0e209d39SAndroid Build Coastguard Worker         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
295*0e209d39SAndroid Build Coastguard Worker         spanBackLengths=spanLengths+stringsLength;
296*0e209d39SAndroid Build Coastguard Worker         spanUTF8Lengths=spanBackLengths+stringsLength;
297*0e209d39SAndroid Build Coastguard Worker         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
298*0e209d39SAndroid Build Coastguard Worker         utf8=spanBackUTF8Lengths+stringsLength;
299*0e209d39SAndroid Build Coastguard Worker     } else {
300*0e209d39SAndroid Build Coastguard Worker         // Store span lengths for only one span() variant.
301*0e209d39SAndroid Build Coastguard Worker         if(which&UTF8) {
302*0e209d39SAndroid Build Coastguard Worker             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
303*0e209d39SAndroid Build Coastguard Worker             utf8=spanLengths+stringsLength;
304*0e209d39SAndroid Build Coastguard Worker         } else {
305*0e209d39SAndroid Build Coastguard Worker             spanLengths=(uint8_t *)utf8Lengths;
306*0e209d39SAndroid Build Coastguard Worker         }
307*0e209d39SAndroid Build Coastguard Worker         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
308*0e209d39SAndroid Build Coastguard Worker     }
309*0e209d39SAndroid Build Coastguard Worker 
310*0e209d39SAndroid Build Coastguard Worker     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
311*0e209d39SAndroid Build Coastguard Worker     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
312*0e209d39SAndroid Build Coastguard Worker 
313*0e209d39SAndroid Build Coastguard Worker     for(i=0; i<stringsLength; ++i) {
314*0e209d39SAndroid Build Coastguard Worker         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
315*0e209d39SAndroid Build Coastguard Worker         const char16_t *s16=string.getBuffer();
316*0e209d39SAndroid Build Coastguard Worker         int32_t length16=string.length();
317*0e209d39SAndroid Build Coastguard Worker         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
318*0e209d39SAndroid Build Coastguard Worker         if(spanLength<length16 && length16>0) {  // Relevant string.
319*0e209d39SAndroid Build Coastguard Worker             if(which&UTF16) {
320*0e209d39SAndroid Build Coastguard Worker                 if(which&CONTAINED) {
321*0e209d39SAndroid Build Coastguard Worker                     if(which&FWD) {
322*0e209d39SAndroid Build Coastguard Worker                         spanLengths[i]=makeSpanLengthByte(spanLength);
323*0e209d39SAndroid Build Coastguard Worker                     }
324*0e209d39SAndroid Build Coastguard Worker                     if(which&BACK) {
325*0e209d39SAndroid Build Coastguard Worker                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
326*0e209d39SAndroid Build Coastguard Worker                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
327*0e209d39SAndroid Build Coastguard Worker                     }
328*0e209d39SAndroid Build Coastguard Worker                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
329*0e209d39SAndroid Build Coastguard Worker                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
330*0e209d39SAndroid Build Coastguard Worker                 }
331*0e209d39SAndroid Build Coastguard Worker             }
332*0e209d39SAndroid Build Coastguard Worker             if(which&UTF8) {
333*0e209d39SAndroid Build Coastguard Worker                 uint8_t *s8=utf8+utf8Count;
334*0e209d39SAndroid Build Coastguard Worker                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
335*0e209d39SAndroid Build Coastguard Worker                 utf8Count+=utf8Lengths[i]=length8;
336*0e209d39SAndroid Build Coastguard Worker                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
337*0e209d39SAndroid Build Coastguard Worker                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
338*0e209d39SAndroid Build Coastguard Worker                 } else {  // Relevant for UTF-8.
339*0e209d39SAndroid Build Coastguard Worker                     if(which&CONTAINED) {
340*0e209d39SAndroid Build Coastguard Worker                         if(which&FWD) {
341*0e209d39SAndroid Build Coastguard Worker                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
342*0e209d39SAndroid Build Coastguard Worker                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
343*0e209d39SAndroid Build Coastguard Worker                         }
344*0e209d39SAndroid Build Coastguard Worker                         if(which&BACK) {
345*0e209d39SAndroid Build Coastguard Worker                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
346*0e209d39SAndroid Build Coastguard Worker                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
347*0e209d39SAndroid Build Coastguard Worker                         }
348*0e209d39SAndroid Build Coastguard Worker                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
349*0e209d39SAndroid Build Coastguard Worker                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
350*0e209d39SAndroid Build Coastguard Worker                     }
351*0e209d39SAndroid Build Coastguard Worker                 }
352*0e209d39SAndroid Build Coastguard Worker             }
353*0e209d39SAndroid Build Coastguard Worker             if(which&NOT_CONTAINED) {
354*0e209d39SAndroid Build Coastguard Worker                 // Add string start and end code points to the spanNotSet so that
355*0e209d39SAndroid Build Coastguard Worker                 // a span(while not contained) stops before any string.
356*0e209d39SAndroid Build Coastguard Worker                 UChar32 c;
357*0e209d39SAndroid Build Coastguard Worker                 if(which&FWD) {
358*0e209d39SAndroid Build Coastguard Worker                     int32_t len=0;
359*0e209d39SAndroid Build Coastguard Worker                     U16_NEXT(s16, len, length16, c);
360*0e209d39SAndroid Build Coastguard Worker                     addToSpanNotSet(c);
361*0e209d39SAndroid Build Coastguard Worker                 }
362*0e209d39SAndroid Build Coastguard Worker                 if(which&BACK) {
363*0e209d39SAndroid Build Coastguard Worker                     int32_t len=length16;
364*0e209d39SAndroid Build Coastguard Worker                     U16_PREV(s16, 0, len, c);
365*0e209d39SAndroid Build Coastguard Worker                     addToSpanNotSet(c);
366*0e209d39SAndroid Build Coastguard Worker                 }
367*0e209d39SAndroid Build Coastguard Worker             }
368*0e209d39SAndroid Build Coastguard Worker         } else {  // Irrelevant string. (Also the empty string.)
369*0e209d39SAndroid Build Coastguard Worker             if(which&UTF8) {
370*0e209d39SAndroid Build Coastguard Worker                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
371*0e209d39SAndroid Build Coastguard Worker                     uint8_t *s8=utf8+utf8Count;
372*0e209d39SAndroid Build Coastguard Worker                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
373*0e209d39SAndroid Build Coastguard Worker                     utf8Count+=utf8Lengths[i]=length8;
374*0e209d39SAndroid Build Coastguard Worker                 } else {
375*0e209d39SAndroid Build Coastguard Worker                     utf8Lengths[i]=0;
376*0e209d39SAndroid Build Coastguard Worker                 }
377*0e209d39SAndroid Build Coastguard Worker             }
378*0e209d39SAndroid Build Coastguard Worker             if(all) {
379*0e209d39SAndroid Build Coastguard Worker                 spanLengths[i]=spanBackLengths[i]=
380*0e209d39SAndroid Build Coastguard Worker                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
381*0e209d39SAndroid Build Coastguard Worker                         (uint8_t)ALL_CP_CONTAINED;
382*0e209d39SAndroid Build Coastguard Worker             } else {
383*0e209d39SAndroid Build Coastguard Worker                 // All spanXYZLengths pointers contain the same address.
384*0e209d39SAndroid Build Coastguard Worker                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
385*0e209d39SAndroid Build Coastguard Worker             }
386*0e209d39SAndroid Build Coastguard Worker         }
387*0e209d39SAndroid Build Coastguard Worker     }
388*0e209d39SAndroid Build Coastguard Worker 
389*0e209d39SAndroid Build Coastguard Worker     // Finish.
390*0e209d39SAndroid Build Coastguard Worker     if(all) {
391*0e209d39SAndroid Build Coastguard Worker         pSpanNotSet->freeze();
392*0e209d39SAndroid Build Coastguard Worker     }
393*0e209d39SAndroid Build Coastguard Worker }
394*0e209d39SAndroid Build Coastguard Worker 
395*0e209d39SAndroid Build Coastguard Worker // Copy constructor. Assumes which==ALL for a frozen set.
UnicodeSetStringSpan(const UnicodeSetStringSpan & otherStringSpan,const UVector & newParentSetStrings)396*0e209d39SAndroid Build Coastguard Worker UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
397*0e209d39SAndroid Build Coastguard Worker                                            const UVector &newParentSetStrings)
398*0e209d39SAndroid Build Coastguard Worker         : spanSet(otherStringSpan.spanSet), pSpanNotSet(nullptr), strings(newParentSetStrings),
399*0e209d39SAndroid Build Coastguard Worker           utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
400*0e209d39SAndroid Build Coastguard Worker           utf8Length(otherStringSpan.utf8Length),
401*0e209d39SAndroid Build Coastguard Worker           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
402*0e209d39SAndroid Build Coastguard Worker           all(true) {
403*0e209d39SAndroid Build Coastguard Worker     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
404*0e209d39SAndroid Build Coastguard Worker         pSpanNotSet=&spanSet;
405*0e209d39SAndroid Build Coastguard Worker     } else {
406*0e209d39SAndroid Build Coastguard Worker         pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
407*0e209d39SAndroid Build Coastguard Worker     }
408*0e209d39SAndroid Build Coastguard Worker 
409*0e209d39SAndroid Build Coastguard Worker     // Allocate a block of meta data.
410*0e209d39SAndroid Build Coastguard Worker     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
411*0e209d39SAndroid Build Coastguard Worker     int32_t stringsLength=strings.size();
412*0e209d39SAndroid Build Coastguard Worker     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
413*0e209d39SAndroid Build Coastguard Worker     if(allocSize<=(int32_t)sizeof(staticLengths)) {
414*0e209d39SAndroid Build Coastguard Worker         utf8Lengths=staticLengths;
415*0e209d39SAndroid Build Coastguard Worker     } else {
416*0e209d39SAndroid Build Coastguard Worker         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
417*0e209d39SAndroid Build Coastguard Worker         if(utf8Lengths==nullptr) {
418*0e209d39SAndroid Build Coastguard Worker             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return false.
419*0e209d39SAndroid Build Coastguard Worker             return;  // Out of memory.
420*0e209d39SAndroid Build Coastguard Worker         }
421*0e209d39SAndroid Build Coastguard Worker     }
422*0e209d39SAndroid Build Coastguard Worker 
423*0e209d39SAndroid Build Coastguard Worker     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
424*0e209d39SAndroid Build Coastguard Worker     utf8=spanLengths+stringsLength*4;
425*0e209d39SAndroid Build Coastguard Worker     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
426*0e209d39SAndroid Build Coastguard Worker }
427*0e209d39SAndroid Build Coastguard Worker 
~UnicodeSetStringSpan()428*0e209d39SAndroid Build Coastguard Worker UnicodeSetStringSpan::~UnicodeSetStringSpan() {
429*0e209d39SAndroid Build Coastguard Worker     if(pSpanNotSet!=nullptr && pSpanNotSet!=&spanSet) {
430*0e209d39SAndroid Build Coastguard Worker         delete pSpanNotSet;
431*0e209d39SAndroid Build Coastguard Worker     }
432*0e209d39SAndroid Build Coastguard Worker     if(utf8Lengths!=nullptr && utf8Lengths!=staticLengths) {
433*0e209d39SAndroid Build Coastguard Worker         uprv_free(utf8Lengths);
434*0e209d39SAndroid Build Coastguard Worker     }
435*0e209d39SAndroid Build Coastguard Worker }
436*0e209d39SAndroid Build Coastguard Worker 
addToSpanNotSet(UChar32 c)437*0e209d39SAndroid Build Coastguard Worker void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
438*0e209d39SAndroid Build Coastguard Worker     if(pSpanNotSet==nullptr || pSpanNotSet==&spanSet) {
439*0e209d39SAndroid Build Coastguard Worker         if(spanSet.contains(c)) {
440*0e209d39SAndroid Build Coastguard Worker             return;  // Nothing to do.
441*0e209d39SAndroid Build Coastguard Worker         }
442*0e209d39SAndroid Build Coastguard Worker         UnicodeSet *newSet=spanSet.cloneAsThawed();
443*0e209d39SAndroid Build Coastguard Worker         if(newSet==nullptr) {
444*0e209d39SAndroid Build Coastguard Worker             return;  // Out of memory.
445*0e209d39SAndroid Build Coastguard Worker         } else {
446*0e209d39SAndroid Build Coastguard Worker             pSpanNotSet=newSet;
447*0e209d39SAndroid Build Coastguard Worker         }
448*0e209d39SAndroid Build Coastguard Worker     }
449*0e209d39SAndroid Build Coastguard Worker     pSpanNotSet->add(c);
450*0e209d39SAndroid Build Coastguard Worker }
451*0e209d39SAndroid Build Coastguard Worker 
452*0e209d39SAndroid Build Coastguard Worker // Compare strings without any argument checks. Requires length>0.
453*0e209d39SAndroid Build Coastguard Worker static inline UBool
matches16(const char16_t * s,const char16_t * t,int32_t length)454*0e209d39SAndroid Build Coastguard Worker matches16(const char16_t *s, const char16_t *t, int32_t length) {
455*0e209d39SAndroid Build Coastguard Worker     do {
456*0e209d39SAndroid Build Coastguard Worker         if(*s++!=*t++) {
457*0e209d39SAndroid Build Coastguard Worker             return false;
458*0e209d39SAndroid Build Coastguard Worker         }
459*0e209d39SAndroid Build Coastguard Worker     } while(--length>0);
460*0e209d39SAndroid Build Coastguard Worker     return true;
461*0e209d39SAndroid Build Coastguard Worker }
462*0e209d39SAndroid Build Coastguard Worker 
463*0e209d39SAndroid Build Coastguard Worker static inline UBool
matches8(const uint8_t * s,const uint8_t * t,int32_t length)464*0e209d39SAndroid Build Coastguard Worker matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
465*0e209d39SAndroid Build Coastguard Worker     do {
466*0e209d39SAndroid Build Coastguard Worker         if(*s++!=*t++) {
467*0e209d39SAndroid Build Coastguard Worker             return false;
468*0e209d39SAndroid Build Coastguard Worker         }
469*0e209d39SAndroid Build Coastguard Worker     } while(--length>0);
470*0e209d39SAndroid Build Coastguard Worker     return true;
471*0e209d39SAndroid Build Coastguard Worker }
472*0e209d39SAndroid Build Coastguard Worker 
473*0e209d39SAndroid Build Coastguard Worker // Compare 16-bit Unicode strings (which may be malformed UTF-16)
474*0e209d39SAndroid Build Coastguard Worker // at code point boundaries.
475*0e209d39SAndroid Build Coastguard Worker // That is, each edge of a match must not be in the middle of a surrogate pair.
476*0e209d39SAndroid Build Coastguard Worker static inline UBool
matches16CPB(const char16_t * s,int32_t start,int32_t limit,const char16_t * t,int32_t length)477*0e209d39SAndroid Build Coastguard Worker matches16CPB(const char16_t *s, int32_t start, int32_t limit, const char16_t *t, int32_t length) {
478*0e209d39SAndroid Build Coastguard Worker     s+=start;
479*0e209d39SAndroid Build Coastguard Worker     limit-=start;
480*0e209d39SAndroid Build Coastguard Worker     return matches16(s, t, length) &&
481*0e209d39SAndroid Build Coastguard Worker            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
482*0e209d39SAndroid Build Coastguard Worker            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
483*0e209d39SAndroid Build Coastguard Worker }
484*0e209d39SAndroid Build Coastguard Worker 
485*0e209d39SAndroid Build Coastguard Worker // Does the set contain the next code point?
486*0e209d39SAndroid Build Coastguard Worker // If so, return its length; otherwise return its negative length.
487*0e209d39SAndroid Build Coastguard Worker static inline int32_t
spanOne(const UnicodeSet & set,const char16_t * s,int32_t length)488*0e209d39SAndroid Build Coastguard Worker spanOne(const UnicodeSet &set, const char16_t *s, int32_t length) {
489*0e209d39SAndroid Build Coastguard Worker     char16_t c=*s, c2;
490*0e209d39SAndroid Build Coastguard Worker     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
491*0e209d39SAndroid Build Coastguard Worker         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
492*0e209d39SAndroid Build Coastguard Worker     }
493*0e209d39SAndroid Build Coastguard Worker     return set.contains(c) ? 1 : -1;
494*0e209d39SAndroid Build Coastguard Worker }
495*0e209d39SAndroid Build Coastguard Worker 
496*0e209d39SAndroid Build Coastguard Worker static inline int32_t
spanOneBack(const UnicodeSet & set,const char16_t * s,int32_t length)497*0e209d39SAndroid Build Coastguard Worker spanOneBack(const UnicodeSet &set, const char16_t *s, int32_t length) {
498*0e209d39SAndroid Build Coastguard Worker     char16_t c=s[length-1], c2;
499*0e209d39SAndroid Build Coastguard Worker     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
500*0e209d39SAndroid Build Coastguard Worker         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
501*0e209d39SAndroid Build Coastguard Worker     }
502*0e209d39SAndroid Build Coastguard Worker     return set.contains(c) ? 1 : -1;
503*0e209d39SAndroid Build Coastguard Worker }
504*0e209d39SAndroid Build Coastguard Worker 
505*0e209d39SAndroid Build Coastguard Worker static inline int32_t
spanOneUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)506*0e209d39SAndroid Build Coastguard Worker spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
507*0e209d39SAndroid Build Coastguard Worker     UChar32 c=*s;
508*0e209d39SAndroid Build Coastguard Worker     if(U8_IS_SINGLE(c)) {
509*0e209d39SAndroid Build Coastguard Worker         return set.contains(c) ? 1 : -1;
510*0e209d39SAndroid Build Coastguard Worker     }
511*0e209d39SAndroid Build Coastguard Worker     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
512*0e209d39SAndroid Build Coastguard Worker     int32_t i=0;
513*0e209d39SAndroid Build Coastguard Worker     U8_NEXT_OR_FFFD(s, i, length, c);
514*0e209d39SAndroid Build Coastguard Worker     return set.contains(c) ? i : -i;
515*0e209d39SAndroid Build Coastguard Worker }
516*0e209d39SAndroid Build Coastguard Worker 
517*0e209d39SAndroid Build Coastguard Worker static inline int32_t
spanOneBackUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)518*0e209d39SAndroid Build Coastguard Worker spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
519*0e209d39SAndroid Build Coastguard Worker     UChar32 c=s[length-1];
520*0e209d39SAndroid Build Coastguard Worker     if(U8_IS_SINGLE(c)) {
521*0e209d39SAndroid Build Coastguard Worker         return set.contains(c) ? 1 : -1;
522*0e209d39SAndroid Build Coastguard Worker     }
523*0e209d39SAndroid Build Coastguard Worker     int32_t i=length-1;
524*0e209d39SAndroid Build Coastguard Worker     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
525*0e209d39SAndroid Build Coastguard Worker     length-=i;
526*0e209d39SAndroid Build Coastguard Worker     return set.contains(c) ? length : -length;
527*0e209d39SAndroid Build Coastguard Worker }
528*0e209d39SAndroid Build Coastguard Worker 
529*0e209d39SAndroid Build Coastguard Worker /*
530*0e209d39SAndroid Build Coastguard Worker  * Note: In span() when spanLength==0 (after a string match, or at the beginning
531*0e209d39SAndroid Build Coastguard Worker  * after an empty code point span) and in spanNot() and spanNotUTF8(),
532*0e209d39SAndroid Build Coastguard Worker  * string matching could use a binary search
533*0e209d39SAndroid Build Coastguard Worker  * because all string matches are done from the same start index.
534*0e209d39SAndroid Build Coastguard Worker  *
535*0e209d39SAndroid Build Coastguard Worker  * For UTF-8, this would require a comparison function that returns UTF-16 order.
536*0e209d39SAndroid Build Coastguard Worker  *
537*0e209d39SAndroid Build Coastguard Worker  * This optimization should not be necessary for normal UnicodeSets because
538*0e209d39SAndroid Build Coastguard Worker  * most sets have no strings, and most sets with strings have
539*0e209d39SAndroid Build Coastguard Worker  * very few very short strings.
540*0e209d39SAndroid Build Coastguard Worker  * For cases with many strings, it might be better to use a different API
541*0e209d39SAndroid Build Coastguard Worker  * and implementation with a DFA (state machine).
542*0e209d39SAndroid Build Coastguard Worker  */
543*0e209d39SAndroid Build Coastguard Worker 
544*0e209d39SAndroid Build Coastguard Worker /*
545*0e209d39SAndroid Build Coastguard Worker  * Algorithm for span(USET_SPAN_CONTAINED)
546*0e209d39SAndroid Build Coastguard Worker  *
547*0e209d39SAndroid Build Coastguard Worker  * Theoretical algorithm:
548*0e209d39SAndroid Build Coastguard Worker  * - Iterate through the string, and at each code point boundary:
549*0e209d39SAndroid Build Coastguard Worker  *   + If the code point there is in the set, then remember to continue after it.
550*0e209d39SAndroid Build Coastguard Worker  *   + If a set string matches at the current position, then remember to continue after it.
551*0e209d39SAndroid Build Coastguard Worker  *   + Either recursively span for each code point or string match,
552*0e209d39SAndroid Build Coastguard Worker  *     or recursively span for all but the shortest one and
553*0e209d39SAndroid Build Coastguard Worker  *     iteratively continue the span with the shortest local match.
554*0e209d39SAndroid Build Coastguard Worker  *   + Remember the longest recursive span (the farthest end point).
555*0e209d39SAndroid Build Coastguard Worker  *   + If there is no match at the current position, neither for the code point there
556*0e209d39SAndroid Build Coastguard Worker  *     nor for any set string, then stop and return the longest recursive span length.
557*0e209d39SAndroid Build Coastguard Worker  *
558*0e209d39SAndroid Build Coastguard Worker  * Optimized implementation:
559*0e209d39SAndroid Build Coastguard Worker  *
560*0e209d39SAndroid Build Coastguard Worker  * (We assume that most sets will have very few very short strings.
561*0e209d39SAndroid Build Coastguard Worker  * A span using a string-less set is extremely fast.)
562*0e209d39SAndroid Build Coastguard Worker  *
563*0e209d39SAndroid Build Coastguard Worker  * Create and cache a spanSet which contains all of the single code points
564*0e209d39SAndroid Build Coastguard Worker  * of the original set but none of its strings.
565*0e209d39SAndroid Build Coastguard Worker  *
566*0e209d39SAndroid Build Coastguard Worker  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
567*0e209d39SAndroid Build Coastguard Worker  * - Loop:
568*0e209d39SAndroid Build Coastguard Worker  *   + Try to match each set string at the end of the spanLength.
569*0e209d39SAndroid Build Coastguard Worker  *     ~ Set strings that start with set-contained code points must be matched
570*0e209d39SAndroid Build Coastguard Worker  *       with a partial overlap because the recursive algorithm would have tried
571*0e209d39SAndroid Build Coastguard Worker  *       to match them at every position.
572*0e209d39SAndroid Build Coastguard Worker  *     ~ Set strings that entirely consist of set-contained code points
573*0e209d39SAndroid Build Coastguard Worker  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
574*0e209d39SAndroid Build Coastguard Worker  *       recursive algorithm would continue after them anyway
575*0e209d39SAndroid Build Coastguard Worker  *       and find the longest recursive match from their end.
576*0e209d39SAndroid Build Coastguard Worker  *     ~ Rather than recursing, note each end point of a set string match.
577*0e209d39SAndroid Build Coastguard Worker  *   + If no set string matched after spanSet.span(), then return
578*0e209d39SAndroid Build Coastguard Worker  *     with where the spanSet.span() ended.
579*0e209d39SAndroid Build Coastguard Worker  *   + If at least one set string matched after spanSet.span(), then
580*0e209d39SAndroid Build Coastguard Worker  *     pop the shortest string match end point and continue
581*0e209d39SAndroid Build Coastguard Worker  *     the loop, trying to match all set strings from there.
582*0e209d39SAndroid Build Coastguard Worker  *   + If at least one more set string matched after a previous string match,
583*0e209d39SAndroid Build Coastguard Worker  *     then test if the code point after the previous string match is also
584*0e209d39SAndroid Build Coastguard Worker  *     contained in the set.
585*0e209d39SAndroid Build Coastguard Worker  *     Continue the loop with the shortest end point of either this code point
586*0e209d39SAndroid Build Coastguard Worker  *     or a matching set string.
587*0e209d39SAndroid Build Coastguard Worker  *   + If no more set string matched after a previous string match,
588*0e209d39SAndroid Build Coastguard Worker  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
589*0e209d39SAndroid Build Coastguard Worker  *     Stop if spanLength==0, otherwise continue the loop.
590*0e209d39SAndroid Build Coastguard Worker  *
591*0e209d39SAndroid Build Coastguard Worker  * By noting each end point of a set string match,
592*0e209d39SAndroid Build Coastguard Worker  * the function visits each string position at most once and finishes
593*0e209d39SAndroid Build Coastguard Worker  * in linear time.
594*0e209d39SAndroid Build Coastguard Worker  *
595*0e209d39SAndroid Build Coastguard Worker  * The recursive algorithm may visit the same string position many times
596*0e209d39SAndroid Build Coastguard Worker  * if multiple paths lead to it and finishes in exponential time.
597*0e209d39SAndroid Build Coastguard Worker  */
598*0e209d39SAndroid Build Coastguard Worker 
599*0e209d39SAndroid Build Coastguard Worker /*
600*0e209d39SAndroid Build Coastguard Worker  * Algorithm for span(USET_SPAN_SIMPLE)
601*0e209d39SAndroid Build Coastguard Worker  *
602*0e209d39SAndroid Build Coastguard Worker  * Theoretical algorithm:
603*0e209d39SAndroid Build Coastguard Worker  * - Iterate through the string, and at each code point boundary:
604*0e209d39SAndroid Build Coastguard Worker  *   + If the code point there is in the set, then remember to continue after it.
605*0e209d39SAndroid Build Coastguard Worker  *   + If a set string matches at the current position, then remember to continue after it.
606*0e209d39SAndroid Build Coastguard Worker  *   + Continue from the farthest match position and ignore all others.
607*0e209d39SAndroid Build Coastguard Worker  *   + If there is no match at the current position,
608*0e209d39SAndroid Build Coastguard Worker  *     then stop and return the current position.
609*0e209d39SAndroid Build Coastguard Worker  *
610*0e209d39SAndroid Build Coastguard Worker  * Optimized implementation:
611*0e209d39SAndroid Build Coastguard Worker  *
612*0e209d39SAndroid Build Coastguard Worker  * (Same assumption and spanSet as above.)
613*0e209d39SAndroid Build Coastguard Worker  *
614*0e209d39SAndroid Build Coastguard Worker  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
615*0e209d39SAndroid Build Coastguard Worker  * - Loop:
616*0e209d39SAndroid Build Coastguard Worker  *   + Try to match each set string at the end of the spanLength.
617*0e209d39SAndroid Build Coastguard Worker  *     ~ Set strings that start with set-contained code points must be matched
618*0e209d39SAndroid Build Coastguard Worker  *       with a partial overlap because the standard algorithm would have tried
619*0e209d39SAndroid Build Coastguard Worker  *       to match them earlier.
620*0e209d39SAndroid Build Coastguard Worker  *     ~ Set strings that entirely consist of set-contained code points
621*0e209d39SAndroid Build Coastguard Worker  *       must be matched with a full overlap because the longest-match algorithm
622*0e209d39SAndroid Build Coastguard Worker  *       would hide set string matches that end earlier.
623*0e209d39SAndroid Build Coastguard Worker  *       Such set strings need not be matched earlier inside the code point span
624*0e209d39SAndroid Build Coastguard Worker  *       because the standard algorithm would then have continued after
625*0e209d39SAndroid Build Coastguard Worker  *       the set string match anyway.
626*0e209d39SAndroid Build Coastguard Worker  *     ~ Remember the longest set string match (farthest end point) from the earliest
627*0e209d39SAndroid Build Coastguard Worker  *       starting point.
628*0e209d39SAndroid Build Coastguard Worker  *   + If no set string matched after spanSet.span(), then return
629*0e209d39SAndroid Build Coastguard Worker  *     with where the spanSet.span() ended.
630*0e209d39SAndroid Build Coastguard Worker  *   + If at least one set string matched, then continue the loop after the
631*0e209d39SAndroid Build Coastguard Worker  *     longest match from the earliest position.
632*0e209d39SAndroid Build Coastguard Worker  *   + If no more set string matched after a previous string match,
633*0e209d39SAndroid Build Coastguard Worker  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
634*0e209d39SAndroid Build Coastguard Worker  *     Stop if spanLength==0, otherwise continue the loop.
635*0e209d39SAndroid Build Coastguard Worker  */
636*0e209d39SAndroid Build Coastguard Worker 
span(const char16_t * s,int32_t length,USetSpanCondition spanCondition) const637*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
638*0e209d39SAndroid Build Coastguard Worker     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
639*0e209d39SAndroid Build Coastguard Worker         return spanNot(s, length);
640*0e209d39SAndroid Build Coastguard Worker     }
641*0e209d39SAndroid Build Coastguard Worker     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
642*0e209d39SAndroid Build Coastguard Worker     if(spanLength==length) {
643*0e209d39SAndroid Build Coastguard Worker         return length;
644*0e209d39SAndroid Build Coastguard Worker     }
645*0e209d39SAndroid Build Coastguard Worker 
646*0e209d39SAndroid Build Coastguard Worker     // Consider strings; they may overlap with the span.
647*0e209d39SAndroid Build Coastguard Worker     OffsetList offsets;
648*0e209d39SAndroid Build Coastguard Worker     if(spanCondition==USET_SPAN_CONTAINED) {
649*0e209d39SAndroid Build Coastguard Worker         // Use offset list to try all possibilities.
650*0e209d39SAndroid Build Coastguard Worker         offsets.setMaxLength(maxLength16);
651*0e209d39SAndroid Build Coastguard Worker     }
652*0e209d39SAndroid Build Coastguard Worker     int32_t pos=spanLength, rest=length-pos;
653*0e209d39SAndroid Build Coastguard Worker     int32_t i, stringsLength=strings.size();
654*0e209d39SAndroid Build Coastguard Worker     for(;;) {
655*0e209d39SAndroid Build Coastguard Worker         if(spanCondition==USET_SPAN_CONTAINED) {
656*0e209d39SAndroid Build Coastguard Worker             for(i=0; i<stringsLength; ++i) {
657*0e209d39SAndroid Build Coastguard Worker                 int32_t overlap=spanLengths[i];
658*0e209d39SAndroid Build Coastguard Worker                 if(overlap==ALL_CP_CONTAINED) {
659*0e209d39SAndroid Build Coastguard Worker                     continue;  // Irrelevant string. (Also the empty string.)
660*0e209d39SAndroid Build Coastguard Worker                 }
661*0e209d39SAndroid Build Coastguard Worker                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
662*0e209d39SAndroid Build Coastguard Worker                 const char16_t *s16=string.getBuffer();
663*0e209d39SAndroid Build Coastguard Worker                 int32_t length16=string.length();
664*0e209d39SAndroid Build Coastguard Worker                 U_ASSERT(length>0);
665*0e209d39SAndroid Build Coastguard Worker 
666*0e209d39SAndroid Build Coastguard Worker                 // Try to match this string at pos-overlap..pos.
667*0e209d39SAndroid Build Coastguard Worker                 if(overlap>=LONG_SPAN) {
668*0e209d39SAndroid Build Coastguard Worker                     overlap=length16;
669*0e209d39SAndroid Build Coastguard Worker                     // While contained: No point matching fully inside the code point span.
670*0e209d39SAndroid Build Coastguard Worker                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
671*0e209d39SAndroid Build Coastguard Worker                 }
672*0e209d39SAndroid Build Coastguard Worker                 if(overlap>spanLength) {
673*0e209d39SAndroid Build Coastguard Worker                     overlap=spanLength;
674*0e209d39SAndroid Build Coastguard Worker                 }
675*0e209d39SAndroid Build Coastguard Worker                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
676*0e209d39SAndroid Build Coastguard Worker                 for(;;) {
677*0e209d39SAndroid Build Coastguard Worker                     if(inc>rest) {
678*0e209d39SAndroid Build Coastguard Worker                         break;
679*0e209d39SAndroid Build Coastguard Worker                     }
680*0e209d39SAndroid Build Coastguard Worker                     // Try to match if the increment is not listed already.
681*0e209d39SAndroid Build Coastguard Worker                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
682*0e209d39SAndroid Build Coastguard Worker                         if(inc==rest) {
683*0e209d39SAndroid Build Coastguard Worker                             return length;  // Reached the end of the string.
684*0e209d39SAndroid Build Coastguard Worker                         }
685*0e209d39SAndroid Build Coastguard Worker                         offsets.addOffset(inc);
686*0e209d39SAndroid Build Coastguard Worker                     }
687*0e209d39SAndroid Build Coastguard Worker                     if(overlap==0) {
688*0e209d39SAndroid Build Coastguard Worker                         break;
689*0e209d39SAndroid Build Coastguard Worker                     }
690*0e209d39SAndroid Build Coastguard Worker                     --overlap;
691*0e209d39SAndroid Build Coastguard Worker                     ++inc;
692*0e209d39SAndroid Build Coastguard Worker                 }
693*0e209d39SAndroid Build Coastguard Worker             }
694*0e209d39SAndroid Build Coastguard Worker         } else /* USET_SPAN_SIMPLE */ {
695*0e209d39SAndroid Build Coastguard Worker             int32_t maxInc=0, maxOverlap=0;
696*0e209d39SAndroid Build Coastguard Worker             for(i=0; i<stringsLength; ++i) {
697*0e209d39SAndroid Build Coastguard Worker                 int32_t overlap=spanLengths[i];
698*0e209d39SAndroid Build Coastguard Worker                 // For longest match, we do need to try to match even an all-contained string
699*0e209d39SAndroid Build Coastguard Worker                 // to find the match from the earliest start.
700*0e209d39SAndroid Build Coastguard Worker 
701*0e209d39SAndroid Build Coastguard Worker                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
702*0e209d39SAndroid Build Coastguard Worker                 const char16_t *s16=string.getBuffer();
703*0e209d39SAndroid Build Coastguard Worker                 int32_t length16=string.length();
704*0e209d39SAndroid Build Coastguard Worker                 if (length16==0) {
705*0e209d39SAndroid Build Coastguard Worker                     continue;  // skip the empty string
706*0e209d39SAndroid Build Coastguard Worker                 }
707*0e209d39SAndroid Build Coastguard Worker 
708*0e209d39SAndroid Build Coastguard Worker                 // Try to match this string at pos-overlap..pos.
709*0e209d39SAndroid Build Coastguard Worker                 if(overlap>=LONG_SPAN) {
710*0e209d39SAndroid Build Coastguard Worker                     overlap=length16;
711*0e209d39SAndroid Build Coastguard Worker                     // Longest match: Need to match fully inside the code point span
712*0e209d39SAndroid Build Coastguard Worker                     // to find the match from the earliest start.
713*0e209d39SAndroid Build Coastguard Worker                 }
714*0e209d39SAndroid Build Coastguard Worker                 if(overlap>spanLength) {
715*0e209d39SAndroid Build Coastguard Worker                     overlap=spanLength;
716*0e209d39SAndroid Build Coastguard Worker                 }
717*0e209d39SAndroid Build Coastguard Worker                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
718*0e209d39SAndroid Build Coastguard Worker                 for(;;) {
719*0e209d39SAndroid Build Coastguard Worker                     if(inc>rest || overlap<maxOverlap) {
720*0e209d39SAndroid Build Coastguard Worker                         break;
721*0e209d39SAndroid Build Coastguard Worker                     }
722*0e209d39SAndroid Build Coastguard Worker                     // Try to match if the string is longer or starts earlier.
723*0e209d39SAndroid Build Coastguard Worker                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
724*0e209d39SAndroid Build Coastguard Worker                         matches16CPB(s, pos-overlap, length, s16, length16)
725*0e209d39SAndroid Build Coastguard Worker                     ) {
726*0e209d39SAndroid Build Coastguard Worker                         maxInc=inc;  // Longest match from earliest start.
727*0e209d39SAndroid Build Coastguard Worker                         maxOverlap=overlap;
728*0e209d39SAndroid Build Coastguard Worker                         break;
729*0e209d39SAndroid Build Coastguard Worker                     }
730*0e209d39SAndroid Build Coastguard Worker                     --overlap;
731*0e209d39SAndroid Build Coastguard Worker                     ++inc;
732*0e209d39SAndroid Build Coastguard Worker                 }
733*0e209d39SAndroid Build Coastguard Worker             }
734*0e209d39SAndroid Build Coastguard Worker 
735*0e209d39SAndroid Build Coastguard Worker             if(maxInc!=0 || maxOverlap!=0) {
736*0e209d39SAndroid Build Coastguard Worker                 // Longest-match algorithm, and there was a string match.
737*0e209d39SAndroid Build Coastguard Worker                 // Simply continue after it.
738*0e209d39SAndroid Build Coastguard Worker                 pos+=maxInc;
739*0e209d39SAndroid Build Coastguard Worker                 rest-=maxInc;
740*0e209d39SAndroid Build Coastguard Worker                 if(rest==0) {
741*0e209d39SAndroid Build Coastguard Worker                     return length;  // Reached the end of the string.
742*0e209d39SAndroid Build Coastguard Worker                 }
743*0e209d39SAndroid Build Coastguard Worker                 spanLength=0;  // Match strings from after a string match.
744*0e209d39SAndroid Build Coastguard Worker                 continue;
745*0e209d39SAndroid Build Coastguard Worker             }
746*0e209d39SAndroid Build Coastguard Worker         }
747*0e209d39SAndroid Build Coastguard Worker         // Finished trying to match all strings at pos.
748*0e209d39SAndroid Build Coastguard Worker 
749*0e209d39SAndroid Build Coastguard Worker         if(spanLength!=0 || pos==0) {
750*0e209d39SAndroid Build Coastguard Worker             // The position is after an unlimited code point span (spanLength!=0),
751*0e209d39SAndroid Build Coastguard Worker             // not after a string match.
752*0e209d39SAndroid Build Coastguard Worker             // The only position where spanLength==0 after a span is pos==0.
753*0e209d39SAndroid Build Coastguard Worker             // Otherwise, an unlimited code point span is only tried again when no
754*0e209d39SAndroid Build Coastguard Worker             // strings match, and if such a non-initial span fails we stop.
755*0e209d39SAndroid Build Coastguard Worker             if(offsets.isEmpty()) {
756*0e209d39SAndroid Build Coastguard Worker                 return pos;  // No strings matched after a span.
757*0e209d39SAndroid Build Coastguard Worker             }
758*0e209d39SAndroid Build Coastguard Worker             // Match strings from after the next string match.
759*0e209d39SAndroid Build Coastguard Worker         } else {
760*0e209d39SAndroid Build Coastguard Worker             // The position is after a string match (or a single code point).
761*0e209d39SAndroid Build Coastguard Worker             if(offsets.isEmpty()) {
762*0e209d39SAndroid Build Coastguard Worker                 // No more strings matched after a previous string match.
763*0e209d39SAndroid Build Coastguard Worker                 // Try another code point span from after the last string match.
764*0e209d39SAndroid Build Coastguard Worker                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
765*0e209d39SAndroid Build Coastguard Worker                 if( spanLength==rest || // Reached the end of the string, or
766*0e209d39SAndroid Build Coastguard Worker                     spanLength==0       // neither strings nor span progressed.
767*0e209d39SAndroid Build Coastguard Worker                 ) {
768*0e209d39SAndroid Build Coastguard Worker                     return pos+spanLength;
769*0e209d39SAndroid Build Coastguard Worker                 }
770*0e209d39SAndroid Build Coastguard Worker                 pos+=spanLength;
771*0e209d39SAndroid Build Coastguard Worker                 rest-=spanLength;
772*0e209d39SAndroid Build Coastguard Worker                 continue;  // spanLength>0: Match strings from after a span.
773*0e209d39SAndroid Build Coastguard Worker             } else {
774*0e209d39SAndroid Build Coastguard Worker                 // Try to match only one code point from after a string match if some
775*0e209d39SAndroid Build Coastguard Worker                 // string matched beyond it, so that we try all possible positions
776*0e209d39SAndroid Build Coastguard Worker                 // and don't overshoot.
777*0e209d39SAndroid Build Coastguard Worker                 spanLength=spanOne(spanSet, s+pos, rest);
778*0e209d39SAndroid Build Coastguard Worker                 if(spanLength>0) {
779*0e209d39SAndroid Build Coastguard Worker                     if(spanLength==rest) {
780*0e209d39SAndroid Build Coastguard Worker                         return length;  // Reached the end of the string.
781*0e209d39SAndroid Build Coastguard Worker                     }
782*0e209d39SAndroid Build Coastguard Worker                     // Match strings after this code point.
783*0e209d39SAndroid Build Coastguard Worker                     // There cannot be any increments below it because UnicodeSet strings
784*0e209d39SAndroid Build Coastguard Worker                     // contain multiple code points.
785*0e209d39SAndroid Build Coastguard Worker                     pos+=spanLength;
786*0e209d39SAndroid Build Coastguard Worker                     rest-=spanLength;
787*0e209d39SAndroid Build Coastguard Worker                     offsets.shift(spanLength);
788*0e209d39SAndroid Build Coastguard Worker                     spanLength=0;
789*0e209d39SAndroid Build Coastguard Worker                     continue;  // Match strings from after a single code point.
790*0e209d39SAndroid Build Coastguard Worker                 }
791*0e209d39SAndroid Build Coastguard Worker                 // Match strings from after the next string match.
792*0e209d39SAndroid Build Coastguard Worker             }
793*0e209d39SAndroid Build Coastguard Worker         }
794*0e209d39SAndroid Build Coastguard Worker         int32_t minOffset=offsets.popMinimum();
795*0e209d39SAndroid Build Coastguard Worker         pos+=minOffset;
796*0e209d39SAndroid Build Coastguard Worker         rest-=minOffset;
797*0e209d39SAndroid Build Coastguard Worker         spanLength=0;  // Match strings from after a string match.
798*0e209d39SAndroid Build Coastguard Worker     }
799*0e209d39SAndroid Build Coastguard Worker }
800*0e209d39SAndroid Build Coastguard Worker 
spanBack(const char16_t * s,int32_t length,USetSpanCondition spanCondition) const801*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
802*0e209d39SAndroid Build Coastguard Worker     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
803*0e209d39SAndroid Build Coastguard Worker         return spanNotBack(s, length);
804*0e209d39SAndroid Build Coastguard Worker     }
805*0e209d39SAndroid Build Coastguard Worker     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
806*0e209d39SAndroid Build Coastguard Worker     if(pos==0) {
807*0e209d39SAndroid Build Coastguard Worker         return 0;
808*0e209d39SAndroid Build Coastguard Worker     }
809*0e209d39SAndroid Build Coastguard Worker     int32_t spanLength=length-pos;
810*0e209d39SAndroid Build Coastguard Worker 
811*0e209d39SAndroid Build Coastguard Worker     // Consider strings; they may overlap with the span.
812*0e209d39SAndroid Build Coastguard Worker     OffsetList offsets;
813*0e209d39SAndroid Build Coastguard Worker     if(spanCondition==USET_SPAN_CONTAINED) {
814*0e209d39SAndroid Build Coastguard Worker         // Use offset list to try all possibilities.
815*0e209d39SAndroid Build Coastguard Worker         offsets.setMaxLength(maxLength16);
816*0e209d39SAndroid Build Coastguard Worker     }
817*0e209d39SAndroid Build Coastguard Worker     int32_t i, stringsLength=strings.size();
818*0e209d39SAndroid Build Coastguard Worker     uint8_t *spanBackLengths=spanLengths;
819*0e209d39SAndroid Build Coastguard Worker     if(all) {
820*0e209d39SAndroid Build Coastguard Worker         spanBackLengths+=stringsLength;
821*0e209d39SAndroid Build Coastguard Worker     }
822*0e209d39SAndroid Build Coastguard Worker     for(;;) {
823*0e209d39SAndroid Build Coastguard Worker         if(spanCondition==USET_SPAN_CONTAINED) {
824*0e209d39SAndroid Build Coastguard Worker             for(i=0; i<stringsLength; ++i) {
825*0e209d39SAndroid Build Coastguard Worker                 int32_t overlap=spanBackLengths[i];
826*0e209d39SAndroid Build Coastguard Worker                 if(overlap==ALL_CP_CONTAINED) {
827*0e209d39SAndroid Build Coastguard Worker                     continue;  // Irrelevant string. (Also the empty string.)
828*0e209d39SAndroid Build Coastguard Worker                 }
829*0e209d39SAndroid Build Coastguard Worker                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
830*0e209d39SAndroid Build Coastguard Worker                 const char16_t *s16=string.getBuffer();
831*0e209d39SAndroid Build Coastguard Worker                 int32_t length16=string.length();
832*0e209d39SAndroid Build Coastguard Worker                 U_ASSERT(length>0);
833*0e209d39SAndroid Build Coastguard Worker 
834*0e209d39SAndroid Build Coastguard Worker                 // Try to match this string at pos-(length16-overlap)..pos-length16.
835*0e209d39SAndroid Build Coastguard Worker                 if(overlap>=LONG_SPAN) {
836*0e209d39SAndroid Build Coastguard Worker                     overlap=length16;
837*0e209d39SAndroid Build Coastguard Worker                     // While contained: No point matching fully inside the code point span.
838*0e209d39SAndroid Build Coastguard Worker                     int32_t len1=0;
839*0e209d39SAndroid Build Coastguard Worker                     U16_FWD_1(s16, len1, overlap);
840*0e209d39SAndroid Build Coastguard Worker                     overlap-=len1;  // Length of the string minus the first code point.
841*0e209d39SAndroid Build Coastguard Worker                 }
842*0e209d39SAndroid Build Coastguard Worker                 if(overlap>spanLength) {
843*0e209d39SAndroid Build Coastguard Worker                     overlap=spanLength;
844*0e209d39SAndroid Build Coastguard Worker                 }
845*0e209d39SAndroid Build Coastguard Worker                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
846*0e209d39SAndroid Build Coastguard Worker                 for(;;) {
847*0e209d39SAndroid Build Coastguard Worker                     if(dec>pos) {
848*0e209d39SAndroid Build Coastguard Worker                         break;
849*0e209d39SAndroid Build Coastguard Worker                     }
850*0e209d39SAndroid Build Coastguard Worker                     // Try to match if the decrement is not listed already.
851*0e209d39SAndroid Build Coastguard Worker                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
852*0e209d39SAndroid Build Coastguard Worker                         if(dec==pos) {
853*0e209d39SAndroid Build Coastguard Worker                             return 0;  // Reached the start of the string.
854*0e209d39SAndroid Build Coastguard Worker                         }
855*0e209d39SAndroid Build Coastguard Worker                         offsets.addOffset(dec);
856*0e209d39SAndroid Build Coastguard Worker                     }
857*0e209d39SAndroid Build Coastguard Worker                     if(overlap==0) {
858*0e209d39SAndroid Build Coastguard Worker                         break;
859*0e209d39SAndroid Build Coastguard Worker                     }
860*0e209d39SAndroid Build Coastguard Worker                     --overlap;
861*0e209d39SAndroid Build Coastguard Worker                     ++dec;
862*0e209d39SAndroid Build Coastguard Worker                 }
863*0e209d39SAndroid Build Coastguard Worker             }
864*0e209d39SAndroid Build Coastguard Worker         } else /* USET_SPAN_SIMPLE */ {
865*0e209d39SAndroid Build Coastguard Worker             int32_t maxDec=0, maxOverlap=0;
866*0e209d39SAndroid Build Coastguard Worker             for(i=0; i<stringsLength; ++i) {
867*0e209d39SAndroid Build Coastguard Worker                 int32_t overlap=spanBackLengths[i];
868*0e209d39SAndroid Build Coastguard Worker                 // For longest match, we do need to try to match even an all-contained string
869*0e209d39SAndroid Build Coastguard Worker                 // to find the match from the latest end.
870*0e209d39SAndroid Build Coastguard Worker 
871*0e209d39SAndroid Build Coastguard Worker                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
872*0e209d39SAndroid Build Coastguard Worker                 const char16_t *s16=string.getBuffer();
873*0e209d39SAndroid Build Coastguard Worker                 int32_t length16=string.length();
874*0e209d39SAndroid Build Coastguard Worker                 if (length16==0) {
875*0e209d39SAndroid Build Coastguard Worker                     continue;  // skip the empty string
876*0e209d39SAndroid Build Coastguard Worker                 }
877*0e209d39SAndroid Build Coastguard Worker 
878*0e209d39SAndroid Build Coastguard Worker                 // Try to match this string at pos-(length16-overlap)..pos-length16.
879*0e209d39SAndroid Build Coastguard Worker                 if(overlap>=LONG_SPAN) {
880*0e209d39SAndroid Build Coastguard Worker                     overlap=length16;
881*0e209d39SAndroid Build Coastguard Worker                     // Longest match: Need to match fully inside the code point span
882*0e209d39SAndroid Build Coastguard Worker                     // to find the match from the latest end.
883*0e209d39SAndroid Build Coastguard Worker                 }
884*0e209d39SAndroid Build Coastguard Worker                 if(overlap>spanLength) {
885*0e209d39SAndroid Build Coastguard Worker                     overlap=spanLength;
886*0e209d39SAndroid Build Coastguard Worker                 }
887*0e209d39SAndroid Build Coastguard Worker                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
888*0e209d39SAndroid Build Coastguard Worker                 for(;;) {
889*0e209d39SAndroid Build Coastguard Worker                     if(dec>pos || overlap<maxOverlap) {
890*0e209d39SAndroid Build Coastguard Worker                         break;
891*0e209d39SAndroid Build Coastguard Worker                     }
892*0e209d39SAndroid Build Coastguard Worker                     // Try to match if the string is longer or ends later.
893*0e209d39SAndroid Build Coastguard Worker                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
894*0e209d39SAndroid Build Coastguard Worker                         matches16CPB(s, pos-dec, length, s16, length16)
895*0e209d39SAndroid Build Coastguard Worker                     ) {
896*0e209d39SAndroid Build Coastguard Worker                         maxDec=dec;  // Longest match from latest end.
897*0e209d39SAndroid Build Coastguard Worker                         maxOverlap=overlap;
898*0e209d39SAndroid Build Coastguard Worker                         break;
899*0e209d39SAndroid Build Coastguard Worker                     }
900*0e209d39SAndroid Build Coastguard Worker                     --overlap;
901*0e209d39SAndroid Build Coastguard Worker                     ++dec;
902*0e209d39SAndroid Build Coastguard Worker                 }
903*0e209d39SAndroid Build Coastguard Worker             }
904*0e209d39SAndroid Build Coastguard Worker 
905*0e209d39SAndroid Build Coastguard Worker             if(maxDec!=0 || maxOverlap!=0) {
906*0e209d39SAndroid Build Coastguard Worker                 // Longest-match algorithm, and there was a string match.
907*0e209d39SAndroid Build Coastguard Worker                 // Simply continue before it.
908*0e209d39SAndroid Build Coastguard Worker                 pos-=maxDec;
909*0e209d39SAndroid Build Coastguard Worker                 if(pos==0) {
910*0e209d39SAndroid Build Coastguard Worker                     return 0;  // Reached the start of the string.
911*0e209d39SAndroid Build Coastguard Worker                 }
912*0e209d39SAndroid Build Coastguard Worker                 spanLength=0;  // Match strings from before a string match.
913*0e209d39SAndroid Build Coastguard Worker                 continue;
914*0e209d39SAndroid Build Coastguard Worker             }
915*0e209d39SAndroid Build Coastguard Worker         }
916*0e209d39SAndroid Build Coastguard Worker         // Finished trying to match all strings at pos.
917*0e209d39SAndroid Build Coastguard Worker 
918*0e209d39SAndroid Build Coastguard Worker         if(spanLength!=0 || pos==length) {
919*0e209d39SAndroid Build Coastguard Worker             // The position is before an unlimited code point span (spanLength!=0),
920*0e209d39SAndroid Build Coastguard Worker             // not before a string match.
921*0e209d39SAndroid Build Coastguard Worker             // The only position where spanLength==0 before a span is pos==length.
922*0e209d39SAndroid Build Coastguard Worker             // Otherwise, an unlimited code point span is only tried again when no
923*0e209d39SAndroid Build Coastguard Worker             // strings match, and if such a non-initial span fails we stop.
924*0e209d39SAndroid Build Coastguard Worker             if(offsets.isEmpty()) {
925*0e209d39SAndroid Build Coastguard Worker                 return pos;  // No strings matched before a span.
926*0e209d39SAndroid Build Coastguard Worker             }
927*0e209d39SAndroid Build Coastguard Worker             // Match strings from before the next string match.
928*0e209d39SAndroid Build Coastguard Worker         } else {
929*0e209d39SAndroid Build Coastguard Worker             // The position is before a string match (or a single code point).
930*0e209d39SAndroid Build Coastguard Worker             if(offsets.isEmpty()) {
931*0e209d39SAndroid Build Coastguard Worker                 // No more strings matched before a previous string match.
932*0e209d39SAndroid Build Coastguard Worker                 // Try another code point span from before the last string match.
933*0e209d39SAndroid Build Coastguard Worker                 int32_t oldPos=pos;
934*0e209d39SAndroid Build Coastguard Worker                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
935*0e209d39SAndroid Build Coastguard Worker                 spanLength=oldPos-pos;
936*0e209d39SAndroid Build Coastguard Worker                 if( pos==0 ||           // Reached the start of the string, or
937*0e209d39SAndroid Build Coastguard Worker                     spanLength==0       // neither strings nor span progressed.
938*0e209d39SAndroid Build Coastguard Worker                 ) {
939*0e209d39SAndroid Build Coastguard Worker                     return pos;
940*0e209d39SAndroid Build Coastguard Worker                 }
941*0e209d39SAndroid Build Coastguard Worker                 continue;  // spanLength>0: Match strings from before a span.
942*0e209d39SAndroid Build Coastguard Worker             } else {
943*0e209d39SAndroid Build Coastguard Worker                 // Try to match only one code point from before a string match if some
944*0e209d39SAndroid Build Coastguard Worker                 // string matched beyond it, so that we try all possible positions
945*0e209d39SAndroid Build Coastguard Worker                 // and don't overshoot.
946*0e209d39SAndroid Build Coastguard Worker                 spanLength=spanOneBack(spanSet, s, pos);
947*0e209d39SAndroid Build Coastguard Worker                 if(spanLength>0) {
948*0e209d39SAndroid Build Coastguard Worker                     if(spanLength==pos) {
949*0e209d39SAndroid Build Coastguard Worker                         return 0;  // Reached the start of the string.
950*0e209d39SAndroid Build Coastguard Worker                     }
951*0e209d39SAndroid Build Coastguard Worker                     // Match strings before this code point.
952*0e209d39SAndroid Build Coastguard Worker                     // There cannot be any decrements below it because UnicodeSet strings
953*0e209d39SAndroid Build Coastguard Worker                     // contain multiple code points.
954*0e209d39SAndroid Build Coastguard Worker                     pos-=spanLength;
955*0e209d39SAndroid Build Coastguard Worker                     offsets.shift(spanLength);
956*0e209d39SAndroid Build Coastguard Worker                     spanLength=0;
957*0e209d39SAndroid Build Coastguard Worker                     continue;  // Match strings from before a single code point.
958*0e209d39SAndroid Build Coastguard Worker                 }
959*0e209d39SAndroid Build Coastguard Worker                 // Match strings from before the next string match.
960*0e209d39SAndroid Build Coastguard Worker             }
961*0e209d39SAndroid Build Coastguard Worker         }
962*0e209d39SAndroid Build Coastguard Worker         pos-=offsets.popMinimum();
963*0e209d39SAndroid Build Coastguard Worker         spanLength=0;  // Match strings from before a string match.
964*0e209d39SAndroid Build Coastguard Worker     }
965*0e209d39SAndroid Build Coastguard Worker }
966*0e209d39SAndroid Build Coastguard Worker 
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const967*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
968*0e209d39SAndroid Build Coastguard Worker     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
969*0e209d39SAndroid Build Coastguard Worker         return spanNotUTF8(s, length);
970*0e209d39SAndroid Build Coastguard Worker     }
971*0e209d39SAndroid Build Coastguard Worker     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
972*0e209d39SAndroid Build Coastguard Worker     if(spanLength==length) {
973*0e209d39SAndroid Build Coastguard Worker         return length;
974*0e209d39SAndroid Build Coastguard Worker     }
975*0e209d39SAndroid Build Coastguard Worker 
976*0e209d39SAndroid Build Coastguard Worker     // Consider strings; they may overlap with the span.
977*0e209d39SAndroid Build Coastguard Worker     OffsetList offsets;
978*0e209d39SAndroid Build Coastguard Worker     if(spanCondition==USET_SPAN_CONTAINED) {
979*0e209d39SAndroid Build Coastguard Worker         // Use offset list to try all possibilities.
980*0e209d39SAndroid Build Coastguard Worker         offsets.setMaxLength(maxLength8);
981*0e209d39SAndroid Build Coastguard Worker     }
982*0e209d39SAndroid Build Coastguard Worker     int32_t pos=spanLength, rest=length-pos;
983*0e209d39SAndroid Build Coastguard Worker     int32_t i, stringsLength=strings.size();
984*0e209d39SAndroid Build Coastguard Worker     uint8_t *spanUTF8Lengths=spanLengths;
985*0e209d39SAndroid Build Coastguard Worker     if(all) {
986*0e209d39SAndroid Build Coastguard Worker         spanUTF8Lengths+=2*stringsLength;
987*0e209d39SAndroid Build Coastguard Worker     }
988*0e209d39SAndroid Build Coastguard Worker     for(;;) {
989*0e209d39SAndroid Build Coastguard Worker         const uint8_t *s8=utf8;
990*0e209d39SAndroid Build Coastguard Worker         int32_t length8;
991*0e209d39SAndroid Build Coastguard Worker         if(spanCondition==USET_SPAN_CONTAINED) {
992*0e209d39SAndroid Build Coastguard Worker             for(i=0; i<stringsLength; ++i) {
993*0e209d39SAndroid Build Coastguard Worker                 length8=utf8Lengths[i];
994*0e209d39SAndroid Build Coastguard Worker                 if(length8==0) {
995*0e209d39SAndroid Build Coastguard Worker                     continue;  // String not representable in UTF-8.
996*0e209d39SAndroid Build Coastguard Worker                 }
997*0e209d39SAndroid Build Coastguard Worker                 int32_t overlap=spanUTF8Lengths[i];
998*0e209d39SAndroid Build Coastguard Worker                 if(overlap==ALL_CP_CONTAINED) {
999*0e209d39SAndroid Build Coastguard Worker                     s8+=length8;
1000*0e209d39SAndroid Build Coastguard Worker                     continue;  // Irrelevant string.
1001*0e209d39SAndroid Build Coastguard Worker                 }
1002*0e209d39SAndroid Build Coastguard Worker 
1003*0e209d39SAndroid Build Coastguard Worker                 // Try to match this string at pos-overlap..pos.
1004*0e209d39SAndroid Build Coastguard Worker                 if(overlap>=LONG_SPAN) {
1005*0e209d39SAndroid Build Coastguard Worker                     overlap=length8;
1006*0e209d39SAndroid Build Coastguard Worker                     // While contained: No point matching fully inside the code point span.
1007*0e209d39SAndroid Build Coastguard Worker                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
1008*0e209d39SAndroid Build Coastguard Worker                 }
1009*0e209d39SAndroid Build Coastguard Worker                 if(overlap>spanLength) {
1010*0e209d39SAndroid Build Coastguard Worker                     overlap=spanLength;
1011*0e209d39SAndroid Build Coastguard Worker                 }
1012*0e209d39SAndroid Build Coastguard Worker                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1013*0e209d39SAndroid Build Coastguard Worker                 for(;;) {
1014*0e209d39SAndroid Build Coastguard Worker                     if(inc>rest) {
1015*0e209d39SAndroid Build Coastguard Worker                         break;
1016*0e209d39SAndroid Build Coastguard Worker                     }
1017*0e209d39SAndroid Build Coastguard Worker                     // Try to match if the increment is not listed already.
1018*0e209d39SAndroid Build Coastguard Worker                     // Match at code point boundaries. (The UTF-8 strings were converted
1019*0e209d39SAndroid Build Coastguard Worker                     // from UTF-16 and are guaranteed to be well-formed.)
1020*0e209d39SAndroid Build Coastguard Worker                     if(!U8_IS_TRAIL(s[pos-overlap]) &&
1021*0e209d39SAndroid Build Coastguard Worker                             !offsets.containsOffset(inc) &&
1022*0e209d39SAndroid Build Coastguard Worker                             matches8(s+pos-overlap, s8, length8)) {
1023*0e209d39SAndroid Build Coastguard Worker                         if(inc==rest) {
1024*0e209d39SAndroid Build Coastguard Worker                             return length;  // Reached the end of the string.
1025*0e209d39SAndroid Build Coastguard Worker                         }
1026*0e209d39SAndroid Build Coastguard Worker                         offsets.addOffset(inc);
1027*0e209d39SAndroid Build Coastguard Worker                     }
1028*0e209d39SAndroid Build Coastguard Worker                     if(overlap==0) {
1029*0e209d39SAndroid Build Coastguard Worker                         break;
1030*0e209d39SAndroid Build Coastguard Worker                     }
1031*0e209d39SAndroid Build Coastguard Worker                     --overlap;
1032*0e209d39SAndroid Build Coastguard Worker                     ++inc;
1033*0e209d39SAndroid Build Coastguard Worker                 }
1034*0e209d39SAndroid Build Coastguard Worker                 s8+=length8;
1035*0e209d39SAndroid Build Coastguard Worker             }
1036*0e209d39SAndroid Build Coastguard Worker         } else /* USET_SPAN_SIMPLE */ {
1037*0e209d39SAndroid Build Coastguard Worker             int32_t maxInc=0, maxOverlap=0;
1038*0e209d39SAndroid Build Coastguard Worker             for(i=0; i<stringsLength; ++i) {
1039*0e209d39SAndroid Build Coastguard Worker                 length8=utf8Lengths[i];
1040*0e209d39SAndroid Build Coastguard Worker                 if(length8==0) {
1041*0e209d39SAndroid Build Coastguard Worker                     continue;  // String not representable in UTF-8.
1042*0e209d39SAndroid Build Coastguard Worker                 }
1043*0e209d39SAndroid Build Coastguard Worker                 int32_t overlap=spanUTF8Lengths[i];
1044*0e209d39SAndroid Build Coastguard Worker                 // For longest match, we do need to try to match even an all-contained string
1045*0e209d39SAndroid Build Coastguard Worker                 // to find the match from the earliest start.
1046*0e209d39SAndroid Build Coastguard Worker 
1047*0e209d39SAndroid Build Coastguard Worker                 // Try to match this string at pos-overlap..pos.
1048*0e209d39SAndroid Build Coastguard Worker                 if(overlap>=LONG_SPAN) {
1049*0e209d39SAndroid Build Coastguard Worker                     overlap=length8;
1050*0e209d39SAndroid Build Coastguard Worker                     // Longest match: Need to match fully inside the code point span
1051*0e209d39SAndroid Build Coastguard Worker                     // to find the match from the earliest start.
1052*0e209d39SAndroid Build Coastguard Worker                 }
1053*0e209d39SAndroid Build Coastguard Worker                 if(overlap>spanLength) {
1054*0e209d39SAndroid Build Coastguard Worker                     overlap=spanLength;
1055*0e209d39SAndroid Build Coastguard Worker                 }
1056*0e209d39SAndroid Build Coastguard Worker                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1057*0e209d39SAndroid Build Coastguard Worker                 for(;;) {
1058*0e209d39SAndroid Build Coastguard Worker                     if(inc>rest || overlap<maxOverlap) {
1059*0e209d39SAndroid Build Coastguard Worker                         break;
1060*0e209d39SAndroid Build Coastguard Worker                     }
1061*0e209d39SAndroid Build Coastguard Worker                     // Try to match if the string is longer or starts earlier.
1062*0e209d39SAndroid Build Coastguard Worker                     // Match at code point boundaries. (The UTF-8 strings were converted
1063*0e209d39SAndroid Build Coastguard Worker                     // from UTF-16 and are guaranteed to be well-formed.)
1064*0e209d39SAndroid Build Coastguard Worker                     if(!U8_IS_TRAIL(s[pos-overlap]) &&
1065*0e209d39SAndroid Build Coastguard Worker                             (overlap>maxOverlap ||
1066*0e209d39SAndroid Build Coastguard Worker                                 /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1067*0e209d39SAndroid Build Coastguard Worker                             matches8(s+pos-overlap, s8, length8)) {
1068*0e209d39SAndroid Build Coastguard Worker                         maxInc=inc;  // Longest match from earliest start.
1069*0e209d39SAndroid Build Coastguard Worker                         maxOverlap=overlap;
1070*0e209d39SAndroid Build Coastguard Worker                         break;
1071*0e209d39SAndroid Build Coastguard Worker                     }
1072*0e209d39SAndroid Build Coastguard Worker                     --overlap;
1073*0e209d39SAndroid Build Coastguard Worker                     ++inc;
1074*0e209d39SAndroid Build Coastguard Worker                 }
1075*0e209d39SAndroid Build Coastguard Worker                 s8+=length8;
1076*0e209d39SAndroid Build Coastguard Worker             }
1077*0e209d39SAndroid Build Coastguard Worker 
1078*0e209d39SAndroid Build Coastguard Worker             if(maxInc!=0 || maxOverlap!=0) {
1079*0e209d39SAndroid Build Coastguard Worker                 // Longest-match algorithm, and there was a string match.
1080*0e209d39SAndroid Build Coastguard Worker                 // Simply continue after it.
1081*0e209d39SAndroid Build Coastguard Worker                 pos+=maxInc;
1082*0e209d39SAndroid Build Coastguard Worker                 rest-=maxInc;
1083*0e209d39SAndroid Build Coastguard Worker                 if(rest==0) {
1084*0e209d39SAndroid Build Coastguard Worker                     return length;  // Reached the end of the string.
1085*0e209d39SAndroid Build Coastguard Worker                 }
1086*0e209d39SAndroid Build Coastguard Worker                 spanLength=0;  // Match strings from after a string match.
1087*0e209d39SAndroid Build Coastguard Worker                 continue;
1088*0e209d39SAndroid Build Coastguard Worker             }
1089*0e209d39SAndroid Build Coastguard Worker         }
1090*0e209d39SAndroid Build Coastguard Worker         // Finished trying to match all strings at pos.
1091*0e209d39SAndroid Build Coastguard Worker 
1092*0e209d39SAndroid Build Coastguard Worker         if(spanLength!=0 || pos==0) {
1093*0e209d39SAndroid Build Coastguard Worker             // The position is after an unlimited code point span (spanLength!=0),
1094*0e209d39SAndroid Build Coastguard Worker             // not after a string match.
1095*0e209d39SAndroid Build Coastguard Worker             // The only position where spanLength==0 after a span is pos==0.
1096*0e209d39SAndroid Build Coastguard Worker             // Otherwise, an unlimited code point span is only tried again when no
1097*0e209d39SAndroid Build Coastguard Worker             // strings match, and if such a non-initial span fails we stop.
1098*0e209d39SAndroid Build Coastguard Worker             if(offsets.isEmpty()) {
1099*0e209d39SAndroid Build Coastguard Worker                 return pos;  // No strings matched after a span.
1100*0e209d39SAndroid Build Coastguard Worker             }
1101*0e209d39SAndroid Build Coastguard Worker             // Match strings from after the next string match.
1102*0e209d39SAndroid Build Coastguard Worker         } else {
1103*0e209d39SAndroid Build Coastguard Worker             // The position is after a string match (or a single code point).
1104*0e209d39SAndroid Build Coastguard Worker             if(offsets.isEmpty()) {
1105*0e209d39SAndroid Build Coastguard Worker                 // No more strings matched after a previous string match.
1106*0e209d39SAndroid Build Coastguard Worker                 // Try another code point span from after the last string match.
1107*0e209d39SAndroid Build Coastguard Worker                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1108*0e209d39SAndroid Build Coastguard Worker                 if( spanLength==rest || // Reached the end of the string, or
1109*0e209d39SAndroid Build Coastguard Worker                     spanLength==0       // neither strings nor span progressed.
1110*0e209d39SAndroid Build Coastguard Worker                 ) {
1111*0e209d39SAndroid Build Coastguard Worker                     return pos+spanLength;
1112*0e209d39SAndroid Build Coastguard Worker                 }
1113*0e209d39SAndroid Build Coastguard Worker                 pos+=spanLength;
1114*0e209d39SAndroid Build Coastguard Worker                 rest-=spanLength;
1115*0e209d39SAndroid Build Coastguard Worker                 continue;  // spanLength>0: Match strings from after a span.
1116*0e209d39SAndroid Build Coastguard Worker             } else {
1117*0e209d39SAndroid Build Coastguard Worker                 // Try to match only one code point from after a string match if some
1118*0e209d39SAndroid Build Coastguard Worker                 // string matched beyond it, so that we try all possible positions
1119*0e209d39SAndroid Build Coastguard Worker                 // and don't overshoot.
1120*0e209d39SAndroid Build Coastguard Worker                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1121*0e209d39SAndroid Build Coastguard Worker                 if(spanLength>0) {
1122*0e209d39SAndroid Build Coastguard Worker                     if(spanLength==rest) {
1123*0e209d39SAndroid Build Coastguard Worker                         return length;  // Reached the end of the string.
1124*0e209d39SAndroid Build Coastguard Worker                     }
1125*0e209d39SAndroid Build Coastguard Worker                     // Match strings after this code point.
1126*0e209d39SAndroid Build Coastguard Worker                     // There cannot be any increments below it because UnicodeSet strings
1127*0e209d39SAndroid Build Coastguard Worker                     // contain multiple code points.
1128*0e209d39SAndroid Build Coastguard Worker                     pos+=spanLength;
1129*0e209d39SAndroid Build Coastguard Worker                     rest-=spanLength;
1130*0e209d39SAndroid Build Coastguard Worker                     offsets.shift(spanLength);
1131*0e209d39SAndroid Build Coastguard Worker                     spanLength=0;
1132*0e209d39SAndroid Build Coastguard Worker                     continue;  // Match strings from after a single code point.
1133*0e209d39SAndroid Build Coastguard Worker                 }
1134*0e209d39SAndroid Build Coastguard Worker                 // Match strings from after the next string match.
1135*0e209d39SAndroid Build Coastguard Worker             }
1136*0e209d39SAndroid Build Coastguard Worker         }
1137*0e209d39SAndroid Build Coastguard Worker         int32_t minOffset=offsets.popMinimum();
1138*0e209d39SAndroid Build Coastguard Worker         pos+=minOffset;
1139*0e209d39SAndroid Build Coastguard Worker         rest-=minOffset;
1140*0e209d39SAndroid Build Coastguard Worker         spanLength=0;  // Match strings from after a string match.
1141*0e209d39SAndroid Build Coastguard Worker     }
1142*0e209d39SAndroid Build Coastguard Worker }
1143*0e209d39SAndroid Build Coastguard Worker 
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const1144*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1145*0e209d39SAndroid Build Coastguard Worker     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1146*0e209d39SAndroid Build Coastguard Worker         return spanNotBackUTF8(s, length);
1147*0e209d39SAndroid Build Coastguard Worker     }
1148*0e209d39SAndroid Build Coastguard Worker     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1149*0e209d39SAndroid Build Coastguard Worker     if(pos==0) {
1150*0e209d39SAndroid Build Coastguard Worker         return 0;
1151*0e209d39SAndroid Build Coastguard Worker     }
1152*0e209d39SAndroid Build Coastguard Worker     int32_t spanLength=length-pos;
1153*0e209d39SAndroid Build Coastguard Worker 
1154*0e209d39SAndroid Build Coastguard Worker     // Consider strings; they may overlap with the span.
1155*0e209d39SAndroid Build Coastguard Worker     OffsetList offsets;
1156*0e209d39SAndroid Build Coastguard Worker     if(spanCondition==USET_SPAN_CONTAINED) {
1157*0e209d39SAndroid Build Coastguard Worker         // Use offset list to try all possibilities.
1158*0e209d39SAndroid Build Coastguard Worker         offsets.setMaxLength(maxLength8);
1159*0e209d39SAndroid Build Coastguard Worker     }
1160*0e209d39SAndroid Build Coastguard Worker     int32_t i, stringsLength=strings.size();
1161*0e209d39SAndroid Build Coastguard Worker     uint8_t *spanBackUTF8Lengths=spanLengths;
1162*0e209d39SAndroid Build Coastguard Worker     if(all) {
1163*0e209d39SAndroid Build Coastguard Worker         spanBackUTF8Lengths+=3*stringsLength;
1164*0e209d39SAndroid Build Coastguard Worker     }
1165*0e209d39SAndroid Build Coastguard Worker     for(;;) {
1166*0e209d39SAndroid Build Coastguard Worker         const uint8_t *s8=utf8;
1167*0e209d39SAndroid Build Coastguard Worker         int32_t length8;
1168*0e209d39SAndroid Build Coastguard Worker         if(spanCondition==USET_SPAN_CONTAINED) {
1169*0e209d39SAndroid Build Coastguard Worker             for(i=0; i<stringsLength; ++i) {
1170*0e209d39SAndroid Build Coastguard Worker                 length8=utf8Lengths[i];
1171*0e209d39SAndroid Build Coastguard Worker                 if(length8==0) {
1172*0e209d39SAndroid Build Coastguard Worker                     continue;  // String not representable in UTF-8.
1173*0e209d39SAndroid Build Coastguard Worker                 }
1174*0e209d39SAndroid Build Coastguard Worker                 int32_t overlap=spanBackUTF8Lengths[i];
1175*0e209d39SAndroid Build Coastguard Worker                 if(overlap==ALL_CP_CONTAINED) {
1176*0e209d39SAndroid Build Coastguard Worker                     s8+=length8;
1177*0e209d39SAndroid Build Coastguard Worker                     continue;  // Irrelevant string.
1178*0e209d39SAndroid Build Coastguard Worker                 }
1179*0e209d39SAndroid Build Coastguard Worker 
1180*0e209d39SAndroid Build Coastguard Worker                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1181*0e209d39SAndroid Build Coastguard Worker                 if(overlap>=LONG_SPAN) {
1182*0e209d39SAndroid Build Coastguard Worker                     overlap=length8;
1183*0e209d39SAndroid Build Coastguard Worker                     // While contained: No point matching fully inside the code point span.
1184*0e209d39SAndroid Build Coastguard Worker                     int32_t len1=0;
1185*0e209d39SAndroid Build Coastguard Worker                     U8_FWD_1(s8, len1, overlap);
1186*0e209d39SAndroid Build Coastguard Worker                     overlap-=len1;  // Length of the string minus the first code point.
1187*0e209d39SAndroid Build Coastguard Worker                 }
1188*0e209d39SAndroid Build Coastguard Worker                 if(overlap>spanLength) {
1189*0e209d39SAndroid Build Coastguard Worker                     overlap=spanLength;
1190*0e209d39SAndroid Build Coastguard Worker                 }
1191*0e209d39SAndroid Build Coastguard Worker                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1192*0e209d39SAndroid Build Coastguard Worker                 for(;;) {
1193*0e209d39SAndroid Build Coastguard Worker                     if(dec>pos) {
1194*0e209d39SAndroid Build Coastguard Worker                         break;
1195*0e209d39SAndroid Build Coastguard Worker                     }
1196*0e209d39SAndroid Build Coastguard Worker                     // Try to match if the decrement is not listed already.
1197*0e209d39SAndroid Build Coastguard Worker                     // Match at code point boundaries. (The UTF-8 strings were converted
1198*0e209d39SAndroid Build Coastguard Worker                     // from UTF-16 and are guaranteed to be well-formed.)
1199*0e209d39SAndroid Build Coastguard Worker                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1200*0e209d39SAndroid Build Coastguard Worker                         !offsets.containsOffset(dec) &&
1201*0e209d39SAndroid Build Coastguard Worker                         matches8(s+pos-dec, s8, length8)
1202*0e209d39SAndroid Build Coastguard Worker                     ) {
1203*0e209d39SAndroid Build Coastguard Worker                         if(dec==pos) {
1204*0e209d39SAndroid Build Coastguard Worker                             return 0;  // Reached the start of the string.
1205*0e209d39SAndroid Build Coastguard Worker                         }
1206*0e209d39SAndroid Build Coastguard Worker                         offsets.addOffset(dec);
1207*0e209d39SAndroid Build Coastguard Worker                     }
1208*0e209d39SAndroid Build Coastguard Worker                     if(overlap==0) {
1209*0e209d39SAndroid Build Coastguard Worker                         break;
1210*0e209d39SAndroid Build Coastguard Worker                     }
1211*0e209d39SAndroid Build Coastguard Worker                     --overlap;
1212*0e209d39SAndroid Build Coastguard Worker                     ++dec;
1213*0e209d39SAndroid Build Coastguard Worker                 }
1214*0e209d39SAndroid Build Coastguard Worker                 s8+=length8;
1215*0e209d39SAndroid Build Coastguard Worker             }
1216*0e209d39SAndroid Build Coastguard Worker         } else /* USET_SPAN_SIMPLE */ {
1217*0e209d39SAndroid Build Coastguard Worker             int32_t maxDec=0, maxOverlap=0;
1218*0e209d39SAndroid Build Coastguard Worker             for(i=0; i<stringsLength; ++i) {
1219*0e209d39SAndroid Build Coastguard Worker                 length8=utf8Lengths[i];
1220*0e209d39SAndroid Build Coastguard Worker                 if(length8==0) {
1221*0e209d39SAndroid Build Coastguard Worker                     continue;  // String not representable in UTF-8.
1222*0e209d39SAndroid Build Coastguard Worker                 }
1223*0e209d39SAndroid Build Coastguard Worker                 int32_t overlap=spanBackUTF8Lengths[i];
1224*0e209d39SAndroid Build Coastguard Worker                 // For longest match, we do need to try to match even an all-contained string
1225*0e209d39SAndroid Build Coastguard Worker                 // to find the match from the latest end.
1226*0e209d39SAndroid Build Coastguard Worker 
1227*0e209d39SAndroid Build Coastguard Worker                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1228*0e209d39SAndroid Build Coastguard Worker                 if(overlap>=LONG_SPAN) {
1229*0e209d39SAndroid Build Coastguard Worker                     overlap=length8;
1230*0e209d39SAndroid Build Coastguard Worker                     // Longest match: Need to match fully inside the code point span
1231*0e209d39SAndroid Build Coastguard Worker                     // to find the match from the latest end.
1232*0e209d39SAndroid Build Coastguard Worker                 }
1233*0e209d39SAndroid Build Coastguard Worker                 if(overlap>spanLength) {
1234*0e209d39SAndroid Build Coastguard Worker                     overlap=spanLength;
1235*0e209d39SAndroid Build Coastguard Worker                 }
1236*0e209d39SAndroid Build Coastguard Worker                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1237*0e209d39SAndroid Build Coastguard Worker                 for(;;) {
1238*0e209d39SAndroid Build Coastguard Worker                     if(dec>pos || overlap<maxOverlap) {
1239*0e209d39SAndroid Build Coastguard Worker                         break;
1240*0e209d39SAndroid Build Coastguard Worker                     }
1241*0e209d39SAndroid Build Coastguard Worker                     // Try to match if the string is longer or ends later.
1242*0e209d39SAndroid Build Coastguard Worker                     // Match at code point boundaries. (The UTF-8 strings were converted
1243*0e209d39SAndroid Build Coastguard Worker                     // from UTF-16 and are guaranteed to be well-formed.)
1244*0e209d39SAndroid Build Coastguard Worker                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1245*0e209d39SAndroid Build Coastguard Worker                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1246*0e209d39SAndroid Build Coastguard Worker                         matches8(s+pos-dec, s8, length8)
1247*0e209d39SAndroid Build Coastguard Worker                     ) {
1248*0e209d39SAndroid Build Coastguard Worker                         maxDec=dec;  // Longest match from latest end.
1249*0e209d39SAndroid Build Coastguard Worker                         maxOverlap=overlap;
1250*0e209d39SAndroid Build Coastguard Worker                         break;
1251*0e209d39SAndroid Build Coastguard Worker                     }
1252*0e209d39SAndroid Build Coastguard Worker                     --overlap;
1253*0e209d39SAndroid Build Coastguard Worker                     ++dec;
1254*0e209d39SAndroid Build Coastguard Worker                 }
1255*0e209d39SAndroid Build Coastguard Worker                 s8+=length8;
1256*0e209d39SAndroid Build Coastguard Worker             }
1257*0e209d39SAndroid Build Coastguard Worker 
1258*0e209d39SAndroid Build Coastguard Worker             if(maxDec!=0 || maxOverlap!=0) {
1259*0e209d39SAndroid Build Coastguard Worker                 // Longest-match algorithm, and there was a string match.
1260*0e209d39SAndroid Build Coastguard Worker                 // Simply continue before it.
1261*0e209d39SAndroid Build Coastguard Worker                 pos-=maxDec;
1262*0e209d39SAndroid Build Coastguard Worker                 if(pos==0) {
1263*0e209d39SAndroid Build Coastguard Worker                     return 0;  // Reached the start of the string.
1264*0e209d39SAndroid Build Coastguard Worker                 }
1265*0e209d39SAndroid Build Coastguard Worker                 spanLength=0;  // Match strings from before a string match.
1266*0e209d39SAndroid Build Coastguard Worker                 continue;
1267*0e209d39SAndroid Build Coastguard Worker             }
1268*0e209d39SAndroid Build Coastguard Worker         }
1269*0e209d39SAndroid Build Coastguard Worker         // Finished trying to match all strings at pos.
1270*0e209d39SAndroid Build Coastguard Worker 
1271*0e209d39SAndroid Build Coastguard Worker         if(spanLength!=0 || pos==length) {
1272*0e209d39SAndroid Build Coastguard Worker             // The position is before an unlimited code point span (spanLength!=0),
1273*0e209d39SAndroid Build Coastguard Worker             // not before a string match.
1274*0e209d39SAndroid Build Coastguard Worker             // The only position where spanLength==0 before a span is pos==length.
1275*0e209d39SAndroid Build Coastguard Worker             // Otherwise, an unlimited code point span is only tried again when no
1276*0e209d39SAndroid Build Coastguard Worker             // strings match, and if such a non-initial span fails we stop.
1277*0e209d39SAndroid Build Coastguard Worker             if(offsets.isEmpty()) {
1278*0e209d39SAndroid Build Coastguard Worker                 return pos;  // No strings matched before a span.
1279*0e209d39SAndroid Build Coastguard Worker             }
1280*0e209d39SAndroid Build Coastguard Worker             // Match strings from before the next string match.
1281*0e209d39SAndroid Build Coastguard Worker         } else {
1282*0e209d39SAndroid Build Coastguard Worker             // The position is before a string match (or a single code point).
1283*0e209d39SAndroid Build Coastguard Worker             if(offsets.isEmpty()) {
1284*0e209d39SAndroid Build Coastguard Worker                 // No more strings matched before a previous string match.
1285*0e209d39SAndroid Build Coastguard Worker                 // Try another code point span from before the last string match.
1286*0e209d39SAndroid Build Coastguard Worker                 int32_t oldPos=pos;
1287*0e209d39SAndroid Build Coastguard Worker                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1288*0e209d39SAndroid Build Coastguard Worker                 spanLength=oldPos-pos;
1289*0e209d39SAndroid Build Coastguard Worker                 if( pos==0 ||           // Reached the start of the string, or
1290*0e209d39SAndroid Build Coastguard Worker                     spanLength==0       // neither strings nor span progressed.
1291*0e209d39SAndroid Build Coastguard Worker                 ) {
1292*0e209d39SAndroid Build Coastguard Worker                     return pos;
1293*0e209d39SAndroid Build Coastguard Worker                 }
1294*0e209d39SAndroid Build Coastguard Worker                 continue;  // spanLength>0: Match strings from before a span.
1295*0e209d39SAndroid Build Coastguard Worker             } else {
1296*0e209d39SAndroid Build Coastguard Worker                 // Try to match only one code point from before a string match if some
1297*0e209d39SAndroid Build Coastguard Worker                 // string matched beyond it, so that we try all possible positions
1298*0e209d39SAndroid Build Coastguard Worker                 // and don't overshoot.
1299*0e209d39SAndroid Build Coastguard Worker                 spanLength=spanOneBackUTF8(spanSet, s, pos);
1300*0e209d39SAndroid Build Coastguard Worker                 if(spanLength>0) {
1301*0e209d39SAndroid Build Coastguard Worker                     if(spanLength==pos) {
1302*0e209d39SAndroid Build Coastguard Worker                         return 0;  // Reached the start of the string.
1303*0e209d39SAndroid Build Coastguard Worker                     }
1304*0e209d39SAndroid Build Coastguard Worker                     // Match strings before this code point.
1305*0e209d39SAndroid Build Coastguard Worker                     // There cannot be any decrements below it because UnicodeSet strings
1306*0e209d39SAndroid Build Coastguard Worker                     // contain multiple code points.
1307*0e209d39SAndroid Build Coastguard Worker                     pos-=spanLength;
1308*0e209d39SAndroid Build Coastguard Worker                     offsets.shift(spanLength);
1309*0e209d39SAndroid Build Coastguard Worker                     spanLength=0;
1310*0e209d39SAndroid Build Coastguard Worker                     continue;  // Match strings from before a single code point.
1311*0e209d39SAndroid Build Coastguard Worker                 }
1312*0e209d39SAndroid Build Coastguard Worker                 // Match strings from before the next string match.
1313*0e209d39SAndroid Build Coastguard Worker             }
1314*0e209d39SAndroid Build Coastguard Worker         }
1315*0e209d39SAndroid Build Coastguard Worker         pos-=offsets.popMinimum();
1316*0e209d39SAndroid Build Coastguard Worker         spanLength=0;  // Match strings from before a string match.
1317*0e209d39SAndroid Build Coastguard Worker     }
1318*0e209d39SAndroid Build Coastguard Worker }
1319*0e209d39SAndroid Build Coastguard Worker 
1320*0e209d39SAndroid Build Coastguard Worker /*
1321*0e209d39SAndroid Build Coastguard Worker  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1322*0e209d39SAndroid Build Coastguard Worker  *
1323*0e209d39SAndroid Build Coastguard Worker  * Theoretical algorithm:
1324*0e209d39SAndroid Build Coastguard Worker  * - Iterate through the string, and at each code point boundary:
1325*0e209d39SAndroid Build Coastguard Worker  *   + If the code point there is in the set, then return with the current position.
1326*0e209d39SAndroid Build Coastguard Worker  *   + If a set string matches at the current position, then return with the current position.
1327*0e209d39SAndroid Build Coastguard Worker  *
1328*0e209d39SAndroid Build Coastguard Worker  * Optimized implementation:
1329*0e209d39SAndroid Build Coastguard Worker  *
1330*0e209d39SAndroid Build Coastguard Worker  * (Same assumption as for span() above.)
1331*0e209d39SAndroid Build Coastguard Worker  *
1332*0e209d39SAndroid Build Coastguard Worker  * Create and cache a spanNotSet which contains all of the single code points
1333*0e209d39SAndroid Build Coastguard Worker  * of the original set but none of its strings.
1334*0e209d39SAndroid Build Coastguard Worker  * For each set string add its initial code point to the spanNotSet.
1335*0e209d39SAndroid Build Coastguard Worker  * (Also add its final code point for spanNotBack().)
1336*0e209d39SAndroid Build Coastguard Worker  *
1337*0e209d39SAndroid Build Coastguard Worker  * - Loop:
1338*0e209d39SAndroid Build Coastguard Worker  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1339*0e209d39SAndroid Build Coastguard Worker  *   + If the current code point is in the original set, then
1340*0e209d39SAndroid Build Coastguard Worker  *     return the current position.
1341*0e209d39SAndroid Build Coastguard Worker  *   + If any set string matches at the current position, then
1342*0e209d39SAndroid Build Coastguard Worker  *     return the current position.
1343*0e209d39SAndroid Build Coastguard Worker  *   + If there is no match at the current position, neither for the code point there
1344*0e209d39SAndroid Build Coastguard Worker  *     nor for any set string, then skip this code point and continue the loop.
1345*0e209d39SAndroid Build Coastguard Worker  *     This happens for set-string-initial code points that were added to spanNotSet
1346*0e209d39SAndroid Build Coastguard Worker  *     when there is not actually a match for such a set string.
1347*0e209d39SAndroid Build Coastguard Worker  */
1348*0e209d39SAndroid Build Coastguard Worker 
spanNot(const char16_t * s,int32_t length) const1349*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanNot(const char16_t *s, int32_t length) const {
1350*0e209d39SAndroid Build Coastguard Worker     int32_t pos=0, rest=length;
1351*0e209d39SAndroid Build Coastguard Worker     int32_t i, stringsLength=strings.size();
1352*0e209d39SAndroid Build Coastguard Worker     do {
1353*0e209d39SAndroid Build Coastguard Worker         // Span until we find a code point from the set,
1354*0e209d39SAndroid Build Coastguard Worker         // or a code point that starts or ends some string.
1355*0e209d39SAndroid Build Coastguard Worker         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1356*0e209d39SAndroid Build Coastguard Worker         if(i==rest) {
1357*0e209d39SAndroid Build Coastguard Worker             return length;  // Reached the end of the string.
1358*0e209d39SAndroid Build Coastguard Worker         }
1359*0e209d39SAndroid Build Coastguard Worker         pos+=i;
1360*0e209d39SAndroid Build Coastguard Worker         rest-=i;
1361*0e209d39SAndroid Build Coastguard Worker 
1362*0e209d39SAndroid Build Coastguard Worker         // Check whether the current code point is in the original set,
1363*0e209d39SAndroid Build Coastguard Worker         // without the string starts and ends.
1364*0e209d39SAndroid Build Coastguard Worker         int32_t cpLength=spanOne(spanSet, s+pos, rest);
1365*0e209d39SAndroid Build Coastguard Worker         if(cpLength>0) {
1366*0e209d39SAndroid Build Coastguard Worker             return pos;  // There is a set element at pos.
1367*0e209d39SAndroid Build Coastguard Worker         }
1368*0e209d39SAndroid Build Coastguard Worker 
1369*0e209d39SAndroid Build Coastguard Worker         // Try to match the strings at pos.
1370*0e209d39SAndroid Build Coastguard Worker         for(i=0; i<stringsLength; ++i) {
1371*0e209d39SAndroid Build Coastguard Worker             if(spanLengths[i]==ALL_CP_CONTAINED) {
1372*0e209d39SAndroid Build Coastguard Worker                 continue;  // Irrelevant string. (Also the empty string.)
1373*0e209d39SAndroid Build Coastguard Worker             }
1374*0e209d39SAndroid Build Coastguard Worker             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1375*0e209d39SAndroid Build Coastguard Worker             const char16_t *s16=string.getBuffer();
1376*0e209d39SAndroid Build Coastguard Worker             int32_t length16=string.length();
1377*0e209d39SAndroid Build Coastguard Worker             U_ASSERT(length>0);
1378*0e209d39SAndroid Build Coastguard Worker             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1379*0e209d39SAndroid Build Coastguard Worker                 return pos;  // There is a set element at pos.
1380*0e209d39SAndroid Build Coastguard Worker             }
1381*0e209d39SAndroid Build Coastguard Worker         }
1382*0e209d39SAndroid Build Coastguard Worker 
1383*0e209d39SAndroid Build Coastguard Worker         // The span(while not contained) ended on a string start/end which is
1384*0e209d39SAndroid Build Coastguard Worker         // not in the original set. Skip this code point and continue.
1385*0e209d39SAndroid Build Coastguard Worker         // cpLength<0
1386*0e209d39SAndroid Build Coastguard Worker         pos-=cpLength;
1387*0e209d39SAndroid Build Coastguard Worker         rest+=cpLength;
1388*0e209d39SAndroid Build Coastguard Worker     } while(rest!=0);
1389*0e209d39SAndroid Build Coastguard Worker     return length;  // Reached the end of the string.
1390*0e209d39SAndroid Build Coastguard Worker }
1391*0e209d39SAndroid Build Coastguard Worker 
spanNotBack(const char16_t * s,int32_t length) const1392*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanNotBack(const char16_t *s, int32_t length) const {
1393*0e209d39SAndroid Build Coastguard Worker     int32_t pos=length;
1394*0e209d39SAndroid Build Coastguard Worker     int32_t i, stringsLength=strings.size();
1395*0e209d39SAndroid Build Coastguard Worker     do {
1396*0e209d39SAndroid Build Coastguard Worker         // Span until we find a code point from the set,
1397*0e209d39SAndroid Build Coastguard Worker         // or a code point that starts or ends some string.
1398*0e209d39SAndroid Build Coastguard Worker         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1399*0e209d39SAndroid Build Coastguard Worker         if(pos==0) {
1400*0e209d39SAndroid Build Coastguard Worker             return 0;  // Reached the start of the string.
1401*0e209d39SAndroid Build Coastguard Worker         }
1402*0e209d39SAndroid Build Coastguard Worker 
1403*0e209d39SAndroid Build Coastguard Worker         // Check whether the current code point is in the original set,
1404*0e209d39SAndroid Build Coastguard Worker         // without the string starts and ends.
1405*0e209d39SAndroid Build Coastguard Worker         int32_t cpLength=spanOneBack(spanSet, s, pos);
1406*0e209d39SAndroid Build Coastguard Worker         if(cpLength>0) {
1407*0e209d39SAndroid Build Coastguard Worker             return pos;  // There is a set element at pos.
1408*0e209d39SAndroid Build Coastguard Worker         }
1409*0e209d39SAndroid Build Coastguard Worker 
1410*0e209d39SAndroid Build Coastguard Worker         // Try to match the strings at pos.
1411*0e209d39SAndroid Build Coastguard Worker         for(i=0; i<stringsLength; ++i) {
1412*0e209d39SAndroid Build Coastguard Worker             // Use spanLengths rather than a spanBackLengths pointer because
1413*0e209d39SAndroid Build Coastguard Worker             // it is easier and we only need to know whether the string is irrelevant
1414*0e209d39SAndroid Build Coastguard Worker             // which is the same in either array.
1415*0e209d39SAndroid Build Coastguard Worker             if(spanLengths[i]==ALL_CP_CONTAINED) {
1416*0e209d39SAndroid Build Coastguard Worker                 continue;  // Irrelevant string. (Also the empty string.)
1417*0e209d39SAndroid Build Coastguard Worker             }
1418*0e209d39SAndroid Build Coastguard Worker             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1419*0e209d39SAndroid Build Coastguard Worker             const char16_t *s16=string.getBuffer();
1420*0e209d39SAndroid Build Coastguard Worker             int32_t length16=string.length();
1421*0e209d39SAndroid Build Coastguard Worker             U_ASSERT(length>0);
1422*0e209d39SAndroid Build Coastguard Worker             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1423*0e209d39SAndroid Build Coastguard Worker                 return pos;  // There is a set element at pos.
1424*0e209d39SAndroid Build Coastguard Worker             }
1425*0e209d39SAndroid Build Coastguard Worker         }
1426*0e209d39SAndroid Build Coastguard Worker 
1427*0e209d39SAndroid Build Coastguard Worker         // The span(while not contained) ended on a string start/end which is
1428*0e209d39SAndroid Build Coastguard Worker         // not in the original set. Skip this code point and continue.
1429*0e209d39SAndroid Build Coastguard Worker         // cpLength<0
1430*0e209d39SAndroid Build Coastguard Worker         pos+=cpLength;
1431*0e209d39SAndroid Build Coastguard Worker     } while(pos!=0);
1432*0e209d39SAndroid Build Coastguard Worker     return 0;  // Reached the start of the string.
1433*0e209d39SAndroid Build Coastguard Worker }
1434*0e209d39SAndroid Build Coastguard Worker 
spanNotUTF8(const uint8_t * s,int32_t length) const1435*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1436*0e209d39SAndroid Build Coastguard Worker     int32_t pos=0, rest=length;
1437*0e209d39SAndroid Build Coastguard Worker     int32_t i, stringsLength=strings.size();
1438*0e209d39SAndroid Build Coastguard Worker     uint8_t *spanUTF8Lengths=spanLengths;
1439*0e209d39SAndroid Build Coastguard Worker     if(all) {
1440*0e209d39SAndroid Build Coastguard Worker         spanUTF8Lengths+=2*stringsLength;
1441*0e209d39SAndroid Build Coastguard Worker     }
1442*0e209d39SAndroid Build Coastguard Worker     do {
1443*0e209d39SAndroid Build Coastguard Worker         // Span until we find a code point from the set,
1444*0e209d39SAndroid Build Coastguard Worker         // or a code point that starts or ends some string.
1445*0e209d39SAndroid Build Coastguard Worker         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1446*0e209d39SAndroid Build Coastguard Worker         if(i==rest) {
1447*0e209d39SAndroid Build Coastguard Worker             return length;  // Reached the end of the string.
1448*0e209d39SAndroid Build Coastguard Worker         }
1449*0e209d39SAndroid Build Coastguard Worker         pos+=i;
1450*0e209d39SAndroid Build Coastguard Worker         rest-=i;
1451*0e209d39SAndroid Build Coastguard Worker 
1452*0e209d39SAndroid Build Coastguard Worker         // Check whether the current code point is in the original set,
1453*0e209d39SAndroid Build Coastguard Worker         // without the string starts and ends.
1454*0e209d39SAndroid Build Coastguard Worker         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1455*0e209d39SAndroid Build Coastguard Worker         if(cpLength>0) {
1456*0e209d39SAndroid Build Coastguard Worker             return pos;  // There is a set element at pos.
1457*0e209d39SAndroid Build Coastguard Worker         }
1458*0e209d39SAndroid Build Coastguard Worker 
1459*0e209d39SAndroid Build Coastguard Worker         // Try to match the strings at pos.
1460*0e209d39SAndroid Build Coastguard Worker         const uint8_t *s8=utf8;
1461*0e209d39SAndroid Build Coastguard Worker         int32_t length8;
1462*0e209d39SAndroid Build Coastguard Worker         for(i=0; i<stringsLength; ++i) {
1463*0e209d39SAndroid Build Coastguard Worker             length8=utf8Lengths[i];
1464*0e209d39SAndroid Build Coastguard Worker             // ALL_CP_CONTAINED: Irrelevant string.
1465*0e209d39SAndroid Build Coastguard Worker             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1466*0e209d39SAndroid Build Coastguard Worker                 return pos;  // There is a set element at pos.
1467*0e209d39SAndroid Build Coastguard Worker             }
1468*0e209d39SAndroid Build Coastguard Worker             s8+=length8;
1469*0e209d39SAndroid Build Coastguard Worker         }
1470*0e209d39SAndroid Build Coastguard Worker 
1471*0e209d39SAndroid Build Coastguard Worker         // The span(while not contained) ended on a string start/end which is
1472*0e209d39SAndroid Build Coastguard Worker         // not in the original set. Skip this code point and continue.
1473*0e209d39SAndroid Build Coastguard Worker         // cpLength<0
1474*0e209d39SAndroid Build Coastguard Worker         pos-=cpLength;
1475*0e209d39SAndroid Build Coastguard Worker         rest+=cpLength;
1476*0e209d39SAndroid Build Coastguard Worker     } while(rest!=0);
1477*0e209d39SAndroid Build Coastguard Worker     return length;  // Reached the end of the string.
1478*0e209d39SAndroid Build Coastguard Worker }
1479*0e209d39SAndroid Build Coastguard Worker 
spanNotBackUTF8(const uint8_t * s,int32_t length) const1480*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1481*0e209d39SAndroid Build Coastguard Worker     int32_t pos=length;
1482*0e209d39SAndroid Build Coastguard Worker     int32_t i, stringsLength=strings.size();
1483*0e209d39SAndroid Build Coastguard Worker     uint8_t *spanBackUTF8Lengths=spanLengths;
1484*0e209d39SAndroid Build Coastguard Worker     if(all) {
1485*0e209d39SAndroid Build Coastguard Worker         spanBackUTF8Lengths+=3*stringsLength;
1486*0e209d39SAndroid Build Coastguard Worker     }
1487*0e209d39SAndroid Build Coastguard Worker     do {
1488*0e209d39SAndroid Build Coastguard Worker         // Span until we find a code point from the set,
1489*0e209d39SAndroid Build Coastguard Worker         // or a code point that starts or ends some string.
1490*0e209d39SAndroid Build Coastguard Worker         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1491*0e209d39SAndroid Build Coastguard Worker         if(pos==0) {
1492*0e209d39SAndroid Build Coastguard Worker             return 0;  // Reached the start of the string.
1493*0e209d39SAndroid Build Coastguard Worker         }
1494*0e209d39SAndroid Build Coastguard Worker 
1495*0e209d39SAndroid Build Coastguard Worker         // Check whether the current code point is in the original set,
1496*0e209d39SAndroid Build Coastguard Worker         // without the string starts and ends.
1497*0e209d39SAndroid Build Coastguard Worker         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1498*0e209d39SAndroid Build Coastguard Worker         if(cpLength>0) {
1499*0e209d39SAndroid Build Coastguard Worker             return pos;  // There is a set element at pos.
1500*0e209d39SAndroid Build Coastguard Worker         }
1501*0e209d39SAndroid Build Coastguard Worker 
1502*0e209d39SAndroid Build Coastguard Worker         // Try to match the strings at pos.
1503*0e209d39SAndroid Build Coastguard Worker         const uint8_t *s8=utf8;
1504*0e209d39SAndroid Build Coastguard Worker         int32_t length8;
1505*0e209d39SAndroid Build Coastguard Worker         for(i=0; i<stringsLength; ++i) {
1506*0e209d39SAndroid Build Coastguard Worker             length8=utf8Lengths[i];
1507*0e209d39SAndroid Build Coastguard Worker             // ALL_CP_CONTAINED: Irrelevant string.
1508*0e209d39SAndroid Build Coastguard Worker             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1509*0e209d39SAndroid Build Coastguard Worker                 return pos;  // There is a set element at pos.
1510*0e209d39SAndroid Build Coastguard Worker             }
1511*0e209d39SAndroid Build Coastguard Worker             s8+=length8;
1512*0e209d39SAndroid Build Coastguard Worker         }
1513*0e209d39SAndroid Build Coastguard Worker 
1514*0e209d39SAndroid Build Coastguard Worker         // The span(while not contained) ended on a string start/end which is
1515*0e209d39SAndroid Build Coastguard Worker         // not in the original set. Skip this code point and continue.
1516*0e209d39SAndroid Build Coastguard Worker         // cpLength<0
1517*0e209d39SAndroid Build Coastguard Worker         pos+=cpLength;
1518*0e209d39SAndroid Build Coastguard Worker     } while(pos!=0);
1519*0e209d39SAndroid Build Coastguard Worker     return 0;  // Reached the start of the string.
1520*0e209d39SAndroid Build Coastguard Worker }
1521*0e209d39SAndroid Build Coastguard Worker 
1522*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
1523