1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker ******************************************************************************
5*0e209d39SAndroid Build Coastguard Worker *
6*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2007-2012, International Business Machines
7*0e209d39SAndroid Build Coastguard Worker * Corporation and others. All Rights Reserved.
8*0e209d39SAndroid Build Coastguard Worker *
9*0e209d39SAndroid Build Coastguard Worker ******************************************************************************
10*0e209d39SAndroid Build Coastguard Worker * file name: unisetspan.cpp
11*0e209d39SAndroid Build Coastguard Worker * encoding: UTF-8
12*0e209d39SAndroid Build Coastguard Worker * tab size: 8 (not used)
13*0e209d39SAndroid Build Coastguard Worker * indentation:4
14*0e209d39SAndroid Build Coastguard Worker *
15*0e209d39SAndroid Build Coastguard Worker * created on: 2007mar01
16*0e209d39SAndroid Build Coastguard Worker * created by: Markus W. Scherer
17*0e209d39SAndroid Build Coastguard Worker */
18*0e209d39SAndroid Build Coastguard Worker
19*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
20*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h"
21*0e209d39SAndroid Build Coastguard Worker #include "unicode/ustring.h"
22*0e209d39SAndroid Build Coastguard Worker #include "unicode/utf8.h"
23*0e209d39SAndroid Build Coastguard Worker #include "unicode/utf16.h"
24*0e209d39SAndroid Build Coastguard Worker #include "cmemory.h"
25*0e209d39SAndroid Build Coastguard Worker #include "uvector.h"
26*0e209d39SAndroid Build Coastguard Worker #include "unisetspan.h"
27*0e209d39SAndroid Build Coastguard Worker
28*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
29*0e209d39SAndroid Build Coastguard Worker
30*0e209d39SAndroid Build Coastguard Worker /*
31*0e209d39SAndroid Build Coastguard Worker * List of offsets from the current position from where to try matching
32*0e209d39SAndroid Build Coastguard Worker * a code point or a string.
33*0e209d39SAndroid Build Coastguard Worker * Store offsets rather than indexes to simplify the code and use the same list
34*0e209d39SAndroid Build Coastguard Worker * for both increments (in span()) and decrements (in spanBack()).
35*0e209d39SAndroid Build Coastguard Worker *
36*0e209d39SAndroid Build Coastguard Worker * Assumption: The maximum offset is limited, and the offsets that are stored
37*0e209d39SAndroid Build Coastguard Worker * at any one time are relatively dense, that is, there are normally no gaps of
38*0e209d39SAndroid Build Coastguard Worker * hundreds or thousands of offset values.
39*0e209d39SAndroid Build Coastguard Worker *
40*0e209d39SAndroid Build Coastguard Worker * The implementation uses a circular buffer of byte flags,
41*0e209d39SAndroid Build Coastguard Worker * each indicating whether the corresponding offset is in the list.
42*0e209d39SAndroid Build Coastguard Worker * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43*0e209d39SAndroid Build Coastguard Worker * physically moving part of the list.
44*0e209d39SAndroid Build Coastguard Worker *
45*0e209d39SAndroid Build Coastguard Worker * Note: In principle, the caller should setMaxLength() to the maximum of the
46*0e209d39SAndroid Build Coastguard Worker * max string length and U16_LENGTH/U8_LENGTH to account for
47*0e209d39SAndroid Build Coastguard Worker * "long" single code points.
48*0e209d39SAndroid Build Coastguard Worker * However, this implementation uses at least a staticList with more than
49*0e209d39SAndroid Build Coastguard Worker * U8_LENGTH entries anyway.
50*0e209d39SAndroid Build Coastguard Worker *
51*0e209d39SAndroid Build Coastguard Worker * Note: If maxLength were guaranteed to be no more than 32 or 64,
52*0e209d39SAndroid Build Coastguard Worker * the list could be stored as bit flags in a single integer.
53*0e209d39SAndroid Build Coastguard Worker * Rather than handling a circular buffer with a start list index,
54*0e209d39SAndroid Build Coastguard Worker * the integer would simply be shifted when lower offsets are removed.
55*0e209d39SAndroid Build Coastguard Worker * UnicodeSet does not have a limit on the lengths of strings.
56*0e209d39SAndroid Build Coastguard Worker */
57*0e209d39SAndroid Build Coastguard Worker class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
58*0e209d39SAndroid Build Coastguard Worker public:
OffsetList()59*0e209d39SAndroid Build Coastguard Worker OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60*0e209d39SAndroid Build Coastguard Worker
~OffsetList()61*0e209d39SAndroid Build Coastguard Worker ~OffsetList() {
62*0e209d39SAndroid Build Coastguard Worker if(list!=staticList) {
63*0e209d39SAndroid Build Coastguard Worker uprv_free(list);
64*0e209d39SAndroid Build Coastguard Worker }
65*0e209d39SAndroid Build Coastguard Worker }
66*0e209d39SAndroid Build Coastguard Worker
67*0e209d39SAndroid Build Coastguard Worker // Call exactly once if the list is to be used.
setMaxLength(int32_t maxLength)68*0e209d39SAndroid Build Coastguard Worker void setMaxLength(int32_t maxLength) {
69*0e209d39SAndroid Build Coastguard Worker if(maxLength<=(int32_t)sizeof(staticList)) {
70*0e209d39SAndroid Build Coastguard Worker capacity=(int32_t)sizeof(staticList);
71*0e209d39SAndroid Build Coastguard Worker } else {
72*0e209d39SAndroid Build Coastguard Worker UBool *l=(UBool *)uprv_malloc(maxLength);
73*0e209d39SAndroid Build Coastguard Worker if(l!=nullptr) {
74*0e209d39SAndroid Build Coastguard Worker list=l;
75*0e209d39SAndroid Build Coastguard Worker capacity=maxLength;
76*0e209d39SAndroid Build Coastguard Worker }
77*0e209d39SAndroid Build Coastguard Worker }
78*0e209d39SAndroid Build Coastguard Worker uprv_memset(list, 0, capacity);
79*0e209d39SAndroid Build Coastguard Worker }
80*0e209d39SAndroid Build Coastguard Worker
clear()81*0e209d39SAndroid Build Coastguard Worker void clear() {
82*0e209d39SAndroid Build Coastguard Worker uprv_memset(list, 0, capacity);
83*0e209d39SAndroid Build Coastguard Worker start=length=0;
84*0e209d39SAndroid Build Coastguard Worker }
85*0e209d39SAndroid Build Coastguard Worker
isEmpty() const86*0e209d39SAndroid Build Coastguard Worker UBool isEmpty() const {
87*0e209d39SAndroid Build Coastguard Worker return (UBool)(length==0);
88*0e209d39SAndroid Build Coastguard Worker }
89*0e209d39SAndroid Build Coastguard Worker
90*0e209d39SAndroid Build Coastguard Worker // Reduce all stored offsets by delta, used when the current position
91*0e209d39SAndroid Build Coastguard Worker // moves by delta.
92*0e209d39SAndroid Build Coastguard Worker // There must not be any offsets lower than delta.
93*0e209d39SAndroid Build Coastguard Worker // If there is an offset equal to delta, it is removed.
94*0e209d39SAndroid Build Coastguard Worker // delta=[1..maxLength]
shift(int32_t delta)95*0e209d39SAndroid Build Coastguard Worker void shift(int32_t delta) {
96*0e209d39SAndroid Build Coastguard Worker int32_t i=start+delta;
97*0e209d39SAndroid Build Coastguard Worker if(i>=capacity) {
98*0e209d39SAndroid Build Coastguard Worker i-=capacity;
99*0e209d39SAndroid Build Coastguard Worker }
100*0e209d39SAndroid Build Coastguard Worker if(list[i]) {
101*0e209d39SAndroid Build Coastguard Worker list[i]=false;
102*0e209d39SAndroid Build Coastguard Worker --length;
103*0e209d39SAndroid Build Coastguard Worker }
104*0e209d39SAndroid Build Coastguard Worker start=i;
105*0e209d39SAndroid Build Coastguard Worker }
106*0e209d39SAndroid Build Coastguard Worker
107*0e209d39SAndroid Build Coastguard Worker // Add an offset. The list must not contain it yet.
108*0e209d39SAndroid Build Coastguard Worker // offset=[1..maxLength]
addOffset(int32_t offset)109*0e209d39SAndroid Build Coastguard Worker void addOffset(int32_t offset) {
110*0e209d39SAndroid Build Coastguard Worker int32_t i=start+offset;
111*0e209d39SAndroid Build Coastguard Worker if(i>=capacity) {
112*0e209d39SAndroid Build Coastguard Worker i-=capacity;
113*0e209d39SAndroid Build Coastguard Worker }
114*0e209d39SAndroid Build Coastguard Worker list[i]=true;
115*0e209d39SAndroid Build Coastguard Worker ++length;
116*0e209d39SAndroid Build Coastguard Worker }
117*0e209d39SAndroid Build Coastguard Worker
118*0e209d39SAndroid Build Coastguard Worker // offset=[1..maxLength]
containsOffset(int32_t offset) const119*0e209d39SAndroid Build Coastguard Worker UBool containsOffset(int32_t offset) const {
120*0e209d39SAndroid Build Coastguard Worker int32_t i=start+offset;
121*0e209d39SAndroid Build Coastguard Worker if(i>=capacity) {
122*0e209d39SAndroid Build Coastguard Worker i-=capacity;
123*0e209d39SAndroid Build Coastguard Worker }
124*0e209d39SAndroid Build Coastguard Worker return list[i];
125*0e209d39SAndroid Build Coastguard Worker }
126*0e209d39SAndroid Build Coastguard Worker
127*0e209d39SAndroid Build Coastguard Worker // Find the lowest stored offset from a non-empty list, remove it,
128*0e209d39SAndroid Build Coastguard Worker // and reduce all other offsets by this minimum.
129*0e209d39SAndroid Build Coastguard Worker // Returns [1..maxLength].
popMinimum()130*0e209d39SAndroid Build Coastguard Worker int32_t popMinimum() {
131*0e209d39SAndroid Build Coastguard Worker // Look for the next offset in list[start+1..capacity-1].
132*0e209d39SAndroid Build Coastguard Worker int32_t i=start, result;
133*0e209d39SAndroid Build Coastguard Worker while(++i<capacity) {
134*0e209d39SAndroid Build Coastguard Worker if(list[i]) {
135*0e209d39SAndroid Build Coastguard Worker list[i]=false;
136*0e209d39SAndroid Build Coastguard Worker --length;
137*0e209d39SAndroid Build Coastguard Worker result=i-start;
138*0e209d39SAndroid Build Coastguard Worker start=i;
139*0e209d39SAndroid Build Coastguard Worker return result;
140*0e209d39SAndroid Build Coastguard Worker }
141*0e209d39SAndroid Build Coastguard Worker }
142*0e209d39SAndroid Build Coastguard Worker // i==capacity
143*0e209d39SAndroid Build Coastguard Worker
144*0e209d39SAndroid Build Coastguard Worker // Wrap around and look for the next offset in list[0..start].
145*0e209d39SAndroid Build Coastguard Worker // Since the list is not empty, there will be one.
146*0e209d39SAndroid Build Coastguard Worker result=capacity-start;
147*0e209d39SAndroid Build Coastguard Worker i=0;
148*0e209d39SAndroid Build Coastguard Worker while(!list[i]) {
149*0e209d39SAndroid Build Coastguard Worker ++i;
150*0e209d39SAndroid Build Coastguard Worker }
151*0e209d39SAndroid Build Coastguard Worker list[i]=false;
152*0e209d39SAndroid Build Coastguard Worker --length;
153*0e209d39SAndroid Build Coastguard Worker start=i;
154*0e209d39SAndroid Build Coastguard Worker return result+=i;
155*0e209d39SAndroid Build Coastguard Worker }
156*0e209d39SAndroid Build Coastguard Worker
157*0e209d39SAndroid Build Coastguard Worker private:
158*0e209d39SAndroid Build Coastguard Worker UBool *list;
159*0e209d39SAndroid Build Coastguard Worker int32_t capacity;
160*0e209d39SAndroid Build Coastguard Worker int32_t length;
161*0e209d39SAndroid Build Coastguard Worker int32_t start;
162*0e209d39SAndroid Build Coastguard Worker
163*0e209d39SAndroid Build Coastguard Worker UBool staticList[16];
164*0e209d39SAndroid Build Coastguard Worker };
165*0e209d39SAndroid Build Coastguard Worker
166*0e209d39SAndroid Build Coastguard Worker // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167*0e209d39SAndroid Build Coastguard Worker static int32_t
getUTF8Length(const char16_t * s,int32_t length)168*0e209d39SAndroid Build Coastguard Worker getUTF8Length(const char16_t *s, int32_t length) {
169*0e209d39SAndroid Build Coastguard Worker UErrorCode errorCode=U_ZERO_ERROR;
170*0e209d39SAndroid Build Coastguard Worker int32_t length8=0;
171*0e209d39SAndroid Build Coastguard Worker u_strToUTF8(nullptr, 0, &length8, s, length, &errorCode);
172*0e209d39SAndroid Build Coastguard Worker if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173*0e209d39SAndroid Build Coastguard Worker return length8;
174*0e209d39SAndroid Build Coastguard Worker } else {
175*0e209d39SAndroid Build Coastguard Worker // The string contains an unpaired surrogate.
176*0e209d39SAndroid Build Coastguard Worker // Ignore this string.
177*0e209d39SAndroid Build Coastguard Worker return 0;
178*0e209d39SAndroid Build Coastguard Worker }
179*0e209d39SAndroid Build Coastguard Worker }
180*0e209d39SAndroid Build Coastguard Worker
181*0e209d39SAndroid Build Coastguard Worker // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182*0e209d39SAndroid Build Coastguard Worker static int32_t
appendUTF8(const char16_t * s,int32_t length,uint8_t * t,int32_t capacity)183*0e209d39SAndroid Build Coastguard Worker appendUTF8(const char16_t *s, int32_t length, uint8_t *t, int32_t capacity) {
184*0e209d39SAndroid Build Coastguard Worker UErrorCode errorCode=U_ZERO_ERROR;
185*0e209d39SAndroid Build Coastguard Worker int32_t length8=0;
186*0e209d39SAndroid Build Coastguard Worker u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
187*0e209d39SAndroid Build Coastguard Worker if(U_SUCCESS(errorCode)) {
188*0e209d39SAndroid Build Coastguard Worker return length8;
189*0e209d39SAndroid Build Coastguard Worker } else {
190*0e209d39SAndroid Build Coastguard Worker // The string contains an unpaired surrogate.
191*0e209d39SAndroid Build Coastguard Worker // Ignore this string.
192*0e209d39SAndroid Build Coastguard Worker return 0;
193*0e209d39SAndroid Build Coastguard Worker }
194*0e209d39SAndroid Build Coastguard Worker }
195*0e209d39SAndroid Build Coastguard Worker
196*0e209d39SAndroid Build Coastguard Worker static inline uint8_t
makeSpanLengthByte(int32_t spanLength)197*0e209d39SAndroid Build Coastguard Worker makeSpanLengthByte(int32_t spanLength) {
198*0e209d39SAndroid Build Coastguard Worker // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199*0e209d39SAndroid Build Coastguard Worker return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200*0e209d39SAndroid Build Coastguard Worker }
201*0e209d39SAndroid Build Coastguard Worker
202*0e209d39SAndroid Build Coastguard Worker // Construct for all variants of span(), or only for any one variant.
203*0e209d39SAndroid Build Coastguard Worker // Initialize as little as possible, for single use.
UnicodeSetStringSpan(const UnicodeSet & set,const UVector & setStrings,uint32_t which)204*0e209d39SAndroid Build Coastguard Worker UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205*0e209d39SAndroid Build Coastguard Worker const UVector &setStrings,
206*0e209d39SAndroid Build Coastguard Worker uint32_t which)
207*0e209d39SAndroid Build Coastguard Worker : spanSet(0, 0x10ffff), pSpanNotSet(nullptr), strings(setStrings),
208*0e209d39SAndroid Build Coastguard Worker utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
209*0e209d39SAndroid Build Coastguard Worker utf8Length(0),
210*0e209d39SAndroid Build Coastguard Worker maxLength16(0), maxLength8(0),
211*0e209d39SAndroid Build Coastguard Worker all((UBool)(which==ALL)) {
212*0e209d39SAndroid Build Coastguard Worker spanSet.retainAll(set);
213*0e209d39SAndroid Build Coastguard Worker if(which&NOT_CONTAINED) {
214*0e209d39SAndroid Build Coastguard Worker // Default to the same sets.
215*0e209d39SAndroid Build Coastguard Worker // addToSpanNotSet() will create a separate set if necessary.
216*0e209d39SAndroid Build Coastguard Worker pSpanNotSet=&spanSet;
217*0e209d39SAndroid Build Coastguard Worker }
218*0e209d39SAndroid Build Coastguard Worker
219*0e209d39SAndroid Build Coastguard Worker // Determine if the strings even need to be taken into account at all for span() etc.
220*0e209d39SAndroid Build Coastguard Worker // If any string is relevant, then all strings need to be used for
221*0e209d39SAndroid Build Coastguard Worker // span(longest match) but only the relevant ones for span(while contained).
222*0e209d39SAndroid Build Coastguard Worker // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223*0e209d39SAndroid Build Coastguard Worker // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224*0e209d39SAndroid Build Coastguard Worker // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225*0e209d39SAndroid Build Coastguard Worker // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226*0e209d39SAndroid Build Coastguard Worker int32_t stringsLength=strings.size();
227*0e209d39SAndroid Build Coastguard Worker
228*0e209d39SAndroid Build Coastguard Worker int32_t i, spanLength;
229*0e209d39SAndroid Build Coastguard Worker UBool someRelevant=false;
230*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
231*0e209d39SAndroid Build Coastguard Worker const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
232*0e209d39SAndroid Build Coastguard Worker const char16_t *s16=string.getBuffer();
233*0e209d39SAndroid Build Coastguard Worker int32_t length16=string.length();
234*0e209d39SAndroid Build Coastguard Worker if (length16==0) {
235*0e209d39SAndroid Build Coastguard Worker continue; // skip the empty string
236*0e209d39SAndroid Build Coastguard Worker }
237*0e209d39SAndroid Build Coastguard Worker UBool thisRelevant;
238*0e209d39SAndroid Build Coastguard Worker spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
239*0e209d39SAndroid Build Coastguard Worker if(spanLength<length16) { // Relevant string.
240*0e209d39SAndroid Build Coastguard Worker someRelevant=thisRelevant=true;
241*0e209d39SAndroid Build Coastguard Worker } else {
242*0e209d39SAndroid Build Coastguard Worker thisRelevant=false;
243*0e209d39SAndroid Build Coastguard Worker }
244*0e209d39SAndroid Build Coastguard Worker if((which&UTF16) && length16>maxLength16) {
245*0e209d39SAndroid Build Coastguard Worker maxLength16=length16;
246*0e209d39SAndroid Build Coastguard Worker }
247*0e209d39SAndroid Build Coastguard Worker if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
248*0e209d39SAndroid Build Coastguard Worker int32_t length8=getUTF8Length(s16, length16);
249*0e209d39SAndroid Build Coastguard Worker utf8Length+=length8;
250*0e209d39SAndroid Build Coastguard Worker if(length8>maxLength8) {
251*0e209d39SAndroid Build Coastguard Worker maxLength8=length8;
252*0e209d39SAndroid Build Coastguard Worker }
253*0e209d39SAndroid Build Coastguard Worker }
254*0e209d39SAndroid Build Coastguard Worker }
255*0e209d39SAndroid Build Coastguard Worker if(!someRelevant) {
256*0e209d39SAndroid Build Coastguard Worker maxLength16=maxLength8=0;
257*0e209d39SAndroid Build Coastguard Worker return;
258*0e209d39SAndroid Build Coastguard Worker }
259*0e209d39SAndroid Build Coastguard Worker
260*0e209d39SAndroid Build Coastguard Worker // Freeze after checking for the need to use strings at all because freezing
261*0e209d39SAndroid Build Coastguard Worker // a set takes some time and memory which are wasted if there are no relevant strings.
262*0e209d39SAndroid Build Coastguard Worker if(all) {
263*0e209d39SAndroid Build Coastguard Worker spanSet.freeze();
264*0e209d39SAndroid Build Coastguard Worker }
265*0e209d39SAndroid Build Coastguard Worker
266*0e209d39SAndroid Build Coastguard Worker uint8_t *spanBackLengths;
267*0e209d39SAndroid Build Coastguard Worker uint8_t *spanUTF8Lengths;
268*0e209d39SAndroid Build Coastguard Worker uint8_t *spanBackUTF8Lengths;
269*0e209d39SAndroid Build Coastguard Worker
270*0e209d39SAndroid Build Coastguard Worker // Allocate a block of meta data.
271*0e209d39SAndroid Build Coastguard Worker int32_t allocSize;
272*0e209d39SAndroid Build Coastguard Worker if(all) {
273*0e209d39SAndroid Build Coastguard Worker // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
274*0e209d39SAndroid Build Coastguard Worker allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
275*0e209d39SAndroid Build Coastguard Worker } else {
276*0e209d39SAndroid Build Coastguard Worker allocSize=stringsLength; // One set of span lengths.
277*0e209d39SAndroid Build Coastguard Worker if(which&UTF8) {
278*0e209d39SAndroid Build Coastguard Worker // UTF-8 lengths and UTF-8 strings.
279*0e209d39SAndroid Build Coastguard Worker allocSize+=stringsLength*4+utf8Length;
280*0e209d39SAndroid Build Coastguard Worker }
281*0e209d39SAndroid Build Coastguard Worker }
282*0e209d39SAndroid Build Coastguard Worker if(allocSize<=(int32_t)sizeof(staticLengths)) {
283*0e209d39SAndroid Build Coastguard Worker utf8Lengths=staticLengths;
284*0e209d39SAndroid Build Coastguard Worker } else {
285*0e209d39SAndroid Build Coastguard Worker utf8Lengths=(int32_t *)uprv_malloc(allocSize);
286*0e209d39SAndroid Build Coastguard Worker if(utf8Lengths==nullptr) {
287*0e209d39SAndroid Build Coastguard Worker maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return false.
288*0e209d39SAndroid Build Coastguard Worker return; // Out of memory.
289*0e209d39SAndroid Build Coastguard Worker }
290*0e209d39SAndroid Build Coastguard Worker }
291*0e209d39SAndroid Build Coastguard Worker
292*0e209d39SAndroid Build Coastguard Worker if(all) {
293*0e209d39SAndroid Build Coastguard Worker // Store span lengths for all span() variants.
294*0e209d39SAndroid Build Coastguard Worker spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
295*0e209d39SAndroid Build Coastguard Worker spanBackLengths=spanLengths+stringsLength;
296*0e209d39SAndroid Build Coastguard Worker spanUTF8Lengths=spanBackLengths+stringsLength;
297*0e209d39SAndroid Build Coastguard Worker spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
298*0e209d39SAndroid Build Coastguard Worker utf8=spanBackUTF8Lengths+stringsLength;
299*0e209d39SAndroid Build Coastguard Worker } else {
300*0e209d39SAndroid Build Coastguard Worker // Store span lengths for only one span() variant.
301*0e209d39SAndroid Build Coastguard Worker if(which&UTF8) {
302*0e209d39SAndroid Build Coastguard Worker spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
303*0e209d39SAndroid Build Coastguard Worker utf8=spanLengths+stringsLength;
304*0e209d39SAndroid Build Coastguard Worker } else {
305*0e209d39SAndroid Build Coastguard Worker spanLengths=(uint8_t *)utf8Lengths;
306*0e209d39SAndroid Build Coastguard Worker }
307*0e209d39SAndroid Build Coastguard Worker spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
308*0e209d39SAndroid Build Coastguard Worker }
309*0e209d39SAndroid Build Coastguard Worker
310*0e209d39SAndroid Build Coastguard Worker // Set the meta data and pSpanNotSet and write the UTF-8 strings.
311*0e209d39SAndroid Build Coastguard Worker int32_t utf8Count=0; // Count UTF-8 bytes written so far.
312*0e209d39SAndroid Build Coastguard Worker
313*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
314*0e209d39SAndroid Build Coastguard Worker const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
315*0e209d39SAndroid Build Coastguard Worker const char16_t *s16=string.getBuffer();
316*0e209d39SAndroid Build Coastguard Worker int32_t length16=string.length();
317*0e209d39SAndroid Build Coastguard Worker spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
318*0e209d39SAndroid Build Coastguard Worker if(spanLength<length16 && length16>0) { // Relevant string.
319*0e209d39SAndroid Build Coastguard Worker if(which&UTF16) {
320*0e209d39SAndroid Build Coastguard Worker if(which&CONTAINED) {
321*0e209d39SAndroid Build Coastguard Worker if(which&FWD) {
322*0e209d39SAndroid Build Coastguard Worker spanLengths[i]=makeSpanLengthByte(spanLength);
323*0e209d39SAndroid Build Coastguard Worker }
324*0e209d39SAndroid Build Coastguard Worker if(which&BACK) {
325*0e209d39SAndroid Build Coastguard Worker spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
326*0e209d39SAndroid Build Coastguard Worker spanBackLengths[i]=makeSpanLengthByte(spanLength);
327*0e209d39SAndroid Build Coastguard Worker }
328*0e209d39SAndroid Build Coastguard Worker } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
329*0e209d39SAndroid Build Coastguard Worker spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
330*0e209d39SAndroid Build Coastguard Worker }
331*0e209d39SAndroid Build Coastguard Worker }
332*0e209d39SAndroid Build Coastguard Worker if(which&UTF8) {
333*0e209d39SAndroid Build Coastguard Worker uint8_t *s8=utf8+utf8Count;
334*0e209d39SAndroid Build Coastguard Worker int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
335*0e209d39SAndroid Build Coastguard Worker utf8Count+=utf8Lengths[i]=length8;
336*0e209d39SAndroid Build Coastguard Worker if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
337*0e209d39SAndroid Build Coastguard Worker spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
338*0e209d39SAndroid Build Coastguard Worker } else { // Relevant for UTF-8.
339*0e209d39SAndroid Build Coastguard Worker if(which&CONTAINED) {
340*0e209d39SAndroid Build Coastguard Worker if(which&FWD) {
341*0e209d39SAndroid Build Coastguard Worker spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
342*0e209d39SAndroid Build Coastguard Worker spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
343*0e209d39SAndroid Build Coastguard Worker }
344*0e209d39SAndroid Build Coastguard Worker if(which&BACK) {
345*0e209d39SAndroid Build Coastguard Worker spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
346*0e209d39SAndroid Build Coastguard Worker spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
347*0e209d39SAndroid Build Coastguard Worker }
348*0e209d39SAndroid Build Coastguard Worker } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
349*0e209d39SAndroid Build Coastguard Worker spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
350*0e209d39SAndroid Build Coastguard Worker }
351*0e209d39SAndroid Build Coastguard Worker }
352*0e209d39SAndroid Build Coastguard Worker }
353*0e209d39SAndroid Build Coastguard Worker if(which&NOT_CONTAINED) {
354*0e209d39SAndroid Build Coastguard Worker // Add string start and end code points to the spanNotSet so that
355*0e209d39SAndroid Build Coastguard Worker // a span(while not contained) stops before any string.
356*0e209d39SAndroid Build Coastguard Worker UChar32 c;
357*0e209d39SAndroid Build Coastguard Worker if(which&FWD) {
358*0e209d39SAndroid Build Coastguard Worker int32_t len=0;
359*0e209d39SAndroid Build Coastguard Worker U16_NEXT(s16, len, length16, c);
360*0e209d39SAndroid Build Coastguard Worker addToSpanNotSet(c);
361*0e209d39SAndroid Build Coastguard Worker }
362*0e209d39SAndroid Build Coastguard Worker if(which&BACK) {
363*0e209d39SAndroid Build Coastguard Worker int32_t len=length16;
364*0e209d39SAndroid Build Coastguard Worker U16_PREV(s16, 0, len, c);
365*0e209d39SAndroid Build Coastguard Worker addToSpanNotSet(c);
366*0e209d39SAndroid Build Coastguard Worker }
367*0e209d39SAndroid Build Coastguard Worker }
368*0e209d39SAndroid Build Coastguard Worker } else { // Irrelevant string. (Also the empty string.)
369*0e209d39SAndroid Build Coastguard Worker if(which&UTF8) {
370*0e209d39SAndroid Build Coastguard Worker if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
371*0e209d39SAndroid Build Coastguard Worker uint8_t *s8=utf8+utf8Count;
372*0e209d39SAndroid Build Coastguard Worker int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
373*0e209d39SAndroid Build Coastguard Worker utf8Count+=utf8Lengths[i]=length8;
374*0e209d39SAndroid Build Coastguard Worker } else {
375*0e209d39SAndroid Build Coastguard Worker utf8Lengths[i]=0;
376*0e209d39SAndroid Build Coastguard Worker }
377*0e209d39SAndroid Build Coastguard Worker }
378*0e209d39SAndroid Build Coastguard Worker if(all) {
379*0e209d39SAndroid Build Coastguard Worker spanLengths[i]=spanBackLengths[i]=
380*0e209d39SAndroid Build Coastguard Worker spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
381*0e209d39SAndroid Build Coastguard Worker (uint8_t)ALL_CP_CONTAINED;
382*0e209d39SAndroid Build Coastguard Worker } else {
383*0e209d39SAndroid Build Coastguard Worker // All spanXYZLengths pointers contain the same address.
384*0e209d39SAndroid Build Coastguard Worker spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
385*0e209d39SAndroid Build Coastguard Worker }
386*0e209d39SAndroid Build Coastguard Worker }
387*0e209d39SAndroid Build Coastguard Worker }
388*0e209d39SAndroid Build Coastguard Worker
389*0e209d39SAndroid Build Coastguard Worker // Finish.
390*0e209d39SAndroid Build Coastguard Worker if(all) {
391*0e209d39SAndroid Build Coastguard Worker pSpanNotSet->freeze();
392*0e209d39SAndroid Build Coastguard Worker }
393*0e209d39SAndroid Build Coastguard Worker }
394*0e209d39SAndroid Build Coastguard Worker
395*0e209d39SAndroid Build Coastguard Worker // Copy constructor. Assumes which==ALL for a frozen set.
UnicodeSetStringSpan(const UnicodeSetStringSpan & otherStringSpan,const UVector & newParentSetStrings)396*0e209d39SAndroid Build Coastguard Worker UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
397*0e209d39SAndroid Build Coastguard Worker const UVector &newParentSetStrings)
398*0e209d39SAndroid Build Coastguard Worker : spanSet(otherStringSpan.spanSet), pSpanNotSet(nullptr), strings(newParentSetStrings),
399*0e209d39SAndroid Build Coastguard Worker utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
400*0e209d39SAndroid Build Coastguard Worker utf8Length(otherStringSpan.utf8Length),
401*0e209d39SAndroid Build Coastguard Worker maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
402*0e209d39SAndroid Build Coastguard Worker all(true) {
403*0e209d39SAndroid Build Coastguard Worker if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
404*0e209d39SAndroid Build Coastguard Worker pSpanNotSet=&spanSet;
405*0e209d39SAndroid Build Coastguard Worker } else {
406*0e209d39SAndroid Build Coastguard Worker pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
407*0e209d39SAndroid Build Coastguard Worker }
408*0e209d39SAndroid Build Coastguard Worker
409*0e209d39SAndroid Build Coastguard Worker // Allocate a block of meta data.
410*0e209d39SAndroid Build Coastguard Worker // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
411*0e209d39SAndroid Build Coastguard Worker int32_t stringsLength=strings.size();
412*0e209d39SAndroid Build Coastguard Worker int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
413*0e209d39SAndroid Build Coastguard Worker if(allocSize<=(int32_t)sizeof(staticLengths)) {
414*0e209d39SAndroid Build Coastguard Worker utf8Lengths=staticLengths;
415*0e209d39SAndroid Build Coastguard Worker } else {
416*0e209d39SAndroid Build Coastguard Worker utf8Lengths=(int32_t *)uprv_malloc(allocSize);
417*0e209d39SAndroid Build Coastguard Worker if(utf8Lengths==nullptr) {
418*0e209d39SAndroid Build Coastguard Worker maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return false.
419*0e209d39SAndroid Build Coastguard Worker return; // Out of memory.
420*0e209d39SAndroid Build Coastguard Worker }
421*0e209d39SAndroid Build Coastguard Worker }
422*0e209d39SAndroid Build Coastguard Worker
423*0e209d39SAndroid Build Coastguard Worker spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
424*0e209d39SAndroid Build Coastguard Worker utf8=spanLengths+stringsLength*4;
425*0e209d39SAndroid Build Coastguard Worker uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
426*0e209d39SAndroid Build Coastguard Worker }
427*0e209d39SAndroid Build Coastguard Worker
~UnicodeSetStringSpan()428*0e209d39SAndroid Build Coastguard Worker UnicodeSetStringSpan::~UnicodeSetStringSpan() {
429*0e209d39SAndroid Build Coastguard Worker if(pSpanNotSet!=nullptr && pSpanNotSet!=&spanSet) {
430*0e209d39SAndroid Build Coastguard Worker delete pSpanNotSet;
431*0e209d39SAndroid Build Coastguard Worker }
432*0e209d39SAndroid Build Coastguard Worker if(utf8Lengths!=nullptr && utf8Lengths!=staticLengths) {
433*0e209d39SAndroid Build Coastguard Worker uprv_free(utf8Lengths);
434*0e209d39SAndroid Build Coastguard Worker }
435*0e209d39SAndroid Build Coastguard Worker }
436*0e209d39SAndroid Build Coastguard Worker
addToSpanNotSet(UChar32 c)437*0e209d39SAndroid Build Coastguard Worker void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
438*0e209d39SAndroid Build Coastguard Worker if(pSpanNotSet==nullptr || pSpanNotSet==&spanSet) {
439*0e209d39SAndroid Build Coastguard Worker if(spanSet.contains(c)) {
440*0e209d39SAndroid Build Coastguard Worker return; // Nothing to do.
441*0e209d39SAndroid Build Coastguard Worker }
442*0e209d39SAndroid Build Coastguard Worker UnicodeSet *newSet=spanSet.cloneAsThawed();
443*0e209d39SAndroid Build Coastguard Worker if(newSet==nullptr) {
444*0e209d39SAndroid Build Coastguard Worker return; // Out of memory.
445*0e209d39SAndroid Build Coastguard Worker } else {
446*0e209d39SAndroid Build Coastguard Worker pSpanNotSet=newSet;
447*0e209d39SAndroid Build Coastguard Worker }
448*0e209d39SAndroid Build Coastguard Worker }
449*0e209d39SAndroid Build Coastguard Worker pSpanNotSet->add(c);
450*0e209d39SAndroid Build Coastguard Worker }
451*0e209d39SAndroid Build Coastguard Worker
452*0e209d39SAndroid Build Coastguard Worker // Compare strings without any argument checks. Requires length>0.
453*0e209d39SAndroid Build Coastguard Worker static inline UBool
matches16(const char16_t * s,const char16_t * t,int32_t length)454*0e209d39SAndroid Build Coastguard Worker matches16(const char16_t *s, const char16_t *t, int32_t length) {
455*0e209d39SAndroid Build Coastguard Worker do {
456*0e209d39SAndroid Build Coastguard Worker if(*s++!=*t++) {
457*0e209d39SAndroid Build Coastguard Worker return false;
458*0e209d39SAndroid Build Coastguard Worker }
459*0e209d39SAndroid Build Coastguard Worker } while(--length>0);
460*0e209d39SAndroid Build Coastguard Worker return true;
461*0e209d39SAndroid Build Coastguard Worker }
462*0e209d39SAndroid Build Coastguard Worker
463*0e209d39SAndroid Build Coastguard Worker static inline UBool
matches8(const uint8_t * s,const uint8_t * t,int32_t length)464*0e209d39SAndroid Build Coastguard Worker matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
465*0e209d39SAndroid Build Coastguard Worker do {
466*0e209d39SAndroid Build Coastguard Worker if(*s++!=*t++) {
467*0e209d39SAndroid Build Coastguard Worker return false;
468*0e209d39SAndroid Build Coastguard Worker }
469*0e209d39SAndroid Build Coastguard Worker } while(--length>0);
470*0e209d39SAndroid Build Coastguard Worker return true;
471*0e209d39SAndroid Build Coastguard Worker }
472*0e209d39SAndroid Build Coastguard Worker
473*0e209d39SAndroid Build Coastguard Worker // Compare 16-bit Unicode strings (which may be malformed UTF-16)
474*0e209d39SAndroid Build Coastguard Worker // at code point boundaries.
475*0e209d39SAndroid Build Coastguard Worker // That is, each edge of a match must not be in the middle of a surrogate pair.
476*0e209d39SAndroid Build Coastguard Worker static inline UBool
matches16CPB(const char16_t * s,int32_t start,int32_t limit,const char16_t * t,int32_t length)477*0e209d39SAndroid Build Coastguard Worker matches16CPB(const char16_t *s, int32_t start, int32_t limit, const char16_t *t, int32_t length) {
478*0e209d39SAndroid Build Coastguard Worker s+=start;
479*0e209d39SAndroid Build Coastguard Worker limit-=start;
480*0e209d39SAndroid Build Coastguard Worker return matches16(s, t, length) &&
481*0e209d39SAndroid Build Coastguard Worker !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
482*0e209d39SAndroid Build Coastguard Worker !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
483*0e209d39SAndroid Build Coastguard Worker }
484*0e209d39SAndroid Build Coastguard Worker
485*0e209d39SAndroid Build Coastguard Worker // Does the set contain the next code point?
486*0e209d39SAndroid Build Coastguard Worker // If so, return its length; otherwise return its negative length.
487*0e209d39SAndroid Build Coastguard Worker static inline int32_t
spanOne(const UnicodeSet & set,const char16_t * s,int32_t length)488*0e209d39SAndroid Build Coastguard Worker spanOne(const UnicodeSet &set, const char16_t *s, int32_t length) {
489*0e209d39SAndroid Build Coastguard Worker char16_t c=*s, c2;
490*0e209d39SAndroid Build Coastguard Worker if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
491*0e209d39SAndroid Build Coastguard Worker return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
492*0e209d39SAndroid Build Coastguard Worker }
493*0e209d39SAndroid Build Coastguard Worker return set.contains(c) ? 1 : -1;
494*0e209d39SAndroid Build Coastguard Worker }
495*0e209d39SAndroid Build Coastguard Worker
496*0e209d39SAndroid Build Coastguard Worker static inline int32_t
spanOneBack(const UnicodeSet & set,const char16_t * s,int32_t length)497*0e209d39SAndroid Build Coastguard Worker spanOneBack(const UnicodeSet &set, const char16_t *s, int32_t length) {
498*0e209d39SAndroid Build Coastguard Worker char16_t c=s[length-1], c2;
499*0e209d39SAndroid Build Coastguard Worker if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
500*0e209d39SAndroid Build Coastguard Worker return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
501*0e209d39SAndroid Build Coastguard Worker }
502*0e209d39SAndroid Build Coastguard Worker return set.contains(c) ? 1 : -1;
503*0e209d39SAndroid Build Coastguard Worker }
504*0e209d39SAndroid Build Coastguard Worker
505*0e209d39SAndroid Build Coastguard Worker static inline int32_t
spanOneUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)506*0e209d39SAndroid Build Coastguard Worker spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
507*0e209d39SAndroid Build Coastguard Worker UChar32 c=*s;
508*0e209d39SAndroid Build Coastguard Worker if(U8_IS_SINGLE(c)) {
509*0e209d39SAndroid Build Coastguard Worker return set.contains(c) ? 1 : -1;
510*0e209d39SAndroid Build Coastguard Worker }
511*0e209d39SAndroid Build Coastguard Worker // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
512*0e209d39SAndroid Build Coastguard Worker int32_t i=0;
513*0e209d39SAndroid Build Coastguard Worker U8_NEXT_OR_FFFD(s, i, length, c);
514*0e209d39SAndroid Build Coastguard Worker return set.contains(c) ? i : -i;
515*0e209d39SAndroid Build Coastguard Worker }
516*0e209d39SAndroid Build Coastguard Worker
517*0e209d39SAndroid Build Coastguard Worker static inline int32_t
spanOneBackUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)518*0e209d39SAndroid Build Coastguard Worker spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
519*0e209d39SAndroid Build Coastguard Worker UChar32 c=s[length-1];
520*0e209d39SAndroid Build Coastguard Worker if(U8_IS_SINGLE(c)) {
521*0e209d39SAndroid Build Coastguard Worker return set.contains(c) ? 1 : -1;
522*0e209d39SAndroid Build Coastguard Worker }
523*0e209d39SAndroid Build Coastguard Worker int32_t i=length-1;
524*0e209d39SAndroid Build Coastguard Worker c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
525*0e209d39SAndroid Build Coastguard Worker length-=i;
526*0e209d39SAndroid Build Coastguard Worker return set.contains(c) ? length : -length;
527*0e209d39SAndroid Build Coastguard Worker }
528*0e209d39SAndroid Build Coastguard Worker
529*0e209d39SAndroid Build Coastguard Worker /*
530*0e209d39SAndroid Build Coastguard Worker * Note: In span() when spanLength==0 (after a string match, or at the beginning
531*0e209d39SAndroid Build Coastguard Worker * after an empty code point span) and in spanNot() and spanNotUTF8(),
532*0e209d39SAndroid Build Coastguard Worker * string matching could use a binary search
533*0e209d39SAndroid Build Coastguard Worker * because all string matches are done from the same start index.
534*0e209d39SAndroid Build Coastguard Worker *
535*0e209d39SAndroid Build Coastguard Worker * For UTF-8, this would require a comparison function that returns UTF-16 order.
536*0e209d39SAndroid Build Coastguard Worker *
537*0e209d39SAndroid Build Coastguard Worker * This optimization should not be necessary for normal UnicodeSets because
538*0e209d39SAndroid Build Coastguard Worker * most sets have no strings, and most sets with strings have
539*0e209d39SAndroid Build Coastguard Worker * very few very short strings.
540*0e209d39SAndroid Build Coastguard Worker * For cases with many strings, it might be better to use a different API
541*0e209d39SAndroid Build Coastguard Worker * and implementation with a DFA (state machine).
542*0e209d39SAndroid Build Coastguard Worker */
543*0e209d39SAndroid Build Coastguard Worker
544*0e209d39SAndroid Build Coastguard Worker /*
545*0e209d39SAndroid Build Coastguard Worker * Algorithm for span(USET_SPAN_CONTAINED)
546*0e209d39SAndroid Build Coastguard Worker *
547*0e209d39SAndroid Build Coastguard Worker * Theoretical algorithm:
548*0e209d39SAndroid Build Coastguard Worker * - Iterate through the string, and at each code point boundary:
549*0e209d39SAndroid Build Coastguard Worker * + If the code point there is in the set, then remember to continue after it.
550*0e209d39SAndroid Build Coastguard Worker * + If a set string matches at the current position, then remember to continue after it.
551*0e209d39SAndroid Build Coastguard Worker * + Either recursively span for each code point or string match,
552*0e209d39SAndroid Build Coastguard Worker * or recursively span for all but the shortest one and
553*0e209d39SAndroid Build Coastguard Worker * iteratively continue the span with the shortest local match.
554*0e209d39SAndroid Build Coastguard Worker * + Remember the longest recursive span (the farthest end point).
555*0e209d39SAndroid Build Coastguard Worker * + If there is no match at the current position, neither for the code point there
556*0e209d39SAndroid Build Coastguard Worker * nor for any set string, then stop and return the longest recursive span length.
557*0e209d39SAndroid Build Coastguard Worker *
558*0e209d39SAndroid Build Coastguard Worker * Optimized implementation:
559*0e209d39SAndroid Build Coastguard Worker *
560*0e209d39SAndroid Build Coastguard Worker * (We assume that most sets will have very few very short strings.
561*0e209d39SAndroid Build Coastguard Worker * A span using a string-less set is extremely fast.)
562*0e209d39SAndroid Build Coastguard Worker *
563*0e209d39SAndroid Build Coastguard Worker * Create and cache a spanSet which contains all of the single code points
564*0e209d39SAndroid Build Coastguard Worker * of the original set but none of its strings.
565*0e209d39SAndroid Build Coastguard Worker *
566*0e209d39SAndroid Build Coastguard Worker * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
567*0e209d39SAndroid Build Coastguard Worker * - Loop:
568*0e209d39SAndroid Build Coastguard Worker * + Try to match each set string at the end of the spanLength.
569*0e209d39SAndroid Build Coastguard Worker * ~ Set strings that start with set-contained code points must be matched
570*0e209d39SAndroid Build Coastguard Worker * with a partial overlap because the recursive algorithm would have tried
571*0e209d39SAndroid Build Coastguard Worker * to match them at every position.
572*0e209d39SAndroid Build Coastguard Worker * ~ Set strings that entirely consist of set-contained code points
573*0e209d39SAndroid Build Coastguard Worker * are irrelevant for span(USET_SPAN_CONTAINED) because the
574*0e209d39SAndroid Build Coastguard Worker * recursive algorithm would continue after them anyway
575*0e209d39SAndroid Build Coastguard Worker * and find the longest recursive match from their end.
576*0e209d39SAndroid Build Coastguard Worker * ~ Rather than recursing, note each end point of a set string match.
577*0e209d39SAndroid Build Coastguard Worker * + If no set string matched after spanSet.span(), then return
578*0e209d39SAndroid Build Coastguard Worker * with where the spanSet.span() ended.
579*0e209d39SAndroid Build Coastguard Worker * + If at least one set string matched after spanSet.span(), then
580*0e209d39SAndroid Build Coastguard Worker * pop the shortest string match end point and continue
581*0e209d39SAndroid Build Coastguard Worker * the loop, trying to match all set strings from there.
582*0e209d39SAndroid Build Coastguard Worker * + If at least one more set string matched after a previous string match,
583*0e209d39SAndroid Build Coastguard Worker * then test if the code point after the previous string match is also
584*0e209d39SAndroid Build Coastguard Worker * contained in the set.
585*0e209d39SAndroid Build Coastguard Worker * Continue the loop with the shortest end point of either this code point
586*0e209d39SAndroid Build Coastguard Worker * or a matching set string.
587*0e209d39SAndroid Build Coastguard Worker * + If no more set string matched after a previous string match,
588*0e209d39SAndroid Build Coastguard Worker * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
589*0e209d39SAndroid Build Coastguard Worker * Stop if spanLength==0, otherwise continue the loop.
590*0e209d39SAndroid Build Coastguard Worker *
591*0e209d39SAndroid Build Coastguard Worker * By noting each end point of a set string match,
592*0e209d39SAndroid Build Coastguard Worker * the function visits each string position at most once and finishes
593*0e209d39SAndroid Build Coastguard Worker * in linear time.
594*0e209d39SAndroid Build Coastguard Worker *
595*0e209d39SAndroid Build Coastguard Worker * The recursive algorithm may visit the same string position many times
596*0e209d39SAndroid Build Coastguard Worker * if multiple paths lead to it and finishes in exponential time.
597*0e209d39SAndroid Build Coastguard Worker */
598*0e209d39SAndroid Build Coastguard Worker
599*0e209d39SAndroid Build Coastguard Worker /*
600*0e209d39SAndroid Build Coastguard Worker * Algorithm for span(USET_SPAN_SIMPLE)
601*0e209d39SAndroid Build Coastguard Worker *
602*0e209d39SAndroid Build Coastguard Worker * Theoretical algorithm:
603*0e209d39SAndroid Build Coastguard Worker * - Iterate through the string, and at each code point boundary:
604*0e209d39SAndroid Build Coastguard Worker * + If the code point there is in the set, then remember to continue after it.
605*0e209d39SAndroid Build Coastguard Worker * + If a set string matches at the current position, then remember to continue after it.
606*0e209d39SAndroid Build Coastguard Worker * + Continue from the farthest match position and ignore all others.
607*0e209d39SAndroid Build Coastguard Worker * + If there is no match at the current position,
608*0e209d39SAndroid Build Coastguard Worker * then stop and return the current position.
609*0e209d39SAndroid Build Coastguard Worker *
610*0e209d39SAndroid Build Coastguard Worker * Optimized implementation:
611*0e209d39SAndroid Build Coastguard Worker *
612*0e209d39SAndroid Build Coastguard Worker * (Same assumption and spanSet as above.)
613*0e209d39SAndroid Build Coastguard Worker *
614*0e209d39SAndroid Build Coastguard Worker * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
615*0e209d39SAndroid Build Coastguard Worker * - Loop:
616*0e209d39SAndroid Build Coastguard Worker * + Try to match each set string at the end of the spanLength.
617*0e209d39SAndroid Build Coastguard Worker * ~ Set strings that start with set-contained code points must be matched
618*0e209d39SAndroid Build Coastguard Worker * with a partial overlap because the standard algorithm would have tried
619*0e209d39SAndroid Build Coastguard Worker * to match them earlier.
620*0e209d39SAndroid Build Coastguard Worker * ~ Set strings that entirely consist of set-contained code points
621*0e209d39SAndroid Build Coastguard Worker * must be matched with a full overlap because the longest-match algorithm
622*0e209d39SAndroid Build Coastguard Worker * would hide set string matches that end earlier.
623*0e209d39SAndroid Build Coastguard Worker * Such set strings need not be matched earlier inside the code point span
624*0e209d39SAndroid Build Coastguard Worker * because the standard algorithm would then have continued after
625*0e209d39SAndroid Build Coastguard Worker * the set string match anyway.
626*0e209d39SAndroid Build Coastguard Worker * ~ Remember the longest set string match (farthest end point) from the earliest
627*0e209d39SAndroid Build Coastguard Worker * starting point.
628*0e209d39SAndroid Build Coastguard Worker * + If no set string matched after spanSet.span(), then return
629*0e209d39SAndroid Build Coastguard Worker * with where the spanSet.span() ended.
630*0e209d39SAndroid Build Coastguard Worker * + If at least one set string matched, then continue the loop after the
631*0e209d39SAndroid Build Coastguard Worker * longest match from the earliest position.
632*0e209d39SAndroid Build Coastguard Worker * + If no more set string matched after a previous string match,
633*0e209d39SAndroid Build Coastguard Worker * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
634*0e209d39SAndroid Build Coastguard Worker * Stop if spanLength==0, otherwise continue the loop.
635*0e209d39SAndroid Build Coastguard Worker */
636*0e209d39SAndroid Build Coastguard Worker
span(const char16_t * s,int32_t length,USetSpanCondition spanCondition) const637*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
638*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_NOT_CONTAINED) {
639*0e209d39SAndroid Build Coastguard Worker return spanNot(s, length);
640*0e209d39SAndroid Build Coastguard Worker }
641*0e209d39SAndroid Build Coastguard Worker int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
642*0e209d39SAndroid Build Coastguard Worker if(spanLength==length) {
643*0e209d39SAndroid Build Coastguard Worker return length;
644*0e209d39SAndroid Build Coastguard Worker }
645*0e209d39SAndroid Build Coastguard Worker
646*0e209d39SAndroid Build Coastguard Worker // Consider strings; they may overlap with the span.
647*0e209d39SAndroid Build Coastguard Worker OffsetList offsets;
648*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_CONTAINED) {
649*0e209d39SAndroid Build Coastguard Worker // Use offset list to try all possibilities.
650*0e209d39SAndroid Build Coastguard Worker offsets.setMaxLength(maxLength16);
651*0e209d39SAndroid Build Coastguard Worker }
652*0e209d39SAndroid Build Coastguard Worker int32_t pos=spanLength, rest=length-pos;
653*0e209d39SAndroid Build Coastguard Worker int32_t i, stringsLength=strings.size();
654*0e209d39SAndroid Build Coastguard Worker for(;;) {
655*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_CONTAINED) {
656*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
657*0e209d39SAndroid Build Coastguard Worker int32_t overlap=spanLengths[i];
658*0e209d39SAndroid Build Coastguard Worker if(overlap==ALL_CP_CONTAINED) {
659*0e209d39SAndroid Build Coastguard Worker continue; // Irrelevant string. (Also the empty string.)
660*0e209d39SAndroid Build Coastguard Worker }
661*0e209d39SAndroid Build Coastguard Worker const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
662*0e209d39SAndroid Build Coastguard Worker const char16_t *s16=string.getBuffer();
663*0e209d39SAndroid Build Coastguard Worker int32_t length16=string.length();
664*0e209d39SAndroid Build Coastguard Worker U_ASSERT(length>0);
665*0e209d39SAndroid Build Coastguard Worker
666*0e209d39SAndroid Build Coastguard Worker // Try to match this string at pos-overlap..pos.
667*0e209d39SAndroid Build Coastguard Worker if(overlap>=LONG_SPAN) {
668*0e209d39SAndroid Build Coastguard Worker overlap=length16;
669*0e209d39SAndroid Build Coastguard Worker // While contained: No point matching fully inside the code point span.
670*0e209d39SAndroid Build Coastguard Worker U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
671*0e209d39SAndroid Build Coastguard Worker }
672*0e209d39SAndroid Build Coastguard Worker if(overlap>spanLength) {
673*0e209d39SAndroid Build Coastguard Worker overlap=spanLength;
674*0e209d39SAndroid Build Coastguard Worker }
675*0e209d39SAndroid Build Coastguard Worker int32_t inc=length16-overlap; // Keep overlap+inc==length16.
676*0e209d39SAndroid Build Coastguard Worker for(;;) {
677*0e209d39SAndroid Build Coastguard Worker if(inc>rest) {
678*0e209d39SAndroid Build Coastguard Worker break;
679*0e209d39SAndroid Build Coastguard Worker }
680*0e209d39SAndroid Build Coastguard Worker // Try to match if the increment is not listed already.
681*0e209d39SAndroid Build Coastguard Worker if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
682*0e209d39SAndroid Build Coastguard Worker if(inc==rest) {
683*0e209d39SAndroid Build Coastguard Worker return length; // Reached the end of the string.
684*0e209d39SAndroid Build Coastguard Worker }
685*0e209d39SAndroid Build Coastguard Worker offsets.addOffset(inc);
686*0e209d39SAndroid Build Coastguard Worker }
687*0e209d39SAndroid Build Coastguard Worker if(overlap==0) {
688*0e209d39SAndroid Build Coastguard Worker break;
689*0e209d39SAndroid Build Coastguard Worker }
690*0e209d39SAndroid Build Coastguard Worker --overlap;
691*0e209d39SAndroid Build Coastguard Worker ++inc;
692*0e209d39SAndroid Build Coastguard Worker }
693*0e209d39SAndroid Build Coastguard Worker }
694*0e209d39SAndroid Build Coastguard Worker } else /* USET_SPAN_SIMPLE */ {
695*0e209d39SAndroid Build Coastguard Worker int32_t maxInc=0, maxOverlap=0;
696*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
697*0e209d39SAndroid Build Coastguard Worker int32_t overlap=spanLengths[i];
698*0e209d39SAndroid Build Coastguard Worker // For longest match, we do need to try to match even an all-contained string
699*0e209d39SAndroid Build Coastguard Worker // to find the match from the earliest start.
700*0e209d39SAndroid Build Coastguard Worker
701*0e209d39SAndroid Build Coastguard Worker const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
702*0e209d39SAndroid Build Coastguard Worker const char16_t *s16=string.getBuffer();
703*0e209d39SAndroid Build Coastguard Worker int32_t length16=string.length();
704*0e209d39SAndroid Build Coastguard Worker if (length16==0) {
705*0e209d39SAndroid Build Coastguard Worker continue; // skip the empty string
706*0e209d39SAndroid Build Coastguard Worker }
707*0e209d39SAndroid Build Coastguard Worker
708*0e209d39SAndroid Build Coastguard Worker // Try to match this string at pos-overlap..pos.
709*0e209d39SAndroid Build Coastguard Worker if(overlap>=LONG_SPAN) {
710*0e209d39SAndroid Build Coastguard Worker overlap=length16;
711*0e209d39SAndroid Build Coastguard Worker // Longest match: Need to match fully inside the code point span
712*0e209d39SAndroid Build Coastguard Worker // to find the match from the earliest start.
713*0e209d39SAndroid Build Coastguard Worker }
714*0e209d39SAndroid Build Coastguard Worker if(overlap>spanLength) {
715*0e209d39SAndroid Build Coastguard Worker overlap=spanLength;
716*0e209d39SAndroid Build Coastguard Worker }
717*0e209d39SAndroid Build Coastguard Worker int32_t inc=length16-overlap; // Keep overlap+inc==length16.
718*0e209d39SAndroid Build Coastguard Worker for(;;) {
719*0e209d39SAndroid Build Coastguard Worker if(inc>rest || overlap<maxOverlap) {
720*0e209d39SAndroid Build Coastguard Worker break;
721*0e209d39SAndroid Build Coastguard Worker }
722*0e209d39SAndroid Build Coastguard Worker // Try to match if the string is longer or starts earlier.
723*0e209d39SAndroid Build Coastguard Worker if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
724*0e209d39SAndroid Build Coastguard Worker matches16CPB(s, pos-overlap, length, s16, length16)
725*0e209d39SAndroid Build Coastguard Worker ) {
726*0e209d39SAndroid Build Coastguard Worker maxInc=inc; // Longest match from earliest start.
727*0e209d39SAndroid Build Coastguard Worker maxOverlap=overlap;
728*0e209d39SAndroid Build Coastguard Worker break;
729*0e209d39SAndroid Build Coastguard Worker }
730*0e209d39SAndroid Build Coastguard Worker --overlap;
731*0e209d39SAndroid Build Coastguard Worker ++inc;
732*0e209d39SAndroid Build Coastguard Worker }
733*0e209d39SAndroid Build Coastguard Worker }
734*0e209d39SAndroid Build Coastguard Worker
735*0e209d39SAndroid Build Coastguard Worker if(maxInc!=0 || maxOverlap!=0) {
736*0e209d39SAndroid Build Coastguard Worker // Longest-match algorithm, and there was a string match.
737*0e209d39SAndroid Build Coastguard Worker // Simply continue after it.
738*0e209d39SAndroid Build Coastguard Worker pos+=maxInc;
739*0e209d39SAndroid Build Coastguard Worker rest-=maxInc;
740*0e209d39SAndroid Build Coastguard Worker if(rest==0) {
741*0e209d39SAndroid Build Coastguard Worker return length; // Reached the end of the string.
742*0e209d39SAndroid Build Coastguard Worker }
743*0e209d39SAndroid Build Coastguard Worker spanLength=0; // Match strings from after a string match.
744*0e209d39SAndroid Build Coastguard Worker continue;
745*0e209d39SAndroid Build Coastguard Worker }
746*0e209d39SAndroid Build Coastguard Worker }
747*0e209d39SAndroid Build Coastguard Worker // Finished trying to match all strings at pos.
748*0e209d39SAndroid Build Coastguard Worker
749*0e209d39SAndroid Build Coastguard Worker if(spanLength!=0 || pos==0) {
750*0e209d39SAndroid Build Coastguard Worker // The position is after an unlimited code point span (spanLength!=0),
751*0e209d39SAndroid Build Coastguard Worker // not after a string match.
752*0e209d39SAndroid Build Coastguard Worker // The only position where spanLength==0 after a span is pos==0.
753*0e209d39SAndroid Build Coastguard Worker // Otherwise, an unlimited code point span is only tried again when no
754*0e209d39SAndroid Build Coastguard Worker // strings match, and if such a non-initial span fails we stop.
755*0e209d39SAndroid Build Coastguard Worker if(offsets.isEmpty()) {
756*0e209d39SAndroid Build Coastguard Worker return pos; // No strings matched after a span.
757*0e209d39SAndroid Build Coastguard Worker }
758*0e209d39SAndroid Build Coastguard Worker // Match strings from after the next string match.
759*0e209d39SAndroid Build Coastguard Worker } else {
760*0e209d39SAndroid Build Coastguard Worker // The position is after a string match (or a single code point).
761*0e209d39SAndroid Build Coastguard Worker if(offsets.isEmpty()) {
762*0e209d39SAndroid Build Coastguard Worker // No more strings matched after a previous string match.
763*0e209d39SAndroid Build Coastguard Worker // Try another code point span from after the last string match.
764*0e209d39SAndroid Build Coastguard Worker spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
765*0e209d39SAndroid Build Coastguard Worker if( spanLength==rest || // Reached the end of the string, or
766*0e209d39SAndroid Build Coastguard Worker spanLength==0 // neither strings nor span progressed.
767*0e209d39SAndroid Build Coastguard Worker ) {
768*0e209d39SAndroid Build Coastguard Worker return pos+spanLength;
769*0e209d39SAndroid Build Coastguard Worker }
770*0e209d39SAndroid Build Coastguard Worker pos+=spanLength;
771*0e209d39SAndroid Build Coastguard Worker rest-=spanLength;
772*0e209d39SAndroid Build Coastguard Worker continue; // spanLength>0: Match strings from after a span.
773*0e209d39SAndroid Build Coastguard Worker } else {
774*0e209d39SAndroid Build Coastguard Worker // Try to match only one code point from after a string match if some
775*0e209d39SAndroid Build Coastguard Worker // string matched beyond it, so that we try all possible positions
776*0e209d39SAndroid Build Coastguard Worker // and don't overshoot.
777*0e209d39SAndroid Build Coastguard Worker spanLength=spanOne(spanSet, s+pos, rest);
778*0e209d39SAndroid Build Coastguard Worker if(spanLength>0) {
779*0e209d39SAndroid Build Coastguard Worker if(spanLength==rest) {
780*0e209d39SAndroid Build Coastguard Worker return length; // Reached the end of the string.
781*0e209d39SAndroid Build Coastguard Worker }
782*0e209d39SAndroid Build Coastguard Worker // Match strings after this code point.
783*0e209d39SAndroid Build Coastguard Worker // There cannot be any increments below it because UnicodeSet strings
784*0e209d39SAndroid Build Coastguard Worker // contain multiple code points.
785*0e209d39SAndroid Build Coastguard Worker pos+=spanLength;
786*0e209d39SAndroid Build Coastguard Worker rest-=spanLength;
787*0e209d39SAndroid Build Coastguard Worker offsets.shift(spanLength);
788*0e209d39SAndroid Build Coastguard Worker spanLength=0;
789*0e209d39SAndroid Build Coastguard Worker continue; // Match strings from after a single code point.
790*0e209d39SAndroid Build Coastguard Worker }
791*0e209d39SAndroid Build Coastguard Worker // Match strings from after the next string match.
792*0e209d39SAndroid Build Coastguard Worker }
793*0e209d39SAndroid Build Coastguard Worker }
794*0e209d39SAndroid Build Coastguard Worker int32_t minOffset=offsets.popMinimum();
795*0e209d39SAndroid Build Coastguard Worker pos+=minOffset;
796*0e209d39SAndroid Build Coastguard Worker rest-=minOffset;
797*0e209d39SAndroid Build Coastguard Worker spanLength=0; // Match strings from after a string match.
798*0e209d39SAndroid Build Coastguard Worker }
799*0e209d39SAndroid Build Coastguard Worker }
800*0e209d39SAndroid Build Coastguard Worker
spanBack(const char16_t * s,int32_t length,USetSpanCondition spanCondition) const801*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
802*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_NOT_CONTAINED) {
803*0e209d39SAndroid Build Coastguard Worker return spanNotBack(s, length);
804*0e209d39SAndroid Build Coastguard Worker }
805*0e209d39SAndroid Build Coastguard Worker int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
806*0e209d39SAndroid Build Coastguard Worker if(pos==0) {
807*0e209d39SAndroid Build Coastguard Worker return 0;
808*0e209d39SAndroid Build Coastguard Worker }
809*0e209d39SAndroid Build Coastguard Worker int32_t spanLength=length-pos;
810*0e209d39SAndroid Build Coastguard Worker
811*0e209d39SAndroid Build Coastguard Worker // Consider strings; they may overlap with the span.
812*0e209d39SAndroid Build Coastguard Worker OffsetList offsets;
813*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_CONTAINED) {
814*0e209d39SAndroid Build Coastguard Worker // Use offset list to try all possibilities.
815*0e209d39SAndroid Build Coastguard Worker offsets.setMaxLength(maxLength16);
816*0e209d39SAndroid Build Coastguard Worker }
817*0e209d39SAndroid Build Coastguard Worker int32_t i, stringsLength=strings.size();
818*0e209d39SAndroid Build Coastguard Worker uint8_t *spanBackLengths=spanLengths;
819*0e209d39SAndroid Build Coastguard Worker if(all) {
820*0e209d39SAndroid Build Coastguard Worker spanBackLengths+=stringsLength;
821*0e209d39SAndroid Build Coastguard Worker }
822*0e209d39SAndroid Build Coastguard Worker for(;;) {
823*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_CONTAINED) {
824*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
825*0e209d39SAndroid Build Coastguard Worker int32_t overlap=spanBackLengths[i];
826*0e209d39SAndroid Build Coastguard Worker if(overlap==ALL_CP_CONTAINED) {
827*0e209d39SAndroid Build Coastguard Worker continue; // Irrelevant string. (Also the empty string.)
828*0e209d39SAndroid Build Coastguard Worker }
829*0e209d39SAndroid Build Coastguard Worker const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
830*0e209d39SAndroid Build Coastguard Worker const char16_t *s16=string.getBuffer();
831*0e209d39SAndroid Build Coastguard Worker int32_t length16=string.length();
832*0e209d39SAndroid Build Coastguard Worker U_ASSERT(length>0);
833*0e209d39SAndroid Build Coastguard Worker
834*0e209d39SAndroid Build Coastguard Worker // Try to match this string at pos-(length16-overlap)..pos-length16.
835*0e209d39SAndroid Build Coastguard Worker if(overlap>=LONG_SPAN) {
836*0e209d39SAndroid Build Coastguard Worker overlap=length16;
837*0e209d39SAndroid Build Coastguard Worker // While contained: No point matching fully inside the code point span.
838*0e209d39SAndroid Build Coastguard Worker int32_t len1=0;
839*0e209d39SAndroid Build Coastguard Worker U16_FWD_1(s16, len1, overlap);
840*0e209d39SAndroid Build Coastguard Worker overlap-=len1; // Length of the string minus the first code point.
841*0e209d39SAndroid Build Coastguard Worker }
842*0e209d39SAndroid Build Coastguard Worker if(overlap>spanLength) {
843*0e209d39SAndroid Build Coastguard Worker overlap=spanLength;
844*0e209d39SAndroid Build Coastguard Worker }
845*0e209d39SAndroid Build Coastguard Worker int32_t dec=length16-overlap; // Keep dec+overlap==length16.
846*0e209d39SAndroid Build Coastguard Worker for(;;) {
847*0e209d39SAndroid Build Coastguard Worker if(dec>pos) {
848*0e209d39SAndroid Build Coastguard Worker break;
849*0e209d39SAndroid Build Coastguard Worker }
850*0e209d39SAndroid Build Coastguard Worker // Try to match if the decrement is not listed already.
851*0e209d39SAndroid Build Coastguard Worker if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
852*0e209d39SAndroid Build Coastguard Worker if(dec==pos) {
853*0e209d39SAndroid Build Coastguard Worker return 0; // Reached the start of the string.
854*0e209d39SAndroid Build Coastguard Worker }
855*0e209d39SAndroid Build Coastguard Worker offsets.addOffset(dec);
856*0e209d39SAndroid Build Coastguard Worker }
857*0e209d39SAndroid Build Coastguard Worker if(overlap==0) {
858*0e209d39SAndroid Build Coastguard Worker break;
859*0e209d39SAndroid Build Coastguard Worker }
860*0e209d39SAndroid Build Coastguard Worker --overlap;
861*0e209d39SAndroid Build Coastguard Worker ++dec;
862*0e209d39SAndroid Build Coastguard Worker }
863*0e209d39SAndroid Build Coastguard Worker }
864*0e209d39SAndroid Build Coastguard Worker } else /* USET_SPAN_SIMPLE */ {
865*0e209d39SAndroid Build Coastguard Worker int32_t maxDec=0, maxOverlap=0;
866*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
867*0e209d39SAndroid Build Coastguard Worker int32_t overlap=spanBackLengths[i];
868*0e209d39SAndroid Build Coastguard Worker // For longest match, we do need to try to match even an all-contained string
869*0e209d39SAndroid Build Coastguard Worker // to find the match from the latest end.
870*0e209d39SAndroid Build Coastguard Worker
871*0e209d39SAndroid Build Coastguard Worker const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
872*0e209d39SAndroid Build Coastguard Worker const char16_t *s16=string.getBuffer();
873*0e209d39SAndroid Build Coastguard Worker int32_t length16=string.length();
874*0e209d39SAndroid Build Coastguard Worker if (length16==0) {
875*0e209d39SAndroid Build Coastguard Worker continue; // skip the empty string
876*0e209d39SAndroid Build Coastguard Worker }
877*0e209d39SAndroid Build Coastguard Worker
878*0e209d39SAndroid Build Coastguard Worker // Try to match this string at pos-(length16-overlap)..pos-length16.
879*0e209d39SAndroid Build Coastguard Worker if(overlap>=LONG_SPAN) {
880*0e209d39SAndroid Build Coastguard Worker overlap=length16;
881*0e209d39SAndroid Build Coastguard Worker // Longest match: Need to match fully inside the code point span
882*0e209d39SAndroid Build Coastguard Worker // to find the match from the latest end.
883*0e209d39SAndroid Build Coastguard Worker }
884*0e209d39SAndroid Build Coastguard Worker if(overlap>spanLength) {
885*0e209d39SAndroid Build Coastguard Worker overlap=spanLength;
886*0e209d39SAndroid Build Coastguard Worker }
887*0e209d39SAndroid Build Coastguard Worker int32_t dec=length16-overlap; // Keep dec+overlap==length16.
888*0e209d39SAndroid Build Coastguard Worker for(;;) {
889*0e209d39SAndroid Build Coastguard Worker if(dec>pos || overlap<maxOverlap) {
890*0e209d39SAndroid Build Coastguard Worker break;
891*0e209d39SAndroid Build Coastguard Worker }
892*0e209d39SAndroid Build Coastguard Worker // Try to match if the string is longer or ends later.
893*0e209d39SAndroid Build Coastguard Worker if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
894*0e209d39SAndroid Build Coastguard Worker matches16CPB(s, pos-dec, length, s16, length16)
895*0e209d39SAndroid Build Coastguard Worker ) {
896*0e209d39SAndroid Build Coastguard Worker maxDec=dec; // Longest match from latest end.
897*0e209d39SAndroid Build Coastguard Worker maxOverlap=overlap;
898*0e209d39SAndroid Build Coastguard Worker break;
899*0e209d39SAndroid Build Coastguard Worker }
900*0e209d39SAndroid Build Coastguard Worker --overlap;
901*0e209d39SAndroid Build Coastguard Worker ++dec;
902*0e209d39SAndroid Build Coastguard Worker }
903*0e209d39SAndroid Build Coastguard Worker }
904*0e209d39SAndroid Build Coastguard Worker
905*0e209d39SAndroid Build Coastguard Worker if(maxDec!=0 || maxOverlap!=0) {
906*0e209d39SAndroid Build Coastguard Worker // Longest-match algorithm, and there was a string match.
907*0e209d39SAndroid Build Coastguard Worker // Simply continue before it.
908*0e209d39SAndroid Build Coastguard Worker pos-=maxDec;
909*0e209d39SAndroid Build Coastguard Worker if(pos==0) {
910*0e209d39SAndroid Build Coastguard Worker return 0; // Reached the start of the string.
911*0e209d39SAndroid Build Coastguard Worker }
912*0e209d39SAndroid Build Coastguard Worker spanLength=0; // Match strings from before a string match.
913*0e209d39SAndroid Build Coastguard Worker continue;
914*0e209d39SAndroid Build Coastguard Worker }
915*0e209d39SAndroid Build Coastguard Worker }
916*0e209d39SAndroid Build Coastguard Worker // Finished trying to match all strings at pos.
917*0e209d39SAndroid Build Coastguard Worker
918*0e209d39SAndroid Build Coastguard Worker if(spanLength!=0 || pos==length) {
919*0e209d39SAndroid Build Coastguard Worker // The position is before an unlimited code point span (spanLength!=0),
920*0e209d39SAndroid Build Coastguard Worker // not before a string match.
921*0e209d39SAndroid Build Coastguard Worker // The only position where spanLength==0 before a span is pos==length.
922*0e209d39SAndroid Build Coastguard Worker // Otherwise, an unlimited code point span is only tried again when no
923*0e209d39SAndroid Build Coastguard Worker // strings match, and if such a non-initial span fails we stop.
924*0e209d39SAndroid Build Coastguard Worker if(offsets.isEmpty()) {
925*0e209d39SAndroid Build Coastguard Worker return pos; // No strings matched before a span.
926*0e209d39SAndroid Build Coastguard Worker }
927*0e209d39SAndroid Build Coastguard Worker // Match strings from before the next string match.
928*0e209d39SAndroid Build Coastguard Worker } else {
929*0e209d39SAndroid Build Coastguard Worker // The position is before a string match (or a single code point).
930*0e209d39SAndroid Build Coastguard Worker if(offsets.isEmpty()) {
931*0e209d39SAndroid Build Coastguard Worker // No more strings matched before a previous string match.
932*0e209d39SAndroid Build Coastguard Worker // Try another code point span from before the last string match.
933*0e209d39SAndroid Build Coastguard Worker int32_t oldPos=pos;
934*0e209d39SAndroid Build Coastguard Worker pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
935*0e209d39SAndroid Build Coastguard Worker spanLength=oldPos-pos;
936*0e209d39SAndroid Build Coastguard Worker if( pos==0 || // Reached the start of the string, or
937*0e209d39SAndroid Build Coastguard Worker spanLength==0 // neither strings nor span progressed.
938*0e209d39SAndroid Build Coastguard Worker ) {
939*0e209d39SAndroid Build Coastguard Worker return pos;
940*0e209d39SAndroid Build Coastguard Worker }
941*0e209d39SAndroid Build Coastguard Worker continue; // spanLength>0: Match strings from before a span.
942*0e209d39SAndroid Build Coastguard Worker } else {
943*0e209d39SAndroid Build Coastguard Worker // Try to match only one code point from before a string match if some
944*0e209d39SAndroid Build Coastguard Worker // string matched beyond it, so that we try all possible positions
945*0e209d39SAndroid Build Coastguard Worker // and don't overshoot.
946*0e209d39SAndroid Build Coastguard Worker spanLength=spanOneBack(spanSet, s, pos);
947*0e209d39SAndroid Build Coastguard Worker if(spanLength>0) {
948*0e209d39SAndroid Build Coastguard Worker if(spanLength==pos) {
949*0e209d39SAndroid Build Coastguard Worker return 0; // Reached the start of the string.
950*0e209d39SAndroid Build Coastguard Worker }
951*0e209d39SAndroid Build Coastguard Worker // Match strings before this code point.
952*0e209d39SAndroid Build Coastguard Worker // There cannot be any decrements below it because UnicodeSet strings
953*0e209d39SAndroid Build Coastguard Worker // contain multiple code points.
954*0e209d39SAndroid Build Coastguard Worker pos-=spanLength;
955*0e209d39SAndroid Build Coastguard Worker offsets.shift(spanLength);
956*0e209d39SAndroid Build Coastguard Worker spanLength=0;
957*0e209d39SAndroid Build Coastguard Worker continue; // Match strings from before a single code point.
958*0e209d39SAndroid Build Coastguard Worker }
959*0e209d39SAndroid Build Coastguard Worker // Match strings from before the next string match.
960*0e209d39SAndroid Build Coastguard Worker }
961*0e209d39SAndroid Build Coastguard Worker }
962*0e209d39SAndroid Build Coastguard Worker pos-=offsets.popMinimum();
963*0e209d39SAndroid Build Coastguard Worker spanLength=0; // Match strings from before a string match.
964*0e209d39SAndroid Build Coastguard Worker }
965*0e209d39SAndroid Build Coastguard Worker }
966*0e209d39SAndroid Build Coastguard Worker
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const967*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
968*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_NOT_CONTAINED) {
969*0e209d39SAndroid Build Coastguard Worker return spanNotUTF8(s, length);
970*0e209d39SAndroid Build Coastguard Worker }
971*0e209d39SAndroid Build Coastguard Worker int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
972*0e209d39SAndroid Build Coastguard Worker if(spanLength==length) {
973*0e209d39SAndroid Build Coastguard Worker return length;
974*0e209d39SAndroid Build Coastguard Worker }
975*0e209d39SAndroid Build Coastguard Worker
976*0e209d39SAndroid Build Coastguard Worker // Consider strings; they may overlap with the span.
977*0e209d39SAndroid Build Coastguard Worker OffsetList offsets;
978*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_CONTAINED) {
979*0e209d39SAndroid Build Coastguard Worker // Use offset list to try all possibilities.
980*0e209d39SAndroid Build Coastguard Worker offsets.setMaxLength(maxLength8);
981*0e209d39SAndroid Build Coastguard Worker }
982*0e209d39SAndroid Build Coastguard Worker int32_t pos=spanLength, rest=length-pos;
983*0e209d39SAndroid Build Coastguard Worker int32_t i, stringsLength=strings.size();
984*0e209d39SAndroid Build Coastguard Worker uint8_t *spanUTF8Lengths=spanLengths;
985*0e209d39SAndroid Build Coastguard Worker if(all) {
986*0e209d39SAndroid Build Coastguard Worker spanUTF8Lengths+=2*stringsLength;
987*0e209d39SAndroid Build Coastguard Worker }
988*0e209d39SAndroid Build Coastguard Worker for(;;) {
989*0e209d39SAndroid Build Coastguard Worker const uint8_t *s8=utf8;
990*0e209d39SAndroid Build Coastguard Worker int32_t length8;
991*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_CONTAINED) {
992*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
993*0e209d39SAndroid Build Coastguard Worker length8=utf8Lengths[i];
994*0e209d39SAndroid Build Coastguard Worker if(length8==0) {
995*0e209d39SAndroid Build Coastguard Worker continue; // String not representable in UTF-8.
996*0e209d39SAndroid Build Coastguard Worker }
997*0e209d39SAndroid Build Coastguard Worker int32_t overlap=spanUTF8Lengths[i];
998*0e209d39SAndroid Build Coastguard Worker if(overlap==ALL_CP_CONTAINED) {
999*0e209d39SAndroid Build Coastguard Worker s8+=length8;
1000*0e209d39SAndroid Build Coastguard Worker continue; // Irrelevant string.
1001*0e209d39SAndroid Build Coastguard Worker }
1002*0e209d39SAndroid Build Coastguard Worker
1003*0e209d39SAndroid Build Coastguard Worker // Try to match this string at pos-overlap..pos.
1004*0e209d39SAndroid Build Coastguard Worker if(overlap>=LONG_SPAN) {
1005*0e209d39SAndroid Build Coastguard Worker overlap=length8;
1006*0e209d39SAndroid Build Coastguard Worker // While contained: No point matching fully inside the code point span.
1007*0e209d39SAndroid Build Coastguard Worker U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
1008*0e209d39SAndroid Build Coastguard Worker }
1009*0e209d39SAndroid Build Coastguard Worker if(overlap>spanLength) {
1010*0e209d39SAndroid Build Coastguard Worker overlap=spanLength;
1011*0e209d39SAndroid Build Coastguard Worker }
1012*0e209d39SAndroid Build Coastguard Worker int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1013*0e209d39SAndroid Build Coastguard Worker for(;;) {
1014*0e209d39SAndroid Build Coastguard Worker if(inc>rest) {
1015*0e209d39SAndroid Build Coastguard Worker break;
1016*0e209d39SAndroid Build Coastguard Worker }
1017*0e209d39SAndroid Build Coastguard Worker // Try to match if the increment is not listed already.
1018*0e209d39SAndroid Build Coastguard Worker // Match at code point boundaries. (The UTF-8 strings were converted
1019*0e209d39SAndroid Build Coastguard Worker // from UTF-16 and are guaranteed to be well-formed.)
1020*0e209d39SAndroid Build Coastguard Worker if(!U8_IS_TRAIL(s[pos-overlap]) &&
1021*0e209d39SAndroid Build Coastguard Worker !offsets.containsOffset(inc) &&
1022*0e209d39SAndroid Build Coastguard Worker matches8(s+pos-overlap, s8, length8)) {
1023*0e209d39SAndroid Build Coastguard Worker if(inc==rest) {
1024*0e209d39SAndroid Build Coastguard Worker return length; // Reached the end of the string.
1025*0e209d39SAndroid Build Coastguard Worker }
1026*0e209d39SAndroid Build Coastguard Worker offsets.addOffset(inc);
1027*0e209d39SAndroid Build Coastguard Worker }
1028*0e209d39SAndroid Build Coastguard Worker if(overlap==0) {
1029*0e209d39SAndroid Build Coastguard Worker break;
1030*0e209d39SAndroid Build Coastguard Worker }
1031*0e209d39SAndroid Build Coastguard Worker --overlap;
1032*0e209d39SAndroid Build Coastguard Worker ++inc;
1033*0e209d39SAndroid Build Coastguard Worker }
1034*0e209d39SAndroid Build Coastguard Worker s8+=length8;
1035*0e209d39SAndroid Build Coastguard Worker }
1036*0e209d39SAndroid Build Coastguard Worker } else /* USET_SPAN_SIMPLE */ {
1037*0e209d39SAndroid Build Coastguard Worker int32_t maxInc=0, maxOverlap=0;
1038*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
1039*0e209d39SAndroid Build Coastguard Worker length8=utf8Lengths[i];
1040*0e209d39SAndroid Build Coastguard Worker if(length8==0) {
1041*0e209d39SAndroid Build Coastguard Worker continue; // String not representable in UTF-8.
1042*0e209d39SAndroid Build Coastguard Worker }
1043*0e209d39SAndroid Build Coastguard Worker int32_t overlap=spanUTF8Lengths[i];
1044*0e209d39SAndroid Build Coastguard Worker // For longest match, we do need to try to match even an all-contained string
1045*0e209d39SAndroid Build Coastguard Worker // to find the match from the earliest start.
1046*0e209d39SAndroid Build Coastguard Worker
1047*0e209d39SAndroid Build Coastguard Worker // Try to match this string at pos-overlap..pos.
1048*0e209d39SAndroid Build Coastguard Worker if(overlap>=LONG_SPAN) {
1049*0e209d39SAndroid Build Coastguard Worker overlap=length8;
1050*0e209d39SAndroid Build Coastguard Worker // Longest match: Need to match fully inside the code point span
1051*0e209d39SAndroid Build Coastguard Worker // to find the match from the earliest start.
1052*0e209d39SAndroid Build Coastguard Worker }
1053*0e209d39SAndroid Build Coastguard Worker if(overlap>spanLength) {
1054*0e209d39SAndroid Build Coastguard Worker overlap=spanLength;
1055*0e209d39SAndroid Build Coastguard Worker }
1056*0e209d39SAndroid Build Coastguard Worker int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1057*0e209d39SAndroid Build Coastguard Worker for(;;) {
1058*0e209d39SAndroid Build Coastguard Worker if(inc>rest || overlap<maxOverlap) {
1059*0e209d39SAndroid Build Coastguard Worker break;
1060*0e209d39SAndroid Build Coastguard Worker }
1061*0e209d39SAndroid Build Coastguard Worker // Try to match if the string is longer or starts earlier.
1062*0e209d39SAndroid Build Coastguard Worker // Match at code point boundaries. (The UTF-8 strings were converted
1063*0e209d39SAndroid Build Coastguard Worker // from UTF-16 and are guaranteed to be well-formed.)
1064*0e209d39SAndroid Build Coastguard Worker if(!U8_IS_TRAIL(s[pos-overlap]) &&
1065*0e209d39SAndroid Build Coastguard Worker (overlap>maxOverlap ||
1066*0e209d39SAndroid Build Coastguard Worker /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1067*0e209d39SAndroid Build Coastguard Worker matches8(s+pos-overlap, s8, length8)) {
1068*0e209d39SAndroid Build Coastguard Worker maxInc=inc; // Longest match from earliest start.
1069*0e209d39SAndroid Build Coastguard Worker maxOverlap=overlap;
1070*0e209d39SAndroid Build Coastguard Worker break;
1071*0e209d39SAndroid Build Coastguard Worker }
1072*0e209d39SAndroid Build Coastguard Worker --overlap;
1073*0e209d39SAndroid Build Coastguard Worker ++inc;
1074*0e209d39SAndroid Build Coastguard Worker }
1075*0e209d39SAndroid Build Coastguard Worker s8+=length8;
1076*0e209d39SAndroid Build Coastguard Worker }
1077*0e209d39SAndroid Build Coastguard Worker
1078*0e209d39SAndroid Build Coastguard Worker if(maxInc!=0 || maxOverlap!=0) {
1079*0e209d39SAndroid Build Coastguard Worker // Longest-match algorithm, and there was a string match.
1080*0e209d39SAndroid Build Coastguard Worker // Simply continue after it.
1081*0e209d39SAndroid Build Coastguard Worker pos+=maxInc;
1082*0e209d39SAndroid Build Coastguard Worker rest-=maxInc;
1083*0e209d39SAndroid Build Coastguard Worker if(rest==0) {
1084*0e209d39SAndroid Build Coastguard Worker return length; // Reached the end of the string.
1085*0e209d39SAndroid Build Coastguard Worker }
1086*0e209d39SAndroid Build Coastguard Worker spanLength=0; // Match strings from after a string match.
1087*0e209d39SAndroid Build Coastguard Worker continue;
1088*0e209d39SAndroid Build Coastguard Worker }
1089*0e209d39SAndroid Build Coastguard Worker }
1090*0e209d39SAndroid Build Coastguard Worker // Finished trying to match all strings at pos.
1091*0e209d39SAndroid Build Coastguard Worker
1092*0e209d39SAndroid Build Coastguard Worker if(spanLength!=0 || pos==0) {
1093*0e209d39SAndroid Build Coastguard Worker // The position is after an unlimited code point span (spanLength!=0),
1094*0e209d39SAndroid Build Coastguard Worker // not after a string match.
1095*0e209d39SAndroid Build Coastguard Worker // The only position where spanLength==0 after a span is pos==0.
1096*0e209d39SAndroid Build Coastguard Worker // Otherwise, an unlimited code point span is only tried again when no
1097*0e209d39SAndroid Build Coastguard Worker // strings match, and if such a non-initial span fails we stop.
1098*0e209d39SAndroid Build Coastguard Worker if(offsets.isEmpty()) {
1099*0e209d39SAndroid Build Coastguard Worker return pos; // No strings matched after a span.
1100*0e209d39SAndroid Build Coastguard Worker }
1101*0e209d39SAndroid Build Coastguard Worker // Match strings from after the next string match.
1102*0e209d39SAndroid Build Coastguard Worker } else {
1103*0e209d39SAndroid Build Coastguard Worker // The position is after a string match (or a single code point).
1104*0e209d39SAndroid Build Coastguard Worker if(offsets.isEmpty()) {
1105*0e209d39SAndroid Build Coastguard Worker // No more strings matched after a previous string match.
1106*0e209d39SAndroid Build Coastguard Worker // Try another code point span from after the last string match.
1107*0e209d39SAndroid Build Coastguard Worker spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1108*0e209d39SAndroid Build Coastguard Worker if( spanLength==rest || // Reached the end of the string, or
1109*0e209d39SAndroid Build Coastguard Worker spanLength==0 // neither strings nor span progressed.
1110*0e209d39SAndroid Build Coastguard Worker ) {
1111*0e209d39SAndroid Build Coastguard Worker return pos+spanLength;
1112*0e209d39SAndroid Build Coastguard Worker }
1113*0e209d39SAndroid Build Coastguard Worker pos+=spanLength;
1114*0e209d39SAndroid Build Coastguard Worker rest-=spanLength;
1115*0e209d39SAndroid Build Coastguard Worker continue; // spanLength>0: Match strings from after a span.
1116*0e209d39SAndroid Build Coastguard Worker } else {
1117*0e209d39SAndroid Build Coastguard Worker // Try to match only one code point from after a string match if some
1118*0e209d39SAndroid Build Coastguard Worker // string matched beyond it, so that we try all possible positions
1119*0e209d39SAndroid Build Coastguard Worker // and don't overshoot.
1120*0e209d39SAndroid Build Coastguard Worker spanLength=spanOneUTF8(spanSet, s+pos, rest);
1121*0e209d39SAndroid Build Coastguard Worker if(spanLength>0) {
1122*0e209d39SAndroid Build Coastguard Worker if(spanLength==rest) {
1123*0e209d39SAndroid Build Coastguard Worker return length; // Reached the end of the string.
1124*0e209d39SAndroid Build Coastguard Worker }
1125*0e209d39SAndroid Build Coastguard Worker // Match strings after this code point.
1126*0e209d39SAndroid Build Coastguard Worker // There cannot be any increments below it because UnicodeSet strings
1127*0e209d39SAndroid Build Coastguard Worker // contain multiple code points.
1128*0e209d39SAndroid Build Coastguard Worker pos+=spanLength;
1129*0e209d39SAndroid Build Coastguard Worker rest-=spanLength;
1130*0e209d39SAndroid Build Coastguard Worker offsets.shift(spanLength);
1131*0e209d39SAndroid Build Coastguard Worker spanLength=0;
1132*0e209d39SAndroid Build Coastguard Worker continue; // Match strings from after a single code point.
1133*0e209d39SAndroid Build Coastguard Worker }
1134*0e209d39SAndroid Build Coastguard Worker // Match strings from after the next string match.
1135*0e209d39SAndroid Build Coastguard Worker }
1136*0e209d39SAndroid Build Coastguard Worker }
1137*0e209d39SAndroid Build Coastguard Worker int32_t minOffset=offsets.popMinimum();
1138*0e209d39SAndroid Build Coastguard Worker pos+=minOffset;
1139*0e209d39SAndroid Build Coastguard Worker rest-=minOffset;
1140*0e209d39SAndroid Build Coastguard Worker spanLength=0; // Match strings from after a string match.
1141*0e209d39SAndroid Build Coastguard Worker }
1142*0e209d39SAndroid Build Coastguard Worker }
1143*0e209d39SAndroid Build Coastguard Worker
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const1144*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1145*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1146*0e209d39SAndroid Build Coastguard Worker return spanNotBackUTF8(s, length);
1147*0e209d39SAndroid Build Coastguard Worker }
1148*0e209d39SAndroid Build Coastguard Worker int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1149*0e209d39SAndroid Build Coastguard Worker if(pos==0) {
1150*0e209d39SAndroid Build Coastguard Worker return 0;
1151*0e209d39SAndroid Build Coastguard Worker }
1152*0e209d39SAndroid Build Coastguard Worker int32_t spanLength=length-pos;
1153*0e209d39SAndroid Build Coastguard Worker
1154*0e209d39SAndroid Build Coastguard Worker // Consider strings; they may overlap with the span.
1155*0e209d39SAndroid Build Coastguard Worker OffsetList offsets;
1156*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_CONTAINED) {
1157*0e209d39SAndroid Build Coastguard Worker // Use offset list to try all possibilities.
1158*0e209d39SAndroid Build Coastguard Worker offsets.setMaxLength(maxLength8);
1159*0e209d39SAndroid Build Coastguard Worker }
1160*0e209d39SAndroid Build Coastguard Worker int32_t i, stringsLength=strings.size();
1161*0e209d39SAndroid Build Coastguard Worker uint8_t *spanBackUTF8Lengths=spanLengths;
1162*0e209d39SAndroid Build Coastguard Worker if(all) {
1163*0e209d39SAndroid Build Coastguard Worker spanBackUTF8Lengths+=3*stringsLength;
1164*0e209d39SAndroid Build Coastguard Worker }
1165*0e209d39SAndroid Build Coastguard Worker for(;;) {
1166*0e209d39SAndroid Build Coastguard Worker const uint8_t *s8=utf8;
1167*0e209d39SAndroid Build Coastguard Worker int32_t length8;
1168*0e209d39SAndroid Build Coastguard Worker if(spanCondition==USET_SPAN_CONTAINED) {
1169*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
1170*0e209d39SAndroid Build Coastguard Worker length8=utf8Lengths[i];
1171*0e209d39SAndroid Build Coastguard Worker if(length8==0) {
1172*0e209d39SAndroid Build Coastguard Worker continue; // String not representable in UTF-8.
1173*0e209d39SAndroid Build Coastguard Worker }
1174*0e209d39SAndroid Build Coastguard Worker int32_t overlap=spanBackUTF8Lengths[i];
1175*0e209d39SAndroid Build Coastguard Worker if(overlap==ALL_CP_CONTAINED) {
1176*0e209d39SAndroid Build Coastguard Worker s8+=length8;
1177*0e209d39SAndroid Build Coastguard Worker continue; // Irrelevant string.
1178*0e209d39SAndroid Build Coastguard Worker }
1179*0e209d39SAndroid Build Coastguard Worker
1180*0e209d39SAndroid Build Coastguard Worker // Try to match this string at pos-(length8-overlap)..pos-length8.
1181*0e209d39SAndroid Build Coastguard Worker if(overlap>=LONG_SPAN) {
1182*0e209d39SAndroid Build Coastguard Worker overlap=length8;
1183*0e209d39SAndroid Build Coastguard Worker // While contained: No point matching fully inside the code point span.
1184*0e209d39SAndroid Build Coastguard Worker int32_t len1=0;
1185*0e209d39SAndroid Build Coastguard Worker U8_FWD_1(s8, len1, overlap);
1186*0e209d39SAndroid Build Coastguard Worker overlap-=len1; // Length of the string minus the first code point.
1187*0e209d39SAndroid Build Coastguard Worker }
1188*0e209d39SAndroid Build Coastguard Worker if(overlap>spanLength) {
1189*0e209d39SAndroid Build Coastguard Worker overlap=spanLength;
1190*0e209d39SAndroid Build Coastguard Worker }
1191*0e209d39SAndroid Build Coastguard Worker int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1192*0e209d39SAndroid Build Coastguard Worker for(;;) {
1193*0e209d39SAndroid Build Coastguard Worker if(dec>pos) {
1194*0e209d39SAndroid Build Coastguard Worker break;
1195*0e209d39SAndroid Build Coastguard Worker }
1196*0e209d39SAndroid Build Coastguard Worker // Try to match if the decrement is not listed already.
1197*0e209d39SAndroid Build Coastguard Worker // Match at code point boundaries. (The UTF-8 strings were converted
1198*0e209d39SAndroid Build Coastguard Worker // from UTF-16 and are guaranteed to be well-formed.)
1199*0e209d39SAndroid Build Coastguard Worker if( !U8_IS_TRAIL(s[pos-dec]) &&
1200*0e209d39SAndroid Build Coastguard Worker !offsets.containsOffset(dec) &&
1201*0e209d39SAndroid Build Coastguard Worker matches8(s+pos-dec, s8, length8)
1202*0e209d39SAndroid Build Coastguard Worker ) {
1203*0e209d39SAndroid Build Coastguard Worker if(dec==pos) {
1204*0e209d39SAndroid Build Coastguard Worker return 0; // Reached the start of the string.
1205*0e209d39SAndroid Build Coastguard Worker }
1206*0e209d39SAndroid Build Coastguard Worker offsets.addOffset(dec);
1207*0e209d39SAndroid Build Coastguard Worker }
1208*0e209d39SAndroid Build Coastguard Worker if(overlap==0) {
1209*0e209d39SAndroid Build Coastguard Worker break;
1210*0e209d39SAndroid Build Coastguard Worker }
1211*0e209d39SAndroid Build Coastguard Worker --overlap;
1212*0e209d39SAndroid Build Coastguard Worker ++dec;
1213*0e209d39SAndroid Build Coastguard Worker }
1214*0e209d39SAndroid Build Coastguard Worker s8+=length8;
1215*0e209d39SAndroid Build Coastguard Worker }
1216*0e209d39SAndroid Build Coastguard Worker } else /* USET_SPAN_SIMPLE */ {
1217*0e209d39SAndroid Build Coastguard Worker int32_t maxDec=0, maxOverlap=0;
1218*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
1219*0e209d39SAndroid Build Coastguard Worker length8=utf8Lengths[i];
1220*0e209d39SAndroid Build Coastguard Worker if(length8==0) {
1221*0e209d39SAndroid Build Coastguard Worker continue; // String not representable in UTF-8.
1222*0e209d39SAndroid Build Coastguard Worker }
1223*0e209d39SAndroid Build Coastguard Worker int32_t overlap=spanBackUTF8Lengths[i];
1224*0e209d39SAndroid Build Coastguard Worker // For longest match, we do need to try to match even an all-contained string
1225*0e209d39SAndroid Build Coastguard Worker // to find the match from the latest end.
1226*0e209d39SAndroid Build Coastguard Worker
1227*0e209d39SAndroid Build Coastguard Worker // Try to match this string at pos-(length8-overlap)..pos-length8.
1228*0e209d39SAndroid Build Coastguard Worker if(overlap>=LONG_SPAN) {
1229*0e209d39SAndroid Build Coastguard Worker overlap=length8;
1230*0e209d39SAndroid Build Coastguard Worker // Longest match: Need to match fully inside the code point span
1231*0e209d39SAndroid Build Coastguard Worker // to find the match from the latest end.
1232*0e209d39SAndroid Build Coastguard Worker }
1233*0e209d39SAndroid Build Coastguard Worker if(overlap>spanLength) {
1234*0e209d39SAndroid Build Coastguard Worker overlap=spanLength;
1235*0e209d39SAndroid Build Coastguard Worker }
1236*0e209d39SAndroid Build Coastguard Worker int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1237*0e209d39SAndroid Build Coastguard Worker for(;;) {
1238*0e209d39SAndroid Build Coastguard Worker if(dec>pos || overlap<maxOverlap) {
1239*0e209d39SAndroid Build Coastguard Worker break;
1240*0e209d39SAndroid Build Coastguard Worker }
1241*0e209d39SAndroid Build Coastguard Worker // Try to match if the string is longer or ends later.
1242*0e209d39SAndroid Build Coastguard Worker // Match at code point boundaries. (The UTF-8 strings were converted
1243*0e209d39SAndroid Build Coastguard Worker // from UTF-16 and are guaranteed to be well-formed.)
1244*0e209d39SAndroid Build Coastguard Worker if( !U8_IS_TRAIL(s[pos-dec]) &&
1245*0e209d39SAndroid Build Coastguard Worker (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1246*0e209d39SAndroid Build Coastguard Worker matches8(s+pos-dec, s8, length8)
1247*0e209d39SAndroid Build Coastguard Worker ) {
1248*0e209d39SAndroid Build Coastguard Worker maxDec=dec; // Longest match from latest end.
1249*0e209d39SAndroid Build Coastguard Worker maxOverlap=overlap;
1250*0e209d39SAndroid Build Coastguard Worker break;
1251*0e209d39SAndroid Build Coastguard Worker }
1252*0e209d39SAndroid Build Coastguard Worker --overlap;
1253*0e209d39SAndroid Build Coastguard Worker ++dec;
1254*0e209d39SAndroid Build Coastguard Worker }
1255*0e209d39SAndroid Build Coastguard Worker s8+=length8;
1256*0e209d39SAndroid Build Coastguard Worker }
1257*0e209d39SAndroid Build Coastguard Worker
1258*0e209d39SAndroid Build Coastguard Worker if(maxDec!=0 || maxOverlap!=0) {
1259*0e209d39SAndroid Build Coastguard Worker // Longest-match algorithm, and there was a string match.
1260*0e209d39SAndroid Build Coastguard Worker // Simply continue before it.
1261*0e209d39SAndroid Build Coastguard Worker pos-=maxDec;
1262*0e209d39SAndroid Build Coastguard Worker if(pos==0) {
1263*0e209d39SAndroid Build Coastguard Worker return 0; // Reached the start of the string.
1264*0e209d39SAndroid Build Coastguard Worker }
1265*0e209d39SAndroid Build Coastguard Worker spanLength=0; // Match strings from before a string match.
1266*0e209d39SAndroid Build Coastguard Worker continue;
1267*0e209d39SAndroid Build Coastguard Worker }
1268*0e209d39SAndroid Build Coastguard Worker }
1269*0e209d39SAndroid Build Coastguard Worker // Finished trying to match all strings at pos.
1270*0e209d39SAndroid Build Coastguard Worker
1271*0e209d39SAndroid Build Coastguard Worker if(spanLength!=0 || pos==length) {
1272*0e209d39SAndroid Build Coastguard Worker // The position is before an unlimited code point span (spanLength!=0),
1273*0e209d39SAndroid Build Coastguard Worker // not before a string match.
1274*0e209d39SAndroid Build Coastguard Worker // The only position where spanLength==0 before a span is pos==length.
1275*0e209d39SAndroid Build Coastguard Worker // Otherwise, an unlimited code point span is only tried again when no
1276*0e209d39SAndroid Build Coastguard Worker // strings match, and if such a non-initial span fails we stop.
1277*0e209d39SAndroid Build Coastguard Worker if(offsets.isEmpty()) {
1278*0e209d39SAndroid Build Coastguard Worker return pos; // No strings matched before a span.
1279*0e209d39SAndroid Build Coastguard Worker }
1280*0e209d39SAndroid Build Coastguard Worker // Match strings from before the next string match.
1281*0e209d39SAndroid Build Coastguard Worker } else {
1282*0e209d39SAndroid Build Coastguard Worker // The position is before a string match (or a single code point).
1283*0e209d39SAndroid Build Coastguard Worker if(offsets.isEmpty()) {
1284*0e209d39SAndroid Build Coastguard Worker // No more strings matched before a previous string match.
1285*0e209d39SAndroid Build Coastguard Worker // Try another code point span from before the last string match.
1286*0e209d39SAndroid Build Coastguard Worker int32_t oldPos=pos;
1287*0e209d39SAndroid Build Coastguard Worker pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1288*0e209d39SAndroid Build Coastguard Worker spanLength=oldPos-pos;
1289*0e209d39SAndroid Build Coastguard Worker if( pos==0 || // Reached the start of the string, or
1290*0e209d39SAndroid Build Coastguard Worker spanLength==0 // neither strings nor span progressed.
1291*0e209d39SAndroid Build Coastguard Worker ) {
1292*0e209d39SAndroid Build Coastguard Worker return pos;
1293*0e209d39SAndroid Build Coastguard Worker }
1294*0e209d39SAndroid Build Coastguard Worker continue; // spanLength>0: Match strings from before a span.
1295*0e209d39SAndroid Build Coastguard Worker } else {
1296*0e209d39SAndroid Build Coastguard Worker // Try to match only one code point from before a string match if some
1297*0e209d39SAndroid Build Coastguard Worker // string matched beyond it, so that we try all possible positions
1298*0e209d39SAndroid Build Coastguard Worker // and don't overshoot.
1299*0e209d39SAndroid Build Coastguard Worker spanLength=spanOneBackUTF8(spanSet, s, pos);
1300*0e209d39SAndroid Build Coastguard Worker if(spanLength>0) {
1301*0e209d39SAndroid Build Coastguard Worker if(spanLength==pos) {
1302*0e209d39SAndroid Build Coastguard Worker return 0; // Reached the start of the string.
1303*0e209d39SAndroid Build Coastguard Worker }
1304*0e209d39SAndroid Build Coastguard Worker // Match strings before this code point.
1305*0e209d39SAndroid Build Coastguard Worker // There cannot be any decrements below it because UnicodeSet strings
1306*0e209d39SAndroid Build Coastguard Worker // contain multiple code points.
1307*0e209d39SAndroid Build Coastguard Worker pos-=spanLength;
1308*0e209d39SAndroid Build Coastguard Worker offsets.shift(spanLength);
1309*0e209d39SAndroid Build Coastguard Worker spanLength=0;
1310*0e209d39SAndroid Build Coastguard Worker continue; // Match strings from before a single code point.
1311*0e209d39SAndroid Build Coastguard Worker }
1312*0e209d39SAndroid Build Coastguard Worker // Match strings from before the next string match.
1313*0e209d39SAndroid Build Coastguard Worker }
1314*0e209d39SAndroid Build Coastguard Worker }
1315*0e209d39SAndroid Build Coastguard Worker pos-=offsets.popMinimum();
1316*0e209d39SAndroid Build Coastguard Worker spanLength=0; // Match strings from before a string match.
1317*0e209d39SAndroid Build Coastguard Worker }
1318*0e209d39SAndroid Build Coastguard Worker }
1319*0e209d39SAndroid Build Coastguard Worker
1320*0e209d39SAndroid Build Coastguard Worker /*
1321*0e209d39SAndroid Build Coastguard Worker * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1322*0e209d39SAndroid Build Coastguard Worker *
1323*0e209d39SAndroid Build Coastguard Worker * Theoretical algorithm:
1324*0e209d39SAndroid Build Coastguard Worker * - Iterate through the string, and at each code point boundary:
1325*0e209d39SAndroid Build Coastguard Worker * + If the code point there is in the set, then return with the current position.
1326*0e209d39SAndroid Build Coastguard Worker * + If a set string matches at the current position, then return with the current position.
1327*0e209d39SAndroid Build Coastguard Worker *
1328*0e209d39SAndroid Build Coastguard Worker * Optimized implementation:
1329*0e209d39SAndroid Build Coastguard Worker *
1330*0e209d39SAndroid Build Coastguard Worker * (Same assumption as for span() above.)
1331*0e209d39SAndroid Build Coastguard Worker *
1332*0e209d39SAndroid Build Coastguard Worker * Create and cache a spanNotSet which contains all of the single code points
1333*0e209d39SAndroid Build Coastguard Worker * of the original set but none of its strings.
1334*0e209d39SAndroid Build Coastguard Worker * For each set string add its initial code point to the spanNotSet.
1335*0e209d39SAndroid Build Coastguard Worker * (Also add its final code point for spanNotBack().)
1336*0e209d39SAndroid Build Coastguard Worker *
1337*0e209d39SAndroid Build Coastguard Worker * - Loop:
1338*0e209d39SAndroid Build Coastguard Worker * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1339*0e209d39SAndroid Build Coastguard Worker * + If the current code point is in the original set, then
1340*0e209d39SAndroid Build Coastguard Worker * return the current position.
1341*0e209d39SAndroid Build Coastguard Worker * + If any set string matches at the current position, then
1342*0e209d39SAndroid Build Coastguard Worker * return the current position.
1343*0e209d39SAndroid Build Coastguard Worker * + If there is no match at the current position, neither for the code point there
1344*0e209d39SAndroid Build Coastguard Worker * nor for any set string, then skip this code point and continue the loop.
1345*0e209d39SAndroid Build Coastguard Worker * This happens for set-string-initial code points that were added to spanNotSet
1346*0e209d39SAndroid Build Coastguard Worker * when there is not actually a match for such a set string.
1347*0e209d39SAndroid Build Coastguard Worker */
1348*0e209d39SAndroid Build Coastguard Worker
spanNot(const char16_t * s,int32_t length) const1349*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanNot(const char16_t *s, int32_t length) const {
1350*0e209d39SAndroid Build Coastguard Worker int32_t pos=0, rest=length;
1351*0e209d39SAndroid Build Coastguard Worker int32_t i, stringsLength=strings.size();
1352*0e209d39SAndroid Build Coastguard Worker do {
1353*0e209d39SAndroid Build Coastguard Worker // Span until we find a code point from the set,
1354*0e209d39SAndroid Build Coastguard Worker // or a code point that starts or ends some string.
1355*0e209d39SAndroid Build Coastguard Worker i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1356*0e209d39SAndroid Build Coastguard Worker if(i==rest) {
1357*0e209d39SAndroid Build Coastguard Worker return length; // Reached the end of the string.
1358*0e209d39SAndroid Build Coastguard Worker }
1359*0e209d39SAndroid Build Coastguard Worker pos+=i;
1360*0e209d39SAndroid Build Coastguard Worker rest-=i;
1361*0e209d39SAndroid Build Coastguard Worker
1362*0e209d39SAndroid Build Coastguard Worker // Check whether the current code point is in the original set,
1363*0e209d39SAndroid Build Coastguard Worker // without the string starts and ends.
1364*0e209d39SAndroid Build Coastguard Worker int32_t cpLength=spanOne(spanSet, s+pos, rest);
1365*0e209d39SAndroid Build Coastguard Worker if(cpLength>0) {
1366*0e209d39SAndroid Build Coastguard Worker return pos; // There is a set element at pos.
1367*0e209d39SAndroid Build Coastguard Worker }
1368*0e209d39SAndroid Build Coastguard Worker
1369*0e209d39SAndroid Build Coastguard Worker // Try to match the strings at pos.
1370*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
1371*0e209d39SAndroid Build Coastguard Worker if(spanLengths[i]==ALL_CP_CONTAINED) {
1372*0e209d39SAndroid Build Coastguard Worker continue; // Irrelevant string. (Also the empty string.)
1373*0e209d39SAndroid Build Coastguard Worker }
1374*0e209d39SAndroid Build Coastguard Worker const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1375*0e209d39SAndroid Build Coastguard Worker const char16_t *s16=string.getBuffer();
1376*0e209d39SAndroid Build Coastguard Worker int32_t length16=string.length();
1377*0e209d39SAndroid Build Coastguard Worker U_ASSERT(length>0);
1378*0e209d39SAndroid Build Coastguard Worker if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1379*0e209d39SAndroid Build Coastguard Worker return pos; // There is a set element at pos.
1380*0e209d39SAndroid Build Coastguard Worker }
1381*0e209d39SAndroid Build Coastguard Worker }
1382*0e209d39SAndroid Build Coastguard Worker
1383*0e209d39SAndroid Build Coastguard Worker // The span(while not contained) ended on a string start/end which is
1384*0e209d39SAndroid Build Coastguard Worker // not in the original set. Skip this code point and continue.
1385*0e209d39SAndroid Build Coastguard Worker // cpLength<0
1386*0e209d39SAndroid Build Coastguard Worker pos-=cpLength;
1387*0e209d39SAndroid Build Coastguard Worker rest+=cpLength;
1388*0e209d39SAndroid Build Coastguard Worker } while(rest!=0);
1389*0e209d39SAndroid Build Coastguard Worker return length; // Reached the end of the string.
1390*0e209d39SAndroid Build Coastguard Worker }
1391*0e209d39SAndroid Build Coastguard Worker
spanNotBack(const char16_t * s,int32_t length) const1392*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanNotBack(const char16_t *s, int32_t length) const {
1393*0e209d39SAndroid Build Coastguard Worker int32_t pos=length;
1394*0e209d39SAndroid Build Coastguard Worker int32_t i, stringsLength=strings.size();
1395*0e209d39SAndroid Build Coastguard Worker do {
1396*0e209d39SAndroid Build Coastguard Worker // Span until we find a code point from the set,
1397*0e209d39SAndroid Build Coastguard Worker // or a code point that starts or ends some string.
1398*0e209d39SAndroid Build Coastguard Worker pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1399*0e209d39SAndroid Build Coastguard Worker if(pos==0) {
1400*0e209d39SAndroid Build Coastguard Worker return 0; // Reached the start of the string.
1401*0e209d39SAndroid Build Coastguard Worker }
1402*0e209d39SAndroid Build Coastguard Worker
1403*0e209d39SAndroid Build Coastguard Worker // Check whether the current code point is in the original set,
1404*0e209d39SAndroid Build Coastguard Worker // without the string starts and ends.
1405*0e209d39SAndroid Build Coastguard Worker int32_t cpLength=spanOneBack(spanSet, s, pos);
1406*0e209d39SAndroid Build Coastguard Worker if(cpLength>0) {
1407*0e209d39SAndroid Build Coastguard Worker return pos; // There is a set element at pos.
1408*0e209d39SAndroid Build Coastguard Worker }
1409*0e209d39SAndroid Build Coastguard Worker
1410*0e209d39SAndroid Build Coastguard Worker // Try to match the strings at pos.
1411*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
1412*0e209d39SAndroid Build Coastguard Worker // Use spanLengths rather than a spanBackLengths pointer because
1413*0e209d39SAndroid Build Coastguard Worker // it is easier and we only need to know whether the string is irrelevant
1414*0e209d39SAndroid Build Coastguard Worker // which is the same in either array.
1415*0e209d39SAndroid Build Coastguard Worker if(spanLengths[i]==ALL_CP_CONTAINED) {
1416*0e209d39SAndroid Build Coastguard Worker continue; // Irrelevant string. (Also the empty string.)
1417*0e209d39SAndroid Build Coastguard Worker }
1418*0e209d39SAndroid Build Coastguard Worker const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1419*0e209d39SAndroid Build Coastguard Worker const char16_t *s16=string.getBuffer();
1420*0e209d39SAndroid Build Coastguard Worker int32_t length16=string.length();
1421*0e209d39SAndroid Build Coastguard Worker U_ASSERT(length>0);
1422*0e209d39SAndroid Build Coastguard Worker if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1423*0e209d39SAndroid Build Coastguard Worker return pos; // There is a set element at pos.
1424*0e209d39SAndroid Build Coastguard Worker }
1425*0e209d39SAndroid Build Coastguard Worker }
1426*0e209d39SAndroid Build Coastguard Worker
1427*0e209d39SAndroid Build Coastguard Worker // The span(while not contained) ended on a string start/end which is
1428*0e209d39SAndroid Build Coastguard Worker // not in the original set. Skip this code point and continue.
1429*0e209d39SAndroid Build Coastguard Worker // cpLength<0
1430*0e209d39SAndroid Build Coastguard Worker pos+=cpLength;
1431*0e209d39SAndroid Build Coastguard Worker } while(pos!=0);
1432*0e209d39SAndroid Build Coastguard Worker return 0; // Reached the start of the string.
1433*0e209d39SAndroid Build Coastguard Worker }
1434*0e209d39SAndroid Build Coastguard Worker
spanNotUTF8(const uint8_t * s,int32_t length) const1435*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1436*0e209d39SAndroid Build Coastguard Worker int32_t pos=0, rest=length;
1437*0e209d39SAndroid Build Coastguard Worker int32_t i, stringsLength=strings.size();
1438*0e209d39SAndroid Build Coastguard Worker uint8_t *spanUTF8Lengths=spanLengths;
1439*0e209d39SAndroid Build Coastguard Worker if(all) {
1440*0e209d39SAndroid Build Coastguard Worker spanUTF8Lengths+=2*stringsLength;
1441*0e209d39SAndroid Build Coastguard Worker }
1442*0e209d39SAndroid Build Coastguard Worker do {
1443*0e209d39SAndroid Build Coastguard Worker // Span until we find a code point from the set,
1444*0e209d39SAndroid Build Coastguard Worker // or a code point that starts or ends some string.
1445*0e209d39SAndroid Build Coastguard Worker i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1446*0e209d39SAndroid Build Coastguard Worker if(i==rest) {
1447*0e209d39SAndroid Build Coastguard Worker return length; // Reached the end of the string.
1448*0e209d39SAndroid Build Coastguard Worker }
1449*0e209d39SAndroid Build Coastguard Worker pos+=i;
1450*0e209d39SAndroid Build Coastguard Worker rest-=i;
1451*0e209d39SAndroid Build Coastguard Worker
1452*0e209d39SAndroid Build Coastguard Worker // Check whether the current code point is in the original set,
1453*0e209d39SAndroid Build Coastguard Worker // without the string starts and ends.
1454*0e209d39SAndroid Build Coastguard Worker int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1455*0e209d39SAndroid Build Coastguard Worker if(cpLength>0) {
1456*0e209d39SAndroid Build Coastguard Worker return pos; // There is a set element at pos.
1457*0e209d39SAndroid Build Coastguard Worker }
1458*0e209d39SAndroid Build Coastguard Worker
1459*0e209d39SAndroid Build Coastguard Worker // Try to match the strings at pos.
1460*0e209d39SAndroid Build Coastguard Worker const uint8_t *s8=utf8;
1461*0e209d39SAndroid Build Coastguard Worker int32_t length8;
1462*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
1463*0e209d39SAndroid Build Coastguard Worker length8=utf8Lengths[i];
1464*0e209d39SAndroid Build Coastguard Worker // ALL_CP_CONTAINED: Irrelevant string.
1465*0e209d39SAndroid Build Coastguard Worker if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1466*0e209d39SAndroid Build Coastguard Worker return pos; // There is a set element at pos.
1467*0e209d39SAndroid Build Coastguard Worker }
1468*0e209d39SAndroid Build Coastguard Worker s8+=length8;
1469*0e209d39SAndroid Build Coastguard Worker }
1470*0e209d39SAndroid Build Coastguard Worker
1471*0e209d39SAndroid Build Coastguard Worker // The span(while not contained) ended on a string start/end which is
1472*0e209d39SAndroid Build Coastguard Worker // not in the original set. Skip this code point and continue.
1473*0e209d39SAndroid Build Coastguard Worker // cpLength<0
1474*0e209d39SAndroid Build Coastguard Worker pos-=cpLength;
1475*0e209d39SAndroid Build Coastguard Worker rest+=cpLength;
1476*0e209d39SAndroid Build Coastguard Worker } while(rest!=0);
1477*0e209d39SAndroid Build Coastguard Worker return length; // Reached the end of the string.
1478*0e209d39SAndroid Build Coastguard Worker }
1479*0e209d39SAndroid Build Coastguard Worker
spanNotBackUTF8(const uint8_t * s,int32_t length) const1480*0e209d39SAndroid Build Coastguard Worker int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1481*0e209d39SAndroid Build Coastguard Worker int32_t pos=length;
1482*0e209d39SAndroid Build Coastguard Worker int32_t i, stringsLength=strings.size();
1483*0e209d39SAndroid Build Coastguard Worker uint8_t *spanBackUTF8Lengths=spanLengths;
1484*0e209d39SAndroid Build Coastguard Worker if(all) {
1485*0e209d39SAndroid Build Coastguard Worker spanBackUTF8Lengths+=3*stringsLength;
1486*0e209d39SAndroid Build Coastguard Worker }
1487*0e209d39SAndroid Build Coastguard Worker do {
1488*0e209d39SAndroid Build Coastguard Worker // Span until we find a code point from the set,
1489*0e209d39SAndroid Build Coastguard Worker // or a code point that starts or ends some string.
1490*0e209d39SAndroid Build Coastguard Worker pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1491*0e209d39SAndroid Build Coastguard Worker if(pos==0) {
1492*0e209d39SAndroid Build Coastguard Worker return 0; // Reached the start of the string.
1493*0e209d39SAndroid Build Coastguard Worker }
1494*0e209d39SAndroid Build Coastguard Worker
1495*0e209d39SAndroid Build Coastguard Worker // Check whether the current code point is in the original set,
1496*0e209d39SAndroid Build Coastguard Worker // without the string starts and ends.
1497*0e209d39SAndroid Build Coastguard Worker int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1498*0e209d39SAndroid Build Coastguard Worker if(cpLength>0) {
1499*0e209d39SAndroid Build Coastguard Worker return pos; // There is a set element at pos.
1500*0e209d39SAndroid Build Coastguard Worker }
1501*0e209d39SAndroid Build Coastguard Worker
1502*0e209d39SAndroid Build Coastguard Worker // Try to match the strings at pos.
1503*0e209d39SAndroid Build Coastguard Worker const uint8_t *s8=utf8;
1504*0e209d39SAndroid Build Coastguard Worker int32_t length8;
1505*0e209d39SAndroid Build Coastguard Worker for(i=0; i<stringsLength; ++i) {
1506*0e209d39SAndroid Build Coastguard Worker length8=utf8Lengths[i];
1507*0e209d39SAndroid Build Coastguard Worker // ALL_CP_CONTAINED: Irrelevant string.
1508*0e209d39SAndroid Build Coastguard Worker if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1509*0e209d39SAndroid Build Coastguard Worker return pos; // There is a set element at pos.
1510*0e209d39SAndroid Build Coastguard Worker }
1511*0e209d39SAndroid Build Coastguard Worker s8+=length8;
1512*0e209d39SAndroid Build Coastguard Worker }
1513*0e209d39SAndroid Build Coastguard Worker
1514*0e209d39SAndroid Build Coastguard Worker // The span(while not contained) ended on a string start/end which is
1515*0e209d39SAndroid Build Coastguard Worker // not in the original set. Skip this code point and continue.
1516*0e209d39SAndroid Build Coastguard Worker // cpLength<0
1517*0e209d39SAndroid Build Coastguard Worker pos+=cpLength;
1518*0e209d39SAndroid Build Coastguard Worker } while(pos!=0);
1519*0e209d39SAndroid Build Coastguard Worker return 0; // Reached the start of the string.
1520*0e209d39SAndroid Build Coastguard Worker }
1521*0e209d39SAndroid Build Coastguard Worker
1522*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
1523