xref: /aosp_15_r20/external/cronet/third_party/icu/source/common/ucasemap.cpp (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2005-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucasemap.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2005may06
16 *   created by: Markus W. Scherer
17 *
18 *   Case mapping service object and functions using it.
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/bytestream.h"
24 #include "unicode/casemap.h"
25 #include "unicode/edits.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/ubrk.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ustring.h"
31 #include "unicode/ucasemap.h"
32 #if !UCONFIG_NO_BREAK_ITERATION
33 #include "unicode/utext.h"
34 #endif
35 #include "unicode/utf.h"
36 #include "unicode/utf8.h"
37 #include "unicode/utf16.h"
38 #include "bytesinkutil.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uassert.h"
42 #include "ucase.h"
43 #include "ucasemap_imp.h"
44 #include "ustr_imp.h"
45 
46 U_NAMESPACE_USE
47 
48 /* UCaseMap service object -------------------------------------------------- */
49 
UCaseMap(const char * localeID,uint32_t opts,UErrorCode * pErrorCode)50 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
51 #if !UCONFIG_NO_BREAK_ITERATION
52         iter(nullptr),
53 #endif
54         caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
55     ucasemap_setLocale(this, localeID, pErrorCode);
56 }
57 
~UCaseMap()58 UCaseMap::~UCaseMap() {
59 #if !UCONFIG_NO_BREAK_ITERATION
60     delete iter;
61 #endif
62 }
63 
64 U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char * locale,uint32_t options,UErrorCode * pErrorCode)65 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
66     if(U_FAILURE(*pErrorCode)) {
67         return nullptr;
68     }
69     UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
70     if(csm==nullptr) {
71         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
72         return nullptr;
73     } else if (U_FAILURE(*pErrorCode)) {
74         delete csm;
75         return nullptr;
76     }
77     return csm;
78 }
79 
80 U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap * csm)81 ucasemap_close(UCaseMap *csm) {
82     delete csm;
83 }
84 
85 U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap * csm)86 ucasemap_getLocale(const UCaseMap *csm) {
87     return csm->locale;
88 }
89 
90 U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap * csm)91 ucasemap_getOptions(const UCaseMap *csm) {
92     return csm->options;
93 }
94 
95 U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)96 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
97     if(U_FAILURE(*pErrorCode)) {
98         return;
99     }
100     if (locale != nullptr && *locale == 0) {
101         csm->locale[0] = 0;
102         csm->caseLocale = UCASE_LOC_ROOT;
103         return;
104     }
105 
106     int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
107     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
108         *pErrorCode=U_ZERO_ERROR;
109         /* we only really need the language code for case mappings */
110         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111     }
112     if(length==sizeof(csm->locale)) {
113         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114     }
115     if(U_SUCCESS(*pErrorCode)) {
116         csm->caseLocale = ucase_getCaseLocale(csm->locale);
117     } else {
118         csm->locale[0]=0;
119         csm->caseLocale = UCASE_LOC_ROOT;
120     }
121 }
122 
123 U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap * csm,uint32_t options,UErrorCode * pErrorCode)124 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
125     if(U_FAILURE(*pErrorCode)) {
126         return;
127     }
128     csm->options=options;
129 }
130 
131 /* UTF-8 string case mappings ----------------------------------------------- */
132 
133 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
134 
135 namespace {
136 
137 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
138 inline UBool
appendResult(int32_t cpLength,int32_t result,const char16_t * s,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)139 appendResult(int32_t cpLength, int32_t result, const char16_t *s,
140              ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
141     U_ASSERT(U_SUCCESS(errorCode));
142 
143     /* decode the result */
144     if(result<0) {
145         /* (not) original code point */
146         if(edits!=nullptr) {
147             edits->addUnchanged(cpLength);
148         }
149         if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
150             ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
151         }
152     } else {
153         if(result<=UCASE_MAX_STRING_LENGTH) {
154             // string: "result" is the UTF-16 length
155             return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
156         } else {
157             ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
158         }
159     }
160     return true;
161 }
162 
163 // See unicode/utf8.h U8_APPEND_UNSAFE().
getTwoByteLead(UChar32 c)164 inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
getTwoByteTrail(UChar32 c)165 inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
166 
167 UChar32 U_CALLCONV
utf8_caseContextIterator(void * context,int8_t dir)168 utf8_caseContextIterator(void *context, int8_t dir) {
169     UCaseContext *csc=(UCaseContext *)context;
170     UChar32 c;
171 
172     if(dir<0) {
173         /* reset for backward iteration */
174         csc->index=csc->cpStart;
175         csc->dir=dir;
176     } else if(dir>0) {
177         /* reset for forward iteration */
178         csc->index=csc->cpLimit;
179         csc->dir=dir;
180     } else {
181         /* continue current iteration direction */
182         dir=csc->dir;
183     }
184 
185     if(dir<0) {
186         if(csc->start<csc->index) {
187             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
188             return c;
189         }
190     } else {
191         if(csc->index<csc->limit) {
192             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
193             return c;
194         }
195     }
196     return U_SENTINEL;
197 }
198 
199 /**
200  * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
201  * caseLocale < 0: Case-folds [srcStart..srcLimit[.
202  */
toLower(int32_t caseLocale,uint32_t options,const uint8_t * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)203 void toLower(int32_t caseLocale, uint32_t options,
204              const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
205              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
206     const int8_t *latinToLower;
207     if (caseLocale == UCASE_LOC_ROOT ||
208             (caseLocale >= 0 ?
209                 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
210                 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
211         latinToLower = LatinCase::TO_LOWER_NORMAL;
212     } else {
213         latinToLower = LatinCase::TO_LOWER_TR_LT;
214     }
215     const UTrie2 *trie = ucase_getTrie();
216     int32_t prev = srcStart;
217     int32_t srcIndex = srcStart;
218     for (;;) {
219         // fast path for simple cases
220         int32_t cpStart;
221         UChar32 c;
222         for (;;) {
223             if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
224                 c = U_SENTINEL;
225                 break;
226             }
227             uint8_t lead = src[srcIndex++];
228             if (lead <= 0x7f) {
229                 int8_t d = latinToLower[lead];
230                 if (d == LatinCase::EXC) {
231                     cpStart = srcIndex - 1;
232                     c = lead;
233                     break;
234                 }
235                 if (d == 0) { continue; }
236                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
237                                               sink, options, edits, errorCode);
238                 char ascii = (char)(lead + d);
239                 sink.Append(&ascii, 1);
240                 if (edits != nullptr) {
241                     edits->addReplace(1, 1);
242                 }
243                 prev = srcIndex;
244                 continue;
245             } else if (lead < 0xe3) {
246                 uint8_t t;
247                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
248                         (t = src[srcIndex] - 0x80) <= 0x3f) {
249                     // U+0080..U+017F
250                     ++srcIndex;
251                     c = ((lead - 0xc0) << 6) | t;
252                     int8_t d = latinToLower[c];
253                     if (d == LatinCase::EXC) {
254                         cpStart = srcIndex - 2;
255                         break;
256                     }
257                     if (d == 0) { continue; }
258                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
259                                                   sink, options, edits, errorCode);
260                     ByteSinkUtil::appendTwoBytes(c + d, sink);
261                     if (edits != nullptr) {
262                         edits->addReplace(2, 2);
263                     }
264                     prev = srcIndex;
265                     continue;
266                 }
267             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
268                     (srcIndex + 2) <= srcLimit &&
269                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
270                 // most of CJK: no case mappings
271                 srcIndex += 2;
272                 continue;
273             }
274             cpStart = --srcIndex;
275             U8_NEXT(src, srcIndex, srcLimit, c);
276             if (c < 0) {
277                 // ill-formed UTF-8
278                 continue;
279             }
280             uint16_t props = UTRIE2_GET16(trie, c);
281             if (UCASE_HAS_EXCEPTION(props)) { break; }
282             int32_t delta;
283             if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
284                 continue;
285             }
286             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
287                                           sink, options, edits, errorCode);
288             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
289             prev = srcIndex;
290         }
291         if (c < 0) {
292             break;
293         }
294         // slow path
295         const char16_t *s;
296         if (caseLocale >= 0) {
297             csc->cpStart = cpStart;
298             csc->cpLimit = srcIndex;
299             c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
300         } else {
301             c = ucase_toFullFolding(c, &s, options);
302         }
303         if (c >= 0) {
304             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
305                                           sink, options, edits, errorCode);
306             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
307             prev = srcIndex;
308         }
309     }
310     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
311                                   sink, options, edits, errorCode);
312 }
313 
toUpper(int32_t caseLocale,uint32_t options,const uint8_t * src,UCaseContext * csc,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)314 void toUpper(int32_t caseLocale, uint32_t options,
315              const uint8_t *src, UCaseContext *csc, int32_t srcLength,
316              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
317     const int8_t *latinToUpper;
318     if (caseLocale == UCASE_LOC_TURKISH) {
319         latinToUpper = LatinCase::TO_UPPER_TR;
320     } else {
321         latinToUpper = LatinCase::TO_UPPER_NORMAL;
322     }
323     const UTrie2 *trie = ucase_getTrie();
324     int32_t prev = 0;
325     int32_t srcIndex = 0;
326     for (;;) {
327         // fast path for simple cases
328         int32_t cpStart;
329         UChar32 c;
330         for (;;) {
331             if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
332                 c = U_SENTINEL;
333                 break;
334             }
335             uint8_t lead = src[srcIndex++];
336             if (lead <= 0x7f) {
337                 int8_t d = latinToUpper[lead];
338                 if (d == LatinCase::EXC) {
339                     cpStart = srcIndex - 1;
340                     c = lead;
341                     break;
342                 }
343                 if (d == 0) { continue; }
344                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
345                                               sink, options, edits, errorCode);
346                 char ascii = (char)(lead + d);
347                 sink.Append(&ascii, 1);
348                 if (edits != nullptr) {
349                     edits->addReplace(1, 1);
350                 }
351                 prev = srcIndex;
352                 continue;
353             } else if (lead < 0xe3) {
354                 uint8_t t;
355                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
356                         (t = src[srcIndex] - 0x80) <= 0x3f) {
357                     // U+0080..U+017F
358                     ++srcIndex;
359                     c = ((lead - 0xc0) << 6) | t;
360                     int8_t d = latinToUpper[c];
361                     if (d == LatinCase::EXC) {
362                         cpStart = srcIndex - 2;
363                         break;
364                     }
365                     if (d == 0) { continue; }
366                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
367                                                   sink, options, edits, errorCode);
368                     ByteSinkUtil::appendTwoBytes(c + d, sink);
369                     if (edits != nullptr) {
370                         edits->addReplace(2, 2);
371                     }
372                     prev = srcIndex;
373                     continue;
374                 }
375             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
376                     (srcIndex + 2) <= srcLength &&
377                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
378                 // most of CJK: no case mappings
379                 srcIndex += 2;
380                 continue;
381             }
382             cpStart = --srcIndex;
383             U8_NEXT(src, srcIndex, srcLength, c);
384             if (c < 0) {
385                 // ill-formed UTF-8
386                 continue;
387             }
388             uint16_t props = UTRIE2_GET16(trie, c);
389             if (UCASE_HAS_EXCEPTION(props)) { break; }
390             int32_t delta;
391             if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
392                 continue;
393             }
394             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
395                                           sink, options, edits, errorCode);
396             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
397             prev = srcIndex;
398         }
399         if (c < 0) {
400             break;
401         }
402         // slow path
403         csc->cpStart = cpStart;
404         csc->cpLimit = srcIndex;
405         const char16_t *s;
406         c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
407         if (c >= 0) {
408             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
409                                           sink, options, edits, errorCode);
410             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
411             prev = srcIndex;
412         }
413     }
414     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
415                                   sink, options, edits, errorCode);
416 }
417 
418 }  // namespace
419 
420 #if !UCONFIG_NO_BREAK_ITERATION
421 
422 namespace {
423 
// The two UTF-8 bytes of U+0301 (combining acute accent), taken from a u8 string literal.
constexpr uint8_t ACUTE_BYTE0 = static_cast<uint8_t>(u8"\u0301"[0]);
constexpr uint8_t ACUTE_BYTE1 = static_cast<uint8_t>(u8"\u0301"[1]);
427 
428 /**
429  * Input: c is a letter I with or without acute accent.
430  * start is the index in src after c, and is less than segmentLimit.
431  * If a plain i/I is followed by a plain j/J,
432  * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
433  * then we output accordingly.
434  *
435  * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
436  */
maybeTitleDutchIJ(const uint8_t * src,UChar32 c,int32_t start,int32_t segmentLimit,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)437 int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
438                           ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
439     U_ASSERT(start < segmentLimit);
440 
441     int32_t index = start;
442     bool withAcute = false;
443 
444     // If the conditions are met, then the following variables tell us what to output.
445     int32_t unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
446     bool doTitleJ = false;  // true if the j needs to be titlecased
447     int32_t unchanged2 = 0;  // after the j (0 or 1)
448 
449     // next character after the first letter
450     UChar32 c2;
451     c2 = src[index++];
452 
453     // Is the first letter an i/I with accent?
454     if (c == u'I') {
455         if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
456             withAcute = true;
457             unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
458             if (index == segmentLimit) { return start; }
459             c2 = src[index++];
460         }
461     } else {  // Í
462         withAcute = true;
463     }
464 
465     // Is the next character a j/J?
466     if (c2 == u'j') {
467         doTitleJ = true;
468     } else if (c2 == u'J') {
469         ++unchanged1;
470     } else {
471         return start;
472     }
473 
474     // A plain i/I must be followed by a plain j/J.
475     // An i/I with acute must be followed by a j/J with acute.
476     if (withAcute) {
477         if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
478             return start;
479         }
480         if (doTitleJ) {
481             unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
482         } else {
483             unchanged1 = unchanged1 + 2;    // ACUTE is 2 code units in UTF-8
484         }
485     }
486 
487     // There must not be another combining mark.
488     if (index < segmentLimit) {
489         int32_t cp;
490         int32_t i = index;
491         U8_NEXT(src, i, segmentLimit, cp);
492         uint32_t typeMask = U_GET_GC_MASK(cp);
493         if ((typeMask & U_GC_M_MASK) != 0) {
494             return start;
495         }
496     }
497 
498     // Output the rest of the Dutch IJ.
499     ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
500     start += unchanged1;
501     if (doTitleJ) {
502         ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
503         ++start;
504     }
505     ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
506 
507     U_ASSERT(start + unchanged2 == index);
508     return index;
509 }
510 
511 }  // namespace
512 
513 U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(int32_t caseLocale,uint32_t options,BreakIterator * iter,const uint8_t * src,int32_t srcLength,ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)514 ucasemap_internalUTF8ToTitle(
515         int32_t caseLocale, uint32_t options, BreakIterator *iter,
516         const uint8_t *src, int32_t srcLength,
517         ByteSink &sink, icu::Edits *edits,
518         UErrorCode &errorCode) {
519     if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
520         return;
521     }
522 
523     /* set up local variables */
524     UCaseContext csc=UCASECONTEXT_INITIALIZER;
525     csc.p=(void *)src;
526     csc.limit=srcLength;
527     int32_t prev=0;
528     UBool isFirstIndex=true;
529 
530     /* titlecasing loop */
531     while(prev<srcLength) {
532         /* find next index where to titlecase */
533         int32_t index;
534         if(isFirstIndex) {
535             isFirstIndex=false;
536             index=iter->first();
537         } else {
538             index=iter->next();
539         }
540         if(index==UBRK_DONE || index>srcLength) {
541             index=srcLength;
542         }
543 
544         /*
545          * Segment [prev..index[ into 3 parts:
546          * a) skipped characters (copy as-is) [prev..titleStart[
547          * b) first letter (titlecase)              [titleStart..titleLimit[
548          * c) subsequent characters (lowercase)                 [titleLimit..index[
549          */
550         if(prev<index) {
551             /* find and copy skipped characters [prev..titleStart[ */
552             int32_t titleStart=prev;
553             int32_t titleLimit=prev;
554             UChar32 c;
555             U8_NEXT(src, titleLimit, index, c);
556             if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
557                 // Adjust the titlecasing index to the next cased character,
558                 // or to the next letter/number/symbol/private use.
559                 // Stop with titleStart<titleLimit<=index
560                 // if there is a character to be titlecased,
561                 // or else stop with titleStart==titleLimit==index.
562                 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
563                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
564                     titleStart=titleLimit;
565                     if(titleLimit==index) {
566                         break;
567                     }
568                     U8_NEXT(src, titleLimit, index, c);
569                 }
570                 if (prev < titleStart) {
571                     if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
572                                                        sink, options, edits, errorCode)) {
573                         return;
574                     }
575                 }
576             }
577 
578             if(titleStart<titleLimit) {
579                 /* titlecase c which is from [titleStart..titleLimit[ */
580                 if(c>=0) {
581                     csc.cpStart=titleStart;
582                     csc.cpLimit=titleLimit;
583                     const char16_t *s;
584                     c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
585                     if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
586                         return;
587                     }
588                 } else {
589                     // Malformed UTF-8.
590                     if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
591                                                        sink, options, edits, errorCode)) {
592                         return;
593                     }
594                 }
595 
596                 /* Special case Dutch IJ titlecasing */
597                 if (titleLimit < index &&
598                     caseLocale == UCASE_LOC_DUTCH) {
599                     if (c < 0) {
600                         c = ~c;
601                     }
602 
603                     if (c == u'I' || c == u'Í') {
604                         titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
605                     }
606                 }
607 
608                 /* lowercase [titleLimit..index[ */
609                 if(titleLimit<index) {
610                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {
611                         /* Normal operation: Lowercase the rest of the word. */
612                         toLower(caseLocale, options,
613                                 src, &csc, titleLimit, index,
614                                 sink, edits, errorCode);
615                         if(U_FAILURE(errorCode)) {
616                             return;
617                         }
618                     } else {
619                         /* Optionally just copy the rest of the word unchanged. */
620                         if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
621                                                            sink, options, edits, errorCode)) {
622                             return;
623                         }
624                     }
625                 }
626             }
627         }
628 
629         prev=index;
630     }
631 }
632 
633 #endif
634 
635 U_NAMESPACE_BEGIN
636 namespace GreekUpper {
637 
isFollowedByCasedLetter(const uint8_t * s,int32_t i,int32_t length)638 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
639     while (i < length) {
640         UChar32 c;
641         U8_NEXT(s, i, length, c);
642         int32_t type = ucase_getTypeOrIgnorable(c);
643         if ((type & UCASE_IGNORABLE) != 0) {
644             // Case-ignorable, continue with the loop.
645         } else if (type != UCASE_NONE) {
646             return true;  // Followed by cased letter.
647         } else {
648             return false;  // Uncased and not case-ignorable.
649         }
650     }
651     return false;  // Not followed by cased letter.
652 }
653 
654 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
toUpper(uint32_t options,const uint8_t * src,int32_t srcLength,ByteSink & sink,Edits * edits,UErrorCode & errorCode)655 void toUpper(uint32_t options,
656              const uint8_t *src, int32_t srcLength,
657              ByteSink &sink, Edits *edits,
658              UErrorCode &errorCode) {
659     uint32_t state = 0;
660     for (int32_t i = 0; i < srcLength;) {
661         int32_t nextIndex = i;
662         UChar32 c;
663         U8_NEXT(src, nextIndex, srcLength, c);
664         uint32_t nextState = 0;
665         int32_t type = ucase_getTypeOrIgnorable(c);
666         if ((type & UCASE_IGNORABLE) != 0) {
667             // c is case-ignorable
668             nextState |= (state & AFTER_CASED);
669         } else if (type != UCASE_NONE) {
670             // c is cased
671             nextState |= AFTER_CASED;
672         }
673         uint32_t data = getLetterData(c);
674         if (data > 0) {
675             uint32_t upper = data & UPPER_MASK;
676             // Add a dialytika to this iota or ypsilon vowel
677             // if we removed a tonos from the previous vowel,
678             // and that previous vowel did not also have (or gain) a dialytika.
679             // Adding one only to the final vowel in a longer sequence
680             // (which does not occur in normal writing) would require lookahead.
681             // Set the same flag as for preserving an existing dialytika.
682             if ((data & HAS_VOWEL) != 0 &&
683                 (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
684                     0 &&
685                 (upper == 0x399 || upper == 0x3A5)) {
686                 data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
687                                                                            : HAS_COMBINING_DIALYTIKA;
688             }
689             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
690             if ((data & HAS_YPOGEGRAMMENI) != 0) {
691                 numYpogegrammeni = 1;
692             }
693             const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
694             // Skip combining diacritics after this Greek letter.
695             int32_t nextNextIndex = nextIndex;
696             while (nextIndex < srcLength) {
697                 UChar32 c2;
698                 U8_NEXT(src, nextNextIndex, srcLength, c2);
699                 uint32_t diacriticData = getDiacriticData(c2);
700                 if (diacriticData != 0) {
701                     data |= diacriticData;
702                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
703                         ++numYpogegrammeni;
704                     }
705                     nextIndex = nextNextIndex;
706                 } else {
707                     break;  // not a Greek diacritic
708                 }
709             }
710             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
711                 nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
712                                                   : AFTER_VOWEL_WITH_COMBINING_ACCENT;
713             }
714             // Map according to Greek rules.
715             UBool addTonos = false;
716             if (upper == 0x397 &&
717                     (data & HAS_ACCENT) != 0 &&
718                     numYpogegrammeni == 0 &&
719                     (state & AFTER_CASED) == 0 &&
720                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
721                 // Keep disjunctive "or" with (only) a tonos.
722                 // We use the same "word boundary" conditions as for the Final_Sigma test.
723                 if (hasPrecomposedAccent) {
724                     upper = 0x389;  // Preserve the precomposed form.
725                 } else {
726                     addTonos = true;
727                 }
728             } else if ((data & HAS_DIALYTIKA) != 0) {
729                 // Preserve a vowel with dialytika in precomposed form if it exists.
730                 if (upper == 0x399) {
731                     upper = 0x3AA;
732                     data &= ~HAS_EITHER_DIALYTIKA;
733                 } else if (upper == 0x3A5) {
734                     upper = 0x3AB;
735                     data &= ~HAS_EITHER_DIALYTIKA;
736                 }
737             }
738 
739             UBool change;
740             if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
741                 change = true;  // common, simple usage
742             } else {
743                 // Find out first whether we are changing the text.
744                 U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
745                 change = (i + 2) > nextIndex ||
746                         src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
747                         numYpogegrammeni > 0;
748                 int32_t i2 = i + 2;
749                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
750                     change |= (i2 + 2) > nextIndex ||
751                             src[i2] != (uint8_t)u8"\u0308"[0] ||
752                             src[i2 + 1] != (uint8_t)u8"\u0308"[1];
753                     i2 += 2;
754                 }
755                 if (addTonos) {
756                     change |= (i2 + 2) > nextIndex ||
757                             src[i2] != (uint8_t)u8"\u0301"[0] ||
758                             src[i2 + 1] != (uint8_t)u8"\u0301"[1];
759                     i2 += 2;
760                 }
761                 int32_t oldLength = nextIndex - i;
762                 int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
763                 change |= oldLength != newLength;
764                 if (change) {
765                     if (edits != nullptr) {
766                         edits->addReplace(oldLength, newLength);
767                     }
768                 } else {
769                     if (edits != nullptr) {
770                         edits->addUnchanged(oldLength);
771                     }
772                     // Write unchanged text?
773                     change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
774                 }
775             }
776 
777             if (change) {
778                 ByteSinkUtil::appendTwoBytes(upper, sink);
779                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
780                     sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
781                 }
782                 if (addTonos) {
783                     sink.AppendU8(u8"\u0301", 2);
784                 }
785                 while (numYpogegrammeni > 0) {
786                     sink.AppendU8(u8"\u0399", 2);
787                     --numYpogegrammeni;
788                 }
789             }
790         } else if(c>=0) {
791             const char16_t *s;
792             c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
793             if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
794                 return;
795             }
796         } else {
797             // Malformed UTF-8.
798             if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
799                                                sink, options, edits, errorCode)) {
800                 return;
801             }
802         }
803         i = nextIndex;
804         state = nextState;
805     }
806 }
807 
808 }  // namespace GreekUpper
809 U_NAMESPACE_END
810 
811 static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)812 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
813                              const uint8_t *src, int32_t srcLength,
814                              icu::ByteSink &sink, icu::Edits *edits,
815                              UErrorCode &errorCode) {
816     UCaseContext csc=UCASECONTEXT_INITIALIZER;
817     csc.p=(void *)src;
818     csc.limit=srcLength;
819     toLower(
820         caseLocale, options,
821         src, &csc, 0, srcLength,
822         sink, edits, errorCode);
823 }
824 
825 static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)826 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
827                              const uint8_t *src, int32_t srcLength,
828                              icu::ByteSink &sink, icu::Edits *edits,
829                              UErrorCode &errorCode) {
830     if (caseLocale == UCASE_LOC_GREEK) {
831         GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
832     } else {
833         UCaseContext csc=UCASECONTEXT_INITIALIZER;
834         csc.p=(void *)src;
835         csc.limit=srcLength;
836         toUpper(
837             caseLocale, options,
838             src, &csc, srcLength,
839             sink, edits, errorCode);
840     }
841 }
842 
843 static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)844 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
845                           const uint8_t *src, int32_t srcLength,
846                           icu::ByteSink &sink, icu::Edits *edits,
847                           UErrorCode &errorCode) {
848     toLower(
849         -1, options,
850         src, nullptr, 0, srcLength,
851         sink, edits, errorCode);
852 }
853 
854 void
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)855 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
856                  const char *src, int32_t srcLength,
857                  UTF8CaseMapper *stringCaseMapper,
858                  icu::ByteSink &sink, icu::Edits *edits,
859                  UErrorCode &errorCode) {
860     /* check argument values */
861     if (U_FAILURE(errorCode)) {
862         return;
863     }
864     if ((src == nullptr && srcLength != 0) || srcLength < -1) {
865         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
866         return;
867     }
868 
869     // Get the string length.
870     if (srcLength == -1) {
871         srcLength = (int32_t)uprv_strlen((const char *)src);
872     }
873 
874     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
875         edits->reset();
876     }
877     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
878                      (const uint8_t *)src, srcLength, sink, edits, errorCode);
879     sink.Flush();
880     if (U_SUCCESS(errorCode)) {
881         if (edits != nullptr) {
882             edits->copyErrorTo(errorCode);
883         }
884     }
885 }
886 
887 int32_t
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)888 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
889                  char *dest, int32_t destCapacity,
890                  const char *src, int32_t srcLength,
891                  UTF8CaseMapper *stringCaseMapper,
892                  icu::Edits *edits,
893                  UErrorCode &errorCode) {
894     /* check argument values */
895     if(U_FAILURE(errorCode)) {
896         return 0;
897     }
898     if( destCapacity<0 ||
899         (dest==nullptr && destCapacity>0) ||
900         (src==nullptr && srcLength!=0) || srcLength<-1
901     ) {
902         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
903         return 0;
904     }
905 
906     /* get the string length */
907     if(srcLength==-1) {
908         srcLength=(int32_t)uprv_strlen((const char *)src);
909     }
910 
911     /* check for overlapping source and destination */
912     if( dest!=nullptr &&
913         ((src>=dest && src<(dest+destCapacity)) ||
914          (dest>=src && dest<(src+srcLength)))
915     ) {
916         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
917         return 0;
918     }
919 
920     CheckedArrayByteSink sink(dest, destCapacity);
921     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
922         edits->reset();
923     }
924     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
925                      (const uint8_t *)src, srcLength, sink, edits, errorCode);
926     sink.Flush();
927     if (U_SUCCESS(errorCode)) {
928         if (sink.Overflowed()) {
929             errorCode = U_BUFFER_OVERFLOW_ERROR;
930         } else if (edits != nullptr) {
931             edits->copyErrorTo(errorCode);
932         }
933     }
934     return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
935 }
936 
937 /* public API functions */
938 
939 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)940 ucasemap_utf8ToLower(const UCaseMap *csm,
941                      char *dest, int32_t destCapacity,
942                      const char *src, int32_t srcLength,
943                      UErrorCode *pErrorCode) {
944     return ucasemap_mapUTF8(
945         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
946         dest, destCapacity,
947         src, srcLength,
948         ucasemap_internalUTF8ToLower, nullptr, *pErrorCode);
949 }
950 
951 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)952 ucasemap_utf8ToUpper(const UCaseMap *csm,
953                      char *dest, int32_t destCapacity,
954                      const char *src, int32_t srcLength,
955                      UErrorCode *pErrorCode) {
956     return ucasemap_mapUTF8(
957         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
958         dest, destCapacity,
959         src, srcLength,
960         ucasemap_internalUTF8ToUpper, nullptr, *pErrorCode);
961 }
962 
963 U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)964 ucasemap_utf8FoldCase(const UCaseMap *csm,
965                       char *dest, int32_t destCapacity,
966                       const char *src, int32_t srcLength,
967                       UErrorCode *pErrorCode) {
968     return ucasemap_mapUTF8(
969         UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
970         dest, destCapacity,
971         src, srcLength,
972         ucasemap_internalUTF8Fold, nullptr, *pErrorCode);
973 }
974 
975 U_NAMESPACE_BEGIN
976 
utf8ToLower(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)977 void CaseMap::utf8ToLower(
978         const char *locale, uint32_t options,
979         StringPiece src, ByteSink &sink, Edits *edits,
980         UErrorCode &errorCode) {
981     ucasemap_mapUTF8(
982         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
983         src.data(), src.length(),
984         ucasemap_internalUTF8ToLower, sink, edits, errorCode);
985 }
986 
utf8ToUpper(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)987 void CaseMap::utf8ToUpper(
988         const char *locale, uint32_t options,
989         StringPiece src, ByteSink &sink, Edits *edits,
990         UErrorCode &errorCode) {
991     ucasemap_mapUTF8(
992         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
993         src.data(), src.length(),
994         ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
995 }
996 
utf8Fold(uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)997 void CaseMap::utf8Fold(
998         uint32_t options,
999         StringPiece src, ByteSink &sink, Edits *edits,
1000         UErrorCode &errorCode) {
1001     ucasemap_mapUTF8(
1002         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1003         src.data(), src.length(),
1004         ucasemap_internalUTF8Fold, sink, edits, errorCode);
1005 }
1006 
utf8ToLower(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1007 int32_t CaseMap::utf8ToLower(
1008         const char *locale, uint32_t options,
1009         const char *src, int32_t srcLength,
1010         char *dest, int32_t destCapacity, Edits *edits,
1011         UErrorCode &errorCode) {
1012     return ucasemap_mapUTF8(
1013         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1014         dest, destCapacity,
1015         src, srcLength,
1016         ucasemap_internalUTF8ToLower, edits, errorCode);
1017 }
1018 
utf8ToUpper(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1019 int32_t CaseMap::utf8ToUpper(
1020         const char *locale, uint32_t options,
1021         const char *src, int32_t srcLength,
1022         char *dest, int32_t destCapacity, Edits *edits,
1023         UErrorCode &errorCode) {
1024     return ucasemap_mapUTF8(
1025         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1026         dest, destCapacity,
1027         src, srcLength,
1028         ucasemap_internalUTF8ToUpper, edits, errorCode);
1029 }
1030 
utf8Fold(uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1031 int32_t CaseMap::utf8Fold(
1032         uint32_t options,
1033         const char *src, int32_t srcLength,
1034         char *dest, int32_t destCapacity, Edits *edits,
1035         UErrorCode &errorCode) {
1036     return ucasemap_mapUTF8(
1037         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1038         dest, destCapacity,
1039         src, srcLength,
1040         ucasemap_internalUTF8Fold, edits, errorCode);
1041 }
1042 
1043 U_NAMESPACE_END
1044