1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2005-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucasemap.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2005may06
16 * created by: Markus W. Scherer
17 *
18 * Case mapping service object and functions using it.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/bytestream.h"
24 #include "unicode/casemap.h"
25 #include "unicode/edits.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/ubrk.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ustring.h"
31 #include "unicode/ucasemap.h"
32 #if !UCONFIG_NO_BREAK_ITERATION
33 #include "unicode/utext.h"
34 #endif
35 #include "unicode/utf.h"
36 #include "unicode/utf8.h"
37 #include "unicode/utf16.h"
38 #include "bytesinkutil.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uassert.h"
42 #include "ucase.h"
43 #include "ucasemap_imp.h"
44 #include "ustr_imp.h"
45
46 U_NAMESPACE_USE
47
48 /* UCaseMap service object -------------------------------------------------- */
49
/**
 * Constructs a case map service object.
 *
 * The break iterator pointer (only present when break iteration is compiled in)
 * starts out null, the case locale starts out as UCASE_LOC_UNKNOWN, and the
 * options are stored as given. The locale fields are then filled in by
 * ucasemap_setLocale(), which reports any failure through pErrorCode.
 *
 * @param localeID   locale ID forwarded to ucasemap_setLocale()
 * @param opts       option bits stored in the object
 * @param pErrorCode in/out ICU error code
 */
UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
#if !UCONFIG_NO_BREAK_ITERATION
        iter(nullptr),
#endif
        caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
    ucasemap_setLocale(this, localeID, pErrorCode);
}
57
/**
 * Destroys the case map object. When break iteration is compiled in,
 * the break iterator held in iter is deleted as well.
 */
UCaseMap::~UCaseMap() {
#if !UCONFIG_NO_BREAK_ITERATION
    delete iter;
#endif
}
63
64 U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char * locale,uint32_t options,UErrorCode * pErrorCode)65 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
66 if(U_FAILURE(*pErrorCode)) {
67 return nullptr;
68 }
69 UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
70 if(csm==nullptr) {
71 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
72 return nullptr;
73 } else if (U_FAILURE(*pErrorCode)) {
74 delete csm;
75 return nullptr;
76 }
77 return csm;
78 }
79
/* Deletes the UCaseMap object; deleting a null pointer is a no-op. */
U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap *csm) {
    delete csm;
}
84
/*
 * Returns a pointer to the locale ID string stored inside csm.
 * csm is dereferenced without a null check.
 */
U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap *csm) {
    return csm->locale;
}
89
/*
 * Returns the option bits stored inside csm.
 * csm is dereferenced without a null check.
 */
U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap *csm) {
    return csm->options;
}
94
95 U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)96 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
97 if(U_FAILURE(*pErrorCode)) {
98 return;
99 }
100 if (locale != nullptr && *locale == 0) {
101 csm->locale[0] = 0;
102 csm->caseLocale = UCASE_LOC_ROOT;
103 return;
104 }
105
106 int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
107 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
108 *pErrorCode=U_ZERO_ERROR;
109 /* we only really need the language code for case mappings */
110 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111 }
112 if(length==sizeof(csm->locale)) {
113 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114 }
115 if(U_SUCCESS(*pErrorCode)) {
116 csm->caseLocale = ucase_getCaseLocale(csm->locale);
117 } else {
118 csm->locale[0]=0;
119 csm->caseLocale = UCASE_LOC_ROOT;
120 }
121 }
122
123 U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap * csm,uint32_t options,UErrorCode * pErrorCode)124 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
125 if(U_FAILURE(*pErrorCode)) {
126 return;
127 }
128 csm->options=options;
129 }
130
131 /* UTF-8 string case mappings ----------------------------------------------- */
132
133 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
134
135 namespace {
136
137 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
138 inline UBool
appendResult(int32_t cpLength,int32_t result,const char16_t * s,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)139 appendResult(int32_t cpLength, int32_t result, const char16_t *s,
140 ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
141 U_ASSERT(U_SUCCESS(errorCode));
142
143 /* decode the result */
144 if(result<0) {
145 /* (not) original code point */
146 if(edits!=nullptr) {
147 edits->addUnchanged(cpLength);
148 }
149 if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
150 ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
151 }
152 } else {
153 if(result<=UCASE_MAX_STRING_LENGTH) {
154 // string: "result" is the UTF-16 length
155 return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
156 } else {
157 ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
158 }
159 }
160 return true;
161 }
162
163 // See unicode/utf8.h U8_APPEND_UNSAFE().
getTwoByteLead(UChar32 c)164 inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
getTwoByteTrail(UChar32 c)165 inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
166
167 UChar32 U_CALLCONV
utf8_caseContextIterator(void * context,int8_t dir)168 utf8_caseContextIterator(void *context, int8_t dir) {
169 UCaseContext *csc=(UCaseContext *)context;
170 UChar32 c;
171
172 if(dir<0) {
173 /* reset for backward iteration */
174 csc->index=csc->cpStart;
175 csc->dir=dir;
176 } else if(dir>0) {
177 /* reset for forward iteration */
178 csc->index=csc->cpLimit;
179 csc->dir=dir;
180 } else {
181 /* continue current iteration direction */
182 dir=csc->dir;
183 }
184
185 if(dir<0) {
186 if(csc->start<csc->index) {
187 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
188 return c;
189 }
190 } else {
191 if(csc->index<csc->limit) {
192 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
193 return c;
194 }
195 }
196 return U_SENTINEL;
197 }
198
/**
 * Lowercases or case-folds the UTF-8 bytes src[srcStart..srcLimit[ and writes the result to sink.
 *
 * caseLocale >= 0: Lowercases [srcStart..srcLimit[; context-sensitive mappings look at the
 *                  surrounding text through csc (see utf8_caseContextIterator).
 * caseLocale < 0: Case-folds [srcStart..srcLimit[; csc is not dereferenced in this mode.
 *
 * Unchanged stretches are accumulated between "prev" and the current position and are
 * flushed via ByteSinkUtil::appendUnchanged() right before each changed piece and once at the end.
 * Ill-formed UTF-8 is left in the unchanged stretch.
 *
 * @param caseLocale case locale constant, or a negative value for case folding
 * @param options    option bits; passed to the append helpers and, for folding, to ucase_toFullFolding()
 * @param src        UTF-8 text
 * @param csc        case context for full lowercasing; only used when caseLocale >= 0
 * @param srcStart   index of the first byte to map
 * @param srcLimit   index after the last byte to map
 * @param sink       receives the output bytes
 * @param edits      optional edit recorder, may be nullptr
 * @param errorCode  in/out ICU error code; a failure ends the loop
 */
void toLower(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    // Pick the per-code-point delta table for the Latin fast path.
    // Turkish/Lithuanian lowercasing and non-default folding options use the TR_LT table.
    const int8_t *latinToLower;
    if (caseLocale == UCASE_LOC_ROOT ||
            (caseLocale >= 0 ?
                !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
                (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
        latinToLower = LatinCase::TO_LOWER_NORMAL;
    } else {
        latinToLower = LatinCase::TO_LOWER_TR_LT;
    }
    const UTrie2 *trie = ucase_getTrie();
    int32_t prev = srcStart;  // start of the unchanged text that has not been written yet
    int32_t srcIndex = srcStart;
    for (;;) {
        // fast path for simple cases
        int32_t cpStart;
        UChar32 c;
        for (;;) {
            // End of input or an earlier failure: leave both loops via c < 0.
            if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: table lookup yields a delta, 0 (no change), or EXC (needs the slow path).
                int8_t d = latinToLower[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = (char)(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                // t holds the trail byte minus 0x80; the unsigned wrap-around
                // makes the <= 0x3f test reject bytes below 0x80 as well.
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToLower[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Other lead bytes below 0xe3 fall through to the generic decoding below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLimit &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: back up to the lead byte and decode the whole code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLimit, c);
            if (c < 0) {
                // ill-formed UTF-8
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            break;
        }
        // slow path
        const char16_t *s;
        if (caseLocale >= 0) {
            // Tell the context iterator where the current code point sits.
            csc->cpStart = cpStart;
            csc->cpLimit = srcIndex;
            c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
        } else {
            c = ucase_toFullFolding(c, &s, options);
        }
        // A negative result means "unchanged": the code point stays in the pending unchanged text.
        if (c >= 0) {
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush the trailing unchanged text.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}
313
/**
 * Uppercases the UTF-8 bytes src[0..srcLength[ and writes the result to sink.
 *
 * Simple cases are handled by a table-driven fast path; code points with exception data
 * go through ucase_toFullUpper(), which can look at surrounding text through csc
 * (see utf8_caseContextIterator).
 *
 * Unchanged stretches are accumulated between "prev" and the current position and are
 * flushed via ByteSinkUtil::appendUnchanged() right before each changed piece and once at the end.
 * Ill-formed UTF-8 is left in the unchanged stretch.
 *
 * @param caseLocale case locale constant; UCASE_LOC_TURKISH selects a different Latin table
 * @param options    option bits passed to the append helpers
 * @param src        UTF-8 text
 * @param csc        case context used for full uppercasing
 * @param srcLength  number of bytes in src
 * @param sink       receives the output bytes
 * @param edits      optional edit recorder, may be nullptr
 * @param errorCode  in/out ICU error code; a failure ends the loop
 */
void toUpper(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcLength,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    // Pick the per-code-point delta table for the Latin fast path.
    const int8_t *latinToUpper;
    if (caseLocale == UCASE_LOC_TURKISH) {
        latinToUpper = LatinCase::TO_UPPER_TR;
    } else {
        latinToUpper = LatinCase::TO_UPPER_NORMAL;
    }
    const UTrie2 *trie = ucase_getTrie();
    int32_t prev = 0;  // start of the unchanged text that has not been written yet
    int32_t srcIndex = 0;
    for (;;) {
        // fast path for simple cases
        int32_t cpStart;
        UChar32 c;
        for (;;) {
            // End of input or an earlier failure: leave both loops via c < 0.
            if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: table lookup yields a delta, 0 (no change), or EXC (needs the slow path).
                int8_t d = latinToUpper[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = (char)(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                // t holds the trail byte minus 0x80; the unsigned wrap-around
                // makes the <= 0x3f test reject bytes below 0x80 as well.
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToUpper[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Other lead bytes below 0xe3 fall through to the generic decoding below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLength &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: back up to the lead byte and decode the whole code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLength, c);
            if (c < 0) {
                // ill-formed UTF-8
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            break;
        }
        // slow path
        // Tell the context iterator where the current code point sits.
        csc->cpStart = cpStart;
        csc->cpLimit = srcIndex;
        const char16_t *s;
        c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
        // A negative result means "unchanged": the code point stays in the pending unchanged text.
        if (c >= 0) {
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush the trailing unchanged text.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}
417
418 } // namespace
419
420 #if !UCONFIG_NO_BREAK_ITERATION
421
422 namespace {
423
// First byte of the two-byte UTF-8 encoding of U+0301 (the acute accent).
constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];

// Second byte of the two-byte UTF-8 encoding of U+0301 (the acute accent).
constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];
427
/**
 * Handles the second letter of a Dutch "IJ" digraph during titlecasing.
 *
 * Input: c is a letter I with or without acute accent.
 * start is the index in src after c, and is less than segmentLimit.
 * If a plain i/I is followed by a plain j/J,
 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
 * then we output accordingly: a lowercase j is written as J, everything else
 * in the sequence is written as unchanged text.
 * Nothing is written if the sequence does not match, or if it is followed by a combining mark.
 *
 * @param src          UTF-8 text
 * @param c            the already titlecased first letter (I or I with acute)
 * @param start        index in src right after c
 * @param segmentLimit index after the last byte of the current segment
 * @param sink         receives the output bytes
 * @param options      option bits passed to ByteSinkUtil::appendUnchanged()
 * @param edits        optional edit recorder, may be nullptr
 * @param errorCode    in/out ICU error code
 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
 */
int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
                          ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
    U_ASSERT(start < segmentLimit);

    int32_t index = start;
    bool withAcute = false;

    // If the conditions are met, then the following variables tell us what to output.
    int32_t unchanged1 = 0;  // bytes before the j, or the whole sequence (0..5)
    bool doTitleJ = false;  // true if the j needs to be titlecased
    int32_t unchanged2 = 0;  // bytes after the j (0 or 2)

    // next character after the first letter
    UChar32 c2;
    c2 = src[index++];

    // Is the first letter an i/I with accent?
    if (c == u'I') {
        // Plain I: check for a following combining acute (decomposed form).
        if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
            withAcute = true;
            unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
            if (index == segmentLimit) { return start; }
            c2 = src[index++];
        }
    } else {  // Í
        withAcute = true;
    }

    // Is the next character a j/J?
    if (c2 == u'j') {
        doTitleJ = true;
    } else if (c2 == u'J') {
        ++unchanged1;
    } else {
        return start;
    }

    // A plain i/I must be followed by a plain j/J.
    // An i/I with acute must be followed by a j/J with acute.
    if (withAcute) {
        // Both bytes of the acute must lie inside the segment.
        if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
            return start;
        }
        if (doTitleJ) {
            unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
        } else {
            unchanged1 = unchanged1 + 2;    // ACUTE is 2 code units in UTF-8
        }
    }

    // There must not be another combining mark.
    if (index < segmentLimit) {
        int32_t cp;
        int32_t i = index;  // look ahead without moving index
        U8_NEXT(src, i, segmentLimit, cp);
        uint32_t typeMask = U_GET_GC_MASK(cp);
        if ((typeMask & U_GC_M_MASK) != 0) {
            return start;
        }
    }

    // Output the rest of the Dutch IJ.
    ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
    start += unchanged1;
    if (doTitleJ) {
        ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
        ++start;
    }
    ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);

    U_ASSERT(start + unchanged2 == index);
    return index;
}
510
511 } // namespace
512
/**
 * Titlecases the UTF-8 text src[0..srcLength[ and writes the result to sink.
 *
 * The break iterator supplies the segment boundaries (first(), then next()).
 * A boundary of UBRK_DONE or one beyond srcLength is clamped to srcLength.
 * In each segment the first suitable character is titlecased; the rest is
 * lowercased via toLower(), or copied as is with U_TITLECASE_NO_LOWERCASE.
 * For UCASE_LOC_DUTCH, a j after an initial I is handled by maybeTitleDutchIJ().
 *
 * Returns early, without further output, if the title adjustment options are
 * rejected by ustrcase_checkTitleAdjustmentOptions(), if an append helper
 * returns false, or if toLower() leaves errorCode in a failure state.
 *
 * @param caseLocale case locale constant
 * @param options    titlecasing and general case mapping option bits
 * @param iter       break iterator positioned over the same text; dereferenced without a null check
 * @param src        UTF-8 text
 * @param srcLength  number of bytes in src
 * @param sink       receives the output bytes
 * @param edits      optional edit recorder, may be nullptr
 * @param errorCode  in/out ICU error code
 */
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(
        int32_t caseLocale, uint32_t options, BreakIterator *iter,
        const uint8_t *src, int32_t srcLength,
        ByteSink &sink, icu::Edits *edits,
        UErrorCode &errorCode) {
    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
        return;
    }

    /* set up local variables */
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    csc.p=(void *)src;
    csc.limit=srcLength;
    int32_t prev=0;
    UBool isFirstIndex=true;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        int32_t index;
        if(isFirstIndex) {
            isFirstIndex=false;
            index=iter->first();
        } else {
            index=iter->next();
        }
        if(index==UBRK_DONE || index>srcLength) {
            index=srcLength;
        }

        /*
         * Segment [prev..index[ into 3 parts:
         * a) skipped characters (copy as-is) [prev..titleStart[
         * b) first letter (titlecase)              [titleStart..titleLimit[
         * c) subsequent characters (lowercase)                 [titleLimit..index[
         */
        if(prev<index) {
            /* find and copy skipped characters [prev..titleStart[ */
            int32_t titleStart=prev;
            int32_t titleLimit=prev;
            UChar32 c;
            U8_NEXT(src, titleLimit, index, c);
            if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
                // Adjust the titlecasing index to the next cased character,
                // or to the next letter/number/symbol/private use.
                // Stop with titleStart<titleLimit<=index
                // if there is a character to be titlecased,
                // or else stop with titleStart==titleLimit==index.
                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
                while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
                    titleStart=titleLimit;
                    if(titleLimit==index) {
                        break;
                    }
                    U8_NEXT(src, titleLimit, index, c);
                }
                if (prev < titleStart) {
                    if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                }
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                if(c>=0) {
                    csc.cpStart=titleStart;
                    csc.cpLimit=titleLimit;
                    const char16_t *s;
                    c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
                    if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
                        return;
                    }
                } else {
                    // Malformed UTF-8.
                    if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                }

                /* Special case Dutch IJ titlecasing */
                if (titleLimit < index &&
                    caseLocale == UCASE_LOC_DUTCH) {
                    // A negative mapping result holds the bitwise complement
                    // of the unchanged code point (see appendResult).
                    if (c < 0) {
                        c = ~c;
                    }

                    if (c == u'I' || c == u'Í') {
                        titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
                    }
                }

                /* lowercase [titleLimit..index[ */
                if(titleLimit<index) {
                    if((options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        toLower(caseLocale, options,
                                src, &csc, titleLimit, index,
                                sink, edits, errorCode);
                        if(U_FAILURE(errorCode)) {
                            return;
                        }
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
                                                           sink, options, edits, errorCode)) {
                            return;
                        }
                    }
                }
            }
        }

        prev=index;
    }
}
632
633 #endif
634
635 U_NAMESPACE_BEGIN
636 namespace GreekUpper {
637
isFollowedByCasedLetter(const uint8_t * s,int32_t i,int32_t length)638 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
639 while (i < length) {
640 UChar32 c;
641 U8_NEXT(s, i, length, c);
642 int32_t type = ucase_getTypeOrIgnorable(c);
643 if ((type & UCASE_IGNORABLE) != 0) {
644 // Case-ignorable, continue with the loop.
645 } else if (type != UCASE_NONE) {
646 return true; // Followed by cased letter.
647 } else {
648 return false; // Uncased and not case-ignorable.
649 }
650 }
651 return false; // Not followed by cased letter.
652 }
653
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
/**
 * Uppercases the UTF-8 text src[0..srcLength[ according to Greek rules and writes the result to sink.
 *
 * For each character that has Greek letter data, the letter and the Greek diacritics
 * that follow it are processed as one unit: the output is the uppercase base letter,
 * optionally followed by a dialytika (U+0308), a tonos (U+0301), and one capital iota
 * (U+0399) per ypogegrammeni.
 * All other well-formed code points go through ucase_toFullUpper() without context;
 * malformed UTF-8 is copied as unchanged text.
 *
 * "state" carries flags about the preceding text (AFTER_CASED and the two
 * AFTER_VOWEL_WITH_*_ACCENT flags) from one iteration to the next.
 *
 * @param options   option bits; U_OMIT_UNCHANGED_TEXT suppresses output of unchanged units
 * @param src       UTF-8 text
 * @param srcLength number of bytes in src
 * @param sink      receives the output bytes
 * @param edits     optional edit recorder, may be nullptr
 * @param errorCode in/out ICU error code
 */
void toUpper(uint32_t options,
             const uint8_t *src, int32_t srcLength,
             ByteSink &sink, Edits *edits,
             UErrorCode &errorCode) {
    uint32_t state = 0;
    for (int32_t i = 0; i < srcLength;) {
        int32_t nextIndex = i;
        UChar32 c;
        U8_NEXT(src, nextIndex, srcLength, c);
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 &&
                (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
                    0 &&
                (upper == 0x399 || upper == 0x3A5)) {
                data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
                                                                          : HAS_COMBINING_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Remember this before the combining diacritics below are merged into data.
            const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
            // Skip combining diacritics after this Greek letter.
            // nextNextIndex looks one code point ahead; nextIndex only advances
            // once that code point turns out to be a Greek diacritic.
            int32_t nextNextIndex = nextIndex;
            while (nextIndex < srcLength) {
                UChar32 c2;
                U8_NEXT(src, nextNextIndex, srcLength, c2);
                uint32_t diacriticData = getDiacriticData(c2);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    nextIndex = nextNextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
                                                  : AFTER_VOWEL_WITH_COMBINING_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = false;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                if (hasPrecomposedAccent) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = true;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }

            UBool change;
            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
                change = true;  // common, simple usage
            } else {
                // Find out first whether we are changing the text.
                // Compare the source bytes [i..nextIndex[ with the bytes that would be written.
                U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
                change = (i + 2) > nextIndex ||
                        src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
                        numYpogegrammeni > 0;
                int32_t i2 = i + 2;
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != (uint8_t)u8"\u0308"[0] ||
                            src[i2 + 1] != (uint8_t)u8"\u0308"[1];
                    i2 += 2;
                }
                if (addTonos) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != (uint8_t)u8"\u0301"[0] ||
                            src[i2 + 1] != (uint8_t)u8"\u0301"[1];
                    i2 += 2;
                }
                int32_t oldLength = nextIndex - i;
                int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
                change |= oldLength != newLength;
                if (change) {
                    if (edits != nullptr) {
                        edits->addReplace(oldLength, newLength);
                    }
                } else {
                    if (edits != nullptr) {
                        edits->addUnchanged(oldLength);
                    }
                    // Write unchanged text?
                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
                }
            }

            // From here on, "change" means "write this unit to the sink".
            if (change) {
                ByteSinkUtil::appendTwoBytes(upper, sink);
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
                }
                if (addTonos) {
                    sink.AppendU8(u8"\u0301", 2);
                }
                while (numYpogegrammeni > 0) {
                    sink.AppendU8(u8"\u0399", 2);
                    --numYpogegrammeni;
                }
            }
        } else if(c>=0) {
            // No Greek letter data: regular full uppercasing without context.
            const char16_t *s;
            c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
            if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
                return;
            }
        } else {
            // Malformed UTF-8.
            if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
                                               sink, options, edits, errorCode)) {
                return;
            }
        }
        i = nextIndex;
        state = nextState;
    }
}
807
808 } // namespace GreekUpper
809 U_NAMESPACE_END
810
811 static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)812 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
813 const uint8_t *src, int32_t srcLength,
814 icu::ByteSink &sink, icu::Edits *edits,
815 UErrorCode &errorCode) {
816 UCaseContext csc=UCASECONTEXT_INITIALIZER;
817 csc.p=(void *)src;
818 csc.limit=srcLength;
819 toLower(
820 caseLocale, options,
821 src, &csc, 0, srcLength,
822 sink, edits, errorCode);
823 }
824
825 static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)826 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
827 const uint8_t *src, int32_t srcLength,
828 icu::ByteSink &sink, icu::Edits *edits,
829 UErrorCode &errorCode) {
830 if (caseLocale == UCASE_LOC_GREEK) {
831 GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
832 } else {
833 UCaseContext csc=UCASECONTEXT_INITIALIZER;
834 csc.p=(void *)src;
835 csc.limit=srcLength;
836 toUpper(
837 caseLocale, options,
838 src, &csc, srcLength,
839 sink, edits, errorCode);
840 }
841 }
842
/*
 * UTF8CaseMapper-style callback for case folding.
 * The case locale argument is ignored: toLower() is called with -1, which
 * selects its case-folding mode, and with a null context, which that mode does not use.
 */
static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
                          const uint8_t *src, int32_t srcLength,
                          icu::ByteSink &sink, icu::Edits *edits,
                          UErrorCode &errorCode) {
    toLower(
        -1, options,
        src, nullptr, 0, srcLength,
        sink, edits, errorCode);
}
853
854 void
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)855 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
856 const char *src, int32_t srcLength,
857 UTF8CaseMapper *stringCaseMapper,
858 icu::ByteSink &sink, icu::Edits *edits,
859 UErrorCode &errorCode) {
860 /* check argument values */
861 if (U_FAILURE(errorCode)) {
862 return;
863 }
864 if ((src == nullptr && srcLength != 0) || srcLength < -1) {
865 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
866 return;
867 }
868
869 // Get the string length.
870 if (srcLength == -1) {
871 srcLength = (int32_t)uprv_strlen((const char *)src);
872 }
873
874 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
875 edits->reset();
876 }
877 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
878 (const uint8_t *)src, srcLength, sink, edits, errorCode);
879 sink.Flush();
880 if (U_SUCCESS(errorCode)) {
881 if (edits != nullptr) {
882 edits->copyErrorTo(errorCode);
883 }
884 }
885 }
886
887 int32_t
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)888 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
889 char *dest, int32_t destCapacity,
890 const char *src, int32_t srcLength,
891 UTF8CaseMapper *stringCaseMapper,
892 icu::Edits *edits,
893 UErrorCode &errorCode) {
894 /* check argument values */
895 if(U_FAILURE(errorCode)) {
896 return 0;
897 }
898 if( destCapacity<0 ||
899 (dest==nullptr && destCapacity>0) ||
900 (src==nullptr && srcLength!=0) || srcLength<-1
901 ) {
902 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
903 return 0;
904 }
905
906 /* get the string length */
907 if(srcLength==-1) {
908 srcLength=(int32_t)uprv_strlen((const char *)src);
909 }
910
911 /* check for overlapping source and destination */
912 if( dest!=nullptr &&
913 ((src>=dest && src<(dest+destCapacity)) ||
914 (dest>=src && dest<(src+srcLength)))
915 ) {
916 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
917 return 0;
918 }
919
920 CheckedArrayByteSink sink(dest, destCapacity);
921 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
922 edits->reset();
923 }
924 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
925 (const uint8_t *)src, srcLength, sink, edits, errorCode);
926 sink.Flush();
927 if (U_SUCCESS(errorCode)) {
928 if (sink.Overflowed()) {
929 errorCode = U_BUFFER_OVERFLOW_ERROR;
930 } else if (edits != nullptr) {
931 edits->copyErrorTo(errorCode);
932 }
933 }
934 return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
935 }
936
937 /* public API functions */
938
/*
 * Lowercases the UTF-8 string src into dest, using the case locale and options stored in csm.
 * No Edits object is used. Argument checking, overflow detection, and the return value
 * come from ucasemap_mapUTF8(); csm and pErrorCode are dereferenced without null checks.
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    return ucasemap_mapUTF8(
        csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity,
        src, srcLength,
        ucasemap_internalUTF8ToLower, nullptr, *pErrorCode);
}
950
/*
 * Uppercases the UTF-8 string src into dest, using the case locale and options stored in csm.
 * No Edits object is used. Argument checking, overflow detection, and the return value
 * come from ucasemap_mapUTF8(); csm and pErrorCode are dereferenced without null checks.
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    return ucasemap_mapUTF8(
        csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity,
        src, srcLength,
        ucasemap_internalUTF8ToUpper, nullptr, *pErrorCode);
}
962
/*
 * Case-folds the UTF-8 string src into dest, using the options stored in csm.
 * The locale stored in csm is not used: UCASE_LOC_ROOT is passed along instead.
 * No Edits object is used. Argument checking, overflow detection, and the return value
 * come from ucasemap_mapUTF8(); csm and pErrorCode are dereferenced without null checks.
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap *csm,
                      char *dest, int32_t destCapacity,
                      const char *src, int32_t srcLength,
                      UErrorCode *pErrorCode) {
    return ucasemap_mapUTF8(
        UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity,
        src, srcLength,
        ucasemap_internalUTF8Fold, nullptr, *pErrorCode);
}
974
975 U_NAMESPACE_BEGIN
976
utf8ToLower(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)977 void CaseMap::utf8ToLower(
978 const char *locale, uint32_t options,
979 StringPiece src, ByteSink &sink, Edits *edits,
980 UErrorCode &errorCode) {
981 ucasemap_mapUTF8(
982 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
983 src.data(), src.length(),
984 ucasemap_internalUTF8ToLower, sink, edits, errorCode);
985 }
986
utf8ToUpper(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)987 void CaseMap::utf8ToUpper(
988 const char *locale, uint32_t options,
989 StringPiece src, ByteSink &sink, Edits *edits,
990 UErrorCode &errorCode) {
991 ucasemap_mapUTF8(
992 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
993 src.data(), src.length(),
994 ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
995 }
996
/*
 * Case-folds the UTF-8 string src and writes the result to sink.
 * No locale is involved: UCASE_LOC_ROOT is passed along. The mapping itself,
 * including argument checks and Edits handling, is done by ucasemap_mapUTF8().
 */
void CaseMap::utf8Fold(
        uint32_t options,
        StringPiece src, ByteSink &sink, Edits *edits,
        UErrorCode &errorCode) {
    ucasemap_mapUTF8(
        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
        src.data(), src.length(),
        ucasemap_internalUTF8Fold, sink, edits, errorCode);
}
1006
utf8ToLower(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1007 int32_t CaseMap::utf8ToLower(
1008 const char *locale, uint32_t options,
1009 const char *src, int32_t srcLength,
1010 char *dest, int32_t destCapacity, Edits *edits,
1011 UErrorCode &errorCode) {
1012 return ucasemap_mapUTF8(
1013 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1014 dest, destCapacity,
1015 src, srcLength,
1016 ucasemap_internalUTF8ToLower, edits, errorCode);
1017 }
1018
utf8ToUpper(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1019 int32_t CaseMap::utf8ToUpper(
1020 const char *locale, uint32_t options,
1021 const char *src, int32_t srcLength,
1022 char *dest, int32_t destCapacity, Edits *edits,
1023 UErrorCode &errorCode) {
1024 return ucasemap_mapUTF8(
1025 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1026 dest, destCapacity,
1027 src, srcLength,
1028 ucasemap_internalUTF8ToUpper, edits, errorCode);
1029 }
1030
/*
 * Case-folds the UTF-8 string src into the buffer dest.
 * No locale is involved: UCASE_LOC_ROOT is passed along. Argument checks, Edits handling,
 * overflow detection, and the return value come from ucasemap_mapUTF8().
 */
int32_t CaseMap::utf8Fold(
        uint32_t options,
        const char *src, int32_t srcLength,
        char *dest, int32_t destCapacity, Edits *edits,
        UErrorCode &errorCode) {
    return ucasemap_mapUTF8(
        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity,
        src, srcLength,
        ucasemap_internalUTF8Fold, edits, errorCode);
}
1042
1043 U_NAMESPACE_END
1044