1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 1997-2016, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************/
7
8 #include "unicode/ustring.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/uniset.h"
12 #include "unicode/putil.h"
13 #include "unicode/uscript.h"
14 #include "unicode/uset.h"
15 #include "cstring.h"
16 #include "hash.h"
17 #include "patternprops.h"
18 #include "normalizer2impl.h"
19 #include "testutil.h"
20 #include "uparse.h"
21 #include "ucdtest.h"
22
/**
 * Property names that must not be reported as unknown.
 * The UnicodeTest constructor enters each of them into unknownPropertyNames,
 * so derivedPropsLineFn() finds them there and stays silent.
 * The pointers are const as well as the characters, like derivedPropsNames[].
 */
static const char *const ignorePropNames[]={
    "FC_NFKC",
    "NFD_QC",
    "NFC_QC",
    "NFKD_QC",
    "NFKC_QC",
    "Expands_On_NFD",
    "Expands_On_NFC",
    "Expands_On_NFKD",
    "Expands_On_NFKC",
    "InCB",
    "NFKC_CF",
    "NFKC_SCF"
};
37
UnicodeTest()38 UnicodeTest::UnicodeTest()
39 {
40 UErrorCode errorCode=U_ZERO_ERROR;
41 unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
42 if(U_FAILURE(errorCode)) {
43 delete unknownPropertyNames;
44 unknownPropertyNames=nullptr;
45 }
46 // Ignore some property names altogether.
47 for(int32_t i=0; i<UPRV_LENGTHOF(ignorePropNames); ++i) {
48 unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
49 }
50 }
51
~UnicodeTest()52 UnicodeTest::~UnicodeTest()
53 {
54 delete unknownPropertyNames;
55 }
56
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)57 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
58 {
59 if(exec) {
60 logln("TestSuite UnicodeTest: ");
61 }
62 TESTCASE_AUTO_BEGIN;
63 TESTCASE_AUTO(TestAdditionalProperties);
64 TESTCASE_AUTO(TestBinaryValues);
65 TESTCASE_AUTO(TestConsistency);
66 TESTCASE_AUTO(TestPatternProperties);
67 TESTCASE_AUTO(TestScriptMetadata);
68 TESTCASE_AUTO(TestBidiPairedBracketType);
69 TESTCASE_AUTO(TestEmojiProperties);
70 TESTCASE_AUTO(TestEmojiPropertiesOfStrings);
71 TESTCASE_AUTO(TestIndicPositionalCategory);
72 TESTCASE_AUTO(TestIndicSyllabicCategory);
73 TESTCASE_AUTO(TestVerticalOrientation);
74 TESTCASE_AUTO(TestDefaultScriptExtensions);
75 TESTCASE_AUTO(TestInvalidCodePointFolding);
76 #if !UCONFIG_NO_NORMALIZATION
77 TESTCASE_AUTO(TestBinaryCharacterProperties);
78 TESTCASE_AUTO(TestIntCharacterProperties);
79 #endif
80 TESTCASE_AUTO(TestPropertyNames);
81 TESTCASE_AUTO(TestIDSUnaryOperator);
82 TESTCASE_AUTO(TestIDCompatMath);
83 TESTCASE_AUTO_END;
84 }
85
86 //====================================================
87 // private data used by the tests
88 //====================================================
89
90 // test DerivedCoreProperties.txt -------------------------------------------
91
92 // copied from genprops.c
93 static int32_t
getTokenIndex(const char * const tokens[],int32_t countTokens,const char * s)94 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
95 const char *t, *z;
96 int32_t i, j;
97
98 s=u_skipWhitespace(s);
99 for(i=0; i<countTokens; ++i) {
100 t=tokens[i];
101 if(t!=nullptr) {
102 for(j=0;; ++j) {
103 if(t[j]!=0) {
104 if(s[j]!=t[j]) {
105 break;
106 }
107 } else {
108 z=u_skipWhitespace(s+j);
109 if(*z==';' || *z==0) {
110 return i;
111 } else {
112 break;
113 }
114 }
115 }
116 }
117 }
118 return -1;
119 }
120
/**
 * Names of the binary properties checked by TestAdditionalProperties().
 * derivedPropsLineFn() matches field 1 of each data line against this table;
 * entry k is tested with the UProperty in derivedPropsIndex[k].
 */
static const char *const derivedPropsNames[]={
    "Math",
    "Alphabetic",
    "Lowercase",
    "Uppercase",
    "ID_Start",
    "ID_Continue",
    "XID_Start",
    "XID_Continue",
    "Default_Ignorable_Code_Point",
    "Full_Composition_Exclusion",
    "Grapheme_Extend",
    "Grapheme_Link",    // Unicode 5 moves this property here from PropList.txt
    "Grapheme_Base",
    "Cased",
    "Case_Ignorable",
    "Changes_When_Lowercased",
    "Changes_When_Uppercased",
    "Changes_When_Titlecased",
    "Changes_When_Casefolded",
    "Changes_When_Casemapped",
    "Changes_When_NFKC_Casefolded"
};
145
146 static const UProperty
147 derivedPropsIndex[]={
148 UCHAR_MATH,
149 UCHAR_ALPHABETIC,
150 UCHAR_LOWERCASE,
151 UCHAR_UPPERCASE,
152 UCHAR_ID_START,
153 UCHAR_ID_CONTINUE,
154 UCHAR_XID_START,
155 UCHAR_XID_CONTINUE,
156 UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
157 UCHAR_FULL_COMPOSITION_EXCLUSION,
158 UCHAR_GRAPHEME_EXTEND,
159 UCHAR_GRAPHEME_LINK,
160 UCHAR_GRAPHEME_BASE,
161 UCHAR_CASED,
162 UCHAR_CASE_IGNORABLE,
163 UCHAR_CHANGES_WHEN_LOWERCASED,
164 UCHAR_CHANGES_WHEN_UPPERCASED,
165 UCHAR_CHANGES_WHEN_TITLECASED,
166 UCHAR_CHANGES_WHEN_CASEFOLDED,
167 UCHAR_CHANGES_WHEN_CASEMAPPED,
168 UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
169 };
170
171 static int32_t numErrors[UPRV_LENGTHOF(derivedPropsIndex)]={ 0 };
172
173 enum { MAX_ERRORS=50 };
174
175 U_CFUNC void U_CALLCONV
derivedPropsLineFn(void * context,char * fields[][2],int32_t,UErrorCode * pErrorCode)176 derivedPropsLineFn(void *context,
177 char *fields[][2], int32_t /* fieldCount */,
178 UErrorCode *pErrorCode)
179 {
180 UnicodeTest *me=static_cast<UnicodeTest*>(context);
181 uint32_t start, end;
182 int32_t i;
183
184 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
185 if(U_FAILURE(*pErrorCode)) {
186 me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
187 return;
188 }
189
190 /* parse derived binary property name, ignore unknown names */
191 i=getTokenIndex(derivedPropsNames, UPRV_LENGTHOF(derivedPropsNames), fields[1][0]);
192 if(i<0) {
193 UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
194 propName.trim();
195 if(me->unknownPropertyNames->find(propName)==nullptr) {
196 UErrorCode errorCode=U_ZERO_ERROR;
197 me->unknownPropertyNames->puti(propName, 1, errorCode);
198 me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
199 }
200 return;
201 }
202
203 me->derivedProps[i].add(start, end);
204 }
205
TestAdditionalProperties()206 void UnicodeTest::TestAdditionalProperties() {
207 #if !UCONFIG_NO_NORMALIZATION
208 // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
209 if(UPRV_LENGTHOF(derivedProps)<UPRV_LENGTHOF(derivedPropsNames)) {
210 errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
211 UPRV_LENGTHOF(derivedPropsNames));
212 return;
213 }
214 if(UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)) {
215 errln("error in ucdtest.cpp: UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)\n");
216 return;
217 }
218
219 char path[500];
220 if(getUnidataPath(path) == nullptr) {
221 errln("unable to find path to source/data/unidata/");
222 return;
223 }
224 char *basename=strchr(path, 0);
225 strcpy(basename, "DerivedCoreProperties.txt");
226
227 char *fields[2][2];
228 UErrorCode errorCode=U_ZERO_ERROR;
229 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
230 if(U_FAILURE(errorCode)) {
231 errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
232 return;
233 }
234
235 strcpy(basename, "DerivedNormalizationProps.txt");
236 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
237 if(U_FAILURE(errorCode)) {
238 errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
239 return;
240 }
241
242 // now we have all derived core properties in the UnicodeSets
243 // run them all through the API
244 int32_t rangeCount, range;
245 uint32_t i;
246 UChar32 start, end;
247
248 // test all true properties
249 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
250 rangeCount=derivedProps[i].getRangeCount();
251 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
252 start=derivedProps[i].getRangeStart(range);
253 end=derivedProps[i].getRangeEnd(range);
254 for(; start<=end; ++start) {
255 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
256 dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==false is wrong", start, derivedPropsNames[i]);
257 if(++numErrors[i]>=MAX_ERRORS) {
258 dataerrln("Too many errors, moving to the next test");
259 break;
260 }
261 }
262 }
263 }
264 }
265
266 // invert all properties
267 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
268 derivedProps[i].complement();
269 }
270
271 // test all false properties
272 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
273 rangeCount=derivedProps[i].getRangeCount();
274 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
275 start=derivedProps[i].getRangeStart(range);
276 end=derivedProps[i].getRangeEnd(range);
277 for(; start<=end; ++start) {
278 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
279 errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==true is wrong\n", start, derivedPropsNames[i]);
280 if(++numErrors[i]>=MAX_ERRORS) {
281 errln("Too many errors, moving to the next test");
282 break;
283 }
284 }
285 }
286 }
287 }
288 #endif /* !UCONFIG_NO_NORMALIZATION */
289 }
290
TestBinaryValues()291 void UnicodeTest::TestBinaryValues() {
292 /*
293 * Unicode 5.1 explicitly defines binary property value aliases.
294 * Verify that they are all recognized.
295 */
296 UErrorCode errorCode=U_ZERO_ERROR;
297 UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
298 if(U_FAILURE(errorCode)) {
299 dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
300 return;
301 }
302
303 static const char *const falseValues[]={ "N", "No", "F", "False" };
304 static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
305 int32_t i;
306 for(i=0; i<UPRV_LENGTHOF(falseValues); ++i) {
307 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
308 pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
309 errorCode=U_ZERO_ERROR;
310 UnicodeSet set(pattern, errorCode);
311 if(U_FAILURE(errorCode)) {
312 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
313 continue;
314 }
315 set.complement();
316 if(set!=alpha) {
317 errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
318 }
319 }
320 for(i=0; i<UPRV_LENGTHOF(trueValues); ++i) {
321 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
322 pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
323 errorCode=U_ZERO_ERROR;
324 UnicodeSet set(pattern, errorCode);
325 if(U_FAILURE(errorCode)) {
326 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
327 continue;
328 }
329 if(set!=alpha) {
330 errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
331 }
332 }
333 }
334
TestConsistency()335 void UnicodeTest::TestConsistency() {
336 #if !UCONFIG_NO_NORMALIZATION
337 /*
338 * Test for an example that getCanonStartSet() delivers
339 * all characters that compose from the input one,
340 * even in multiple steps.
341 * For example, the set for "I" (0049) should contain both
342 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
343 * In general, the set for the middle such character should be a subset
344 * of the set for the first.
345 */
346 IcuTestErrorCode errorCode(*this, "TestConsistency");
347 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
348 const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
349 if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) {
350 dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
351 errorCode.errorName());
352 errorCode.reset();
353 return;
354 }
355
356 UnicodeSet set1, set2;
357 if (nfcImpl->getCanonStartSet(0x49, set1)) {
358 /* enumerate all characters that are plausible to be latin letters */
359 for(char16_t start=0xa0; start<0x2000; ++start) {
360 UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
361 if(decomp.length()>1 && decomp[0]==0x49) {
362 set2.add(start);
363 }
364 }
365
366 if (set1!=set2) {
367 errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
368 }
369 // This was available in cucdtst.c but the test had to move to intltest
370 // because the new internal normalization functions are in C++.
371 //compareUSets(set1, set2,
372 // "[canon start set of 0049]", "[all c with canon decomp with 0049]",
373 // true);
374 } else {
375 errln("NFC.getCanonStartSet() returned false");
376 }
377 #endif
378 }
379
380 /**
381 * Test various implementations of Pattern_Syntax & Pattern_White_Space.
382 */
TestPatternProperties()383 void UnicodeTest::TestPatternProperties() {
384 IcuTestErrorCode errorCode(*this, "TestPatternProperties()");
385 UnicodeSet syn_pp;
386 UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode);
387 UnicodeSet syn_list(
388 "[!-/\\:-@\\[-\\^`\\{-~"
389 "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7"
390 "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775"
391 "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode);
392 UnicodeSet ws_pp;
393 UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode);
394 UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode);
395 UnicodeSet syn_ws_pp;
396 UnicodeSet syn_ws_prop(syn_prop);
397 syn_ws_prop.addAll(ws_prop);
398 for(UChar32 c=0; c<=0xffff; ++c) {
399 if(PatternProps::isSyntax(c)) {
400 syn_pp.add(c);
401 }
402 if(PatternProps::isWhiteSpace(c)) {
403 ws_pp.add(c);
404 }
405 if(PatternProps::isSyntaxOrWhiteSpace(c)) {
406 syn_ws_pp.add(c);
407 }
408 }
409 compareUSets(syn_pp, syn_prop,
410 "PatternProps.isSyntax()", "[:Pattern_Syntax:]", true);
411 compareUSets(syn_pp, syn_list,
412 "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", true);
413 compareUSets(ws_pp, ws_prop,
414 "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", true);
415 compareUSets(ws_pp, ws_list,
416 "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", true);
417 compareUSets(syn_ws_pp, syn_ws_prop,
418 "PatternProps.isSyntaxOrWhiteSpace()",
419 "[[:Pattern_Syntax:][:Pattern_White_Space:]]", true);
420 }
421
422 // So far only minimal port of Java & cucdtst.c compareUSets().
423 UBool
compareUSets(const UnicodeSet & a,const UnicodeSet & b,const char * a_name,const char * b_name,UBool diffIsError)424 UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b,
425 const char *a_name, const char *b_name,
426 UBool diffIsError) {
427 UBool same= a==b;
428 if(!same && diffIsError) {
429 errln("Sets are different: %s vs. %s\n", a_name, b_name);
430 }
431 return same;
432 }
433
434 namespace {
435
436 /**
437 * Maps a special script code to the most common script of its encoded characters.
438 */
getCharScript(UScriptCode script)439 UScriptCode getCharScript(UScriptCode script) {
440 switch(script) {
441 case USCRIPT_HAN_WITH_BOPOMOFO:
442 case USCRIPT_SIMPLIFIED_HAN:
443 case USCRIPT_TRADITIONAL_HAN:
444 return USCRIPT_HAN;
445 case USCRIPT_JAPANESE:
446 return USCRIPT_HIRAGANA;
447 case USCRIPT_JAMO:
448 case USCRIPT_KOREAN:
449 return USCRIPT_HANGUL;
450 case USCRIPT_SYMBOLS_EMOJI:
451 return USCRIPT_SYMBOLS;
452 default:
453 return script;
454 }
455 }
456
457 } // namespace
458
TestScriptMetadata()459 void UnicodeTest::TestScriptMetadata() {
460 IcuTestErrorCode errorCode(*this, "TestScriptMetadata()");
461 UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode);
462 // So far, sample characters are uppercase.
463 // Georgian is special.
464 UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode);
465 for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) {
466 UScriptCode sc = (UScriptCode)sci;
467 // Run the test with -v to see which script has failures:
468 // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 6 FAIL
469 logln(uscript_getShortName(sc));
470 UScriptUsage usage = uscript_getUsage(sc);
471 UnicodeString sample = uscript_getSampleUnicodeString(sc);
472 UnicodeSet scriptSet;
473 scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode);
474 if(usage == USCRIPT_USAGE_NOT_ENCODED) {
475 assertTrue("not encoded, no sample", sample.isEmpty());
476 assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc));
477 assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc));
478 assertFalse("not encoded, not cased", uscript_isCased(sc));
479 assertTrue("not encoded, no characters", scriptSet.isEmpty());
480 } else {
481 assertFalse("encoded, has a sample character", sample.isEmpty());
482 UChar32 firstChar = sample.char32At(0);
483 UScriptCode charScript = getCharScript(sc);
484 assertEquals("script(sample(script))",
485 (int32_t)charScript, (int32_t)uscript_getScript(firstChar, errorCode));
486 assertEquals("RTL vs. set", (UBool)rtl.contains(firstChar), (UBool)uscript_isRightToLeft(sc));
487 assertEquals("cased vs. set", (UBool)cased.contains(firstChar), (UBool)uscript_isCased(sc));
488 assertEquals("encoded, has characters", (UBool)(sc == charScript), (UBool)(!scriptSet.isEmpty()));
489 if(uscript_isRightToLeft(sc)) {
490 rtl.removeAll(scriptSet);
491 }
492 if(uscript_isCased(sc)) {
493 cased.removeAll(scriptSet);
494 }
495 }
496 }
497 UnicodeString pattern;
498 assertEquals("no remaining RTL characters",
499 UnicodeString("[]"), rtl.toPattern(pattern));
500 assertEquals("no remaining cased characters",
501 UnicodeString("[]"), cased.toPattern(pattern));
502
503 assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN));
504 assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI));
505 assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN));
506 }
507
TestBidiPairedBracketType()508 void UnicodeTest::TestBidiPairedBracketType() {
509 // BidiBrackets-6.3.0.txt says:
510 //
511 // The set of code points listed in this file was originally derived
512 // using the character properties General_Category (gc), Bidi_Class (bc),
513 // Bidi_Mirrored (Bidi_M), and Bidi_Mirroring_Glyph (bmg), as follows:
514 // two characters, A and B, form a pair if A has gc=Ps and B has gc=Pe,
515 // both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket
516 // maps A to B and vice versa, and their Bidi_Paired_Bracket_Type
517 // property values are Open and Close, respectively.
518 IcuTestErrorCode errorCode(*this, "TestBidiPairedBracketType()");
519 UnicodeSet bpt("[:^bpt=n:]", errorCode);
520 assertTrue("bpt!=None is not empty", !bpt.isEmpty());
521 // The following should always be true.
522 UnicodeSet mirrored("[:Bidi_M:]", errorCode);
523 UnicodeSet other_neutral("[:bc=ON:]", errorCode);
524 assertTrue("bpt!=None is a subset of Bidi_M", mirrored.containsAll(bpt));
525 assertTrue("bpt!=None is a subset of bc=ON", other_neutral.containsAll(bpt));
526 // The following are true at least initially in Unicode 6.3.
527 UnicodeSet bpt_open("[:bpt=o:]", errorCode);
528 UnicodeSet bpt_close("[:bpt=c:]", errorCode);
529 UnicodeSet ps("[:Ps:]", errorCode);
530 UnicodeSet pe("[:Pe:]", errorCode);
531 assertTrue("bpt=Open is a subset of Ps", ps.containsAll(bpt_open));
532 assertTrue("bpt=Close is a subset of Pe", pe.containsAll(bpt_close));
533 }
534
TestEmojiProperties()535 void UnicodeTest::TestEmojiProperties() {
536 assertFalse("space is not Emoji", u_hasBinaryProperty(0x20, UCHAR_EMOJI));
537 assertTrue("shooting star is Emoji", u_hasBinaryProperty(0x1F320, UCHAR_EMOJI));
538 IcuTestErrorCode errorCode(*this, "TestEmojiProperties()");
539 UnicodeSet emoji("[:Emoji:]", errorCode);
540 assertTrue("lots of Emoji", emoji.size() > 700);
541
542 assertTrue("shooting star is Emoji_Presentation",
543 u_hasBinaryProperty(0x1F320, UCHAR_EMOJI_PRESENTATION));
544 assertTrue("Fitzpatrick 6 is Emoji_Modifier",
545 u_hasBinaryProperty(0x1F3FF, UCHAR_EMOJI_MODIFIER));
546 assertTrue("happy person is Emoji_Modifier_Base",
547 u_hasBinaryProperty(0x1F64B, UCHAR_EMOJI_MODIFIER_BASE));
548 assertTrue("asterisk is Emoji_Component",
549 u_hasBinaryProperty(0x2A, UCHAR_EMOJI_COMPONENT));
550 assertTrue("copyright is Extended_Pictographic",
551 u_hasBinaryProperty(0xA9, UCHAR_EXTENDED_PICTOGRAPHIC));
552 }
553
554 namespace {
555
hbp(const char16_t * s,int32_t length,UProperty which)556 UBool hbp(const char16_t *s, int32_t length, UProperty which) {
557 return u_stringHasBinaryProperty(s, length, which);
558 }
559
hbp(const char16_t * s,UProperty which)560 UBool hbp(const char16_t *s, UProperty which) {
561 return u_stringHasBinaryProperty(s, -1, which);
562 }
563
564 } // namespace
565
TestEmojiPropertiesOfStrings()566 void UnicodeTest::TestEmojiPropertiesOfStrings() {
567 // Property of code points, for coverage
568 assertFalse("null is not Ideographic", hbp(nullptr, 1, UCHAR_IDEOGRAPHIC));
569 assertFalse("null/0 is not Ideographic", hbp(nullptr, -1, UCHAR_IDEOGRAPHIC));
570 assertFalse("empty string is not Ideographic", hbp(u"", 0, UCHAR_IDEOGRAPHIC));
571 assertFalse("empty string/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
572 assertFalse("L is not Ideographic", hbp(u"L", 1, UCHAR_IDEOGRAPHIC));
573 assertFalse("L/0 is not Ideographic", hbp(u"L", -1, UCHAR_IDEOGRAPHIC));
574 assertTrue("U+4E02 is Ideographic", hbp(u"丂", 1, UCHAR_IDEOGRAPHIC));
575 assertTrue("U+4E02/0 is Ideographic", hbp(u"丂", -1, UCHAR_IDEOGRAPHIC));
576 assertFalse("2*U+4E02 is not Ideographic", hbp(u"丂丂", 2, UCHAR_IDEOGRAPHIC));
577 assertFalse("2*U+4E02/0 is not Ideographic", hbp(u"丂丂", -1, UCHAR_IDEOGRAPHIC));
578 assertFalse("bicycle is not Ideographic", hbp(u"", 2, UCHAR_IDEOGRAPHIC));
579 assertFalse("bicycle/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
580 assertTrue("U+23456 is Ideographic", hbp(u"\U00023456", 2, UCHAR_IDEOGRAPHIC));
581 assertTrue("U+23456/0 is Ideographic", hbp(u"\U00023456", -1, UCHAR_IDEOGRAPHIC));
582
583 // Property of (code points and) strings
584 assertFalse("null is not Basic_Emoji", hbp(nullptr, 1, UCHAR_BASIC_EMOJI));
585 assertFalse("null/0 is not Basic_Emoji", hbp(nullptr, -1, UCHAR_BASIC_EMOJI));
586 assertFalse("empty string is not Basic_Emoji", hbp(u"", 0, UCHAR_BASIC_EMOJI));
587 assertFalse("empty string/0 is not Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
588 assertFalse("L is not Basic_Emoji", hbp(u"L", 1, UCHAR_BASIC_EMOJI));
589 assertFalse("L/0 is not Basic_Emoji", hbp(u"L", -1, UCHAR_BASIC_EMOJI));
590 assertFalse("U+4E02 is not Basic_Emoji", hbp(u"丂", 1, UCHAR_BASIC_EMOJI));
591 assertFalse("U+4E02/0 is not Basic_Emoji", hbp(u"丂", -1, UCHAR_BASIC_EMOJI));
592 assertTrue("bicycle is Basic_Emoji", hbp(u"", 2, UCHAR_BASIC_EMOJI));
593 assertTrue("bicycle/0 is Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
594 assertFalse("2*bicycle is Basic_Emoji", hbp(u"", 4, UCHAR_BASIC_EMOJI));
595 assertFalse("2*bicycle/0 is Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
596 assertFalse("U+23456 is not Basic_Emoji", hbp(u"\U00023456", 2, UCHAR_BASIC_EMOJI));
597 assertFalse("U+23456/0 is not Basic_Emoji", hbp(u"\U00023456", -1, UCHAR_BASIC_EMOJI));
598
599 assertFalse("stopwatch is not Basic_Emoji", hbp(u"⏱", 1, UCHAR_BASIC_EMOJI));
600 assertFalse("stopwatch/0 is not Basic_Emoji", hbp(u"⏱", -1, UCHAR_BASIC_EMOJI));
601 assertTrue("stopwatch+emoji is Basic_Emoji", hbp(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI));
602 assertTrue("stopwatch+emoji/0 is Basic_Emoji", hbp(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI));
603
604 assertFalse("chipmunk is not Basic_Emoji", hbp(u"", UCHAR_BASIC_EMOJI));
605 assertTrue("chipmunk+emoji is Basic_Emoji", hbp(u"\uFE0F", UCHAR_BASIC_EMOJI));
606 assertFalse("chipmunk+2*emoji is not Basic_Emoji", hbp(u"\uFE0F\uFE0F", UCHAR_BASIC_EMOJI));
607
608 // Properties of strings (only)
609 assertFalse("4+emoji is not Emoji_Keycap_Sequence",
610 hbp(u"4\uFE0F", UCHAR_EMOJI_KEYCAP_SEQUENCE));
611 assertTrue("4+emoji+keycap is Emoji_Keycap_Sequence",
612 hbp(u"4\uFE0F\u20E3", UCHAR_EMOJI_KEYCAP_SEQUENCE));
613
614 assertFalse("[B] is not RGI_Emoji_Flag_Sequence",
615 hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
616 assertTrue("[BE] is RGI_Emoji_Flag_Sequence",
617 hbp(u"", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
618
619 assertFalse("[flag] is not RGI_Emoji_Tag_Sequence",
620 hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
621 assertTrue("[Scotland] is RGI_Emoji_Tag_Sequence",
622 hbp(u"", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
623
624 assertFalse("bicyclist is not RGI_Emoji_Modifier_Sequence",
625 hbp(u"", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
626 assertTrue("bicyclist+medium is RGI_Emoji_Modifier_Sequence",
627 hbp(u"\U0001F3FD", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
628
629 assertFalse("woman+dark+ZWJ is not RGI_Emoji_ZWJ_Sequence",
630 hbp(u"\U0001F3FF\u200D", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
631 assertTrue("woman pilot: dark skin tone is RGI_Emoji_ZWJ_Sequence",
632 hbp(u"\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
633
634 // RGI_Emoji = all of the above
635 assertFalse("stopwatch is not RGI_Emoji", hbp(u"⏱", UCHAR_RGI_EMOJI));
636 assertTrue("stopwatch+emoji is RGI_Emoji", hbp(u"⏱\uFE0F", UCHAR_RGI_EMOJI));
637
638 assertFalse("chipmunk is not RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
639 assertTrue("chipmunk+emoji is RGI_Emoji", hbp(u"\uFE0F", UCHAR_RGI_EMOJI));
640
641 assertFalse("4+emoji is not RGI_Emoji", hbp(u"4\uFE0F", UCHAR_RGI_EMOJI));
642 assertTrue("4+emoji+keycap is RGI_Emoji", hbp(u"4\uFE0F\u20E3", UCHAR_RGI_EMOJI));
643
644 assertFalse("[B] is not RGI_Emoji", hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI));
645 assertTrue("[BE] is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
646
647 assertTrue("[flag] is RGI_Emoji", hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI));
648 assertTrue("[Scotland] is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
649
650 assertTrue("bicyclist is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
651 assertTrue("bicyclist+medium is RGI_Emoji", hbp(u"\U0001F3FD", UCHAR_RGI_EMOJI));
652
653 assertFalse("woman+dark+ZWJ is not RGI_Emoji", hbp(u"\U0001F3FF\u200D", UCHAR_RGI_EMOJI));
654 assertTrue("woman pilot: dark skin tone is RGI_Emoji",
655 hbp(u"\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI));
656
657 // UnicodeSet with properties of strings
658 IcuTestErrorCode errorCode(*this, "TestEmojiPropertiesOfStrings()");
659 UnicodeSet basic("[:Basic_Emoji:]", errorCode);
660 UnicodeSet keycaps("[:Emoji_Keycap_Sequence:]", errorCode);
661 UnicodeSet modified("[:RGI_Emoji_Modifier_Sequence:]", errorCode);
662 UnicodeSet flags("[:RGI_Emoji_Flag_Sequence:]", errorCode);
663 UnicodeSet tags("[:RGI_Emoji_Tag_Sequence:]", errorCode);
664 UnicodeSet combos("[:RGI_Emoji_ZWJ_Sequence:]", errorCode);
665 UnicodeSet rgi("[:RGI_Emoji:]", errorCode);
666 if (errorCode.errDataIfFailureAndReset("UnicodeSets")) {
667 return;
668 }
669
670 // union of all sets except for "rgi" -- should be the same as "rgi"
671 UnicodeSet all(basic);
672 all.addAll(keycaps).addAll(modified).addAll(flags).addAll(tags).addAll(combos);
673
674 UnicodeSet basicOnlyCp(basic);
675 basicOnlyCp.removeAllStrings();
676
677 UnicodeSet rgiOnlyCp(rgi);
678 rgiOnlyCp.removeAllStrings();
679
680 assertTrue("lots of Basic_Emoji", basic.size() > 1000);
681 assertEquals("12 Emoji_Keycap_Sequence", 12, keycaps.size());
682 assertTrue("lots of RGI_Emoji_Modifier_Sequence", modified.size() > 600);
683 assertTrue("lots of RGI_Emoji_Flag_Sequence", flags.size() > 250);
684 assertTrue("some RGI_Emoji_Tag_Sequence", tags.size() >= 3);
685 assertTrue("lots of RGI_Emoji_ZWJ_Sequence", combos.size() > 1300);
686 assertTrue("lots of RGI_Emoji", rgi.size() > 3000);
687
688 assertTrue("lots of Basic_Emoji code points", basicOnlyCp.size() > 1000);
689 assertTrue("Basic_Emoji.hasStrings()", basic.hasStrings());
690 assertEquals("no Emoji_Keycap_Sequence code points", 0, keycaps.getRangeCount());
691 assertEquals("lots of RGI_Emoji_Modifier_Sequence", 0, modified.getRangeCount());
692 assertEquals("lots of RGI_Emoji_Flag_Sequence", 0, flags.getRangeCount());
693 assertEquals("some RGI_Emoji_Tag_Sequence", 0, tags.getRangeCount());
694 assertEquals("lots of RGI_Emoji_ZWJ_Sequence", 0, combos.getRangeCount());
695
696 assertTrue("lots of RGI_Emoji code points", rgiOnlyCp.size() > 1000);
697 assertTrue("RGI_Emoji.hasStrings()", rgi.hasStrings());
698 assertEquals("RGI_Emoji/only-cp.size() == Basic_Emoji/only-cp.size()",
699 rgiOnlyCp.size(), basicOnlyCp.size());
700 assertTrue("RGI_Emoji/only-cp == Basic_Emoji/only-cp", rgiOnlyCp == basicOnlyCp);
701 assertEquals("RGI_Emoji.size() == union.size()", rgi.size(), all.size());
702 assertTrue("RGI_Emoji == union", rgi == all);
703
704 assertTrue("Basic_Emoji.contains(stopwatch+emoji)", basic.contains(u"⏱\uFE0F"));
705 assertTrue("Basic_Emoji.contains(chipmunk+emoji)", basic.contains(u"\uFE0F"));
706 assertTrue("Emoji_Keycap_Sequence.contains(4+emoji+keycap)",
707 keycaps.contains(u"4\uFE0F\u20E3"));
708 assertTrue("RGI_Emoji_Flag_Sequence.contains([BE])", flags.contains(u""));
709 assertTrue("RGI_Emoji_Tag_Sequence.contains([Scotland])", tags.contains(u""));
710 assertTrue("RGI_Emoji_Modifier_Sequence.contains(bicyclist+medium)",
711 modified.contains(u"\U0001F3FD"));
712 assertTrue("RGI_Emoji_ZWJ_Sequence.contains(woman pilot: dark skin tone)",
713 combos.contains(u"\U0001F3FF\u200D✈\uFE0F"));
714 assertTrue("RGI_Emoji.contains(stopwatch+emoji)", rgi.contains(u"⏱\uFE0F"));
715 assertTrue("RGI_Emoji.contains(chipmunk+emoji)", rgi.contains(u"\uFE0F"));
716 assertTrue("RGI_Emoji.contains(4+emoji+keycap)", rgi.contains(u"4\uFE0F\u20E3"));
717 assertTrue("RGI_Emoji.contains([BE] is RGI_Emoji)", rgi.contains(u""));
718 assertTrue("RGI_Emoji.contains([flag])", rgi.contains(u"\U0001F3F4"));
719 assertTrue("RGI_Emoji.contains([Scotland])", rgi.contains(u""));
720 assertTrue("RGI_Emoji.contains(bicyclist)", rgi.contains(u""));
721 assertTrue("RGI_Emoji.contains(bicyclist+medium)", rgi.contains(u"\U0001F3FD"));
722 assertTrue("RGI_Emoji.contains(woman pilot: dark skin tone)", rgi.contains(u"\U0001F3FF\u200D✈\uFE0F"));
723 }
724
TestIndicPositionalCategory()725 void UnicodeTest::TestIndicPositionalCategory() {
726 IcuTestErrorCode errorCode(*this, "TestIndicPositionalCategory()");
727 UnicodeSet na(u"[:InPC=NA:]", errorCode);
728 assertTrue("mostly NA", 1000000 <= na.size() && na.size() <= UCHAR_MAX_VALUE - 500);
729 UnicodeSet vol(u"[:InPC=Visual_Order_Left:]", errorCode);
730 assertTrue("some Visual_Order_Left", 19 <= vol.size() && vol.size() <= 100);
731 assertEquals("U+08FF: NA", U_INPC_NA,
732 u_getIntPropertyValue(0x08FF, UCHAR_INDIC_POSITIONAL_CATEGORY));
733 assertEquals("U+0900: Top", U_INPC_TOP,
734 u_getIntPropertyValue(0x0900, UCHAR_INDIC_POSITIONAL_CATEGORY));
735 assertEquals("U+10A06: Overstruck", U_INPC_OVERSTRUCK,
736 u_getIntPropertyValue(0x10A06, UCHAR_INDIC_POSITIONAL_CATEGORY));
737 }
738
TestIndicSyllabicCategory()739 void UnicodeTest::TestIndicSyllabicCategory() {
740 IcuTestErrorCode errorCode(*this, "TestIndicSyllabicCategory()");
741 UnicodeSet other(u"[:InSC=Other:]", errorCode);
742 assertTrue("mostly Other", 1000000 <= other.size() && other.size() <= UCHAR_MAX_VALUE - 500);
743 UnicodeSet ava(u"[:InSC=Avagraha:]", errorCode);
744 assertTrue("some Avagraha", 16 <= ava.size() && ava.size() <= 100);
745 assertEquals("U+08FF: Other", U_INSC_OTHER,
746 u_getIntPropertyValue(0x08FF, UCHAR_INDIC_SYLLABIC_CATEGORY));
747 assertEquals("U+0900: Bindu", U_INSC_BINDU,
748 u_getIntPropertyValue(0x0900, UCHAR_INDIC_SYLLABIC_CATEGORY));
749 assertEquals("U+11065: Brahmi_Joining_Number", U_INSC_BRAHMI_JOINING_NUMBER,
750 u_getIntPropertyValue(0x11065, UCHAR_INDIC_SYLLABIC_CATEGORY));
751 }
752
TestVerticalOrientation()753 void UnicodeTest::TestVerticalOrientation() {
754 IcuTestErrorCode errorCode(*this, "TestVerticalOrientation()");
755 UnicodeSet r(u"[:vo=R:]", errorCode);
756 assertTrue("mostly R", 0xc0000 <= r.size() && r.size() <= 0xd0000);
757 UnicodeSet u(u"[:vo=U:]", errorCode);
758 assertTrue("much U", 0x40000 <= u.size() && u.size() <= 0x50000);
759 UnicodeSet tu(u"[:vo=Tu:]", errorCode);
760 assertTrue("some Tu", 147 <= tu.size() && tu.size() <= 300);
761 assertEquals("U+0E01: Rotated", U_VO_ROTATED,
762 u_getIntPropertyValue(0x0E01, UCHAR_VERTICAL_ORIENTATION));
763 assertEquals("U+3008: Transformed_Rotated", U_VO_TRANSFORMED_ROTATED,
764 u_getIntPropertyValue(0x3008, UCHAR_VERTICAL_ORIENTATION));
765 assertEquals("U+33333: Upright", U_VO_UPRIGHT,
766 u_getIntPropertyValue(0x33333, UCHAR_VERTICAL_ORIENTATION));
767 }
768
TestDefaultScriptExtensions()769 void UnicodeTest::TestDefaultScriptExtensions() {
770 // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii
771 // but some of its characters revert to scx=<script> which is usually Common.
772 IcuTestErrorCode errorCode(*this, "TestDefaultScriptExtensions()");
773 UScriptCode scx[20];
774 scx[0] = USCRIPT_INVALID_CODE;
775 assertEquals("U+3000 num scx", 1, // IDEOGRAPHIC SPACE
776 uscript_getScriptExtensions(0x3000, scx, UPRV_LENGTHOF(scx), errorCode));
777 assertEquals("U+3000 num scx[0]", USCRIPT_COMMON, scx[0]);
778 scx[0] = USCRIPT_INVALID_CODE;
779 assertEquals("U+3012 num scx", 1, // POSTAL MARK
780 uscript_getScriptExtensions(0x3012, scx, UPRV_LENGTHOF(scx), errorCode));
781 assertEquals("U+3012 num scx[0]", USCRIPT_COMMON, scx[0]);
782 }
783
TestInvalidCodePointFolding()784 void UnicodeTest::TestInvalidCodePointFolding() {
785 // Test behavior when an invalid code point is passed to u_foldCase
786 static const UChar32 invalidCodePoints[] = {
787 0xD800, // lead surrogate
788 0xDFFF, // trail surrogate
789 0xFDD0, // noncharacter
790 0xFFFF, // noncharacter
791 0x110000, // out of range
792 -1 // negative
793 };
794 for (int32_t i=0; i<UPRV_LENGTHOF(invalidCodePoints); ++i) {
795 UChar32 cp = invalidCodePoints[i];
796 assertEquals("Invalid code points should be echoed back",
797 cp, u_foldCase(cp, U_FOLD_CASE_DEFAULT));
798 assertEquals("Invalid code points should be echoed back",
799 cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I));
800 }
801 }
802
TestBinaryCharacterProperties()803 void UnicodeTest::TestBinaryCharacterProperties() {
804 #if !UCONFIG_NO_NORMALIZATION
805 IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()");
806 // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
807 for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) {
808 const USet *uset = u_getBinaryPropertySet((UProperty)prop, errorCode);
809 if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", (int)prop)) {
810 continue;
811 }
812 const UnicodeSet &set = *UnicodeSet::fromUSet(uset);
813 int32_t count = set.getRangeCount();
814 if (count == 0) {
815 assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")",
816 u_hasBinaryProperty(0x20, (UProperty)prop));
817 assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")",
818 u_hasBinaryProperty(0x61, (UProperty)prop));
819 assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")",
820 u_hasBinaryProperty(0x4e00, (UProperty)prop));
821 } else {
822 UChar32 c = set.getRangeStart(0);
823 if (c > 0) {
824 assertFalse(
825 UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) +
826 u", " + prop + u")",
827 u_hasBinaryProperty(c - 1, (UProperty)prop));
828 }
829 assertTrue(
830 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
831 u", " + prop + u")",
832 u_hasBinaryProperty(c, (UProperty)prop));
833 c = set.getRangeEnd(count - 1);
834 assertTrue(
835 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
836 u", " + prop + u")",
837 u_hasBinaryProperty(c, (UProperty)prop));
838 if (c < 0x10ffff) {
839 assertFalse(
840 UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c + 1) +
841 u", " + prop + u")",
842 u_hasBinaryProperty(c + 1, (UProperty)prop));
843 }
844 }
845 }
846 #endif
847 }
848
TestIntCharacterProperties()849 void UnicodeTest::TestIntCharacterProperties() {
850 #if !UCONFIG_NO_NORMALIZATION
851 IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()");
852 // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
853 for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) {
854 const UCPMap *map = u_getIntPropertyMap((UProperty)prop, errorCode);
855 if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", (int)prop)) {
856 continue;
857 }
858 uint32_t value;
859 UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
860 assertTrue("int property first range", end >= 0);
861 UChar32 c = end / 2;
862 assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c),
863 u_getIntPropertyValue(c, (UProperty)prop), value);
864 end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
865 assertTrue("int property later range", end >= 0);
866 assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end),
867 u_getIntPropertyValue(end, (UProperty)prop), value);
868 // ucpmap_get() API coverage
869 // TODO: move to cucdtst.c
870 assertEquals(
871 "int property upcmap_get(U+0061)",
872 u_getIntPropertyValue(0x61, (UProperty)prop), ucpmap_get(map, 0x61));
873 }
874 #endif
875 }
876
877 namespace {
878
getPropName(UProperty property,int32_t nameChoice)879 const char *getPropName(UProperty property, int32_t nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
880 const char *name = u_getPropertyName(property, (UPropertyNameChoice)nameChoice);
881 return name != nullptr ? name : "null";
882 }
883
getValueName(UProperty property,int32_t value,int32_t nameChoice)884 const char *getValueName(UProperty property, int32_t value, int32_t nameChoice)
885 UPRV_NO_SANITIZE_UNDEFINED {
886 const char *name = u_getPropertyValueName(property, value, (UPropertyNameChoice)nameChoice);
887 return name != nullptr ? name : "null";
888 }
889
890 } // namespace
891
TestPropertyNames()892 void UnicodeTest::TestPropertyNames() {
893 IcuTestErrorCode errorCode(*this, "TestPropertyNames()");
894 // Test names of certain properties & values.
895 // The UPropertyNameChoice is really an integer with only a couple of named constants.
896 UProperty prop = UCHAR_WHITE_SPACE;
897 constexpr int32_t SHORT = U_SHORT_PROPERTY_NAME;
898 constexpr int32_t LONG = U_LONG_PROPERTY_NAME;
899 assertEquals("White_Space: index -1", "null", getPropName(prop, -1));
900 assertEquals("White_Space: short", "WSpace", getPropName(prop, SHORT));
901 assertEquals("White_Space: long", "White_Space", getPropName(prop, LONG));
902 assertEquals("White_Space: index 2", "space", getPropName(prop, 2));
903 assertEquals("White_Space: index 3", "null", getPropName(prop, 3));
904
905 prop = UCHAR_SIMPLE_CASE_FOLDING;
906 assertEquals("Simple_Case_Folding: index -1", "null", getPropName(prop, -1));
907 assertEquals("Simple_Case_Folding: short", "scf", getPropName(prop, SHORT));
908 assertEquals("Simple_Case_Folding: long", "Simple_Case_Folding", getPropName(prop, LONG));
909 assertEquals("Simple_Case_Folding: index 2", "sfc", getPropName(prop, 2));
910 assertEquals("Simple_Case_Folding: index 3", "null", getPropName(prop, 3));
911
912 prop = UCHAR_CASED;
913 assertEquals("Cased=Y: index -1", "null", getValueName(prop, 1, -1));
914 assertEquals("Cased=Y: short", "Y", getValueName(prop, 1, SHORT));
915 assertEquals("Cased=Y: long", "Yes", getValueName(prop, 1, LONG));
916 assertEquals("Cased=Y: index 2", "T", getValueName(prop, 1, 2));
917 assertEquals("Cased=Y: index 3", "True", getValueName(prop, 1, 3));
918 assertEquals("Cased=Y: index 4", "null", getValueName(prop, 1, 4));
919
920 prop = UCHAR_DECOMPOSITION_TYPE;
921 int32_t value = U_DT_NOBREAK;
922 assertEquals("dt=Nb: index -1", "null", getValueName(prop, value, -1));
923 assertEquals("dt=Nb: short", "Nb", getValueName(prop, value, SHORT));
924 assertEquals("dt=Nb: long", "Nobreak", getValueName(prop, value, LONG));
925 assertEquals("dt=Nb: index 2", "nb", getValueName(prop, value, 2));
926 assertEquals("dt=Nb: index 3", "null", getValueName(prop, value, 3));
927
928 // Canonical_Combining_Class:
929 // The UCD inserts the numeric values in the second filed of its PropertyValueAliases.txt lines.
930 // In ICU, we don't treat these as names,
931 // they are just the numeric values returned by u_getCombiningClass().
932 // We return the real short and long names for the usual choice constants.
933 prop = UCHAR_CANONICAL_COMBINING_CLASS;
934 assertEquals("ccc=230: index -1", "null", getValueName(prop, 230, -1));
935 assertEquals("ccc=230: short", "A", getValueName(prop, 230, SHORT));
936 assertEquals("ccc=230: long", "Above", getValueName(prop, 230, LONG));
937 assertEquals("ccc=230: index 2", "null", getValueName(prop, 230, 2));
938
939 prop = UCHAR_GENERAL_CATEGORY;
940 value = U_DECIMAL_DIGIT_NUMBER;
941 assertEquals("gc=Nd: index -1", "null", getValueName(prop, value, -1));
942 assertEquals("gc=Nd: short", "Nd", getValueName(prop, value, SHORT));
943 assertEquals("gc=Nd: long", "Decimal_Number", getValueName(prop, value, LONG));
944 assertEquals("gc=Nd: index 2", "digit", getValueName(prop, value, 2));
945 assertEquals("gc=Nd: index 3", "null", getValueName(prop, value, 3));
946
947 prop = UCHAR_GENERAL_CATEGORY_MASK;
948 value = U_GC_P_MASK;
949 assertEquals("gc=P mask: index -1", "null", getValueName(prop, value, -1));
950 assertEquals("gc=P mask: short", "P", getValueName(prop, value, SHORT));
951 assertEquals("gc=P mask: long", "Punctuation", getValueName(prop, value, LONG));
952 assertEquals("gc=P mask: index 2", "punct", getValueName(prop, value, 2));
953 assertEquals("gc=P mask: index 3", "null", getValueName(prop, value, 3));
954 }
955
TestIDSUnaryOperator()956 void UnicodeTest::TestIDSUnaryOperator() {
957 IcuTestErrorCode errorCode(*this, "TestIDSUnaryOperator()");
958 // New in Unicode 15.1 for just two characters.
959 assertFalse("U+2FFC IDSU", u_hasBinaryProperty(0x2ffc, UCHAR_IDS_UNARY_OPERATOR));
960 assertFalse("U+2FFD IDSU", u_hasBinaryProperty(0x2ffd, UCHAR_IDS_UNARY_OPERATOR));
961 assertTrue("U+2FFE IDSU", u_hasBinaryProperty(0x2ffe, UCHAR_IDS_UNARY_OPERATOR));
962 assertTrue("U+2FFF IDSU", u_hasBinaryProperty(0x2fff, UCHAR_IDS_UNARY_OPERATOR));
963 assertFalse("U+3000 IDSU", u_hasBinaryProperty(0x3000, UCHAR_IDS_UNARY_OPERATOR));
964 assertFalse("U+3001 IDSU", u_hasBinaryProperty(0x3001, UCHAR_IDS_UNARY_OPERATOR));
965
966 // Property name works and gets the correct set.
967 UnicodeSet idsu(u"[:IDS_Unary_Operator:]", errorCode);
968 assertEquals("IDSU set number of characters", 2, idsu.size());
969 assertFalse("idsu.contains(U+2FFD)", idsu.contains(0x2ffd));
970 assertTrue("idsu.contains(U+2FFE)", idsu.contains(0x2ffe));
971 assertTrue("idsu.contains(U+2FFF)", idsu.contains(0x2fff));
972 assertFalse("idsu.contains(U+3000)", idsu.contains(0x3000));
973 }
974
975 namespace {
976
isMathStart(UChar32 c)977 bool isMathStart(UChar32 c) {
978 return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_START);
979 }
980
isMathContinue(UChar32 c)981 bool isMathContinue(UChar32 c) {
982 return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_CONTINUE);
983 }
984
985 } // namespace
986
TestIDCompatMath()987 void UnicodeTest::TestIDCompatMath() {
988 IcuTestErrorCode errorCode(*this, "TestIDCompatMath()");
989 assertFalse("U+00B1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb1));
990 assertTrue("U+00B2 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb2));
991 assertTrue("U+00B3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb3));
992 assertFalse("U+00B4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb4));
993 assertFalse("U+207F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x207f));
994 assertTrue("U+2080 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2080));
995 assertTrue("U+208E UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208e));
996 assertFalse("U+208F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208f));
997 assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2201));
998 assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2202));
999 assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D6C1));
1000 assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C3));
1001 assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C4));
1002
1003 assertFalse("U+00B2 UCHAR_ID_COMPAT_MATH_START", isMathStart(0xb2));
1004 assertFalse("U+2080 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2080));
1005 assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2201));
1006 assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2202));
1007 assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D6C1));
1008 assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C3));
1009 assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C4));
1010
1011 // Property names work and get the correct sets.
1012 UnicodeSet idcmStart(u"[:ID_Compat_Math_Start:]", errorCode);
1013 UnicodeSet idcmContinue(u"[:ID_Compat_Math_Continue:]", errorCode);
1014 assertEquals("ID_Compat_Math_Start set number of characters", 13, idcmStart.size());
1015 assertEquals("ID_Compat_Math_Continue set number of characters", 43, idcmContinue.size());
1016 assertTrue("ID_Compat_Math_Start is a subset of ID_Compat_Math_Continue",
1017 idcmContinue.containsAll(idcmStart));
1018 assertFalse("idcmContinue.contains(U+207F)", idcmContinue.contains(0x207f));
1019 assertTrue("idcmContinue.contains(U+2080)", idcmContinue.contains(0x2080));
1020 assertTrue("idcmContinue.contains(U+208E)", idcmContinue.contains(0x208e));
1021 assertFalse("idcmContinue.contains(U+208F)", idcmContinue.contains(0x208f));
1022 assertFalse("idcmStart.contains(U+2201)", idcmStart.contains(0x2201));
1023 assertTrue("idcmStart.contains(U+2202)", idcmStart.contains(0x2202));
1024 assertTrue("idcmStart.contains(U+1D7C3)", idcmStart.contains(0x1D7C3));
1025 assertFalse("idcmStart.contains(U+1D7C4)", idcmStart.contains(0x1D7C4));
1026 }
1027