xref: /aosp_15_r20/external/cronet/third_party/icu/source/test/intltest/ucdtest.cpp (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * Copyright (c) 1997-2016, International Business Machines Corporation and
5  * others. All Rights Reserved.
6  ********************************************************************/
7 
8 #include "unicode/ustring.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/uniset.h"
12 #include "unicode/putil.h"
13 #include "unicode/uscript.h"
14 #include "unicode/uset.h"
15 #include "cstring.h"
16 #include "hash.h"
17 #include "patternprops.h"
18 #include "normalizer2impl.h"
19 #include "testutil.h"
20 #include "uparse.h"
21 #include "ucdtest.h"
22 
// Property names that may occur in the parsed UCD files but are deliberately
// not verified by this test. The UnicodeTest constructor registers each of
// them in unknownPropertyNames up front, so that the line parser does not
// report them as unknown.
static const char *ignorePropNames[] = {
    "FC_NFKC",
    "NFD_QC", "NFC_QC", "NFKD_QC", "NFKC_QC",
    "Expands_On_NFD", "Expands_On_NFC", "Expands_On_NFKD", "Expands_On_NFKC",
    "InCB",
    "NFKC_CF", "NFKC_SCF"
};
37 
UnicodeTest()38 UnicodeTest::UnicodeTest()
39 {
40     UErrorCode errorCode=U_ZERO_ERROR;
41     unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
42     if(U_FAILURE(errorCode)) {
43         delete unknownPropertyNames;
44         unknownPropertyNames=nullptr;
45     }
46     // Ignore some property names altogether.
47     for(int32_t i=0; i<UPRV_LENGTHOF(ignorePropNames); ++i) {
48         unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
49     }
50 }
51 
~UnicodeTest()52 UnicodeTest::~UnicodeTest()
53 {
54     delete unknownPropertyNames;
55 }
56 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)57 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
58 {
59     if(exec) {
60         logln("TestSuite UnicodeTest: ");
61     }
62     TESTCASE_AUTO_BEGIN;
63     TESTCASE_AUTO(TestAdditionalProperties);
64     TESTCASE_AUTO(TestBinaryValues);
65     TESTCASE_AUTO(TestConsistency);
66     TESTCASE_AUTO(TestPatternProperties);
67     TESTCASE_AUTO(TestScriptMetadata);
68     TESTCASE_AUTO(TestBidiPairedBracketType);
69     TESTCASE_AUTO(TestEmojiProperties);
70     TESTCASE_AUTO(TestEmojiPropertiesOfStrings);
71     TESTCASE_AUTO(TestIndicPositionalCategory);
72     TESTCASE_AUTO(TestIndicSyllabicCategory);
73     TESTCASE_AUTO(TestVerticalOrientation);
74     TESTCASE_AUTO(TestDefaultScriptExtensions);
75     TESTCASE_AUTO(TestInvalidCodePointFolding);
76 #if !UCONFIG_NO_NORMALIZATION
77     TESTCASE_AUTO(TestBinaryCharacterProperties);
78     TESTCASE_AUTO(TestIntCharacterProperties);
79 #endif
80     TESTCASE_AUTO(TestPropertyNames);
81     TESTCASE_AUTO(TestIDSUnaryOperator);
82     TESTCASE_AUTO(TestIDCompatMath);
83     TESTCASE_AUTO_END;
84 }
85 
86 //====================================================
87 // private data used by the tests
88 //====================================================
89 
90 // test DerivedCoreProperties.txt -------------------------------------------
91 
92 // copied from genprops.c
93 static int32_t
getTokenIndex(const char * const tokens[],int32_t countTokens,const char * s)94 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
95     const char *t, *z;
96     int32_t i, j;
97 
98     s=u_skipWhitespace(s);
99     for(i=0; i<countTokens; ++i) {
100         t=tokens[i];
101         if(t!=nullptr) {
102             for(j=0;; ++j) {
103                 if(t[j]!=0) {
104                     if(s[j]!=t[j]) {
105                         break;
106                     }
107                 } else {
108                     z=u_skipWhitespace(s+j);
109                     if(*z==';' || *z==0) {
110                         return i;
111                     } else {
112                         break;
113                     }
114                 }
115             }
116         }
117     }
118     return -1;
119 }
120 
// Names of the derived binary properties that TestAdditionalProperties()
// verifies, as matched against field 1 of the parsed data lines.
// This table is parallel to derivedPropsIndex[]: entry i here names the
// UProperty constant at index i there, so the two must be kept in the same order.
static const char *const derivedPropsNames[] = {
    "Math",
    "Alphabetic",
    "Lowercase",
    "Uppercase",
    "ID_Start",
    "ID_Continue",
    "XID_Start",
    "XID_Continue",
    "Default_Ignorable_Code_Point",
    "Full_Composition_Exclusion",
    "Grapheme_Extend",
    /* Unicode 5 moves this property here from PropList.txt */
    "Grapheme_Link",
    "Grapheme_Base",
    "Cased",
    "Case_Ignorable",
    "Changes_When_Lowercased",
    "Changes_When_Uppercased",
    "Changes_When_Titlecased",
    "Changes_When_Casefolded",
    "Changes_When_Casemapped",
    "Changes_When_NFKC_Casefolded"
};
145 
146 static const UProperty
147 derivedPropsIndex[]={
148     UCHAR_MATH,
149     UCHAR_ALPHABETIC,
150     UCHAR_LOWERCASE,
151     UCHAR_UPPERCASE,
152     UCHAR_ID_START,
153     UCHAR_ID_CONTINUE,
154     UCHAR_XID_START,
155     UCHAR_XID_CONTINUE,
156     UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
157     UCHAR_FULL_COMPOSITION_EXCLUSION,
158     UCHAR_GRAPHEME_EXTEND,
159     UCHAR_GRAPHEME_LINK,
160     UCHAR_GRAPHEME_BASE,
161     UCHAR_CASED,
162     UCHAR_CASE_IGNORABLE,
163     UCHAR_CHANGES_WHEN_LOWERCASED,
164     UCHAR_CHANGES_WHEN_UPPERCASED,
165     UCHAR_CHANGES_WHEN_TITLECASED,
166     UCHAR_CHANGES_WHEN_CASEFOLDED,
167     UCHAR_CHANGES_WHEN_CASEMAPPED,
168     UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
169 };
170 
171 static int32_t numErrors[UPRV_LENGTHOF(derivedPropsIndex)]={ 0 };
172 
173 enum { MAX_ERRORS=50 };
174 
175 U_CFUNC void U_CALLCONV
derivedPropsLineFn(void * context,char * fields[][2],int32_t,UErrorCode * pErrorCode)176 derivedPropsLineFn(void *context,
177                    char *fields[][2], int32_t /* fieldCount */,
178                    UErrorCode *pErrorCode)
179 {
180     UnicodeTest *me=static_cast<UnicodeTest*>(context);
181     uint32_t start, end;
182     int32_t i;
183 
184     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
185     if(U_FAILURE(*pErrorCode)) {
186         me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
187         return;
188     }
189 
190     /* parse derived binary property name, ignore unknown names */
191     i=getTokenIndex(derivedPropsNames, UPRV_LENGTHOF(derivedPropsNames), fields[1][0]);
192     if(i<0) {
193         UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
194         propName.trim();
195         if(me->unknownPropertyNames->find(propName)==nullptr) {
196             UErrorCode errorCode=U_ZERO_ERROR;
197             me->unknownPropertyNames->puti(propName, 1, errorCode);
198             me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
199         }
200         return;
201     }
202 
203     me->derivedProps[i].add(start, end);
204 }
205 
TestAdditionalProperties()206 void UnicodeTest::TestAdditionalProperties() {
207 #if !UCONFIG_NO_NORMALIZATION
208     // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
209     if(UPRV_LENGTHOF(derivedProps)<UPRV_LENGTHOF(derivedPropsNames)) {
210         errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
211               UPRV_LENGTHOF(derivedPropsNames));
212         return;
213     }
214     if(UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)) {
215         errln("error in ucdtest.cpp: UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)\n");
216         return;
217     }
218 
219     char path[500];
220     if(getUnidataPath(path) == nullptr) {
221         errln("unable to find path to source/data/unidata/");
222         return;
223     }
224     char *basename=strchr(path, 0);
225     strcpy(basename, "DerivedCoreProperties.txt");
226 
227     char *fields[2][2];
228     UErrorCode errorCode=U_ZERO_ERROR;
229     u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
230     if(U_FAILURE(errorCode)) {
231         errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
232         return;
233     }
234 
235     strcpy(basename, "DerivedNormalizationProps.txt");
236     u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
237     if(U_FAILURE(errorCode)) {
238         errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
239         return;
240     }
241 
242     // now we have all derived core properties in the UnicodeSets
243     // run them all through the API
244     int32_t rangeCount, range;
245     uint32_t i;
246     UChar32 start, end;
247 
248     // test all true properties
249     for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
250         rangeCount=derivedProps[i].getRangeCount();
251         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
252             start=derivedProps[i].getRangeStart(range);
253             end=derivedProps[i].getRangeEnd(range);
254             for(; start<=end; ++start) {
255                 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
256                     dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==false is wrong", start, derivedPropsNames[i]);
257                     if(++numErrors[i]>=MAX_ERRORS) {
258                       dataerrln("Too many errors, moving to the next test");
259                       break;
260                     }
261                 }
262             }
263         }
264     }
265 
266     // invert all properties
267     for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
268         derivedProps[i].complement();
269     }
270 
271     // test all false properties
272     for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
273         rangeCount=derivedProps[i].getRangeCount();
274         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
275             start=derivedProps[i].getRangeStart(range);
276             end=derivedProps[i].getRangeEnd(range);
277             for(; start<=end; ++start) {
278                 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
279                     errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==true is wrong\n", start, derivedPropsNames[i]);
280                     if(++numErrors[i]>=MAX_ERRORS) {
281                       errln("Too many errors, moving to the next test");
282                       break;
283                     }
284                 }
285             }
286         }
287     }
288 #endif /* !UCONFIG_NO_NORMALIZATION */
289 }
290 
TestBinaryValues()291 void UnicodeTest::TestBinaryValues() {
292     /*
293      * Unicode 5.1 explicitly defines binary property value aliases.
294      * Verify that they are all recognized.
295      */
296     UErrorCode errorCode=U_ZERO_ERROR;
297     UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
298     if(U_FAILURE(errorCode)) {
299         dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
300         return;
301     }
302 
303     static const char *const falseValues[]={ "N", "No", "F", "False" };
304     static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
305     int32_t i;
306     for(i=0; i<UPRV_LENGTHOF(falseValues); ++i) {
307         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
308         pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
309         errorCode=U_ZERO_ERROR;
310         UnicodeSet set(pattern, errorCode);
311         if(U_FAILURE(errorCode)) {
312             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
313             continue;
314         }
315         set.complement();
316         if(set!=alpha) {
317             errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
318         }
319     }
320     for(i=0; i<UPRV_LENGTHOF(trueValues); ++i) {
321         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
322         pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
323         errorCode=U_ZERO_ERROR;
324         UnicodeSet set(pattern, errorCode);
325         if(U_FAILURE(errorCode)) {
326             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
327             continue;
328         }
329         if(set!=alpha) {
330             errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
331         }
332     }
333 }
334 
TestConsistency()335 void UnicodeTest::TestConsistency() {
336 #if !UCONFIG_NO_NORMALIZATION
337     /*
338      * Test for an example that getCanonStartSet() delivers
339      * all characters that compose from the input one,
340      * even in multiple steps.
341      * For example, the set for "I" (0049) should contain both
342      * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
343      * In general, the set for the middle such character should be a subset
344      * of the set for the first.
345      */
346     IcuTestErrorCode errorCode(*this, "TestConsistency");
347     const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
348     const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
349     if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) {
350         dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
351                   errorCode.errorName());
352         errorCode.reset();
353         return;
354     }
355 
356     UnicodeSet set1, set2;
357     if (nfcImpl->getCanonStartSet(0x49, set1)) {
358         /* enumerate all characters that are plausible to be latin letters */
359         for(char16_t start=0xa0; start<0x2000; ++start) {
360             UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
361             if(decomp.length()>1 && decomp[0]==0x49) {
362                 set2.add(start);
363             }
364         }
365 
366         if (set1!=set2) {
367             errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
368         }
369         // This was available in cucdtst.c but the test had to move to intltest
370         // because the new internal normalization functions are in C++.
371         //compareUSets(set1, set2,
372         //             "[canon start set of 0049]", "[all c with canon decomp with 0049]",
373         //             true);
374     } else {
375         errln("NFC.getCanonStartSet() returned false");
376     }
377 #endif
378 }
379 
380 /**
381  * Test various implementations of Pattern_Syntax & Pattern_White_Space.
382  */
TestPatternProperties()383 void UnicodeTest::TestPatternProperties() {
384     IcuTestErrorCode errorCode(*this, "TestPatternProperties()");
385     UnicodeSet syn_pp;
386     UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode);
387     UnicodeSet syn_list(
388         "[!-/\\:-@\\[-\\^`\\{-~"
389         "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7"
390         "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775"
391         "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode);
392     UnicodeSet ws_pp;
393     UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode);
394     UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode);
395     UnicodeSet syn_ws_pp;
396     UnicodeSet syn_ws_prop(syn_prop);
397     syn_ws_prop.addAll(ws_prop);
398     for(UChar32 c=0; c<=0xffff; ++c) {
399         if(PatternProps::isSyntax(c)) {
400             syn_pp.add(c);
401         }
402         if(PatternProps::isWhiteSpace(c)) {
403             ws_pp.add(c);
404         }
405         if(PatternProps::isSyntaxOrWhiteSpace(c)) {
406             syn_ws_pp.add(c);
407         }
408     }
409     compareUSets(syn_pp, syn_prop,
410                  "PatternProps.isSyntax()", "[:Pattern_Syntax:]", true);
411     compareUSets(syn_pp, syn_list,
412                  "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", true);
413     compareUSets(ws_pp, ws_prop,
414                  "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", true);
415     compareUSets(ws_pp, ws_list,
416                  "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", true);
417     compareUSets(syn_ws_pp, syn_ws_prop,
418                  "PatternProps.isSyntaxOrWhiteSpace()",
419                  "[[:Pattern_Syntax:][:Pattern_White_Space:]]", true);
420 }
421 
422 // So far only minimal port of Java & cucdtst.c compareUSets().
423 UBool
compareUSets(const UnicodeSet & a,const UnicodeSet & b,const char * a_name,const char * b_name,UBool diffIsError)424 UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b,
425                           const char *a_name, const char *b_name,
426                           UBool diffIsError) {
427     UBool same= a==b;
428     if(!same && diffIsError) {
429         errln("Sets are different: %s vs. %s\n", a_name, b_name);
430     }
431     return same;
432 }
433 
434 namespace {
435 
436 /**
437  * Maps a special script code to the most common script of its encoded characters.
438  */
getCharScript(UScriptCode script)439 UScriptCode getCharScript(UScriptCode script) {
440     switch(script) {
441     case USCRIPT_HAN_WITH_BOPOMOFO:
442     case USCRIPT_SIMPLIFIED_HAN:
443     case USCRIPT_TRADITIONAL_HAN:
444         return USCRIPT_HAN;
445     case USCRIPT_JAPANESE:
446         return USCRIPT_HIRAGANA;
447     case USCRIPT_JAMO:
448     case USCRIPT_KOREAN:
449         return USCRIPT_HANGUL;
450     case USCRIPT_SYMBOLS_EMOJI:
451         return USCRIPT_SYMBOLS;
452     default:
453         return script;
454     }
455 }
456 
457 }  // namespace
458 
TestScriptMetadata()459 void UnicodeTest::TestScriptMetadata() {
460     IcuTestErrorCode errorCode(*this, "TestScriptMetadata()");
461     UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode);
462     // So far, sample characters are uppercase.
463     // Georgian is special.
464     UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode);
465     for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) {
466         UScriptCode sc = (UScriptCode)sci;
467         // Run the test with -v to see which script has failures:
468         // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 6 FAIL
469         logln(uscript_getShortName(sc));
470         UScriptUsage usage = uscript_getUsage(sc);
471         UnicodeString sample = uscript_getSampleUnicodeString(sc);
472         UnicodeSet scriptSet;
473         scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode);
474         if(usage == USCRIPT_USAGE_NOT_ENCODED) {
475             assertTrue("not encoded, no sample", sample.isEmpty());
476             assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc));
477             assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc));
478             assertFalse("not encoded, not cased", uscript_isCased(sc));
479             assertTrue("not encoded, no characters", scriptSet.isEmpty());
480         } else {
481             assertFalse("encoded, has a sample character", sample.isEmpty());
482             UChar32 firstChar = sample.char32At(0);
483             UScriptCode charScript = getCharScript(sc);
484             assertEquals("script(sample(script))",
485                          (int32_t)charScript, (int32_t)uscript_getScript(firstChar, errorCode));
486             assertEquals("RTL vs. set", (UBool)rtl.contains(firstChar), (UBool)uscript_isRightToLeft(sc));
487             assertEquals("cased vs. set", (UBool)cased.contains(firstChar), (UBool)uscript_isCased(sc));
488             assertEquals("encoded, has characters", (UBool)(sc == charScript), (UBool)(!scriptSet.isEmpty()));
489             if(uscript_isRightToLeft(sc)) {
490                 rtl.removeAll(scriptSet);
491             }
492             if(uscript_isCased(sc)) {
493                 cased.removeAll(scriptSet);
494             }
495         }
496     }
497     UnicodeString pattern;
498     assertEquals("no remaining RTL characters",
499                  UnicodeString("[]"), rtl.toPattern(pattern));
500     assertEquals("no remaining cased characters",
501                  UnicodeString("[]"), cased.toPattern(pattern));
502 
503     assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN));
504     assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI));
505     assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN));
506 }
507 
TestBidiPairedBracketType()508 void UnicodeTest::TestBidiPairedBracketType() {
509     // BidiBrackets-6.3.0.txt says:
510     //
511     // The set of code points listed in this file was originally derived
512     // using the character properties General_Category (gc), Bidi_Class (bc),
513     // Bidi_Mirrored (Bidi_M), and Bidi_Mirroring_Glyph (bmg), as follows:
514     // two characters, A and B, form a pair if A has gc=Ps and B has gc=Pe,
515     // both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket
516     // maps A to B and vice versa, and their Bidi_Paired_Bracket_Type
517     // property values are Open and Close, respectively.
518     IcuTestErrorCode errorCode(*this, "TestBidiPairedBracketType()");
519     UnicodeSet bpt("[:^bpt=n:]", errorCode);
520     assertTrue("bpt!=None is not empty", !bpt.isEmpty());
521     // The following should always be true.
522     UnicodeSet mirrored("[:Bidi_M:]", errorCode);
523     UnicodeSet other_neutral("[:bc=ON:]", errorCode);
524     assertTrue("bpt!=None is a subset of Bidi_M", mirrored.containsAll(bpt));
525     assertTrue("bpt!=None is a subset of bc=ON", other_neutral.containsAll(bpt));
526     // The following are true at least initially in Unicode 6.3.
527     UnicodeSet bpt_open("[:bpt=o:]", errorCode);
528     UnicodeSet bpt_close("[:bpt=c:]", errorCode);
529     UnicodeSet ps("[:Ps:]", errorCode);
530     UnicodeSet pe("[:Pe:]", errorCode);
531     assertTrue("bpt=Open is a subset of Ps", ps.containsAll(bpt_open));
532     assertTrue("bpt=Close is a subset of Pe", pe.containsAll(bpt_close));
533 }
534 
TestEmojiProperties()535 void UnicodeTest::TestEmojiProperties() {
536     assertFalse("space is not Emoji", u_hasBinaryProperty(0x20, UCHAR_EMOJI));
537     assertTrue("shooting star is Emoji", u_hasBinaryProperty(0x1F320, UCHAR_EMOJI));
538     IcuTestErrorCode errorCode(*this, "TestEmojiProperties()");
539     UnicodeSet emoji("[:Emoji:]", errorCode);
540     assertTrue("lots of Emoji", emoji.size() > 700);
541 
542     assertTrue("shooting star is Emoji_Presentation",
543                u_hasBinaryProperty(0x1F320, UCHAR_EMOJI_PRESENTATION));
544     assertTrue("Fitzpatrick 6 is Emoji_Modifier",
545                u_hasBinaryProperty(0x1F3FF, UCHAR_EMOJI_MODIFIER));
546     assertTrue("happy person is Emoji_Modifier_Base",
547                u_hasBinaryProperty(0x1F64B, UCHAR_EMOJI_MODIFIER_BASE));
548     assertTrue("asterisk is Emoji_Component",
549                u_hasBinaryProperty(0x2A, UCHAR_EMOJI_COMPONENT));
550     assertTrue("copyright is Extended_Pictographic",
551                u_hasBinaryProperty(0xA9, UCHAR_EXTENDED_PICTOGRAPHIC));
552 }
553 
554 namespace {
555 
hbp(const char16_t * s,int32_t length,UProperty which)556 UBool hbp(const char16_t *s, int32_t length, UProperty which) {
557     return u_stringHasBinaryProperty(s, length, which);
558 }
559 
hbp(const char16_t * s,UProperty which)560 UBool hbp(const char16_t *s, UProperty which) {
561     return u_stringHasBinaryProperty(s, -1, which);
562 }
563 
564 }  // namespace
565 
TestEmojiPropertiesOfStrings()566 void UnicodeTest::TestEmojiPropertiesOfStrings() {
567     // Property of code points, for coverage
568     assertFalse("null is not Ideographic", hbp(nullptr, 1, UCHAR_IDEOGRAPHIC));
569     assertFalse("null/0 is not Ideographic", hbp(nullptr, -1, UCHAR_IDEOGRAPHIC));
570     assertFalse("empty string is not Ideographic", hbp(u"", 0, UCHAR_IDEOGRAPHIC));
571     assertFalse("empty string/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
572     assertFalse("L is not Ideographic", hbp(u"L", 1, UCHAR_IDEOGRAPHIC));
573     assertFalse("L/0 is not Ideographic", hbp(u"L", -1, UCHAR_IDEOGRAPHIC));
574     assertTrue("U+4E02 is Ideographic", hbp(u"丂", 1, UCHAR_IDEOGRAPHIC));
575     assertTrue("U+4E02/0 is Ideographic", hbp(u"丂", -1, UCHAR_IDEOGRAPHIC));
576     assertFalse("2*U+4E02 is not Ideographic", hbp(u"丂丂", 2, UCHAR_IDEOGRAPHIC));
577     assertFalse("2*U+4E02/0 is not Ideographic", hbp(u"丂丂", -1, UCHAR_IDEOGRAPHIC));
578     assertFalse("bicycle is not Ideographic", hbp(u"��", 2, UCHAR_IDEOGRAPHIC));
579     assertFalse("bicycle/0 is not Ideographic", hbp(u"��", -1, UCHAR_IDEOGRAPHIC));
580     assertTrue("U+23456 is Ideographic", hbp(u"\U00023456", 2, UCHAR_IDEOGRAPHIC));
581     assertTrue("U+23456/0 is Ideographic", hbp(u"\U00023456", -1, UCHAR_IDEOGRAPHIC));
582 
583     // Property of (code points and) strings
584     assertFalse("null is not Basic_Emoji", hbp(nullptr, 1, UCHAR_BASIC_EMOJI));
585     assertFalse("null/0 is not Basic_Emoji", hbp(nullptr, -1, UCHAR_BASIC_EMOJI));
586     assertFalse("empty string is not Basic_Emoji", hbp(u"", 0, UCHAR_BASIC_EMOJI));
587     assertFalse("empty string/0 is not Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
588     assertFalse("L is not Basic_Emoji", hbp(u"L", 1, UCHAR_BASIC_EMOJI));
589     assertFalse("L/0 is not Basic_Emoji", hbp(u"L", -1, UCHAR_BASIC_EMOJI));
590     assertFalse("U+4E02 is not Basic_Emoji", hbp(u"丂", 1, UCHAR_BASIC_EMOJI));
591     assertFalse("U+4E02/0 is not Basic_Emoji", hbp(u"丂", -1, UCHAR_BASIC_EMOJI));
592     assertTrue("bicycle is Basic_Emoji", hbp(u"��", 2, UCHAR_BASIC_EMOJI));
593     assertTrue("bicycle/0 is Basic_Emoji", hbp(u"��", -1, UCHAR_BASIC_EMOJI));
594     assertFalse("2*bicycle is Basic_Emoji", hbp(u"����", 4, UCHAR_BASIC_EMOJI));
595     assertFalse("2*bicycle/0 is Basic_Emoji", hbp(u"����", -1, UCHAR_BASIC_EMOJI));
596     assertFalse("U+23456 is not Basic_Emoji", hbp(u"\U00023456", 2, UCHAR_BASIC_EMOJI));
597     assertFalse("U+23456/0 is not Basic_Emoji", hbp(u"\U00023456", -1, UCHAR_BASIC_EMOJI));
598 
599     assertFalse("stopwatch is not Basic_Emoji", hbp(u"⏱", 1, UCHAR_BASIC_EMOJI));
600     assertFalse("stopwatch/0 is not Basic_Emoji", hbp(u"⏱", -1, UCHAR_BASIC_EMOJI));
601     assertTrue("stopwatch+emoji is Basic_Emoji", hbp(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI));
602     assertTrue("stopwatch+emoji/0 is Basic_Emoji", hbp(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI));
603 
604     assertFalse("chipmunk is not Basic_Emoji", hbp(u"��", UCHAR_BASIC_EMOJI));
605     assertTrue("chipmunk+emoji is Basic_Emoji", hbp(u"��\uFE0F", UCHAR_BASIC_EMOJI));
606     assertFalse("chipmunk+2*emoji is not Basic_Emoji", hbp(u"��\uFE0F\uFE0F", UCHAR_BASIC_EMOJI));
607 
608     // Properties of strings (only)
609     assertFalse("4+emoji is not Emoji_Keycap_Sequence",
610                 hbp(u"4\uFE0F", UCHAR_EMOJI_KEYCAP_SEQUENCE));
611     assertTrue("4+emoji+keycap is Emoji_Keycap_Sequence",
612                hbp(u"4\uFE0F\u20E3", UCHAR_EMOJI_KEYCAP_SEQUENCE));
613 
614     assertFalse("[B] is not RGI_Emoji_Flag_Sequence",
615                 hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
616     assertTrue("[BE] is RGI_Emoji_Flag_Sequence",
617                hbp(u"����", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
618 
619     assertFalse("[flag] is not RGI_Emoji_Tag_Sequence",
620                 hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
621     assertTrue("[Scotland] is RGI_Emoji_Tag_Sequence",
622                hbp(u"��������������", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
623 
624     assertFalse("bicyclist is not RGI_Emoji_Modifier_Sequence",
625                 hbp(u"��", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
626     assertTrue("bicyclist+medium is RGI_Emoji_Modifier_Sequence",
627                hbp(u"��\U0001F3FD", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
628 
629     assertFalse("woman+dark+ZWJ is not RGI_Emoji_ZWJ_Sequence",
630                 hbp(u"��\U0001F3FF\u200D", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
631     assertTrue("woman pilot: dark skin tone is RGI_Emoji_ZWJ_Sequence",
632                hbp(u"��\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
633 
634     // RGI_Emoji = all of the above
635     assertFalse("stopwatch is not RGI_Emoji", hbp(u"⏱", UCHAR_RGI_EMOJI));
636     assertTrue("stopwatch+emoji is RGI_Emoji", hbp(u"⏱\uFE0F", UCHAR_RGI_EMOJI));
637 
638     assertFalse("chipmunk is not RGI_Emoji", hbp(u"��", UCHAR_RGI_EMOJI));
639     assertTrue("chipmunk+emoji is RGI_Emoji", hbp(u"��\uFE0F", UCHAR_RGI_EMOJI));
640 
641     assertFalse("4+emoji is not RGI_Emoji", hbp(u"4\uFE0F", UCHAR_RGI_EMOJI));
642     assertTrue("4+emoji+keycap is RGI_Emoji", hbp(u"4\uFE0F\u20E3", UCHAR_RGI_EMOJI));
643 
644     assertFalse("[B] is not RGI_Emoji", hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI));
645     assertTrue("[BE] is RGI_Emoji", hbp(u"����", UCHAR_RGI_EMOJI));
646 
647     assertTrue("[flag] is RGI_Emoji", hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI));
648     assertTrue("[Scotland] is RGI_Emoji", hbp(u"��������������", UCHAR_RGI_EMOJI));
649 
650     assertTrue("bicyclist is RGI_Emoji", hbp(u"��", UCHAR_RGI_EMOJI));
651     assertTrue("bicyclist+medium is RGI_Emoji", hbp(u"��\U0001F3FD", UCHAR_RGI_EMOJI));
652 
653     assertFalse("woman+dark+ZWJ is not RGI_Emoji", hbp(u"��\U0001F3FF\u200D", UCHAR_RGI_EMOJI));
654     assertTrue("woman pilot: dark skin tone is RGI_Emoji",
655                hbp(u"��\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI));
656 
657     // UnicodeSet with properties of strings
658     IcuTestErrorCode errorCode(*this, "TestEmojiPropertiesOfStrings()");
659     UnicodeSet basic("[:Basic_Emoji:]", errorCode);
660     UnicodeSet keycaps("[:Emoji_Keycap_Sequence:]", errorCode);
661     UnicodeSet modified("[:RGI_Emoji_Modifier_Sequence:]", errorCode);
662     UnicodeSet flags("[:RGI_Emoji_Flag_Sequence:]", errorCode);
663     UnicodeSet tags("[:RGI_Emoji_Tag_Sequence:]", errorCode);
664     UnicodeSet combos("[:RGI_Emoji_ZWJ_Sequence:]", errorCode);
665     UnicodeSet rgi("[:RGI_Emoji:]", errorCode);
666     if (errorCode.errDataIfFailureAndReset("UnicodeSets")) {
667         return;
668     }
669 
670     // union of all sets except for "rgi" -- should be the same as "rgi"
671     UnicodeSet all(basic);
672     all.addAll(keycaps).addAll(modified).addAll(flags).addAll(tags).addAll(combos);
673 
674     UnicodeSet basicOnlyCp(basic);
675     basicOnlyCp.removeAllStrings();
676 
677     UnicodeSet rgiOnlyCp(rgi);
678     rgiOnlyCp.removeAllStrings();
679 
680     assertTrue("lots of Basic_Emoji", basic.size() > 1000);
681     assertEquals("12 Emoji_Keycap_Sequence", 12, keycaps.size());
682     assertTrue("lots of RGI_Emoji_Modifier_Sequence", modified.size() > 600);
683     assertTrue("lots of RGI_Emoji_Flag_Sequence", flags.size() > 250);
684     assertTrue("some RGI_Emoji_Tag_Sequence", tags.size() >= 3);
685     assertTrue("lots of RGI_Emoji_ZWJ_Sequence", combos.size() > 1300);
686     assertTrue("lots of RGI_Emoji", rgi.size() > 3000);
687 
688     assertTrue("lots of Basic_Emoji code points", basicOnlyCp.size() > 1000);
689     assertTrue("Basic_Emoji.hasStrings()", basic.hasStrings());
690     assertEquals("no Emoji_Keycap_Sequence code points", 0, keycaps.getRangeCount());
691     assertEquals("lots of RGI_Emoji_Modifier_Sequence", 0, modified.getRangeCount());
692     assertEquals("lots of RGI_Emoji_Flag_Sequence", 0, flags.getRangeCount());
693     assertEquals("some RGI_Emoji_Tag_Sequence", 0, tags.getRangeCount());
694     assertEquals("lots of RGI_Emoji_ZWJ_Sequence", 0, combos.getRangeCount());
695 
696     assertTrue("lots of RGI_Emoji code points", rgiOnlyCp.size() > 1000);
697     assertTrue("RGI_Emoji.hasStrings()", rgi.hasStrings());
698     assertEquals("RGI_Emoji/only-cp.size() == Basic_Emoji/only-cp.size()",
699                  rgiOnlyCp.size(), basicOnlyCp.size());
700     assertTrue("RGI_Emoji/only-cp == Basic_Emoji/only-cp", rgiOnlyCp == basicOnlyCp);
701     assertEquals("RGI_Emoji.size() == union.size()", rgi.size(), all.size());
702     assertTrue("RGI_Emoji == union", rgi == all);
703 
704     assertTrue("Basic_Emoji.contains(stopwatch+emoji)", basic.contains(u"⏱\uFE0F"));
705     assertTrue("Basic_Emoji.contains(chipmunk+emoji)", basic.contains(u"��\uFE0F"));
706     assertTrue("Emoji_Keycap_Sequence.contains(4+emoji+keycap)",
707                keycaps.contains(u"4\uFE0F\u20E3"));
708     assertTrue("RGI_Emoji_Flag_Sequence.contains([BE])", flags.contains(u"����"));
709     assertTrue("RGI_Emoji_Tag_Sequence.contains([Scotland])", tags.contains(u"��������������"));
710     assertTrue("RGI_Emoji_Modifier_Sequence.contains(bicyclist+medium)",
711                modified.contains(u"��\U0001F3FD"));
712     assertTrue("RGI_Emoji_ZWJ_Sequence.contains(woman pilot: dark skin tone)",
713                combos.contains(u"��\U0001F3FF\u200D✈\uFE0F"));
714     assertTrue("RGI_Emoji.contains(stopwatch+emoji)", rgi.contains(u"⏱\uFE0F"));
715     assertTrue("RGI_Emoji.contains(chipmunk+emoji)", rgi.contains(u"��\uFE0F"));
716     assertTrue("RGI_Emoji.contains(4+emoji+keycap)", rgi.contains(u"4\uFE0F\u20E3"));
717     assertTrue("RGI_Emoji.contains([BE] is RGI_Emoji)", rgi.contains(u"����"));
718     assertTrue("RGI_Emoji.contains([flag])", rgi.contains(u"\U0001F3F4"));
719     assertTrue("RGI_Emoji.contains([Scotland])", rgi.contains(u"��������������"));
720     assertTrue("RGI_Emoji.contains(bicyclist)", rgi.contains(u"��"));
721     assertTrue("RGI_Emoji.contains(bicyclist+medium)", rgi.contains(u"��\U0001F3FD"));
722     assertTrue("RGI_Emoji.contains(woman pilot: dark skin tone)", rgi.contains(u"��\U0001F3FF\u200D✈\uFE0F"));
723 }
724 
TestIndicPositionalCategory()725 void UnicodeTest::TestIndicPositionalCategory() {
726     IcuTestErrorCode errorCode(*this, "TestIndicPositionalCategory()");
727     UnicodeSet na(u"[:InPC=NA:]", errorCode);
728     assertTrue("mostly NA", 1000000 <= na.size() && na.size() <= UCHAR_MAX_VALUE - 500);
729     UnicodeSet vol(u"[:InPC=Visual_Order_Left:]", errorCode);
730     assertTrue("some Visual_Order_Left", 19 <= vol.size() && vol.size() <= 100);
731     assertEquals("U+08FF: NA", U_INPC_NA,
732                  u_getIntPropertyValue(0x08FF, UCHAR_INDIC_POSITIONAL_CATEGORY));
733     assertEquals("U+0900: Top", U_INPC_TOP,
734                  u_getIntPropertyValue(0x0900, UCHAR_INDIC_POSITIONAL_CATEGORY));
735     assertEquals("U+10A06: Overstruck", U_INPC_OVERSTRUCK,
736                  u_getIntPropertyValue(0x10A06, UCHAR_INDIC_POSITIONAL_CATEGORY));
737 }
738 
TestIndicSyllabicCategory()739 void UnicodeTest::TestIndicSyllabicCategory() {
740     IcuTestErrorCode errorCode(*this, "TestIndicSyllabicCategory()");
741     UnicodeSet other(u"[:InSC=Other:]", errorCode);
742     assertTrue("mostly Other", 1000000 <= other.size() && other.size() <= UCHAR_MAX_VALUE - 500);
743     UnicodeSet ava(u"[:InSC=Avagraha:]", errorCode);
744     assertTrue("some Avagraha", 16 <= ava.size() && ava.size() <= 100);
745     assertEquals("U+08FF: Other", U_INSC_OTHER,
746                  u_getIntPropertyValue(0x08FF, UCHAR_INDIC_SYLLABIC_CATEGORY));
747     assertEquals("U+0900: Bindu", U_INSC_BINDU,
748                  u_getIntPropertyValue(0x0900, UCHAR_INDIC_SYLLABIC_CATEGORY));
749     assertEquals("U+11065: Brahmi_Joining_Number", U_INSC_BRAHMI_JOINING_NUMBER,
750                  u_getIntPropertyValue(0x11065, UCHAR_INDIC_SYLLABIC_CATEGORY));
751 }
752 
TestVerticalOrientation()753 void UnicodeTest::TestVerticalOrientation() {
754     IcuTestErrorCode errorCode(*this, "TestVerticalOrientation()");
755     UnicodeSet r(u"[:vo=R:]", errorCode);
756     assertTrue("mostly R", 0xc0000 <= r.size() && r.size() <= 0xd0000);
757     UnicodeSet u(u"[:vo=U:]", errorCode);
758     assertTrue("much U", 0x40000 <= u.size() && u.size() <= 0x50000);
759     UnicodeSet tu(u"[:vo=Tu:]", errorCode);
760     assertTrue("some Tu", 147 <= tu.size() && tu.size() <= 300);
761     assertEquals("U+0E01: Rotated", U_VO_ROTATED,
762                  u_getIntPropertyValue(0x0E01, UCHAR_VERTICAL_ORIENTATION));
763     assertEquals("U+3008: Transformed_Rotated", U_VO_TRANSFORMED_ROTATED,
764                  u_getIntPropertyValue(0x3008, UCHAR_VERTICAL_ORIENTATION));
765     assertEquals("U+33333: Upright", U_VO_UPRIGHT,
766                  u_getIntPropertyValue(0x33333, UCHAR_VERTICAL_ORIENTATION));
767 }
768 
TestDefaultScriptExtensions()769 void UnicodeTest::TestDefaultScriptExtensions() {
770     // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii
771     // but some of its characters revert to scx=<script> which is usually Common.
772     IcuTestErrorCode errorCode(*this, "TestDefaultScriptExtensions()");
773     UScriptCode scx[20];
774     scx[0] = USCRIPT_INVALID_CODE;
775     assertEquals("U+3000 num scx", 1,  // IDEOGRAPHIC SPACE
776                  uscript_getScriptExtensions(0x3000, scx, UPRV_LENGTHOF(scx), errorCode));
777     assertEquals("U+3000 num scx[0]", USCRIPT_COMMON, scx[0]);
778     scx[0] = USCRIPT_INVALID_CODE;
779     assertEquals("U+3012 num scx", 1,  // POSTAL MARK
780                  uscript_getScriptExtensions(0x3012, scx, UPRV_LENGTHOF(scx), errorCode));
781     assertEquals("U+3012 num scx[0]", USCRIPT_COMMON, scx[0]);
782 }
783 
TestInvalidCodePointFolding()784 void UnicodeTest::TestInvalidCodePointFolding() {
785     // Test behavior when an invalid code point is passed to u_foldCase
786     static const UChar32 invalidCodePoints[] = {
787             0xD800, // lead surrogate
788             0xDFFF, // trail surrogate
789             0xFDD0, // noncharacter
790             0xFFFF, // noncharacter
791             0x110000, // out of range
792             -1 // negative
793     };
794     for (int32_t i=0; i<UPRV_LENGTHOF(invalidCodePoints); ++i) {
795         UChar32 cp = invalidCodePoints[i];
796         assertEquals("Invalid code points should be echoed back",
797                 cp, u_foldCase(cp, U_FOLD_CASE_DEFAULT));
798         assertEquals("Invalid code points should be echoed back",
799                 cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I));
800     }
801 }
802 
TestBinaryCharacterProperties()803 void UnicodeTest::TestBinaryCharacterProperties() {
804 #if !UCONFIG_NO_NORMALIZATION
805     IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()");
806     // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
807     for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) {
808         const USet *uset = u_getBinaryPropertySet((UProperty)prop, errorCode);
809         if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", (int)prop)) {
810             continue;
811         }
812         const UnicodeSet &set = *UnicodeSet::fromUSet(uset);
813         int32_t count = set.getRangeCount();
814         if (count == 0) {
815             assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")",
816                 u_hasBinaryProperty(0x20, (UProperty)prop));
817             assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")",
818                 u_hasBinaryProperty(0x61, (UProperty)prop));
819             assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")",
820                 u_hasBinaryProperty(0x4e00, (UProperty)prop));
821         } else {
822             UChar32 c = set.getRangeStart(0);
823             if (c > 0) {
824                 assertFalse(
825                     UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) +
826                         u", " + prop + u")",
827                     u_hasBinaryProperty(c - 1, (UProperty)prop));
828             }
829             assertTrue(
830                 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
831                     u", " + prop + u")",
832                 u_hasBinaryProperty(c, (UProperty)prop));
833             c = set.getRangeEnd(count - 1);
834             assertTrue(
835                 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
836                     u", " + prop + u")",
837                 u_hasBinaryProperty(c, (UProperty)prop));
838             if (c < 0x10ffff) {
839                 assertFalse(
840                     UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c + 1) +
841                         u", " + prop + u")",
842                     u_hasBinaryProperty(c + 1, (UProperty)prop));
843             }
844         }
845     }
846 #endif
847 }
848 
TestIntCharacterProperties()849 void UnicodeTest::TestIntCharacterProperties() {
850 #if !UCONFIG_NO_NORMALIZATION
851     IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()");
852     // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
853     for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) {
854         const UCPMap *map = u_getIntPropertyMap((UProperty)prop, errorCode);
855         if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", (int)prop)) {
856             continue;
857         }
858         uint32_t value;
859         UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
860         assertTrue("int property first range", end >= 0);
861         UChar32 c = end / 2;
862         assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c),
863             u_getIntPropertyValue(c, (UProperty)prop), value);
864         end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
865         assertTrue("int property later range", end >= 0);
866         assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end),
867             u_getIntPropertyValue(end, (UProperty)prop), value);
868         // ucpmap_get() API coverage
869         // TODO: move to cucdtst.c
870         assertEquals(
871             "int property upcmap_get(U+0061)",
872             u_getIntPropertyValue(0x61, (UProperty)prop), ucpmap_get(map, 0x61));
873     }
874 #endif
875 }
876 
877 namespace {
878 
getPropName(UProperty property,int32_t nameChoice)879 const char *getPropName(UProperty property, int32_t nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
880     const char *name = u_getPropertyName(property, (UPropertyNameChoice)nameChoice);
881     return name != nullptr ? name : "null";
882 }
883 
getValueName(UProperty property,int32_t value,int32_t nameChoice)884 const char *getValueName(UProperty property, int32_t value, int32_t nameChoice)
885         UPRV_NO_SANITIZE_UNDEFINED {
886     const char *name = u_getPropertyValueName(property, value, (UPropertyNameChoice)nameChoice);
887     return name != nullptr ? name : "null";
888 }
889 
890 }  // namespace
891 
TestPropertyNames()892 void UnicodeTest::TestPropertyNames() {
893     IcuTestErrorCode errorCode(*this, "TestPropertyNames()");
894     // Test names of certain properties & values.
895     // The UPropertyNameChoice is really an integer with only a couple of named constants.
896     UProperty prop = UCHAR_WHITE_SPACE;
897     constexpr int32_t SHORT = U_SHORT_PROPERTY_NAME;
898     constexpr int32_t LONG = U_LONG_PROPERTY_NAME;
899     assertEquals("White_Space: index -1", "null", getPropName(prop, -1));
900     assertEquals("White_Space: short", "WSpace", getPropName(prop, SHORT));
901     assertEquals("White_Space: long", "White_Space", getPropName(prop, LONG));
902     assertEquals("White_Space: index 2", "space", getPropName(prop, 2));
903     assertEquals("White_Space: index 3", "null", getPropName(prop, 3));
904 
905     prop = UCHAR_SIMPLE_CASE_FOLDING;
906     assertEquals("Simple_Case_Folding: index -1", "null", getPropName(prop, -1));
907     assertEquals("Simple_Case_Folding: short", "scf", getPropName(prop, SHORT));
908     assertEquals("Simple_Case_Folding: long", "Simple_Case_Folding", getPropName(prop, LONG));
909     assertEquals("Simple_Case_Folding: index 2", "sfc", getPropName(prop, 2));
910     assertEquals("Simple_Case_Folding: index 3", "null", getPropName(prop, 3));
911 
912     prop = UCHAR_CASED;
913     assertEquals("Cased=Y: index -1", "null", getValueName(prop, 1, -1));
914     assertEquals("Cased=Y: short", "Y", getValueName(prop, 1, SHORT));
915     assertEquals("Cased=Y: long", "Yes", getValueName(prop, 1, LONG));
916     assertEquals("Cased=Y: index 2", "T", getValueName(prop, 1, 2));
917     assertEquals("Cased=Y: index 3", "True", getValueName(prop, 1, 3));
918     assertEquals("Cased=Y: index 4", "null", getValueName(prop, 1, 4));
919 
920     prop = UCHAR_DECOMPOSITION_TYPE;
921     int32_t value = U_DT_NOBREAK;
922     assertEquals("dt=Nb: index -1", "null", getValueName(prop, value, -1));
923     assertEquals("dt=Nb: short", "Nb", getValueName(prop, value, SHORT));
924     assertEquals("dt=Nb: long", "Nobreak", getValueName(prop, value, LONG));
925     assertEquals("dt=Nb: index 2", "nb", getValueName(prop, value, 2));
926     assertEquals("dt=Nb: index 3", "null", getValueName(prop, value, 3));
927 
928     // Canonical_Combining_Class:
929     // The UCD inserts the numeric values in the second filed of its PropertyValueAliases.txt lines.
930     // In ICU, we don't treat these as names,
931     // they are just the numeric values returned by u_getCombiningClass().
932     // We return the real short and long names for the usual choice constants.
933     prop = UCHAR_CANONICAL_COMBINING_CLASS;
934     assertEquals("ccc=230: index -1", "null", getValueName(prop, 230, -1));
935     assertEquals("ccc=230: short", "A", getValueName(prop, 230, SHORT));
936     assertEquals("ccc=230: long", "Above", getValueName(prop, 230, LONG));
937     assertEquals("ccc=230: index 2", "null", getValueName(prop, 230, 2));
938 
939     prop = UCHAR_GENERAL_CATEGORY;
940     value = U_DECIMAL_DIGIT_NUMBER;
941     assertEquals("gc=Nd: index -1", "null", getValueName(prop, value, -1));
942     assertEquals("gc=Nd: short", "Nd", getValueName(prop, value, SHORT));
943     assertEquals("gc=Nd: long", "Decimal_Number", getValueName(prop, value, LONG));
944     assertEquals("gc=Nd: index 2", "digit", getValueName(prop, value, 2));
945     assertEquals("gc=Nd: index 3", "null", getValueName(prop, value, 3));
946 
947     prop = UCHAR_GENERAL_CATEGORY_MASK;
948     value = U_GC_P_MASK;
949     assertEquals("gc=P mask: index -1", "null", getValueName(prop, value, -1));
950     assertEquals("gc=P mask: short", "P", getValueName(prop, value, SHORT));
951     assertEquals("gc=P mask: long", "Punctuation", getValueName(prop, value, LONG));
952     assertEquals("gc=P mask: index 2", "punct", getValueName(prop, value, 2));
953     assertEquals("gc=P mask: index 3", "null", getValueName(prop, value, 3));
954 }
955 
TestIDSUnaryOperator()956 void UnicodeTest::TestIDSUnaryOperator() {
957     IcuTestErrorCode errorCode(*this, "TestIDSUnaryOperator()");
958     // New in Unicode 15.1 for just two characters.
959     assertFalse("U+2FFC IDSU", u_hasBinaryProperty(0x2ffc, UCHAR_IDS_UNARY_OPERATOR));
960     assertFalse("U+2FFD IDSU", u_hasBinaryProperty(0x2ffd, UCHAR_IDS_UNARY_OPERATOR));
961     assertTrue("U+2FFE IDSU", u_hasBinaryProperty(0x2ffe, UCHAR_IDS_UNARY_OPERATOR));
962     assertTrue("U+2FFF IDSU", u_hasBinaryProperty(0x2fff, UCHAR_IDS_UNARY_OPERATOR));
963     assertFalse("U+3000 IDSU", u_hasBinaryProperty(0x3000, UCHAR_IDS_UNARY_OPERATOR));
964     assertFalse("U+3001 IDSU", u_hasBinaryProperty(0x3001, UCHAR_IDS_UNARY_OPERATOR));
965 
966     // Property name works and gets the correct set.
967     UnicodeSet idsu(u"[:IDS_Unary_Operator:]", errorCode);
968     assertEquals("IDSU set number of characters", 2, idsu.size());
969     assertFalse("idsu.contains(U+2FFD)", idsu.contains(0x2ffd));
970     assertTrue("idsu.contains(U+2FFE)", idsu.contains(0x2ffe));
971     assertTrue("idsu.contains(U+2FFF)", idsu.contains(0x2fff));
972     assertFalse("idsu.contains(U+3000)", idsu.contains(0x3000));
973 }
974 
975 namespace {
976 
isMathStart(UChar32 c)977 bool isMathStart(UChar32 c) {
978     return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_START);
979 }
980 
isMathContinue(UChar32 c)981 bool isMathContinue(UChar32 c) {
982     return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_CONTINUE);
983 }
984 
985 }  // namespace
986 
TestIDCompatMath()987 void UnicodeTest::TestIDCompatMath() {
988     IcuTestErrorCode errorCode(*this, "TestIDCompatMath()");
989     assertFalse("U+00B1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb1));
990     assertTrue("U+00B2 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb2));
991     assertTrue("U+00B3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb3));
992     assertFalse("U+00B4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb4));
993     assertFalse("U+207F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x207f));
994     assertTrue("U+2080 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2080));
995     assertTrue("U+208E UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208e));
996     assertFalse("U+208F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208f));
997     assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2201));
998     assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2202));
999     assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D6C1));
1000     assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C3));
1001     assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C4));
1002 
1003     assertFalse("U+00B2 UCHAR_ID_COMPAT_MATH_START", isMathStart(0xb2));
1004     assertFalse("U+2080 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2080));
1005     assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2201));
1006     assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2202));
1007     assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D6C1));
1008     assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C3));
1009     assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C4));
1010 
1011     // Property names work and get the correct sets.
1012     UnicodeSet idcmStart(u"[:ID_Compat_Math_Start:]", errorCode);
1013     UnicodeSet idcmContinue(u"[:ID_Compat_Math_Continue:]", errorCode);
1014     assertEquals("ID_Compat_Math_Start set number of characters", 13, idcmStart.size());
1015     assertEquals("ID_Compat_Math_Continue set number of characters", 43, idcmContinue.size());
1016     assertTrue("ID_Compat_Math_Start is a subset of ID_Compat_Math_Continue",
1017                idcmContinue.containsAll(idcmStart));
1018     assertFalse("idcmContinue.contains(U+207F)", idcmContinue.contains(0x207f));
1019     assertTrue("idcmContinue.contains(U+2080)", idcmContinue.contains(0x2080));
1020     assertTrue("idcmContinue.contains(U+208E)", idcmContinue.contains(0x208e));
1021     assertFalse("idcmContinue.contains(U+208F)", idcmContinue.contains(0x208f));
1022     assertFalse("idcmStart.contains(U+2201)", idcmStart.contains(0x2201));
1023     assertTrue("idcmStart.contains(U+2202)", idcmStart.contains(0x2202));
1024     assertTrue("idcmStart.contains(U+1D7C3)", idcmStart.contains(0x1D7C3));
1025     assertFalse("idcmStart.contains(U+1D7C4)", idcmStart.contains(0x1D7C4));
1026 }
1027