xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/draft/Misc.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.draft;
2 
3 import com.google.common.base.Joiner;
4 import com.ibm.icu.impl.Row;
5 import com.ibm.icu.impl.Row.R2;
6 import com.ibm.icu.lang.UCharacter;
7 import com.ibm.icu.lang.UProperty;
8 import com.ibm.icu.lang.UScript;
9 import com.ibm.icu.text.Collator;
10 import com.ibm.icu.text.DateFormat;
11 import com.ibm.icu.text.DateTimePatternGenerator;
12 import com.ibm.icu.text.DecimalFormat;
13 import com.ibm.icu.text.Normalizer2;
14 import com.ibm.icu.text.RawCollationKey;
15 import com.ibm.icu.text.RuleBasedCollator;
16 import com.ibm.icu.text.SimpleDateFormat;
17 import com.ibm.icu.text.StringTransform;
18 import com.ibm.icu.text.Transliterator;
19 import com.ibm.icu.text.UTF16;
20 import com.ibm.icu.text.UnicodeSet;
21 import com.ibm.icu.util.TimeZone;
22 import com.ibm.icu.util.ULocale;
23 import java.io.IOException;
24 import java.io.PrintWriter;
25 import java.util.Date;
26 import java.util.LinkedHashSet;
27 import java.util.List;
28 import java.util.Locale;
29 import java.util.Map;
30 import java.util.Set;
31 import java.util.TreeMap;
32 import java.util.TreeSet;
33 import org.unicode.cldr.tool.ToolConfig;
34 import org.unicode.cldr.util.Builder;
35 import org.unicode.cldr.util.CLDRConfig;
36 import org.unicode.cldr.util.CLDRFile;
37 import org.unicode.cldr.util.CLDRFile.WinningChoice;
38 import org.unicode.cldr.util.CLDRPaths;
39 import org.unicode.cldr.util.Factory;
40 import org.unicode.cldr.util.LanguageTagParser;
41 import org.unicode.cldr.util.LocaleIDParser;
42 import org.unicode.cldr.util.PluralSnapshot;
43 import org.unicode.cldr.util.StandardCodes;
44 import org.unicode.cldr.util.SupplementalDataInfo;
45 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
46 import org.unicode.cldr.util.Timer;
47 
48 public class Misc {
main(String[] args)49     public static void main(String[] args) throws IOException {
50         showDefaultContent(
51                 "bn", "sw", "mr", "ta", "ms", "am", "af", "zu", "et", "is", "ur", "te", "gu", "kn",
52                 "ml", "gl", "eu");
53         showSortKey();
54         showNumberSamples();
55         showDateSamples();
56         showExemplarSize();
57 
58         doNFC();
59         showPlurals();
60 
61         String[] locales =
62                 "zh en es hi fr ar pt ru id bn ur ja de fil sw pa jv ko tr vi it te mr th fa ta pl lah gu my ms uk zh_Hant kn su ml nl az or ro uz bho ps ha ku mad yo ig si mg sd hu am om kk el ne be mai sr cs km as sv mag mwr sn ny ca bg hne tg bgc ii he dcc ug fuv qu rw min af zu mn bjn so ki hr ak tk fi sq da bya sk gn bal no lua xh bs ht syl ka bjj ban sat hy za luy rn bug bem luo wtm st lo gl ti shn ceb ks mfa ace lt ky bm lg shi tn bcl glk war kok bew kln kam umb bo suk ee kmb ay pam bhk sas bbc swv nso tpi rjb gbm lmn ff kab sl ts ba cv kri gon ndc guz wo tzm mak kfy ln ljp mk efi ibb doi awa mos nyn vmw mer kru lv sid pag gno sck tcy wbq nd lrc ss cgg brh xog nn sg xnr dyu rmt teo kxm mdh hno lu eu khn wbr tsg rej rif brx ilo kbd et ce kg fy hil kj cy ast av ve udm ga tt sah myv tet gaa ady mt dv fj nr is mdf kum kha sm kpv lez pap krc inh oc se tyv zdj dz bi gag to koi lbe mi ab os ty kl gil iu ch fo rm mh chk haw pon lb pau tvl sa kos na ho yap gd uli niu la tkl eo kl"
63                         .split(" ");
64         SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
65         Set<String> scripts = new LinkedHashSet<>();
66         for (String locale : locales) {
67             Set<BasicLanguageData> items = sdi.getBasicLanguageData(locale);
68             if (items == null) {
69                 System.out.println(locale + "\t?");
70                 continue;
71             }
72             scripts.clear();
73             for (BasicLanguageData item : items) {
74                 if (item.getType() == BasicLanguageData.Type.secondary) {
75                     continue;
76                 }
77                 Set<String> script2 = item.getScripts();
78                 if (script2 != null) {
79                     scripts.addAll(script2);
80                 }
81             }
82             if (scripts.size() == 0) {
83                 System.out.println(locale + "\t?");
84                 continue;
85             }
86             if (locale.equals("zh")) {
87                 scripts.remove("Hant");
88             } else if (locale.equals("zh_Hant")) {
89                 scripts.add("Hant");
90             }
91             System.out.println(locale + "\t" + Joiner.on(" ").join(scripts));
92         }
93 
94         StringTransform unicode = Transliterator.getInstance("hex/unicode");
95         UnicodeSet exclude = new UnicodeSet("[:bidimirrored:]");
96         for (int i = 0; i < 0x110000; ++i) {
97             if (exclude.contains(i)) continue;
98             String name = UCharacter.getExtendedName(i);
99             if (name == null) continue;
100             String reverse = name.replaceAll("RIGHT", "LEFT");
101             if (reverse.equals(name)) {
102                 reverse = name.replaceAll("REVERSED ", "");
103                 if (reverse.equals(name)) continue;
104             }
105             int rev = UCharacter.getCharFromName(reverse);
106             if (rev == -1) continue;
107             System.out.println(
108                     unicode.transform(UTF16.valueOf(i))
109                             + "\t"
110                             + UTF16.valueOf(i)
111                             + "\t"
112                             + name
113                             + "\t"
114                             + UTF16.valueOf(rev)
115                             + "\t"
116                             + unicode.transform(UTF16.valueOf(rev))
117                             + "\t"
118                             + reverse);
119         }
120         System.out.println(Locale.SIMPLIFIED_CHINESE);
121         System.out.println(Locale.TRADITIONAL_CHINESE);
122         for (String s : StandardCodes.make().getGoodCountries()) {
123             System.out.println(s + "\t" + ULocale.getDisplayCountry("und-" + s, ULocale.ENGLISH));
124         }
125     }
126 
showDefaultContent(String... strings)127     private static void showDefaultContent(String... strings) {
128         SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
129         final CLDRConfig info = ToolConfig.getToolInstance();
130         CLDRFile english = info.getEnglish();
131         Set<String> defaultContents = sdi.getDefaultContentLocales();
132         for (String string : strings) {
133             String defCon = null;
134             for (String dc : defaultContents) {
135                 if (string.equals(LocaleIDParser.getParent(dc))) {
136                     defCon = dc;
137                     break;
138                 }
139             }
140             System.out.println(string + "\t" + defCon + "\t" + english.getName(defCon));
141         }
142     }
143 
showSortKey()144     private static void showSortKey() {
145         String[] tests = "a ä A ぁ あ ァ ァ ア ア ㋐".split(" ");
146         RuleBasedCollator c = (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
147         c.setStrength(RuleBasedCollator.QUATERNARY);
148         c.setCaseLevel(true);
149         c.setHiraganaQuaternary(true);
150         for (String test : tests) {
151             for (boolean caseLevel : new boolean[] {false, true}) {
152                 c.setCaseLevel(caseLevel);
153                 for (boolean hiraganaQuaternary : new boolean[] {false, true}) {
154                     c.setHiraganaQuaternary(hiraganaQuaternary);
155                     System.out.print((caseLevel ? "Cl\t" : "\t"));
156                     System.out.print((hiraganaQuaternary ? "Hl\t" : "\t"));
157                     System.out.print(test + "\t");
158                     RawCollationKey key = c.getRawCollationKey(test, null);
159                     for (byte item : key.bytes) {
160                         System.out.print(Integer.toHexString(0xFF & item) + "\t");
161                     }
162                     System.out.println();
163                 }
164             }
165         }
166     }
167 
showNumberSamples()168     private static void showNumberSamples() {
169         String[] tests = {"a$b", "abcd_defg-hi", "abcd-defg$xy", "ab-d$efg-419", "root", "", "und"};
170         for (String test : tests) {
171             ULocale locale = ULocale.forLanguageTag(test);
172             System.out.println(test + " -> " + locale);
173         }
174         DecimalFormat df = new DecimalFormat("***");
175         for (int i = 10; i > -10; --i) {
176             String sample = df.format(1.23456789 * Math.pow(10, i));
177             System.out.println(sample);
178         }
179     }
180 
showDateSamples()181     private static void showDateSamples() {
182         Map<String, Row.R2<Integer, Integer>> specials =
183                 Builder.with(new TreeMap<String, Row.R2<Integer, Integer>>())
184                         .put("full-date", Row.of(DateFormat.FULL, DateFormat.NONE))
185                         .put("long-date", Row.of(DateFormat.LONG, DateFormat.NONE))
186                         .put("medium-date", Row.of(DateFormat.MEDIUM, DateFormat.NONE))
187                         .put("short-date", Row.of(DateFormat.SHORT, DateFormat.NONE))
188                         .put("full-time", Row.of(DateFormat.NONE, DateFormat.FULL))
189                         .put("long-time", Row.of(DateFormat.NONE, DateFormat.LONG))
190                         .put("medium-time", Row.of(DateFormat.NONE, DateFormat.MEDIUM))
191                         .put("short-time", Row.of(DateFormat.NONE, DateFormat.SHORT))
192                         .freeze();
193         Date sample = new Date(2011 - 1900, 12 - 1, 30, 14, 45, 59);
194         final ULocale english = ULocale.ENGLISH;
195         final ULocale otherLocale = new ULocale("el");
196         DateTimePatternGenerator englishGenerator = DateTimePatternGenerator.getInstance(english);
197         DateTimePatternGenerator otherGenerator = DateTimePatternGenerator.getInstance(otherLocale);
198         for (String dp :
199                 new String[] {
200                     "d",
201                     "h",
202                     "H",
203                     "hm",
204                     "Hm",
205                     "Hms",
206                     "hms",
207                     "hmv",
208                     "Hmv",
209                     "hv",
210                     "Hv",
211                     "M",
212                     "Md",
213                     "MEd",
214                     "MMM",
215                     "MMMd",
216                     "MMMEd",
217                     "ms",
218                     "y",
219                     "yM",
220                     "yMd",
221                     "yMEd",
222                     "yMMM",
223                     "yMMMd",
224                     "yMMMEd",
225                     "yMMMM",
226                     "yQ",
227                     "yQQQ",
228                     "EEEd",
229                     "full-date",
230                     "long-date",
231                     "medium-date",
232                     "short-date",
233                     "full-time",
234                     "long-time",
235                     "medium-time",
236                     "short-time",
237                     "MMMM",
238                     "MMMMd",
239                     "E",
240                     "Ed",
241                     "GGGGyMd",
242                     "GGGGyMMMMEEEEdd",
243                     "GGGGyyyyMMMMd",
244                     "HHmm",
245                     "HHmmss",
246                     "HHmmZ",
247                     "Hmm",
248                     "MMd",
249                     "MMdd",
250                     "MMMdd",
251                     "MMMEEEd",
252                     "MMMMdd",
253                     "MMMMEd",
254                     "MMMMEEEd",
255                     "mmss",
256                     "yMMMMccccd",
257                     "yyMM",
258                     "yyMMdd",
259                     "yyMMM",
260                     "yyMMMd",
261                     "yyMMMEEEd",
262                     "yyQ",
263                     "yyQQQQ",
264                     "yyyy",
265                     "yyyyLLLL",
266                     "yyyyM",
267                     "yyyyMEEEd",
268                     "yyyyMM",
269                     "yyyyMMM",
270                     "yyyyMMMM",
271                     "yyyyMMMMEEEEd",
272                     "yyyyQQQQ",
273                     "hmz",
274                     "hz",
275                     "LLL",
276                     "LLLL",
277                     "MMMMEEEEd",
278                     "yMMMMd",
279                     "yMMMMEEEEd"
280                 }) {
281             final String formattedEnglish =
282                     getFormatted(specials, sample, dp, english, englishGenerator);
283             final String formattedOther =
284                     getFormatted(specials, sample, dp, otherLocale, otherGenerator);
285             System.out.println(dp + "\t«" + formattedEnglish + "»\t«" + formattedOther + "»");
286         }
287     }
288 
getFormatted( Map<String, Row.R2<Integer, Integer>> specials, Date sample, String dp, ULocale ulocale, DateTimePatternGenerator generator)289     private static String getFormatted(
290             Map<String, Row.R2<Integer, Integer>> specials,
291             Date sample,
292             String dp,
293             ULocale ulocale,
294             DateTimePatternGenerator generator) {
295         Row.R2<Integer, Integer> special = specials.get(dp);
296         DateFormat df;
297         if (special != null) {
298             df = DateFormat.getDateTimeInstance(special.get0(), special.get1(), ulocale);
299         } else {
300             String pat = generator.getBestPattern(dp);
301             df = new SimpleDateFormat(pat, ulocale);
302         }
303         df.setTimeZone(TimeZone.getTimeZone("GMT"));
304         final String formatted = df.format(sample);
305         return formatted;
306     }
307 
showExemplarSize()308     private static void showExemplarSize() {
309         final CLDRConfig info = ToolConfig.getToolInstance();
310         CLDRFile english = info.getEnglish();
311         Factory factory = info.getCldrFactory();
312         SupplementalDataInfo dataInfo = info.getSupplementalDataInfo();
313         Map<String, Map<String, R2<List<String>, String>>> type_tag_replacement =
314                 dataInfo.getLocaleAliasInfo();
315         Map<String, R2<List<String>, String>> lang2replacement =
316                 type_tag_replacement.get("language");
317 
318         LanguageTagParser ltp = new LanguageTagParser();
319         String[] locales =
320                 "en ru nl en-GB fr de it pl pt-BR es tr th ja zh-CN zh-TW ko ar bg sr uk ca hr cs da fil fi hu id lv lt no pt-PT ro sk sl es-419 sv vi el iw fa hi am af et is ms sw zu bn mr ta eu fr-CA gl zh-HK ur gu kn ml te"
321                         .split(" ");
322         Set<String> nameAndInfo = new TreeSet<>(info.getCollator());
323         for (String localeCode : locales) {
324             String baseLanguage = ltp.set(localeCode).getLanguage();
325             R2<List<String>, String> temp = lang2replacement.get(baseLanguage);
326             if (temp != null) {
327                 baseLanguage = temp.get0().get(0);
328             }
329             String englishName = english.getName(baseLanguage);
330             CLDRFile cldrFile = factory.make(baseLanguage, false);
331             UnicodeSet set = cldrFile.getExemplarSet("", WinningChoice.WINNING);
332             int script = -1;
333             for (String s : set) {
334                 int cp = s.codePointAt(0);
335                 script = UScript.getScript(cp);
336                 if (script != UScript.COMMON && script != UScript.INHERITED) {
337                     break;
338                 }
339             }
340             String nativeName = cldrFile.getName(baseLanguage);
341             nameAndInfo.add(
342                     englishName
343                             + "\t"
344                             + nativeName
345                             + "\t"
346                             + baseLanguage
347                             + "\t"
348                             + UScript.getShortName(script));
349         }
350 
351         for (String item : nameAndInfo) {
352             System.out.println(item);
353         }
354         // for (String localeCode : locales) {
355         // String baseLanguage = ltp.set(localeCode).getLanguage();
356         // R2<List<String>, String> temp = lang2replacement.get(baseLanguage);
357         // if (temp != null) {
358         // baseLanguage = temp.get0().get(0);
359         // }
360         // int size = -1;
361         //
362         // try {
363         // CLDRFile cldrFile = factory.make(baseLanguage, false);
364         // UnicodeSet set = cldrFile.getExemplarSet("", WinningChoice.WINNING);
365         // size = set.size();
366         // } catch (Exception e) {
367         // }
368         //
369         // System.out.println(localeCode + "\t" + size);
370         // }
371     }
372 
373     static final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
374     static final Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
375 
doNFC()376     private static void doNFC() {
377 
378         StringBuilder b = new StringBuilder();
379         for (int i = 0; i < 0x110000; ++i) {
380             b.setLength(0);
381             b.appendCodePoint(i);
382             boolean isNfd = nfd.isNormalized(b);
383             boolean isNfdNew = IsNfd.isNormalizedUpTo(b) < 0;
384             if (isNfd != isNfdNew) {
385                 IsNfd.isNormalizedUpTo(b);
386                 throw new IllegalArgumentException();
387             }
388         }
389         String[] tests = {"Mark", "Μάρκος", nfd.normalize("Μάρκος")};
390         long[] times = new long[2];
391         // warmup
392         for (String test : tests) {
393             times[0] = times[1] = Long.MIN_VALUE;
394             time(nfc, test, 10000000, "NFC", times);
395             time(nfd, test, 10000000, "NFD", times);
396             time(test, 10000000, "NFDx", times);
397         }
398         System.out.println();
399         for (String test : tests) {
400             times[0] = times[1] = Long.MIN_VALUE;
401             time(nfc, test, 100000000, "NFC", times);
402             time(nfd, test, 100000000, "NFD", times);
403             time(test, 100000000, "NFDx", times);
404         }
405     }
406 
407     // static class ByteTrie {
408     // static class Block {
409     // byte[] values = new byte[128];
410     // }
411     // int[] index;
412     // byte[][] blocks;
413     // static class Builder {
414     // Map<Block,Integer> backIndex = new HashMap<Block, Integer>();
415     // Block block = new Block();
416     // int pos = 0;
417     // void append(byte item) {
418     // if (pos >= 128) {
419     // // if block is in backIndex, use the index, otherwise add and create new
420     // Block
421     // } else {
422     // block.values[pos++] = item;
423     // }
424     // }
425     // }
426     // }
427 
428     static class IsNfd {
429         static final byte[] info = new byte[0x110000];
430 
431         static {
432             for (int i = 0; i < 0x110000; ++i) {
433                 int nfdqc = UCharacter.getIntPropertyValue(i, UProperty.NFD_QUICK_CHECK);
434                 if (nfdqc == 0) {
435                     info[i] = (byte) 0xFF;
436                     continue;
437                 }
438                 int ccc = UCharacter.getIntPropertyValue(i, UProperty.CANONICAL_COMBINING_CLASS);
439                 info[i] = (byte) ccc;
440 
441                 // if (ccc != 0) {
442                 // info[i] = (byte) ccc;
443                 // continue;
444                 // }
445                 // int gc = UCharacter.getIntPropertyValue(i,
446                 // UProperty.GENERAL_CATEGORY);
447                 // if (gc != UCharacter.UNASSIGNED) {
448                 // info[i] = (byte) 0;
449                 // continue;
450                 // }
451                 // int nc = UCharacter.getIntPropertyValue(i,
452                 // UProperty.NONCHARACTER_CODE_POINT);
453                 // if (nc == yes) {
454                 // info[i] = (byte) 0;
455                 // continue;
456                 // }
457                 // info[i] = (byte) 0xFF;
458             }
459         }
460 
461         public static String normalize(CharSequence s) {
462             int normalizedUpTo = isNormalizedUpTo(s);
463             if (normalizedUpTo < 0) {
464                 return s.toString();
465             }
466             return nfd.normalizeSecondAndAppend(
467                             new StringBuilder(s.subSequence(0, normalizedUpTo)),
468                             s.subSequence(normalizedUpTo, s.length()))
469                     .toString();
470         }
471 
472         public static int isNormalizedUpTo(CharSequence s) {
473             final int length = s.length();
474             int lastNonStarterIndex = 0;
475             int lastByte = 0;
476             int i;
477             for (i = 0; i < length; ++i) {
478                 int cp = s.charAt(i);
479                 if (cp >= 0xD800 && cp < 0xDC00) {
480                     cp = Character.codePointAt(s, i);
481                 }
482                 int b = info[cp] & 0xFF;
483                 if (b == 0) {
484                     lastNonStarterIndex = i;
485                     lastByte = b;
486                 } else if (b == lastByte) {
487                     // do nothing, common case
488                 } else if (b < lastByte || b == 0xFF) {
489                     return lastNonStarterIndex; // failure
490                 } else {
491                     lastByte = b; // increasing CCC, ok
492                 }
493                 if (cp > 0xFFFF) {
494                     ++i;
495                 }
496             }
497             return -1;
498         }
499     }
500 
501     private static void time(String test, int iterations, String name, long[] times) {
502         System.out.println(test);
503         System.gc();
504         System.gc();
505         System.gc();
506 
507         Timer t = new Timer();
508         t.start();
509         for (int i = iterations; i > 0; --i) {
510             IsNfd.isNormalizedUpTo(test);
511         }
512         long isNfc = t.getDuration();
513         if (times[0] != Long.MIN_VALUE) {
514             System.out.println("\tis" + name + ":\t" + t.toString(iterations, times[0]));
515         } else {
516             System.out.println("\tis" + name + ":\t" + t.toString(iterations));
517         }
518         times[0] = isNfc;
519 
520         System.gc();
521         System.gc();
522         System.gc();
523         t.start();
524         for (int i = iterations; i > 0; --i) {
525             IsNfd.normalize(test);
526         }
527         long toNfc = t.getDuration();
528         if (times[1] != Long.MIN_VALUE) {
529             System.out.println("\tto" + name + ":\t" + t.toString(iterations, times[1]));
530         } else {
531             System.out.println("\tto" + name + ":\t" + t.toString(iterations));
532         }
533         times[1] = toNfc;
534     }
535 
536     private static void time(
537             Normalizer2 nfx, String test, int iterations, String name, long[] times) {
538         System.out.println(test);
539         System.gc();
540         System.gc();
541         System.gc();
542 
543         Timer t = new Timer();
544         t.start();
545         for (int i = iterations; i > 0; --i) {
546             nfx.isNormalized(test);
547         }
548         long isNfc = t.getDuration();
549         if (times[0] != Long.MIN_VALUE) {
550             System.out.println("\tis" + name + ":\t" + t.toString(iterations, times[0]));
551         } else {
552             System.out.println("\tis" + name + ":\t" + t.toString(iterations));
553         }
554         times[0] = isNfc;
555 
556         System.gc();
557         System.gc();
558         System.gc();
559         t.start();
560         for (int i = iterations; i > 0; --i) {
561             nfx.normalize(test);
562         }
563         long toNfc = t.getDuration();
564         if (times[1] != Long.MIN_VALUE) {
565             System.out.println("\tto" + name + ":\t" + t.toString(iterations, times[1]));
566         } else {
567             System.out.println("\tto" + name + ":\t" + t.toString(iterations));
568         }
569         times[1] = toNfc;
570     }
571 
showPlurals()572     private static void showPlurals() throws IOException {
573         CLDRConfig testInfo = org.unicode.cldr.tool.ToolConfig.getToolInstance();
574         // for (Entry<PluralSnapshot, String> ruleEntry : info) {
575         // PluralSnapshot ss = ruleEntry.getKey();
576         // String rules = ruleEntry.getValue();
577         // Set<String> locales = info.getLocales(rules);
578         // System.out.println(ss + "\nRules:\t" + rules + "\nLocales:\t" +
579         // locales + "\n");
580         // }
581 
582         PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, "pluralTest.html");
583 
584         System.out.println(PluralSnapshot.getDefaultStyles());
585 
586         out.println("<html><head>" + PluralSnapshot.getDefaultStyles() + "</style><body>");
587 
588         PluralSnapshot.writeTables(testInfo.getEnglish(), out);
589         out.println("</body></html>");
590         out.close();
591     }
592 }
593