package org.unicode.cldr.draft;

import com.google.common.base.Joiner;
import com.ibm.icu.impl.Row;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.DateFormat;
import com.ibm.icu.text.DateTimePatternGenerator;
import com.ibm.icu.text.DecimalFormat;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.RawCollationKey;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.SimpleDateFormat;
import com.ibm.icu.text.StringTransform;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.TimeZone;
import com.ibm.icu.util.ULocale;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.unicode.cldr.tool.ToolConfig;
import org.unicode.cldr.util.Builder;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.PluralSnapshot;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
import org.unicode.cldr.util.Timer;

/**
 * Scratch tool that prints a collection of unrelated diagnostics to standard output: default
 * content locales, collation sort keys, number and date samples, exemplar scripts, a comparison
 * of a table-driven NFD check against ICU's normalizer, plural tables, primary scripts per
 * language, character-name based mirroring candidates, and country names.
 */
public class Misc {

    /** Locales whose primary scripts are listed by {@link #showPrimaryScripts()}. */
    private static final String SCRIPT_LOCALES =
            "zh en es hi fr ar pt ru id bn ur ja de fil sw pa jv ko tr vi it te mr th fa ta pl lah gu my ms uk zh_Hant kn su ml nl az or ro uz bho ps ha ku mad yo ig si mg sd hu am om kk el ne be mai sr cs km as sv mag mwr sn ny ca bg hne tg bgc ii he dcc ug fuv qu rw min af zu mn bjn so ki hr ak tk fi sq da bya sk gn bal no lua xh bs ht syl ka bjj ban sat hy za luy rn bug bem luo wtm st lo gl ti shn ceb ks mfa ace lt ky bm lg shi tn bcl glk war kok bew kln kam umb bo suk ee kmb ay pam bhk sas bbc swv nso tpi rjb gbm lmn ff kab sl ts ba cv kri gon ndc guz wo tzm mak kfy ln ljp mk efi ibb doi awa mos nyn vmw mer kru lv sid pag gno sck tcy wbq nd lrc ss cgg brh xog nn sg xnr dyu rmt teo kxm mdh hno lu eu khn wbr tsg rej rif brx ilo kbd et ce kg fy hil kj cy ast av ve udm ga tt sah myv tet gaa ady mt dv fj nr is mdf kum kha sm kpv lez pap krc inh oc se tyv zdj dz bi gag to koi lbe mi ab os ty kl gil iu ch fo rm mh chk haw pon lb pau tvl sa kos na ho yap gd uli niu la tkl eo kl";

    /**
     * Date/time skeletons (plus the symbolic style names handled specially by {@link
     * #getFormatted}) that {@link #showDateSamples()} formats.
     */
    private static final String[] DATE_SKELETONS = {
        "d",
        "h",
        "H",
        "hm",
        "Hm",
        "Hms",
        "hms",
        "hmv",
        "Hmv",
        "hv",
        "Hv",
        "M",
        "Md",
        "MEd",
        "MMM",
        "MMMd",
        "MMMEd",
        "ms",
        "y",
        "yM",
        "yMd",
        "yMEd",
        "yMMM",
        "yMMMd",
        "yMMMEd",
        "yMMMM",
        "yQ",
        "yQQQ",
        "EEEd",
        "full-date",
        "long-date",
        "medium-date",
        "short-date",
        "full-time",
        "long-time",
        "medium-time",
        "short-time",
        "MMMM",
        "MMMMd",
        "E",
        "Ed",
        "GGGGyMd",
        "GGGGyMMMMEEEEdd",
        "GGGGyyyyMMMMd",
        "HHmm",
        "HHmmss",
        "HHmmZ",
        "Hmm",
        "MMd",
        "MMdd",
        "MMMdd",
        "MMMEEEd",
        "MMMMdd",
        "MMMMEd",
        "MMMMEEEd",
        "mmss",
        "yMMMMccccd",
        "yyMM",
        "yyMMdd",
        "yyMMM",
        "yyMMMd",
        "yyMMMEEEd",
        "yyQ",
        "yyQQQQ",
        "yyyy",
        "yyyyLLLL",
        "yyyyM",
        "yyyyMEEEd",
        "yyyyMM",
        "yyyyMMM",
        "yyyyMMMM",
        "yyyyMMMMEEEEd",
        "yyyyQQQQ",
        "hmz",
        "hz",
        "LLL",
        "LLLL",
        "MMMMEEEEd",
        "yMMMMd",
        "yMMMMEEEEd"
    };

    /**
     * Runs every diagnostic in this class in a fixed order, printing to standard output.
     *
     * @param args ignored
     * @throws IOException if the plural test page cannot be written
     */
    public static void main(String[] args) throws IOException {
        showDefaultContent(
                "bn", "sw", "mr", "ta", "ms", "am", "af", "zu", "et", "is", "ur", "te", "gu", "kn",
                "ml", "gl", "eu");
        showSortKey();
        showNumberSamples();
        showDateSamples();
        showExemplarSize();

        doNFC();
        showPlurals();

        showPrimaryScripts();
        showMirrorCandidatesByName();

        System.out.println(Locale.SIMPLIFIED_CHINESE);
        System.out.println(Locale.TRADITIONAL_CHINESE);
        for (String s : StandardCodes.make().getGoodCountries()) {
            System.out.println(s + "\t" + ULocale.getDisplayCountry("und-" + s, ULocale.ENGLISH));
        }
    }

    /**
     * Prints, for each locale in {@link #SCRIPT_LOCALES}, the scripts from its non-secondary basic
     * language data, or {@code ?} when there is no data or no script.
     */
    private static void showPrimaryScripts() {
        String[] locales = SCRIPT_LOCALES.split(" ");
        SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
        Set<String> scripts = new LinkedHashSet<>();
        for (String locale : locales) {
            Set<BasicLanguageData> items = sdi.getBasicLanguageData(locale);
            if (items == null) {
                System.out.println(locale + "\t?");
                continue;
            }
            scripts.clear();
            for (BasicLanguageData item : items) {
                if (item.getType() == BasicLanguageData.Type.secondary) {
                    continue;
                }
                Set<String> script2 = item.getScripts();
                if (script2 != null) {
                    scripts.addAll(script2);
                }
            }
            if (scripts.isEmpty()) {
                System.out.println(locale + "\t?");
                continue;
            }
            // Special-case Chinese: "zh" is listed without Hant, "zh_Hant" with it.
            if (locale.equals("zh")) {
                scripts.remove("Hant");
            } else if (locale.equals("zh_Hant")) {
                scripts.add("Hant");
            }
            System.out.println(locale + "\t" + Joiner.on(" ").join(scripts));
        }
    }

    /**
     * Prints code points outside {@code [:bidimirrored:]} whose name, after replacing RIGHT with
     * LEFT (or, failing that, dropping "REVERSED "), is the name of another character.
     */
    private static void showMirrorCandidatesByName() {
        StringTransform unicode = Transliterator.getInstance("hex/unicode");
        UnicodeSet exclude = new UnicodeSet("[:bidimirrored:]");
        for (int i = 0; i < 0x110000; ++i) {
            if (exclude.contains(i)) {
                continue;
            }
            String name = UCharacter.getExtendedName(i);
            if (name == null) {
                continue;
            }
            // Literal replacement; no regex semantics are needed for these fixed words.
            String reverse = name.replace("RIGHT", "LEFT");
            if (reverse.equals(name)) {
                reverse = name.replace("REVERSED ", "");
                if (reverse.equals(name)) {
                    continue;
                }
            }
            int rev = UCharacter.getCharFromName(reverse);
            if (rev == -1) {
                continue;
            }
            System.out.println(
                    unicode.transform(UTF16.valueOf(i))
                            + "\t"
                            + UTF16.valueOf(i)
                            + "\t"
                            + name
                            + "\t"
                            + UTF16.valueOf(rev)
                            + "\t"
                            + unicode.transform(UTF16.valueOf(rev))
                            + "\t"
                            + reverse);
        }
    }

    /**
     * For each given locale ID, prints the first default-content locale whose parent is that ID,
     * followed by its English name.
     *
     * @param strings locale IDs to look up
     */
    private static void showDefaultContent(String... strings) {
        SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
        final CLDRConfig info = ToolConfig.getToolInstance();
        CLDRFile english = info.getEnglish();
        Set<String> defaultContents = sdi.getDefaultContentLocales();
        for (String string : strings) {
            String defCon = null;
            for (String dc : defaultContents) {
                if (string.equals(LocaleIDParser.getParent(dc))) {
                    defCon = dc;
                    break;
                }
            }
            // No default-content child found: do not ask for the name of a null locale.
            String defConName = defCon == null ? null : english.getName(defCon);
            System.out.println(string + "\t" + defCon + "\t" + defConName);
        }
    }

    /**
     * Prints the raw collation key bytes (hex, tab-separated) of a few Latin and kana strings for
     * every combination of the case-level and hiragana-quaternary collator settings.
     */
    private static void showSortKey() {
        String[] tests = "a ä A ぁ あ ァ ァ ア ア ㋐".split(" ");
        RuleBasedCollator c = (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
        c.setStrength(RuleBasedCollator.QUATERNARY);
        c.setCaseLevel(true);
        c.setHiraganaQuaternary(true);
        for (String test : tests) {
            for (boolean caseLevel : new boolean[] {false, true}) {
                c.setCaseLevel(caseLevel);
                for (boolean hiraganaQuaternary : new boolean[] {false, true}) {
                    c.setHiraganaQuaternary(hiraganaQuaternary);
                    System.out.print((caseLevel ? "Cl\t" : "\t"));
                    System.out.print((hiraganaQuaternary ? "Hl\t" : "\t"));
                    System.out.print(test + "\t");
                    RawCollationKey key = c.getRawCollationKey(test, null);
                    // Only the first key.size bytes belong to the key; the backing array may be
                    // larger than the key itself.
                    for (int i = 0; i < key.size; ++i) {
                        System.out.print(Integer.toHexString(0xFF & key.bytes[i]) + "\t");
                    }
                    System.out.println();
                }
            }
        }
    }

    /**
     * Prints how a few odd strings parse as language tags, then formats 1.23456789 scaled by
     * powers of ten from 10^10 down to 10^-9 with the pattern {@code "***"}.
     */
    private static void showNumberSamples() {
        String[] tests = {
            "a$b", "abcd_defg-hi", "abcd-defg$xy", "ab-d$efg-419", "root", "", "und"
        };
        for (String test : tests) {
            ULocale locale = ULocale.forLanguageTag(test);
            System.out.println(test + " -> " + locale);
        }
        DecimalFormat df = new DecimalFormat("***");
        for (int i = 10; i > -10; --i) {
            String sample = df.format(1.23456789 * Math.pow(10, i));
            System.out.println(sample);
        }
    }

    /**
     * Formats one sample date with every entry of {@link #DATE_SKELETONS} in English and Greek
     * and prints the results side by side.
     */
    private static void showDateSamples() {
        Map<String, Row.R2<Integer, Integer>> specials =
                Builder.with(new TreeMap<String, Row.R2<Integer, Integer>>())
                        .put("full-date", Row.of(DateFormat.FULL, DateFormat.NONE))
                        .put("long-date", Row.of(DateFormat.LONG, DateFormat.NONE))
                        .put("medium-date", Row.of(DateFormat.MEDIUM, DateFormat.NONE))
                        .put("short-date", Row.of(DateFormat.SHORT, DateFormat.NONE))
                        .put("full-time", Row.of(DateFormat.NONE, DateFormat.FULL))
                        .put("long-time", Row.of(DateFormat.NONE, DateFormat.LONG))
                        .put("medium-time", Row.of(DateFormat.NONE, DateFormat.MEDIUM))
                        .put("short-time", Row.of(DateFormat.NONE, DateFormat.SHORT))
                        .freeze();
        Date sample = new Date(2011 - 1900, 12 - 1, 30, 14, 45, 59);
        final ULocale english = ULocale.ENGLISH;
        final ULocale otherLocale = new ULocale("el");
        DateTimePatternGenerator englishGenerator = DateTimePatternGenerator.getInstance(english);
        DateTimePatternGenerator otherGenerator = DateTimePatternGenerator.getInstance(otherLocale);
        for (String dp : DATE_SKELETONS) {
            final String formattedEnglish =
                    getFormatted(specials, sample, dp, english, englishGenerator);
            final String formattedOther =
                    getFormatted(specials, sample, dp, otherLocale, otherGenerator);
            System.out.println(dp + "\t«" + formattedEnglish + "»\t«" + formattedOther + "»");
        }
    }

    /**
     * Formats {@code sample} in GMT for one locale.
     *
     * @param specials symbolic names mapped to (date style, time style) pairs
     * @param sample the date to format
     * @param dp either a key of {@code specials} or a skeleton for {@code generator}
     * @param ulocale locale of the resulting formatter
     * @param generator pattern generator used to turn a skeleton into a pattern
     * @return the formatted date
     */
    private static String getFormatted(
            Map<String, Row.R2<Integer, Integer>> specials,
            Date sample,
            String dp,
            ULocale ulocale,
            DateTimePatternGenerator generator) {
        Row.R2<Integer, Integer> special = specials.get(dp);
        DateFormat df;
        if (special != null) {
            df = DateFormat.getDateTimeInstance(special.get0(), special.get1(), ulocale);
        } else {
            String pat = generator.getBestPattern(dp);
            df = new SimpleDateFormat(pat, ulocale);
        }
        df.setTimeZone(TimeZone.getTimeZone("GMT"));
        return df.format(sample);
    }

    /**
     * For a fixed list of locales, prints (sorted with the tool collator) the English name, native
     * name, base language code, and the short name of the script of the first exemplar character
     * that is neither Common nor Inherited.
     */
    private static void showExemplarSize() {
        final CLDRConfig info = ToolConfig.getToolInstance();
        CLDRFile english = info.getEnglish();
        Factory factory = info.getCldrFactory();
        SupplementalDataInfo dataInfo = info.getSupplementalDataInfo();
        Map<String, Map<String, R2<List<String>, String>>> type_tag_replacement =
                dataInfo.getLocaleAliasInfo();
        Map<String, R2<List<String>, String>> lang2replacement =
                type_tag_replacement.get("language");

        LanguageTagParser ltp = new LanguageTagParser();
        String[] locales =
                "en ru nl en-GB fr de it pl pt-BR es tr th ja zh-CN zh-TW ko ar bg sr uk ca hr cs da fil fi hu id lv lt no pt-PT ro sk sl es-419 sv vi el iw fa hi am af et is ms sw zu bn mr ta eu fr-CA gl zh-HK ur gu kn ml te"
                        .split(" ");
        Set<String> nameAndInfo = new TreeSet<>(info.getCollator());
        for (String localeCode : locales) {
            String baseLanguage = ltp.set(localeCode).getLanguage();
            // Map deprecated language codes to their replacement.
            R2<List<String>, String> temp = lang2replacement.get(baseLanguage);
            if (temp != null) {
                baseLanguage = temp.get0().get(0);
            }
            String englishName = english.getName(baseLanguage);
            CLDRFile cldrFile = factory.make(baseLanguage, false);
            UnicodeSet set = cldrFile.getExemplarSet("", WinningChoice.WINNING);
            int script = UScript.INVALID_CODE;
            for (String s : set) {
                int cp = s.codePointAt(0);
                script = UScript.getScript(cp);
                if (script != UScript.COMMON && script != UScript.INHERITED) {
                    break;
                }
            }
            // An empty exemplar set leaves script at INVALID_CODE, which is not a script code
            // that can be passed to getShortName.
            String scriptName =
                    script == UScript.INVALID_CODE ? "?" : UScript.getShortName(script);
            String nativeName = cldrFile.getName(baseLanguage);
            nameAndInfo.add(
                    englishName + "\t" + nativeName + "\t" + baseLanguage + "\t" + scriptName);
        }

        for (String item : nameAndInfo) {
            System.out.println(item);
        }
        // for (String localeCode : locales) {
        // String baseLanguage = ltp.set(localeCode).getLanguage();
        // R2<List<String>, String> temp = lang2replacement.get(baseLanguage);
        // if (temp != null) {
        // baseLanguage = temp.get0().get(0);
        // }
        // int size = -1;
        //
        // try {
        // CLDRFile cldrFile = factory.make(baseLanguage, false);
        // UnicodeSet set = cldrFile.getExemplarSet("", WinningChoice.WINNING);
        // size = set.size();
        // } catch (Exception e) {
        // }
        //
        // System.out.println(localeCode + "\t" + size);
        // }
    }

    /** NFC normalizer: the "nfc" data in COMPOSE mode. */
    static final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);

    /** NFD normalizer: the same "nfc" data, used in DECOMPOSE mode. */
    static final Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);

    /**
     * Checks that {@link IsNfd#isNormalizedUpTo} agrees with {@link #nfd} on every single code
     * point, then times ICU's NFC/NFD against {@link IsNfd} on three short strings (a warm-up
     * round followed by a longer measured round).
     *
     * @throws IllegalArgumentException if the two NFD checks disagree for some code point
     */
    private static void doNFC() {

        StringBuilder b = new StringBuilder();
        for (int i = 0; i < 0x110000; ++i) {
            b.setLength(0);
            b.appendCodePoint(i);
            boolean isNfd = nfd.isNormalized(b);
            boolean isNfdNew = IsNfd.isNormalizedUpTo(b) < 0;
            if (isNfd != isNfdNew) {
                // Repeated call kept from the original; presumably a debugger stepping hook.
                IsNfd.isNormalizedUpTo(b);
                throw new IllegalArgumentException(
                        "NFD check mismatch at U+"
                                + Integer.toHexString(i).toUpperCase(Locale.ROOT)
                                + ": Normalizer2="
                                + isNfd
                                + ", IsNfd="
                                + isNfdNew);
            }
        }
        String[] tests = {"Mark", "Μάρκος", nfd.normalize("Μάρκος")};
        long[] times = new long[2];
        // warmup
        for (String test : tests) {
            times[0] = times[1] = Long.MIN_VALUE;
            time(nfc, test, 10000000, "NFC", times);
            time(nfd, test, 10000000, "NFD", times);
            time(test, 10000000, "NFDx", times);
        }
        System.out.println();
        for (String test : tests) {
            times[0] = times[1] = Long.MIN_VALUE;
            time(nfc, test, 100000000, "NFC", times);
            time(nfd, test, 100000000, "NFD", times);
            time(test, 100000000, "NFDx", times);
        }
    }

    // static class ByteTrie {
    // static class Block {
    // byte[] values = new byte[128];
    // }
    // int[] index;
    // byte[][] blocks;
    // static class Builder {
    // Map<Block,Integer> backIndex = new HashMap<Block, Integer>();
    // Block block = new Block();
    // int pos = 0;
    // void append(byte item) {
    // if (pos >= 128) {
    // // if block is in backIndex, use the index, otherwise add and create new
    // Block
    // } else {
    // block.values[pos++] = item;
    // }
    // }
    // }
    // }

    /**
     * Table-driven NFD quick check. {@link #info} holds one byte per code point: {@code 0xFF} when
     * the NFD_Quick_Check property value is 0, otherwise the canonical combining class.
     */
    static class IsNfd {
        static final byte[] info = new byte[0x110000];

        static {
            for (int i = 0; i < 0x110000; ++i) {
                int nfdqc = UCharacter.getIntPropertyValue(i, UProperty.NFD_QUICK_CHECK);
                if (nfdqc == 0) {
                    // Marker value: this code point is never allowed in the checked text.
                    info[i] = (byte) 0xFF;
                    continue;
                }
                int ccc = UCharacter.getIntPropertyValue(i, UProperty.CANONICAL_COMBINING_CLASS);
                info[i] = (byte) ccc;

                // if (ccc != 0) {
                // info[i] = (byte) ccc;
                // continue;
                // }
                // int gc = UCharacter.getIntPropertyValue(i,
                // UProperty.GENERAL_CATEGORY);
                // if (gc != UCharacter.UNASSIGNED) {
                // info[i] = (byte) 0;
                // continue;
                // }
                // int nc = UCharacter.getIntPropertyValue(i,
                // UProperty.NONCHARACTER_CODE_POINT);
                // if (nc == yes) {
                // info[i] = (byte) 0;
                // continue;
                // }
                // info[i] = (byte) 0xFF;
            }
        }

        /**
         * Returns {@code s} in NFD, skipping the normalizer entirely when {@link
         * #isNormalizedUpTo} finds no problem.
         *
         * @param s text to normalize
         * @return {@code s.toString()} if no problem is found, otherwise the already-checked
         *     prefix with the NFD-normalized remainder appended
         */
        public static String normalize(CharSequence s) {
            int normalizedUpTo = isNormalizedUpTo(s);
            if (normalizedUpTo < 0) {
                return s.toString();
            }
            return nfd.normalizeSecondAndAppend(
                            new StringBuilder(s.subSequence(0, normalizedUpTo)),
                            s.subSequence(normalizedUpTo, s.length()))
                    .toString();
        }

        /**
         * Scans {@code s} using {@link #info}.
         *
         * @param s text to check
         * @return {@code -1} if no problem is found; otherwise the index of the last code point
         *     seen with table value 0 (or 0 if there was none) before the first code point that
         *     is marked {@code 0xFF} or whose combining class is lower than the preceding one
         */
        public static int isNormalizedUpTo(CharSequence s) {
            final int length = s.length();
            int lastStarterIndex = 0;
            int lastByte = 0;
            for (int i = 0; i < length; ++i) {
                int cp = s.charAt(i);
                if (cp >= 0xD800 && cp < 0xDC00) {
                    // High surrogate: combine with a following low surrogate if there is one.
                    cp = Character.codePointAt(s, i);
                }
                int b = info[cp] & 0xFF;
                if (b == 0) {
                    lastStarterIndex = i;
                    lastByte = b;
                } else if (b == lastByte) {
                    // do nothing, common case
                } else if (b < lastByte || b == 0xFF) {
                    return lastStarterIndex; // failure
                } else {
                    lastByte = b; // increasing CCC, ok
                }
                if (cp > 0xFFFF) {
                    ++i; // skip the low surrogate of a supplementary code point
                }
            }
            return -1;
        }
    }

    /** Requests garbage collection three times before a timing run. */
    private static void collectGarbage() {
        System.gc();
        System.gc();
        System.gc();
    }

    /**
     * Prints one timing line, relative to {@code previous} unless it is {@code Long.MIN_VALUE}.
     *
     * @param t timer holding the measurement
     * @param label text printed before the timing
     * @param iterations number of iterations that were timed
     * @param previous duration to compare against, or {@code Long.MIN_VALUE} for none
     */
    private static void printTiming(Timer t, String label, int iterations, long previous) {
        if (previous != Long.MIN_VALUE) {
            System.out.println("\t" + label + ":\t" + t.toString(iterations, previous));
        } else {
            System.out.println("\t" + label + ":\t" + t.toString(iterations));
        }
    }

    /**
     * Times {@link IsNfd#isNormalizedUpTo} and {@link IsNfd#normalize} on {@code test}.
     *
     * @param test string to process
     * @param iterations number of calls per measurement
     * @param name label suffix for the printed lines
     * @param times in/out: [0] is the "is" duration and [1] the "to" duration to compare against
     *     ({@code Long.MIN_VALUE} for none); both are overwritten with this run's durations
     */
    private static void time(String test, int iterations, String name, long[] times) {
        System.out.println(test);
        collectGarbage();

        Timer t = new Timer();
        t.start();
        for (int i = iterations; i > 0; --i) {
            IsNfd.isNormalizedUpTo(test);
        }
        long isDuration = t.getDuration();
        printTiming(t, "is" + name, iterations, times[0]);
        times[0] = isDuration;

        collectGarbage();
        t.start();
        for (int i = iterations; i > 0; --i) {
            IsNfd.normalize(test);
        }
        long toDuration = t.getDuration();
        printTiming(t, "to" + name, iterations, times[1]);
        times[1] = toDuration;
    }

    /**
     * Times {@code nfx.isNormalized} and {@code nfx.normalize} on {@code test}.
     *
     * @param nfx normalizer to measure
     * @param test string to process
     * @param iterations number of calls per measurement
     * @param name label suffix for the printed lines
     * @param times in/out: [0] is the "is" duration and [1] the "to" duration to compare against
     *     ({@code Long.MIN_VALUE} for none); both are overwritten with this run's durations
     */
    private static void time(
            Normalizer2 nfx, String test, int iterations, String name, long[] times) {
        System.out.println(test);
        collectGarbage();

        Timer t = new Timer();
        t.start();
        for (int i = iterations; i > 0; --i) {
            nfx.isNormalized(test);
        }
        long isDuration = t.getDuration();
        printTiming(t, "is" + name, iterations, times[0]);
        times[0] = isDuration;

        collectGarbage();
        t.start();
        for (int i = iterations; i > 0; --i) {
            nfx.normalize(test);
        }
        long toDuration = t.getDuration();
        printTiming(t, "to" + name, iterations, times[1]);
        times[1] = toDuration;
    }

    /**
     * Prints the default plural styles and writes the plural tables for English to {@code
     * pluralTest.html} in the generation directory.
     *
     * @throws IOException if the output file cannot be opened
     */
    private static void showPlurals() throws IOException {
        CLDRConfig testInfo = ToolConfig.getToolInstance();
        // for (Entry<PluralSnapshot, String> ruleEntry : info) {
        // PluralSnapshot ss = ruleEntry.getKey();
        // String rules = ruleEntry.getValue();
        // Set<String> locales = info.getLocales(rules);
        // System.out.println(ss + "\nRules:\t" + rules + "\nLocales:\t" +
        // locales + "\n");
        // }

        // try-with-resources so the file is closed even if writing the tables fails.
        try (PrintWriter out =
                FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, "pluralTest.html")) {

            System.out.println(PluralSnapshot.getDefaultStyles());

            out.println("<html><head>" + PluralSnapshot.getDefaultStyles() + "</style><body>");

            PluralSnapshot.writeTables(testInfo.getEnglish(), out);
            out.println("</body></html>");
        }
    }
}