1 package org.unicode.cldr.unittest; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.base.Objects; 5 import com.google.common.collect.ImmutableMap; 6 import com.google.common.collect.ImmutableSet; 7 import com.google.common.collect.Multimap; 8 import com.google.common.collect.Sets; 9 import com.google.common.collect.TreeMultimap; 10 import com.ibm.icu.dev.test.TestFmwk; 11 import com.ibm.icu.dev.util.UnicodeMap; 12 import com.ibm.icu.lang.UCharacter; 13 import com.ibm.icu.lang.UProperty; 14 import com.ibm.icu.lang.UScript; 15 import com.ibm.icu.text.UnicodeSet; 16 import com.ibm.icu.util.VersionInfo; 17 import java.util.Arrays; 18 import java.util.Collection; 19 import java.util.HashSet; 20 import java.util.LinkedHashSet; 21 import java.util.Map; 22 import java.util.Map.Entry; 23 import java.util.Set; 24 import java.util.TreeMap; 25 import java.util.TreeSet; 26 import org.unicode.cldr.draft.ScriptMetadata; 27 import org.unicode.cldr.draft.ScriptMetadata.Info; 28 import org.unicode.cldr.tool.LikelySubtags; 29 import org.unicode.cldr.util.CLDRConfig; 30 import org.unicode.cldr.util.CLDRFile; 31 import org.unicode.cldr.util.CLDRFile.ExemplarType; 32 import org.unicode.cldr.util.CLDRFile.WinningChoice; 33 import org.unicode.cldr.util.CLDRLocale; 34 import org.unicode.cldr.util.CalculatedCoverageLevels; 35 import org.unicode.cldr.util.ChainedMap; 36 import org.unicode.cldr.util.ChainedMap.M3; 37 import org.unicode.cldr.util.CldrUtility; 38 import org.unicode.cldr.util.Containment; 39 import org.unicode.cldr.util.Factory; 40 import org.unicode.cldr.util.LanguageTagParser; 41 import org.unicode.cldr.util.Level; 42 import org.unicode.cldr.util.ScriptToExemplars; 43 import org.unicode.cldr.util.StandardCodes; 44 import org.unicode.cldr.util.StandardCodes.LstrType; 45 import org.unicode.cldr.util.SupplementalDataInfo; 46 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 47 import org.unicode.cldr.util.Validity; 48 import org.unicode.cldr.util.Validity.Status; 49 50 public class LikelySubtagsTest extends TestFmwk { 51 52 private static final Validity VALIDITY = Validity.getInstance(); 53 private boolean DEBUG = false; 54 private static boolean SHOW_EXEMPLARS = System.getProperty("SHOW_EXEMPLARS") != null; 55 private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); 56 private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = 57 CLDR_CONFIG.getSupplementalDataInfo(); 58 static final Map<String, String> likely = SUPPLEMENTAL_DATA_INFO.getLikelySubtags(); 59 static final LikelySubtags LIKELY = new LikelySubtags(); 60 main(String[] args)61 public static void main(String[] args) { 62 new LikelySubtagsTest().run(args); 63 } 64 65 static class Tags { 66 final Set<String> languages = new TreeSet<>(); 67 final Set<String> scripts = new TreeSet<>(); 68 final Set<String> regions = new TreeSet<>(); 69 final Set<String> scriptRegion = new TreeSet<>(); 70 final Set<String> languageScript = new TreeSet<>(); 71 final Set<String> languageRegion = new TreeSet<>(); 72 final Set<String> all = new TreeSet<>(); 73 final ChainedMap.M4<String, String, String, Boolean> languageToScriptToRegions = 74 ChainedMap.of( 75 new TreeMap<String, Object>(), 76 new TreeMap<String, Object>(), 77 new TreeMap<String, Object>(), 78 Boolean.class); 79 final ChainedMap.M3<String, String, Boolean> languageToRegions = 80 ChainedMap.of( 81 new TreeMap<String, Object>(), 82 new TreeMap<String, Object>(), 83 Boolean.class); 84 Tags()85 public Tags() { 86 final LanguageTagParser ltp = new LanguageTagParser(); 87 for (Entry<String, String> entry : likely.entrySet()) { 88 add(ltp.set(entry.getKey()), true); 89 add(ltp.set(entry.getValue()), false); 90 } 91 // add unfamiliar script, unfamiliar region 92 for (String lang : languageToScriptToRegions.keySet()) { 93 if (lang.equals("und")) { 94 continue; 95 } 96 M3<String, String, Boolean> scriptToRegion = languageToScriptToRegions.get(lang); 97 final Set<String> scriptsFor = scriptToRegion.keySet(); 98 final Set<String> regionsFor = languageToRegions.get(lang).keySet(); 99 100 String firstScriptNotIn = getNonEmptyNotIn(scripts, scriptsFor); 101 String firstRegionNotIn = getNonEmptyNotIn(regions, regionsFor); 102 103 languageToScriptToRegions.put( 104 lang, firstScriptNotIn, firstRegionNotIn, Boolean.TRUE); 105 // clone for safety before iterating 106 for (String script : new HashSet<>(scriptsFor)) { 107 languageToScriptToRegions.put(lang, script, firstRegionNotIn, Boolean.TRUE); 108 } 109 for (String region : new HashSet<>(regionsFor)) { 110 languageToScriptToRegions.put(lang, firstScriptNotIn, region, Boolean.TRUE); 111 } 112 } 113 114 // System.out.println("all: " + all); 115 // System.out.println("scriptRegion: " + scriptRegion); 116 // System.out.println("languageScript: " + languageScript); 117 // System.out.println("languageRegion: " + languageRegion); 118 } 119 getNonEmptyNotIn(Iterable<T> a, Set<T> b)120 private static <T> T getNonEmptyNotIn(Iterable<T> a, Set<T> b) { 121 for (T x : a) { 122 if (!b.contains(x) && !x.toString().isEmpty()) { 123 return x; 124 } 125 } 126 throw new IllegalArgumentException(); 127 } 128 add(LanguageTagParser ltp, boolean source)129 void add(LanguageTagParser ltp, boolean source) { 130 String sourceLanguage = ltp.getLanguage(); 131 String sourceScript = ltp.getScript(); 132 String sourceRegion = ltp.getRegion(); 133 languageToScriptToRegions.put(sourceLanguage, sourceScript, sourceRegion, Boolean.TRUE); 134 languageToScriptToRegions.put(sourceLanguage, sourceScript, "", Boolean.TRUE); 135 languageToScriptToRegions.put(sourceLanguage, "", "", Boolean.TRUE); 136 languageToRegions.put(sourceLanguage, "", Boolean.TRUE); 137 if (StandardCodes.isCountry(sourceRegion)) { 138 languageToScriptToRegions.put(sourceLanguage, "", sourceRegion, Boolean.TRUE); 139 languageToRegions.put(sourceLanguage, sourceRegion, Boolean.TRUE); 140 } 141 142 // capture all cases of 2 items 143 if (source) { 144 if (!sourceScript.isEmpty() && !sourceRegion.isEmpty()) { 145 if (!sourceLanguage.equals("und")) { 146 all.add(ltp.toString()); 147 } else { 148 scriptRegion.add(ltp.toString()); 149 } 150 } else if (!sourceLanguage.equals("und")) { 151 if (!sourceScript.isEmpty()) { 152 languageScript.add(ltp.toString()); 153 } else if (!sourceRegion.isEmpty()) { 154 languageRegion.add(ltp.toString()); 155 } 156 } 157 } 158 languages.add(sourceLanguage); 159 scripts.add(sourceScript); 160 if (StandardCodes.isCountry(sourceRegion) || sourceRegion.isEmpty()) { 161 regions.add(sourceRegion); 162 } 163 } 164 } 165 166 static final Tags TAGS = new Tags(); 167 168 final LanguageTagParser maxLtp = new LanguageTagParser(); 169 final LanguageTagParser sourceLtp = new LanguageTagParser(); 170 171 /** 172 * Return false if we should skip the language 173 * 174 * @param source 175 * @return 176 */ checkAdding(String source)177 public boolean checkAdding(String source) { 178 // if X maps to Y, then adding a field from Y to X will still map to Y 179 // Example: 180 // und_AF => fa_Arab_AF 181 // therefore, the following should also be true: 182 // und_Arab_AF => fa_Arab_AF 183 // fa_AF => fa_Arab_AF 184 // fa_Arab_AF => fa_Arab_AF 185 186 String max = LIKELY.maximize(source); 187 if (!assertNotEquals("Maximize " + source, null, max)) { 188 return source.contains("_"); 189 } 190 sourceLtp.set(source); 191 if (!sourceLtp.getRegion().isEmpty() && !StandardCodes.isCountry(sourceLtp.getRegion())) { 192 return true; 193 } 194 maxLtp.set(max); 195 for (int i = 1; i < 8; ++i) { 196 if ((i & 1) != 0) { 197 if (!sourceLtp.getLanguage().equals("und")) continue; 198 sourceLtp.setLanguage(maxLtp.getLanguage()); 199 } 200 if ((i & 2) != 0) { 201 if (!sourceLtp.getScript().isEmpty()) continue; 202 sourceLtp.setScript(maxLtp.getScript()); 203 } 204 if ((i & 4) != 0) { 205 if (!sourceLtp.getRegion().isEmpty()) continue; 206 sourceLtp.setRegion(maxLtp.getRegion()); 207 } 208 String test = sourceLtp.toString(); 209 final String maximize = LIKELY.maximize(test); 210 if (!max.equals(maximize)) { 211 // max(source) = max, max(test) ≠ max 212 if (!assertEquals( 213 String.format( 214 "checkAdding: max(%s)->%s, however max(%s)->", source, max, test), 215 max, 216 maximize)) { 217 // LIKELY.maximize(test); // Could step into this for debugging. 218 } 219 } 220 sourceLtp.set(source); // restore 221 } 222 return true; 223 } 224 TestCompleteness()225 public void TestCompleteness() { 226 final LanguageTagParser ltp = new LanguageTagParser(); 227 if (DEBUG) { 228 System.out.println(TAGS.languages.size() + "\t" + TAGS.languages); 229 System.out.println(TAGS.scripts.size() + "\t" + TAGS.scripts); 230 System.out.println(TAGS.regions.size() + "\t" + TAGS.regions); 231 } 232 main: 233 for (Entry<String, Map<String, Map<String, Boolean>>> languageScriptRegion : 234 TAGS.languageToScriptToRegions) { 235 String language = languageScriptRegion.getKey(); 236 ltp.set(language); // clears script, region 237 for (Entry<String, Map<String, Boolean>> scriptRegion : 238 languageScriptRegion.getValue().entrySet()) { 239 String script = scriptRegion.getKey(); 240 ltp.setScript(script); 241 for (String region : scriptRegion.getValue().keySet()) { 242 ltp.setRegion(region); 243 String testTag = ltp.toString(); 244 // System.out.println(testTag); 245 if (!testTag.equals("und") && !checkAdding(testTag)) { 246 checkAdding(testTag); // for debugging 247 continue main; 248 } 249 } 250 } 251 } 252 } 253 254 static Set<String> exceptions = 255 new HashSet<>( 256 Arrays.asList( 257 "Zyyy", "Zinh", "Zzzz", "Brai", 258 "Cpmn")); // scripts with no default language 259 TestStability()260 public void TestStability() { 261 // when maximized must never change 262 // first get all the subtags 263 // then test all the combinations 264 LanguageTagParser ltp = new LanguageTagParser(); 265 for (Entry<String, String> entry : likely.entrySet()) { 266 ltp.set(entry.getKey()); 267 String sourceLanguage = ltp.getLanguage(); 268 if (sourceLanguage.equals("und")) { 269 sourceLanguage = ""; 270 } 271 String sourceScript = ltp.getScript(); 272 String sourceRegion = ltp.getRegion(); 273 ltp.set(entry.getValue()); 274 String targetLanguage = ltp.getLanguage(); 275 String targetScript = ltp.getScript(); 276 String targetRegion = ltp.getRegion(); 277 if (!sourceLanguage.isEmpty()) { 278 assertEquals("language", sourceLanguage, targetLanguage); 279 } 280 if (!sourceScript.isEmpty()) { 281 assertEquals("script", sourceScript, targetScript); 282 } 283 if (!sourceRegion.isEmpty()) { 284 if (Containment.isLeaf(sourceRegion)) { 285 assertEquals("region", sourceRegion, targetRegion); 286 } 287 } 288 } 289 } 290 TestForMissingScriptMetadata()291 public void TestForMissingScriptMetadata() { 292 TreeSet<String> metadataScripts = new TreeSet<>(ScriptMetadata.getScripts()); 293 UnicodeSet current = new UnicodeSet(0, 0x10FFFF); 294 UnicodeSet toRemove = new UnicodeSet(); 295 296 while (!current.isEmpty()) { 297 int ch = current.charAt(0); 298 int script = UScript.getScript(ch); 299 String shortName = UScript.getShortName(script); 300 Info i = ScriptMetadata.getInfo(shortName); 301 if (i == null) { 302 errln("Script Metadata is missing: " + shortName); 303 continue; 304 } 305 if (i.likelyLanguage.equals("und") && !exceptions.contains(shortName)) { 306 errln("Script has no likely language: " + shortName); 307 } 308 toRemove.applyIntPropertyValue(UProperty.SCRIPT, script); 309 current.removeAll(toRemove); 310 metadataScripts.remove(shortName); 311 } 312 metadataScripts.removeAll( 313 Arrays.asList("Hans", "Hant", "Hanb", "Jamo", "Jpan", "Kore")); // remove 314 // "combo" 315 // scripts 316 if (!metadataScripts.isEmpty()) { 317 // Warning, not error, so that we can add scripts to the script metadata 318 // and later update to the Unicode version that has characters for those scripts. 319 warnln("Script Metadata for characters not in Unicode: " + metadataScripts); 320 } 321 } 322 TestMissingInfoForLanguage()323 public void TestMissingInfoForLanguage() { 324 CLDRFile english = CLDR_CONFIG.getEnglish().getUnresolved(); 325 326 CalculatedCoverageLevels ccl = CalculatedCoverageLevels.getInstance(); 327 328 for (String language : CLDR_CONFIG.getCldrFactory().getAvailableLanguages()) { 329 if (language.contains("_") || language.equals("root")) { 330 continue; 331 } 332 String likelyExpansion = likely.get(language); 333 if (likelyExpansion == null) { 334 errln("Missing likely subtags for: " + language); 335 } else { 336 logln("Likely subtags for " + language + ":\t " + likely); 337 } 338 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language); 339 String englishName = english.getStringValue(path); 340 if (englishName == null) { 341 Level covLevel = ccl.getEffectiveCoverageLevel(language); 342 if (covLevel == null || !covLevel.isAtLeast(Level.BASIC)) { 343 // https://unicode-org.atlassian.net/browse/CLDR-15663 344 if (logKnownIssue( 345 "CLDR-15663", 346 "English translation should not be required for sub-basic language name")) { 347 continue; // skip error 348 } 349 } 350 errln("Missing English translation for: " + language + " which is at " + covLevel); 351 } 352 } 353 } 354 TestMissingInfoForRegion()355 public void TestMissingInfoForRegion() { 356 CLDRFile english = CLDR_CONFIG.getEnglish(); 357 358 for (String region : StandardCodes.make().getGoodAvailableCodes("territory")) { 359 String likelyExpansion = likely.get("und_" + region); 360 if (likelyExpansion == null) { 361 if (SUPPLEMENTAL_DATA_INFO.getContained(region) == null) { // not 362 // container 363 String likelyTag = LikelySubtags.maximize("und_" + region, likely); 364 if (likelyTag == null) { // || !likelyTag.startsWith("en_Latn_") 365 logln( 366 "Missing likely subtags for region: " 367 + region 368 + "\t" 369 + english.getName("territory", region)); 370 } 371 } else { // container 372 logln( 373 "Missing likely subtags for macroregion (fix to exclude regions having 'en'): " 374 + region 375 + "\t" 376 + english.getName("territory", region)); 377 } 378 } else { 379 logln("Likely subtags for region: " + region + ":\t " + likely); 380 } 381 String path = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, region); 382 String englishName = english.getStringValue(path); 383 if (englishName == null) { 384 errln("Missing English translation for: " + region); 385 } 386 } 387 } 388 389 // typically historical script that don't need to be in likely subtags 390 391 static final Set<String> KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS = 392 ImmutableSet.of("Hatr", "Cpmn", "Ougr"); 393 TestMissingInfoForScript()394 public void TestMissingInfoForScript() { 395 VersionInfo icuUnicodeVersion = UCharacter.getUnicodeVersion(); 396 TreeSet<String> sorted = new TreeSet<>(ScriptMetadata.getScripts()); 397 Set<String> exceptions2 = 398 new HashSet<>( 399 Arrays.asList("zh_Hans_CN", "hnj_Hmnp_US", "hnj_Hmng_LA", "iu_Cans_CA")); 400 for (String script : sorted) { 401 if (exceptions.contains(script) || script.equals("Latn") || script.equals("Dsrt")) { 402 // we minimize away und_X, when the code puts in en...US 403 continue; 404 } 405 Info i = ScriptMetadata.getInfo(script); 406 // System.out.println(i); 407 String likelyLanguage = i.likelyLanguage; 408 String originCountry = i.originCountry; 409 String undScript = "und_" + script; 410 String langScript = likelyLanguage + "_" + script + "_"; 411 String likelyExpansion = likely.get(undScript); 412 if (likelyExpansion == null) { 413 if (!KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS.contains(script)) { 414 String msg = 415 "likelySubtags.xml missing language for script (und_" 416 + script 417 + "). Script Metadata suggests that it should be something like:\t " 418 + showOverride(script, originCountry, langScript); 419 if (i.age.compareTo(icuUnicodeVersion) <= 0) { 420 // Error: Missing data for a script in ICU's Unicode version. 421 errln(msg); 422 } else { 423 // Warning: Missing data for a script in a future Unicode version. 424 warnln(msg); 425 } 426 } 427 } else if (!exceptions2.contains(likelyExpansion) 428 && !likelyExpansion.startsWith(langScript)) { 429 // if 430 // (logKnownIssue("Cldrbug:7181","Missing script metadata for " 431 // + script) 432 // && (script.equals("Tfng") || script.equals("Brah"))) { 433 // logln("Wrong likely language for script (und_" + script + 434 // "). Should not be " + likelyExpansion 435 // + ", but something like:\t " + showOverride(script, 436 // originCountry, langScript)); 437 // } else { 438 errln( 439 "likelySubtags.xml has wrong language for script (und_" 440 + script 441 + "). Should not be " 442 + likelyExpansion 443 + ", but Script Metadata suggests something like:\t " 444 + showOverride(script, originCountry, langScript)); 445 // } 446 } else { 447 logln("OK: " + undScript + " => " + likelyExpansion); 448 } 449 } 450 /** 451 * und_Bopo => zh_Bopo_TW und_Copt => cop_Copt_EG // fix 002 und_Dsrt => en_Dsrt_US // fix 452 * US 453 */ 454 } 455 showOverride(String script, String originCountry, String langScript)456 public String showOverride(String script, String originCountry, String langScript) { 457 return "{\"und_" + script + "\", \"" + langScript + originCountry + "\"},"; 458 } 459 460 /** 461 * Test two issues: 462 * 463 * <ul> 464 * <li>That the script of the locale's examplars matches the script derived from the locale's 465 * identifier. 466 * <li>That the union of the exemplar sets (main+aux) for all locales with the script matches 467 * what is in ltp.getResolvedScript() 468 * </ul> 469 * 470 * Written as one test, to avoid the overhead of iterating over all locales twice. 471 */ testGetResolvedScriptVsExemplars()472 public void testGetResolvedScriptVsExemplars() { 473 Factory factory = CLDR_CONFIG.getCldrFactory(); 474 LanguageTagParser ltp = new LanguageTagParser(); 475 Multimap<String, UnicodeSet> scriptToMains = TreeMultimap.create(); 476 Multimap<String, UnicodeSet> scriptToAuxes = TreeMultimap.create(); 477 UnicodeSet collectedBad = new UnicodeSet(); 478 for (String locale : factory.getAvailable()) { 479 if ("root".equals(locale)) { 480 continue; 481 } 482 CLDRFile cldrFile = factory.make(locale, true); 483 UnicodeSet main = cldrFile.getRawExemplarSet(ExemplarType.main, WinningChoice.WINNING); 484 main = checkSet("main", locale, main, collectedBad); 485 UnicodeSet aux = 486 cldrFile.getRawExemplarSet(ExemplarType.auxiliary, WinningChoice.WINNING); 487 aux = checkSet("aux", locale, aux, collectedBad); 488 String script = null; 489 int uScript = 0; 490 for (String s : main) { 491 uScript = UScript.getScript(s.codePointAt(0)); 492 if (uScript > UScript.INHERITED) { 493 script = UScript.getShortName(uScript); 494 break; 495 } 496 } 497 if (script == null) { 498 errln("No script for " + locale); 499 continue; 500 } 501 String ltpScript = ltp.set(locale).getResolvedScript(); 502 switch (uScript) { 503 case UScript.HAN: 504 switch (ltp.getLanguage()) { 505 case "ja": 506 script = "Jpan"; 507 break; 508 case "yue": 509 script = ltp.getScript(); 510 if (script.isEmpty()) { 511 script = "Hant"; 512 } 513 break; 514 case "zh": 515 script = ltp.getScript(); 516 if (script.isEmpty()) { 517 script = "Hans"; 518 } 519 break; 520 } 521 break; 522 case UScript.HANGUL: 523 switch (ltp.getLanguage()) { 524 case "ko": 525 script = "Kore"; 526 break; 527 } 528 } 529 if (!assertEquals(locale, script, ltpScript)) { 530 ltp.getResolvedScript(); // for debugging 531 } 532 scriptToMains.put(ltpScript, main.freeze()); 533 if (!aux.isEmpty()) { 534 scriptToAuxes.put(ltpScript, aux.freeze()); 535 } 536 } 537 538 if (!collectedBad.isEmpty()) { 539 warnln( 540 "Locales have " 541 + collectedBad.size() 542 + " unexpected characters in main and/or aux:\t" 543 + collectedBad.toPattern(false) 544 + "\n Use -DSHOW_EXEMPLARS for details"); 545 } 546 547 // now check that ScriptToExemplars.getExemplars matches the data 548 549 Set<String> problemScripts = new LinkedHashSet<>(); 550 Map<String, UnicodeSet> expected = new TreeMap<>(); 551 for (Entry<String, Collection<UnicodeSet>> entry : scriptToMains.asMap().entrySet()) { 552 String script = entry.getKey(); 553 Collection<UnicodeSet> mains = entry.getValue(); 554 Collection<UnicodeSet> auxes = scriptToAuxes.get(script); 555 556 UnicodeSet flattened; 557 if (mains.size() <= 1 && auxes.size() <= 1) { 558 continue; 559 } else { 560 UnicodeMap<Integer> counts = new UnicodeMap<>(); 561 getCounts(mains, counts); 562 flattened = getUncommon(counts, mains.size()); 563 if (counts.size() < 32) { 564 getCounts(auxes, counts); 565 flattened = getUncommon(counts, mains.size()); 566 } 567 } 568 expected.put(script, flattened.freeze()); 569 } 570 for (Entry<String, UnicodeSet> entry : expected.entrySet()) { 571 String script = entry.getKey(); 572 UnicodeSet flattened = entry.getValue(); 573 574 // now compare to what we get from the cached file, to make sure the latter is up to 575 // date 576 577 if (!assertEquals( 578 script, 579 flattened.toPattern(false), 580 ScriptToExemplars.getExemplars(script).toPattern(false))) { 581 problemScripts.add(script); 582 } 583 } 584 585 if (!problemScripts.isEmpty()) { 586 warnln( 587 "Adjust the data in scriptToExemplars.txt. Use -DSHOW_EXEMPLARS to get a fresh copy, or reset to expected value for: " 588 + problemScripts); 589 if (SHOW_EXEMPLARS) { 590 for (Entry<String, UnicodeSet> entry : expected.entrySet()) { 591 String script = entry.getKey(); 592 UnicodeSet flattened = entry.getValue(); 593 if (!flattened.isEmpty()) { 594 System.out.println( 595 script 596 + " ;\t" 597 + flattened.size() 598 + " ;\t" 599 + flattened.toPattern(false)); 600 } 601 } 602 } 603 } 604 } 605 606 static final UnicodeSet MAIN_AUX_EXPECTED = new UnicodeSet("[\\p{L}\\p{M}\\p{Cf}·]").freeze(); 607 checkSet( String title, String locale, UnicodeSet main, UnicodeSet collected)608 private UnicodeSet checkSet( 609 String title, String locale, UnicodeSet main, UnicodeSet collected) { 610 UnicodeSet bad = new UnicodeSet(); 611 for (String s : main) { 612 if (!MAIN_AUX_EXPECTED.containsAll(s)) { 613 bad.add(s); 614 } 615 } 616 if (!bad.isEmpty()) { 617 if (SHOW_EXEMPLARS) { 618 warnln( 619 "\t" 620 + title 621 + "\tLocale\t" 622 + locale 623 + "\thas " 624 + bad.size() 625 + " unexpected exemplar characters:\t" 626 + bad.toPattern(false)); 627 } 628 collected.addAll(bad); 629 } 630 return CldrUtility.flatten(new UnicodeSet(main).removeAll(bad)); 631 } 632 633 /** 634 * Remove items with a count equal to size (they are common to all locales), and flatten 635 * (against the whole set) 636 */ getUncommon(UnicodeMap<Integer> counts, int size)637 private UnicodeSet getUncommon(UnicodeMap<Integer> counts, int size) { 638 UnicodeSet flattenedAll = 639 CldrUtility.flatten(counts.keySet()); // we flatten against the whole set 640 UnicodeSet result = new UnicodeSet(); 641 for (String s : flattenedAll) { 642 int count = counts.get(s); 643 if (count != size) { 644 result.add(s); 645 } 646 } 647 return result.freeze(); 648 } 649 getCounts(Collection<UnicodeSet> usets, UnicodeMap<Integer> counts)650 private void getCounts(Collection<UnicodeSet> usets, UnicodeMap<Integer> counts) { 651 for (UnicodeSet uset : usets) { 652 for (String s : uset) { 653 Integer old = counts.get(s); 654 if (old == null) { 655 counts.put(s, 1); 656 } else { 657 counts.put(s, old + 1); 658 } 659 } 660 } 661 } 662 testUndAllScriptsAndRegions()663 public void testUndAllScriptsAndRegions() { 664 Set<String> regions = new TreeSet<>(); 665 Set<String> scripts = new TreeSet<>(); 666 Set<String> regularCountries = 667 VALIDITY.getStatusToCodes(LstrType.region).get(Status.regular); 668 Set<String> macroRegions = 669 Set 670 .of(); // Validity.getInstance().getStatusToCodes(LstrType.region).get(Status.macroregion); 671 672 for (String country : Sets.union(regularCountries, macroRegions)) { 673 regions.add(country); 674 } 675 676 // for Scripts, just test the ones in CLDR 677 for (String localeString : CLDR_CONFIG.getCldrFactory().getAvailable()) { 678 if (localeString.equals("root")) { 679 continue; 680 } 681 CLDRLocale cLocale = CLDRLocale.getInstance(localeString); 682 final String script = cLocale.getScript(); 683 if (script.equals("Dsrt")) { 684 continue; // toy script 685 } 686 final String country = cLocale.getCountry(); 687 if (!country.isEmpty() && !country.equals("001")) { 688 regions.add(country); 689 } 690 if (!script.isEmpty()) { 691 scripts.add(script); 692 // if (!country.isEmpty()) { 693 // // we only need this if the value from script + country is 694 // different from the value of script 695 // combinations.add("und_" + script + "_" + country); 696 // } 697 } 698 } 699 for (String script : scripts) { 700 if (script.equals("Latn")) { 701 assertTrue("contains und_" + script, likely.containsKey("und")); 702 } else if (!assertTrue("contains und_" + script, likely.containsKey("und_" + script))) { 703 704 } 705 } 706 LanguageTagParser ltp = new LanguageTagParser(); 707 Set<String> possibleFixes = new TreeSet<>(); 708 for (String region : regions) { 709 final String undRegion = "und_" + region; 710 if (region.equals("150") && likely.containsKey("und")) { 711 // skip 712 } else if (!assertTrue("contains und_" + region, likely.containsKey(undRegion))) { 713 Set<String> languages = 714 SUPPLEMENTAL_DATA_INFO.getLanguagesForTerritoryWithPopulationData(region); 715 double biggest = -1; 716 String biggestLang = null; 717 for (String language : languages) { 718 PopulationData popData = 719 SUPPLEMENTAL_DATA_INFO.getLanguageAndTerritoryPopulationData( 720 language, region); 721 if (popData.getLiteratePopulation() > biggest) { 722 biggest = popData.getLiteratePopulation(); 723 biggestLang = language; 724 } 725 } 726 if (biggestLang != null) { 727 ltp.set(biggestLang); 728 if (ltp.getScript().isEmpty()) { 729 String biggestMax = likely.get(biggestLang); 730 ltp.set(biggestMax); 731 } 732 ltp.setRegion(region); 733 possibleFixes.add( 734 "<likelySubtag from=\"" + undRegion + "\" to=\"" + ltp + "\"/>"); 735 } 736 } 737 } 738 System.out.println("\t\t" + Joiner.on("\n\t\t").join(possibleFixes)); 739 } 740 testToAttributeValidityStatus()741 public void testToAttributeValidityStatus() { 742 Set<String> okLanguages = VALIDITY.getStatusToCodes(LstrType.language).get(Status.regular); 743 Set<String> okScripts = VALIDITY.getStatusToCodes(LstrType.script).get(Status.regular); 744 Set<String> okRegions = VALIDITY.getStatusToCodes(LstrType.region).get(Status.regular); 745 Multimap<String, String> badFieldsToLocales = TreeMultimap.create(); 746 Set<String> knownExceptions = Set.of("in", "iw", "ji", "jw", "mo", "tl"); 747 for (String s : likely.values()) { 748 CLDRLocale cLocale = CLDRLocale.getInstance(s); 749 final String language = cLocale.getLanguage(); 750 final String script = cLocale.getScript(); 751 final String region = cLocale.getCountry(); 752 if (!okLanguages.contains(language)) { 753 if (knownExceptions.contains(language)) { 754 continue; 755 } 756 badFieldsToLocales.put(language, s); 757 } 758 if (!okScripts.contains(script)) { 759 badFieldsToLocales.put(script, s); 760 } 761 if (!okRegions.contains(region)) { 762 badFieldsToLocales.put(region, s); 763 } 764 } 765 if (!badFieldsToLocales.isEmpty()) { 766 Multimap<Status, String> statusToExamples = TreeMultimap.create(); 767 for (String field : badFieldsToLocales.keySet()) { 768 Status status = VALIDITY.getCodeToStatus(LstrType.language).get(field); 769 if (status == null) { 770 status = VALIDITY.getCodeToStatus(LstrType.script).get(field); 771 } 772 if (status == null) { 773 status = VALIDITY.getCodeToStatus(LstrType.region).get(field); 774 } 775 statusToExamples.put(status, field); 776 } 777 Map<String, String> fieldToOrigin = new TreeMap<>(); 778 for (Entry<Status, Collection<String>> entry : statusToExamples.asMap().entrySet()) { 779 // for (String value : entry.getValue()) { 780 // String origin = 781 // SUPPLEMENTAL_DATA_INFO.getLikelyOrigins().get(value); 782 // fieldToOrigin.put(value, origin == null ? "n/a" : origin); 783 // } 784 warnln("Bad status=" + entry.getKey() + " for " + entry.getValue()); 785 } 786 } 787 } 788 789 /** 790 * Test whether any of the mapping lines in likelySubtags.xml are superfluous. <br> 791 * For example, with the following mappings, #2 and #3 are superfluous, since they would be 792 * produced by the algorithm anyway. 793 * 794 * <ol> 795 * <li>ll => ll_Sss1_R1 796 * <li>ll_Sss2 => ll_Sss2_RR 797 * <li>ll_R2 => ll_Ssss_R2 798 * </ol> 799 * 800 * On the other hand, the following are not: 801 * 802 * <ol> 803 * <li>ll_Sss2 => ll_Sss2_R3 804 * <li>ll_R2 => ll_Sss3_R2 805 * </ol> 806 */ testSuperfluous()807 public void testSuperfluous() { 808 Map<String, String> origins = SUPPLEMENTAL_DATA_INFO.getLikelyOrigins(); 809 810 // collect all items with same language 811 LanguageTagParser ltp = new LanguageTagParser(); 812 TreeMap<String, TreeMap<String, String>> langToLikelySubset = new TreeMap<>(); 813 for (Entry<String, String> entry : likely.entrySet()) { 814 String lang = ltp.set(entry.getKey()).getLanguage(); 815 if (lang.equals("und")) { 816 continue; 817 } 818 TreeMap<String, String> subtree = langToLikelySubset.get(lang); 819 if (subtree == null) { 820 langToLikelySubset.put(lang, subtree = new TreeMap<>()); 821 } 822 subtree.put(entry.getKey(), entry.getValue()); 823 } 824 boolean first = true; 825 826 for (Entry<String, TreeMap<String, String>> langAndMap : langToLikelySubset.entrySet()) { 827 String lang0 = langAndMap.getKey(); 828 Map<String, String> goldenMap = ImmutableMap.copyOf(langAndMap.getValue()); 829 if (goldenMap.size() == 1) { 830 continue; 831 } 832 833 // get test sets and build probe data 834 835 Set<String> scripts = new TreeSet<>(); 836 scripts.add("Egyp"); 837 scripts.add(""); 838 Set<String> regions = new TreeSet<>(); 839 regions.add("AQ"); 840 regions.add(""); 841 for (String key : Sets.union(goldenMap.keySet(), new TreeSet<>(goldenMap.values()))) { 842 scripts.add(ltp.set(key).getScript()); 843 regions.add(ltp.getRegion()); 844 } 845 scripts = ImmutableSet.copyOf(scripts); 846 regions = ImmutableSet.copyOf(regions); 847 848 TreeSet<String> probeData = new TreeSet<>(); 849 ltp.setLanguage(lang0); // clear; 850 for (String script : scripts) { 851 ltp.setScript(script); // clear; 852 for (String region : regions) { 853 ltp.setRegion(region); 854 probeData.add(ltp.toString()); 855 } 856 } 857 858 // see if the omission of a <key,value> makes no difference 859 860 String omittableKey = null; 861 862 for (String keyToTryOmitting : goldenMap.keySet()) { 863 if (!keyToTryOmitting.contains("_")) { 864 continue; 865 } 866 TreeMap<String, String> mapWithOmittedKey = new TreeMap<>(goldenMap); 867 mapWithOmittedKey.remove(keyToTryOmitting); 868 869 boolean makesADifference = false; 870 for (String probe : probeData) { 871 String expected = LikelySubtags.maximize(probe, goldenMap); 872 String actual = LikelySubtags.maximize(probe, mapWithOmittedKey); 873 if (!Objects.equal(expected, actual)) { 874 makesADifference = true; 875 break; 876 } 877 } 878 if (!makesADifference) { 879 omittableKey = keyToTryOmitting; 880 break; 881 } 882 } 883 884 // show the value that doesn't make a difference 885 // NOTE: there may be more than one, but it is sufficient to find one. 886 if (omittableKey != null) { 887 final String origin = origins.get(omittableKey); 888 if (origin != null) { // only check the non-sil for now 889 logKnownIssue("CLDR-17084", "Remove superfluous lines in likelySubtags.txt"); 890 continue; 891 } 892 if (first) { 893 warnln("\tMaps\tKey to omit\tvalue\torigin"); 894 first = false; 895 } 896 assertFalse( 897 "\t" 898 + goldenMap 899 + "\t" 900 + omittableKey 901 + "\t" 902 + goldenMap.get(omittableKey) 903 + "\t" 904 + (origin == null ? "" : origin) 905 + "\t", 906 true); 907 } 908 } 909 } 910 } 911