xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.unittest;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.base.Objects;
5 import com.google.common.collect.ImmutableMap;
6 import com.google.common.collect.ImmutableSet;
7 import com.google.common.collect.Multimap;
8 import com.google.common.collect.Sets;
9 import com.google.common.collect.TreeMultimap;
10 import com.ibm.icu.dev.test.TestFmwk;
11 import com.ibm.icu.dev.util.UnicodeMap;
12 import com.ibm.icu.lang.UCharacter;
13 import com.ibm.icu.lang.UProperty;
14 import com.ibm.icu.lang.UScript;
15 import com.ibm.icu.text.UnicodeSet;
16 import com.ibm.icu.util.VersionInfo;
17 import java.util.Arrays;
18 import java.util.Collection;
19 import java.util.HashSet;
20 import java.util.LinkedHashSet;
21 import java.util.Map;
22 import java.util.Map.Entry;
23 import java.util.Set;
24 import java.util.TreeMap;
25 import java.util.TreeSet;
26 import org.unicode.cldr.draft.ScriptMetadata;
27 import org.unicode.cldr.draft.ScriptMetadata.Info;
28 import org.unicode.cldr.tool.LikelySubtags;
29 import org.unicode.cldr.util.CLDRConfig;
30 import org.unicode.cldr.util.CLDRFile;
31 import org.unicode.cldr.util.CLDRFile.ExemplarType;
32 import org.unicode.cldr.util.CLDRFile.WinningChoice;
33 import org.unicode.cldr.util.CLDRLocale;
34 import org.unicode.cldr.util.CalculatedCoverageLevels;
35 import org.unicode.cldr.util.ChainedMap;
36 import org.unicode.cldr.util.ChainedMap.M3;
37 import org.unicode.cldr.util.CldrUtility;
38 import org.unicode.cldr.util.Containment;
39 import org.unicode.cldr.util.Factory;
40 import org.unicode.cldr.util.LanguageTagParser;
41 import org.unicode.cldr.util.Level;
42 import org.unicode.cldr.util.ScriptToExemplars;
43 import org.unicode.cldr.util.StandardCodes;
44 import org.unicode.cldr.util.StandardCodes.LstrType;
45 import org.unicode.cldr.util.SupplementalDataInfo;
46 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
47 import org.unicode.cldr.util.Validity;
48 import org.unicode.cldr.util.Validity.Status;
49 
50 public class LikelySubtagsTest extends TestFmwk {
51 
52     private static final Validity VALIDITY = Validity.getInstance();
    // When true, TestCompleteness prints the collected language/script/region sets to System.out.
53     private boolean DEBUG = false;
    // True when the SHOW_EXEMPLARS system property is defined (any value), e.g. -DSHOW_EXEMPLARS.
54     private static boolean SHOW_EXEMPLARS = System.getProperty("SHOW_EXEMPLARS") != null;
55     private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
56     private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO =
57             CLDR_CONFIG.getSupplementalDataInfo();
    // The likely-subtags map as returned by the supplemental data; the tests below look up
    // source tags as keys and parse the values as the expanded tags.
58     static final Map<String, String> likely = SUPPLEMENTAL_DATA_INFO.getLikelySubtags();
    // Instance used for the LIKELY.maximize(...) calls in checkAdding.
59     static final LikelySubtags LIKELY = new LikelySubtags();
60 
main(String[] args)61     public static void main(String[] args) {
62         new LikelySubtagsTest().run(args);
63     }
64 
65     static class Tags {
66         final Set<String> languages = new TreeSet<>();
67         final Set<String> scripts = new TreeSet<>();
68         final Set<String> regions = new TreeSet<>();
69         final Set<String> scriptRegion = new TreeSet<>();
70         final Set<String> languageScript = new TreeSet<>();
71         final Set<String> languageRegion = new TreeSet<>();
72         final Set<String> all = new TreeSet<>();
73         final ChainedMap.M4<String, String, String, Boolean> languageToScriptToRegions =
74                 ChainedMap.of(
75                         new TreeMap<String, Object>(),
76                         new TreeMap<String, Object>(),
77                         new TreeMap<String, Object>(),
78                         Boolean.class);
79         final ChainedMap.M3<String, String, Boolean> languageToRegions =
80                 ChainedMap.of(
81                         new TreeMap<String, Object>(),
82                         new TreeMap<String, Object>(),
83                         Boolean.class);
84 
Tags()85         public Tags() {
86             final LanguageTagParser ltp = new LanguageTagParser();
87             for (Entry<String, String> entry : likely.entrySet()) {
88                 add(ltp.set(entry.getKey()), true);
89                 add(ltp.set(entry.getValue()), false);
90             }
91             // add unfamiliar script, unfamiliar region
92             for (String lang : languageToScriptToRegions.keySet()) {
93                 if (lang.equals("und")) {
94                     continue;
95                 }
96                 M3<String, String, Boolean> scriptToRegion = languageToScriptToRegions.get(lang);
97                 final Set<String> scriptsFor = scriptToRegion.keySet();
98                 final Set<String> regionsFor = languageToRegions.get(lang).keySet();
99 
100                 String firstScriptNotIn = getNonEmptyNotIn(scripts, scriptsFor);
101                 String firstRegionNotIn = getNonEmptyNotIn(regions, regionsFor);
102 
103                 languageToScriptToRegions.put(
104                         lang, firstScriptNotIn, firstRegionNotIn, Boolean.TRUE);
105                 // clone for safety before iterating
106                 for (String script : new HashSet<>(scriptsFor)) {
107                     languageToScriptToRegions.put(lang, script, firstRegionNotIn, Boolean.TRUE);
108                 }
109                 for (String region : new HashSet<>(regionsFor)) {
110                     languageToScriptToRegions.put(lang, firstScriptNotIn, region, Boolean.TRUE);
111                 }
112             }
113 
114             // System.out.println("all: " + all);
115             // System.out.println("scriptRegion: " + scriptRegion);
116             // System.out.println("languageScript: " + languageScript);
117             // System.out.println("languageRegion: " + languageRegion);
118         }
119 
getNonEmptyNotIn(Iterable<T> a, Set<T> b)120         private static <T> T getNonEmptyNotIn(Iterable<T> a, Set<T> b) {
121             for (T x : a) {
122                 if (!b.contains(x) && !x.toString().isEmpty()) {
123                     return x;
124                 }
125             }
126             throw new IllegalArgumentException();
127         }
128 
add(LanguageTagParser ltp, boolean source)129         void add(LanguageTagParser ltp, boolean source) {
130             String sourceLanguage = ltp.getLanguage();
131             String sourceScript = ltp.getScript();
132             String sourceRegion = ltp.getRegion();
133             languageToScriptToRegions.put(sourceLanguage, sourceScript, sourceRegion, Boolean.TRUE);
134             languageToScriptToRegions.put(sourceLanguage, sourceScript, "", Boolean.TRUE);
135             languageToScriptToRegions.put(sourceLanguage, "", "", Boolean.TRUE);
136             languageToRegions.put(sourceLanguage, "", Boolean.TRUE);
137             if (StandardCodes.isCountry(sourceRegion)) {
138                 languageToScriptToRegions.put(sourceLanguage, "", sourceRegion, Boolean.TRUE);
139                 languageToRegions.put(sourceLanguage, sourceRegion, Boolean.TRUE);
140             }
141 
142             // capture all cases of 2 items
143             if (source) {
144                 if (!sourceScript.isEmpty() && !sourceRegion.isEmpty()) {
145                     if (!sourceLanguage.equals("und")) {
146                         all.add(ltp.toString());
147                     } else {
148                         scriptRegion.add(ltp.toString());
149                     }
150                 } else if (!sourceLanguage.equals("und")) {
151                     if (!sourceScript.isEmpty()) {
152                         languageScript.add(ltp.toString());
153                     } else if (!sourceRegion.isEmpty()) {
154                         languageRegion.add(ltp.toString());
155                     }
156                 }
157             }
158             languages.add(sourceLanguage);
159             scripts.add(sourceScript);
160             if (StandardCodes.isCountry(sourceRegion) || sourceRegion.isEmpty()) {
161                 regions.add(sourceRegion);
162             }
163         }
164     }
165 
166     static final Tags TAGS = new Tags();
167 
    // Parsers reused across checkAdding calls: maxLtp is set to the maximized tag, sourceLtp to
    // the tag being checked (and temporarily modified while fields are copied over).
168     final LanguageTagParser maxLtp = new LanguageTagParser();
169     final LanguageTagParser sourceLtp = new LanguageTagParser();
170 
171     /**
172      * Return false if we should skip the language
173      *
174      * @param source
175      * @return
176      */
checkAdding(String source)177     public boolean checkAdding(String source) {
178         // if X maps to Y, then adding a field from Y to X will still map to Y
179         // Example:
180         // und_AF => fa_Arab_AF
181         // therefore, the following should also be true:
182         // und_Arab_AF => fa_Arab_AF
183         // fa_AF => fa_Arab_AF
184         // fa_Arab_AF => fa_Arab_AF
185 
186         String max = LIKELY.maximize(source);
187         if (!assertNotEquals("Maximize " + source, null, max)) {
188             return source.contains("_");
189         }
190         sourceLtp.set(source);
191         if (!sourceLtp.getRegion().isEmpty() && !StandardCodes.isCountry(sourceLtp.getRegion())) {
192             return true;
193         }
194         maxLtp.set(max);
195         for (int i = 1; i < 8; ++i) {
196             if ((i & 1) != 0) {
197                 if (!sourceLtp.getLanguage().equals("und")) continue;
198                 sourceLtp.setLanguage(maxLtp.getLanguage());
199             }
200             if ((i & 2) != 0) {
201                 if (!sourceLtp.getScript().isEmpty()) continue;
202                 sourceLtp.setScript(maxLtp.getScript());
203             }
204             if ((i & 4) != 0) {
205                 if (!sourceLtp.getRegion().isEmpty()) continue;
206                 sourceLtp.setRegion(maxLtp.getRegion());
207             }
208             String test = sourceLtp.toString();
209             final String maximize = LIKELY.maximize(test);
210             if (!max.equals(maximize)) {
211                 // max(source) = max, max(test) ≠ max
212                 if (!assertEquals(
213                         String.format(
214                                 "checkAdding: max(%s)->%s, however max(%s)->", source, max, test),
215                         max,
216                         maximize)) {
217                     // LIKELY.maximize(test); // Could step into this for debugging.
218                 }
219             }
220             sourceLtp.set(source); // restore
221         }
222         return true;
223     }
224 
TestCompleteness()225     public void TestCompleteness() {
226         final LanguageTagParser ltp = new LanguageTagParser();
227         if (DEBUG) {
228             System.out.println(TAGS.languages.size() + "\t" + TAGS.languages);
229             System.out.println(TAGS.scripts.size() + "\t" + TAGS.scripts);
230             System.out.println(TAGS.regions.size() + "\t" + TAGS.regions);
231         }
232         main:
233         for (Entry<String, Map<String, Map<String, Boolean>>> languageScriptRegion :
234                 TAGS.languageToScriptToRegions) {
235             String language = languageScriptRegion.getKey();
236             ltp.set(language); // clears script, region
237             for (Entry<String, Map<String, Boolean>> scriptRegion :
238                     languageScriptRegion.getValue().entrySet()) {
239                 String script = scriptRegion.getKey();
240                 ltp.setScript(script);
241                 for (String region : scriptRegion.getValue().keySet()) {
242                     ltp.setRegion(region);
243                     String testTag = ltp.toString();
244                     // System.out.println(testTag);
245                     if (!testTag.equals("und") && !checkAdding(testTag)) {
246                         checkAdding(testTag); // for debugging
247                         continue main;
248                     }
249                 }
250             }
251         }
252     }
253 
254     static Set<String> exceptions =
255             new HashSet<>(
256                     Arrays.asList(
257                             "Zyyy", "Zinh", "Zzzz", "Brai",
258                             "Cpmn")); // scripts with no default language; consulted by TestForMissingScriptMetadata and TestMissingInfoForScript
259 
TestStability()260     public void TestStability() {
261         // when maximized must never change
262         // first get all the subtags
263         // then test all the combinations
264         LanguageTagParser ltp = new LanguageTagParser();
265         for (Entry<String, String> entry : likely.entrySet()) {
266             ltp.set(entry.getKey());
267             String sourceLanguage = ltp.getLanguage();
268             if (sourceLanguage.equals("und")) {
269                 sourceLanguage = "";
270             }
271             String sourceScript = ltp.getScript();
272             String sourceRegion = ltp.getRegion();
273             ltp.set(entry.getValue());
274             String targetLanguage = ltp.getLanguage();
275             String targetScript = ltp.getScript();
276             String targetRegion = ltp.getRegion();
277             if (!sourceLanguage.isEmpty()) {
278                 assertEquals("language", sourceLanguage, targetLanguage);
279             }
280             if (!sourceScript.isEmpty()) {
281                 assertEquals("script", sourceScript, targetScript);
282             }
283             if (!sourceRegion.isEmpty()) {
284                 if (Containment.isLeaf(sourceRegion)) {
285                     assertEquals("region", sourceRegion, targetRegion);
286                 }
287             }
288         }
289     }
290 
TestForMissingScriptMetadata()291     public void TestForMissingScriptMetadata() {
292         TreeSet<String> metadataScripts = new TreeSet<>(ScriptMetadata.getScripts());
293         UnicodeSet current = new UnicodeSet(0, 0x10FFFF);
294         UnicodeSet toRemove = new UnicodeSet();
295 
296         while (!current.isEmpty()) {
297             int ch = current.charAt(0);
298             int script = UScript.getScript(ch);
299             String shortName = UScript.getShortName(script);
300             Info i = ScriptMetadata.getInfo(shortName);
301             if (i == null) {
302                 errln("Script Metadata is missing: " + shortName);
303                 continue;
304             }
305             if (i.likelyLanguage.equals("und") && !exceptions.contains(shortName)) {
306                 errln("Script has no likely language: " + shortName);
307             }
308             toRemove.applyIntPropertyValue(UProperty.SCRIPT, script);
309             current.removeAll(toRemove);
310             metadataScripts.remove(shortName);
311         }
312         metadataScripts.removeAll(
313                 Arrays.asList("Hans", "Hant", "Hanb", "Jamo", "Jpan", "Kore")); // remove
314         // "combo"
315         // scripts
316         if (!metadataScripts.isEmpty()) {
317             // Warning, not error, so that we can add scripts to the script metadata
318             // and later update to the Unicode version that has characters for those scripts.
319             warnln("Script Metadata for characters not in Unicode: " + metadataScripts);
320         }
321     }
322 
TestMissingInfoForLanguage()323     public void TestMissingInfoForLanguage() {
324         CLDRFile english = CLDR_CONFIG.getEnglish().getUnresolved();
325 
326         CalculatedCoverageLevels ccl = CalculatedCoverageLevels.getInstance();
327 
328         for (String language : CLDR_CONFIG.getCldrFactory().getAvailableLanguages()) {
329             if (language.contains("_") || language.equals("root")) {
330                 continue;
331             }
332             String likelyExpansion = likely.get(language);
333             if (likelyExpansion == null) {
334                 errln("Missing likely subtags for: " + language);
335             } else {
336                 logln("Likely subtags for " + language + ":\t " + likely);
337             }
338             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
339             String englishName = english.getStringValue(path);
340             if (englishName == null) {
341                 Level covLevel = ccl.getEffectiveCoverageLevel(language);
342                 if (covLevel == null || !covLevel.isAtLeast(Level.BASIC)) {
343                     // https://unicode-org.atlassian.net/browse/CLDR-15663
344                     if (logKnownIssue(
345                             "CLDR-15663",
346                             "English translation should not be required for sub-basic language name")) {
347                         continue; // skip error
348                     }
349                 }
350                 errln("Missing English translation for: " + language + " which is at " + covLevel);
351             }
352         }
353     }
354 
TestMissingInfoForRegion()355     public void TestMissingInfoForRegion() {
356         CLDRFile english = CLDR_CONFIG.getEnglish();
357 
358         for (String region : StandardCodes.make().getGoodAvailableCodes("territory")) {
359             String likelyExpansion = likely.get("und_" + region);
360             if (likelyExpansion == null) {
361                 if (SUPPLEMENTAL_DATA_INFO.getContained(region) == null) { // not
362                     // container
363                     String likelyTag = LikelySubtags.maximize("und_" + region, likely);
364                     if (likelyTag == null) { //  || !likelyTag.startsWith("en_Latn_")
365                         logln(
366                                 "Missing likely subtags for region: "
367                                         + region
368                                         + "\t"
369                                         + english.getName("territory", region));
370                     }
371                 } else { // container
372                     logln(
373                             "Missing likely subtags for macroregion (fix to exclude regions having 'en'): "
374                                     + region
375                                     + "\t"
376                                     + english.getName("territory", region));
377                 }
378             } else {
379                 logln("Likely subtags for region: " + region + ":\t " + likely);
380             }
381             String path = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, region);
382             String englishName = english.getStringValue(path);
383             if (englishName == null) {
384                 errln("Missing English translation for: " + region);
385             }
386         }
387     }
388 
389     // typically historical scripts that don't need to be in likely subtags
390 
391     static final Set<String> KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS =
392             ImmutableSet.of("Hatr", "Cpmn", "Ougr");
393 
TestMissingInfoForScript()394     public void TestMissingInfoForScript() {
395         VersionInfo icuUnicodeVersion = UCharacter.getUnicodeVersion();
396         TreeSet<String> sorted = new TreeSet<>(ScriptMetadata.getScripts());
397         Set<String> exceptions2 =
398                 new HashSet<>(
399                         Arrays.asList("zh_Hans_CN", "hnj_Hmnp_US", "hnj_Hmng_LA", "iu_Cans_CA"));
400         for (String script : sorted) {
401             if (exceptions.contains(script) || script.equals("Latn") || script.equals("Dsrt")) {
402                 // we minimize away und_X, when the code puts in en...US
403                 continue;
404             }
405             Info i = ScriptMetadata.getInfo(script);
406             // System.out.println(i);
407             String likelyLanguage = i.likelyLanguage;
408             String originCountry = i.originCountry;
409             String undScript = "und_" + script;
410             String langScript = likelyLanguage + "_" + script + "_";
411             String likelyExpansion = likely.get(undScript);
412             if (likelyExpansion == null) {
413                 if (!KNOWN_SCRIPTS_WITHOUT_LIKELY_SUBTAGS.contains(script)) {
414                     String msg =
415                             "likelySubtags.xml missing language for script (und_"
416                                     + script
417                                     + "). Script Metadata suggests that it should be something like:\t "
418                                     + showOverride(script, originCountry, langScript);
419                     if (i.age.compareTo(icuUnicodeVersion) <= 0) {
420                         // Error: Missing data for a script in ICU's Unicode version.
421                         errln(msg);
422                     } else {
423                         // Warning: Missing data for a script in a future Unicode version.
424                         warnln(msg);
425                     }
426                 }
427             } else if (!exceptions2.contains(likelyExpansion)
428                     && !likelyExpansion.startsWith(langScript)) {
429                 // if
430                 // (logKnownIssue("Cldrbug:7181","Missing script metadata for "
431                 // + script)
432                 // && (script.equals("Tfng") || script.equals("Brah"))) {
433                 // logln("Wrong likely language for script (und_" + script +
434                 // "). Should not be " + likelyExpansion
435                 // + ", but something like:\t " + showOverride(script,
436                 // originCountry, langScript));
437                 // } else {
438                 errln(
439                         "likelySubtags.xml has wrong language for script (und_"
440                                 + script
441                                 + "). Should not be "
442                                 + likelyExpansion
443                                 + ", but Script Metadata suggests something like:\t "
444                                 + showOverride(script, originCountry, langScript));
445                 // }
446             } else {
447                 logln("OK: " + undScript + " => " + likelyExpansion);
448             }
449         }
450         /**
451          * und_Bopo => zh_Bopo_TW und_Copt => cop_Copt_EG // fix 002 und_Dsrt => en_Dsrt_US // fix
452          * US
453          */
454     }
455 
showOverride(String script, String originCountry, String langScript)456     public String showOverride(String script, String originCountry, String langScript) {
457         return "{\"und_" + script + "\", \"" + langScript + originCountry + "\"},";
458     }
459 
460     /**
461      * Test two issues:
462      *
463      * <ul>
464      *   <li>That the script of the locale's examplars matches the script derived from the locale's
465      *       identifier.
466      *   <li>That the union of the exemplar sets (main+aux) for all locales with the script matches
467      *       what is in ltp.getResolvedScript()
468      * </ul>
469      *
470      * Written as one test, to avoid the overhead of iterating over all locales twice.
471      */
testGetResolvedScriptVsExemplars()472     public void testGetResolvedScriptVsExemplars() {
473         Factory factory = CLDR_CONFIG.getCldrFactory();
474         LanguageTagParser ltp = new LanguageTagParser();
475         Multimap<String, UnicodeSet> scriptToMains = TreeMultimap.create();
476         Multimap<String, UnicodeSet> scriptToAuxes = TreeMultimap.create();
477         UnicodeSet collectedBad = new UnicodeSet();
478         for (String locale : factory.getAvailable()) {
479             if ("root".equals(locale)) {
480                 continue;
481             }
482             CLDRFile cldrFile = factory.make(locale, true);
483             UnicodeSet main = cldrFile.getRawExemplarSet(ExemplarType.main, WinningChoice.WINNING);
484             main = checkSet("main", locale, main, collectedBad);
485             UnicodeSet aux =
486                     cldrFile.getRawExemplarSet(ExemplarType.auxiliary, WinningChoice.WINNING);
487             aux = checkSet("aux", locale, aux, collectedBad);
488             String script = null;
489             int uScript = 0;
490             for (String s : main) {
491                 uScript = UScript.getScript(s.codePointAt(0));
492                 if (uScript > UScript.INHERITED) {
493                     script = UScript.getShortName(uScript);
494                     break;
495                 }
496             }
497             if (script == null) {
498                 errln("No script for " + locale);
499                 continue;
500             }
501             String ltpScript = ltp.set(locale).getResolvedScript();
502             switch (uScript) {
503                 case UScript.HAN:
504                     switch (ltp.getLanguage()) {
505                         case "ja":
506                             script = "Jpan";
507                             break;
508                         case "yue":
509                             script = ltp.getScript();
510                             if (script.isEmpty()) {
511                                 script = "Hant";
512                             }
513                             break;
514                         case "zh":
515                             script = ltp.getScript();
516                             if (script.isEmpty()) {
517                                 script = "Hans";
518                             }
519                             break;
520                     }
521                     break;
522                 case UScript.HANGUL:
523                     switch (ltp.getLanguage()) {
524                         case "ko":
525                             script = "Kore";
526                             break;
527                     }
528             }
529             if (!assertEquals(locale, script, ltpScript)) {
530                 ltp.getResolvedScript(); // for debugging
531             }
532             scriptToMains.put(ltpScript, main.freeze());
533             if (!aux.isEmpty()) {
534                 scriptToAuxes.put(ltpScript, aux.freeze());
535             }
536         }
537 
538         if (!collectedBad.isEmpty()) {
539             warnln(
540                     "Locales have "
541                             + collectedBad.size()
542                             + " unexpected characters in main and/or aux:\t"
543                             + collectedBad.toPattern(false)
544                             + "\n Use -DSHOW_EXEMPLARS for details");
545         }
546 
547         // now check that ScriptToExemplars.getExemplars matches the data
548 
549         Set<String> problemScripts = new LinkedHashSet<>();
550         Map<String, UnicodeSet> expected = new TreeMap<>();
551         for (Entry<String, Collection<UnicodeSet>> entry : scriptToMains.asMap().entrySet()) {
552             String script = entry.getKey();
553             Collection<UnicodeSet> mains = entry.getValue();
554             Collection<UnicodeSet> auxes = scriptToAuxes.get(script);
555 
556             UnicodeSet flattened;
557             if (mains.size() <= 1 && auxes.size() <= 1) {
558                 continue;
559             } else {
560                 UnicodeMap<Integer> counts = new UnicodeMap<>();
561                 getCounts(mains, counts);
562                 flattened = getUncommon(counts, mains.size());
563                 if (counts.size() < 32) {
564                     getCounts(auxes, counts);
565                     flattened = getUncommon(counts, mains.size());
566                 }
567             }
568             expected.put(script, flattened.freeze());
569         }
570         for (Entry<String, UnicodeSet> entry : expected.entrySet()) {
571             String script = entry.getKey();
572             UnicodeSet flattened = entry.getValue();
573 
574             // now compare to what we get from the cached file, to make sure the latter is up to
575             // date
576 
577             if (!assertEquals(
578                     script,
579                     flattened.toPattern(false),
580                     ScriptToExemplars.getExemplars(script).toPattern(false))) {
581                 problemScripts.add(script);
582             }
583         }
584 
585         if (!problemScripts.isEmpty()) {
586             warnln(
587                     "Adjust the data in scriptToExemplars.txt. Use -DSHOW_EXEMPLARS to get a fresh copy, or reset to expected value for: "
588                             + problemScripts);
589             if (SHOW_EXEMPLARS) {
590                 for (Entry<String, UnicodeSet> entry : expected.entrySet()) {
591                     String script = entry.getKey();
592                     UnicodeSet flattened = entry.getValue();
593                     if (!flattened.isEmpty()) {
594                         System.out.println(
595                                 script
596                                         + " ;\t"
597                                         + flattened.size()
598                                         + " ;\t"
599                                         + flattened.toPattern(false));
600                     }
601                 }
602             }
603         }
604     }
605 
606     static final UnicodeSet MAIN_AUX_EXPECTED = new UnicodeSet("[\\p{L}\\p{M}\\p{Cf}·]").freeze();
607 
checkSet( String title, String locale, UnicodeSet main, UnicodeSet collected)608     private UnicodeSet checkSet(
609             String title, String locale, UnicodeSet main, UnicodeSet collected) {
610         UnicodeSet bad = new UnicodeSet();
611         for (String s : main) {
612             if (!MAIN_AUX_EXPECTED.containsAll(s)) {
613                 bad.add(s);
614             }
615         }
616         if (!bad.isEmpty()) {
617             if (SHOW_EXEMPLARS) {
618                 warnln(
619                         "\t"
620                                 + title
621                                 + "\tLocale\t"
622                                 + locale
623                                 + "\thas "
624                                 + bad.size()
625                                 + " unexpected exemplar characters:\t"
626                                 + bad.toPattern(false));
627             }
628             collected.addAll(bad);
629         }
630         return CldrUtility.flatten(new UnicodeSet(main).removeAll(bad));
631     }
632 
633     /**
634      * Remove items with a count equal to size (they are common to all locales), and flatten
635      * (against the whole set)
636      */
getUncommon(UnicodeMap<Integer> counts, int size)637     private UnicodeSet getUncommon(UnicodeMap<Integer> counts, int size) {
638         UnicodeSet flattenedAll =
639                 CldrUtility.flatten(counts.keySet()); // we flatten against the whole set
640         UnicodeSet result = new UnicodeSet();
641         for (String s : flattenedAll) {
642             int count = counts.get(s);
643             if (count != size) {
644                 result.add(s);
645             }
646         }
647         return result.freeze();
648     }
649 
getCounts(Collection<UnicodeSet> usets, UnicodeMap<Integer> counts)650     private void getCounts(Collection<UnicodeSet> usets, UnicodeMap<Integer> counts) {
651         for (UnicodeSet uset : usets) {
652             for (String s : uset) {
653                 Integer old = counts.get(s);
654                 if (old == null) {
655                     counts.put(s, 1);
656                 } else {
657                     counts.put(s, old + 1);
658                 }
659             }
660         }
661     }
662 
testUndAllScriptsAndRegions()663     public void testUndAllScriptsAndRegions() {
664         Set<String> regions = new TreeSet<>();
665         Set<String> scripts = new TreeSet<>();
666         Set<String> regularCountries =
667                 VALIDITY.getStatusToCodes(LstrType.region).get(Status.regular);
668         Set<String> macroRegions =
669                 Set
670                         .of(); // Validity.getInstance().getStatusToCodes(LstrType.region).get(Status.macroregion);
671 
672         for (String country : Sets.union(regularCountries, macroRegions)) {
673             regions.add(country);
674         }
675 
676         // for Scripts, just test the ones in CLDR
677         for (String localeString : CLDR_CONFIG.getCldrFactory().getAvailable()) {
678             if (localeString.equals("root")) {
679                 continue;
680             }
681             CLDRLocale cLocale = CLDRLocale.getInstance(localeString);
682             final String script = cLocale.getScript();
683             if (script.equals("Dsrt")) {
684                 continue; // toy script
685             }
686             final String country = cLocale.getCountry();
687             if (!country.isEmpty() && !country.equals("001")) {
688                 regions.add(country);
689             }
690             if (!script.isEmpty()) {
691                 scripts.add(script);
692                 //                if (!country.isEmpty()) {
693                 //                    // we only need this if the value from script + country is
694                 // different from the value of script
695                 //                    combinations.add("und_" + script + "_" + country);
696                 //                }
697             }
698         }
699         for (String script : scripts) {
700             if (script.equals("Latn")) {
701                 assertTrue("contains und_" + script, likely.containsKey("und"));
702             } else if (!assertTrue("contains und_" + script, likely.containsKey("und_" + script))) {
703 
704             }
705         }
706         LanguageTagParser ltp = new LanguageTagParser();
707         Set<String> possibleFixes = new TreeSet<>();
708         for (String region : regions) {
709             final String undRegion = "und_" + region;
710             if (region.equals("150") && likely.containsKey("und")) {
711                 // skip
712             } else if (!assertTrue("contains und_" + region, likely.containsKey(undRegion))) {
713                 Set<String> languages =
714                         SUPPLEMENTAL_DATA_INFO.getLanguagesForTerritoryWithPopulationData(region);
715                 double biggest = -1;
716                 String biggestLang = null;
717                 for (String language : languages) {
718                     PopulationData popData =
719                             SUPPLEMENTAL_DATA_INFO.getLanguageAndTerritoryPopulationData(
720                                     language, region);
721                     if (popData.getLiteratePopulation() > biggest) {
722                         biggest = popData.getLiteratePopulation();
723                         biggestLang = language;
724                     }
725                 }
726                 if (biggestLang != null) {
727                     ltp.set(biggestLang);
728                     if (ltp.getScript().isEmpty()) {
729                         String biggestMax = likely.get(biggestLang);
730                         ltp.set(biggestMax);
731                     }
732                     ltp.setRegion(region);
733                     possibleFixes.add(
734                             "<likelySubtag from=\"" + undRegion + "\" to=\"" + ltp + "\"/>");
735                 }
736             }
737         }
738         System.out.println("\t\t" + Joiner.on("\n\t\t").join(possibleFixes));
739     }
740 
testToAttributeValidityStatus()741     public void testToAttributeValidityStatus() {
742         Set<String> okLanguages = VALIDITY.getStatusToCodes(LstrType.language).get(Status.regular);
743         Set<String> okScripts = VALIDITY.getStatusToCodes(LstrType.script).get(Status.regular);
744         Set<String> okRegions = VALIDITY.getStatusToCodes(LstrType.region).get(Status.regular);
745         Multimap<String, String> badFieldsToLocales = TreeMultimap.create();
746         Set<String> knownExceptions = Set.of("in", "iw", "ji", "jw", "mo", "tl");
747         for (String s : likely.values()) {
748             CLDRLocale cLocale = CLDRLocale.getInstance(s);
749             final String language = cLocale.getLanguage();
750             final String script = cLocale.getScript();
751             final String region = cLocale.getCountry();
752             if (!okLanguages.contains(language)) {
753                 if (knownExceptions.contains(language)) {
754                     continue;
755                 }
756                 badFieldsToLocales.put(language, s);
757             }
758             if (!okScripts.contains(script)) {
759                 badFieldsToLocales.put(script, s);
760             }
761             if (!okRegions.contains(region)) {
762                 badFieldsToLocales.put(region, s);
763             }
764         }
765         if (!badFieldsToLocales.isEmpty()) {
766             Multimap<Status, String> statusToExamples = TreeMultimap.create();
767             for (String field : badFieldsToLocales.keySet()) {
768                 Status status = VALIDITY.getCodeToStatus(LstrType.language).get(field);
769                 if (status == null) {
770                     status = VALIDITY.getCodeToStatus(LstrType.script).get(field);
771                 }
772                 if (status == null) {
773                     status = VALIDITY.getCodeToStatus(LstrType.region).get(field);
774                 }
775                 statusToExamples.put(status, field);
776             }
777             Map<String, String> fieldToOrigin = new TreeMap<>();
778             for (Entry<Status, Collection<String>> entry : statusToExamples.asMap().entrySet()) {
779                 //                for (String value : entry.getValue()) {
780                 //                    String origin =
781                 // SUPPLEMENTAL_DATA_INFO.getLikelyOrigins().get(value);
782                 //                    fieldToOrigin.put(value, origin == null ? "n/a" : origin);
783                 //                }
784                 warnln("Bad status=" + entry.getKey() + " for " + entry.getValue());
785             }
786         }
787     }
788 
789     /**
790      * Test whether any of the mapping lines in likelySubtags.xml are superfluous. <br>
791      * For example, with the following mappings, #2 and #3 are superfluous, since they would be
792      * produced by the algorithm anyway.
793      *
794      * <ol>
795      *   <li>ll => ll_Sss1_R1
796      *   <li>ll_Sss2 => ll_Sss2_RR
797      *   <li>ll_R2 => ll_Ssss_R2
798      * </ol>
799      *
800      * On the other hand, the following are not:
801      *
802      * <ol>
803      *   <li>ll_Sss2 => ll_Sss2_R3
804      *   <li>ll_R2 => ll_Sss3_R2
805      * </ol>
806      */
testSuperfluous()807     public void testSuperfluous() {
808         Map<String, String> origins = SUPPLEMENTAL_DATA_INFO.getLikelyOrigins();
809 
810         // collect all items with same language
811         LanguageTagParser ltp = new LanguageTagParser();
812         TreeMap<String, TreeMap<String, String>> langToLikelySubset = new TreeMap<>();
813         for (Entry<String, String> entry : likely.entrySet()) {
814             String lang = ltp.set(entry.getKey()).getLanguage();
815             if (lang.equals("und")) {
816                 continue;
817             }
818             TreeMap<String, String> subtree = langToLikelySubset.get(lang);
819             if (subtree == null) {
820                 langToLikelySubset.put(lang, subtree = new TreeMap<>());
821             }
822             subtree.put(entry.getKey(), entry.getValue());
823         }
824         boolean first = true;
825 
826         for (Entry<String, TreeMap<String, String>> langAndMap : langToLikelySubset.entrySet()) {
827             String lang0 = langAndMap.getKey();
828             Map<String, String> goldenMap = ImmutableMap.copyOf(langAndMap.getValue());
829             if (goldenMap.size() == 1) {
830                 continue;
831             }
832 
833             // get test sets and build probe data
834 
835             Set<String> scripts = new TreeSet<>();
836             scripts.add("Egyp");
837             scripts.add("");
838             Set<String> regions = new TreeSet<>();
839             regions.add("AQ");
840             regions.add("");
841             for (String key : Sets.union(goldenMap.keySet(), new TreeSet<>(goldenMap.values()))) {
842                 scripts.add(ltp.set(key).getScript());
843                 regions.add(ltp.getRegion());
844             }
845             scripts = ImmutableSet.copyOf(scripts);
846             regions = ImmutableSet.copyOf(regions);
847 
848             TreeSet<String> probeData = new TreeSet<>();
849             ltp.setLanguage(lang0); // clear;
850             for (String script : scripts) {
851                 ltp.setScript(script); // clear;
852                 for (String region : regions) {
853                     ltp.setRegion(region);
854                     probeData.add(ltp.toString());
855                 }
856             }
857 
858             // see if the omission of a <key,value> makes no difference
859 
860             String omittableKey = null;
861 
862             for (String keyToTryOmitting : goldenMap.keySet()) {
863                 if (!keyToTryOmitting.contains("_")) {
864                     continue;
865                 }
866                 TreeMap<String, String> mapWithOmittedKey = new TreeMap<>(goldenMap);
867                 mapWithOmittedKey.remove(keyToTryOmitting);
868 
869                 boolean makesADifference = false;
870                 for (String probe : probeData) {
871                     String expected = LikelySubtags.maximize(probe, goldenMap);
872                     String actual = LikelySubtags.maximize(probe, mapWithOmittedKey);
873                     if (!Objects.equal(expected, actual)) {
874                         makesADifference = true;
875                         break;
876                     }
877                 }
878                 if (!makesADifference) {
879                     omittableKey = keyToTryOmitting;
880                     break;
881                 }
882             }
883 
884             // show the value that doesn't make a difference
885             // NOTE: there may be more than one, but it is sufficient to find one.
886             if (omittableKey != null) {
887                 final String origin = origins.get(omittableKey);
888                 if (origin != null) { // only check the non-sil for now
889                     logKnownIssue("CLDR-17084", "Remove superfluous lines in likelySubtags.txt");
890                     continue;
891                 }
892                 if (first) {
893                     warnln("\tMaps\tKey to omit\tvalue\torigin");
894                     first = false;
895                 }
896                 assertFalse(
897                         "\t"
898                                 + goldenMap
899                                 + "\t"
900                                 + omittableKey
901                                 + "\t"
902                                 + goldenMap.get(omittableKey)
903                                 + "\t"
904                                 + (origin == null ? "" : origin)
905                                 + "\t",
906                         true);
907             }
908         }
909     }
910 }
911