xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestInheritance.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.unittest;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.collect.Sets;
5 import com.ibm.icu.dev.test.TestFmwk;
6 import com.ibm.icu.impl.Relation;
7 import com.ibm.icu.impl.Row.R2;
8 import java.io.IOException;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.Collections;
12 import java.util.HashMap;
13 import java.util.HashSet;
14 import java.util.LinkedHashSet;
15 import java.util.List;
16 import java.util.Map;
17 import java.util.Map.Entry;
18 import java.util.Set;
19 import java.util.TreeMap;
20 import java.util.TreeSet;
21 import java.util.regex.Matcher;
22 import org.unicode.cldr.draft.ScriptMetadata;
23 import org.unicode.cldr.draft.ScriptMetadata.Info;
24 import org.unicode.cldr.tool.GenerateMaximalLocales;
25 import org.unicode.cldr.tool.LikelySubtags;
26 import org.unicode.cldr.util.Builder;
27 import org.unicode.cldr.util.CLDRConfig;
28 import org.unicode.cldr.util.CLDRFile;
29 import org.unicode.cldr.util.CLDRLocale;
30 import org.unicode.cldr.util.ChainedMap;
31 import org.unicode.cldr.util.ChainedMap.M3;
32 import org.unicode.cldr.util.CldrUtility;
33 import org.unicode.cldr.util.Iso3166Data;
34 import org.unicode.cldr.util.LanguageTagParser;
35 import org.unicode.cldr.util.LocaleIDParser;
36 import org.unicode.cldr.util.PatternCache;
37 import org.unicode.cldr.util.StandardCodes;
38 import org.unicode.cldr.util.SupplementalDataInfo;
39 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
40 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type;
41 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
42 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
43 
44 public class TestInheritance extends TestFmwk {
45 
46     static CLDRConfig testInfo = CLDRConfig.getInstance();
47 
48     private static boolean DEBUG = CldrUtility.getProperty("DEBUG", false);
49 
50     private static Matcher pathMatcher =
51             PatternCache.get(CldrUtility.getProperty("XPATH", ".*")).matcher("");
52 
main(String[] args)53     public static void main(String[] args) throws IOException {
54         new TestInheritance().run(args);
55     }
56 
57     private static final SupplementalDataInfo dataInfo = SupplementalDataInfo.getInstance();
58     private static final Set<String> defaultContents = dataInfo.getDefaultContentLocales();
59 
60     private static final boolean EXPECT_EQUALITY = false;
61 
62     private static Set<String> availableLocales = testInfo.getFullCldrFactory().getAvailable();
63 
TestLocalesHaveOfficial()64     public void TestLocalesHaveOfficial() {
65         // If we have a language, we have all the region locales where the
66         // language is official
67         Set<String> SKIP_TERRITORIES = new HashSet<>(Arrays.asList("001", "150"));
68         SKIP_TERRITORIES.addAll(Iso3166Data.getRegionCodesNotForTranslation());
69         for (Entry<String, R2<List<String>, String>> s :
70                 dataInfo.getLocaleAliasInfo().get("territory").entrySet()) {
71             SKIP_TERRITORIES.add(s.getKey());
72         }
73 
74         LanguageTagParser ltp = new LanguageTagParser();
75 
76         Relation<String, String> languageLocalesSeen =
77                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
78 
79         Set<String> testOrg = StandardCodes.make().getLocaleCoverageLocales("google");
80         ChainedMap.M4<String, OfficialStatus, String, Boolean> languageToOfficialChildren =
81                 ChainedMap.of(
82                         new TreeMap<String, Object>(),
83                         new TreeMap<OfficialStatus, Object>(),
84                         new TreeMap<String, Object>(),
85                         Boolean.class);
86 
87         // gather the data
88 
89         for (String language : dataInfo.getLanguagesForTerritoriesPopulationData()) {
90             for (String territory : dataInfo.getTerritoriesForPopulationData(language)) {
91                 if (SKIP_TERRITORIES.contains(territory)) {
92                     continue;
93                 }
94                 PopulationData data =
95                         dataInfo.getLanguageAndTerritoryPopulationData(language, territory);
96                 OfficialStatus status = data.getOfficialStatus();
97                 if (data.getOfficialStatus() != OfficialStatus.unknown) {
98                     String locale = removeScript(language + "_" + territory);
99                     String lang = removeScript(ltp.set(locale).getLanguage());
100                     languageToOfficialChildren.put(lang, status, locale, Boolean.TRUE);
101                     languageLocalesSeen.put(lang, locale);
102                 }
103             }
104         }
105 
106         // flesh it out by adding 'clean' codes.
107         // also get the child locales in cldr.
108 
109         Relation<String, String> languageToChildren =
110                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
111         for (String locale : testInfo.getCldrFactory().getAvailable()) {
112             String lang = ltp.set(locale).getLanguage();
113             if (SKIP_TERRITORIES.contains(ltp.getRegion())) {
114                 continue;
115             }
116             lang = removeScript(lang);
117             locale = removeScript(locale);
118 
119             if (!lang.equals(locale)) {
120                 languageToChildren.put(lang, locale);
121                 Set<String> localesSeen = languageLocalesSeen.get(lang);
122                 if (localesSeen == null || !localesSeen.contains(locale)) {
123                     languageToOfficialChildren.put(
124                             lang, OfficialStatus.unknown, locale, Boolean.TRUE);
125                 }
126             }
127         }
128 
129         for (Entry<String, Set<String>> languageAndChildren : languageToChildren.keyValuesSet()) {
130             String language = languageAndChildren.getKey();
131             Set<String> children = languageAndChildren.getValue();
132             M3<OfficialStatus, String, Boolean> officalStatusToChildren =
133                     languageToOfficialChildren.get(language);
134             for (Entry<OfficialStatus, Map<String, Boolean>> entry : officalStatusToChildren) {
135                 OfficialStatus status = entry.getKey();
136                 if (status != OfficialStatus.official
137                         && status != OfficialStatus.de_facto_official) {
138                     continue;
139                 }
140                 Set<String> officalChildren = entry.getValue().keySet();
141                 if (!children.containsAll(officalChildren)) {
142                     Set<String> missing = new TreeSet<>(officalChildren);
143                     missing.removeAll(children);
144                     String message =
145                             "Missing CLDR locales for " + status + " languages: " + missing;
146                     errln(message);
147                 } else {
148                     logln(
149                             "CLDR locales "
150                                     + children
151                                     + " cover "
152                                     + status
153                                     + " locales "
154                                     + officalChildren);
155                 }
156             }
157         }
158 
159         if (DEBUG) {
160             Set<String> languages = new TreeSet<>(languageToChildren.keySet());
161             languages.addAll(languageToOfficialChildren.keySet());
162             System.out.print("\ncode\tlanguage");
163             for (OfficialStatus status : OfficialStatus.values()) {
164                 System.out.print("\tNo\t" + status);
165             }
166             System.out.println();
167             for (String language : languages) {
168                 if (!testOrg.contains(language)) {
169                     continue;
170                 }
171                 System.out.print(language + "\t" + testInfo.getEnglish().getName(language));
172 
173                 M3<OfficialStatus, String, Boolean> officialChildren =
174                         languageToOfficialChildren.get(language);
175                 for (OfficialStatus status : OfficialStatus.values()) {
176                     Map<String, Boolean> children = officialChildren.get(status);
177                     if (children == null) {
178                         System.out.print("\t" + 0 + "\t");
179                     } else {
180                         System.out.print(
181                                 "\t" + children.size() + "\t" + show(children.keySet(), false));
182                     }
183                 }
184                 System.out.println();
185             }
186         }
187     }
188 
show(Set<String> joint, boolean showStatus)189     private String show(Set<String> joint, boolean showStatus) {
190         StringBuffer b = new StringBuffer();
191         for (String s : joint) {
192             if (b.length() != 0) {
193                 b.append(", ");
194             }
195             LanguageTagParser ltp = new LanguageTagParser().set(s);
196             String script = ltp.getScript();
197             if (script.length() != 0) {
198                 b.append(testInfo.getEnglish().getName(CLDRFile.SCRIPT_NAME, script));
199             }
200             String region = ltp.getRegion();
201             if (region.length() != 0) {
202                 if (script.length() != 0) {
203                     b.append("-");
204                 }
205                 b.append(testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME, region));
206             }
207             b.append(" [").append(s);
208             if (showStatus) {
209                 PopulationData data =
210                         dataInfo.getLanguageAndTerritoryPopulationData(ltp.getLanguage(), region);
211                 if (data == null) {
212                     data =
213                             dataInfo.getLanguageAndTerritoryPopulationData(
214                                     ltp.getLanguageScript(), region);
215                 }
216                 b.append("; ");
217                 b.append(data == null ? "?" : data.getOfficialStatus());
218             }
219             b.append("]");
220         }
221         return b.toString();
222     }
223 
removeScript(String lang)224     private String removeScript(String lang) {
225         if (!lang.contains("_")) {
226             return lang;
227         }
228         LanguageTagParser ltp = new LanguageTagParser().set(lang);
229         // String ls = ltp.getLanguageScript();
230         // if (defaultContents.contains(ls)) {
231         ltp.setScript("");
232         // }
233         return ltp.toString();
234     }
235 
TestLikelyAndDefaultConsistency()236     public void TestLikelyAndDefaultConsistency() {
237         LikelySubtags likelySubtags = new LikelySubtags();
238         LanguageTagParser ltp = new LanguageTagParser();
239         // find multiscript locales
240         Relation<String, String> base2scripts =
241                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
242         Map<String, String> parent2default = new TreeMap<>();
243         Map<String, String> default2parent = new TreeMap<>();
244         Relation<String, String> base2locales =
245                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
246 
247         Set<String> knownMultiScriptLanguages = new HashSet<>(Arrays.asList("bm", "ha"));
248         // get multiscript locales
249         for (String localeID : availableLocales) {
250             String script = ltp.set(localeID).getScript();
251             final String base = ltp.getLanguage();
252             if (!availableLocales.contains(base) && !base.equals("und")) {
253                 errln("Missing lang-subtag base " + base + " for: " + localeID);
254             }
255             base2locales.put(base, localeID);
256             if (!script.isEmpty() && !base.equals("en")) { // HACK for en
257                 base2scripts.put(base, script);
258             }
259             if (script.isEmpty() && knownMultiScriptLanguages.contains(base)) {
260                 base2scripts.put(base, dataInfo.getDefaultScript(base));
261             }
262         }
263 
264         // get default contents
265         for (String localeID : defaultContents) {
266             checkLocale(localeID, false);
267             String parent =
268                     LocaleIDParser.getParent(localeID); // was using getSimpleParent, not sure why
269             parent2default.put(parent, localeID);
270             default2parent.put(localeID, parent);
271             // if (!available.contains(simpleParent)) {
272             // // verify that base language has locale in CLDR (we don't want
273             // others)
274             // errln("Default contents contains locale not in CLDR:\t" +
275             // simpleParent);
276             // }
277         }
278 
279         // get likely
280         Map<String, String> likely2Maximized = likelySubtags.getToMaximized();
281         for (Entry<String, String> likelyAndMaximized : likely2Maximized.entrySet()) {
282             checkLocale(likelyAndMaximized.getKey(), true);
283             checkLocale(likelyAndMaximized.getValue(), true);
284         }
285         Map<String, String> exceptionDcLikely = new HashMap<>();
286         Map<String, String> exceptionLikelyDc = new HashMap<>();
287         for (String[] s :
288                 new String[][] {
289                     {"ar_001", "ar_Arab_EG"}, {"nb", "no_Latn_NO"},
290                 }) {
291             exceptionDcLikely.put(s[0], s[1]);
292             exceptionLikelyDc.put(s[1], s[0]);
293         }
294 
295         verifyDefaultContentsImplicationsForLikelySubtags(
296                 ltp, parent2default, likely2Maximized, exceptionDcLikely);
297 
298         verifyLikelySubtagsImplicationsForDefaultContents(
299                 ltp, base2scripts, parent2default, likely2Maximized, exceptionLikelyDc);
300 
301         verifyScriptsWithDefaultContents(ltp, base2scripts, parent2default, base2locales);
302     }
303 
TestParentLocaleRelationships()304     public void TestParentLocaleRelationships() {
305         // Testing invariant relationships between locales - See
306         // http://unicode.org/cldr/trac/ticket/5758
307 
308         /* Examples:
309         <parentLocale parent="no" locales="nb nn no_NO"/>
310         default content locales distinguish the child locale that has identical content, such as:
311         ebu_KE ee_GH el_GR en_Dsrt_US en_Shaw_GB en_US eo_001 es_ES et_EE eu_ES ewo_CM
312          */
313         Matcher langScript = PatternCache.get("^[a-z]{2,3}_[A-Z][a-z]{3}$").matcher("");
314         for (final String loc : availableLocales) {
315             // we only check locales of the form: lang_script
316             if (langScript.reset(loc).matches()) {
317                 if (ALLOW_DIFFERENT_PARENT_LOCALE.contains(loc)) {
318                     // Skip any in that list
319                     continue;
320                 }
321                 String languageSubtag = loc.split("_")[0];
322                 String expectedParent = languageSubtag;
323                 if (!defaultContents.contains(loc)) {
324                     expectedParent = "root";
325                 }
326                 String truncationParent = LocaleIDParser.getSimpleParent(loc);
327                 String actualParent = LocaleIDParser.getParent(loc);
328                 boolean hasExplicitParent = !actualParent.equals(truncationParent);
329 
330                 if (!actualParent.equals(expectedParent)) {
331                     errln(
332                             "Unexpected parent locale for locale "
333                                     + loc
334                                     + ". Expected: "
335                                     + expectedParent
336                                     + " Got: "
337                                     + actualParent
338                                     + " "
339                                     + ALLOW_DIFFERENT_PARENT_LOCALE_MESSAGE);
340                 }
341 
342                 if (hasExplicitParent && defaultContents.contains(loc)) {
343                     errln(
344                             "Locale "
345                                     + loc
346                                     + " can't have an explicit parent AND be a default content locale");
347                 }
348             }
349         }
350     }
351 
352     final String ALLOW_DIFFERENT_PARENT_LOCALE_MESSAGE =
353             "See ALLOW_DIFFERENT_PARENT_LOCALE in TestInheritance.java";
354     public final Set<String> ALLOW_DIFFERENT_PARENT_LOCALE =
355             Collections.unmodifiableSet(
356                     Sets.newHashSet(
357                             // Update this if additional locales have explicit parents in a
358                             // different language code
359 
360                             // Per CLDR-2698/14493 we allow nb,nn to have an explicit parent no
361                             // which is a different language code.
362                             "nn",
363                             "nb",
364                             // Per CLDR-15276 hi-Latn can have an explicit parent
365                             "hi_Latn"));
366 
TestParentLocaleInvariants()367     public void TestParentLocaleInvariants() {
368         // Testing invariant relationships in parent locales - See
369         // http://unicode.org/cldr/trac/ticket/7887
370         CLDRLocale cldrRoot = CLDRLocale.getInstance("root");
371         LikelySubtags likely = new LikelySubtags();
372         for (String loc : availableLocales) {
373             CLDRLocale cldrLoc = CLDRLocale.getInstance(loc);
374             CLDRLocale cldrParent = cldrLoc.getParent();
375             if (cldrParent != null) {
376                 CLDRLocale locLikely = CLDRLocale.getInstance(likely.maximize(loc));
377                 CLDRLocale parentLikely =
378                         CLDRLocale.getInstance(likely.maximize(cldrParent.toString()));
379                 final String locLang = cldrLoc.getLanguage();
380                 final String locScript = cldrLoc.getScript();
381                 final String locRegion = cldrLoc.getCountry();
382                 final String parentLang = cldrParent.getLanguage();
383                 final boolean parentIsRoot = cldrRoot.equals(cldrParent);
384                 if (!parentIsRoot
385                         && !ALLOW_DIFFERENT_PARENT_LOCALE.contains(loc)
386                         && !locLang.equals(parentLang)) {
387                     errln(
388                             "Parent locale ["
389                                     + cldrParent
390                                     + "] for locale ["
391                                     + loc
392                                     + "] cannot be a different language code. "
393                                     + ALLOW_DIFFERENT_PARENT_LOCALE_MESSAGE);
394                 }
395                 if (!parentIsRoot && !locLikely.getScript().equals(parentLikely.getScript())) {
396                     errln(
397                             "Parent locale ["
398                                     + cldrParent
399                                     + "] for locale ["
400                                     + loc
401                                     + "] cannot have a different script code.");
402                 }
403                 String cldrTruncationParent = LocaleIDParser.getSimpleParent(loc);
404                 boolean hasExplicitParent = !cldrTruncationParent.equals(cldrParent.toString());
405                 if (hasExplicitParent
406                         && parentIsRoot
407                         && locScript.length() == 0
408                         && locRegion.length() == 0
409                         && !ALLOW_DIFFERENT_PARENT_LOCALE.contains(loc)) {
410                     errln(
411                             "Base language locale ["
412                                     + loc
413                                     + "] cannot have an explicit parent ("
414                                     + cldrParent
415                                     + ") "
416                                     + ALLOW_DIFFERENT_PARENT_LOCALE_MESSAGE);
417                 }
418             }
419         }
420     }
421 
TestParentLocalesForCycles()422     public void TestParentLocalesForCycles() {
423         // Testing for cyclic relationships in parent locales - See
424         // http://unicode.org/cldr/trac/ticket/7887
425         for (String loc : availableLocales) {
426             String currentLoc = loc;
427             boolean foundError = false;
428             List<String> inheritanceChain = new ArrayList<>(Arrays.asList(loc));
429             while (currentLoc != null && !foundError) {
430                 currentLoc = LocaleIDParser.getParent(currentLoc);
431                 if (inheritanceChain.contains(currentLoc)) {
432                     foundError = true;
433                     inheritanceChain.add(currentLoc);
434                     errln(
435                             "Inheritance chain for locale ["
436                                     + loc
437                                     + "] contains a cyclic relationship. "
438                                     + inheritanceChain.toString());
439                 }
440                 inheritanceChain.add(currentLoc);
441             }
442         }
443     }
444 
verifyScriptsWithDefaultContents( LanguageTagParser ltp, Relation<String, String> base2scripts, Map<String, String> parent2default, Relation<String, String> base2locales)445     private void verifyScriptsWithDefaultContents(
446             LanguageTagParser ltp,
447             Relation<String, String> base2scripts,
448             Map<String, String> parent2default,
449             Relation<String, String> base2locales) {
450         Set<String> skip = Builder.with(new HashSet<String>()).addAll("root", "und").freeze();
451         Set<String> languagesWithOneOrLessLocaleScriptInCommon =
452                 new HashSet<>(Arrays.asList("bm", "ha", "hi", "ms", "iu", "mn"));
453         Set<String> baseLanguagesWhoseDefaultContentHasNoRegion =
454                 new HashSet<>(Arrays.asList("no"));
455         // for each base we have to have,
456         // if multiscript, we have default contents for base+script,
457         // base+script+region;
458         // otherwise base+region.
459         for (String base : base2locales.keySet()) {
460             if (skip.contains(base)) {
461                 continue;
462             }
463             String defaultContent = parent2default.get(base);
464             // Set<String> likely = base2likely.get(base);
465             // if (likely == null) {
466             // errln("Missing likely subtags for: " + base + "  " +
467             // suggestLikelySubtagFor(base));
468             // }
469             if (defaultContent == null) {
470                 errln("Missing default content for: " + base + "  " + suggestLikelySubtagFor(base));
471                 continue;
472             }
473             Set<String> scripts = base2scripts.get(base);
474             ltp.set(defaultContent);
475             String script = ltp.getScript();
476             String region = ltp.getRegion();
477             if (scripts == null || languagesWithOneOrLessLocaleScriptInCommon.contains(base)) {
478                 if (!script.isEmpty()) {
479                     errln(
480                             "Script should be empty in default content for: "
481                                     + base
482                                     + ","
483                                     + defaultContent);
484                 }
485                 if (region.isEmpty()
486                         && !baseLanguagesWhoseDefaultContentHasNoRegion.contains(base)) {
487                     errln(
488                             "Region must not be empty in default content for: "
489                                     + base
490                                     + ","
491                                     + defaultContent);
492                 }
493             } else {
494                 if (script.isEmpty()) {
495                     errln(
496                             "Script should not be empty in default content for: "
497                                     + base
498                                     + ","
499                                     + defaultContent);
500                 }
501                 if (!region.isEmpty()) {
502                     errln(
503                             "Region should be empty in default content for: "
504                                     + base
505                                     + ","
506                                     + defaultContent);
507                 }
508                 String defaultContent2 = parent2default.get(defaultContent);
509                 if (defaultContent2 == null) {
510                     errln("Missing default content for: " + defaultContent);
511                     continue;
512                 }
513                 ltp.set(defaultContent2);
514                 region = ltp.getRegion();
515                 if (region.isEmpty()) {
516                     errln(
517                             "Region must not be empty in default content for: "
518                                     + base
519                                     + ","
520                                     + defaultContent);
521                 }
522             }
523         }
524     }
525 
verifyLikelySubtagsImplicationsForDefaultContents( LanguageTagParser ltp, Relation<String, String> base2scripts, Map<String, String> parent2default, Map<String, String> likely2Maximized, Map<String, String> exceptionLikelyDc)526     private void verifyLikelySubtagsImplicationsForDefaultContents(
527             LanguageTagParser ltp,
528             Relation<String, String> base2scripts,
529             Map<String, String> parent2default,
530             Map<String, String> likely2Maximized,
531             Map<String, String> exceptionLikelyDc) {
532         // Now check invariants for all LikelySubtags implications for Default
533         // Contents
534         // a) suppose likely max for la_Scrp => la_Scrp_RG
535         // Then default contents la_Scrp => la_Scrp_RG
536         // b) suppose likely max for la_RG => la_Scrp_RG
537         // Then we can draw no conclusions // was default contents la_Scrp =>
538         // la_Scrp_RG
539         // c) suppose likely max for la => la_Scrp_RG
540         // Then default contents la => la_Scrp && la_Scrp => la_Scrp_RG
541         // or default contents la => la_RG && ! la_Scrp => la_Scrp_RG
542 
543         TreeSet<String> additionalDefaultContents = new TreeSet<>();
544 
545         for (Entry<String, String> entry : likely2Maximized.entrySet()) {
546             String source = entry.getKey();
547             String likelyMax = entry.getValue();
548             String sourceLang = ltp.set(source).getLanguage();
549             if (sourceLang.equals("und") || source.equals("zh_Hani") || source.equals("tl")) {
550                 continue;
551             }
552             String sourceScript = ltp.getScript();
553             String sourceRegion = ltp.getRegion();
554 
555             String likelyMaxLang = ltp.set(likelyMax).getLanguage();
556             String likelyMaxScript = ltp.getScript();
557             String likelyMaxRegion = ltp.getRegion();
558 
559             String dc = parent2default.get(source);
560             String possibleException = exceptionLikelyDc.get(likelyMax);
561             if (possibleException != null && possibleException.equals(dc)) {
562                 continue;
563             }
564             String likelyLangScript = likelyMaxLang + "_" + likelyMaxScript;
565             String dcFromLangScript = parent2default.get(likelyLangScript);
566 
567             boolean consistent = true;
568             String caseNumber = null;
569             if (consistent) {
570                 if (!sourceScript.isEmpty()) {
571                     caseNumber = "a";
572                     if (dc == null) {
573                         if (EXPECT_EQUALITY) {
574                             String expected = likelyMax;
575                             errln(
576                                     "Default contents null for "
577                                             + source
578                                             + ", expected:\t"
579                                             + expected);
580                             additionalDefaultContents.add(expected);
581                         }
582                         continue;
583                     }
584                     consistent = likelyMax.equals(dc);
585                 } else if (!sourceRegion.isEmpty()) { // a
586                     caseNumber = "b";
587                     // consistent = likelyMax.equals(dcFromLangScript);
588                 } else { // c
589                     caseNumber = "c";
590                     if (dc == null) {
591                         if (EXPECT_EQUALITY) {
592                             String expected =
593                                     base2scripts.get(source) == null
594                                             ? likelyMaxLang + "_" + likelyMaxRegion
595                                             : likelyMaxLang + "_" + likelyMaxScript;
596                             errln(
597                                     "Default contents null for "
598                                             + source
599                                             + ", expected:\t"
600                                             + expected);
601                             additionalDefaultContents.add(expected);
602                         }
603                         continue;
604                     }
605                     String dcScript = ltp.set(dc).getScript();
606                     consistent =
607                             likelyLangScript.equals(dc) && likelyMax.equals(dcFromLangScript)
608                                     || dcScript.isEmpty() && !likelyMax.equals(dcFromLangScript);
609                     // || dcScript.isEmpty() && dcRegion.equals(likelyMaxRegion)
610                     // && dcFromLangScript == null;
611                 }
612             }
613             if (!consistent) {
614                 errln(
615                         "default contents inconsistent with likely subtag: ("
616                                 + caseNumber
617                                 + ")"
618                                 + "\n\t"
619                                 + source
620                                 + " => (ls) "
621                                 + likelyMax
622                                 + "\n\t"
623                                 + source
624                                 + " => (dc) "
625                                 + dc
626                                 + "\n\t"
627                                 + likelyLangScript
628                                 + " => (dc) "
629                                 + dcFromLangScript);
630             }
631         }
632         if (additionalDefaultContents.size() != 0) {
633             errln(
634                     "Suggested additions to supplementalMetadata/../defaultContent:\n"
635                             + Joiner.on(" ").join(additionalDefaultContents));
636         }
637     }
638 
verifyDefaultContentsImplicationsForLikelySubtags( LanguageTagParser ltp, Map<String, String> parent2default, Map<String, String> likely2Maximized, Map<String, String> exceptionDcLikely)639     private void verifyDefaultContentsImplicationsForLikelySubtags(
640             LanguageTagParser ltp,
641             Map<String, String> parent2default,
642             Map<String, String> likely2Maximized,
643             Map<String, String> exceptionDcLikely) {
644         // Now check invariants for all Default Contents implications for
645         // LikelySubtags
646         // a) suppose default contents la => la_Scrp.
647         // Then the likely contents for la => la_Scrp_*
648         // b) suppose default contents la => la_RG.
649         // Then the likely contents for la => la_*_RG
650         // c) suppose default contents la_Scrp => la_Scrp_RG.
651         // Then the likely contents of la_Scrp => la_Scrp_RG OR likely contents
652         // for la => la_*_*
653         for (Entry<String, String> parentAndDefault : parent2default.entrySet()) {
654             String source = parentAndDefault.getKey();
655             String dc = parentAndDefault.getValue();
656             String likelyMax = likely2Maximized.get(source);
657 
658             // skip special exceptions
659             String possibleException = exceptionDcLikely.get(dc);
660             if (possibleException != null && possibleException.equals(likelyMax)) {
661                 continue;
662             }
663 
664             String sourceLang = ltp.set(source).getLanguage();
665             String sourceScript = ltp.getScript();
666             // there cannot be a sourceRegion
667 
668             String dcScript = ltp.set(dc).getScript();
669             String dcRegion = ltp.getRegion();
670 
671             String likelyMaxLang = "", likelyMaxScript = "", likelyMaxRegion = "";
672             if (likelyMax != null) {
673                 likelyMaxLang = ltp.set(likelyMax).getLanguage();
674                 likelyMaxScript = ltp.getScript();
675                 likelyMaxRegion = ltp.getRegion();
676             }
677 
678             String likelyMax2 = likely2Maximized.get(sourceLang);
679 
680             boolean consistent = true;
681 
682             if (sourceScript.isEmpty()) { // a or b
683                 if (!dcScript.isEmpty()) { // a
684                     consistent = likelyMaxLang.equals(source) && likelyMaxScript.equals(dcScript);
685                 } else { // b
686                     consistent = likelyMaxLang.equals(source) && likelyMaxRegion.equals(dcRegion);
687                 }
688             } else { // c
689                 consistent = dc.equals(likelyMax) || likelyMax2 != null;
690             }
691             if (!consistent) {
692                 errln(
693                         "likely subtag inconsistent with default contents: "
694                                 + "\n\t"
695                                 + source
696                                 + " =>( dc) "
697                                 + dc
698                                 + "\n\t"
699                                 + source
700                                 + " => (ls) "
701                                 + likelyMax
702                                 + (source.equals(sourceLang)
703                                         ? ""
704                                         : "\n\t" + sourceLang + " => (ls) " + likelyMax2));
705             }
706         }
707     }
708 
709     /**
710      * Suggest a likely subtag
711      *
712      * @param base
713      * @return
714      */
suggestLikelySubtagFor(String base)715     static String suggestLikelySubtagFor(String base) {
716         SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
717 
718         CLDRLocale loc = CLDRLocale.getInstance(base);
719 
720         if (!loc.getLanguage().equals(base)) {
721             return " (no suggestion- not a simple language locale)"; // no
722             // suggestion
723             // unless
724             // just
725             // a
726             // language
727             // locale.
728         }
729         Set<BasicLanguageData> basicData = sdi.getBasicLanguageData(base);
730 
731         for (BasicLanguageData bld : basicData) {
732             if (bld.getType()
733                     == org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type.primary) {
734                 Set<String> scripts = bld.getScripts();
735                 Set<String> territories = bld.getTerritories();
736 
737                 if (scripts.size() == 1) {
738                     if (territories.size() == 1) {
739                         return createSuggestion(
740                                 loc,
741                                 CLDRLocale.getInstance(
742                                         base
743                                                 + "_"
744                                                 + scripts.iterator().next()
745                                                 + "_"
746                                                 + territories.iterator().next()));
747                     }
748                 }
749                 return "(no suggestion - multiple scripts or territories)";
750             }
751         }
752         return ("(no suggestion- no data)");
753     }
754 
755     /** Format and return a suggested likelysubtag */
createSuggestion(CLDRLocale loc, CLDRLocale toLoc)756     private static String createSuggestion(CLDRLocale loc, CLDRLocale toLoc) {
757         return " Suggest this to likelySubtags.xml:        <likelySubtag from=\""
758                 + loc
759                 + "\" to=\""
760                 + toLoc
761                 + "\"/>\n"
762                 + "        <!--{ "
763                 + loc.getDisplayName()
764                 + "; ?; ? } => { "
765                 + loc.getDisplayName()
766                 + "; "
767                 + toLoc.toULocale().getDisplayScript()
768                 + "; "
769                 + toLoc.toULocale().getDisplayCountry()
770                 + " }-->";
771     }
772 
TestDeprecatedTerritoryDataLocaleIds()773     public void TestDeprecatedTerritoryDataLocaleIds() {
774         HashSet<String> checked = new HashSet<>();
775         for (String language : dataInfo.getLanguagesForTerritoriesPopulationData()) {
776             checkLocale(language, false); // checks la_Scrp and la
777             for (String region : dataInfo.getTerritoriesForPopulationData(language)) {
778                 if (!checked.contains(region)) {
779                     checkValidCode(language + "_" + region, "territory", region, false);
780                     checked.add(region);
781                 }
782             }
783         }
784         for (String language : dataInfo.getBasicLanguageDataLanguages()) {
785             checkLocale(language, false); // checks la_Scrp and la
786             Set<BasicLanguageData> data = dataInfo.getBasicLanguageData(language);
787             for (BasicLanguageData datum : data) {
788                 for (String script : datum.getScripts()) {
789                     checkValidCode(language + "_" + script, "script", script, false);
790                     checked.add(script);
791                 }
792                 for (String region : datum.getTerritories()) {
793                     checkValidCode(language + "_" + region, "territory", region, false);
794                     checked.add(region);
795                 }
796             }
797         }
798     }
799 
TestBasicLanguageDataAgainstScriptMetadata()800     public void TestBasicLanguageDataAgainstScriptMetadata() {
801         // the invariants are:
802         // if there is primary data, the script must be there
803         // otherwise it must be in the secondary
804         main:
805         for (String script : ScriptMetadata.getScripts()) {
806             Info info = ScriptMetadata.getInfo(script);
807             String language = info.likelyLanguage;
808             if (language.equals("und")) {
809                 continue;
810             }
811             Map<Type, BasicLanguageData> data = dataInfo.getBasicLanguageDataMap(language);
812             if (data == null) {
813                 logln(
814                         "Warning: ScriptMetadata has "
815                                 + language
816                                 + " for "
817                                 + script
818                                 + ","
819                                 + " but "
820                                 + language
821                                 + " is missing in language_script.txt");
822                 continue;
823             }
824             for (BasicLanguageData entry : data.values()) {
825                 if (entry.getScripts().contains(script)) {
826                     continue main;
827                 }
828                 continue;
829             }
830             logln(
831                     "Warning: ScriptMetadata has "
832                             + language
833                             + " for "
834                             + script
835                             + ","
836                             + " but "
837                             + language
838                             + " doesn't have "
839                             + script
840                             + " in language_script.txt");
841         }
842     }
843 
TestCldrFileConsistency()844     public void TestCldrFileConsistency() {
845         boolean haveErrors = false;
846         for (String locale : testInfo.getCldrFactory().getAvailable()) {
847             CLDRFile cldrFileToCheck = testInfo.getCLDRFile(locale, false);
848             int errors = 0;
849             for (String path : cldrFileToCheck) {
850                 if (!pathMatcher.reset(path).find()) {
851                     continue;
852                 }
853                 String fullPath = cldrFileToCheck.getFullXPath(path);
854                 if (fullPath == null) {
855                     // try again, for debugging
856                     fullPath = cldrFileToCheck.getFullXPath(path);
857                     String value = cldrFileToCheck.getStringValue(path);
858                     if (DEBUG) {
859                         errln(
860                                 "Invalid full path\t"
861                                         + locale
862                                         + ", "
863                                         + path
864                                         + ", "
865                                         + fullPath
866                                         + ", "
867                                         + value);
868                     }
869                     errors++;
870                     haveErrors = true;
871                 }
872             }
873             if (errors != 0) {
874                 errln(locale + (errors != 0 ? "\tinvalid getFullXPath() values:" + errors : ""));
875             } else {
876                 logln(locale);
877             }
878         }
879         if (haveErrors && !DEBUG) {
880             errln("Use -DDEBUG to see details");
881         }
882     }
883 
    // Supplemental data instance shared by the whole class.
    static SupplementalDataInfo info = SupplementalDataInfo.getInstance();
    // Per-test-instance parser. NOTE(review): checkLocale, TestLanguageTagParser and
    // verifyDefaultContentsImplicationsForLikelySubtags each declare their own "ltp"
    // (local or parameter), which shadows this field inside those methods.
    LanguageTagParser ltp = new LanguageTagParser();

    // Matcher for the pattern "//ldml.*/alias.*". It is created on an empty input, so it has to
    // be reset with a real path before it is queried.
    Matcher aliasMatcher = PatternCache.get("//ldml.*/alias.*").matcher("");
888 
minimize(Map<String, String> likelySubtags, String locale)889     private String minimize(Map<String, String> likelySubtags, String locale) {
890         String result = GenerateMaximalLocales.minimize(locale, likelySubtags, false);
891         if (result == null) {
892             LanguageTagParser ltp3 = new LanguageTagParser().set(locale);
893             List<String> variants = ltp3.getVariants();
894             Map<String, String> extensions = ltp3.getExtensions();
895             Set<String> emptySet = Collections.emptySet();
896             ltp3.setVariants(emptySet);
897             Map<String, String> emptyMap = Collections.emptyMap();
898             ltp3.setExtensions(emptyMap);
899             String newLocale = ltp3.toString();
900             result = GenerateMaximalLocales.minimize(newLocale, likelySubtags, false);
901             if (result != null) {
902                 ltp3.set(result);
903                 ltp3.setVariants(variants);
904                 ltp3.setExtensions(extensions);
905                 result = ltp3.toString();
906             }
907         }
908         return result;
909     }
910 
maximize(Map<String, String> likelySubtags, String locale)911     private String maximize(Map<String, String> likelySubtags, String locale) {
912         String result = GenerateMaximalLocales.maximize(locale, likelySubtags);
913         if (result == null) {
914             LanguageTagParser ltp3 = new LanguageTagParser().set(locale);
915             List<String> variants = ltp3.getVariants();
916             Map<String, String> extensions = ltp3.getExtensions();
917             Set<String> emptySet = Collections.emptySet();
918             ltp3.setVariants(emptySet);
919             Map<String, String> emptyMap = Collections.emptyMap();
920             ltp3.setExtensions(emptyMap);
921             String newLocale = ltp3.toString();
922             result = GenerateMaximalLocales.maximize(newLocale, likelySubtags);
923             if (result != null) {
924                 ltp3.set(result);
925                 ltp3.setVariants(variants);
926                 ltp3.setExtensions(extensions);
927                 result = ltp3.toString();
928             }
929         }
930         return result;
931     }
932 
933     // TODO move this into central utilities
equals(CharSequence string, int codePoint)934     public static boolean equals(CharSequence string, int codePoint) {
935         if (string == null) {
936             return false;
937         }
938         switch (string.length()) {
939             case 1:
940                 return codePoint == string.charAt(0);
941             case 2:
942                 return codePoint >= 0x10000 && codePoint == Character.codePointAt(string, 0);
943             default:
944                 return false;
945         }
946     }
947 
948     // TODO move this into central utilities
949 
    // Source of the valid codes per subtag type and of their descriptive data; used by
    // checkValidCode and getName.
    private static final StandardCodes STANDARD_CODES = StandardCodes.make();
    // Locale alias data, indexed as subtag type -> subtag -> <replacement codes, reason>,
    // e.g. "language" -> "sh" -> <{"sr_Latn"}, reason>. A subtag present here is treated as
    // deprecated by checkValidCode.
    private static final Map<String, Map<String, R2<List<String>, String>>> DEPRECATED_INFO =
            dataInfo.getLocaleAliasInfo();
953 
checkLocale(String localeID, boolean allowDeprecated)954     private void checkLocale(String localeID, boolean allowDeprecated) {
955         // verify that the localeID is valid
956         LanguageTagParser ltp = new LanguageTagParser().set(localeID);
957         String language = ltp.getLanguage();
958         String script = ltp.getScript();
959         String region = ltp.getRegion();
960         // TODO check variants, extensions also.
961         checkValidCode(localeID, "language", language, allowDeprecated);
962         checkValidCode(localeID, "script", script, allowDeprecated);
963         checkValidCode(localeID, "territory", region, allowDeprecated);
964     }
965 
checkValidCode( String localeID, String subtagType, String subtag, boolean allowDeprecated)966     private void checkValidCode(
967             String localeID, String subtagType, String subtag, boolean allowDeprecated) {
968         if (subtagType.equals("language")) {
969             if (subtag.equals("und")) {
970                 return;
971             }
972         } else {
973             if (subtag.isEmpty()) {
974                 return;
975             }
976         }
977         if (!STANDARD_CODES.getAvailableCodes(subtagType).contains(subtag)) {
978             errln("Locale " + localeID + " contains illegal " + showCode(subtagType, subtag));
979         } else if (!allowDeprecated) {
980             // "language" -> "sh" -> <{"sr_Latn"}, reason>
981             R2<List<String>, String> deprecatedInfo = DEPRECATED_INFO.get(subtagType).get(subtag);
982             if (deprecatedInfo != null) {
983                 errln(
984                         "Locale "
985                                 + localeID
986                                 + " contains deprecated "
987                                 + showCode(subtagType, subtag)
988                                 + " "
989                                 + deprecatedInfo.get1()
990                                 + "; suggest "
991                                 + showName(deprecatedInfo.get0(), subtagType));
992             }
993         }
994     }
995 
showName(List<String> deprecatedInfo, String subtagType)996     private String showName(List<String> deprecatedInfo, String subtagType) {
997         StringBuilder result = new StringBuilder();
998         for (String s : deprecatedInfo) {
999             result.append(showName(subtagType, s)).append(" ");
1000         }
1001         return result.toString();
1002     }
1003 
showCode(String subtagType, String subtag)1004     private String showCode(String subtagType, String subtag) {
1005         return subtagType + " code: " + showName(subtagType, subtag);
1006     }
1007 
showName(String subtagType, String subtag)1008     private String showName(String subtagType, String subtag) {
1009         return subtag + " (" + getName(subtagType, subtag) + ")";
1010     }
1011 
getName(String subtagType, String subtag)1012     private String getName(String subtagType, String subtag) {
1013         Map<String, String> data = STANDARD_CODES.getLangData(subtagType, subtag);
1014         if (data == null) {
1015             return "<no name>";
1016         }
1017         return data.get("Description");
1018     }
1019 
1020     // TODO move this into central utilities
equals(int codePoint, CharSequence string)1021     public static boolean equals(int codePoint, CharSequence string) {
1022         return equals(string, codePoint);
1023     }
1024 
1025     // TODO move this into central utilities
equals(Object a, Object b)1026     public static boolean equals(Object a, Object b) {
1027         return a == b ? true : a == null || b == null ? false : a.equals(b);
1028     }
1029 
1030     // TODO move this into central utilities
showDifferences(Map<K, V> a, Map<K, V> b)1031     private <K, V> String showDifferences(Map<K, V> a, Map<K, V> b) {
1032         StringBuilder result = new StringBuilder();
1033         Set<K> keys = new LinkedHashSet<>();
1034         keys.addAll(a.keySet());
1035         keys.addAll(b.keySet());
1036         for (K key : keys) {
1037             if (!a.containsKey(key)) {
1038                 result.append(key).append("→‹").append(a.get(key)).append("›,∅; ");
1039             } else if (!b.containsKey(key)) {
1040                 result.append(key).append("→∅,‹").append(b.get(key)).append("›; ");
1041             } else {
1042                 V aKey = a.get(key);
1043                 V bKey = b.get(key);
1044                 if (!equals(aKey, bKey)) {
1045                     result.append(key)
1046                             .append("→‹")
1047                             .append(a.get(key))
1048                             .append("›,‹")
1049                             .append(b.get(key))
1050                             .append("›; ");
1051                 }
1052             }
1053         }
1054         return result.toString();
1055     }
1056 
TestLanguageTagParser()1057     public void TestLanguageTagParser() {
1058         LanguageTagParser ltp = new LanguageTagParser();
1059         ltp.set("en-Cyrl-US");
1060         assertEquals(null, "en", ltp.getLanguage());
1061         assertEquals(null, "en_Cyrl", ltp.getLanguageScript());
1062         assertEquals(null, "Cyrl", ltp.getScript());
1063         assertEquals(null, "US", ltp.getRegion());
1064         try {
1065             ltp.set("$");
1066             assertFalse("expected exception", true);
1067         } catch (Exception e) {
1068             logln(e.getMessage());
1069         }
1070     }
1071 
TestParentChain()1072     public void TestParentChain() {
1073         String[][] tests = {
1074             {"en_DE", "[en_150, en_001, en, root]"},
1075             {"fr_CA", "[fr, root]"},
1076             {"fr", "[root]"},
1077             {"root", "[]"},
1078         };
1079 
1080         for (String[] test : tests) {
1081             assertEquals(test[0], test[1], LocaleIDParser.getParentChain(test[0]).toString());
1082         }
1083     }
1084 }
1085