xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateMaximalLocales.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.collect.ImmutableList;
5 import com.google.common.collect.ImmutableMap;
6 import com.google.common.collect.ImmutableSet;
7 import com.ibm.icu.impl.Relation;
8 import com.ibm.icu.impl.Row;
9 import com.ibm.icu.impl.Row.R2;
10 import com.ibm.icu.impl.Row.R3;
11 import com.ibm.icu.impl.Row.R4;
12 import com.ibm.icu.lang.UScript;
13 import com.ibm.icu.text.Collator;
14 import com.ibm.icu.text.NumberFormat;
15 import com.ibm.icu.text.UTF16;
16 import com.ibm.icu.text.UnicodeSet;
17 import com.ibm.icu.text.UnicodeSetIterator;
18 import com.ibm.icu.util.ULocale;
19 import java.io.BufferedReader;
20 import java.io.File;
21 import java.io.IOException;
22 import java.io.PrintWriter;
23 import java.nio.file.Files;
24 import java.util.Arrays;
25 import java.util.BitSet;
26 import java.util.Collection;
27 import java.util.Comparator;
28 import java.util.HashMap;
29 import java.util.HashSet;
30 import java.util.LinkedHashSet;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.Map.Entry;
34 import java.util.Set;
35 import java.util.TreeMap;
36 import java.util.TreeSet;
37 import org.unicode.cldr.draft.FileUtilities;
38 import org.unicode.cldr.draft.ScriptMetadata;
39 import org.unicode.cldr.draft.ScriptMetadata.Info;
40 import org.unicode.cldr.util.Builder;
41 import org.unicode.cldr.util.CLDRConfig;
42 import org.unicode.cldr.util.CLDRFile;
43 import org.unicode.cldr.util.CLDRLocale;
44 import org.unicode.cldr.util.CLDRPaths;
45 import org.unicode.cldr.util.CldrUtility;
46 import org.unicode.cldr.util.Containment;
47 import org.unicode.cldr.util.Counter;
48 import org.unicode.cldr.util.Factory;
49 import org.unicode.cldr.util.Iso3166Data;
50 import org.unicode.cldr.util.Iso639Data;
51 import org.unicode.cldr.util.Iso639Data.Scope;
52 import org.unicode.cldr.util.LanguageTagParser;
53 import org.unicode.cldr.util.LocaleIDParser;
54 import org.unicode.cldr.util.LocaleNames;
55 import org.unicode.cldr.util.Organization;
56 import org.unicode.cldr.util.PatternCache;
57 import org.unicode.cldr.util.SimpleFactory;
58 import org.unicode.cldr.util.StandardCodes;
59 import org.unicode.cldr.util.StandardCodes.LstrType;
60 import org.unicode.cldr.util.SupplementalDataInfo;
61 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
62 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type;
63 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
64 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
65 import org.unicode.cldr.util.Validity;
66 import org.unicode.cldr.util.Validity.Status;
67 
68 /**
69  * Problems: "und_Hani", "zh_Hani", "und_Sinh", "si_Sinh"
70  *
71  * @author markdavis
72  */
73 public class GenerateMaximalLocales {
74 
75     private static final Map<String, Status> LANGUAGE_CODE_TO_STATUS =
76             Validity.getInstance().getCodeToStatus(LstrType.language);
77 
78     private static final String TEMP_UNKNOWN_REGION = "XZ";
79 
80     private static final String DEBUG_ADD_KEY = "und_Latn_ZA";
81 
82     private static final boolean SHOW_ADD =
83             CldrUtility.getProperty("GenerateMaximalLocalesDebug", false);
84     private static final boolean SUPPRESS_CHANGES =
85             CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false);
86     private static final boolean SHOW_CONTAINERS = false;
87 
88     private static final boolean SHOW_ALL_LANGUAGE_CODES = false;
89     private static final boolean SHOW_DETAILED = false;
90     private static final boolean SHOW_INCLUDED_EXCLUDED = false;
91 
/**
 * Output formats this tool can emit. The constant names are matched against the upper-cased
 * {@code OutputStyle} property value, so they must stay spelled exactly as they are.
 */
enum OutputStyle { PLAINTEXT, C, C_ALT, XML }
98 
99     private static OutputStyle OUTPUT_STYLE =
100             OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML").toUpperCase());
101 
102     // set based on above
103     private static final String SEPARATOR =
104             OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT
105                     ? CldrUtility.LINE_SEPARATOR
106                     : "\t";
107     private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? "-" : "_";
108     // private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT;
109 
110     private static final boolean tryDifferent = true;
111 
112     private static final File list[] = {
113         new File(CLDRPaths.MAIN_DIRECTORY),
114         new File(CLDRPaths.SEED_DIRECTORY),
115         new File(CLDRPaths.EXEMPLARS_DIRECTORY)
116     };
117 
118     private static Factory factory = SimpleFactory.make(list, ".*");
119     private static Factory mainFactory = CLDRConfig.getInstance().getCldrFactory();
120     private static SupplementalDataInfo supplementalData =
121             SupplementalDataInfo.getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY);
122     private static StandardCodes standardCodes = StandardCodes.make();
123     private static CLDRFile english = factory.make("en", false);
124     static Relation<String, String> cldrContainerToLanguages =
125             Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
126 
127     static {
128         for (CLDRLocale locale :
129                 ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) {
130             String region = locale.getCountry();
131             if (region == null || region.isEmpty() || Containment.isLeaf(region)) {
132                 continue;
133             }
cldrContainerToLanguages.put(region, locale.getLanguage())134             cldrContainerToLanguages.put(region, locale.getLanguage());
135         }
cldrContainerToLanguages.freeze()136         cldrContainerToLanguages.freeze();
137         System.out.println("Keep containers " + cldrContainerToLanguages);
138     }
139 
140     private static final List<String> KEEP_TARGETS =
141             Arrays.asList("und_Arab_PK", "und_Latn_ET", "hi_Latn");
142     private static final ImmutableSet<String> deprecatedISONotInLST = ImmutableSet.of("scc", "scr");
143 
144     /**
145      * This is the simplest way to override, by supplying the max value. It gets a very low weight,
146      * so doesn't override any stronger value.
147      */
148     private static final String[] MAX_ADDITIONS =
149             new String[] {
150                 "bss_Latn_CM",
151                 "gez_Ethi_ET",
152                 "ken_Latn_CM",
153                 "und_Arab_PK",
154                 "wa_Latn_BE",
155                 "fub_Arab_CM",
156                 "fuf_Latn_GN",
157                 "kby_Arab_NE",
158                 "kdh_Latn_TG",
159                 "apd_Arab_TG",
160                 "zlm_Latn_TG",
161                 "cr_Cans_CA",
162                 "hif_Latn_FJ",
163                 "gon_Telu_IN",
164                 "lzz_Latn_TR",
165                 "lif_Deva_NP",
166                 "unx_Beng_IN",
167                 "unr_Beng_IN",
168                 "ttt_Latn_AZ",
169                 "pnt_Grek_GR",
170                 "tly_Latn_AZ",
171                 "tkr_Latn_AZ",
172                 "bsq_Bass_LR",
173                 "ccp_Cakm_BD",
174                 "blt_Tavt_VN",
175                 "rhg_Arab_MM",
176                 "rhg_Rohg_MM",
177                 "clc_Latn_CA",
178                 "crg_Latn_CA",
179                 "hur_Latn_CA",
180                 "kwk_Latn_CA",
181                 "lil_Latn_CA",
182                 "ojs_Cans_CA",
183                 "oka_Latn_CA",
184                 "pqm_Latn_CA",
185                 "hi_Latn_IN",
186                 "no_Latn_NO",
187                 "tok_Latn_001",
188                 "prg_Latn_PL",
189                 "ie_Latn_EE",
190             };
191 
192     /**
193      * The following overrides MASH the final values, so they may not result in consistent results.
194      * Safer is to add to MAX_ADDITIONS. However, if you add, add both the language and
195      * language+script mappings.
196      */
197     // Many of the overrides below can be removed once the language/pop/country data is updated.
198     private static final Map<String, String> LANGUAGE_OVERRIDES =
199             CldrUtility.asMap(
200                     new String[][] {
201                         {"cic", "cic_Latn_US"},
202                         {"cic_Latn", "cic_Latn_US"},
203                         {"eo", "eo_Latn_001"},
204                         {"eo_Latn", "eo_Latn_001"},
205                         {"es", "es_Latn_ES"},
206                         {"es_Latn", "es_Latn_ES"},
207                         {"ff_BF", "ff_Latn_BF"},
208                         {"ff_GM", "ff_Latn_GM"},
209                         {"ff_GH", "ff_Latn_GH"},
210                         {"ff_GW", "ff_Latn_GW"},
211                         {"ff_LR", "ff_Latn_LR"},
212                         {"ff_NE", "ff_Latn_NE"},
213                         {"ff_NG", "ff_Latn_NG"},
214                         {"ff_SL", "ff_Latn_SL"},
215                         {"ff_Adlm", "ff_Adlm_GN"},
216                         {"ia", "ia_Latn_001"},
217                         {"ia_Latn", "ia_Latn_001"},
218                         {"io", "io_Latn_001"},
219                         {"io_Latn", "io_Latn_001"},
220                         {"jbo", "jbo_Latn_001"},
221                         {"jbo_Latn", "jbo_Latn_001"},
222                         {"ku_Arab", "ku_Arab_IQ"},
223                         {"lrc", "lrc_Arab_IR"},
224                         {"lrc_Arab", "lrc_Arab_IR"},
225                         {"man", "man_Latn_GM"},
226                         {"man_Latn", "man_Latn_GM"},
227                         {"mas", "mas_Latn_KE"},
228                         {"mas_Latn", "mas_Latn_KE"},
229                         {"mn", "mn_Cyrl_MN"},
230                         {"mn_Cyrl", "mn_Cyrl_MN"},
231                         {"mro", "mro_Mroo_BD"},
232                         {"mro_BD", "mro_Mroo_BD"},
233                         {"ms_Arab", "ms_Arab_MY"},
234                         {"pap", "pap_Latn_CW"},
235                         {"pap_Latn", "pap_Latn_CW"},
236                         {
237                             "rif", "rif_Latn_MA"
238                         }, // https://unicode-org.atlassian.net/browse/CLDR-14962?focusedCommentId=165053
239                         {"rif_Latn", "rif_Latn_MA"},
240                         {"rif_Tfng", "rif_Tfng_MA"},
241                         {"rif_MA", "rif_Latn_MA"}, // Ibid
242                         {"shi", "shi_Tfng_MA"},
243                         {"shi_Tfng", "shi_Tfng_MA"},
244                         {"shi_MA", "shi_Tfng_MA"},
245                         {"sr_Latn", "sr_Latn_RS"},
246                         {"ss", "ss_Latn_ZA"},
247                         {"ss_Latn", "ss_Latn_ZA"},
248                         {"swc", "swc_Latn_CD"},
249                         {"ti", "ti_Ethi_ET"},
250                         {"ti_Ethi", "ti_Ethi_ET"},
251                         {LocaleNames.UND, "en_Latn_US"},
252                         {"und_Adlm", "ff_Adlm_GN"},
253                         {"und_Adlm_GN", "ff_Adlm_GN"},
254                         {"und_Arab", "ar_Arab_EG"},
255                         {"und_Arab_PK", "ur_Arab_PK"},
256                         {"und_Bopo", "zh_Bopo_TW"},
257                         {"und_Deva_FJ", "hif_Deva_FJ"},
258                         {"und_EZ", "de_Latn_EZ"},
259                         {"und_Hani", "zh_Hani_CN"},
260                         {"und_Hani_CN", "zh_Hani_CN"},
261                         {"und_Kana", "ja_Kana_JP"},
262                         {"und_Kana_JP", "ja_Kana_JP"},
263                         {"und_Latn", "en_Latn_US"},
264                         {"und_001", "en_Latn_US"}, // to not be overridden by tok_Latn_001
265                         {"und_Latn_001", "en_Latn_US"}, // to not be overridden by tok_Latn_001
266                         {"und_Latn_ET", "en_Latn_ET"},
267                         {"und_Latn_NE", "ha_Latn_NE"},
268                         {"und_Latn_PH", "fil_Latn_PH"},
269                         {"und_ML", "bm_Latn_ML"},
270                         {"und_Latn_ML", "bm_Latn_ML"},
271                         {"und_MU", "mfe_Latn_MU"},
272                         {"und_NE", "ha_Latn_NE"},
273                         {"und_PH", "fil_Latn_PH"},
274                         {"und_PK", "ur_Arab_PK"},
275                         {"und_SO", "so_Latn_SO"},
276                         {"und_SS", "en_Latn_SS"},
277                         {"und_TK", "tkl_Latn_TK"},
278                         {"und_UN", "en_Latn_UN"},
279                         {"und_005", "pt_Latn_BR"},
280                         {"vo", "vo_Latn_001"},
281                         {"vo_Latn", "vo_Latn_001"},
282                         {"yi", "yi_Hebr_001"},
283                         {"yi_Hebr", "yi_Hebr_001"},
284                         {"yue", "yue_Hant_HK"},
285                         {"yue_Hant", "yue_Hant_HK"},
286                         {"yue_Hans", "yue_Hans_CN"},
287                         {"yue_CN", "yue_Hans_CN"},
288                         {"zh_Hani", "zh_Hani_CN"},
289                         {"zh_Bopo", "zh_Bopo_TW"},
290                         {"ccp", "ccp_Cakm_BD"},
291                         {"ccp_Cakm", "ccp_Cakm_BD"},
292                         {"und_Cakm", "ccp_Cakm_BD"},
293                         {"cu_Glag", "cu_Glag_BG"},
294                         {"sd_Khoj", "sd_Khoj_IN"},
295                         {"lif_Limb", "lif_Limb_IN"},
296                         {"grc_Linb", "grc_Linb_GR"},
297                         {"arc_Nbat", "arc_Nbat_JO"},
298                         {"arc_Palm", "arc_Palm_SY"},
299                         {"pal_Phlp", "pal_Phlp_CN"},
300                         {"en_Shaw", "en_Shaw_GB"},
301                         {"sd_Sind", "sd_Sind_IN"},
302                         {"und_Brai", "fr_Brai_FR"}, // hack
303                         {"und_Hanb", "zh_Hanb_TW"}, // Special script code
304                         {"zh_Hanb", "zh_Hanb_TW"}, // Special script code
305                         {"und_Jamo", "ko_Jamo_KR"}, // Special script code
306 
307                         // {"und_Cyrl_PL", "be_Cyrl_PL"},
308 
309                         //        {"cr", "cr_Cans_CA"},
310                         //        {"hif", "hif_Latn_FJ"},
311                         //        {"gon", "gon_Telu_IN"},
312                         //        {"lzz", "lzz_Latn_TR"},
313                         //        {"lif", "lif_Deva_NP"},
314                         //        {"unx", "unx_Beng_IN"},
315                         //        {"unr", "unr_Beng_IN"},
316                         //        {"ttt", "ttt_Latn_AZ"},
317                         //        {"pnt", "pnt_Grek_GR"},
318                         //        {"tly", "tly_Latn_AZ"},
319                         //        {"tkr", "tkr_Latn_AZ"},
320                         //        {"bsq", "bsq_Bass_LR"},
321                         //        {"ccp", "ccp_Cakm_BD"},
322                         //        {"blt", "blt_Tavt_VN"},
323                         //        { "mis_Medf", "mis_Medf_NG" },
324 
325                         {"ku_Yezi", "ku_Yezi_GE"},
326                         {"und_EU", "en_Latn_IE"},
327                         {"hnj", "hnj_Hmnp_US"}, // preferred lang/script in CLDR
328                         {"hnj_Hmnp", "hnj_Hmnp_US"},
329                         {"und_Hmnp", "hnj_Hmnp_US"},
330                         {"rhg", "rhg_Rohg_MM"}, // preferred lang/script in CLDR
331                         {"rhg_Arab", "rhg_Arab_MM"},
332                         {"und_Arab_MM", "rhg_Arab_MM"},
333                         {"sd_IN", "sd_Deva_IN"}, // preferred in CLDR
334                         // { "sd_Deva", "sd_Deva_IN"},
335                         {"und_Cpmn", "und_Cpmn_CY"},
336                         {"oc_ES", "oc_Latn_ES"},
337                         {"os", "os_Cyrl_GE"},
338                         {"os_Cyrl", "os_Cyrl_GE"},
339                     });
340 
341     /**
342      * The following supplements the suppress-script. It overrides info from exemplars and the
343      * locale info.
344      */
345     private static String[][] SpecialScripts = {
346         {"zh", "Hans"}, // Hans (not Hani)
347         {"yue", "Hant"}, // Hans (not Hani)
348         {"chk", "Latn"}, // Chuukese (Micronesia)
349         {"fil", "Latn"}, // Filipino (Philippines)"
350         {"ko", "Kore"}, // Korean (North Korea)
351         {"ko_KR", "Kore"}, // Korean (North Korea)
352         {"pap", "Latn"}, // Papiamento (Netherlands Antilles)
353         {"pau", "Latn"}, // Palauan (Palau)
354         {"su", "Latn"}, // Sundanese (Indonesia)
355         {"tet", "Latn"}, // Tetum (East Timor)
356         {"tk", "Latn"}, // Turkmen (Turkmenistan)
357         {"ty", "Latn"}, // Tahitian (French Polynesia)
358         {"ja", "Jpan"}, // Special script for japan
359         {LocaleNames.UND, "Latn"}, // Ultimate fallback
360     };
361 
362     private static Map<String, String> localeToScriptCache = new TreeMap<>();
363 
364     static {
365         for (String language : standardCodes.getAvailableCodes("language")) {
366             Map<String, String> info = standardCodes.getLangData("language", language);
367             String script = info.get("Suppress-Script");
368             if (script != null) {
localeToScriptCache.put(language, script)369                 localeToScriptCache.put(language, script);
370             }
371         }
372         for (String[] pair : SpecialScripts) {
localeToScriptCache.put(pair[0], pair[1])373             localeToScriptCache.put(pair[0], pair[1]);
374         }
375     }
376 
377     private static Map<String, String> FALLBACK_SCRIPTS;
378 
379     static {
380         LanguageTagParser additionLtp = new LanguageTagParser();
381         Map<String, String> _FALLBACK_SCRIPTS = new TreeMap<>();
382         for (String addition : MAX_ADDITIONS) {
383             additionLtp.set(addition);
384             String lan = additionLtp.getLanguage();
_FALLBACK_SCRIPTS.put(lan, additionLtp.getScript())385             _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript());
386         }
387         FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS);
388     }
389 
390     private static int errorCount;
391 
main(String[] args)392     public static void main(String[] args) throws IOException {
393         if (true) {
394             throw new IllegalArgumentException("Don't run this tool until it is fixed");
395         }
396 
397         printDefaultLanguagesAndScripts();
398 
399         Map<String, String> toMaximized = new TreeMap<>();
400 
401         tryDifferentAlgorithm(toMaximized);
402 
403         minimize(toMaximized);
404 
405         // HACK TEMP_UNKNOWN_REGION
406         // this is to get around the removal of items with ZZ in minimize.
407         // probably cleaner way to do it, but this provides control over just those we want to
408         // retain.
409         Set<String> toRemove = new TreeSet<>();
410         Map<String, String> toFix = new TreeMap<>();
411         for (Entry<String, String> entry : toMaximized.entrySet()) {
412             String key = entry.getKey();
413             String value = entry.getValue();
414             if (key.contains(TEMP_UNKNOWN_REGION)) {
415                 toRemove.add(key);
416             } else if (value.contains(TEMP_UNKNOWN_REGION)) {
417                 toFix.put(key, value.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION));
418             }
419         }
420         for (String key : toRemove) {
421             toMaximized.remove(key);
422         }
423         toMaximized.putAll(toFix);
424 
425         Map<String, String> oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags();
426         Set<String> changes =
427                 compareMapsAndFixNew(
428                         "*WARNING* Likely Subtags: ",
429                         oldLikely,
430                         toMaximized,
431                         "ms_Arab",
432                         "ms_Arab_ID");
433         System.out.println(Joiner.on("\n").join(changes));
434 
435         if (OUTPUT_STYLE == OutputStyle.C_ALT) {
436             doAlt(toMaximized);
437         }
438 
439         if (SHOW_ADD)
440             System.out.println(
441                     "/*"
442                             + CldrUtility.LINE_SEPARATOR
443                             + " To Maximize:"
444                             + CldrUtility.LINE_SEPARATOR
445                             + " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing."
446                             + CldrUtility.LINE_SEPARATOR
447                             + " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'."
448                             + CldrUtility.LINE_SEPARATOR
449                             + " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions"
450                             + CldrUtility.LINE_SEPARATOR
451                             + " Try each of the following in order (where the field exists)"
452                             + CldrUtility.LINE_SEPARATOR
453                             + "   Lookup language-script-region. If in the table, return the result + variants"
454                             + CldrUtility.LINE_SEPARATOR
455                             + "   Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants"
456                             + CldrUtility.LINE_SEPARATOR
457                             + "   Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants"
458                             + CldrUtility.LINE_SEPARATOR
459                             + "   Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants"
460                             + CldrUtility.LINE_SEPARATOR
461                             + CldrUtility.LINE_SEPARATOR
462                             + " Example: Input is zh-ZZZZ-SG."
463                             + CldrUtility.LINE_SEPARATOR
464                             + " Normalize to zh-SG. Lookup in table. No match."
465                             + CldrUtility.LINE_SEPARATOR
466                             + " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG."
467                             + CldrUtility.LINE_SEPARATOR
468                             + CldrUtility.LINE_SEPARATOR
469                             + " To Minimize:"
470                             + CldrUtility.LINE_SEPARATOR
471                             + " First get max = maximize(input)."
472                             + CldrUtility.LINE_SEPARATOR
473                             + " Then for trial in {language, language-region, language-script}"
474                             + CldrUtility.LINE_SEPARATOR
475                             + "     If maximize(trial) == max, then return trial."
476                             + CldrUtility.LINE_SEPARATOR
477                             + " If you don't get a match, return max."
478                             + CldrUtility.LINE_SEPARATOR
479                             + CldrUtility.LINE_SEPARATOR
480                             + " Example: Input is zh-Hant. Maximize to get zh-Hant-TW."
481                             + CldrUtility.LINE_SEPARATOR
482                             + " zh => zh-Hans-CN. No match, so continue."
483                             + CldrUtility.LINE_SEPARATOR
484                             + " zh-TW => zh-Hans-TW. Match, so return zh-TW."
485                             + CldrUtility.LINE_SEPARATOR
486                             + CldrUtility.LINE_SEPARATOR
487                             + " (A variant of this uses {language, language-script, language-region}): that is, tries script before language."
488                             + CldrUtility.LINE_SEPARATOR
489                             + " toMaximal size:\t"
490                             + toMaximized.size()
491                             + CldrUtility.LINE_SEPARATOR
492                             + "*/");
493 
494         final File newLikelySubtags = printLikelySubtags(toMaximized);
495 
496         printDefaultContent(toMaximized);
497 
498         // Do this here so the two "Copying…" messages show up together.
499         if (OUTPUT_STYLE == OutputStyle.XML) {
500             final File oldLikelySubtags =
501                     CLDRConfig.getInstance().getEnglish().getSupplementalFile("likelySubtags.xml");
502             System.out.println("Copying " + newLikelySubtags + " to " + oldLikelySubtags);
503             oldLikelySubtags.delete();
504             Files.copy(newLikelySubtags.toPath(), oldLikelySubtags.toPath());
505             System.err.println("TODO: Please revert removal of 'sil1' entries, see CLDR-16380");
506         }
507 
508         System.out.println(
509                 CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR);
510 
511         System.exit(errorCount > 0 ? 1 : 0);
512     }
513 
514     static class RowData implements Comparable<RowData> {
515         OfficialStatus os;
516         String name;
517         Long pop;
518 
RowData(OfficialStatus os, String name, Long pop)519         public RowData(OfficialStatus os, String name, Long pop) {
520             this.os = os;
521             this.name = name;
522             this.pop = pop;
523         }
524 
getStatus()525         public OfficialStatus getStatus() {
526             // TODO Auto-generated method stub
527             return os;
528         }
529 
getName()530         public CharSequence getName() {
531             // TODO Auto-generated method stub
532             return name;
533         }
534 
getLiteratePopulation()535         public Long getLiteratePopulation() {
536             // TODO Auto-generated method stub
537             return pop;
538         }
539 
540         @Override
compareTo(RowData o)541         public int compareTo(RowData o) {
542             // TODO Auto-generated method stub
543             int result = os.compareTo(o.os);
544             if (result != 0) return -result;
545             long result2 = pop - o.pop;
546             if (result2 != 0) return result2 < 0 ? 1 : -1;
547             return name.compareTo(o.name);
548         }
549 
550         @Override
equals(Object o)551         public boolean equals(Object o) {
552             return 0 == compareTo((RowData) o);
553         }
554 
555         @Override
hashCode()556         public int hashCode() {
557             throw new UnsupportedOperationException();
558         }
559     }
560 
printDefaultLanguagesAndScripts()561     private static void printDefaultLanguagesAndScripts() {
562 
563         final int minTotalPopulation = 10000000;
564         final int minTerritoryPopulation = 1000000;
565         final double minTerritoryPercent = 1.0 / 3;
566         Map<String, Set<RowData>> languageToReason = new TreeMap<>();
567         Counter<String> languageToLiteratePopulation = new Counter<>();
568         NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH);
569         nf.setGroupingUsed(true);
570         LanguageTagParser ltp = new LanguageTagParser();
571         LikelySubtags likelySubtags = new LikelySubtags();
572         /*
573          * A. X is a qualified language**, and at least one of the following is true:
574          *
575          * 1. X is has official status* in any country
576          * 2. X exceeds a threshold population† of literate users worldwide: 1M
577          * 3. X exceeds a threshold population† in some country Z: 100K and 20% of Z's population†.
578          *
579          * B. X is an exception explicitly approved by the committee or X has minimal
580          * language coverage‡ in CLDR itself.
581          * C. The language is in the CLDR-target locales
582          */
583         OfficialStatus minimalStatus =
584                 OfficialStatus.official_regional; // OfficialStatus.de_facto_official;
585         Map<String, String> languages = new TreeMap<>();
586         for (String language : standardCodes.getAvailableCodes("language")) {
587             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
588             String result = english.getStringValue(path);
589             if (result != null) {
590                 languages.put(language, result);
591             }
592         }
593 
594         if (SHOW_ALL_LANGUAGE_CODES) {
595             for (String language : languages.keySet()) {
596                 System.out.println(language + "\t" + languages.get(language));
597             }
598         } else {
599             System.out.println(
600                     "- GenerateMaximalLocales.java: SHOW_ALL_LANGUAGE_CODES=true to show all language codes");
601         }
602 
603         // also CLDR-target locales
604         final Set<String> CLDRMainLanguages =
605                 new TreeSet<>(StandardCodes.make().getLocaleCoverageLocales(Organization.cldr));
606 
607         for (String territory : supplementalData.getTerritoriesWithPopulationData()) {
608             if (Iso3166Data.isRegionCodeNotForTranslation(territory)) {
609                 System.out.println(
610                         "Iso3166Data.isRegionCodeNotForTranslation("
611                                 + territory
612                                 + ") true, skipping");
613                 continue;
614             }
615             PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory);
616             double territoryPopulation = territoryPop.getLiteratePopulation();
617             for (String languageScript :
618                     supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) {
619                 PopulationData popData =
620                         supplementalData.getLanguageAndTerritoryPopulationData(
621                                 languageScript, territory);
622                 ltp.set(languageScript);
623                 String language = ltp.getLanguage();
624                 //                if (ltp.getScript().isEmpty()) {
625                 //                    String max = likelySubtags.maximize(languageScript);
626                 //                    if (max != null) {
627                 //                        ltp.set(max).setRegion("");
628                 //                        languageScript = ltp.toString();
629                 //                    }
630                 //                }
631                 boolean add = false;
632                 // #1
633                 OfficialStatus status = popData.getOfficialStatus();
634                 if (status.compareTo(minimalStatus) >= 0) {
635                     add = true;
636                 }
637                 long literatePopulation = getWritingPopulation(popData);
638                 // #2
639                 languageToLiteratePopulation.add(language, literatePopulation);
640                 // #3
641                 if (literatePopulation > minTerritoryPopulation
642                         && literatePopulation > minTerritoryPercent * territoryPopulation) {
643                     add = true;
644                 }
645                 if (add == false && CLDRMainLanguages.contains(language)) {
646                     add = true;
647                 }
648                 if (add) {
649                     add(languageToReason, language, territory, status, literatePopulation);
650                     Set<String> containers = Containment.leafToContainer(territory);
651                     if (containers == null) {
652                         throw new NullPointerException(
653                                 "Containment.leafToContainer(" + territory + ") is null");
654                     }
655                     // Add the containing regions
656                     for (String container : containers) {
657                         add(
658                                 languageToReason,
659                                 language,
660                                 container,
661                                 OfficialStatus.unknown,
662                                 literatePopulation);
663                     }
664                 }
665             }
666         }
667         // #2, now that we have the data
668         for (String language : languageToLiteratePopulation.keySet()) {
669             long totalPop = languageToLiteratePopulation.getCount(language);
670             if (totalPop > minTotalPopulation) {
671                 add(languageToReason, language, "001", OfficialStatus.unknown, totalPop);
672             }
673         }
674 
675         // Specials
676         add(languageToReason, LocaleNames.UND, "001", OfficialStatus.unknown, 0);
677 
678         // for (String language : Iso639Data.getAvailable()) {
679         // Scope scope = Iso639Data.getScope(language);
680         // Type type = Iso639Data.getType(language);
681         // if (scope == Scope.Special) {
682         // add(languageToReason, language, "001", OfficialStatus.unknown, -1);
683         // }
684         // }
685         // print them
686 
687         System.out.println("Detailed - Including:\t" + languageToReason.size());
688 
689         if (!SHOW_DETAILED) {
690             System.out.println(
691                     "- GenerateMaximalLocales.java: SHOW_DETAILED=true to show more details");
692         } else {
693             for (String language : languageToReason.keySet()) {
694                 Set<RowData> reasons = languageToReason.get(language);
695 
696                 RowData lastReason = reasons.iterator().next();
697 
698                 System.out
699                         .append(language)
700                         .append("\t")
701                         .append(english.getName(language))
702                         .append("\t")
703                         .append(lastReason.getStatus().toShortString())
704                         .append("\t")
705                         .append(nf.format(languageToLiteratePopulation.getCount(language)));
706                 for (RowData reason : reasons) {
707                     String status = reason.getStatus().toShortString();
708                     System.out
709                             .append("\t")
710                             .append(status)
711                             .append("-")
712                             .append(reason.getName())
713                             .append("-")
714                             .append(nf.format(reason.getLiteratePopulation()));
715                 }
716                 System.out.append("\n");
717             }
718         }
719 
720         // now list them
721 
722         Set<String> others = new TreeSet<>();
723         others.addAll(standardCodes.getGoodAvailableCodes("language"));
724         others.removeAll(languageToReason.keySet());
725         System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size());
726         if (SHOW_INCLUDED_EXCLUDED) {
727             showLanguages(languageToReason.keySet(), languageToReason);
728         }
729         System.out.println("\nExcluded Languages:\t" + others.size());
730         if (SHOW_INCLUDED_EXCLUDED) {
731             showLanguages(others, languageToReason);
732         } else {
733             System.out.println(
734                     " - GenerateMaximalLocales.java: set SHOW_INCLUDED_EXCLUDED=true to show reason details");
735         }
736     }
737 
getWritingPopulation(PopulationData popData)738     private static long getWritingPopulation(PopulationData popData) {
739         final double writingPopulation = popData.getWritingPopulation();
740         if (!Double.isNaN(writingPopulation)) {
741             return (long) writingPopulation;
742         }
743         return (long) popData.getLiteratePopulation();
744     }
745 
showLanguages( Set<String> others, Map<String, Set<RowData>> languageToReason)746     private static void showLanguages(
747             Set<String> others, Map<String, Set<RowData>> languageToReason) {
748         Set<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ENGLISH));
749         for (String language : others) {
750             sorted.add(getLanguageName(language, languageToReason));
751         }
752         char last = 0;
753         for (String language : sorted) {
754             final char curr = language.charAt(0);
755             if (last != curr) {
756                 System.out.println();
757             } else if (last != '\u0000') {
758                 System.out.print(", ");
759             }
760             System.out.print(language);
761             last = curr;
762         }
763         System.out.println();
764     }
765 
getLanguageName( String language, Map<String, Set<RowData>> languageToReason)766     private static String getLanguageName(
767             String language, Map<String, Set<RowData>> languageToReason) {
768         OfficialStatus best = OfficialStatus.unknown;
769         Set<RowData> reasons = languageToReason.get(language);
770         if (reasons != null) {
771             for (RowData reason : reasons) {
772                 final OfficialStatus currentStatus = reason.getStatus();
773                 if (best.compareTo(currentStatus) < 0) {
774                     best = currentStatus;
775                 }
776             }
777         }
778         String status = best.toShortString();
779         Scope scope = Iso639Data.getScope(language);
780         if (scope == Scope.Special) {
781             status = "S";
782         }
783         String languageFormatted = english.getName(language) + " [" + language + "]-" + status;
784         return languageFormatted;
785     }
786 
add( Map<String, Set<RowData>> languageToReason, String language, String territoryRaw, OfficialStatus status, long population)787     private static void add(
788             Map<String, Set<RowData>> languageToReason,
789             String language,
790             String territoryRaw,
791             OfficialStatus status,
792             long population) {
793         String territory = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]";
794         Set<RowData> set = languageToReason.get(language);
795         if (set == null) {
796             languageToReason.put(language, set = new TreeSet<>());
797         }
798         set.add(new RowData(status, territory, population));
799     }
800 
801     /** In computing the defaultContents, no and nb require special handling. */
802     static final Map<String, String> SPECIAL_CHILD_TO_PARENT =
803             ImmutableMap.of("nb", "no", "nb_NO", "nb");
804 
805     /*
806      * Compute the defaultContent values for supplemental data.
807      * It uses the maximization data and the simpleParent (truncation).
808      * We can't use the normal "getParent" because that messes up the logic
809      * used to handle inconsistencies in scripts in CLDR.<br>
810      * That is, there are three situations: <ul>
811      * <li>all children have explicit scripts; </li>
812      * <li>no children have scripts; and </li>
813      * <li>some do and some don't</li></ul>
814      */
815 
printDefaultContent(Map<String, String> toMaximized)816     private static void printDefaultContent(Map<String, String> toMaximized) throws IOException {
817 
818         Set<String> defaultLocaleContent = new TreeSet<>();
819 
820         // go through all the cldr locales, and add default contents
821         // now computed from toMaximized
822         Set<String> available = factory.getAvailable();
823         Relation<String, String> toSimpleChildren =
824                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
825         LanguageTagParser ltp = new LanguageTagParser();
826 
827         // System.out.println(maximize("az_Latn_AZ", toMaximized));
828         Set<String> hasSimpleChildWithScript = new TreeSet<>();
829 
830         // first get a mapping to children
831         for (String locale : available) {
832             if (locale.equals(LocaleNames.ROOT)) {
833                 continue;
834             }
835             if (ltp.set(locale).getVariants().size() != 0) {
836                 continue;
837             }
838             String parent = SPECIAL_CHILD_TO_PARENT.get(locale);
839             if (parent == null) {
840                 parent =
841                         LocaleIDParser.getSimpleParent(
842                                 locale); // we can't use the regular getParent (see above)
843             }
844 
845             if (ltp.getScript().length() != 0) {
846                 hasSimpleChildWithScript.add(parent);
847             }
848             if (parent.equals(LocaleNames.ROOT)) {
849                 continue;
850             }
851             toSimpleChildren.put(parent, locale);
852         }
853 
854         // Suppress script for locales for which we only have one locale in common/main. See ticket
855         // #7834.
856         Set<String> suppressScriptLocales =
857                 new HashSet<>(
858                         Arrays.asList(
859                                 "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN", "byn_ER",
860                                 "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO",
861                                 "sw_TZ", "wo_SN", "yo_NG", "dje_NE", "blt_VN", "hi_IN", "nv_US",
862                                 "doi_IN"));
863 
864         // if any have a script, then throw out any that don't have a script (unless they're
865         // specifically included.)
866         Set<String> toRemove = new TreeSet<>();
867         for (String locale : hasSimpleChildWithScript) {
868             toRemove.clear();
869             Set<String> children = toSimpleChildren.getAll(locale);
870             for (String child : children) {
871                 if (ltp.set(child).getScript().length() == 0
872                         && !suppressScriptLocales.contains(child)) {
873                     toRemove.add(child);
874                 }
875             }
876             if (toRemove.size() != 0) {
877                 System.out.println(
878                         "\tRemoving:\t" + locale + "\t" + toRemove + "\tfrom\t" + children);
879                 toSimpleChildren.removeAll(locale, toRemove);
880             }
881         }
882 
883         // we add a child as a default locale if it has the same maximization
884         main:
885         for (String locale : toSimpleChildren.keySet()) {
886             String maximized = maximize(locale, toMaximized);
887             if (maximized == null) {
888                 if (SHOW_ADD) System.out.println("Missing maximized:\t" + locale);
889                 continue;
890             }
891             Set<String> children = toSimpleChildren.getAll(locale);
892             Map<String, String> debugStuff = new TreeMap<>();
893             for (String child : children) {
894                 String maximizedChild = maximize(child, toMaximized);
895                 if (maximized.equals(maximizedChild)) {
896                     defaultLocaleContent.add(child);
897                     continue main;
898                 }
899                 debugStuff.put(child, maximizedChild);
900             }
901             if (SHOW_ADD)
902                 System.out.println(
903                         "Can't find maximized: "
904                                 + locale
905                                 + "="
906                                 + maximized
907                                 + "\tin\t"
908                                 + debugStuff);
909         }
910 
911         for (String specialChild : SPECIAL_CHILD_TO_PARENT.keySet()) {
912             defaultLocaleContent.add(specialChild);
913         }
914         defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale. (old sandbox)
915         defaultLocaleContent.remove("mul_ZZ"); // mul_ZZ isn't ever a real locale.
916 
917         showDefaultContentDifferencesAndFix(defaultLocaleContent);
918 
919         final File genSuppDir = new File(CLDRPaths.GEN_DIRECTORY, "supplemental");
920         final File genSuppMetadataFile = new File(genSuppDir, "supplementalMetadata.xml");
921         final File oldSuppMetadataFile =
922                 new File(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "supplementalMetadata.xml");
923 
924         try (PrintWriter genFile = FileUtilities.openUTF8Writer(genSuppMetadataFile);
925                 BufferedReader oldFile = FileUtilities.openUTF8Reader(oldSuppMetadataFile); ) {
926             CldrUtility.copyUpTo(
927                     oldFile,
928                     PatternCache.get("\\s*<defaultContent locales=\"\\s*"),
929                     genFile,
930                     false);
931 
932             String sep = CldrUtility.LINE_SEPARATOR + "\t\t\t";
933             String broken =
934                     CldrUtility.breakLines(
935                             CldrUtility.join(defaultLocaleContent, " "),
936                             sep,
937                             PatternCache.get("(\\S)\\S*").matcher(""),
938                             80);
939 
940             genFile.println("\t\t<defaultContent locales=\"" + broken + "\"");
941             genFile.println("\t\t/>");
942 
943             // genFile.println("</supplementalData>");
944             CldrUtility.copyUpTo(
945                     oldFile,
946                     PatternCache.get("\\s*/>\\s*(<!--.*)?"),
947                     null,
948                     true); // skip to matching >
949             CldrUtility.copyUpTo(oldFile, null, genFile, true); // copy the rest
950         }
951 
952         // Move it into place
953         System.out.println(
954                 "Copying generated " + genSuppMetadataFile + " to " + oldSuppMetadataFile);
955         oldSuppMetadataFile.delete();
956         Files.copy(genSuppMetadataFile.toPath(), oldSuppMetadataFile.toPath());
957     }
958 
959     private static class MaxData {
960         Relation<String, Row.R3<Double, String, String>> languages =
961                 Relation.of(
962                         new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
963         Map<String, Counter<String>> languagesToScripts = new TreeMap<>();
964         Map<String, Counter<String>> languagesToRegions = new TreeMap<>();
965 
966         Relation<String, Row.R3<Double, String, String>> scripts =
967                 Relation.of(
968                         new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
969         Map<String, Counter<String>> scriptsToLanguages = new TreeMap<>();
970         Map<String, Counter<String>> scriptsToRegions = new TreeMap<>();
971 
972         Relation<String, Row.R3<Double, String, String>> regions =
973                 Relation.of(
974                         new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class);
975         Map<String, Counter<String>> regionsToLanguages = new TreeMap<>();
976         Map<String, Counter<String>> regionsToScripts = new TreeMap<>();
977 
978         Map<String, Counter<Row.R2<String, String>>> containersToLanguage = new TreeMap<>();
979         Relation<String, Row.R4<Double, String, String, String>> containersToLangRegion =
980                 Relation.of(
981                         new TreeMap<String, Set<Row.R4<Double, String, String, String>>>(),
982                         TreeSet.class);
983 
984         Relation<Row.R2<String, String>, Row.R2<Double, String>> languageScripts =
985                 Relation.of(
986                         new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
987                         TreeSet.class);
988         Relation<Row.R2<String, String>, Row.R2<Double, String>> scriptRegions =
989                 Relation.of(
990                         new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
991                         TreeSet.class);
992         Relation<Row.R2<String, String>, Row.R2<Double, String>> languageRegions =
993                 Relation.of(
994                         new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(),
995                         TreeSet.class);
996 
997         /**
998          * Add population information. "order" is the negative of the population (makes the first be
999          * the highest).
1000          *
1001          * @param language
1002          * @param script
1003          * @param region
1004          * @param order
1005          */
add(String language, String script, String region, Double order)1006         void add(String language, String script, String region, Double order) {
1007             if (SHOW_ADD && language.equals(LocaleNames.MIS)) {
1008                 System.out.println(language + "\t" + script + "\t" + region + "\t" + -order);
1009             }
1010             languages.put(language, Row.of(order, script, region));
1011             // addCounter(languagesToScripts, language, script, order);
1012             // addCounter(languagesToRegions, language, region, order);
1013 
1014             scripts.put(script, Row.of(order, language, region));
1015             // addCounter(scriptsToLanguages, script, language, order);
1016             // addCounter(scriptsToRegions, script, region, order);
1017 
1018             regions.put(region, Row.of(order, language, script));
1019             // addCounter(regionsToLanguages, region, language, order);
1020             // addCounter(regionsToScripts, region, script, order);
1021 
1022             languageScripts.put(Row.of(language, script), Row.of(order, region));
1023             scriptRegions.put(Row.of(script, region), Row.of(order, language));
1024             languageRegions.put(Row.of(language, region), Row.of(order, script));
1025 
1026             Set<String> containerSet = Containment.leafToContainer(region);
1027             if (containerSet != null) {
1028                 for (String container : containerSet) {
1029 
1030                     containersToLangRegion.put(container, Row.of(order, language, script, region));
1031                     Counter<R2<String, String>> data = containersToLanguage.get(container);
1032                     if (data == null) {
1033                         containersToLanguage.put(container, data = new Counter<>());
1034                     }
1035                     data.add(Row.of(language, script), (long) (double) order);
1036                 }
1037             }
1038 
1039             if (SHOW_ADD)
1040                 System.out.println(
1041                         "Data:\t" + language + "\t" + script + "\t" + region + "\t" + order);
1042         }
1043         // private void addCounter(Map<String, Counter<String>> map, String key, String key2, Double
1044         // count) {
1045         // Counter<String> counter = map.get(key);
1046         // if (counter == null) {
1047         // map.put(key, counter = new Counter<String>());
1048         // }
1049         // counter.add(key2, count.longValue());
1050         // }
1051     }
1052 
1053     private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000;
1054     private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20;
1055     private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000;
1056     private static final double UNOFFICIAL_SCALE_DOWN = 0.2;
1057 
1058     private static NumberFormat percent = NumberFormat.getPercentInstance();
1059     private static NumberFormat number = NumberFormat.getIntegerInstance();
1060 
tryDifferentAlgorithm(Map<String, String> toMaximized)1061     private static void tryDifferentAlgorithm(Map<String, String> toMaximized) {
1062         // we are going to try a different approach.
1063         // first gather counts for maximized values
1064         // Set<Row.R3<String,String,String>,Double> rowsToCounts = new TreeMap();
1065         MaxData maxData = new MaxData();
1066         Set<String> cldrLocales = factory.getAvailable();
1067         Set<String> otherTerritories =
1068                 new TreeSet<>(standardCodes.getGoodAvailableCodes("territory"));
1069 
1070         // process all the information to get the top values for each triple.
1071         // each of the combinations of 1 or 2 components gets to be a key.
1072         for (String region : supplementalData.getTerritoriesWithPopulationData()) {
1073             otherTerritories.remove(region);
1074             PopulationData regionData = supplementalData.getPopulationDataForTerritory(region);
1075             final double literateTerritoryPopulation = regionData.getLiteratePopulation();
1076             // we need any unofficial language to meet a certain absolute size requirement and
1077             // proportion size
1078             // requirement.
1079             // so the bar is x percent of the population, reset up to y absolute size.
1080             double minimalLiteratePopulation =
1081                     literateTerritoryPopulation * MIN_UNOFFICIAL_LANGUAGE_PROPORTION;
1082             if (minimalLiteratePopulation < MIN_UNOFFICIAL_LANGUAGE_SIZE) {
1083                 minimalLiteratePopulation = MIN_UNOFFICIAL_LANGUAGE_SIZE;
1084             }
1085 
1086             for (String writtenLanguage :
1087                     supplementalData.getLanguagesForTerritoryWithPopulationData(region)) {
1088                 PopulationData data =
1089                         supplementalData.getLanguageAndTerritoryPopulationData(
1090                                 writtenLanguage, region);
1091                 final double literatePopulation =
1092                         getWritingPopulation(data); // data.getLiteratePopulation();
1093                 double order = -literatePopulation; // negative so we get the inverse order
1094 
1095                 if (data.getOfficialStatus() == OfficialStatus.unknown) {
1096                     final String locale = writtenLanguage + "_" + region;
1097                     if (literatePopulation >= minimalLiteratePopulation) {
1098                         // ok, skip
1099                     } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE
1100                             && cldrLocales.contains(locale)) {
1101                         // ok, skip
1102                     } else {
1103                         // if (SHOW_ADD)
1104                         // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t"
1105                         // + english.getName(locale)
1106                         // + "\t-- too small:\t" + number.format(literatePopulation));
1107                         // continue;
1108                     }
1109                     order *= UNOFFICIAL_SCALE_DOWN;
1110                     if (SHOW_ADD)
1111                         System.out.println(
1112                                 "Retaining\t"
1113                                         + writtenLanguage
1114                                         + "\t"
1115                                         + region
1116                                         + "\t"
1117                                         + english.getName(locale)
1118                                         + "\t"
1119                                         + number.format(literatePopulation)
1120                                         + "\t"
1121                                         + percent.format(
1122                                                 literatePopulation / literateTerritoryPopulation)
1123                                         + (cldrLocales.contains(locale) ? "\tin-CLDR" : ""));
1124                 }
1125                 String script;
1126                 String language = writtenLanguage;
1127                 final int pos = writtenLanguage.indexOf('_');
1128                 if (pos > 0) {
1129                     language = writtenLanguage.substring(0, pos);
1130                     script = writtenLanguage.substring(pos + 1);
1131                 } else {
1132                     script = getScriptForLocale2(language);
1133                 }
1134                 maxData.add(language, script, region, order);
1135             }
1136         }
1137 
1138         LanguageTagParser additionLtp = new LanguageTagParser();
1139 
1140         for (String addition : MAX_ADDITIONS) {
1141             additionLtp.set(addition);
1142             String lan = additionLtp.getLanguage();
1143             Set<R3<Double, String, String>> key = maxData.languages.get(lan);
1144             if (key == null) {
1145                 maxData.add(lan, additionLtp.getScript(), additionLtp.getRegion(), 1.0);
1146             } else {
1147                 int debug = 0;
1148             }
1149         }
1150 
1151         for (Entry<String, Collection<String>> entry :
1152                 DeriveScripts.getLanguageToScript().asMap().entrySet()) {
1153             String language = entry.getKey();
1154             final Collection<String> values = entry.getValue();
1155             if (values.size() != 1) {
1156                 continue; // skip, no either way
1157             }
1158             Set<R3<Double, String, String>> old = maxData.languages.get(language);
1159             if (!maxData.languages.containsKey(language)) {
1160                 maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0);
1161             }
1162         }
1163 
1164         // add others, with English default
1165         for (String region : otherTerritories) {
1166             if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS
1167             maxData.add("en", "Latn", region, 1.0);
1168         }
1169 
1170         // get a reverse mapping, so that we can add the aliases
1171 
1172         Map<String, R2<List<String>, String>> languageAliases =
1173                 SupplementalDataInfo.getInstance().getLocaleAliasInfo().get("language");
1174         for (Entry<String, R2<List<String>, String>> str : languageAliases.entrySet()) {
1175             String reason = str.getValue().get1();
1176             if ("overlong".equals(reason)
1177                     || "bibliographic".equals(reason)
1178                     || "macrolanguage".equals(reason)) {
1179                 continue;
1180             }
1181             List<String> replacements = str.getValue().get0();
1182             if (replacements == null) {
1183                 continue;
1184             }
1185             String goodLanguage = replacements.get(0);
1186 
1187             String badLanguage = str.getKey();
1188             if (badLanguage.contains("_")) {
1189                 continue;
1190             }
1191             if (deprecatedISONotInLST.contains(badLanguage)) {
1192                 continue;
1193             }
1194             Set<R3<Double, String, String>> goodLanguageData =
1195                     maxData.languages.getAll(goodLanguage);
1196             if (goodLanguageData == null) {
1197                 continue;
1198             }
1199             R3<Double, String, String> value = goodLanguageData.iterator().next();
1200             final String script = value.get1();
1201             final String region = value.get2();
1202             maxData.add(badLanguage, script, region, 1.0);
1203             System.out.println(
1204                     "Adding aliases: "
1205                             + badLanguage
1206                             + ", "
1207                             + script
1208                             + ", "
1209                             + region
1210                             + ", "
1211                             + reason);
1212         }
1213 
1214         // now, get the best for each one
1215         for (String language : maxData.languages.keySet()) {
1216             R3<Double, String, String> value = maxData.languages.getAll(language).iterator().next();
1217             final Comparable<String> script = value.get1();
1218             final Comparable<String> region = value.get2();
1219             add(
1220                     language,
1221                     language + "_" + script + "_" + region,
1222                     toMaximized,
1223                     "L->SR",
1224                     LocaleOverride.REPLACE_EXISTING,
1225                     SHOW_ADD);
1226         }
1227         for (String language : maxData.languagesToScripts.keySet()) {
1228             String script =
1229                     maxData.languagesToScripts
1230                             .get(language)
1231                             .getKeysetSortedByCount(true)
1232                             .iterator()
1233                             .next();
1234             add(
1235                     language,
1236                     language + "_" + script,
1237                     toMaximized,
1238                     "L->S",
1239                     LocaleOverride.REPLACE_EXISTING,
1240                     SHOW_ADD);
1241         }
1242         for (String language : maxData.languagesToRegions.keySet()) {
1243             String region =
1244                     maxData.languagesToRegions
1245                             .get(language)
1246                             .getKeysetSortedByCount(true)
1247                             .iterator()
1248                             .next();
1249             add(
1250                     language,
1251                     language + "_" + region,
1252                     toMaximized,
1253                     "L->R",
1254                     LocaleOverride.REPLACE_EXISTING,
1255                     SHOW_ADD);
1256         }
1257 
1258         for (String script : maxData.scripts.keySet()) {
1259             R3<Double, String, String> value = maxData.scripts.getAll(script).iterator().next();
1260             final Comparable<String> language = value.get1();
1261             final Comparable<String> region = value.get2();
1262             add(
1263                     "und_" + script,
1264                     language + "_" + script + "_" + region,
1265                     toMaximized,
1266                     "S->LR",
1267                     LocaleOverride.REPLACE_EXISTING,
1268                     SHOW_ADD);
1269         }
1270         for (String script : maxData.scriptsToLanguages.keySet()) {
1271             String language =
1272                     maxData.scriptsToLanguages
1273                             .get(script)
1274                             .getKeysetSortedByCount(true)
1275                             .iterator()
1276                             .next();
1277             add(
1278                     "und_" + script,
1279                     language + "_" + script,
1280                     toMaximized,
1281                     "S->L",
1282                     LocaleOverride.REPLACE_EXISTING,
1283                     SHOW_ADD);
1284         }
1285         for (String script : maxData.scriptsToRegions.keySet()) {
1286             String region =
1287                     maxData.scriptsToRegions
1288                             .get(script)
1289                             .getKeysetSortedByCount(true)
1290                             .iterator()
1291                             .next();
1292             add(
1293                     "und_" + script,
1294                     "und_" + script + "_" + region,
1295                     toMaximized,
1296                     "S->R",
1297                     LocaleOverride.REPLACE_EXISTING,
1298                     SHOW_ADD);
1299         }
1300 
1301         for (String region : maxData.regions.keySet()) {
1302             R3<Double, String, String> value = maxData.regions.getAll(region).iterator().next();
1303             final Comparable<String> language = value.get1();
1304             final Comparable<String> script = value.get2();
1305             add(
1306                     "und_" + region,
1307                     language + "_" + script + "_" + region,
1308                     toMaximized,
1309                     "R->LS",
1310                     LocaleOverride.REPLACE_EXISTING,
1311                     SHOW_ADD);
1312         }
1313         for (String region : maxData.regionsToLanguages.keySet()) {
1314             String language =
1315                     maxData.regionsToLanguages
1316                             .get(region)
1317                             .getKeysetSortedByCount(true)
1318                             .iterator()
1319                             .next();
1320             add(
1321                     "und_" + region,
1322                     language + "_" + region,
1323                     toMaximized,
1324                     "R->L",
1325                     LocaleOverride.REPLACE_EXISTING,
1326                     SHOW_ADD);
1327         }
1328         for (String region : maxData.regionsToScripts.keySet()) {
1329             String script =
1330                     maxData.regionsToScripts
1331                             .get(region)
1332                             .getKeysetSortedByCount(true)
1333                             .iterator()
1334                             .next();
1335             add(
1336                     "und_" + region,
1337                     "und_" + script + "_" + region,
1338                     toMaximized,
1339                     "R->S",
1340                     LocaleOverride.REPLACE_EXISTING,
1341                     SHOW_ADD);
1342         }
1343 
1344         for (Entry<String, Counter<R2<String, String>>> containerAndInfo :
1345                 maxData.containersToLanguage.entrySet()) {
1346             String region = containerAndInfo.getKey();
1347             if (region.equals("001")) {
1348                 continue;
1349             }
1350             Counter<R2<String, String>> data = containerAndInfo.getValue();
1351             Set<R2<String, String>> keysetSortedByCount = data.getKeysetSortedByCount(true);
1352             if (SHOW_CONTAINERS) { // debug
1353                 System.out.println(
1354                         "Container2L:\t"
1355                                 + region
1356                                 + "\t"
1357                                 + shorten(data.getEntrySetSortedByCount(true, null)));
1358                 System.out.println(
1359                         "Container2LR:\t"
1360                                 + region
1361                                 + "\t"
1362                                 + maxData.containersToLangRegion.get(region));
1363             }
1364             R2<String, String> value =
1365                     keysetSortedByCount.iterator().next(); // will get most negative
1366             final Comparable<String> language = value.get0();
1367             final Comparable<String> script = value.get1();
1368 
1369             // fix special cases like es-419, where a locale exists.
1370             // for those cases, what we add as output is the container. Otherwise the region.
1371             Set<String> skipLanguages = cldrContainerToLanguages.get(region);
1372             if (skipLanguages != null && skipLanguages.contains(language)) {
1373                 add(
1374                         "und_" + region,
1375                         language + "_" + script + "_" + region,
1376                         toMaximized,
1377                         "R*->LS",
1378                         LocaleOverride.REPLACE_EXISTING,
1379                         SHOW_ADD);
1380                 continue;
1381             }
1382 
1383             // we now have the best language and script. Find the best region for that
1384             for (R4<Double, String, String, String> e :
1385                     maxData.containersToLangRegion.get(region)) {
1386                 final Comparable<String> language2 = e.get1();
1387                 final Comparable<String> script2 = e.get2();
1388                 if (language2.equals(language) && script2.equals(script)) {
1389                     add(
1390                             "und_" + region,
1391                             language + "_" + script + "_" + e.get3(),
1392                             toMaximized,
1393                             "R*->LS",
1394                             LocaleOverride.REPLACE_EXISTING,
1395                             SHOW_ADD);
1396                     break;
1397                 }
1398             }
1399         }
1400 
1401         for (R2<String, String> languageScript : maxData.languageScripts.keySet()) {
1402             R2<Double, String> value =
1403                     maxData.languageScripts.getAll(languageScript).iterator().next();
1404             final Comparable<String> language = languageScript.get0();
1405             final Comparable<String> script = languageScript.get1();
1406             final Comparable<String> region = value.get1();
1407             add(
1408                     language + "_" + script,
1409                     language + "_" + script + "_" + region,
1410                     toMaximized,
1411                     "LS->R",
1412                     LocaleOverride.REPLACE_EXISTING,
1413                     SHOW_ADD);
1414         }
1415 
1416         for (R2<String, String> scriptRegion : maxData.scriptRegions.keySet()) {
1417             R2<Double, String> value = maxData.scriptRegions.getAll(scriptRegion).iterator().next();
1418             final Comparable<String> script = scriptRegion.get0();
1419             final Comparable<String> region = scriptRegion.get1();
1420             final Comparable<String> language = value.get1();
1421             add(
1422                     "und_" + script + "_" + region,
1423                     language + "_" + script + "_" + region,
1424                     toMaximized,
1425                     "SR->L",
1426                     LocaleOverride.REPLACE_EXISTING,
1427                     SHOW_ADD);
1428         }
1429 
1430         for (R2<String, String> languageRegion : maxData.languageRegions.keySet()) {
1431             R2<Double, String> value =
1432                     maxData.languageRegions.getAll(languageRegion).iterator().next();
1433             final Comparable<String> language = languageRegion.get0();
1434             final Comparable<String> region = languageRegion.get1();
1435             final Comparable<String> script = value.get1();
1436             add(
1437                     language + "_" + region,
1438                     language + "_" + script + "_" + region,
1439                     toMaximized,
1440                     "LR->S",
1441                     LocaleOverride.REPLACE_EXISTING,
1442                     SHOW_ADD);
1443         }
1444 
1445         // get the script info from metadata as fallback
1446 
1447         TreeSet<String> sorted = new TreeSet<>(ScriptMetadata.getScripts());
1448         for (String script : sorted) {
1449             Info i = ScriptMetadata.getInfo(script);
1450             String likelyLanguage = i.likelyLanguage;
1451             if (LANGUAGE_CODE_TO_STATUS.get(likelyLanguage) == Status.special) {
1452                 likelyLanguage = LocaleNames.UND;
1453             }
1454             String originCountry = i.originCountry;
1455             final String result = likelyLanguage + "_" + script + "_" + originCountry;
1456             add(
1457                     "und_" + script,
1458                     result,
1459                     toMaximized,
1460                     "S->LR•",
1461                     LocaleOverride.KEEP_EXISTING,
1462                     SHOW_ADD);
1463             add(
1464                     likelyLanguage,
1465                     result,
1466                     toMaximized,
1467                     "L->SR•",
1468                     LocaleOverride.KEEP_EXISTING,
1469                     SHOW_ADD);
1470         }
1471 
1472         // add overrides
1473         for (String key : LANGUAGE_OVERRIDES.keySet()) {
1474             add(
1475                     key,
1476                     LANGUAGE_OVERRIDES.get(key),
1477                     toMaximized,
1478                     "OVERRIDE",
1479                     LocaleOverride.REPLACE_EXISTING,
1480                     true);
1481         }
1482 
1483         // Make sure that the mapping is Idempotent. If we have A ==> B, we must never have B ==> C
1484         // We run this check until we get no problems.
1485         Set<List<String>> problems = new HashSet<>();
1486 
1487         while (true) {
1488             problems.clear();
1489             for (Entry<String, String> entry : toMaximized.entrySet()) {
1490                 String source = entry.getKey();
1491                 String target = entry.getValue();
1492                 if (target.contains("_Zzzz") || target.contains("_ZZ")) { // these are special cases
1493                     continue;
1494                 }
1495                 String idempotentCandidate = LikelySubtags.maximize(target, toMaximized);
1496 
1497                 if (idempotentCandidate == null) {
1498                     System.out.println("Can't maximize " + target);
1499                 } else if (!idempotentCandidate.equals(target)) {
1500                     problems.add(ImmutableList.of(source, target, idempotentCandidate));
1501                 }
1502             }
1503             if (problems.isEmpty()) {
1504                 break;
1505             }
1506             for (List<String> row : problems) {
1507                 System.out.println(
1508                         "Idempotence: dropping mapping "
1509                                 + row.get(0)
1510                                 + " to "
1511                                 + row.get(1)
1512                                 + " since the target maps further to "
1513                                 + row.get(2));
1514                 toMaximized.remove(row.get(0));
1515             }
1516         }
1517     }
1518 
shorten(Object data)1519     public static String shorten(Object data) {
1520         String info = data.toString();
1521         if (info.length() > 255) {
1522             info = info.substring(0, 127) + "…";
1523         }
1524         return info;
1525     }
1526 
doAlt(Map<String, String> toMaximized)1527     private static void doAlt(Map<String, String> toMaximized) {
1528         // TODO Auto-generated method stub
1529         Map<String, String> temp = new TreeMap<>();
1530         for (String locale : toMaximized.keySet()) {
1531             String target = toMaximized.get(locale);
1532             temp.put(toAlt(locale, true), toAlt(target, true));
1533         }
1534         toMaximized.clear();
1535         toMaximized.putAll(temp);
1536     }
1537 
maximize(String languageTag, Map<String, String> toMaximized)1538     public static String maximize(String languageTag, Map<String, String> toMaximized) {
1539         LanguageTagParser ltp = new LanguageTagParser();
1540 
1541         // clean up the input by removing Zzzz, ZZ, and changing "" into und.
1542         ltp.set(languageTag);
1543         String language = ltp.getLanguage();
1544         String region = ltp.getRegion();
1545         String script = ltp.getScript();
1546         boolean changed = false;
1547         if (language.equals("")) {
1548             ltp.setLanguage(language = LocaleNames.UND);
1549             changed = true;
1550         }
1551         if (region.equals(UNKNOWN_SCRIPT)) {
1552             ltp.setScript(script = "");
1553             changed = true;
1554         }
1555         if (ltp.getRegion().equals(UNKNOWN_REGION)) {
1556             ltp.setRegion(region = "");
1557             changed = true;
1558         }
1559         if (changed) {
1560             languageTag = ltp.toString();
1561         }
1562         // check whole
1563         String result = toMaximized.get(languageTag);
1564         if (result != null) {
1565             return result;
1566         }
1567         // try empty region
1568         if (region.length() != 0) {
1569             result = toMaximized.get(ltp.setRegion("").toString());
1570             if (result != null) {
1571                 return ltp.set(result).setRegion(region).toString();
1572             }
1573             ltp.setRegion(region); // restore
1574         }
1575         // try empty script
1576         if (script.length() != 0) {
1577             result = toMaximized.get(ltp.setScript("").toString());
1578             if (result != null) {
1579                 return ltp.set(result).setScript(script).toString();
1580             }
1581             // try empty script and region
1582             if (region.length() != 0) {
1583                 result = toMaximized.get(ltp.setRegion("").toString());
1584                 if (result != null) {
1585                     return ltp.set(result).setScript(script).setRegion(region).toString();
1586                 }
1587             }
1588         }
1589         if (!language.equals(LocaleNames.UND) && script.length() != 0 && region.length() != 0) {
1590             return languageTag; // it was ok, and we couldn't do anything with it
1591         }
1592         return null; // couldn't maximize
1593     }
1594 
minimize( String input, Map<String, String> toMaximized, boolean favorRegion)1595     public static String minimize(
1596             String input, Map<String, String> toMaximized, boolean favorRegion) {
1597         if (input.equals("nb_Latn_SJ")) {
1598             System.out.print(""); // debug
1599         }
1600         String maximized = maximize(input, toMaximized);
1601         if (maximized == null) {
1602             return null; // failed
1603         }
1604         LanguageTagParser ltp = new LanguageTagParser().set(maximized);
1605         String language = ltp.getLanguage();
1606         String region = ltp.getRegion();
1607         String script = ltp.getScript();
1608         // try building up from shorter to longer, and find the first that matches
1609         // could be more optimized, but for this code we want simplest
1610         String[] trials = {
1611             language,
1612             language + TAG_SEPARATOR + (favorRegion ? region : script),
1613             language + TAG_SEPARATOR + (!favorRegion ? region : script)
1614         };
1615         for (String trial : trials) {
1616             String newMaximized = maximize(trial, toMaximized);
1617             if (maximized.equals(newMaximized)) {
1618                 return trial;
1619             }
1620         }
1621         return maximized;
1622     }
1623 
1624     // /**
1625     // * Verify that we can map from each language, script, and country to something.
1626     // * @param toMaximized
1627     // */
1628     // private static void checkConsistency(Map<String, String> toMaximized) {
1629     // Map<String,String> needMappings = new TreeMap();
1630     // LanguageTagParser parser = new LanguageTagParser();
1631     // for (String maximized : new TreeSet<String>(toMaximized.values())) {
1632     // parser.set(maximized);
1633     // final String language = parser.getLanguage();
1634     // final String script = parser.getScript();
1635     // final String region = parser.getRegion();
1636     // if (language.length() == 0 || script.length() == 0 || region.length() == 0) {
1637     // failure("   { \"" + maximized + "\", \"" + maximized + "\" },   //     " +
1638     // english.getName(maximized) +
1639     // "\t\tFailed-Consistency");
1640     // continue;
1641     // }
1642     // addIfNotIn(language, maximized, needMappings, toMaximized, "Consistency");
1643     // addIfNotIn(language + "_" + script, maximized, needMappings, toMaximized, "Consistency");
1644     // addIfNotIn(language + "_" + region, maximized, needMappings, toMaximized, "Consistency");
1645     // addIfNotIn("und_" + script, maximized, needMappings, toMaximized, "Consistency");
1646     // addIfNotIn("und_" + script + "_" + region, maximized, needMappings, toMaximized,
1647     // "Consistency");
1648     // addIfNotIn("und_" + region, maximized, needMappings, toMaximized, "Consistency");
1649     // }
1650     // toMaximized.putAll(needMappings);
1651     // }
1652 
1653     // private static void failure(String string) {
1654     // System.out.println(string);
1655     // errorCount++;
1656     // }
1657 
1658     // private static void addIfNotIn(String key, String value, Map<String, String> toAdd,
1659     // Map<String, String>
1660     // otherToCheck, String kind) {
1661     // addIfNotIn(key, value, toAdd, otherToCheck == null ? null : otherToCheck.keySet(), null,
1662     // kind);
1663     // }
1664 
1665     // private static void addIfNotIn(String key, String value, Map<String, String> toAdd,
1666     // Set<String> skipKey,
1667     // Set<String> skipValue, String kind) {
1668     // if (!key.equals(value)
1669     // && !toAdd.containsKey(key)
1670     // && (skipKey == null || !skipKey.contains(key))
1671     // && (skipValue == null || !skipValue.contains(value))) {
1672     // add(key, value, toAdd, kind);
1673     // }
1674     // }
1675 
    /** Tells {@code add} what to do when the key being added already has a mapping. */
    enum LocaleOverride {
        /** Leave the existing mapping in place; the new value is not stored. */
        KEEP_EXISTING,
        /** Overwrite the existing mapping with the new value. */
        REPLACE_EXISTING
    }
1680 
add( String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override, boolean showAction)1681     private static void add(
1682             String key,
1683             String value,
1684             Map<String, String> toAdd,
1685             String kind,
1686             LocaleOverride override,
1687             boolean showAction) {
1688         if (SHOW_ADD && key.startsWith(LocaleNames.MIS)) {
1689             int debug = 1;
1690         }
1691         if (key.equals(DEBUG_ADD_KEY)) {
1692             System.out.println("*debug*");
1693         }
1694         String oldValue = toAdd.get(key);
1695         if (oldValue == null) {
1696             if (showAction) {
1697                 System.out.println(
1698                         "\tAdding:\t\t"
1699                                 + getName(key)
1700                                 + "\t=>\t"
1701                                 + getName(value)
1702                                 + "\t\t\t\t"
1703                                 + kind);
1704             }
1705         } else if (override == LocaleOverride.KEEP_EXISTING || value.equals(oldValue)) {
1706             // if (showAction) {
1707             // System.out.println("Skipping:\t" + key + "\t=>\t" + value + "\t\t\t\t" + kind);
1708             // }
1709             return;
1710         } else {
1711             if (showAction) {
1712                 System.out.println(
1713                         "\tReplacing:\t"
1714                                 + getName(key)
1715                                 + "\t=>\t"
1716                                 + getName(value)
1717                                 + "\t, was\t"
1718                                 + getName(oldValue)
1719                                 + "\t\t"
1720                                 + kind);
1721             }
1722         }
1723         toAdd.put(key, value);
1724     }
1725 
    /**
     * Returns the form of a locale code used in this tool's log lines, by delegating to {@code
     * ConvertLanguageData.getLanguageCodeAndName}.
     *
     * @param value the locale code to describe
     * @return whatever {@code getLanguageCodeAndName} produces for {@code value}
     */
    private static String getName(String value) {
        return ConvertLanguageData.getLanguageCodeAndName(value);
    }
1729 
printLikelySubtags(Map<String, String> fluffup)1730     private static File printLikelySubtags(Map<String, String> fluffup) throws IOException {
1731         final File genDir = new File(CLDRPaths.GEN_DIRECTORY, "supplemental");
1732         final File genFile =
1733                 new File(
1734                         genDir,
1735                         "likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt"));
1736         System.out.println("Writing to " + genFile);
1737 
1738         try (PrintWriter out = FileUtilities.openUTF8Writer(genFile)) {
1739             String spacing = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " ";
1740             String header =
1741                     OUTPUT_STYLE != OutputStyle.XML
1742                             ? "const MapToMaximalSubtags default_subtags[] = {"
1743                             : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
1744                                     + CldrUtility.LINE_SEPARATOR
1745                                     + "<!DOCTYPE supplementalData SYSTEM \"../../common/dtd/ldmlSupplemental.dtd\">"
1746                                     + CldrUtility.LINE_SEPARATOR
1747                                     + "<!--"
1748                                     + CldrUtility.LINE_SEPARATOR
1749                                     + CldrUtility.getCopyrightString()
1750                                     + CldrUtility.LINE_SEPARATOR
1751                                     + "-->"
1752                                     + CldrUtility.LINE_SEPARATOR
1753                                     + "<!--"
1754                                     + CldrUtility.LINE_SEPARATOR
1755                                     + "Likely subtags data is generated programatically from CLDR's language/territory/population"
1756                                     + CldrUtility.LINE_SEPARATOR
1757                                     + "data using the GenerateMaximalLocales tool. Under normal circumstances, this file should"
1758                                     + CldrUtility.LINE_SEPARATOR
1759                                     + "not be patched by hand, as any changes made in that fashion may be lost."
1760                                     + CldrUtility.LINE_SEPARATOR
1761                                     + "-->"
1762                                     + CldrUtility.LINE_SEPARATOR
1763                                     + "<supplementalData>"
1764                                     + CldrUtility.LINE_SEPARATOR
1765                                     + "    <version number=\"$"
1766                                     + "Revision$\"/>"
1767                                     + CldrUtility.LINE_SEPARATOR
1768                                     + "    <likelySubtags>";
1769             String footer =
1770                     OUTPUT_STYLE != OutputStyle.XML
1771                             ? SEPARATOR + "};"
1772                             : "    </likelySubtags>"
1773                                     + CldrUtility.LINE_SEPARATOR
1774                                     + "</supplementalData>";
1775             out.println(header);
1776             boolean first = true;
1777             Set<String> keys = new TreeSet<>(new LocaleStringComparator());
1778             keys.addAll(fluffup.keySet());
1779             for (String printingLocale : keys) {
1780                 String printingTarget = fluffup.get(printingLocale);
1781                 String comment =
1782                         printingName(printingLocale, spacing)
1783                                 + spacing
1784                                 + "=>"
1785                                 + spacing
1786                                 + printingName(printingTarget, spacing);
1787 
1788                 if (OUTPUT_STYLE == OutputStyle.XML) {
1789                     out.println(
1790                             "\t\t<likelySubtag from=\""
1791                                     + printingLocale
1792                                     + "\" to=\""
1793                                     + printingTarget
1794                                     + "\""
1795                                     + "/>"
1796                                     + CldrUtility.LINE_SEPARATOR
1797                                     + "\t\t"
1798                                     + "<!--"
1799                                     + comment
1800                                     + "-->");
1801                 } else {
1802                     if (first) {
1803                         first = false;
1804                     } else {
1805                         out.print(",");
1806                     }
1807                     if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) {
1808                         comment =
1809                                 printingName(printingLocale, spacing)
1810                                         + SEPARATOR
1811                                         + "    // "
1812                                         + spacing
1813                                         + "=>"
1814                                         + spacing
1815                                         + printingName(printingTarget, spacing);
1816                     }
1817                     out.print(
1818                             "  {"
1819                                     + SEPARATOR
1820                                     + "    // "
1821                                     + comment
1822                                     + SEPARATOR
1823                                     + "    \""
1824                                     + printingLocale
1825                                     + "\","
1826                                     + SEPARATOR
1827                                     + "    \""
1828                                     + printingTarget
1829                                     + "\""
1830                                     + CldrUtility.LINE_SEPARATOR
1831                                     + "  }");
1832                 }
1833             }
1834             out.println(footer);
1835             out.close();
1836         }
1837         return genFile;
1838     }
1839 
printingName(String locale, String spacing)1840     public static String printingName(String locale, String spacing) {
1841         if (locale == null) {
1842             return null;
1843         }
1844         LanguageTagParser parser = new LanguageTagParser().set(locale);
1845         String lang = parser.getLanguage();
1846         String script = parser.getScript();
1847         String region = parser.getRegion();
1848         return "{"
1849                 + spacing
1850                 + (lang.equals(LocaleNames.UND)
1851                         ? "?"
1852                         : english.getName(CLDRFile.LANGUAGE_NAME, lang))
1853                 + ";"
1854                 + spacing
1855                 + (script == null || script.equals("")
1856                         ? "?"
1857                         : english.getName(CLDRFile.SCRIPT_NAME, script))
1858                 + ";"
1859                 + spacing
1860                 + (region == null || region.equals("")
1861                         ? "?"
1862                         : english.getName(CLDRFile.TERRITORY_NAME, region))
1863                 + spacing
1864                 + "}";
1865     }
1866 
1867     private static final String[][] ALT_REVERSAL = {
1868         // { "no", "nb" },
1869         // { "nb", "no" },
1870         {"he", "iw"},
1871         {"iw", "he"},
1872     };
1873 
toAlt(String locale, boolean change)1874     public static String toAlt(String locale, boolean change) {
1875         if (!change || locale == null) {
1876             return locale;
1877         }
1878         String firstTag = getFirstTag(locale);
1879         for (String[] pair : ALT_REVERSAL) {
1880             if (firstTag.equals(pair[0])) {
1881                 locale = pair[1] + locale.substring(pair[1].length());
1882                 break;
1883             }
1884         }
1885         locale = locale.replace("_", "-");
1886         return locale;
1887     }
1888 
getFirstTag(String locale)1889     private static String getFirstTag(String locale) {
1890         int pos = locale.indexOf('_');
1891         return pos < 0 ? locale : locale.substring(0, pos);
1892     }
1893 
1894     // private static Map<String, String> getBackMapping(Map<String, String> fluffup) {
1895     // Relation<String,String> backMap = new Relation(new TreeMap(), TreeSet.class,
1896     // BEST_LANGUAGE_COMPARATOR);
1897     // for (String source : fluffup.keySet()) {
1898     // if (source.startsWith(LocaleNames.UND)) {
1899     // continue;
1900     // }
1901     // String maximized = fluffup.get(source);
1902     // backMap.put(maximized, source); // put in right order
1903     // }
1904     // Map<String,String> returnBackMap = new TreeMap();
1905     // for (String maximized : backMap.keySet()) {
1906     // final Set<String> all = backMap.getAll(maximized);
1907     // final String minimized = all.iterator().next();
1908     // returnBackMap.put(maximized, minimized);
1909     // }
1910     // return returnBackMap;
1911     // }
1912 
1913     /**
1914      * Language tags are presumed to share the first language, except possibly LocaleNames.UND. Best
1915      * is least
1916      */
1917     // private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator<String>() {
1918     // LanguageTagParser p1 = new LanguageTagParser();
1919     // LanguageTagParser p2 = new LanguageTagParser();
1920     // public int compare(String o1, String o2) {
1921     // if (o1.equals(o2)) return 0;
1922     // p1.set(o1);
1923     // p2.set(o2);
1924     // String lang1 = p1.getLanguage();
1925     // String lang2 = p2.getLanguage();
1926     //
1927     // // compare languages first
1928     // // put und at the end
1929     // int result = lang1.compareTo(lang2);
1930     // if (result != 0) {
1931     // if (lang1.equals(LocaleNames.UND)) return 1;
1932     // if (lang2.equals(LocaleNames.UND)) return -1;
1933     // return result;
1934     // }
1935     //
1936     // // now scripts and regions.
1937     // // if they have different numbers of fields, the shorter wins.
1938     // // If there are two fields, region is lowest.
1939     // // The simplest way is to just compare scripts first
1940     // // so zh-TW < zh-Hant, because we first compare "" to Hant
1941     // String script1 = p1.getScript();
1942     // String script2 = p2.getScript();
1943     // int scriptOrder = script1.compareTo(script2);
1944     // if (scriptOrder != 0) return scriptOrder;
1945     //
1946     // String region1 = p1.getRegion();
1947     // String region2 = p2.getRegion();
1948     // int regionOrder = region1.compareTo(region2);
1949     // if (regionOrder != 0) return regionOrder;
1950     //
1951     // return o1.compareTo(o2);
1952     // }
1953     //
1954     // };
1955 
minimize(Map<String, String> fluffup)1956     public static void minimize(Map<String, String> fluffup) {
1957         LanguageTagParser parser = new LanguageTagParser();
1958         LanguageTagParser targetParser = new LanguageTagParser();
1959         Set<String> removals = new TreeSet<>();
1960         while (true) {
1961             removals.clear();
1962             for (String locale : fluffup.keySet()) {
1963                 String target = fluffup.get(locale);
1964                 if (targetParser.set(target).getRegion().equals(UNKNOWN_REGION)) {
1965                     removals.add(locale);
1966                     if (SHOW_ADD)
1967                         System.out.println(
1968                                 "Removing:\t"
1969                                         + getName(locale)
1970                                         + "\t=>\t"
1971                                         + getName(target)
1972                                         + "\t\t - Unknown Region in target");
1973                     continue;
1974                 }
1975                 if (targetParser.getScript().equals(UNKNOWN_SCRIPT)) {
1976                     removals.add(locale);
1977                     if (SHOW_ADD)
1978                         System.out.println(
1979                                 "Removing:\t"
1980                                         + getName(locale)
1981                                         + "\t=>\t"
1982                                         + getName(target)
1983                                         + "\t\t - Unknown Script in target");
1984                     continue;
1985                 }
1986 
1987                 String region = parser.set(locale).getRegion();
1988                 if (region.length() != 0) {
1989                     if (region.equals(UNKNOWN_REGION)) {
1990                         removals.add(locale);
1991                         if (SHOW_ADD)
1992                             System.out.println(
1993                                     "Removing:\t"
1994                                             + getName(locale)
1995                                             + "\t=>\t"
1996                                             + getName(target)
1997                                             + "\t\t - Unknown Region in source");
1998                         continue;
1999                     }
2000                     parser.setRegion("");
2001                     String newLocale = parser.toString();
2002                     String newTarget = fluffup.get(newLocale);
2003                     if (newTarget != null) {
2004                         newTarget = targetParser.set(newTarget).setRegion(region).toString();
2005                         if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) {
2006                             removals.add(locale);
2007                             if (SHOW_ADD)
2008                                 System.out.println(
2009                                         "Removing:\t"
2010                                                 + locale
2011                                                 + "\t=>\t"
2012                                                 + target
2013                                                 + "\t\tRedundant with "
2014                                                 + newLocale);
2015                             continue;
2016                         }
2017                     }
2018                 }
2019                 String script = parser.set(locale).getScript();
2020                 if (locale.equals(DEBUG_ADD_KEY)) {
2021                     System.out.println("*debug*");
2022                 }
2023                 if (script.length() != 0) {
2024                     if (script.equals(UNKNOWN_SCRIPT)) {
2025                         removals.add(locale);
2026                         if (SHOW_ADD)
2027                             System.out.println(
2028                                     "Removing:\t"
2029                                             + locale
2030                                             + "\t=>\t"
2031                                             + target
2032                                             + "\t\t - Unknown Script");
2033                         continue;
2034                     }
2035                     parser.setScript("");
2036                     String newLocale = parser.toString();
2037                     String newTarget = fluffup.get(newLocale);
2038                     if (newTarget != null) {
2039                         newTarget = targetParser.set(newTarget).setScript(script).toString();
2040                         if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) {
2041                             removals.add(locale);
2042                             if (SHOW_ADD)
2043                                 System.out.println(
2044                                         "Removing:\t"
2045                                                 + locale
2046                                                 + "\t=>\t"
2047                                                 + target
2048                                                 + "\t\tRedundant with "
2049                                                 + newLocale);
2050                             continue;
2051                         }
2052                     }
2053                 }
2054             }
2055             if (removals.size() == 0) {
2056                 break;
2057             }
2058             for (String locale : removals) {
2059                 fluffup.remove(locale);
2060             }
2061         }
2062     }
2063 
2064     // private static void addLanguageScript(Map<String, String> fluffup, LanguageTagParser parser)
2065     // {
2066     // // add script
2067     // Map<String, String> temp = new TreeMap<String, String>();
2068     // while (true) {
2069     // temp.clear();
2070     // for (String target : new TreeSet<String>(fluffup.values())) {
2071     // parser.set(target);
2072     // final String territory = parser.getRegion();
2073     // if (territory.length() == 0) {
2074     // continue;
2075     // }
2076     // parser.setRegion("");
2077     // String possibleSource = parser.toString();
2078     // if (fluffup.containsKey(possibleSource)) {
2079     // continue;
2080     // }
2081     // String other = temp.get(possibleSource);
2082     // if (other != null) {
2083     // if (!target.equals(other)) {
2084     // System.out.println("**Failure with multiple sources in addLanguageScript: "
2085     // + possibleSource + "\t=>\t" + target + ", " + other);
2086     // }
2087     // continue;
2088     // }
2089     // temp.put(possibleSource, target);
2090     // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target +
2091     // "\t\tLanguage-Script");
2092     // }
2093     // if (temp.size() == 0) {
2094     // break;
2095     // }
2096     // fluffup.putAll(temp);
2097     // }
2098     //
2099     // }
2100 
2101     // private static void addLanguageCountry(Map<String, String> fluffup, LanguageTagParser parser)
2102     // {
2103     // // add script
2104     // Map<String, String> temp = new TreeMap<String, String>();
2105     // while (true) {
2106     // temp.clear();
2107     // for (String target : new TreeSet<String>(fluffup.values())) {
2108     // parser.set(target);
2109     // String script = parser.getScript();
2110     // if (script.length() == 0) {
2111     // continue;
2112     // }
2113     // parser.setScript("");
2114     // String possibleSource = parser.toString();
2115     // if (fluffup.containsKey(possibleSource)) {
2116     // continue;
2117     // }
2118     // String other = temp.get(possibleSource);
2119     //
2120     // if (other != null) {
2121     // if (!target.equals(other)) {
2122     // script = getScriptForLocale(possibleSource);
2123     // if (script == null) {
2124     // System.out.println("**Failure with multiple sources in addLanguageCountry: "
2125     // + possibleSource + "\t=>\t" + target + ", " + other);
2126     // continue; // error message in routine
2127     // }
2128     // parser.setScript(script);
2129     // target = parser.toString();
2130     // }
2131     // }
2132     //
2133     // temp.put(possibleSource, target);
2134     // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target +
2135     // "\t\tLanguageCountry");
2136     // }
2137     // if (temp.size() == 0) {
2138     // break;
2139     // }
2140     // fluffup.putAll(temp);
2141     // }
2142     //
2143     // }
2144 
2145     // private static void addScript(Map<String, String> fluffup, LanguageTagParser parser) {
2146     // // add script
2147     // Map<String, String> temp = new TreeMap<String, String>();
2148     // while (true) {
2149     // temp.clear();
2150     // Set skipTarget = fluffup.keySet();
2151     // for (String locale : fluffup.keySet()) {
2152     // String target = fluffup.get(locale);
2153     // parser.set(target);
2154     // if (parser.getScript().length() != 0) {
2155     // continue;
2156     // }
2157     // String script = getScriptForLocale(target);
2158     //
2159     // if (script == null) {
2160     // continue; // error message in routine
2161     // }
2162     // parser.setScript(script);
2163     // String furtherTarget = parser.toString();
2164     // addIfNotIn(target, furtherTarget, temp, fluffup, "Script");
2165     // }
2166     // if (temp.size() == 0) {
2167     // break;
2168     // }
2169     // fluffup.putAll(temp);
2170     // }
2171     // }
2172 
2173     // private static String getScriptForLocale(String locale) {
2174     // String result = getScriptForLocale2(locale);
2175     // if (result != null) return result;
2176     // int pos = locale.indexOf('_');
2177     // if (pos >= 0) {
2178     // result = getScriptForLocale2(locale.substring(0,pos));
2179     // }
2180     // return result;
2181     // }
2182 
2183     private static String UNKNOWN_SCRIPT = "Zzzz";
2184     private static String UNKNOWN_REGION = "ZZ";
2185 
getScriptForLocale2(String locale)2186     private static String getScriptForLocale2(String locale) {
2187         String result = localeToScriptCache.get(locale);
2188         if (result != null) {
2189             return result;
2190         }
2191         if (locale.equals("ky")) {
2192             int debug = 0;
2193         }
2194         try {
2195             Map<Type, BasicLanguageData> data = supplementalData.getBasicLanguageDataMap(locale);
2196             if (data != null) {
2197                 for (BasicLanguageData datum : data.values()) {
2198                     final Set<String> scripts = datum.getScripts();
2199                     boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary;
2200                     if (scripts.size() != 1) {
2201                         if (scripts.size() > 1 && isPrimary) {
2202                             break;
2203                         }
2204                         continue;
2205                     }
2206                     String script = scripts.iterator().next();
2207                     if (isPrimary) {
2208                         return result = script;
2209                     } else if (result == null) {
2210                         result = script;
2211                     }
2212                 }
2213                 if (result != null) {
2214                     return result;
2215                 }
2216             }
2217             CLDRFile cldrFile;
2218             try {
2219                 cldrFile = factory.make(locale, true);
2220             } catch (RuntimeException e) {
2221                 result = FALLBACK_SCRIPTS.get(locale);
2222                 if (result == null) {
2223                     System.err.println(
2224                             "***Failed to find script in L-S-R or MAX_ADDITIONS for: "
2225                                     + locale
2226                                     + "\t"
2227                                     + english.getName(locale));
2228                     return result = UNKNOWN_SCRIPT;
2229                 } else {
2230                     return result;
2231                 }
2232             }
2233             UnicodeSet exemplars = getExemplarSet(cldrFile, "");
2234             Set<String> CLDRScripts = getScriptsFromUnicodeSet(exemplars);
2235             CLDRScripts.remove(UNKNOWN_SCRIPT);
2236             if (CLDRScripts.size() == 1) {
2237                 return result = CLDRScripts.iterator().next();
2238             } else if (CLDRScripts.size() == 0) {
2239                 System.out.println("**Failed to get script for:\t" + locale);
2240                 return result = UNKNOWN_SCRIPT;
2241             } else {
2242                 System.out.println(
2243                         "**Failed, too many scripts for:\t" + locale + ", " + CLDRScripts);
2244                 return result = UNKNOWN_SCRIPT;
2245             }
2246         } finally {
2247             if (result.equals(UNKNOWN_SCRIPT)) {
2248                 String temp = LANGUAGE_OVERRIDES.get(locale);
2249                 if (temp != null) {
2250                     result = new LanguageTagParser().set(temp).getScript();
2251                     System.err.println(
2252                             "***Warning, Getting script from LANGUAGE_OVERRIDES for "
2253                                     + locale
2254                                     + " => "
2255                                     + result);
2256                 }
2257             }
2258             localeToScriptCache.put(locale, result);
2259             if (SHOW_ADD)
2260                 System.out.println(
2261                         "Script:\t"
2262                                 + locale
2263                                 + "\t"
2264                                 + english.getName(locale)
2265                                 + "\t=>\t"
2266                                 + result
2267                                 + "\t"
2268                                 + english.getName(CLDRFile.SCRIPT_NAME, result));
2269         }
2270     }
2271 
2272     // private static Map<String, String> closeMapping(Map<String, String> fluffup) {
2273     // if (SHOW_ADD) System.out.flush();
2274     // Map<String,String> temp = new TreeMap<String,String>();
2275     // while (true) {
2276     // temp.clear();
2277     // for (String locale : fluffup.keySet()) {
2278     // String target = fluffup.get(locale);
2279     // if (target.equals("si_Sinh") || target.equals("zh-Hani")) {
2280     // System.out.println("????");
2281     // }
2282     // String furtherTarget = fluffup.get(target);
2283     // if (furtherTarget == null) {
2284     // continue;
2285     // }
2286     // addIfNotIn(locale, furtherTarget, temp, null, "Close");
2287     // }
2288     // if (temp.size() == 0) {
2289     // break;
2290     // }
2291     // fluffup.putAll(temp);
2292     // }
2293     // if (SHOW_ADD) System.out.flush();
2294     // return temp;
2295     // }
2296 
getScriptsFromUnicodeSet(UnicodeSet exemplars)2297     public static Set<String> getScriptsFromUnicodeSet(UnicodeSet exemplars) {
2298         // use bits first, since that's faster
2299         BitSet scriptBits = new BitSet();
2300         boolean show = false;
2301         for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next(); ) {
2302             if (show) System.out.println(Integer.toHexString(it.codepoint));
2303             if (it.codepoint != UnicodeSetIterator.IS_STRING) {
2304                 scriptBits.set(UScript.getScript(it.codepoint));
2305             } else {
2306                 int cp;
2307                 for (int i = 0; i < it.string.length(); i += UTF16.getCharCount(cp)) {
2308                     scriptBits.set(UScript.getScript(cp = UTF16.charAt(it.string, i)));
2309                 }
2310             }
2311         }
2312         scriptBits.clear(UScript.COMMON);
2313         scriptBits.clear(UScript.INHERITED);
2314         Set<String> scripts = new TreeSet<>();
2315         for (int j = 0; j < scriptBits.size(); ++j) {
2316             if (scriptBits.get(j)) {
2317                 scripts.add(UScript.getShortName(j));
2318             }
2319         }
2320         return scripts;
2321     }
2322 
getExemplarSet(CLDRFile cldrfile, String type)2323     public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) {
2324         if (type.length() != 0) type = "[@type=\"" + type + "\"]";
2325         String v = cldrfile.getStringValue("//ldml/characters/exemplarCharacters" + type);
2326         if (v == null) return new UnicodeSet();
2327         return new UnicodeSet(v);
2328     }
2329 
2330     // private static String[][] SpecialCases = {
2331     // { "zh_Hani", "zh_Hans_CN"},
2332     // { "si_Sinh", "si_Sinh_LK"},
2333     // { "ii", "ii_CN"}, // Sichuan Yi (Yi)
2334     // { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics)
2335     // { LocaleNames.UND, "en"}, // English default
2336     // };
2337 
showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent)2338     static void showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent) {
2339         Set<String> errors = new LinkedHashSet<>();
2340         Map<String, String> oldDefaultContent =
2341                 SupplementalDataInfo.makeLocaleToDefaultContents(
2342                         ConvertLanguageData.supplementalData.getDefaultContentLocales(),
2343                         new TreeMap<String, String>(),
2344                         errors);
2345         if (!errors.isEmpty()) {
2346             System.out.println(Joiner.on("\n").join(errors));
2347             errors.clear();
2348         }
2349         Map<String, String> newDefaultContent =
2350                 SupplementalDataInfo.makeLocaleToDefaultContents(
2351                         defaultLocaleContent, new TreeMap<String, String>(), errors);
2352         if (!errors.isEmpty()) {
2353             System.out.println("Default Content errors: " + Joiner.on("\n").join(errors));
2354             errors.clear();
2355         }
2356         Set<String> changes =
2357                 compareMapsAndFixNew(
2358                         "*WARNING* Default Content: ",
2359                         oldDefaultContent,
2360                         newDefaultContent,
2361                         "ar",
2362                         "ar_001");
2363         System.out.println(Joiner.on("\n").join(changes));
2364         defaultLocaleContent.clear();
2365         defaultLocaleContent.addAll(newDefaultContent.values());
2366         newDefaultContent =
2367                 SupplementalDataInfo.makeLocaleToDefaultContents(
2368                         defaultLocaleContent, new TreeMap<String, String>(), errors);
2369         if (!errors.isEmpty()) {
2370             System.out.println("***New Errors: " + Joiner.on("\n").join(errors));
2371         }
2372     }
2373 
compareMapsAndFixNew( String title, Map<String, String> oldContent, Map<String, String> newContent, String... allowedOverrideValues)2374     private static Set<String> compareMapsAndFixNew(
2375             String title,
2376             Map<String, String> oldContent,
2377             Map<String, String> newContent,
2378             String... allowedOverrideValues) {
2379         Map<String, String> allowedOverrideValuesTest = new HashMap<>();
2380         for (int i = 0; i < allowedOverrideValues.length; i += 2) {
2381             allowedOverrideValuesTest.put(allowedOverrideValues[i], allowedOverrideValues[i + 1]);
2382         }
2383         Set<String> changes = new TreeSet<>();
2384         for (String parent :
2385                 Builder.with(new TreeSet<String>())
2386                         .addAll(newContent.keySet())
2387                         .addAll(oldContent.keySet())
2388                         .get()) {
2389             String oldValue = oldContent.get(parent);
2390             String newValue = newContent.get(parent);
2391             String overrideValue = allowedOverrideValuesTest.get(parent);
2392             if (overrideValue != null) {
2393                 newContent.put(parent, overrideValue);
2394                 newValue = overrideValue;
2395             }
2396             if (CldrUtility.equals(oldValue, newValue)) {
2397                 continue;
2398             }
2399             String message;
2400             if (oldValue == null) {
2401                 message =
2402                         "Adding "
2403                                 + ConvertLanguageData.getLanguageCodeAndName(parent)
2404                                 + " => "
2405                                 + ConvertLanguageData.getLanguageCodeAndName(newValue);
2406                 newContent.put(parent, newValue);
2407             } else if (newValue == null) {
2408                 if (SUPPRESS_CHANGES) {
2409                     message =
2410                             "Suppressing removal of "
2411                                     + ConvertLanguageData.getLanguageCodeAndName(parent)
2412                                     + " => "
2413                                     + ConvertLanguageData.getLanguageCodeAndName(oldValue);
2414                     newContent.put(parent, oldValue);
2415                 } else {
2416                     message =
2417                             "Removing "
2418                                     + ConvertLanguageData.getLanguageCodeAndName(parent)
2419                                     + " => "
2420                                     + ConvertLanguageData.getLanguageCodeAndName(oldValue);
2421                     newContent.remove(oldValue);
2422                 }
2423             } else {
2424                 if (SUPPRESS_CHANGES) {
2425                     message =
2426                             "Suppressing change of "
2427                                     + ConvertLanguageData.getLanguageCodeAndName(parent)
2428                                     + " => "
2429                                     + ConvertLanguageData.getLanguageCodeAndName(oldValue)
2430                                     + " to "
2431                                     + ConvertLanguageData.getLanguageCodeAndName(newValue);
2432                     newContent.remove(newValue);
2433                     newContent.put(parent, oldValue);
2434                 } else {
2435                     message =
2436                             "Changing "
2437                                     + ConvertLanguageData.getLanguageCodeAndName(parent)
2438                                     + " => "
2439                                     + ConvertLanguageData.getLanguageCodeAndName(oldValue)
2440                                     + " to "
2441                                     + ConvertLanguageData.getLanguageCodeAndName(newValue);
2442                     newContent.remove(oldValue);
2443                     newContent.put(parent, newValue);
2444                 }
2445             }
2446             changes.add(title + message);
2447         }
2448         return changes;
2449     }
2450 
2451     public static class LocaleStringComparator implements Comparator<String> {
2452         LanguageTagParser ltp0 = new LanguageTagParser();
2453         LanguageTagParser ltp1 = new LanguageTagParser();
2454 
2455         @Override
compare(String arg0, String arg1)2456         public int compare(String arg0, String arg1) {
2457             ltp0.set(arg0);
2458             ltp1.set(arg1);
2459             String s0 = ltp0.getLanguage();
2460             String s1 = ltp1.getLanguage();
2461             int result = s0.compareTo(s1);
2462             if (result != 0) {
2463                 return s0.equals(LocaleNames.UND) ? 1 : s1.equals(LocaleNames.UND) ? -1 : result;
2464             }
2465             s0 = ltp0.getScript();
2466             s1 = ltp1.getScript();
2467             result = s0.compareTo(s1);
2468             if (result != 0) {
2469                 return result;
2470             }
2471             s0 = ltp0.getRegion();
2472             s1 = ltp1.getRegion();
2473             result = s0.compareTo(s1);
2474             if (result != 0) {
2475                 return result;
2476             }
2477             return arg0.compareTo(arg1); // just in case
2478         }
2479     }
2480 }
2481