xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.collect.ImmutableSet;
5 import com.google.common.math.DoubleMath;
6 import com.ibm.icu.impl.Relation;
7 import com.ibm.icu.impl.Row;
8 import com.ibm.icu.impl.Row.R2;
9 import com.ibm.icu.text.Collator;
10 import com.ibm.icu.text.NumberFormat;
11 import com.ibm.icu.text.RuleBasedCollator;
12 import com.ibm.icu.text.UTF16;
13 import com.ibm.icu.util.ULocale;
14 import java.io.BufferedReader;
15 import java.io.File;
16 import java.io.IOException;
17 import java.io.PrintWriter;
18 import java.nio.file.Files;
19 import java.nio.file.StandardCopyOption;
20 import java.text.ParseException;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collection;
24 import java.util.Collections;
25 import java.util.Comparator;
26 import java.util.EnumMap;
27 import java.util.HashMap;
28 import java.util.HashSet;
29 import java.util.Iterator;
30 import java.util.LinkedHashSet;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.Set;
34 import java.util.TreeMap;
35 import java.util.TreeSet;
36 import java.util.regex.Matcher;
37 import org.unicode.cldr.draft.FileUtilities;
38 import org.unicode.cldr.draft.ScriptMetadata;
39 import org.unicode.cldr.draft.ScriptMetadata.IdUsage;
40 import org.unicode.cldr.draft.ScriptMetadata.Info;
41 import org.unicode.cldr.util.Builder;
42 import org.unicode.cldr.util.CLDRFile;
43 import org.unicode.cldr.util.CLDRPaths;
44 import org.unicode.cldr.util.CldrUtility;
45 import org.unicode.cldr.util.Factory;
46 import org.unicode.cldr.util.Iso639Data;
47 import org.unicode.cldr.util.Iso639Data.Scope;
48 import org.unicode.cldr.util.Iso639Data.Source;
49 import org.unicode.cldr.util.Iso639Data.Type;
50 import org.unicode.cldr.util.LanguageTagCanonicalizer;
51 import org.unicode.cldr.util.LanguageTagParser;
52 import org.unicode.cldr.util.LocaleIDParser;
53 import org.unicode.cldr.util.LocaleIDParser.Level;
54 import org.unicode.cldr.util.Pair;
55 import org.unicode.cldr.util.PatternCache;
56 import org.unicode.cldr.util.SpreadSheet;
57 import org.unicode.cldr.util.StandardCodes;
58 import org.unicode.cldr.util.StandardCodes.LstrType;
59 import org.unicode.cldr.util.SupplementalDataInfo;
60 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
61 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
62 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
63 import org.unicode.cldr.util.TransliteratorUtilities;
64 import org.unicode.cldr.util.Validity;
65 import org.unicode.cldr.util.Validity.Status;
66 import org.unicode.cldr.util.XPathParts;
67 import org.unicode.cldr.util.XPathParts.Comments;
68 
69 /**
70  * @author markdavis
71  */
72 public class ConvertLanguageData {
73 
74     private static final boolean DEBUG = false;
75     // change this if you need to override what is generated for the default contents.
76     private static final List<String> defaultOverrides = Arrays.asList("es_ES".split("\\s+"));
77 
78     public static final boolean SHOW_DIFF = false;
79 
80     private static final boolean ALLOW_SMALL_NUMBERS = true;
81 
82     static final Comparator<String> GENERAL_COLLATOR = new GeneralCollator();
83     static final Comparator<String> INVERSE_GENERAL = new InverseComparator<>(GENERAL_COLLATOR);
84 
85     private static StandardCodes sc = StandardCodes.make();
86 
87     static final double populationFactor = 1;
88     static final double gdpFactor = 1;
89     static final int BAD_COUNTRY_NAME = 0,
90             COUNTRY_CODE = 1,
91             COUNTRY_POPULATION = 2,
92             COUNTRY_LITERACY = 3,
93             COUNTRY_GDP = 4,
94             OFFICIAL_STATUS = 5,
95             BAD_LANGUAGE_NAME = 6,
96             LANGUAGE_CODE = 7,
97             LANGUAGE_POPULATION = 8,
98             LANGUAGE_LITERACY = 9,
99             COMMENT = 10,
100             NOTES = 11;
101     static final Map<String, CodeAndPopulation> languageToMaxCountry = new TreeMap<>();
102     static final Map<String, CodeAndPopulation> languageToMaxScript = new TreeMap<>();
103 
104     private static final double NON_OFFICIAL_WEIGHT = 0.40;
105 
106     private static final boolean SHOW_OLD_DEFAULT_CONTENTS = false;
107 
108     private static final ImmutableSet<String> scriptAssumedLocales =
109             ImmutableSet.of(
110                     "bm_ML", "ha_GH", "ha_NE", "ha_NG", "kk_KZ", "ks_IN", "ky_KG", "mn_MN", "ms_BN",
111                     "ms_MY", "ms_SG", "tk_TM", "tzm_MA", "ug_CN");
112 
113     static Set<String> skipLocales =
114             new HashSet<>(
115                     Arrays.asList(
116                             "sh sh_BA sh_CS sh_YU characters supplementalData supplementalData-old supplementalData-old2 supplementalData-old3 supplementalMetadata root"
117                                     .split("\\s")));
118 
119     static Map<String, String> defaultContent = new TreeMap<>();
120 
121     static Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
122     static CLDRFile english = cldrFactory.make("en", true);
123 
124     static SupplementalDataInfo supplementalData =
125             SupplementalDataInfo.getInstance(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY);
126 
main(String[] args)127     public static void main(String[] args) throws IOException, ParseException {
128         final File oldSupp =
129                 new File(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY, "supplementalData.xml");
130         final File genSupp =
131                 new File(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalData.xml");
132         final File genLsraw =
133                 new File(CLDRPaths.GEN_DIRECTORY + "/supplemental", "language_script.tsv");
134         try (final BufferedReader oldFile = FileUtilities.openUTF8Reader(oldSupp);
135                 final PrintWriter newFile = FileUtilities.openUTF8Writer(genSupp);
136                 final PrintWriter newLsraw = FileUtilities.openUTF8Writer(genLsraw); ) {
137             // load elements we care about
138             CldrUtility.copyUpTo(
139                     oldFile, PatternCache.get("\\s*<languageData>\\s*"), newFile, false);
140 
141             Set<String> available = cldrFactory.getAvailable();
142 
143             Set<String> cldrParents = getCldrParents(available);
144 
145             List<String> failures = new ArrayList<>();
146             Map<String, RowData> localeToRowData = new TreeMap<>();
147 
148             Set<RowData> sortedInput = getExcelData(failures, localeToRowData);
149 
150             // get the locales (including parents)
151             Set<String> localesWithData = new TreeSet<>(localeToRowData.keySet());
152             for (String locale : localeToRowData.keySet()) {
153                 while (true) {
154                     String parent = LocaleIDParser.getParent(locale);
155                     if (parent == null) break;
156                     localesWithData.add(parent);
157                     locale = parent;
158                 }
159             }
160 
161             final LanguageTagParser languageTagParser = new LanguageTagParser();
162 
163             for (String localeRaw : available) {
164                 String locale = languageTagCanonicalizer.transform(localeRaw);
165                 if (!localesWithData.contains(locale)) {
166                     CLDRFile locFile = cldrFactory.make(localeRaw, false);
167                     if (locFile.isAliasedAtTopLevel()) {
168                         continue;
169                     }
170                     if (scriptAssumedLocales.contains(locale)) {
171                         continue;
172                     }
173                     languageTagParser.set(locale);
174                     if (languageTagParser.getVariants().size() != 0) {
175                         continue;
176                     }
177                     String withoutScript = languageTagParser.setScript("").toString();
178                     if (!localesWithData.contains(withoutScript)) {
179                         String region = new LanguageTagParser().set(locale).getRegion();
180                         if (StandardCodes.isCountry(region)) {
181                             BadItem.ERROR.show(
182                                     "missing language/population data for CLDR locale",
183                                     locale + " = " + getLanguageCodeAndName(locale));
184                         }
185                     } else {
186                         // These exceptions are OK, because these locales by default use the
187                         // non-default script
188                         Set<String> OKExceptions =
189                                 ImmutableSet.of("sr_Cyrl_ME", "zh_Hans_HK", "zh_Hans_MO");
190                         if (OKExceptions.contains(locale)) {
191                             continue;
192                         }
193                         BadItem.ERROR.show(
194                                 "missing language/population data for CLDR locale",
195                                 locale
196                                         + " = "
197                                         + getLanguageCodeAndName(locale)
198                                         + " but have data for "
199                                         + getLanguageCodeAndName(withoutScript));
200                     }
201                 }
202             }
203 
204             // TODO sort by country code, then functionalPopulation, then language code
205             // and keep the top country for each language code (even if < 1%)
206 
207             addLanguageScriptData();
208 
209             // showAllBasicLanguageData(allLanguageData, "old");
210             getLanguage2Scripts(sortedInput);
211 
212             writeNewBasicData2(newFile, sortedInput);
213             // writeNewBasicData(sortedInput);
214 
215             writeTerritoryLanguageData(newFile, failures, sortedInput);
216 
217             checkBasicData(localeToRowData);
218 
219             Set<String> defaultLocaleContent = new TreeSet<>();
220 
221             showDefaults(cldrParents, nf, defaultContent, localeToRowData, defaultLocaleContent);
222 
223             // showContent(available);
224 
225             // certain items are overridden
226 
227             List<String> toRemove = new ArrayList<>();
228             for (String override : defaultOverrides) {
229                 String replacement = getReplacement(override, defaultLocaleContent);
230                 if (replacement != null) {
231                     toRemove.add(replacement);
232                 }
233             }
234             defaultLocaleContent.removeAll(toRemove);
235             defaultLocaleContent.addAll(defaultOverrides);
236 
237             showFailures(failures);
238 
239             CldrUtility.copyUpTo(
240                     oldFile, PatternCache.get("\\s*</territoryInfo>\\s*"), null, false);
241             CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<references>\\s*"), newFile, false);
242             // generateIso639_2Data(newFile);
243             references.printReferences(newFile);
244             CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</references>\\s*"), null, false);
245             CldrUtility.copyUpTo(oldFile, null, newFile, false);
246 
247             getLanguageScriptSpreadsheet(newLsraw);
248 
249             // Only write if there's no exception.
250         } catch (Exception e) {
251             e.printStackTrace();
252             return;
253         }
254 
255         System.out.println("Wrote: " + genLsraw);
256         System.out.println("Wrote: " + genSupp);
257         System.out.println("Moving " + genSupp + " to " + oldSupp);
258         Files.move(genSupp.toPath(), oldSupp.toPath(), StandardCopyOption.REPLACE_EXISTING);
259         System.out.println("DONE");
260     }
261 
getLanguageCodeAndName(String code)262     public static String getLanguageCodeAndName(String code) {
263         if (code == null) return null;
264         return english.getName(code) + " [" + code + "]";
265     }
266 
getReplacement(String oldDefault, Set<String> defaultLocaleContent)267     private static String getReplacement(String oldDefault, Set<String> defaultLocaleContent) {
268         String parent = LocaleIDParser.getParent(oldDefault);
269         for (String replacement : defaultLocaleContent) {
270             if (replacement.startsWith(parent)) {
271                 if (parent.equals(LocaleIDParser.getParent(replacement))) {
272                     return replacement;
273                 }
274             }
275         }
276         return null;
277     }
278 
getLanguageScriptSpreadsheet(PrintWriter out)279     private static void getLanguageScriptSpreadsheet(PrintWriter out) {
280         out.println("#Lcode\tLanguageName\tStatus\tScode\tScriptName\tReferences");
281         Pair<String, String> languageScript = new Pair<>("", "");
282         for (String language : language_status_scripts.keySet()) {
283             Relation<BasicLanguageData.Type, String> status_scripts =
284                     language_status_scripts.get(language);
285             for (BasicLanguageData.Type status : status_scripts.keySet()) {
286                 for (String script : status_scripts.getAll(status)) {
287                     String reference =
288                             language_script_references.get(
289                                     languageScript.setFirst(language).setSecond(script));
290                     out.println(
291                             language
292                                     + "\t"
293                                     + getLanguageName(language)
294                                     + "\t"
295                                     + status
296                                     + "\t"
297                                     + script
298                                     + "\t"
299                                     + getDisplayScript(script)
300                                     + (reference == null ? "" : "\t" + reference));
301                 }
302             }
303         }
304     }
305 
306     /**
307      * Write data in format: <languageData> <language type="aa" scripts="Latn" territories="DJ ER
308      * ET"/>
309      *
310      * @param sortedInput
311      */
writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput)312     private static void writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput) {
313         double cutoff = 0.2; // 20%
314 
315         // Relation<String, BasicLanguageData> newLanguageData = new Relation(new TreeMap(),
316         // TreeSet.class);
317         LanguageTagParser ltp = new LanguageTagParser();
318         Map<String, Relation<BasicLanguageData.Type, String>> language_status_territories =
319                 new TreeMap<>();
320         // Map<String, Pair<String, String>> languageToBestCountry;
321         for (RowData rowData : sortedInput) {
322             if (rowData.countryCode.equals("ZZ")) continue;
323             ltp.set(rowData.languageCode);
324             String languageCode = ltp.getLanguage();
325             Relation<BasicLanguageData.Type, String> status_territories =
326                     language_status_territories.get(languageCode);
327             if (status_territories == null) {
328                 language_status_territories.put(
329                         languageCode,
330                         status_territories =
331                                 Relation.of(
332                                         new TreeMap<BasicLanguageData.Type, Set<String>>(),
333                                         TreeSet.class));
334             }
335             if (rowData.officialStatus.isMajor()) {
336                 status_territories.put(BasicLanguageData.Type.primary, rowData.countryCode);
337             } else if (rowData.officialStatus.isOfficial()
338                     || rowData.getLanguagePopulation() >= cutoff * rowData.countryPopulation
339                     || rowData.getLanguagePopulation() >= 1000000) {
340                 status_territories.put(BasicLanguageData.Type.secondary, rowData.countryCode);
341             }
342         }
343 
344         Set<String> allLanguages = new TreeSet<>(language_status_territories.keySet());
345         allLanguages.addAll(language_status_scripts.keySet());
346         // now add all the remaining language-script info
347         // <language type="sv" scripts="Latn" territories="AX FI SE"/>
348         Set<String> warnings = new LinkedHashSet<>();
349         out.println("\t<languageData>");
350         for (String languageSubtag : allLanguages) {
351             Relation<BasicLanguageData.Type, String> status_scripts =
352                     language_status_scripts.get(languageSubtag);
353             Relation<BasicLanguageData.Type, String> status_territories =
354                     language_status_territories.get(languageSubtag);
355 
356             // check against old:
357             Map<BasicLanguageData.Type, BasicLanguageData> oldData =
358                     supplementalData.getBasicLanguageDataMap(languageSubtag);
359             if (oldData == null) {
360                 oldData = Collections.emptyMap();
361             }
362 
363             EnumMap<BasicLanguageData.Type, BasicLanguageData> newData =
364                     new EnumMap<>(BasicLanguageData.Type.class);
365             for (BasicLanguageData.Type status : BasicLanguageData.Type.values()) {
366                 Set<String> scripts = status_scripts == null ? null : status_scripts.getAll(status);
367                 Set<String> territories =
368                         status_territories == null ? null : status_territories.getAll(status);
369                 if (scripts == null && territories == null) continue;
370                 BasicLanguageData bld = new BasicLanguageData();
371                 bld.setTerritories(territories);
372                 bld.setScripts(scripts);
373                 bld.setType(status);
374                 bld.freeze();
375                 newData.put(status, bld);
376             }
377 
378             // compare
379             if (!CldrUtility.equals(oldData.entrySet(), newData.entrySet())) {
380                 for (String problem : compare(oldData, newData)) {
381                     warnings.add(
382                             BadItem.DETAIL.toString(
383                                     "changing <languageData>",
384                                     languageSubtag + "\t" + english.getName(languageSubtag),
385                                     problem));
386                 }
387             }
388 
389             for (BasicLanguageData bld : newData.values()) {
390                 Set<String> scripts = bld.getScripts();
391                 Set<String> territories = bld.getTerritories();
392                 BasicLanguageData.Type status = bld.getType();
393                 out.println(
394                         "\t\t<language type=\""
395                                 + languageSubtag
396                                 + "\""
397                                 + (scripts.isEmpty()
398                                         ? ""
399                                         : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"")
400                                 + (territories.isEmpty()
401                                         ? ""
402                                         : " territories=\""
403                                                 + CldrUtility.join(territories, " ")
404                                                 + "\"")
405                                 + (status == BasicLanguageData.Type.primary
406                                         ? ""
407                                         : " alt=\"secondary\"")
408                                 + "/>");
409             }
410         }
411         out.println("\t</languageData>");
412         for (String s : warnings) {
413             if (s.contains("!")) {
414                 System.out.println(s);
415             }
416         }
417         for (String s : warnings) {
418             if (!s.contains("!")) {
419                 System.out.println(s);
420             }
421         }
422     }
423 
compare( Map<BasicLanguageData.Type, BasicLanguageData> oldData, Map<BasicLanguageData.Type, BasicLanguageData> newData)424     private static List<String> compare(
425             Map<BasicLanguageData.Type, BasicLanguageData> oldData,
426             Map<BasicLanguageData.Type, BasicLanguageData> newData) {
427         Map<String, BasicLanguageData.Type> oldDataToType = getDataToType(oldData.values(), true);
428         Map<String, BasicLanguageData.Type> newDataToType = getDataToType(newData.values(), true);
429         List<String> result = new ArrayList<>();
430         StringBuilder temp = new StringBuilder();
431         for (String s :
432                 Builder.with(new LinkedHashSet<String>())
433                         .addAll(oldDataToType.keySet())
434                         .addAll(newDataToType.keySet())
435                         .get()) {
436             BasicLanguageData.Type oldValue = oldDataToType.get(s);
437             BasicLanguageData.Type newValue = newDataToType.get(s);
438             if (!CldrUtility.equals(oldValue, newValue)) {
439                 temp.setLength(0);
440                 temp.append("[")
441                         .append(s)
442                         .append(":")
443                         .append(english.getName(s.length() == 4 ? "script" : "region", s))
444                         .append("] ");
445                 if (oldValue == null) {
446                     temp.append(" added as ").append(newValue);
447                 } else if (newValue == null) {
448                     temp.append(" REMOVED!");
449                 } else if (oldValue == BasicLanguageData.Type.primary) {
450                     temp.append(" DOWNGRADED TO! ").append(newValue);
451                 } else {
452                     temp.append(" upgraded to ").append(newValue);
453                 }
454                 result.add(temp.toString());
455             }
456         }
457         result.add(newData.toString());
458         return result;
459     }
460 
getDataToType( Collection<BasicLanguageData> collection, boolean script)461     private static Map<String, BasicLanguageData.Type> getDataToType(
462             Collection<BasicLanguageData> collection, boolean script) {
463         Map<String, BasicLanguageData.Type> result = new TreeMap<>();
464         for (BasicLanguageData i : collection) {
465             for (String s : i.getScripts()) {
466                 result.put(s, i.getType());
467             }
468             for (String s : i.getTerritories()) {
469                 result.put(s, i.getType());
470             }
471         }
472         return result;
473     }
474 
checkBasicData(Map<String, RowData> localeToRowData)475     private static void checkBasicData(Map<String, RowData> localeToRowData) {
476         // find languages with multiple scripts
477         Relation<String, String> languageToScripts =
478                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
479         for (String languageSubtag : language2BasicLanguageData.keySet()) {
480             for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
481                 languageToScripts.putAll(
482                         StandardCodes.fixLanguageTag(languageSubtag), item.getScripts());
483             }
484         }
485         // get primary combinations
486         Set<String> primaryCombos = new TreeSet<>();
487         Set<String> basicCombos = new TreeSet<>();
488         for (String languageSubtag : language2BasicLanguageData.keySet()) {
489             for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
490                 Set<String> scripts = new TreeSet<>();
491                 scripts.addAll(item.getScripts());
492                 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), scripts);
493                 if (scripts.size() == 0) {
494                     scripts.add("Zzzz");
495                 }
496                 Set<String> territories = new TreeSet<>();
497                 territories.addAll(item.getTerritories());
498                 if (territories.size() == 0) {
499                     territories.add("ZZ");
500                     continue;
501                 }
502 
503                 for (String script : scripts) {
504                     for (String territory : territories) {
505                         String locale =
506                                 StandardCodes.fixLanguageTag(languageSubtag)
507                                         // + (script.equals("Zzzz") ? "" :
508                                         // languageToScripts.getAll(languageSubtag).size() <= 1 ? ""
509                                         // : "_" + script)
510                                         + (territories.equals("ZZ") ? "" : "_" + territory);
511                         if (item.getType() != BasicLanguageData.Type.secondary) {
512                             primaryCombos.add(locale);
513                         }
514                         basicCombos.add(locale);
515                     }
516                 }
517             }
518         }
519         Set<String> populationOver20 = new TreeSet<>();
520         Set<String> population = new TreeSet<>();
521         LanguageTagParser ltp = new LanguageTagParser();
522         for (String rawLocale : localeToRowData.keySet()) {
523             ltp.set(rawLocale);
524             String locale =
525                     ltp.getLanguage()
526                             + (ltp.getRegion().length() == 0 ? "" : "_" + ltp.getRegion());
527             population.add(locale);
528             RowData rowData = localeToRowData.get(rawLocale);
529             if (rowData.getLanguagePopulation() / rowData.countryPopulation >= 0.2
530             // || rowData.getLanguagePopulation() > 900000
531             ) {
532                 populationOver20.add(locale);
533             } else {
534                 PopulationData popData =
535                         supplementalData.getLanguageAndTerritoryPopulationData(
536                                 ltp.getLanguageScript(), ltp.getRegion());
537                 if (popData != null && popData.getOfficialStatus().isOfficial()) {
538                     populationOver20.add(locale);
539                 }
540             }
541         }
542         Set<String> inBasicButNotPopulation = new TreeSet<>(primaryCombos);
543 
544         inBasicButNotPopulation.removeAll(population);
545         for (String locale : inBasicButNotPopulation) {
546             ltp.set(locale);
547             String region = ltp.getRegion();
548             String language = ltp.getLanguage();
549             if (!sc.isModernLanguage(language)) continue;
550             PopulationData popData = supplementalData.getPopulationDataForTerritory(region);
551             // Afghanistan AF "29,928,987" 28.10% "21,500,000,000" Hazaragi haz "1,770,000" 28.10%
552             BadItem.WARNING.show(
553                     "In Basic Data but not Population > 20%",
554                     getDisplayCountry(region)
555                             + "\t"
556                             + region
557                             + "\t\""
558                             + formatNumber(popData.getPopulation(), 0, false)
559                             + "\""
560                             + "\t\""
561                             + formatPercent(
562                                     popData.getLiteratePopulation() / popData.getPopulation(),
563                                     0,
564                                     false)
565                             + "\""
566                             + "\t\""
567                             + formatPercent(popData.getGdp(), 0, false)
568                             + "\""
569                             + "\t"
570                             + ""
571                             + "\t"
572                             + getLanguageName(language)
573                             + "\t"
574                             + language
575                             + "\t"
576                             + -1
577                             + "\t\""
578                             + formatPercent(
579                                     popData.getLiteratePopulation() / popData.getPopulation(),
580                                     0,
581                                     false)
582                             + "\"");
583         }
584 
585         Set<String> inPopulationButNotBasic = new TreeSet<>(populationOver20);
586         inPopulationButNotBasic.removeAll(basicCombos);
587         for (Iterator<String> it = inPopulationButNotBasic.iterator(); it.hasNext(); ) {
588             String locale = it.next();
589             if (locale.endsWith("_ZZ")) {
590                 it.remove();
591             }
592         }
593         for (String locale : inPopulationButNotBasic) {
594             BadItem.WARNING.show(
595                     "In Population>20% but not Basic Data",
596                     locale + " " + getLanguageName(locale), localeToRowData.get(locale).toString());
597         }
598     }
599 
600     static class LanguageInfo {
601         static LanguageInfo INSTANCE = new LanguageInfo();
602 
603         Map<String, Set<String>> languageToScripts = new TreeMap<>();
604         Map<String, Set<String>> languageToRegions = new TreeMap<>();
605         Map<String, Comments> languageToComments = new TreeMap<>();
606 
607         Map<String, Set<String>> languageToScriptsAlt = new TreeMap<>();
608         Map<String, Set<String>> languageToRegionsAlt = new TreeMap<>();
609         Map<String, Comments> languageToCommentsAlt = new TreeMap<>();
610 
LanguageInfo()611         private LanguageInfo() {
612             cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
613             // Set<String> available = cldrFactory.getAvailable();
614             CLDRFile supplemental = cldrFactory.make("supplementalData", true);
615             for (Iterator<String> it =
616                             supplemental.iterator("//supplementalData/languageData/language");
617                     it.hasNext(); ) {
618                 String xpath = it.next();
619                 XPathParts parts = XPathParts.getFrozenInstance(xpath);
620                 Map<String, String> x = parts.getAttributes(-1);
621                 boolean alt = x.containsKey("alt");
622                 String lang = x.get("type");
623                 List<String> scripts = getAttributeList(x, "scripts");
624                 if (scripts != null) {
625                     if (alt) {
626                         putAll(languageToScriptsAlt, lang, new LinkedHashSet<>(scripts));
627                     } else {
628                         putAll(languageToScripts, lang, new LinkedHashSet<>(scripts));
629                     }
630                 }
631                 List<String> regions = getAttributeList(x, "territories");
632                 if (regions != null) {
633                     if (alt) {
634                         putAll(languageToRegionsAlt, lang, new LinkedHashSet<>(regions));
635                     } else {
636                         putAll(languageToRegions, lang, new LinkedHashSet<>(regions));
637                     }
638                 }
639             }
640         }
641 
getAttributeList(Map<String, String> x, String attribute)642         private List<String> getAttributeList(Map<String, String> x, String attribute) {
643             List<String> scripts = null;
644             String scriptString = x.get(attribute);
645             if (scriptString != null) {
646                 scripts = Arrays.asList(scriptString.split("\\s+"));
647             }
648             return scripts;
649         }
650     }
651 
putUnique(Map<K, V> map, K key, V value)652     private static <K, V> void putUnique(Map<K, V> map, K key, V value) {
653         V oldValue = map.get(key);
654         if (oldValue != null && !oldValue.equals(value)) {
655             throw new IllegalArgumentException(
656                     "Duplicate value for <" + key + ">: <" + oldValue + ">, <" + value + ">");
657         }
658         map.put(key, value);
659     }
660 
putAll(Map<K, Set<W>> map, K key, Set<W> values)661     private static <K, W> void putAll(Map<K, Set<W>> map, K key, Set<W> values) {
662         Set<W> oldValue = map.get(key);
663         if (oldValue == null) {
664             map.put(key, values);
665         } else {
666             oldValue.addAll(values);
667         }
668     }
669 
670     // public enum OfficialStatus {unknown, de_facto_official, official, official_regional,
671     // official_minority};
672 
673     static class RowData implements Comparable<Object> {
674         private final String countryCode;
675         private final double countryGdp;
676         private final double countryLiteracy;
677         private final double countryPopulation;
678         private final String languageCode;
679         private final OfficialStatus officialStatus;
680         private final double languagePopulation;
681         private final double languageLiteracy;
682         private final String comment;
683         private final String notes;
684         private final String badLanguageName;
685         private final boolean relativeLanguagePopulation;
686         // String badLanguageCode = "";
687         private static final Set<String> doneCountries = new HashSet<>();
688 
689         private static final Set<String> countryCodes = sc.getGoodAvailableCodes("territory");
690 
RowData(String country, String language)691         public RowData(String country, String language) {
692             this.countryCode = country;
693             this.languageCode = language;
694             badLanguageName = country = language = notes = comment = "";
695             officialStatus = OfficialStatus.unknown;
696             countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000);
697             countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d;
698             countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue();
699             languagePopulation = languageLiteracy = Double.NaN;
700             relativeLanguagePopulation = false;
701         }
702 
RowData(List<String> row)703         RowData(List<String> row) throws ParseException {
704             countryCode = fixCountryCode(row.get(COUNTRY_CODE), row);
705 
706             if (!countryCodes.contains(countryCode)) {
707                 System.err.println("WRONG COUNTRY CODE: " + row);
708             }
709 
710             double countryPopulation1 = parseDecimal(row.get(COUNTRY_POPULATION));
711             double countryLiteracy1 = parsePercent(row.get(COUNTRY_LITERACY), countryPopulation1);
712 
713             countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000);
714             countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d;
715             countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue();
716 
717             String officialStatusString = row.get(OFFICIAL_STATUS).trim().replace(' ', '_');
718             if (officialStatusString.equals("national")) {
719                 officialStatusString = "official";
720             } else if (officialStatusString.equals("regional_official")) {
721                 officialStatusString = "official_regional";
722             } else if (officialStatusString.length() == 0
723                     || officialStatusString.equals("uninhabited")) {
724                 officialStatusString = "unknown";
725             }
726             try {
727                 officialStatus = OfficialStatus.valueOf(officialStatusString);
728             } catch (RuntimeException e) {
729                 throw new IllegalArgumentException(
730                         "Can't interpret offical-status: " + officialStatusString);
731             }
732 
733             String languageCode1 = row.get(LANGUAGE_CODE);
734             if (languageCode1.startsWith("*") || languageCode1.startsWith("\u00A7")) {
735                 languageCode1 = languageCode1.substring(1);
736             }
737             languageCode = fixLanguageCode(languageCode1, row);
738 
739             if (doneCountries.contains(countryCode) == false) {
740                 // showDiff(countryGdp1, countryGdp);
741                 // showDiff(countryLiteracy1, countryLiteracy);
742                 if (SHOW_DIFF) showDiff(countryPopulation1, countryPopulation, 0.1, false);
743                 doneCountries.add(countryCode);
744             }
745 
746             double languagePopulation1 =
747                     parsePercent(row.get(LANGUAGE_POPULATION), countryPopulation1)
748                             * countryPopulation1;
749             if ((officialStatus.isMajor())
750                     && languagePopulation1 * 100 < countryPopulation
751                     && languagePopulation1 < 1000000) {
752                 BadItem.WARNING.show(
753                         "official language has population < 1% of country & < 1,000,000",
754                         languageCode + ", " + Math.round(languagePopulation1), row);
755             }
756             if (languagePopulation1 < 0.999) {
757                 BadItem.WARNING.show(
758                         "suspect language population, < 1",
759                         languageCode + ", " + Math.round(languagePopulation1),
760                         row);
761             }
762             if (languagePopulation1 > 10000) {
763                 relativeLanguagePopulation = true;
764                 languagePopulation1 =
765                         languagePopulation1 * countryPopulation / countryPopulation1; // correct the
766                 // values
767             } else {
768                 relativeLanguagePopulation = false;
769             }
770             if (isApproximatelyGreater(languagePopulation1, countryPopulation, 0.0001)) {
771                 BadItem.ERROR.show(
772                         "language population > country population",
773                         Math.round(languagePopulation1) + " > " + countryPopulation,
774                         row);
775             }
776             languagePopulation =
777                     languagePopulation1 < countryPopulation
778                             ? languagePopulation1
779                             : countryPopulation;
780 
781             if (SHOW_DIFF)
782                 showDiff(
783                         languagePopulation1 / countryPopulation1,
784                         languagePopulation / countryPopulation,
785                         0.01,
786                         true);
787 
788             String stringLanguageLiteracy =
789                     row.size() <= LANGUAGE_LITERACY ? "" : row.get(LANGUAGE_LITERACY);
790             double languageLiteracy1 =
791                     stringLanguageLiteracy.length() == 0
792                             ? countryLiteracy
793                             : parsePercent(stringLanguageLiteracy, languagePopulation);
794             if (isApproximatelyEqual(languageLiteracy1, countryLiteracy1, 0.001)) {
795                 languageLiteracy1 = countryLiteracy; // correct the values
796             }
797             languageLiteracy = languageLiteracy1;
798 
799             if (row.size() > COMMENT) {
800                 comment = row.get(COMMENT);
801             } else {
802                 comment = "";
803             }
804             if (row.size() > NOTES) {
805                 notes = row.get(NOTES);
806             } else {
807                 notes = "";
808             }
809             badLanguageName = row.get(BAD_LANGUAGE_NAME);
810         }
811 
showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang)812         private void showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang) {
813             final double diff = new_a / a - 1;
814             if (Math.abs(diff) > maxRelativeDiff) {
815                 System.out.println(
816                         formatPercent(diff, 0, false)
817                                 + "\t"
818                                 + countryCode
819                                 + "\t"
820                                 + getDisplayCountry(countryCode)
821                                 + (showLang
822                                         ? "\t"
823                                                 + languageCode
824                                                 + "\t"
825                                                 + ConvertLanguageData.getLanguageName(languageCode)
826                                         : "")
827                                 + "\t"
828                                 + formatNumber(a, 0, false)
829                                 + "\t=>\t"
830                                 + formatNumber(new_a, 0, false));
831             }
832         }
833 
roundToPartsPer(double a, double whole)834         private double roundToPartsPer(double a, double whole) {
835             // break this out just to make it easier to follow.
836             double log10 = Math.log10(a / whole);
837             long digitsFound = (long) (log10);
838             long factor = (long) (Math.pow(10, digitsFound));
839             double rounded = Math.round(a / factor);
840             double result = rounded * factor;
841             // if (Math.abs(result - a) >= 1) {
842             // System.out.println("Rounding " + a + " => " + result);
843             // }
844             return result;
845         }
846 
isApproximatelyEqual(double a, double b, double epsilon)847         private static boolean isApproximatelyEqual(double a, double b, double epsilon) {
848             return a == b || Math.abs(a - b) < epsilon;
849         }
850 
isApproximatelyGreater(double a, double b, double epsilon)851         private static boolean isApproximatelyGreater(double a, double b, double epsilon) {
852             return a > b + epsilon;
853         }
854 
parseDecimal(String numericRepresentation)855         double parseDecimal(String numericRepresentation) throws ParseException {
856             try {
857                 // if (numericRepresentation == null || numericRepresentation.length() == 0) return
858                 // Double.NaN;
859                 Number result = nf.parse(numericRepresentation);
860                 // if (result == null) return Double.NaN;
861                 return result.doubleValue();
862             } catch (ParseException e) {
863                 throw e;
864                 // (RuntimeException) new IllegalArgumentException("can't parse <" +
865                 // numericRepresentation +
866                 // ">").initCause(e);
867             }
868         }
869 
parsePercent(String numericRepresentation, double baseValue)870         double parsePercent(String numericRepresentation, double baseValue) throws ParseException {
871             try {
872                 double result;
873                 if (numericRepresentation.contains("%")) {
874                     Number result0 = pf.parse(numericRepresentation);
875                     result = result0.doubleValue();
876                 } else {
877                     Number result0 = nf.parse(numericRepresentation);
878                     result = result0.doubleValue() / baseValue;
879                 }
880                 // if (numericRepresentation == null || numericRepresentation.length() == 0) return
881                 // Double.NaN;
882                 // if (result == null) return Double.NaN;
883                 return result;
884             } catch (ParseException e) {
885                 throw e;
886                 // (RuntimeException) new IllegalArgumentException("can't parse <" +
887                 // numericRepresentation +
888                 // ">").initCause(e);
889             }
890         }
891 
getLanguageLiteratePopulation()892         public double getLanguageLiteratePopulation() {
893             return languageLiteracy * languagePopulation;
894         }
895 
896         /**
897          * Get the weighted population
898          *
899          * @param weightIfNotOfficial
900          * @return
901          */
getLanguageLiteratePopulation(double weightIfNotOfficial)902         public double getLanguageLiteratePopulation(double weightIfNotOfficial) {
903             double result = languageLiteracy * languagePopulation;
904             if (!officialStatus.isMajor()) {
905                 result *= weightIfNotOfficial;
906             }
907             return result;
908         }
909 
910         @Override
compareTo(Object o)911         public int compareTo(Object o) {
912             RowData that = (RowData) o;
913             int result;
914             if (0 != (result = GENERAL_COLLATOR.compare(countryCode, that.countryCode)))
915                 return result;
916             if (languagePopulation > that.languagePopulation) return -1; // descending
917             if (languagePopulation < that.languagePopulation) return 1;
918             if (0 != (result = GENERAL_COLLATOR.compare(languageCode, that.languageCode)))
919                 return result;
920             return 0;
921         }
922 
toStringHeader()923         public static String toStringHeader() {
924             return "countryCode"
925                     + "\t"
926                     + "countryPopulation"
927                     + "\t"
928                     + "countryGdp"
929                     + "\t"
930                     + "countryLiteracy"
931                     + "\t"
932                     + "languagePopulation"
933                     + "\t"
934                     + "languageCode"
935                     + "\t"
936                     + "writingPopulation";
937         }
938 
939         @Override
toString()940         public String toString() {
941             return countryCode
942                     + "\t"
943                     + countryPopulation
944                     + "\t"
945                     + countryGdp
946                     + "\t"
947                     + countryLiteracy
948                     + "\t"
949                     + languagePopulation
950                     + "\t"
951                     + languageCode
952                     + "\t"
953                     + languageLiteracy;
954         }
955 
toString(boolean b)956         public String toString(boolean b) {
957             return "region:\t"
958                     + getCountryCodeAndName(countryCode)
959                     + "\tpop:\t"
960                     + countryPopulation
961                     + "\tgdp:\t"
962                     + countryGdp
963                     + "\tlit:\t"
964                     + countryLiteracy
965                     + "\tlang:\t"
966                     + getLanguageCodeAndName(languageCode)
967                     + "\tpop:\t"
968                     + languagePopulation
969                     + "\tlit:\t"
970                     + languageLiteracy;
971         }
972 
973         static boolean MARK_OUTPUT = false;
974 
getLanguageCode()975         public String getLanguageCode() {
976             if (languageCode.contains("_")) return languageCode;
977             Source source = Iso639Data.getSource(languageCode);
978             if (source == null) {
979                 return "§" + languageCode;
980             }
981             if (MARK_OUTPUT) {
982                 if (source == Source.ISO_639_3) {
983                     return "*" + languageCode;
984                 }
985             }
986             return languageCode;
987         }
988 
989         static Map<String, String> oldToFixed = new HashMap<>();
990 
getLanguageName()991         public String getLanguageName() {
992             String cldrResult = getExcelQuote(english.getName(languageCode, true));
993             //            String result = getLanguageName2();
994             //            if (!result.equalsIgnoreCase(cldrResult)) {
995             //                if (null == oldToFixed.put(result, cldrResult)) {
996             //                    System.out.println("## " + result + "!=" + cldrResult);
997             //                }
998             //            }
999             return cldrResult;
1000         }
1001 
getLanguageName2()1002         public String getLanguageName2() {
1003             String result = new ULocale(languageCode).getDisplayName();
1004             if (!result.equals(languageCode)) return getExcelQuote(result);
1005             Set<String> names = Iso639Data.getNames(languageCode);
1006             if (names != null && names.size() != 0) {
1007                 if (MARK_OUTPUT) {
1008                     return getExcelQuote("*" + names.iterator().next());
1009                 } else {
1010                     return getExcelQuote(names.iterator().next());
1011                 }
1012             }
1013             return getExcelQuote("§" + badLanguageName);
1014         }
1015 
getCountryName()1016         public String getCountryName() {
1017             return getExcelQuote(getDisplayCountry(countryCode));
1018         }
1019 
getCountryGdpString()1020         public String getCountryGdpString() {
1021             return getExcelQuote(formatNumber(countryGdp, 0, false));
1022         }
1023 
getCountryLiteracyString()1024         public String getCountryLiteracyString() {
1025             return formatPercent(countryLiteracy, 2, false);
1026         }
1027 
getCountryPopulationString()1028         public String getCountryPopulationString() {
1029             return getExcelQuote(formatNumber(countryPopulation, 0, false));
1030         }
1031 
getLanguageLiteracyString()1032         public String getLanguageLiteracyString() {
1033             return formatPercent(languageLiteracy, 2, false);
1034         }
1035 
getLanguagePopulationString()1036         public String getLanguagePopulationString() {
1037 
1038             try {
1039                 final double percent = languagePopulation / countryPopulation;
1040                 return getExcelQuote(
1041                         relativeLanguagePopulation && percent > 0.03 && languagePopulation > 10000
1042                                 ? formatPercent(percent, 2, false)
1043                                 : formatNumber(languagePopulation, 3, false));
1044             } catch (IllegalArgumentException e) {
1045                 return "NaN";
1046             }
1047         }
1048 
getLanguagePopulation()1049         private double getLanguagePopulation() {
1050             return languagePopulation;
1051         }
1052     }
1053 
getExcelQuote(String comment)1054     public static String getExcelQuote(String comment) {
1055         return comment == null || comment.length() == 0
1056                 ? ""
1057                 : comment.contains(",")
1058                         ? '"' + comment + '"'
1059                         : comment.contains("\"")
1060                                 ? '"' + comment.replace("\"", "\"\"") + '"'
1061                                 : comment;
1062     }
1063 
getCountryCodeAndName(String code)1064     public static String getCountryCodeAndName(String code) {
1065         if (code == null) return null;
1066         return english.getName(CLDRFile.TERRITORY_NAME, code) + " [" + code + "]";
1067     }
1068 
1069     static class RowComparator implements Comparator<RowData> {
1070         @Override
compare(RowData me, RowData that)1071         public int compare(RowData me, RowData that) {
1072             int result;
1073             if (0
1074                     != (result =
1075                             GENERAL_COLLATOR.compare(me.getCountryName(), that.getCountryName())))
1076                 return result;
1077             if (0
1078                     != (result =
1079                             GENERAL_COLLATOR.compare(me.getLanguageName(), that.getLanguageName())))
1080                 return result;
1081             return me.compareTo(that);
1082         }
1083     }
1084 
writeTerritoryLanguageData( PrintWriter out, List<String> failures, Set<RowData> sortedInput)1085     private static void writeTerritoryLanguageData(
1086             PrintWriter out, List<String> failures, Set<RowData> sortedInput) {
1087 
1088         String lastCountryCode = "";
1089         boolean first = true;
1090         LanguageTagParser ltp = new LanguageTagParser();
1091 
1092         out.println(
1093                 " <!-- See http://unicode.org/cldr/data/diff/supplemental/territory_language_information.html for more information on territoryInfo. -->");
1094         out.println("\t<territoryInfo>");
1095 
1096         for (RowData row : sortedInput) {
1097             String countryCode = row.countryCode;
1098 
1099             double countryPopulationRaw = row.countryPopulation;
1100             double countryPopulation =
1101                     countryPopulationRaw; // (long) Utility.roundToDecimals(countryPopulationRaw,
1102             // 2);
1103             double languageLiteracy = row.languageLiteracy;
1104             double countryLiteracy = row.countryLiteracy;
1105 
1106             double countryGDPRaw = row.countryGdp;
1107             long countryGDP = Math.round(countryGDPRaw / gdpFactor);
1108 
1109             String languageCode = row.languageCode;
1110 
1111             double languagePopulationRaw = row.getLanguagePopulation();
1112             double languagePopulation =
1113                     languagePopulationRaw; // (long) Utility.roundToDecimals(languagePopulationRaw,
1114             // 2);
1115 
1116             double languagePopulationPercent = languagePopulation / countryPopulation;
1117             // Utility.roundToDecimals(Math.min(100, Math.max(0,
1118             // languagePopulation*100 / (double)countryPopulation)),3);
1119 
1120             if (!countryCode.equals(lastCountryCode)) {
1121                 if (first) {
1122                     first = false;
1123                 } else {
1124                     out.println("\t\t</territory>");
1125                 }
1126                 out.print(
1127                         "\t\t<territory type=\""
1128                                 + countryCode
1129                                 + "\""
1130                                 + " gdp=\""
1131                                 + formatNumber(countryGDP, 4, true)
1132                                 + "\""
1133                                 + " literacyPercent=\""
1134                                 + formatPercent(countryLiteracy, 3, true)
1135                                 + "\""
1136                                 + " population=\""
1137                                 + formatNumber(countryPopulation, 6, true)
1138                                 + "\">");
1139                 lastCountryCode = countryCode;
1140                 out.println("\t<!--" + getDisplayCountry(countryCode) + "-->");
1141             }
1142 
1143             if (languageCode.length() != 0
1144                     && languagePopulationPercent > 0.0000
1145                     && (ALLOW_SMALL_NUMBERS
1146                             || languagePopulationPercent >= 1
1147                             || languagePopulationRaw > 100000
1148                             || languageCode.equals("haw")
1149                             || row.officialStatus.isOfficial())) {
1150                 // add best case
1151                 addBestRegion(languageCode, countryCode, languagePopulationRaw);
1152                 String baseScriptLanguage = ltp.set(languageCode).getLanguageScript();
1153                 if (!baseScriptLanguage.equals(languageCode)) {
1154                     addBestRegion(baseScriptLanguage, countryCode, languagePopulationRaw);
1155                 }
1156                 String baseLanguage = ltp.set(baseScriptLanguage).getLanguage();
1157                 if (!baseLanguage.equals(baseScriptLanguage)) {
1158                     addBestRegion(baseLanguage, countryCode, languagePopulationRaw);
1159                     addBestScript(
1160                             baseLanguage, ltp.set(languageCode).getScript(), languagePopulationRaw);
1161                 }
1162 
1163                 if (languageLiteracy != countryLiteracy) {
1164                     int debug = 0;
1165                 }
1166                 out.print(
1167                         "\t\t\t<languagePopulation type=\""
1168                                 + languageCode
1169                                 + "\""
1170                                 + (DoubleMath.fuzzyCompare(
1171                                                         languageLiteracy, countryLiteracy, 0.0001)
1172                                                 == 0
1173                                         ? ""
1174                                         : (DoubleMath.fuzzyCompare(languageLiteracy, 0.05, 0.0001)
1175                                                                 == 0
1176                                                         ? " writingPercent=\""
1177                                                         : " literacyPercent=\"")
1178                                                 + formatPercent(languageLiteracy, 2, true)
1179                                                 + "\"")
1180                                 + " populationPercent=\""
1181                                 + formatPercent(languagePopulationPercent, 2, true)
1182                                 + "\""
1183                                 + (row.officialStatus.isOfficial()
1184                                         ? " officialStatus=\"" + row.officialStatus + "\""
1185                                         : "")
1186                                 + references.addReference(row.notes)
1187                                 + "/>");
1188                 out.println("\t<!--" + getLanguageName(languageCode) + "-->");
1189             } else if (!row.countryCode.equals("ZZ")) {
1190                 failures.add(
1191                         BadItem.ERROR.toString(
1192                                 "too few speakers: suspect line",
1193                                 languageCode,
1194                                 row.toString(true)));
1195             }
1196             // if (first) {
1197             if (false)
1198                 System.out.print(
1199                         "countryCode: "
1200                                 + countryCode
1201                                 + "\t"
1202                                 + "countryPopulation: "
1203                                 + countryPopulation
1204                                 + "\t"
1205                                 + "countryGDP: "
1206                                 + countryGDP
1207                                 + "\t"
1208                                 + "languageCode: "
1209                                 + languageCode
1210                                 + "\t"
1211                                 + "languagePopulation: "
1212                                 + languagePopulation
1213                                 + CldrUtility.LINE_SEPARATOR);
1214             // }
1215         }
1216 
1217         out.println("\t\t</territory>");
1218         out.println("\t</territoryInfo>");
1219     }
1220 
getDisplayCountry(String countryCode)1221     private static String getDisplayCountry(String countryCode) {
1222         String result = getULocaleCountryName(countryCode);
1223         if (!result.equals(countryCode)) {
1224             return result;
1225         }
1226         result = sc.getData("territory", countryCode);
1227         if (result != null) {
1228             return result;
1229         }
1230         return countryCode;
1231         // new ULocale("und-" + countryCode).getDisplayCountry()
1232     }
1233 
getDisplayScript(String scriptCode)1234     private static String getDisplayScript(String scriptCode) {
1235         String result = getULocaleScriptName(scriptCode);
1236         if (!result.equals(scriptCode)) {
1237             return result;
1238         }
1239         result = sc.getData("territory", scriptCode);
1240         if (result != null) {
1241             return result;
1242         }
1243         return scriptCode;
1244         // new ULocale("und-" + countryCode).getDisplayCountry()
1245     }
1246 
getLanguageName(String languageCode)1247     private static String getLanguageName(String languageCode) {
1248         String result = getULocaleLocaleName(languageCode);
1249         if (!result.equals(languageCode)) return result;
1250         Set<String> names = Iso639Data.getNames(languageCode);
1251         if (names != null && names.size() != 0) {
1252             return names.iterator().next();
1253         }
1254         return languageCode;
1255     }
1256 
1257     static class References {
1258         Map<String, Pair<String, String>> Rxxx_to_reference = new TreeMap<>();
1259         Map<Pair<String, String>, String> reference_to_Rxxx = new TreeMap<>();
1260         Map<String, Pair<String, String>> Rxxx_to_oldReferences = supplementalData.getReferences();
1261         Map<Pair<String, String>, String> oldReferences_to_Rxxx = new TreeMap<>();
1262 
1263         {
1264             for (String Rxxx : Rxxx_to_oldReferences.keySet()) {
Rxxx_to_oldReferences.get(Rxxx)1265                 oldReferences_to_Rxxx.put(Rxxx_to_oldReferences.get(Rxxx), Rxxx);
1266             }
1267         }
1268 
1269         Matcher URI = PatternCache.get("([a-z]+\\://[\\S]+)\\s?(.*)").matcher("");
1270 
1271         static int referenceStart = 1000;
1272 
1273         /**
1274          * Returns " references=\"" + Rxxx + "\"" or "" if there is no reference.
1275          *
1276          * @param rawReferenceText
1277          * @return
1278          */
addReference(String rawReferenceText)1279         private String addReference(String rawReferenceText) {
1280             if (rawReferenceText == null || rawReferenceText.length() == 0) return "";
1281             Pair<String, String> p;
1282             if (URI.reset(rawReferenceText).matches()) {
1283                 p =
1284                         new Pair<>(
1285                                         URI.group(1),
1286                                         URI.group(2) == null || URI.group(2).length() == 0
1287                                                 ? "[missing]"
1288                                                 : URI.group(2))
1289                                 .freeze();
1290             } else {
1291                 p = new Pair<String, String>(null, rawReferenceText).freeze();
1292             }
1293 
1294             String Rxxx = reference_to_Rxxx.get(p);
1295             if (Rxxx == null) { // add new
1296                 Rxxx = oldReferences_to_Rxxx.get(p);
1297                 if (Rxxx != null) { // if old, just keep number
1298                     p = Rxxx_to_oldReferences.get(Rxxx);
1299                 } else { // find an empty number
1300                     while (true) {
1301                         Rxxx = "R" + (referenceStart++);
1302                         if (Rxxx_to_reference.get(Rxxx) == null
1303                                 && Rxxx_to_oldReferences.get(Rxxx) == null) {
1304                             break;
1305                         }
1306                     }
1307                 }
1308                 // add to new references
1309                 reference_to_Rxxx.put(p, Rxxx);
1310                 Rxxx_to_reference.put(Rxxx, p);
1311             }
1312             // references="R034"
1313             return " references=\"" + Rxxx + "\"";
1314         }
1315 
getReferenceHTML(String Rxxx)1316         String getReferenceHTML(String Rxxx) {
1317             Pair<String, String> p = Rxxx_to_reference.get(Rxxx); // exception if fails.
1318             String uri = p.getFirst();
1319             String value = p.getSecond();
1320             uri =
1321                     uri == null
1322                             ? ""
1323                             : " uri=\"" + TransliteratorUtilities.toHTML.transliterate(uri) + "\"";
1324             value =
1325                     value == null
1326                             ? "[missing]"
1327                             : TransliteratorUtilities.toHTML.transliterate(value);
1328             return "\t\t<reference type=\"" + Rxxx + "\"" + uri + ">" + value + "</reference>";
1329         }
1330 
printReferences(PrintWriter out)1331         void printReferences(PrintWriter out) {
1332             // <reference type="R034" uri="isbn:0-321-18578-1">The Unicode Standard 4.0</reference>
1333             out.println("\t<references>");
1334             for (String Rxxx : Rxxx_to_reference.keySet()) {
1335                 out.println(getReferenceHTML(Rxxx));
1336             }
1337             out.println("\t</references>");
1338         }
1339     }
1340 
1341     static References references = new References();
1342 
getExcelData( List<String> failures, Map<String, RowData> localeToRowData)1343     private static Set<RowData> getExcelData(
1344             List<String> failures, Map<String, RowData> localeToRowData) throws IOException {
1345 
1346         LanguageTagParser ltp = new LanguageTagParser();
1347 
1348         String dir = CLDRPaths.GEN_DIRECTORY + "supplemental/";
1349         final String countryLanguagePopulation = "country_language_population.tsv";
1350         System.out.println("\n# Problems in " + countryLanguagePopulation + "\n");
1351         List<List<String>> input =
1352                 SpreadSheet.convert(CldrUtility.getUTF8Data(countryLanguagePopulation));
1353 
1354         // TODO: Why is this called? Should it be sc.getGoodAvailableCodes?
1355         Set<String> languages = languagesNeeded; // sc.getGoodAvailableCodes("language");
1356 
1357         Set<String> territories = new TreeSet<>(sc.getGoodAvailableCodes("territory"));
1358         territories.removeAll(supplementalData.getContainers());
1359         // TODO: Why are these removed if they are "good" (per above function)?
1360         territories.remove("EU");
1361         territories.remove("QO");
1362 
1363         Set<String> countriesNotFound = new TreeSet<>(territories);
1364         Set<OfficialStatus> statusFound = new TreeSet<>();
1365         Set<String> countriesWithoutOfficial = new TreeSet<>(territories);
1366         countriesWithoutOfficial.remove("ZZ");
1367 
1368         Map<String, Row.R2<String, Double>> countryToLargestOfficialLanguage = new HashMap<>();
1369 
1370         Set<String> languagesNotFound = new TreeSet<>(languages);
1371         Set<RowData> sortedInput = new TreeSet<>();
1372         int count = 0;
1373         for (List<String> row : input) {
1374             ++count;
1375             if (count == 1 || row.size() <= COUNTRY_GDP) {
1376                 failures.add(join(row, "\t") + "\tShort row");
1377                 continue;
1378             }
1379             try {
1380                 RowData x = new RowData(row);
1381                 if (x.officialStatus.isOfficial()) {
1382                     Row.R2<String, Double> largestOffical =
1383                             countryToLargestOfficialLanguage.get(x.countryCode);
1384                     if (largestOffical == null) {
1385                         countryToLargestOfficialLanguage.put(
1386                                 x.countryCode, Row.of(x.languageCode, x.languagePopulation));
1387                     } else if (largestOffical.get1() < x.languagePopulation) {
1388                         largestOffical.set0(x.languageCode);
1389                         largestOffical.set1(x.languagePopulation);
1390                     }
1391                 }
1392                 if (x.officialStatus.isMajor() || x.countryPopulation < 1000) {
1393                     countriesWithoutOfficial.remove(x.countryCode);
1394                 }
1395                 if (!checkCode(LstrType.region, x.countryCode, row)) continue;
1396                 statusFound.add(x.officialStatus);
1397                 countriesNotFound.remove(x.countryCode);
1398                 languagesNotFound.remove(x.languageCode);
1399                 if (x.languageCode.contains("_")) {
1400                     ltp.set(x.languageCode);
1401                     languagesNotFound.remove(ltp.getLanguage());
1402                     if (!checkCode(LstrType.language, ltp.getLanguage(), row)) continue;
1403                     if (!checkCode(LstrType.script, ltp.getScript(), row)) continue;
1404                 }
1405                 String locale = x.languageCode + "_" + x.countryCode;
1406                 if (localeToRowData.get(locale) != null) {
1407                     BadItem.ERROR.show(
1408                             "duplicate data", x.languageCode + " with " + x.countryCode, row);
1409                 }
1410                 localeToRowData.put(locale, x);
1411                 sortedInput.add(x);
1412             } catch (ParseException e) {
1413                 failures.add(
1414                         join(row, "\t")
1415                                 + "\t"
1416                                 + e.getMessage()
1417                                 + "\t"
1418                                 + join(Arrays.asList(e.getStackTrace()), ";\t"));
1419             } catch (RuntimeException e) {
1420                 throw (RuntimeException)
1421                         new IllegalArgumentException("Failure on line " + count + ")\t" + row)
1422                                 .initCause(e);
1423             }
1424         }
1425         // System.out.println("Note: the following Status values were found in the data: " +
1426         // CldrUtility.join(statusFound, " | "));
1427 
1428         // make sure we have something
1429         for (String country : countriesNotFound) {
1430             RowData x = new RowData(country, "und");
1431             sortedInput.add(x);
1432         }
1433         for (String language : languagesNotFound) {
1434             RowData x = new RowData("ZZ", language);
1435             sortedInput.add(x);
1436         }
1437 
1438         for (RowData row : sortedInput) {
1439             // see which countries have languages that are larger than any offical language
1440 
1441             if (!row.officialStatus.isOfficial()) {
1442                 // String country = row.countryCode;
1443                 Row.R2<String, Double> largestOffical =
1444                         countryToLargestOfficialLanguage.get(row.countryCode);
1445                 if (largestOffical != null && largestOffical.get1() < row.languagePopulation) {
1446                     BadItem.WARNING.show(
1447                             "language population > all official languages",
1448                             getLanguageCodeAndName(largestOffical.get0()),
1449                             row.toString(true));
1450                 }
1451             }
1452 
1453             // see which countries are missing an official language
1454             if (!countriesWithoutOfficial.contains(row.countryCode)) continue;
1455             BadItem.ERROR.show(
1456                     "missing official language",
1457                     row.getCountryName() + "\t" + row.countryCode,
1458                     row.toString(true));
1459             countriesWithoutOfficial.remove(row.countryCode);
1460         }
1461 
1462         PrintWriter log = FileUtilities.openUTF8Writer(dir, countryLanguagePopulation);
1463         log.println(
1464                 "*\tCName"
1465                         + "\tCCode"
1466                         + "\tCPopulation"
1467                         + "\tCLiteracy"
1468                         + "\tCGdp"
1469                         + "\tOfficialStatus"
1470                         + "\tLanguage"
1471                         + "\tLCode"
1472                         + "\tLPopulation"
1473                         + "\tWritingPop"
1474                         + "\tReferences"
1475                         + "\tNotes");
1476         RowComparator rowSorting = new RowComparator();
1477         Set<RowData> rowSorted = new TreeSet<>(rowSorting);
1478         rowSorted.addAll(sortedInput);
1479 
1480         for (RowData row : rowSorted) {
1481             final String langLit = row.getLanguageLiteracyString();
1482             final String countryLit = row.getCountryLiteracyString();
1483             log.println(
1484                     row.getCountryName()
1485                             + "\t"
1486                             + row.countryCode
1487                             + "\t"
1488                             + row.getCountryPopulationString()
1489                             + "\t"
1490                             + countryLit
1491                             + "\t"
1492                             + row.getCountryGdpString()
1493                             + "\t"
1494                             + (row.officialStatus == OfficialStatus.unknown
1495                                     ? ""
1496                                     : row.officialStatus)
1497                             + "\t"
1498                             + row.getLanguageName()
1499                             + "\t"
1500                             + row.getLanguageCode()
1501                             + "\t"
1502                             + row.getLanguagePopulationString()
1503                             + "\t"
1504                             + (langLit.equals(countryLit) ? "" : langLit)
1505                             + "\t"
1506                             + getExcelQuote(row.comment)
1507                             + "\t"
1508                             + getExcelQuote(row.notes));
1509         }
1510         log.close();
1511         return sortedInput;
1512     }
1513 
getCldrParents(Set<String> available)1514     private static Set<String> getCldrParents(Set<String> available) {
1515         LanguageTagParser ltp2 = new LanguageTagParser();
1516         Set<String> cldrParents = new TreeSet<>();
1517         for (String locale : available) {
1518             if (skipLocales.contains(locale)) continue;
1519             try {
1520                 ltp2.set(locale);
1521             } catch (RuntimeException e) {
1522                 System.out.println("Skipping CLDR file: " + locale);
1523                 continue;
1524             }
1525             String locale2 = ltp2.getLanguageScript();
1526             if (locale2.equals("sh")) continue;
1527             // int lastPos = locale.lastIndexOf('_');
1528             // if (lastPos < 0) continue;
1529             // String locale2 = locale.substring(0,lastPos);
1530             cldrParents.add(locale2);
1531             languageToMaxCountry.put(locale2, null);
1532         }
1533         // System.out.println("CLDR Parents: " + cldrParents);
1534         return cldrParents;
1535     }
1536 
showFailures(List<String> failures)1537     private static void showFailures(List<String> failures) {
1538         if (failures.size() <= 1) {
1539             return;
1540         }
1541         System.out.println();
1542         System.out.println("Failures in Output");
1543         System.out.println();
1544 
1545         System.out.println(RowData.toStringHeader());
1546         for (String failure : failures) {
1547             System.out.println(failure);
1548         }
1549     }
1550 
getProcessedParent(String localeCode)1551     public static String getProcessedParent(String localeCode) {
1552         if (localeCode == null || localeCode.equals("root")) return null;
1553         int pos = localeCode.lastIndexOf('_');
1554         if (pos < 0) return "root";
1555         LanguageTagParser ltp = new LanguageTagParser();
1556         String script = ltp.set(localeCode).getScript();
1557         if (script.length() == 0) {
1558             return getFullyResolved(localeCode);
1559         }
1560         return localeCode.substring(0, pos);
1561     }
1562 
getFullyResolved(String languageCode)1563     private static String getFullyResolved(String languageCode) {
1564         String result = defaultContent.get(languageCode);
1565         if (result != null) return result;
1566         // we missed. Try taking parent and trying again
1567         int pos = languageCode.length() + 1;
1568         while (true) {
1569             pos = languageCode.lastIndexOf('_', pos - 1);
1570             if (pos < 0) {
1571                 return "***" + languageCode;
1572             }
1573             result = defaultContent.get(languageCode.substring(0, pos));
1574             if (result != null) {
1575                 LanguageTagParser ltp = new LanguageTagParser().set(languageCode);
1576                 LanguageTagParser ltp2 = new LanguageTagParser().set(result);
1577                 String region = ltp.getRegion();
1578                 if (region.length() == 0) {
1579                     ltp.setRegion(ltp2.getRegion());
1580                 }
1581                 String script = ltp.getScript();
1582                 if (script.length() == 0) {
1583                     ltp.setScript(ltp2.getScript());
1584                 }
1585                 return ltp.toString();
1586             }
1587         }
1588     }
1589 
1590     static Comparator<Iterable> firstElementComparator =
1591             new Comparator<Iterable>() {
1592                 @Override
1593                 public int compare(Iterable o1, Iterable o2) {
1594                     int result =
1595                             ((Comparable) o1.iterator().next()).compareTo((o2.iterator().next()));
1596                     assert result != 0;
1597                     return result;
1598                 }
1599             };
1600 
showDefaults( Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent, Map<String, RowData> localeToRowData, Set<String> defaultLocaleContent)1601     private static void showDefaults(
1602             Set<String> cldrParents,
1603             NumberFormat nf,
1604             Map<String, String> defaultContent,
1605             Map<String, RowData> localeToRowData,
1606             Set<String> defaultLocaleContent) {
1607 
1608         if (SHOW_OLD_DEFAULT_CONTENTS) {
1609             System.out.println();
1610             System.out.println("Computing Defaults Contents");
1611             System.out.println();
1612         }
1613 
1614         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
1615         Set<String> locales = new TreeSet<>(cldrFactory.getAvailable());
1616         LocaleIDParser lidp = new LocaleIDParser();
1617 
1618         // add all the combinations of language, script, and territory.
1619         for (String locale : localeToRowData.keySet()) {
1620             String baseLanguage = lidp.set(locale).getLanguage();
1621             if (locales.contains(baseLanguage) && !locales.contains(locale)) {
1622                 locales.add(locale);
1623                 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding: " + locale);
1624             }
1625         }
1626 
1627         // adding parents
1628         Set<String> toAdd = new TreeSet<>();
1629         while (true) {
1630             for (String locale : locales) {
1631                 String newguy = LocaleIDParser.getParent(locale);
1632                 if (newguy != null && !locales.contains(newguy) && !toAdd.contains(newguy)) {
1633                     toAdd.add(newguy);
1634                     if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding parent: " + newguy);
1635                 }
1636             }
1637             if (toAdd.size() == 0) {
1638                 break;
1639             }
1640             locales.addAll(toAdd);
1641             toAdd.clear();
1642         }
1643 
1644         // get sets of siblings
1645         Set<Set<String>> siblingSets = new TreeSet<>(firstElementComparator);
1646         Set<String> needsADoin = new TreeSet<>(locales);
1647 
1648         Set<String> deprecatedLanguages = new TreeSet<>();
1649         // TODO: why are these here and not read from metadata?
1650         deprecatedLanguages.add("sh");
1651         Set<String> deprecatedRegions = new TreeSet<>();
1652         // TODO: why are these here and not read from metadata?
1653         deprecatedRegions.add("YU");
1654         deprecatedRegions.add("CS");
1655         deprecatedRegions.add("ZZ");
1656 
1657         // first find all the language subtags that have scripts, and those we need to skip. Those
1658         // are aliased-only
1659         Set<String> skippingItems = new TreeSet<>();
1660         Set<String> hasAScript = new TreeSet<>();
1661         // Set<LocaleIDParser.Level> languageOnly = EnumSet.of(LocaleIDParser.Level.Language);
1662         for (String locale : locales) {
1663             lidp.set(locale);
1664             if (lidp.getScript().length() != 0) {
1665                 hasAScript.add(lidp.getLanguage());
1666             }
1667             Set<LocaleIDParser.Level> levels = lidp.getLevels();
1668             // must have no variants, must have either script or region, no deprecated elements
1669             if (levels.contains(LocaleIDParser.Level.Variants) // no variants
1670                     || !(levels.contains(LocaleIDParser.Level.Script)
1671                             || levels.contains(LocaleIDParser.Level.Region))
1672                     || deprecatedLanguages.contains(lidp.getLanguage())
1673                     || deprecatedRegions.contains(lidp.getRegion())) {
1674                 // skip language-only locales, and ones with variants
1675                 needsADoin.remove(locale);
1676                 skippingItems.add(locale);
1677                 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tremoving: " + locale);
1678                 continue;
1679             }
1680         }
1681         // walk through the locales, getting the ones we care about.
1682         Map<String, Double> scriptLocaleToLanguageLiteratePopulation = new TreeMap<>();
1683 
1684         for (String locale : new TreeSet<>(needsADoin)) {
1685             if (!needsADoin.contains(locale)) continue;
1686             lidp.set(locale);
1687             Set<Level> level = lidp.getLevels();
1688             // skip locales that need scripts and don't have them
1689             if (!level.contains(LocaleIDParser.Level.Script) // no script
1690                     && hasAScript.contains(lidp.getLanguage())) {
1691                 needsADoin.remove(locale);
1692                 skippingItems.add(locale);
1693                 continue;
1694             }
1695             // get siblings
1696             Set<String> siblingSet = lidp.getSiblings(needsADoin);
1697             // if it has a script and region
1698             if (level.contains(LocaleIDParser.Level.Script)
1699                     && level.contains(LocaleIDParser.Level.Region)) {
1700                 double languageLiteratePopulation = 0;
1701                 for (String localeID2 : siblingSet) {
1702                     RowData rowData = localeToRowData.get(localeID2);
1703                     if (rowData != null) {
1704                         languageLiteratePopulation +=
1705                                 rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT);
1706                     }
1707                 }
1708                 String parentID = LocaleIDParser.getParent(locale);
1709                 scriptLocaleToLanguageLiteratePopulation.put(parentID, languageLiteratePopulation);
1710             }
1711 
1712             try {
1713                 siblingSets.add(siblingSet);
1714             } catch (RuntimeException e) {
1715                 e.printStackTrace();
1716             }
1717             needsADoin.removeAll(siblingSet);
1718         }
1719         if (SHOW_OLD_DEFAULT_CONTENTS)
1720             System.out.println("ConvertLanguageData Skipping: " + skippingItems);
1721         if (needsADoin.size() != 0) {
1722             if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("Missing: " + needsADoin);
1723         }
1724 
1725         // walk through the data
1726         Set<String> skippingSingletons = new TreeSet<>();
1727 
1728         Set<String> missingData = new TreeSet<>();
1729         for (Set<String> siblingSet : siblingSets) {
1730             if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("** From siblings: " + siblingSet);
1731 
1732             if (false & siblingSet.size() == 1) {
1733                 skippingSingletons.add(siblingSet.iterator().next());
1734                 continue;
1735             }
1736             // get best
1737             double best = Double.NEGATIVE_INFINITY;
1738             String bestLocale = "???";
1739             Set<Pair<Double, String>> data = new TreeSet<>();
1740             LanguageTagParser ltp = new LanguageTagParser();
1741             for (String locale : siblingSet) {
1742                 RowData rowData = localeToRowData.get(locale);
1743                 double languageLiteratePopulation = -1;
1744                 if (rowData != null) {
1745                     languageLiteratePopulation =
1746                             rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT);
1747                 } else {
1748                     Double d = scriptLocaleToLanguageLiteratePopulation.get(locale);
1749                     if (d != null) {
1750                         languageLiteratePopulation = d;
1751                     } else {
1752                         final String region = ltp.set(locale).getRegion();
1753                         if (region.isEmpty() || StandardCodes.isCountry(region)) {
1754                             missingData.add(locale);
1755                         }
1756                     }
1757                 }
1758                 data.add(new Pair<>(languageLiteratePopulation, locale));
1759                 if (best < languageLiteratePopulation) {
1760                     best = languageLiteratePopulation;
1761                     bestLocale = locale;
1762                 }
1763             }
1764             // show it
1765             for (Pair<Double, String> datum : data) {
1766                 if (SHOW_OLD_DEFAULT_CONTENTS)
1767                     System.out.format(
1768                             "\tContenders: %s %f (based on literate population)"
1769                                     + CldrUtility.LINE_SEPARATOR,
1770                             datum.getSecond(),
1771                             datum.getFirst());
1772             }
1773             // System.out.format("\tPicking default content: %s %f (based on literate population)" +
1774             // Utility.LINE_SEPARATOR, bestLocale, best);
1775             // Hack to fix English
1776             // TODO Generalize in the future for other locales with non-primary scripts
1777             if (bestLocale.startsWith("en_")) {
1778                 defaultLocaleContent.add("en_US");
1779             } else {
1780                 defaultLocaleContent.add(bestLocale);
1781             }
1782         }
1783 
1784         for (String singleton : skippingSingletons) {
1785             BadItem.WARNING.show("skipping Singletons", singleton);
1786         }
1787         for (String missing : missingData) {
1788             BadItem.WARNING.show("Missing Data", missing);
1789         }
1790 
1791         // LanguageTagParser ltp = new LanguageTagParser();
1792         // Set<String> warnings = new LinkedHashSet();
1793         // for (String languageCode : languageToMaxCountry.keySet()) {
1794         // CodeAndPopulation best = languageToMaxCountry.get(languageCode);
1795         // String languageSubtag = ltp.set(languageCode).getLanguage();
1796         // String countryCode = "ZZ";
1797         // double rawLanguagePopulation = -1;
1798         // if (best != null) {
1799         // countryCode = best.code;
1800         // rawLanguagePopulation = best.population;
1801         // Set<String> regions = LanguageInfo.INSTANCE.languageToRegions.get(languageSubtag);
1802         // if (regions == null || !regions.contains(countryCode)) {
1803         // Set<String> regions2 = LanguageInfo.INSTANCE.languageToRegionsAlt.get(languageSubtag);
1804         // if (regions2 == null || !regions2.contains(countryCode)) {
1805         // warnings.add("WARNING: " + languageCode + " => " + countryCode + ", not in " + regions +
1806         // "/" + regions2);
1807         // }
1808         // }
1809         // }
1810         // String resolvedLanguageCode = languageCode + "_" + countryCode;
1811         // ltp.set(languageCode);
1812         // Set<String> scripts = LanguageInfo.INSTANCE.languageToScripts.get(languageCode);
1813         // String script = ltp.getScript();
1814         // if (script.length() == 0) {
1815         // CodeAndPopulation bestScript = languageToMaxScript.get(languageCode);
1816         // if (bestScript != null) {
1817         // script = bestScript.code;
1818         // if (scripts == null || !scripts.contains(script)) {
1819         // warnings.add("WARNING: " + languageCode + " => " + script + ", not in " + scripts);
1820         // }
1821         // } else {
1822         // script = "Zzzz";
1823         // if (scripts == null) {
1824         // scripts = LanguageInfo.INSTANCE.languageToScriptsAlt.get(languageCode);
1825         // }
1826         // if (scripts != null) {
1827         // script = scripts.iterator().next();
1828         // if (scripts.size() != 1) {
1829         // warnings.add("WARNING: " + languageCode + " => " + scripts);
1830         // }
1831         // }
1832         // }
1833         // if (scripts == null) {
1834         // warnings.add("Missing scripts for: " + languageCode);
1835         // } else if (scripts.size() == 1){
1836         // script = "";
1837         // }
1838         // resolvedLanguageCode = languageCode
1839         // + (script.length() == 0 ? "" : "_" + script)
1840         // + "_" + countryCode;
1841         // }
1842         //
1843         //
1844         // System.out.println(
1845         // resolvedLanguageCode
1846         // + "\t" + languageCode
1847         // + "\t" + ULocale.getDisplayName(languageCode, ULocale.ENGLISH)
1848         // + "\t" + countryCode
1849         // + "\t" + ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH)
1850         // + "\t" + formatNumber(rawLanguagePopulation)
1851         // + (cldrParents.contains(languageCode) ? "\tCLDR" : "")
1852         // );
1853         // if (languageCode.length() == 0) continue;
1854         // defaultContent.put(languageCode, resolvedLanguageCode);
1855         // }
1856         // for (String warning : warnings) {
1857         // System.out.println(warning);
1858         // }
1859     }
1860 
1861     // private static void printDefaultContent(Set<String> defaultLocaleContent) {
1862     // String sep = Utility.LINE_SEPARATOR + "\t\t\t";
1863     // String broken = Utility.breakLines(join(defaultLocaleContent," "), sep,
1864     // PatternCache.get("(\\S)\\S*").matcher(""),
1865     // 80);
1866     //
1867     // Log.println("\t\t<defaultContent locales=\"" + broken + "\"");
1868     // Log.println("\t\t/>");
1869     // }
1870 
getSuppressScript(String languageCode)1871     private static Object getSuppressScript(String languageCode) {
1872         // TODO Auto-generated method stub
1873         return null;
1874     }
1875 
join(Collection c, String separator)1876     public static String join(Collection c, String separator) {
1877         StringBuffer result = new StringBuffer();
1878         boolean first = true;
1879         for (Object x : c) {
1880             if (first) first = false;
1881             else result.append(separator);
1882             result.append(x);
1883         }
1884         return result.toString();
1885     }
1886 
addBestRegion( String languageCode, String countryCode, double languagePopulationRaw)1887     private static void addBestRegion(
1888             String languageCode, String countryCode, double languagePopulationRaw) {
1889         addBest(languageCode, languagePopulationRaw, countryCode, languageToMaxCountry);
1890     }
1891 
addBestScript( String languageCode, String scriptCode, double languagePopulationRaw)1892     private static void addBestScript(
1893             String languageCode, String scriptCode, double languagePopulationRaw) {
1894         addBest(languageCode, languagePopulationRaw, scriptCode, languageToMaxScript);
1895     }
1896 
addBest( String languageCode, double languagePopulationRaw, String code, Map<String, CodeAndPopulation> languageToMaxCode)1897     private static void addBest(
1898             String languageCode,
1899             double languagePopulationRaw,
1900             String code,
1901             Map<String, CodeAndPopulation> languageToMaxCode) {
1902         if (languageCode.length() == 0) {
1903             throw new IllegalArgumentException();
1904         }
1905         CodeAndPopulation best = languageToMaxCode.get(languageCode);
1906         if (best == null) {
1907             languageToMaxCode.put(languageCode, best = new CodeAndPopulation());
1908         } else if (best.population >= languagePopulationRaw) {
1909             return;
1910         }
1911         best.population = languagePopulationRaw;
1912         best.code = code;
1913     }
1914 
1915     static class CodeAndPopulation {
1916         String code = null;
1917         double population = Double.NaN;
1918 
1919         @Override
toString()1920         public String toString() {
1921             return "{" + code + "," + population + "}";
1922         }
1923     }
1924 
1925     public static class GeneralCollator implements Comparator<String> {
1926         static UTF16.StringComparator cpCompare = new UTF16.StringComparator(true, false, 0);
1927         static RuleBasedCollator UCA = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
1928 
1929         static {
1930             UCA.setNumericCollation(true);
1931         }
1932 
1933         @Override
compare(String s1, String s2)1934         public int compare(String s1, String s2) {
1935             if (s1 == null) {
1936                 return s2 == null ? 0 : -1;
1937             } else if (s2 == null) {
1938                 return 1;
1939             }
1940             int result = UCA.compare(s1, s2);
1941             if (result != 0) return result;
1942             return cpCompare.compare(s1, s2);
1943         }
1944     }
1945 
1946     public static class InverseComparator<T> implements Comparator<T> {
1947         private Comparator<T> other;
1948 
InverseComparator()1949         public InverseComparator() {
1950             this.other = null;
1951         }
1952 
InverseComparator(Comparator<T> other)1953         public InverseComparator(Comparator<T> other) {
1954             this.other = other;
1955         }
1956 
1957         @Override
compare(T a, T b)1958         public int compare(T a, T b) {
1959             return other == null ? ((Comparable) b).compareTo(a) : other.compare(b, a);
1960         }
1961     }
1962 
1963     static Set<String> languagesNeeded =
1964             new TreeSet<>(
1965                     // TODO: what is this list?
1966                     Arrays.asList(
1967                             "ab ba bh bi bo fj fy gd ha ht ik iu ks ku ky lg mi na no rm sa sd sg si sm sn su tg tk to tw vo yi za lb dv chr syr kha sco gv"
1968                                     .split("\\s")));
1969 
1970     /** Not called? */
1971     @Deprecated
generateIso639_2Data(PrintWriter out)1972     private static void generateIso639_2Data(PrintWriter out) {
1973         for (String languageSubtag : sc.getAvailableCodes("language")) {
1974             String alpha3 = Iso639Data.toAlpha3(languageSubtag);
1975             Type type = Iso639Data.getType(languageSubtag);
1976             Scope scope = Iso639Data.getScope(languageSubtag);
1977             if (type != null || alpha3 != null || scope != null) {
1978                 out.println(
1979                         "\t\t<languageCode type=\""
1980                                 + languageSubtag
1981                                 + "\""
1982                                 + (alpha3 == null ? "" : " iso639Alpha3=\"" + alpha3 + "\"")
1983                                 + (type == null ? "" : " iso639Type=\"" + type + "\"")
1984                                 + (scope == null ? "" : " iso639Scope=\"" + scope + "\"")
1985                                 + "/>");
1986             }
1987         }
1988     }
1989 
    // Language subtag -> BasicLanguageData entries, kept in a TreeMap of TreeSets.
    // Filled by addLanguageScriptData().
    static Relation<String, BasicLanguageData> language2BasicLanguageData =
            Relation.of(new TreeMap<String, Set<BasicLanguageData>>(), TreeSet.class);

    // Language -> (status -> scripts). Re-created on every call to getLanguage2Scripts().
    static Map<String, Relation<BasicLanguageData.Type, String>> language_status_scripts;
    // (language, script) -> reference text taken from language_script.tsv by getLanguage2Scripts().
    static Map<Pair<String, String>, String> language_script_references = new TreeMap<>();

    // Locale alias table obtained once from SupplementalDataInfo; getLanguage2Scripts() looks up
    // its "language" entry to detect deprecated language codes.
    static final Map<String, Map<String, R2<List<String>, String>>> LOCALE_ALIAS_INFO =
            SupplementalDataInfo.getInstance().getLocaleAliasInfo();
1998 
getLanguage2Scripts(Set<RowData> sortedInput)1999     static void getLanguage2Scripts(Set<RowData> sortedInput) throws IOException {
2000         language_status_scripts = new TreeMap<>();
2001 
2002         // // get current scripts
2003         // Relation<String,String> languageToDefaultScript = new Relation(new TreeMap(),
2004         // TreeSet.class);
2005         // Relation<String,String> secondaryLanguageToDefaultScript = new Relation(new TreeMap(),
2006         // TreeSet.class);
2007         // for (String languageSubtag : language2BasicLanguageData.keySet()) {
2008         // for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
2009         // for (String script : item.getScripts()) {
2010         // addLanguage2Script(languageSubtag, item.getType(), script);
2011         // }
2012         // }
2013         // }
2014         // System.out.println("Language 2 scripts: " + language_status_scripts);
2015 
2016         // #Lcode LanguageName Status Scode ScriptName References
2017         List<List<String>> input =
2018                 SpreadSheet.convert(CldrUtility.getUTF8Data("language_script.tsv"));
2019         System.out.println(
2020                 CldrUtility.LINE_SEPARATOR
2021                         + "# Problems in language_script.tsv"
2022                         + CldrUtility.LINE_SEPARATOR);
2023         // int count = -1;
2024         for (List<String> row : input) {
2025             try {
2026                 if (row.size() == 0) continue;
2027                 // ++count;
2028                 String language = row.get(0).trim();
2029                 if (language.length() == 0 || language.startsWith("#")) continue;
2030                 BasicLanguageData.Type status = BasicLanguageData.Type.valueOf(row.get(2));
2031                 String scripts = row.get(3);
2032                 if (!checkCode(LstrType.language, language, row)) continue;
2033                 for (String script : scripts.split("\\s+")) {
2034                     if (!checkCode(LstrType.script, script, row)) continue;
2035                     // if the script is not modern, demote
2036                     Info scriptInfo = ScriptMetadata.getInfo(script);
2037                     if (scriptInfo == null) {
2038                         BadItem.ERROR.toString(
2039                                 "illegal script; must be represented in Unicode, remove line or fix",
2040                                 script,
2041                                 row);
2042                         continue;
2043                     }
2044                     IdUsage idUsage = scriptInfo.idUsage;
2045                     if (status == BasicLanguageData.Type.primary
2046                             && idUsage != IdUsage.RECOMMENDED) {
2047                         if (idUsage == IdUsage.ASPIRATIONAL || idUsage == IdUsage.LIMITED_USE) {
2048                             BadItem.WARNING.toString(
2049                                     "Script has unexpected usage; make secondary if a Recommended script is used widely for the langauge",
2050                                     idUsage + ", " + script + "=" + getULocaleScriptName(script),
2051                                     row);
2052                         } else {
2053                             BadItem.ERROR.toString(
2054                                     "Script is not modern; make secondary",
2055                                     idUsage + ", " + script + "=" + getULocaleScriptName(script),
2056                                     row);
2057                             status = BasicLanguageData.Type.secondary;
2058                         }
2059                     }
2060 
2061                     // if the language is not modern, demote
2062                     if (LOCALE_ALIAS_INFO.get("language").containsKey(language)) {
2063                         BadItem.ERROR.toString(
2064                                 "Remove/Change deprecated language",
2065                                 language
2066                                         + " "
2067                                         + getLanguageName(language)
2068                                         + "; "
2069                                         + LOCALE_ALIAS_INFO.get("language").get(language),
2070                                 row);
2071                         continue;
2072                     }
2073                     if (status == BasicLanguageData.Type.primary
2074                             && !sc.isModernLanguage(language)) {
2075                         BadItem.ERROR.toString(
2076                                 "Should be secondary, language is not modern",
2077                                 language + " " + getLanguageName(language),
2078                                 row);
2079                         status = BasicLanguageData.Type.secondary;
2080                     }
2081 
2082                     addLanguage2Script(language, status, script);
2083                     if (row.size() > 5) {
2084                         String reference = row.get(5);
2085                         if (reference != null && reference.length() == 0) {
2086                             language_script_references.put(new Pair<>(language, script), reference);
2087                         }
2088                     }
2089                 }
2090             } catch (RuntimeException e) {
2091                 System.err.println(row);
2092                 throw e;
2093             }
2094         }
2095 
2096         // System.out.println("Language 2 scripts: " + language_status_scripts);
2097 
2098         for (String language : sc.getGoodAvailableCodes("language")) {
2099             if (supplementalData.getDeprecatedInfo("language", language) != null) {
2100                 continue;
2101             }
2102             Map<String, String> registryData = sc.getLangData("language", language);
2103             if (registryData != null) {
2104                 String suppressScript = registryData.get("Suppress-Script");
2105                 if (suppressScript == null) continue;
2106                 if (ScriptMetadata.getInfo(suppressScript) == null) {
2107                     // skip, not represented in Unicode
2108                     continue;
2109                 }
2110                 // if there is something already there, we have a problem.
2111                 Relation<BasicLanguageData.Type, String> status_scripts =
2112                         language_status_scripts.get(language);
2113                 if (status_scripts == null) {
2114                     System.out.println(
2115                             "Missing Suppress-Script: "
2116                                     + language
2117                                     + "\tSuppress-Script:\t"
2118                                     + suppressScript);
2119                 } else if (!status_scripts.values().contains(suppressScript)) {
2120                     System.out.println(
2121                             "Missing Suppress-Script: "
2122                                     + language
2123                                     + "\tSuppress-Script:\t"
2124                                     + suppressScript
2125                                     + "\tall:\t"
2126                                     + status_scripts.values());
2127                 } else {
2128                     // at this point, the suppressScript is in the union of the primary and
2129                     // secondary.
2130                     Set<String> primaryScripts =
2131                             status_scripts.getAll(BasicLanguageData.Type.primary);
2132                     if (primaryScripts != null && !primaryScripts.contains(suppressScript)) {
2133                         System.out.println(
2134                                 "Suppress-Script is not in primary: "
2135                                         + language
2136                                         + "\tSuppress-Script:\t"
2137                                         + suppressScript
2138                                         + "\tprimary:\t"
2139                                         + primaryScripts);
2140                     }
2141                 }
2142                 addLanguage2Script(language, BasicLanguageData.Type.primary, suppressScript);
2143             }
2144         }
2145 
2146         // remove primaries from secondaries
2147         // check for primaries for scripts
2148         for (String language : language_status_scripts.keySet()) {
2149             Relation<BasicLanguageData.Type, String> status_scripts =
2150                     language_status_scripts.get(language);
2151             Set<String> secondaryScripts = status_scripts.getAll(BasicLanguageData.Type.secondary);
2152             if (secondaryScripts == null) continue;
2153             Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
2154             if (primaryScripts == null) {
2155                 // status_scripts.putAll(BasicLanguageData.Type.primary, secondaryScripts);
2156                 // status_scripts.removeAll(BasicLanguageData.Type.secondary);
2157                 if (sc.isModernLanguage(language)) {
2158                     BadItem.ERROR.show(
2159                             "modern language without primary script, might need to edit moribund_languages.txt",
2160                             language + " " + getLanguageName(language));
2161                 }
2162             } else {
2163                 status_scripts.removeAll(BasicLanguageData.Type.secondary, primaryScripts);
2164             }
2165         }
2166 
2167         // check that every living language in the row data has a script
2168         Set<String> livingLanguagesWithTerritories = new TreeSet<>();
2169         for (RowData rowData : sortedInput) {
2170             String language = rowData.languageCode;
2171             if (sc.isModernLanguage(language)
2172                     && Iso639Data.getSource(language) != Iso639Data.Source.ISO_639_3) {
2173                 livingLanguagesWithTerritories.add(language);
2174             }
2175         }
2176         for (String language : livingLanguagesWithTerritories) {
2177             Relation<BasicLanguageData.Type, String> status_scripts =
2178                     language_status_scripts.get(language);
2179             if (status_scripts != null) {
2180                 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
2181                 if (primaryScripts != null && primaryScripts.size() > 0) {
2182                     continue;
2183                 }
2184             }
2185             if (language.equals("tw")) continue; // TODO load aliases and check...
2186             BadItem.WARNING.show(
2187                     "ISO 639-1/2 language in language-territory list without primary script",
2188                     language + "\t" + getLanguageName(language));
2189         }
2190 
2191         // System.out.println("Language 2 scripts: " + language_status_scripts);
2192     }
2193 
    /** Unimplemented stub: ignores {@code script} and always returns false. */
    private static boolean checkScript(String script) {
        // TODO Auto-generated method stub
        return false;
    }
2198 
    // Validity data used by checkCode() to look up the status of a code.
    static Validity VALIDITY = Validity.getInstance();
2200 
checkCode(LstrType type, String code, List<String> sourceLine)2201     private static boolean checkCode(LstrType type, String code, List<String> sourceLine) {
2202         Status validity = VALIDITY.getCodeToStatus(type).get(code);
2203         if (validity == Status.regular) {
2204             return true;
2205         } else if (validity == Status.unknown && type == LstrType.region) {
2206             return true;
2207         }
2208         BadItem.ERROR.show("Illegitimate Code", type + ": " + code + " = " + validity, sourceLine);
2209         return false;
2210     }
2211 
addLanguage2Script( String language, BasicLanguageData.Type type, String script)2212     private static void addLanguage2Script(
2213             String language, BasicLanguageData.Type type, String script) {
2214         Relation<BasicLanguageData.Type, String> status_scripts =
2215                 language_status_scripts.get(language);
2216         if (status_scripts == null)
2217             language_status_scripts.put(
2218                     language,
2219                     status_scripts =
2220                             Relation.of(
2221                                     new TreeMap<BasicLanguageData.Type, Set<String>>(),
2222                                     TreeSet.class));
2223         status_scripts.put(type, script);
2224     }
2225 
addLanguageScriptData()2226     static void addLanguageScriptData() throws IOException {
2227         // check to make sure that every language subtag is in 639-3
2228         Set<String> langRegistryCodes = sc.getGoodAvailableCodes("language");
2229         // Set<String> iso639_2_missing = new TreeSet(langRegistryCodes);
2230         // iso639_2_missing.removeAll(Iso639Data.getAvailable());
2231         // iso639_2_missing.remove("root");
2232         // if (iso639_2_missing.size() != 0) {
2233         // for (String missing : iso639_2_missing){
2234         // System.out.println("*ERROR in StandardCodes* Missing Lang/Script data:\t" + missing + ",
2235         // " +
2236         // sc.getData("language", missing));
2237         // }
2238         // }
2239 
2240         // Map<String, String> nameToTerritoryCode = new TreeMap();
2241         // for (String territoryCode : sc.getGoodAvailableCodes("territory")) {
2242         // nameToTerritoryCode.put(sc.getData("territory", territoryCode).toLowerCase(),
2243         // territoryCode);
2244         // }
2245         // nameToTerritoryCode.put("iran", nameToTerritoryCode.get("iran, islamic republic of")); //
2246 
2247         // BasicLanguageData languageData = new BasicLanguageData();
2248 
2249         BufferedReader in = CldrUtility.getUTF8Data("extraLanguagesAndScripts.txt");
2250         while (true) {
2251             String line = in.readLine();
2252             if (line == null) break;
2253             String[] parts = line.split("\\t");
2254             String alpha3 = parts[0];
2255             alpha3 = stripBrackets(alpha3);
2256             String languageSubtag = Iso639Data.fromAlpha3(alpha3);
2257             if (languageSubtag == null) {
2258                 if (langRegistryCodes.contains(alpha3)) {
2259                     languageSubtag = alpha3;
2260                 } else {
2261                     BadItem.WARNING.show("Language subtag not found on line", alpha3, line);
2262                     continue;
2263                 }
2264             }
2265             // String name = parts[1];
2266             Set<String> names = Iso639Data.getNames(languageSubtag);
2267             if (names == null) {
2268                 Map<String, String> name2 = sc.getLangData("language", languageSubtag);
2269                 if (name2 != null) {
2270                     String name3 = name2.get("Description");
2271                     if (name3 != null) {
2272                         names = new TreeSet<>();
2273                         names.add(name3);
2274                     }
2275                 }
2276             }
2277             // if (names == null || !names.contains(name)) {
2278             // System.out.println("Name <" + name + "> for <" + languageSubtag + "> not found in " +
2279             // names);
2280             // }
2281 
2282             // names all straight, now get scripts and territories
2283             // [Cyrl]; [Latn]
2284             Set<String> fullScriptList = sc.getGoodAvailableCodes("script");
2285 
2286             String[] scriptList = parts[2].split("[;,]\\s*");
2287             Set<String> scripts = new TreeSet<>();
2288             Set<String> scriptsAlt = new TreeSet<>();
2289             for (String script : scriptList) {
2290                 if (script.length() == 0) continue;
2291                 boolean alt = false;
2292                 if (script.endsWith("*")) {
2293                     alt = true;
2294                     script = script.substring(0, script.length() - 1);
2295                 }
2296                 script = stripBrackets(script);
2297                 if (!fullScriptList.contains(script)) {
2298                     System.out.println(
2299                             "Script <"
2300                                     + script
2301                                     + "> for <"
2302                                     + languageSubtag
2303                                     + "> not found in "
2304                                     + fullScriptList);
2305                 } else if (alt) {
2306                     scriptsAlt.add(script);
2307                 } else {
2308                     scripts.add(script);
2309                 }
2310             }
2311             // now territories
2312             Set<String> territories = new TreeSet<>();
2313             if (parts.length > 4) {
2314                 String[] territoryList = parts[4].split("\\s*[;,-]\\s*");
2315                 for (String territoryName : territoryList) {
2316                     if (territoryName.equals("ISO/DIS 639") || territoryName.equals("3")) continue;
2317                     String territoryCode =
2318                             CountryCodeConverter.getCodeFromName(territoryName, true);
2319                     if (territoryCode == null) {
2320                         BadItem.ERROR.show(
2321                                 "no name found for territory",
2322                                 "<" + territoryName + ">",
2323                                 languageSubtag);
2324                     } else {
2325                         territories.add(territoryCode);
2326                     }
2327                 }
2328             }
2329             // <language type="de" scripts="Latn" territories="IT" alt="secondary"/>
2330             // we're going to go ahead and set these all to secondary.
2331             if (scripts.size() != 0) {
2332                 language2BasicLanguageData.put(
2333                         languageSubtag,
2334                         new BasicLanguageData()
2335                                 .setType(BasicLanguageData.Type.secondary)
2336                                 .setScripts(scripts)
2337                                 .setTerritories(territories));
2338             }
2339             if (scriptsAlt.size() != 0) {
2340                 language2BasicLanguageData.put(
2341                         languageSubtag,
2342                         new BasicLanguageData()
2343                                 .setType(BasicLanguageData.Type.secondary)
2344                                 .setScripts(scriptsAlt)
2345                                 .setTerritories(territories));
2346             }
2347         }
2348         in.close();
2349 
2350         // add other data
2351         for (String languageSubtag : supplementalData.getBasicLanguageDataLanguages()) {
2352             Set<BasicLanguageData> otherData =
2353                     supplementalData.getBasicLanguageData(languageSubtag);
2354             language2BasicLanguageData.putAll(languageSubtag, otherData);
2355         }
2356     }
2357 
2358     // private static void showAllBasicLanguageData(Relation<String, BasicLanguageData>
2359     // language2basicData, String
2360     // comment) {
2361     // // now print
2362     // Relation<String, String> primaryCombos = new Relation(new TreeMap(), TreeSet.class);
2363     // Relation<String, String> secondaryCombos = new Relation(new TreeMap(), TreeSet.class);
2364     //
2365     // Log.println("\t<languageData>" + (comment == null ? "" : " <!-- " + comment + " -->"));
2366     //
2367     // for (String languageSubtag : language2basicData.keySet()) {
2368     // String duplicate = "";
2369     // // script,territory
2370     // primaryCombos.clear();
2371     // secondaryCombos.clear();
2372     //
2373     // for (BasicLanguageData item : language2basicData.getAll(languageSubtag)) {
2374     // Set<String> scripts = item.getScripts();
2375     // if (scripts.size() == 0) scripts = new TreeSet(Arrays.asList(new String[] { "Zzzz" }));
2376     // for (String script : scripts) {
2377     // Set<String> territories = item.getTerritories();
2378     // if (territories.size() == 0) territories = new TreeSet(Arrays.asList(new String[] { "ZZ" }));
2379     // for (String territory : territories) {
2380     // if (item.getType().equals(BasicLanguageData.Type.primary)) {
2381     // primaryCombos.put(script, territory);
2382     // } else {
2383     // secondaryCombos.put(script, territory);
2384     // }
2385     // }
2386     // }
2387     // }
2388     // secondaryCombos.removeAll(primaryCombos);
2389     // showBasicLanguageData(languageSubtag, primaryCombos, null, BasicLanguageData.Type.primary);
2390     // showBasicLanguageData(languageSubtag, secondaryCombos, primaryCombos.keySet(),
2391     // BasicLanguageData.Type.secondary);
2392     // // System.out.println(item.toString(languageSubtag) + duplicate);
2393     // // duplicate = " <!-- " + "**" + " -->";
2394     // }
2395     // Log.println("\t</languageData>");
2396     // }
2397 
showBasicLanguageData( PrintWriter out, String languageSubtag, Relation<String, String> primaryCombos, Set<String> suppressEmptyScripts, BasicLanguageData.Type type)2398     private static void showBasicLanguageData(
2399             PrintWriter out,
2400             String languageSubtag,
2401             Relation<String, String> primaryCombos,
2402             Set<String> suppressEmptyScripts,
2403             BasicLanguageData.Type type) {
2404         Set<String> scriptsWithSameTerritories = new TreeSet<>();
2405         Set<String> lastTerritories = Collections.emptySet();
2406         for (String script : primaryCombos.keySet()) {
2407             Set<String> territories = primaryCombos.getAll(script);
2408             if (lastTerritories == Collections.EMPTY_SET) {
2409                 // skip first
2410             } else if (lastTerritories.equals(territories)) {
2411                 scriptsWithSameTerritories.add(script);
2412             } else {
2413                 showBasicLanguageData2(
2414                         out,
2415                         languageSubtag,
2416                         scriptsWithSameTerritories,
2417                         suppressEmptyScripts,
2418                         lastTerritories,
2419                         type);
2420                 scriptsWithSameTerritories.clear();
2421             }
2422             lastTerritories = territories;
2423             scriptsWithSameTerritories.add(script);
2424         }
2425         showBasicLanguageData2(
2426                 out,
2427                 languageSubtag,
2428                 scriptsWithSameTerritories,
2429                 suppressEmptyScripts,
2430                 lastTerritories,
2431                 type);
2432     }
2433 
showBasicLanguageData2( PrintWriter out, String languageSubtag, Set<String> scripts, Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type)2434     private static void showBasicLanguageData2(
2435             PrintWriter out,
2436             String languageSubtag,
2437             Set<String> scripts,
2438             Set<String> suppressEmptyScripts,
2439             Set<String> territories,
2440             BasicLanguageData.Type type) {
2441         scripts.remove("Zzzz");
2442         territories.remove("ZZ");
2443         if (territories.size() == 0 && suppressEmptyScripts != null) {
2444             scripts.removeAll(suppressEmptyScripts);
2445         }
2446         if (scripts.size() == 0 && territories.size() == 0) return;
2447         out.println(
2448                 "\t\t<language type=\""
2449                         + languageSubtag
2450                         + "\""
2451                         + (scripts.size() == 0
2452                                 ? ""
2453                                 : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"")
2454                         + (territories.size() == 0
2455                                 ? ""
2456                                 : " territories=\"" + CldrUtility.join(territories, " ") + "\"")
2457                         + (type == BasicLanguageData.Type.primary ? "" : " alt=\"" + type + "\"")
2458                         + "/>");
2459     }
2460 
2461     /*
2462      * System.out.println(
2463      * "\t\t<language type=\"" + languageSubtag + "\"" +
2464      * " scripts=\"" + Utility.join(scripts," ") + "\"" +
2465      * (territories.size() == 0 ? "" : " territories=\"" + Utility.join(territories," ") + "\"") +
2466      * "/>"
2467      * );
2468      */
2469 
stripBrackets(String alpha3)2470     private static String stripBrackets(String alpha3) {
2471         if (alpha3.startsWith("[") && alpha3.endsWith("]")) {
2472             alpha3 = alpha3.substring(1, alpha3.length() - 1);
2473         }
2474         return alpha3;
2475     }
2476 
    // Shared English number formatters: nf keeps the default grouping, nf_no_comma has grouping
    // switched off in the static initializer below.
    static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH);
    static NumberFormat nf_no_comma = NumberFormat.getInstance(ULocale.ENGLISH);

    static {
        nf_no_comma.setGroupingUsed(false);
    }

    // Shared English percent formatter, used by formatPercent() for non-XML output.
    static NumberFormat pf = NumberFormat.getPercentInstance(ULocale.ENGLISH);
2485 
formatNumber(double original, int roundDigits, boolean xml)2486     public static String formatNumber(double original, int roundDigits, boolean xml) {
2487         double d = original;
2488         if (roundDigits != 0) {
2489             d = CldrUtility.roundToDecimals(original, roundDigits);
2490         }
2491         if (Double.isNaN(d)) {
2492             d = CldrUtility.roundToDecimals(original, roundDigits);
2493             throw new IllegalArgumentException("Double is NaN");
2494         }
2495         if (xml) {
2496             return nf_no_comma.format(d);
2497         }
2498         return nf.format(d);
2499     }
2500 
formatPercent(double d, int roundDigits, boolean xml)2501     public static String formatPercent(double d, int roundDigits, boolean xml) {
2502         if (roundDigits != 0) {
2503             d = CldrUtility.roundToDecimals(d, roundDigits);
2504         }
2505         if (xml) {
2506             nf_no_comma.setMaximumFractionDigits(roundDigits + 2);
2507             return nf_no_comma.format(d * 100.0);
2508         }
2509         pf.setMaximumFractionDigits(roundDigits + 2);
2510         return pf.format(d);
2511     }
2512 
    // Shared canonicalizer; fixLanguageCode() runs every raw language code through it.
    static final LanguageTagCanonicalizer languageTagCanonicalizer = new LanguageTagCanonicalizer();
2514 
fixLanguageCode(String languageCodeRaw, List<String> row)2515     private static String fixLanguageCode(String languageCodeRaw, List<String> row) {
2516         String languageCode = languageTagCanonicalizer.transform(languageCodeRaw);
2517         if (DEBUG && !languageCode.equals(languageCodeRaw)) {
2518             System.out.println("## " + languageCodeRaw + " => " + languageCode);
2519         }
2520         int bar = languageCode.indexOf('_');
2521         String script = "";
2522         if (bar >= 0) {
2523             script = languageCode.substring(bar);
2524             languageCode = languageCode.substring(0, bar);
2525         }
2526         R2<List<String>, String> replacement =
2527                 supplementalData.getLocaleAliasInfo().get("language").get(languageCode);
2528         if (replacement != null) {
2529             String replacementCode = replacement.get0().get(0);
2530             BadItem.ERROR.show(
2531                     "deprecated language code", languageCode + " => " + replacementCode, row);
2532             languageCode = replacementCode;
2533         }
2534         if (!sc.getAvailableCodes("language").contains(languageCode)) {
2535             BadItem.ERROR.show("bad language code", languageCode, row);
2536         }
2537         return languageCode + script;
2538     }
2539 
2540     enum BadItem {
2541         ERROR,
2542         WARNING,
2543         DETAIL;
2544 
show(String problem, String details, String... items)2545         void show(String problem, String details, String... items) {
2546             System.out.println(toString(problem, details, items));
2547         }
2548 
show(String problem, String details, List<String> row)2549         void show(String problem, String details, List<String> row) {
2550             System.out.println(toString(problem, details, row));
2551         }
2552 
toString(String problem, String details, String... items)2553         private String toString(String problem, String details, String... items) {
2554             return toString(problem, details, Arrays.asList(items));
2555         }
2556 
toString(String problem, String details, List<String> row)2557         private String toString(String problem, String details, List<String> row) {
2558             return "* "
2559                     + this
2560                     + " *\t"
2561                     + problem
2562                     + ":"
2563                     + "\t"
2564                     + details
2565                     + (row != null && row.size() > 0 ? "\t" + Joiner.on("\t").join(row) : "");
2566         }
2567     }
2568 
fixCountryCode(String countryCode, List<String> row)2569     private static String fixCountryCode(String countryCode, List<String> row) {
2570         R2<List<String>, String> replacement =
2571                 supplementalData.getLocaleAliasInfo().get("territory").get(countryCode);
2572         if (replacement != null) {
2573             String replacementCode = replacement.get0().get(0);
2574             BadItem.ERROR.show(
2575                     "deprecated territory code", countryCode + " => " + replacementCode, row);
2576             countryCode = replacementCode;
2577         }
2578         if (!sc.getAvailableCodes("territory").contains(countryCode)) {
2579             BadItem.ERROR.show("bad territory code", countryCode, row);
2580         }
2581         return countryCode;
2582     }
2583 
    /** Returns the name that the {@code english} file gives for {@code languageCode}. */
    private static String getULocaleLocaleName(String languageCode) {
        return english.getName(languageCode, true);
        // return new ULocale(languageCode).getDisplayName();
    }
2588 
    /** Returns the script name that the {@code english} file gives for {@code scriptCode}. */
    private static String getULocaleScriptName(String scriptCode) {
        return english.getName(CLDRFile.SCRIPT_NAME, scriptCode);
        // return ULocale.getDisplayScript("und_" + scriptCode, ULocale.ENGLISH);
    }
2593 
    /** Returns the territory name that the {@code english} file gives for {@code countryCode}. */
    private static String getULocaleCountryName(String countryCode) {
        return english.getName(CLDRFile.TERRITORY_NAME, countryCode);
        // return ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH);
    }
2598 }
2599