1 package org.unicode.cldr.tool; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.collect.ImmutableSet; 5 import com.google.common.math.DoubleMath; 6 import com.ibm.icu.impl.Relation; 7 import com.ibm.icu.impl.Row; 8 import com.ibm.icu.impl.Row.R2; 9 import com.ibm.icu.text.Collator; 10 import com.ibm.icu.text.NumberFormat; 11 import com.ibm.icu.text.RuleBasedCollator; 12 import com.ibm.icu.text.UTF16; 13 import com.ibm.icu.util.ULocale; 14 import java.io.BufferedReader; 15 import java.io.File; 16 import java.io.IOException; 17 import java.io.PrintWriter; 18 import java.nio.file.Files; 19 import java.nio.file.StandardCopyOption; 20 import java.text.ParseException; 21 import java.util.ArrayList; 22 import java.util.Arrays; 23 import java.util.Collection; 24 import java.util.Collections; 25 import java.util.Comparator; 26 import java.util.EnumMap; 27 import java.util.HashMap; 28 import java.util.HashSet; 29 import java.util.Iterator; 30 import java.util.LinkedHashSet; 31 import java.util.List; 32 import java.util.Map; 33 import java.util.Set; 34 import java.util.TreeMap; 35 import java.util.TreeSet; 36 import java.util.regex.Matcher; 37 import org.unicode.cldr.draft.FileUtilities; 38 import org.unicode.cldr.draft.ScriptMetadata; 39 import org.unicode.cldr.draft.ScriptMetadata.IdUsage; 40 import org.unicode.cldr.draft.ScriptMetadata.Info; 41 import org.unicode.cldr.util.Builder; 42 import org.unicode.cldr.util.CLDRFile; 43 import org.unicode.cldr.util.CLDRPaths; 44 import org.unicode.cldr.util.CldrUtility; 45 import org.unicode.cldr.util.Factory; 46 import org.unicode.cldr.util.Iso639Data; 47 import org.unicode.cldr.util.Iso639Data.Scope; 48 import org.unicode.cldr.util.Iso639Data.Source; 49 import org.unicode.cldr.util.Iso639Data.Type; 50 import org.unicode.cldr.util.LanguageTagCanonicalizer; 51 import org.unicode.cldr.util.LanguageTagParser; 52 import org.unicode.cldr.util.LocaleIDParser; 53 import org.unicode.cldr.util.LocaleIDParser.Level; 54 import org.unicode.cldr.util.Pair; 55 import org.unicode.cldr.util.PatternCache; 56 import org.unicode.cldr.util.SpreadSheet; 57 import org.unicode.cldr.util.StandardCodes; 58 import org.unicode.cldr.util.StandardCodes.LstrType; 59 import org.unicode.cldr.util.SupplementalDataInfo; 60 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; 61 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 62 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 63 import org.unicode.cldr.util.TransliteratorUtilities; 64 import org.unicode.cldr.util.Validity; 65 import org.unicode.cldr.util.Validity.Status; 66 import org.unicode.cldr.util.XPathParts; 67 import org.unicode.cldr.util.XPathParts.Comments; 68 69 /** 70 * @author markdavis 71 */ 72 public class ConvertLanguageData { 73 74 private static final boolean DEBUG = false; 75 // change this if you need to override what is generated for the default contents. 76 private static final List<String> defaultOverrides = Arrays.asList("es_ES".split("\\s+")); 77 78 public static final boolean SHOW_DIFF = false; 79 80 private static final boolean ALLOW_SMALL_NUMBERS = true; 81 82 static final Comparator<String> GENERAL_COLLATOR = new GeneralCollator(); 83 static final Comparator<String> INVERSE_GENERAL = new InverseComparator<>(GENERAL_COLLATOR); 84 85 private static StandardCodes sc = StandardCodes.make(); 86 87 static final double populationFactor = 1; 88 static final double gdpFactor = 1; 89 static final int BAD_COUNTRY_NAME = 0, 90 COUNTRY_CODE = 1, 91 COUNTRY_POPULATION = 2, 92 COUNTRY_LITERACY = 3, 93 COUNTRY_GDP = 4, 94 OFFICIAL_STATUS = 5, 95 BAD_LANGUAGE_NAME = 6, 96 LANGUAGE_CODE = 7, 97 LANGUAGE_POPULATION = 8, 98 LANGUAGE_LITERACY = 9, 99 COMMENT = 10, 100 NOTES = 11; 101 static final Map<String, CodeAndPopulation> languageToMaxCountry = new TreeMap<>(); 102 static final Map<String, CodeAndPopulation> languageToMaxScript = new TreeMap<>(); 103 104 private static final double NON_OFFICIAL_WEIGHT = 0.40; 105 106 private static final boolean SHOW_OLD_DEFAULT_CONTENTS = false; 107 108 private static final ImmutableSet<String> scriptAssumedLocales = 109 ImmutableSet.of( 110 "bm_ML", "ha_GH", "ha_NE", "ha_NG", "kk_KZ", "ks_IN", "ky_KG", "mn_MN", "ms_BN", 111 "ms_MY", "ms_SG", "tk_TM", "tzm_MA", "ug_CN"); 112 113 static Set<String> skipLocales = 114 new HashSet<>( 115 Arrays.asList( 116 "sh sh_BA sh_CS sh_YU characters supplementalData supplementalData-old supplementalData-old2 supplementalData-old3 supplementalMetadata root" 117 .split("\\s"))); 118 119 static Map<String, String> defaultContent = new TreeMap<>(); 120 121 static Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 122 static CLDRFile english = cldrFactory.make("en", true); 123 124 static SupplementalDataInfo supplementalData = 125 SupplementalDataInfo.getInstance(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY); 126 main(String[] args)127 public static void main(String[] args) throws IOException, ParseException { 128 final File oldSupp = 129 new File(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY, "supplementalData.xml"); 130 final File genSupp = 131 new File(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalData.xml"); 132 final File genLsraw = 133 new File(CLDRPaths.GEN_DIRECTORY + "/supplemental", "language_script.tsv"); 134 try (final BufferedReader oldFile = FileUtilities.openUTF8Reader(oldSupp); 135 final PrintWriter newFile = FileUtilities.openUTF8Writer(genSupp); 136 final PrintWriter newLsraw = FileUtilities.openUTF8Writer(genLsraw); ) { 137 // load elements we care about 138 CldrUtility.copyUpTo( 139 oldFile, PatternCache.get("\\s*<languageData>\\s*"), newFile, false); 140 141 Set<String> available = cldrFactory.getAvailable(); 142 143 Set<String> cldrParents = getCldrParents(available); 144 145 List<String> failures = new ArrayList<>(); 146 Map<String, RowData> localeToRowData = new TreeMap<>(); 147 148 Set<RowData> sortedInput = getExcelData(failures, localeToRowData); 149 150 // get the locales (including parents) 151 Set<String> localesWithData = new TreeSet<>(localeToRowData.keySet()); 152 for (String locale : localeToRowData.keySet()) { 153 while (true) { 154 String parent = LocaleIDParser.getParent(locale); 155 if (parent == null) break; 156 localesWithData.add(parent); 157 locale = parent; 158 } 159 } 160 161 final LanguageTagParser languageTagParser = new LanguageTagParser(); 162 163 for (String localeRaw : available) { 164 String locale = languageTagCanonicalizer.transform(localeRaw); 165 if (!localesWithData.contains(locale)) { 166 CLDRFile locFile = cldrFactory.make(localeRaw, false); 167 if (locFile.isAliasedAtTopLevel()) { 168 continue; 169 } 170 if (scriptAssumedLocales.contains(locale)) { 171 continue; 172 } 173 languageTagParser.set(locale); 174 if (languageTagParser.getVariants().size() != 0) { 175 continue; 176 } 177 String withoutScript = languageTagParser.setScript("").toString(); 178 if (!localesWithData.contains(withoutScript)) { 179 String region = new LanguageTagParser().set(locale).getRegion(); 180 if (StandardCodes.isCountry(region)) { 181 BadItem.ERROR.show( 182 "missing language/population data for CLDR locale", 183 locale + " = " + getLanguageCodeAndName(locale)); 184 } 185 } else { 186 // These exceptions are OK, because these locales by default use the 187 // non-default script 188 Set<String> OKExceptions = 189 ImmutableSet.of("sr_Cyrl_ME", "zh_Hans_HK", "zh_Hans_MO"); 190 if (OKExceptions.contains(locale)) { 191 continue; 192 } 193 BadItem.ERROR.show( 194 "missing language/population data for CLDR locale", 195 locale 196 + " = " 197 + getLanguageCodeAndName(locale) 198 + " but have data for " 199 + getLanguageCodeAndName(withoutScript)); 200 } 201 } 202 } 203 204 // TODO sort by country code, then functionalPopulation, then language code 205 // and keep the top country for each language code (even if < 1%) 206 207 addLanguageScriptData(); 208 209 // showAllBasicLanguageData(allLanguageData, "old"); 210 getLanguage2Scripts(sortedInput); 211 212 writeNewBasicData2(newFile, sortedInput); 213 // writeNewBasicData(sortedInput); 214 215 writeTerritoryLanguageData(newFile, failures, sortedInput); 216 217 checkBasicData(localeToRowData); 218 219 Set<String> defaultLocaleContent = new TreeSet<>(); 220 221 showDefaults(cldrParents, nf, defaultContent, localeToRowData, defaultLocaleContent); 222 223 // showContent(available); 224 225 // certain items are overridden 226 227 List<String> toRemove = new ArrayList<>(); 228 for (String override : defaultOverrides) { 229 String replacement = getReplacement(override, defaultLocaleContent); 230 if (replacement != null) { 231 toRemove.add(replacement); 232 } 233 } 234 defaultLocaleContent.removeAll(toRemove); 235 defaultLocaleContent.addAll(defaultOverrides); 236 237 showFailures(failures); 238 239 CldrUtility.copyUpTo( 240 oldFile, PatternCache.get("\\s*</territoryInfo>\\s*"), null, false); 241 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<references>\\s*"), newFile, false); 242 // generateIso639_2Data(newFile); 243 references.printReferences(newFile); 244 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</references>\\s*"), null, false); 245 CldrUtility.copyUpTo(oldFile, null, newFile, false); 246 247 getLanguageScriptSpreadsheet(newLsraw); 248 249 // Only write if there's no exception. 250 } catch (Exception e) { 251 e.printStackTrace(); 252 return; 253 } 254 255 System.out.println("Wrote: " + genLsraw); 256 System.out.println("Wrote: " + genSupp); 257 System.out.println("Moving " + genSupp + " to " + oldSupp); 258 Files.move(genSupp.toPath(), oldSupp.toPath(), StandardCopyOption.REPLACE_EXISTING); 259 System.out.println("DONE"); 260 } 261 getLanguageCodeAndName(String code)262 public static String getLanguageCodeAndName(String code) { 263 if (code == null) return null; 264 return english.getName(code) + " [" + code + "]"; 265 } 266 getReplacement(String oldDefault, Set<String> defaultLocaleContent)267 private static String getReplacement(String oldDefault, Set<String> defaultLocaleContent) { 268 String parent = LocaleIDParser.getParent(oldDefault); 269 for (String replacement : defaultLocaleContent) { 270 if (replacement.startsWith(parent)) { 271 if (parent.equals(LocaleIDParser.getParent(replacement))) { 272 return replacement; 273 } 274 } 275 } 276 return null; 277 } 278 getLanguageScriptSpreadsheet(PrintWriter out)279 private static void getLanguageScriptSpreadsheet(PrintWriter out) { 280 out.println("#Lcode\tLanguageName\tStatus\tScode\tScriptName\tReferences"); 281 Pair<String, String> languageScript = new Pair<>("", ""); 282 for (String language : language_status_scripts.keySet()) { 283 Relation<BasicLanguageData.Type, String> status_scripts = 284 language_status_scripts.get(language); 285 for (BasicLanguageData.Type status : status_scripts.keySet()) { 286 for (String script : status_scripts.getAll(status)) { 287 String reference = 288 language_script_references.get( 289 languageScript.setFirst(language).setSecond(script)); 290 out.println( 291 language 292 + "\t" 293 + getLanguageName(language) 294 + "\t" 295 + status 296 + "\t" 297 + script 298 + "\t" 299 + getDisplayScript(script) 300 + (reference == null ? "" : "\t" + reference)); 301 } 302 } 303 } 304 } 305 306 /** 307 * Write data in format: <languageData> <language type="aa" scripts="Latn" territories="DJ ER 308 * ET"/> 309 * 310 * @param sortedInput 311 */ writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput)312 private static void writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput) { 313 double cutoff = 0.2; // 20% 314 315 // Relation<String, BasicLanguageData> newLanguageData = new Relation(new TreeMap(), 316 // TreeSet.class); 317 LanguageTagParser ltp = new LanguageTagParser(); 318 Map<String, Relation<BasicLanguageData.Type, String>> language_status_territories = 319 new TreeMap<>(); 320 // Map<String, Pair<String, String>> languageToBestCountry; 321 for (RowData rowData : sortedInput) { 322 if (rowData.countryCode.equals("ZZ")) continue; 323 ltp.set(rowData.languageCode); 324 String languageCode = ltp.getLanguage(); 325 Relation<BasicLanguageData.Type, String> status_territories = 326 language_status_territories.get(languageCode); 327 if (status_territories == null) { 328 language_status_territories.put( 329 languageCode, 330 status_territories = 331 Relation.of( 332 new TreeMap<BasicLanguageData.Type, Set<String>>(), 333 TreeSet.class)); 334 } 335 if (rowData.officialStatus.isMajor()) { 336 status_territories.put(BasicLanguageData.Type.primary, rowData.countryCode); 337 } else if (rowData.officialStatus.isOfficial() 338 || rowData.getLanguagePopulation() >= cutoff * rowData.countryPopulation 339 || rowData.getLanguagePopulation() >= 1000000) { 340 status_territories.put(BasicLanguageData.Type.secondary, rowData.countryCode); 341 } 342 } 343 344 Set<String> allLanguages = new TreeSet<>(language_status_territories.keySet()); 345 allLanguages.addAll(language_status_scripts.keySet()); 346 // now add all the remaining language-script info 347 // <language type="sv" scripts="Latn" territories="AX FI SE"/> 348 Set<String> warnings = new LinkedHashSet<>(); 349 out.println("\t<languageData>"); 350 for (String languageSubtag : allLanguages) { 351 Relation<BasicLanguageData.Type, String> status_scripts = 352 language_status_scripts.get(languageSubtag); 353 Relation<BasicLanguageData.Type, String> status_territories = 354 language_status_territories.get(languageSubtag); 355 356 // check against old: 357 Map<BasicLanguageData.Type, BasicLanguageData> oldData = 358 supplementalData.getBasicLanguageDataMap(languageSubtag); 359 if (oldData == null) { 360 oldData = Collections.emptyMap(); 361 } 362 363 EnumMap<BasicLanguageData.Type, BasicLanguageData> newData = 364 new EnumMap<>(BasicLanguageData.Type.class); 365 for (BasicLanguageData.Type status : BasicLanguageData.Type.values()) { 366 Set<String> scripts = status_scripts == null ? null : status_scripts.getAll(status); 367 Set<String> territories = 368 status_territories == null ? null : status_territories.getAll(status); 369 if (scripts == null && territories == null) continue; 370 BasicLanguageData bld = new BasicLanguageData(); 371 bld.setTerritories(territories); 372 bld.setScripts(scripts); 373 bld.setType(status); 374 bld.freeze(); 375 newData.put(status, bld); 376 } 377 378 // compare 379 if (!CldrUtility.equals(oldData.entrySet(), newData.entrySet())) { 380 for (String problem : compare(oldData, newData)) { 381 warnings.add( 382 BadItem.DETAIL.toString( 383 "changing <languageData>", 384 languageSubtag + "\t" + english.getName(languageSubtag), 385 problem)); 386 } 387 } 388 389 for (BasicLanguageData bld : newData.values()) { 390 Set<String> scripts = bld.getScripts(); 391 Set<String> territories = bld.getTerritories(); 392 BasicLanguageData.Type status = bld.getType(); 393 out.println( 394 "\t\t<language type=\"" 395 + languageSubtag 396 + "\"" 397 + (scripts.isEmpty() 398 ? "" 399 : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") 400 + (territories.isEmpty() 401 ? "" 402 : " territories=\"" 403 + CldrUtility.join(territories, " ") 404 + "\"") 405 + (status == BasicLanguageData.Type.primary 406 ? "" 407 : " alt=\"secondary\"") 408 + "/>"); 409 } 410 } 411 out.println("\t</languageData>"); 412 for (String s : warnings) { 413 if (s.contains("!")) { 414 System.out.println(s); 415 } 416 } 417 for (String s : warnings) { 418 if (!s.contains("!")) { 419 System.out.println(s); 420 } 421 } 422 } 423 compare( Map<BasicLanguageData.Type, BasicLanguageData> oldData, Map<BasicLanguageData.Type, BasicLanguageData> newData)424 private static List<String> compare( 425 Map<BasicLanguageData.Type, BasicLanguageData> oldData, 426 Map<BasicLanguageData.Type, BasicLanguageData> newData) { 427 Map<String, BasicLanguageData.Type> oldDataToType = getDataToType(oldData.values(), true); 428 Map<String, BasicLanguageData.Type> newDataToType = getDataToType(newData.values(), true); 429 List<String> result = new ArrayList<>(); 430 StringBuilder temp = new StringBuilder(); 431 for (String s : 432 Builder.with(new LinkedHashSet<String>()) 433 .addAll(oldDataToType.keySet()) 434 .addAll(newDataToType.keySet()) 435 .get()) { 436 BasicLanguageData.Type oldValue = oldDataToType.get(s); 437 BasicLanguageData.Type newValue = newDataToType.get(s); 438 if (!CldrUtility.equals(oldValue, newValue)) { 439 temp.setLength(0); 440 temp.append("[") 441 .append(s) 442 .append(":") 443 .append(english.getName(s.length() == 4 ? "script" : "region", s)) 444 .append("] "); 445 if (oldValue == null) { 446 temp.append(" added as ").append(newValue); 447 } else if (newValue == null) { 448 temp.append(" REMOVED!"); 449 } else if (oldValue == BasicLanguageData.Type.primary) { 450 temp.append(" DOWNGRADED TO! ").append(newValue); 451 } else { 452 temp.append(" upgraded to ").append(newValue); 453 } 454 result.add(temp.toString()); 455 } 456 } 457 result.add(newData.toString()); 458 return result; 459 } 460 getDataToType( Collection<BasicLanguageData> collection, boolean script)461 private static Map<String, BasicLanguageData.Type> getDataToType( 462 Collection<BasicLanguageData> collection, boolean script) { 463 Map<String, BasicLanguageData.Type> result = new TreeMap<>(); 464 for (BasicLanguageData i : collection) { 465 for (String s : i.getScripts()) { 466 result.put(s, i.getType()); 467 } 468 for (String s : i.getTerritories()) { 469 result.put(s, i.getType()); 470 } 471 } 472 return result; 473 } 474 checkBasicData(Map<String, RowData> localeToRowData)475 private static void checkBasicData(Map<String, RowData> localeToRowData) { 476 // find languages with multiple scripts 477 Relation<String, String> languageToScripts = 478 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 479 for (String languageSubtag : language2BasicLanguageData.keySet()) { 480 for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 481 languageToScripts.putAll( 482 StandardCodes.fixLanguageTag(languageSubtag), item.getScripts()); 483 } 484 } 485 // get primary combinations 486 Set<String> primaryCombos = new TreeSet<>(); 487 Set<String> basicCombos = new TreeSet<>(); 488 for (String languageSubtag : language2BasicLanguageData.keySet()) { 489 for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 490 Set<String> scripts = new TreeSet<>(); 491 scripts.addAll(item.getScripts()); 492 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), scripts); 493 if (scripts.size() == 0) { 494 scripts.add("Zzzz"); 495 } 496 Set<String> territories = new TreeSet<>(); 497 territories.addAll(item.getTerritories()); 498 if (territories.size() == 0) { 499 territories.add("ZZ"); 500 continue; 501 } 502 503 for (String script : scripts) { 504 for (String territory : territories) { 505 String locale = 506 StandardCodes.fixLanguageTag(languageSubtag) 507 // + (script.equals("Zzzz") ? "" : 508 // languageToScripts.getAll(languageSubtag).size() <= 1 ? "" 509 // : "_" + script) 510 + (territories.equals("ZZ") ? "" : "_" + territory); 511 if (item.getType() != BasicLanguageData.Type.secondary) { 512 primaryCombos.add(locale); 513 } 514 basicCombos.add(locale); 515 } 516 } 517 } 518 } 519 Set<String> populationOver20 = new TreeSet<>(); 520 Set<String> population = new TreeSet<>(); 521 LanguageTagParser ltp = new LanguageTagParser(); 522 for (String rawLocale : localeToRowData.keySet()) { 523 ltp.set(rawLocale); 524 String locale = 525 ltp.getLanguage() 526 + (ltp.getRegion().length() == 0 ? "" : "_" + ltp.getRegion()); 527 population.add(locale); 528 RowData rowData = localeToRowData.get(rawLocale); 529 if (rowData.getLanguagePopulation() / rowData.countryPopulation >= 0.2 530 // || rowData.getLanguagePopulation() > 900000 531 ) { 532 populationOver20.add(locale); 533 } else { 534 PopulationData popData = 535 supplementalData.getLanguageAndTerritoryPopulationData( 536 ltp.getLanguageScript(), ltp.getRegion()); 537 if (popData != null && popData.getOfficialStatus().isOfficial()) { 538 populationOver20.add(locale); 539 } 540 } 541 } 542 Set<String> inBasicButNotPopulation = new TreeSet<>(primaryCombos); 543 544 inBasicButNotPopulation.removeAll(population); 545 for (String locale : inBasicButNotPopulation) { 546 ltp.set(locale); 547 String region = ltp.getRegion(); 548 String language = ltp.getLanguage(); 549 if (!sc.isModernLanguage(language)) continue; 550 PopulationData popData = supplementalData.getPopulationDataForTerritory(region); 551 // Afghanistan AF "29,928,987" 28.10% "21,500,000,000" Hazaragi haz "1,770,000" 28.10% 552 BadItem.WARNING.show( 553 "In Basic Data but not Population > 20%", 554 getDisplayCountry(region) 555 + "\t" 556 + region 557 + "\t\"" 558 + formatNumber(popData.getPopulation(), 0, false) 559 + "\"" 560 + "\t\"" 561 + formatPercent( 562 popData.getLiteratePopulation() / popData.getPopulation(), 563 0, 564 false) 565 + "\"" 566 + "\t\"" 567 + formatPercent(popData.getGdp(), 0, false) 568 + "\"" 569 + "\t" 570 + "" 571 + "\t" 572 + getLanguageName(language) 573 + "\t" 574 + language 575 + "\t" 576 + -1 577 + "\t\"" 578 + formatPercent( 579 popData.getLiteratePopulation() / popData.getPopulation(), 580 0, 581 false) 582 + "\""); 583 } 584 585 Set<String> inPopulationButNotBasic = new TreeSet<>(populationOver20); 586 inPopulationButNotBasic.removeAll(basicCombos); 587 for (Iterator<String> it = inPopulationButNotBasic.iterator(); it.hasNext(); ) { 588 String locale = it.next(); 589 if (locale.endsWith("_ZZ")) { 590 it.remove(); 591 } 592 } 593 for (String locale : inPopulationButNotBasic) { 594 BadItem.WARNING.show( 595 "In Population>20% but not Basic Data", 596 locale + " " + getLanguageName(locale), localeToRowData.get(locale).toString()); 597 } 598 } 599 600 static class LanguageInfo { 601 static LanguageInfo INSTANCE = new LanguageInfo(); 602 603 Map<String, Set<String>> languageToScripts = new TreeMap<>(); 604 Map<String, Set<String>> languageToRegions = new TreeMap<>(); 605 Map<String, Comments> languageToComments = new TreeMap<>(); 606 607 Map<String, Set<String>> languageToScriptsAlt = new TreeMap<>(); 608 Map<String, Set<String>> languageToRegionsAlt = new TreeMap<>(); 609 Map<String, Comments> languageToCommentsAlt = new TreeMap<>(); 610 LanguageInfo()611 private LanguageInfo() { 612 cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 613 // Set<String> available = cldrFactory.getAvailable(); 614 CLDRFile supplemental = cldrFactory.make("supplementalData", true); 615 for (Iterator<String> it = 616 supplemental.iterator("//supplementalData/languageData/language"); 617 it.hasNext(); ) { 618 String xpath = it.next(); 619 XPathParts parts = XPathParts.getFrozenInstance(xpath); 620 Map<String, String> x = parts.getAttributes(-1); 621 boolean alt = x.containsKey("alt"); 622 String lang = x.get("type"); 623 List<String> scripts = getAttributeList(x, "scripts"); 624 if (scripts != null) { 625 if (alt) { 626 putAll(languageToScriptsAlt, lang, new LinkedHashSet<>(scripts)); 627 } else { 628 putAll(languageToScripts, lang, new LinkedHashSet<>(scripts)); 629 } 630 } 631 List<String> regions = getAttributeList(x, "territories"); 632 if (regions != null) { 633 if (alt) { 634 putAll(languageToRegionsAlt, lang, new LinkedHashSet<>(regions)); 635 } else { 636 putAll(languageToRegions, lang, new LinkedHashSet<>(regions)); 637 } 638 } 639 } 640 } 641 getAttributeList(Map<String, String> x, String attribute)642 private List<String> getAttributeList(Map<String, String> x, String attribute) { 643 List<String> scripts = null; 644 String scriptString = x.get(attribute); 645 if (scriptString != null) { 646 scripts = Arrays.asList(scriptString.split("\\s+")); 647 } 648 return scripts; 649 } 650 } 651 putUnique(Map<K, V> map, K key, V value)652 private static <K, V> void putUnique(Map<K, V> map, K key, V value) { 653 V oldValue = map.get(key); 654 if (oldValue != null && !oldValue.equals(value)) { 655 throw new IllegalArgumentException( 656 "Duplicate value for <" + key + ">: <" + oldValue + ">, <" + value + ">"); 657 } 658 map.put(key, value); 659 } 660 putAll(Map<K, Set<W>> map, K key, Set<W> values)661 private static <K, W> void putAll(Map<K, Set<W>> map, K key, Set<W> values) { 662 Set<W> oldValue = map.get(key); 663 if (oldValue == null) { 664 map.put(key, values); 665 } else { 666 oldValue.addAll(values); 667 } 668 } 669 670 // public enum OfficialStatus {unknown, de_facto_official, official, official_regional, 671 // official_minority}; 672 673 static class RowData implements Comparable<Object> { 674 private final String countryCode; 675 private final double countryGdp; 676 private final double countryLiteracy; 677 private final double countryPopulation; 678 private final String languageCode; 679 private final OfficialStatus officialStatus; 680 private final double languagePopulation; 681 private final double languageLiteracy; 682 private final String comment; 683 private final String notes; 684 private final String badLanguageName; 685 private final boolean relativeLanguagePopulation; 686 // String badLanguageCode = ""; 687 private static final Set<String> doneCountries = new HashSet<>(); 688 689 private static final Set<String> countryCodes = sc.getGoodAvailableCodes("territory"); 690 RowData(String country, String language)691 public RowData(String country, String language) { 692 this.countryCode = country; 693 this.languageCode = language; 694 badLanguageName = country = language = notes = comment = ""; 695 officialStatus = OfficialStatus.unknown; 696 countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000); 697 countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d; 698 countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue(); 699 languagePopulation = languageLiteracy = Double.NaN; 700 relativeLanguagePopulation = false; 701 } 702 RowData(List<String> row)703 RowData(List<String> row) throws ParseException { 704 countryCode = fixCountryCode(row.get(COUNTRY_CODE), row); 705 706 if (!countryCodes.contains(countryCode)) { 707 System.err.println("WRONG COUNTRY CODE: " + row); 708 } 709 710 double countryPopulation1 = parseDecimal(row.get(COUNTRY_POPULATION)); 711 double countryLiteracy1 = parsePercent(row.get(COUNTRY_LITERACY), countryPopulation1); 712 713 countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000); 714 countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d; 715 countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue(); 716 717 String officialStatusString = row.get(OFFICIAL_STATUS).trim().replace(' ', '_'); 718 if (officialStatusString.equals("national")) { 719 officialStatusString = "official"; 720 } else if (officialStatusString.equals("regional_official")) { 721 officialStatusString = "official_regional"; 722 } else if (officialStatusString.length() == 0 723 || officialStatusString.equals("uninhabited")) { 724 officialStatusString = "unknown"; 725 } 726 try { 727 officialStatus = OfficialStatus.valueOf(officialStatusString); 728 } catch (RuntimeException e) { 729 throw new IllegalArgumentException( 730 "Can't interpret offical-status: " + officialStatusString); 731 } 732 733 String languageCode1 = row.get(LANGUAGE_CODE); 734 if (languageCode1.startsWith("*") || languageCode1.startsWith("\u00A7")) { 735 languageCode1 = languageCode1.substring(1); 736 } 737 languageCode = fixLanguageCode(languageCode1, row); 738 739 if (doneCountries.contains(countryCode) == false) { 740 // showDiff(countryGdp1, countryGdp); 741 // showDiff(countryLiteracy1, countryLiteracy); 742 if (SHOW_DIFF) showDiff(countryPopulation1, countryPopulation, 0.1, false); 743 doneCountries.add(countryCode); 744 } 745 746 double languagePopulation1 = 747 parsePercent(row.get(LANGUAGE_POPULATION), countryPopulation1) 748 * countryPopulation1; 749 if ((officialStatus.isMajor()) 750 && languagePopulation1 * 100 < countryPopulation 751 && languagePopulation1 < 1000000) { 752 BadItem.WARNING.show( 753 "official language has population < 1% of country & < 1,000,000", 754 languageCode + ", " + Math.round(languagePopulation1), row); 755 } 756 if (languagePopulation1 < 0.999) { 757 BadItem.WARNING.show( 758 "suspect language population, < 1", 759 languageCode + ", " + Math.round(languagePopulation1), 760 row); 761 } 762 if (languagePopulation1 > 10000) { 763 relativeLanguagePopulation = true; 764 languagePopulation1 = 765 languagePopulation1 * countryPopulation / countryPopulation1; // correct the 766 // values 767 } else { 768 relativeLanguagePopulation = false; 769 } 770 if (isApproximatelyGreater(languagePopulation1, countryPopulation, 0.0001)) { 771 BadItem.ERROR.show( 772 "language population > country population", 773 Math.round(languagePopulation1) + " > " + countryPopulation, 774 row); 775 } 776 languagePopulation = 777 languagePopulation1 < countryPopulation 778 ? languagePopulation1 779 : countryPopulation; 780 781 if (SHOW_DIFF) 782 showDiff( 783 languagePopulation1 / countryPopulation1, 784 languagePopulation / countryPopulation, 785 0.01, 786 true); 787 788 String stringLanguageLiteracy = 789 row.size() <= LANGUAGE_LITERACY ? "" : row.get(LANGUAGE_LITERACY); 790 double languageLiteracy1 = 791 stringLanguageLiteracy.length() == 0 792 ? countryLiteracy 793 : parsePercent(stringLanguageLiteracy, languagePopulation); 794 if (isApproximatelyEqual(languageLiteracy1, countryLiteracy1, 0.001)) { 795 languageLiteracy1 = countryLiteracy; // correct the values 796 } 797 languageLiteracy = languageLiteracy1; 798 799 if (row.size() > COMMENT) { 800 comment = row.get(COMMENT); 801 } else { 802 comment = ""; 803 } 804 if (row.size() > NOTES) { 805 notes = row.get(NOTES); 806 } else { 807 notes = ""; 808 } 809 badLanguageName = row.get(BAD_LANGUAGE_NAME); 810 } 811 showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang)812 private void showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang) { 813 final double diff = new_a / a - 1; 814 if (Math.abs(diff) > maxRelativeDiff) { 815 System.out.println( 816 formatPercent(diff, 0, false) 817 + "\t" 818 + countryCode 819 + "\t" 820 + getDisplayCountry(countryCode) 821 + (showLang 822 ? "\t" 823 + languageCode 824 + "\t" 825 + ConvertLanguageData.getLanguageName(languageCode) 826 : "") 827 + "\t" 828 + formatNumber(a, 0, false) 829 + "\t=>\t" 830 + formatNumber(new_a, 0, false)); 831 } 832 } 833 roundToPartsPer(double a, double whole)834 private double roundToPartsPer(double a, double whole) { 835 // break this out just to make it easier to follow. 836 double log10 = Math.log10(a / whole); 837 long digitsFound = (long) (log10); 838 long factor = (long) (Math.pow(10, digitsFound)); 839 double rounded = Math.round(a / factor); 840 double result = rounded * factor; 841 // if (Math.abs(result - a) >= 1) { 842 // System.out.println("Rounding " + a + " => " + result); 843 // } 844 return result; 845 } 846 isApproximatelyEqual(double a, double b, double epsilon)847 private static boolean isApproximatelyEqual(double a, double b, double epsilon) { 848 return a == b || Math.abs(a - b) < epsilon; 849 } 850 isApproximatelyGreater(double a, double b, double epsilon)851 private static boolean isApproximatelyGreater(double a, double b, double epsilon) { 852 return a > b + epsilon; 853 } 854 parseDecimal(String numericRepresentation)855 double parseDecimal(String numericRepresentation) throws ParseException { 856 try { 857 // if (numericRepresentation == null || numericRepresentation.length() == 0) return 858 // Double.NaN; 859 Number result = nf.parse(numericRepresentation); 860 // if (result == null) return Double.NaN; 861 return result.doubleValue(); 862 } catch (ParseException e) { 863 throw e; 864 // (RuntimeException) new IllegalArgumentException("can't parse <" + 865 // numericRepresentation + 866 // ">").initCause(e); 867 } 868 } 869 parsePercent(String numericRepresentation, double baseValue)870 double parsePercent(String numericRepresentation, double baseValue) throws ParseException { 871 try { 872 double result; 873 if (numericRepresentation.contains("%")) { 874 Number result0 = pf.parse(numericRepresentation); 875 result = result0.doubleValue(); 876 } else { 877 Number result0 = nf.parse(numericRepresentation); 878 result = result0.doubleValue() / baseValue; 879 } 880 // if (numericRepresentation == null || numericRepresentation.length() == 0) return 881 // Double.NaN; 882 // if (result == null) return Double.NaN; 883 return result; 884 } catch (ParseException e) { 885 throw e; 886 // (RuntimeException) new IllegalArgumentException("can't parse <" + 887 // numericRepresentation + 888 // ">").initCause(e); 889 } 890 } 891 getLanguageLiteratePopulation()892 public double getLanguageLiteratePopulation() { 893 return languageLiteracy * languagePopulation; 894 } 895 896 /** 897 * Get the weighted population 898 * 899 * @param weightIfNotOfficial 900 * @return 901 */ getLanguageLiteratePopulation(double weightIfNotOfficial)902 public double getLanguageLiteratePopulation(double weightIfNotOfficial) { 903 double result = languageLiteracy * languagePopulation; 904 if (!officialStatus.isMajor()) { 905 result *= weightIfNotOfficial; 906 } 907 return result; 908 } 909 910 @Override compareTo(Object o)911 public int compareTo(Object o) { 912 RowData that = (RowData) o; 913 int result; 914 if (0 != (result = GENERAL_COLLATOR.compare(countryCode, that.countryCode))) 915 return result; 916 if (languagePopulation > that.languagePopulation) return -1; // descending 917 if (languagePopulation < that.languagePopulation) return 1; 918 if (0 != (result = GENERAL_COLLATOR.compare(languageCode, that.languageCode))) 919 return result; 920 return 0; 921 } 922 toStringHeader()923 public static String toStringHeader() { 924 return "countryCode" 925 + "\t" 926 + "countryPopulation" 927 + "\t" 928 + "countryGdp" 929 + "\t" 930 + "countryLiteracy" 931 + "\t" 932 + "languagePopulation" 933 + "\t" 934 + "languageCode" 935 + "\t" 936 + "writingPopulation"; 937 } 938 939 @Override toString()940 public String toString() { 941 return countryCode 942 + "\t" 943 + countryPopulation 944 + "\t" 945 + countryGdp 946 + "\t" 947 + countryLiteracy 948 + "\t" 949 + languagePopulation 950 + "\t" 951 + languageCode 952 + "\t" 953 + languageLiteracy; 954 } 955 toString(boolean b)956 public String toString(boolean b) { 957 return "region:\t" 958 + getCountryCodeAndName(countryCode) 959 + "\tpop:\t" 960 + countryPopulation 961 + "\tgdp:\t" 962 + countryGdp 963 + "\tlit:\t" 964 + countryLiteracy 965 + "\tlang:\t" 966 + getLanguageCodeAndName(languageCode) 967 + "\tpop:\t" 968 + languagePopulation 969 + "\tlit:\t" 970 + languageLiteracy; 971 } 972 973 static boolean MARK_OUTPUT = false; 974 getLanguageCode()975 public String getLanguageCode() { 976 if (languageCode.contains("_")) return languageCode; 977 Source source = Iso639Data.getSource(languageCode); 978 if (source == null) { 979 return "§" + languageCode; 980 } 981 if (MARK_OUTPUT) { 982 if (source == Source.ISO_639_3) { 983 return "*" + languageCode; 984 } 985 } 986 return languageCode; 987 } 988 989 static Map<String, String> oldToFixed = new HashMap<>(); 990 getLanguageName()991 public String getLanguageName() { 992 String cldrResult = getExcelQuote(english.getName(languageCode, true)); 993 // String result = getLanguageName2(); 994 // if (!result.equalsIgnoreCase(cldrResult)) { 995 // if (null == oldToFixed.put(result, cldrResult)) { 996 // System.out.println("## " + result + "!=" + cldrResult); 997 // } 998 // } 999 return cldrResult; 1000 } 1001 getLanguageName2()1002 public String getLanguageName2() { 1003 String result = new ULocale(languageCode).getDisplayName(); 1004 if (!result.equals(languageCode)) return getExcelQuote(result); 1005 Set<String> names = Iso639Data.getNames(languageCode); 1006 if (names != null && names.size() != 0) { 1007 if (MARK_OUTPUT) { 1008 return getExcelQuote("*" + names.iterator().next()); 1009 } else { 1010 return getExcelQuote(names.iterator().next()); 1011 } 1012 } 1013 return getExcelQuote("§" + badLanguageName); 1014 } 1015 getCountryName()1016 public String getCountryName() { 1017 return getExcelQuote(getDisplayCountry(countryCode)); 1018 } 1019 getCountryGdpString()1020 public String getCountryGdpString() { 1021 return getExcelQuote(formatNumber(countryGdp, 0, false)); 1022 } 1023 getCountryLiteracyString()1024 public String getCountryLiteracyString() { 1025 return formatPercent(countryLiteracy, 2, false); 1026 } 1027 getCountryPopulationString()1028 public String getCountryPopulationString() { 1029 return getExcelQuote(formatNumber(countryPopulation, 0, false)); 1030 } 1031 getLanguageLiteracyString()1032 public String getLanguageLiteracyString() { 1033 return formatPercent(languageLiteracy, 2, false); 1034 } 1035 getLanguagePopulationString()1036 public String getLanguagePopulationString() { 1037 1038 try { 1039 final double percent = languagePopulation / countryPopulation; 1040 return getExcelQuote( 1041 relativeLanguagePopulation && percent > 0.03 && languagePopulation > 10000 1042 ? formatPercent(percent, 2, false) 1043 : formatNumber(languagePopulation, 3, false)); 1044 } catch (IllegalArgumentException e) { 1045 return "NaN"; 1046 } 1047 } 1048 getLanguagePopulation()1049 private double getLanguagePopulation() { 1050 return languagePopulation; 1051 } 1052 } 1053 getExcelQuote(String comment)1054 public static String getExcelQuote(String comment) { 1055 return comment == null || comment.length() == 0 1056 ? "" 1057 : comment.contains(",") 1058 ? '"' + comment + '"' 1059 : comment.contains("\"") 1060 ? '"' + comment.replace("\"", "\"\"") + '"' 1061 : comment; 1062 } 1063 getCountryCodeAndName(String code)1064 public static String getCountryCodeAndName(String code) { 1065 if (code == null) return null; 1066 return english.getName(CLDRFile.TERRITORY_NAME, code) + " [" + code + "]"; 1067 } 1068 1069 static class RowComparator implements Comparator<RowData> { 1070 @Override compare(RowData me, RowData that)1071 public int compare(RowData me, RowData that) { 1072 int result; 1073 if (0 1074 != (result = 1075 GENERAL_COLLATOR.compare(me.getCountryName(), that.getCountryName()))) 1076 return result; 1077 if (0 1078 != (result = 1079 GENERAL_COLLATOR.compare(me.getLanguageName(), that.getLanguageName()))) 1080 return result; 1081 return me.compareTo(that); 1082 } 1083 } 1084 writeTerritoryLanguageData( PrintWriter out, List<String> failures, Set<RowData> sortedInput)1085 private static void writeTerritoryLanguageData( 1086 PrintWriter out, List<String> failures, Set<RowData> sortedInput) { 1087 1088 String lastCountryCode = ""; 1089 boolean first = true; 1090 LanguageTagParser ltp = new LanguageTagParser(); 1091 1092 out.println( 1093 " <!-- See http://unicode.org/cldr/data/diff/supplemental/territory_language_information.html for more information on territoryInfo. -->"); 1094 out.println("\t<territoryInfo>"); 1095 1096 for (RowData row : sortedInput) { 1097 String countryCode = row.countryCode; 1098 1099 double countryPopulationRaw = row.countryPopulation; 1100 double countryPopulation = 1101 countryPopulationRaw; // (long) Utility.roundToDecimals(countryPopulationRaw, 1102 // 2); 1103 double languageLiteracy = row.languageLiteracy; 1104 double countryLiteracy = row.countryLiteracy; 1105 1106 double countryGDPRaw = row.countryGdp; 1107 long countryGDP = Math.round(countryGDPRaw / gdpFactor); 1108 1109 String languageCode = row.languageCode; 1110 1111 double languagePopulationRaw = row.getLanguagePopulation(); 1112 double languagePopulation = 1113 languagePopulationRaw; // (long) Utility.roundToDecimals(languagePopulationRaw, 1114 // 2); 1115 1116 double languagePopulationPercent = languagePopulation / countryPopulation; 1117 // Utility.roundToDecimals(Math.min(100, Math.max(0, 1118 // languagePopulation*100 / (double)countryPopulation)),3); 1119 1120 if (!countryCode.equals(lastCountryCode)) { 1121 if (first) { 1122 first = false; 1123 } else { 1124 out.println("\t\t</territory>"); 1125 } 1126 out.print( 1127 "\t\t<territory type=\"" 1128 + countryCode 1129 + "\"" 1130 + " gdp=\"" 1131 + formatNumber(countryGDP, 4, true) 1132 + "\"" 1133 + " literacyPercent=\"" 1134 + formatPercent(countryLiteracy, 3, true) 1135 + "\"" 1136 + " population=\"" 1137 + formatNumber(countryPopulation, 6, true) 1138 + "\">"); 1139 lastCountryCode = countryCode; 1140 out.println("\t<!--" + getDisplayCountry(countryCode) + "-->"); 1141 } 1142 1143 if (languageCode.length() != 0 1144 && languagePopulationPercent > 0.0000 1145 && (ALLOW_SMALL_NUMBERS 1146 || languagePopulationPercent >= 1 1147 || languagePopulationRaw > 100000 1148 || languageCode.equals("haw") 1149 || row.officialStatus.isOfficial())) { 1150 // add best case 1151 addBestRegion(languageCode, countryCode, languagePopulationRaw); 1152 String baseScriptLanguage = ltp.set(languageCode).getLanguageScript(); 1153 if (!baseScriptLanguage.equals(languageCode)) { 1154 addBestRegion(baseScriptLanguage, countryCode, languagePopulationRaw); 1155 } 1156 String baseLanguage = ltp.set(baseScriptLanguage).getLanguage(); 1157 if (!baseLanguage.equals(baseScriptLanguage)) { 1158 addBestRegion(baseLanguage, countryCode, languagePopulationRaw); 1159 addBestScript( 1160 baseLanguage, ltp.set(languageCode).getScript(), languagePopulationRaw); 1161 } 1162 1163 if (languageLiteracy != countryLiteracy) { 1164 int debug = 0; 1165 } 1166 out.print( 1167 "\t\t\t<languagePopulation type=\"" 1168 + languageCode 1169 + "\"" 1170 + (DoubleMath.fuzzyCompare( 1171 languageLiteracy, countryLiteracy, 0.0001) 1172 == 0 1173 ? "" 1174 : (DoubleMath.fuzzyCompare(languageLiteracy, 0.05, 0.0001) 1175 == 0 1176 ? " writingPercent=\"" 1177 : " literacyPercent=\"") 1178 + formatPercent(languageLiteracy, 2, true) 1179 + "\"") 1180 + " populationPercent=\"" 1181 + formatPercent(languagePopulationPercent, 2, true) 1182 + "\"" 1183 + (row.officialStatus.isOfficial() 1184 ? " officialStatus=\"" + row.officialStatus + "\"" 1185 : "") 1186 + references.addReference(row.notes) 1187 + "/>"); 1188 out.println("\t<!--" + getLanguageName(languageCode) + "-->"); 1189 } else if (!row.countryCode.equals("ZZ")) { 1190 failures.add( 1191 BadItem.ERROR.toString( 1192 "too few speakers: suspect line", 1193 languageCode, 1194 row.toString(true))); 1195 } 1196 // if (first) { 1197 if (false) 1198 System.out.print( 1199 "countryCode: " 1200 + countryCode 1201 + "\t" 1202 + "countryPopulation: " 1203 + countryPopulation 1204 + "\t" 1205 + "countryGDP: " 1206 + countryGDP 1207 + "\t" 1208 + "languageCode: " 1209 + languageCode 1210 + "\t" 1211 + "languagePopulation: " 1212 + languagePopulation 1213 + CldrUtility.LINE_SEPARATOR); 1214 // } 1215 } 1216 1217 out.println("\t\t</territory>"); 1218 out.println("\t</territoryInfo>"); 1219 } 1220 getDisplayCountry(String countryCode)1221 private static String getDisplayCountry(String countryCode) { 1222 String result = getULocaleCountryName(countryCode); 1223 if (!result.equals(countryCode)) { 1224 return result; 1225 } 1226 result = sc.getData("territory", countryCode); 1227 if (result != null) { 1228 return result; 1229 } 1230 return countryCode; 1231 // new ULocale("und-" + countryCode).getDisplayCountry() 1232 } 1233 getDisplayScript(String scriptCode)1234 private static String getDisplayScript(String scriptCode) { 1235 String result = getULocaleScriptName(scriptCode); 1236 if (!result.equals(scriptCode)) { 1237 return result; 1238 } 1239 result = sc.getData("territory", scriptCode); 1240 if (result != null) { 1241 return result; 1242 } 1243 return scriptCode; 1244 // new ULocale("und-" + countryCode).getDisplayCountry() 1245 } 1246 getLanguageName(String languageCode)1247 private static String getLanguageName(String languageCode) { 1248 String result = getULocaleLocaleName(languageCode); 1249 if (!result.equals(languageCode)) return result; 1250 Set<String> names = Iso639Data.getNames(languageCode); 1251 if (names != null && names.size() != 0) { 1252 return names.iterator().next(); 1253 } 1254 return languageCode; 1255 } 1256 1257 static class References { 1258 Map<String, Pair<String, String>> Rxxx_to_reference = new TreeMap<>(); 1259 Map<Pair<String, String>, String> reference_to_Rxxx = new TreeMap<>(); 1260 Map<String, Pair<String, String>> Rxxx_to_oldReferences = supplementalData.getReferences(); 1261 Map<Pair<String, String>, String> oldReferences_to_Rxxx = new TreeMap<>(); 1262 1263 { 1264 for (String Rxxx : Rxxx_to_oldReferences.keySet()) { Rxxx_to_oldReferences.get(Rxxx)1265 oldReferences_to_Rxxx.put(Rxxx_to_oldReferences.get(Rxxx), Rxxx); 1266 } 1267 } 1268 1269 Matcher URI = PatternCache.get("([a-z]+\\://[\\S]+)\\s?(.*)").matcher(""); 1270 1271 static int referenceStart = 1000; 1272 1273 /** 1274 * Returns " references=\"" + Rxxx + "\"" or "" if there is no reference. 1275 * 1276 * @param rawReferenceText 1277 * @return 1278 */ addReference(String rawReferenceText)1279 private String addReference(String rawReferenceText) { 1280 if (rawReferenceText == null || rawReferenceText.length() == 0) return ""; 1281 Pair<String, String> p; 1282 if (URI.reset(rawReferenceText).matches()) { 1283 p = 1284 new Pair<>( 1285 URI.group(1), 1286 URI.group(2) == null || URI.group(2).length() == 0 1287 ? "[missing]" 1288 : URI.group(2)) 1289 .freeze(); 1290 } else { 1291 p = new Pair<String, String>(null, rawReferenceText).freeze(); 1292 } 1293 1294 String Rxxx = reference_to_Rxxx.get(p); 1295 if (Rxxx == null) { // add new 1296 Rxxx = oldReferences_to_Rxxx.get(p); 1297 if (Rxxx != null) { // if old, just keep number 1298 p = Rxxx_to_oldReferences.get(Rxxx); 1299 } else { // find an empty number 1300 while (true) { 1301 Rxxx = "R" + (referenceStart++); 1302 if (Rxxx_to_reference.get(Rxxx) == null 1303 && Rxxx_to_oldReferences.get(Rxxx) == null) { 1304 break; 1305 } 1306 } 1307 } 1308 // add to new references 1309 reference_to_Rxxx.put(p, Rxxx); 1310 Rxxx_to_reference.put(Rxxx, p); 1311 } 1312 // references="R034" 1313 return " references=\"" + Rxxx + "\""; 1314 } 1315 getReferenceHTML(String Rxxx)1316 String getReferenceHTML(String Rxxx) { 1317 Pair<String, String> p = Rxxx_to_reference.get(Rxxx); // exception if fails. 1318 String uri = p.getFirst(); 1319 String value = p.getSecond(); 1320 uri = 1321 uri == null 1322 ? "" 1323 : " uri=\"" + TransliteratorUtilities.toHTML.transliterate(uri) + "\""; 1324 value = 1325 value == null 1326 ? "[missing]" 1327 : TransliteratorUtilities.toHTML.transliterate(value); 1328 return "\t\t<reference type=\"" + Rxxx + "\"" + uri + ">" + value + "</reference>"; 1329 } 1330 printReferences(PrintWriter out)1331 void printReferences(PrintWriter out) { 1332 // <reference type="R034" uri="isbn:0-321-18578-1">The Unicode Standard 4.0</reference> 1333 out.println("\t<references>"); 1334 for (String Rxxx : Rxxx_to_reference.keySet()) { 1335 out.println(getReferenceHTML(Rxxx)); 1336 } 1337 out.println("\t</references>"); 1338 } 1339 } 1340 1341 static References references = new References(); 1342 getExcelData( List<String> failures, Map<String, RowData> localeToRowData)1343 private static Set<RowData> getExcelData( 1344 List<String> failures, Map<String, RowData> localeToRowData) throws IOException { 1345 1346 LanguageTagParser ltp = new LanguageTagParser(); 1347 1348 String dir = CLDRPaths.GEN_DIRECTORY + "supplemental/"; 1349 final String countryLanguagePopulation = "country_language_population.tsv"; 1350 System.out.println("\n# Problems in " + countryLanguagePopulation + "\n"); 1351 List<List<String>> input = 1352 SpreadSheet.convert(CldrUtility.getUTF8Data(countryLanguagePopulation)); 1353 1354 // TODO: Why is this called? Should it be sc.getGoodAvailableCodes? 1355 Set<String> languages = languagesNeeded; // sc.getGoodAvailableCodes("language"); 1356 1357 Set<String> territories = new TreeSet<>(sc.getGoodAvailableCodes("territory")); 1358 territories.removeAll(supplementalData.getContainers()); 1359 // TODO: Why are these removed if they are "good" (per above function)? 1360 territories.remove("EU"); 1361 territories.remove("QO"); 1362 1363 Set<String> countriesNotFound = new TreeSet<>(territories); 1364 Set<OfficialStatus> statusFound = new TreeSet<>(); 1365 Set<String> countriesWithoutOfficial = new TreeSet<>(territories); 1366 countriesWithoutOfficial.remove("ZZ"); 1367 1368 Map<String, Row.R2<String, Double>> countryToLargestOfficialLanguage = new HashMap<>(); 1369 1370 Set<String> languagesNotFound = new TreeSet<>(languages); 1371 Set<RowData> sortedInput = new TreeSet<>(); 1372 int count = 0; 1373 for (List<String> row : input) { 1374 ++count; 1375 if (count == 1 || row.size() <= COUNTRY_GDP) { 1376 failures.add(join(row, "\t") + "\tShort row"); 1377 continue; 1378 } 1379 try { 1380 RowData x = new RowData(row); 1381 if (x.officialStatus.isOfficial()) { 1382 Row.R2<String, Double> largestOffical = 1383 countryToLargestOfficialLanguage.get(x.countryCode); 1384 if (largestOffical == null) { 1385 countryToLargestOfficialLanguage.put( 1386 x.countryCode, Row.of(x.languageCode, x.languagePopulation)); 1387 } else if (largestOffical.get1() < x.languagePopulation) { 1388 largestOffical.set0(x.languageCode); 1389 largestOffical.set1(x.languagePopulation); 1390 } 1391 } 1392 if (x.officialStatus.isMajor() || x.countryPopulation < 1000) { 1393 countriesWithoutOfficial.remove(x.countryCode); 1394 } 1395 if (!checkCode(LstrType.region, x.countryCode, row)) continue; 1396 statusFound.add(x.officialStatus); 1397 countriesNotFound.remove(x.countryCode); 1398 languagesNotFound.remove(x.languageCode); 1399 if (x.languageCode.contains("_")) { 1400 ltp.set(x.languageCode); 1401 languagesNotFound.remove(ltp.getLanguage()); 1402 if (!checkCode(LstrType.language, ltp.getLanguage(), row)) continue; 1403 if (!checkCode(LstrType.script, ltp.getScript(), row)) continue; 1404 } 1405 String locale = x.languageCode + "_" + x.countryCode; 1406 if (localeToRowData.get(locale) != null) { 1407 BadItem.ERROR.show( 1408 "duplicate data", x.languageCode + " with " + x.countryCode, row); 1409 } 1410 localeToRowData.put(locale, x); 1411 sortedInput.add(x); 1412 } catch (ParseException e) { 1413 failures.add( 1414 join(row, "\t") 1415 + "\t" 1416 + e.getMessage() 1417 + "\t" 1418 + join(Arrays.asList(e.getStackTrace()), ";\t")); 1419 } catch (RuntimeException e) { 1420 throw (RuntimeException) 1421 new IllegalArgumentException("Failure on line " + count + ")\t" + row) 1422 .initCause(e); 1423 } 1424 } 1425 // System.out.println("Note: the following Status values were found in the data: " + 1426 // CldrUtility.join(statusFound, " | ")); 1427 1428 // make sure we have something 1429 for (String country : countriesNotFound) { 1430 RowData x = new RowData(country, "und"); 1431 sortedInput.add(x); 1432 } 1433 for (String language : languagesNotFound) { 1434 RowData x = new RowData("ZZ", language); 1435 sortedInput.add(x); 1436 } 1437 1438 for (RowData row : sortedInput) { 1439 // see which countries have languages that are larger than any offical language 1440 1441 if (!row.officialStatus.isOfficial()) { 1442 // String country = row.countryCode; 1443 Row.R2<String, Double> largestOffical = 1444 countryToLargestOfficialLanguage.get(row.countryCode); 1445 if (largestOffical != null && largestOffical.get1() < row.languagePopulation) { 1446 BadItem.WARNING.show( 1447 "language population > all official languages", 1448 getLanguageCodeAndName(largestOffical.get0()), 1449 row.toString(true)); 1450 } 1451 } 1452 1453 // see which countries are missing an official language 1454 if (!countriesWithoutOfficial.contains(row.countryCode)) continue; 1455 BadItem.ERROR.show( 1456 "missing official language", 1457 row.getCountryName() + "\t" + row.countryCode, 1458 row.toString(true)); 1459 countriesWithoutOfficial.remove(row.countryCode); 1460 } 1461 1462 PrintWriter log = FileUtilities.openUTF8Writer(dir, countryLanguagePopulation); 1463 log.println( 1464 "*\tCName" 1465 + "\tCCode" 1466 + "\tCPopulation" 1467 + "\tCLiteracy" 1468 + "\tCGdp" 1469 + "\tOfficialStatus" 1470 + "\tLanguage" 1471 + "\tLCode" 1472 + "\tLPopulation" 1473 + "\tWritingPop" 1474 + "\tReferences" 1475 + "\tNotes"); 1476 RowComparator rowSorting = new RowComparator(); 1477 Set<RowData> rowSorted = new TreeSet<>(rowSorting); 1478 rowSorted.addAll(sortedInput); 1479 1480 for (RowData row : rowSorted) { 1481 final String langLit = row.getLanguageLiteracyString(); 1482 final String countryLit = row.getCountryLiteracyString(); 1483 log.println( 1484 row.getCountryName() 1485 + "\t" 1486 + row.countryCode 1487 + "\t" 1488 + row.getCountryPopulationString() 1489 + "\t" 1490 + countryLit 1491 + "\t" 1492 + row.getCountryGdpString() 1493 + "\t" 1494 + (row.officialStatus == OfficialStatus.unknown 1495 ? "" 1496 : row.officialStatus) 1497 + "\t" 1498 + row.getLanguageName() 1499 + "\t" 1500 + row.getLanguageCode() 1501 + "\t" 1502 + row.getLanguagePopulationString() 1503 + "\t" 1504 + (langLit.equals(countryLit) ? "" : langLit) 1505 + "\t" 1506 + getExcelQuote(row.comment) 1507 + "\t" 1508 + getExcelQuote(row.notes)); 1509 } 1510 log.close(); 1511 return sortedInput; 1512 } 1513 getCldrParents(Set<String> available)1514 private static Set<String> getCldrParents(Set<String> available) { 1515 LanguageTagParser ltp2 = new LanguageTagParser(); 1516 Set<String> cldrParents = new TreeSet<>(); 1517 for (String locale : available) { 1518 if (skipLocales.contains(locale)) continue; 1519 try { 1520 ltp2.set(locale); 1521 } catch (RuntimeException e) { 1522 System.out.println("Skipping CLDR file: " + locale); 1523 continue; 1524 } 1525 String locale2 = ltp2.getLanguageScript(); 1526 if (locale2.equals("sh")) continue; 1527 // int lastPos = locale.lastIndexOf('_'); 1528 // if (lastPos < 0) continue; 1529 // String locale2 = locale.substring(0,lastPos); 1530 cldrParents.add(locale2); 1531 languageToMaxCountry.put(locale2, null); 1532 } 1533 // System.out.println("CLDR Parents: " + cldrParents); 1534 return cldrParents; 1535 } 1536 showFailures(List<String> failures)1537 private static void showFailures(List<String> failures) { 1538 if (failures.size() <= 1) { 1539 return; 1540 } 1541 System.out.println(); 1542 System.out.println("Failures in Output"); 1543 System.out.println(); 1544 1545 System.out.println(RowData.toStringHeader()); 1546 for (String failure : failures) { 1547 System.out.println(failure); 1548 } 1549 } 1550 getProcessedParent(String localeCode)1551 public static String getProcessedParent(String localeCode) { 1552 if (localeCode == null || localeCode.equals("root")) return null; 1553 int pos = localeCode.lastIndexOf('_'); 1554 if (pos < 0) return "root"; 1555 LanguageTagParser ltp = new LanguageTagParser(); 1556 String script = ltp.set(localeCode).getScript(); 1557 if (script.length() == 0) { 1558 return getFullyResolved(localeCode); 1559 } 1560 return localeCode.substring(0, pos); 1561 } 1562 getFullyResolved(String languageCode)1563 private static String getFullyResolved(String languageCode) { 1564 String result = defaultContent.get(languageCode); 1565 if (result != null) return result; 1566 // we missed. Try taking parent and trying again 1567 int pos = languageCode.length() + 1; 1568 while (true) { 1569 pos = languageCode.lastIndexOf('_', pos - 1); 1570 if (pos < 0) { 1571 return "***" + languageCode; 1572 } 1573 result = defaultContent.get(languageCode.substring(0, pos)); 1574 if (result != null) { 1575 LanguageTagParser ltp = new LanguageTagParser().set(languageCode); 1576 LanguageTagParser ltp2 = new LanguageTagParser().set(result); 1577 String region = ltp.getRegion(); 1578 if (region.length() == 0) { 1579 ltp.setRegion(ltp2.getRegion()); 1580 } 1581 String script = ltp.getScript(); 1582 if (script.length() == 0) { 1583 ltp.setScript(ltp2.getScript()); 1584 } 1585 return ltp.toString(); 1586 } 1587 } 1588 } 1589 1590 static Comparator<Iterable> firstElementComparator = 1591 new Comparator<Iterable>() { 1592 @Override 1593 public int compare(Iterable o1, Iterable o2) { 1594 int result = 1595 ((Comparable) o1.iterator().next()).compareTo((o2.iterator().next())); 1596 assert result != 0; 1597 return result; 1598 } 1599 }; 1600 showDefaults( Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent, Map<String, RowData> localeToRowData, Set<String> defaultLocaleContent)1601 private static void showDefaults( 1602 Set<String> cldrParents, 1603 NumberFormat nf, 1604 Map<String, String> defaultContent, 1605 Map<String, RowData> localeToRowData, 1606 Set<String> defaultLocaleContent) { 1607 1608 if (SHOW_OLD_DEFAULT_CONTENTS) { 1609 System.out.println(); 1610 System.out.println("Computing Defaults Contents"); 1611 System.out.println(); 1612 } 1613 1614 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 1615 Set<String> locales = new TreeSet<>(cldrFactory.getAvailable()); 1616 LocaleIDParser lidp = new LocaleIDParser(); 1617 1618 // add all the combinations of language, script, and territory. 1619 for (String locale : localeToRowData.keySet()) { 1620 String baseLanguage = lidp.set(locale).getLanguage(); 1621 if (locales.contains(baseLanguage) && !locales.contains(locale)) { 1622 locales.add(locale); 1623 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding: " + locale); 1624 } 1625 } 1626 1627 // adding parents 1628 Set<String> toAdd = new TreeSet<>(); 1629 while (true) { 1630 for (String locale : locales) { 1631 String newguy = LocaleIDParser.getParent(locale); 1632 if (newguy != null && !locales.contains(newguy) && !toAdd.contains(newguy)) { 1633 toAdd.add(newguy); 1634 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding parent: " + newguy); 1635 } 1636 } 1637 if (toAdd.size() == 0) { 1638 break; 1639 } 1640 locales.addAll(toAdd); 1641 toAdd.clear(); 1642 } 1643 1644 // get sets of siblings 1645 Set<Set<String>> siblingSets = new TreeSet<>(firstElementComparator); 1646 Set<String> needsADoin = new TreeSet<>(locales); 1647 1648 Set<String> deprecatedLanguages = new TreeSet<>(); 1649 // TODO: why are these here and not read from metadata? 1650 deprecatedLanguages.add("sh"); 1651 Set<String> deprecatedRegions = new TreeSet<>(); 1652 // TODO: why are these here and not read from metadata? 1653 deprecatedRegions.add("YU"); 1654 deprecatedRegions.add("CS"); 1655 deprecatedRegions.add("ZZ"); 1656 1657 // first find all the language subtags that have scripts, and those we need to skip. Those 1658 // are aliased-only 1659 Set<String> skippingItems = new TreeSet<>(); 1660 Set<String> hasAScript = new TreeSet<>(); 1661 // Set<LocaleIDParser.Level> languageOnly = EnumSet.of(LocaleIDParser.Level.Language); 1662 for (String locale : locales) { 1663 lidp.set(locale); 1664 if (lidp.getScript().length() != 0) { 1665 hasAScript.add(lidp.getLanguage()); 1666 } 1667 Set<LocaleIDParser.Level> levels = lidp.getLevels(); 1668 // must have no variants, must have either script or region, no deprecated elements 1669 if (levels.contains(LocaleIDParser.Level.Variants) // no variants 1670 || !(levels.contains(LocaleIDParser.Level.Script) 1671 || levels.contains(LocaleIDParser.Level.Region)) 1672 || deprecatedLanguages.contains(lidp.getLanguage()) 1673 || deprecatedRegions.contains(lidp.getRegion())) { 1674 // skip language-only locales, and ones with variants 1675 needsADoin.remove(locale); 1676 skippingItems.add(locale); 1677 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tremoving: " + locale); 1678 continue; 1679 } 1680 } 1681 // walk through the locales, getting the ones we care about. 1682 Map<String, Double> scriptLocaleToLanguageLiteratePopulation = new TreeMap<>(); 1683 1684 for (String locale : new TreeSet<>(needsADoin)) { 1685 if (!needsADoin.contains(locale)) continue; 1686 lidp.set(locale); 1687 Set<Level> level = lidp.getLevels(); 1688 // skip locales that need scripts and don't have them 1689 if (!level.contains(LocaleIDParser.Level.Script) // no script 1690 && hasAScript.contains(lidp.getLanguage())) { 1691 needsADoin.remove(locale); 1692 skippingItems.add(locale); 1693 continue; 1694 } 1695 // get siblings 1696 Set<String> siblingSet = lidp.getSiblings(needsADoin); 1697 // if it has a script and region 1698 if (level.contains(LocaleIDParser.Level.Script) 1699 && level.contains(LocaleIDParser.Level.Region)) { 1700 double languageLiteratePopulation = 0; 1701 for (String localeID2 : siblingSet) { 1702 RowData rowData = localeToRowData.get(localeID2); 1703 if (rowData != null) { 1704 languageLiteratePopulation += 1705 rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT); 1706 } 1707 } 1708 String parentID = LocaleIDParser.getParent(locale); 1709 scriptLocaleToLanguageLiteratePopulation.put(parentID, languageLiteratePopulation); 1710 } 1711 1712 try { 1713 siblingSets.add(siblingSet); 1714 } catch (RuntimeException e) { 1715 e.printStackTrace(); 1716 } 1717 needsADoin.removeAll(siblingSet); 1718 } 1719 if (SHOW_OLD_DEFAULT_CONTENTS) 1720 System.out.println("ConvertLanguageData Skipping: " + skippingItems); 1721 if (needsADoin.size() != 0) { 1722 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("Missing: " + needsADoin); 1723 } 1724 1725 // walk through the data 1726 Set<String> skippingSingletons = new TreeSet<>(); 1727 1728 Set<String> missingData = new TreeSet<>(); 1729 for (Set<String> siblingSet : siblingSets) { 1730 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("** From siblings: " + siblingSet); 1731 1732 if (false & siblingSet.size() == 1) { 1733 skippingSingletons.add(siblingSet.iterator().next()); 1734 continue; 1735 } 1736 // get best 1737 double best = Double.NEGATIVE_INFINITY; 1738 String bestLocale = "???"; 1739 Set<Pair<Double, String>> data = new TreeSet<>(); 1740 LanguageTagParser ltp = new LanguageTagParser(); 1741 for (String locale : siblingSet) { 1742 RowData rowData = localeToRowData.get(locale); 1743 double languageLiteratePopulation = -1; 1744 if (rowData != null) { 1745 languageLiteratePopulation = 1746 rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT); 1747 } else { 1748 Double d = scriptLocaleToLanguageLiteratePopulation.get(locale); 1749 if (d != null) { 1750 languageLiteratePopulation = d; 1751 } else { 1752 final String region = ltp.set(locale).getRegion(); 1753 if (region.isEmpty() || StandardCodes.isCountry(region)) { 1754 missingData.add(locale); 1755 } 1756 } 1757 } 1758 data.add(new Pair<>(languageLiteratePopulation, locale)); 1759 if (best < languageLiteratePopulation) { 1760 best = languageLiteratePopulation; 1761 bestLocale = locale; 1762 } 1763 } 1764 // show it 1765 for (Pair<Double, String> datum : data) { 1766 if (SHOW_OLD_DEFAULT_CONTENTS) 1767 System.out.format( 1768 "\tContenders: %s %f (based on literate population)" 1769 + CldrUtility.LINE_SEPARATOR, 1770 datum.getSecond(), 1771 datum.getFirst()); 1772 } 1773 // System.out.format("\tPicking default content: %s %f (based on literate population)" + 1774 // Utility.LINE_SEPARATOR, bestLocale, best); 1775 // Hack to fix English 1776 // TODO Generalize in the future for other locales with non-primary scripts 1777 if (bestLocale.startsWith("en_")) { 1778 defaultLocaleContent.add("en_US"); 1779 } else { 1780 defaultLocaleContent.add(bestLocale); 1781 } 1782 } 1783 1784 for (String singleton : skippingSingletons) { 1785 BadItem.WARNING.show("skipping Singletons", singleton); 1786 } 1787 for (String missing : missingData) { 1788 BadItem.WARNING.show("Missing Data", missing); 1789 } 1790 1791 // LanguageTagParser ltp = new LanguageTagParser(); 1792 // Set<String> warnings = new LinkedHashSet(); 1793 // for (String languageCode : languageToMaxCountry.keySet()) { 1794 // CodeAndPopulation best = languageToMaxCountry.get(languageCode); 1795 // String languageSubtag = ltp.set(languageCode).getLanguage(); 1796 // String countryCode = "ZZ"; 1797 // double rawLanguagePopulation = -1; 1798 // if (best != null) { 1799 // countryCode = best.code; 1800 // rawLanguagePopulation = best.population; 1801 // Set<String> regions = LanguageInfo.INSTANCE.languageToRegions.get(languageSubtag); 1802 // if (regions == null || !regions.contains(countryCode)) { 1803 // Set<String> regions2 = LanguageInfo.INSTANCE.languageToRegionsAlt.get(languageSubtag); 1804 // if (regions2 == null || !regions2.contains(countryCode)) { 1805 // warnings.add("WARNING: " + languageCode + " => " + countryCode + ", not in " + regions + 1806 // "/" + regions2); 1807 // } 1808 // } 1809 // } 1810 // String resolvedLanguageCode = languageCode + "_" + countryCode; 1811 // ltp.set(languageCode); 1812 // Set<String> scripts = LanguageInfo.INSTANCE.languageToScripts.get(languageCode); 1813 // String script = ltp.getScript(); 1814 // if (script.length() == 0) { 1815 // CodeAndPopulation bestScript = languageToMaxScript.get(languageCode); 1816 // if (bestScript != null) { 1817 // script = bestScript.code; 1818 // if (scripts == null || !scripts.contains(script)) { 1819 // warnings.add("WARNING: " + languageCode + " => " + script + ", not in " + scripts); 1820 // } 1821 // } else { 1822 // script = "Zzzz"; 1823 // if (scripts == null) { 1824 // scripts = LanguageInfo.INSTANCE.languageToScriptsAlt.get(languageCode); 1825 // } 1826 // if (scripts != null) { 1827 // script = scripts.iterator().next(); 1828 // if (scripts.size() != 1) { 1829 // warnings.add("WARNING: " + languageCode + " => " + scripts); 1830 // } 1831 // } 1832 // } 1833 // if (scripts == null) { 1834 // warnings.add("Missing scripts for: " + languageCode); 1835 // } else if (scripts.size() == 1){ 1836 // script = ""; 1837 // } 1838 // resolvedLanguageCode = languageCode 1839 // + (script.length() == 0 ? "" : "_" + script) 1840 // + "_" + countryCode; 1841 // } 1842 // 1843 // 1844 // System.out.println( 1845 // resolvedLanguageCode 1846 // + "\t" + languageCode 1847 // + "\t" + ULocale.getDisplayName(languageCode, ULocale.ENGLISH) 1848 // + "\t" + countryCode 1849 // + "\t" + ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH) 1850 // + "\t" + formatNumber(rawLanguagePopulation) 1851 // + (cldrParents.contains(languageCode) ? "\tCLDR" : "") 1852 // ); 1853 // if (languageCode.length() == 0) continue; 1854 // defaultContent.put(languageCode, resolvedLanguageCode); 1855 // } 1856 // for (String warning : warnings) { 1857 // System.out.println(warning); 1858 // } 1859 } 1860 1861 // private static void printDefaultContent(Set<String> defaultLocaleContent) { 1862 // String sep = Utility.LINE_SEPARATOR + "\t\t\t"; 1863 // String broken = Utility.breakLines(join(defaultLocaleContent," "), sep, 1864 // PatternCache.get("(\\S)\\S*").matcher(""), 1865 // 80); 1866 // 1867 // Log.println("\t\t<defaultContent locales=\"" + broken + "\""); 1868 // Log.println("\t\t/>"); 1869 // } 1870 getSuppressScript(String languageCode)1871 private static Object getSuppressScript(String languageCode) { 1872 // TODO Auto-generated method stub 1873 return null; 1874 } 1875 join(Collection c, String separator)1876 public static String join(Collection c, String separator) { 1877 StringBuffer result = new StringBuffer(); 1878 boolean first = true; 1879 for (Object x : c) { 1880 if (first) first = false; 1881 else result.append(separator); 1882 result.append(x); 1883 } 1884 return result.toString(); 1885 } 1886 addBestRegion( String languageCode, String countryCode, double languagePopulationRaw)1887 private static void addBestRegion( 1888 String languageCode, String countryCode, double languagePopulationRaw) { 1889 addBest(languageCode, languagePopulationRaw, countryCode, languageToMaxCountry); 1890 } 1891 addBestScript( String languageCode, String scriptCode, double languagePopulationRaw)1892 private static void addBestScript( 1893 String languageCode, String scriptCode, double languagePopulationRaw) { 1894 addBest(languageCode, languagePopulationRaw, scriptCode, languageToMaxScript); 1895 } 1896 addBest( String languageCode, double languagePopulationRaw, String code, Map<String, CodeAndPopulation> languageToMaxCode)1897 private static void addBest( 1898 String languageCode, 1899 double languagePopulationRaw, 1900 String code, 1901 Map<String, CodeAndPopulation> languageToMaxCode) { 1902 if (languageCode.length() == 0) { 1903 throw new IllegalArgumentException(); 1904 } 1905 CodeAndPopulation best = languageToMaxCode.get(languageCode); 1906 if (best == null) { 1907 languageToMaxCode.put(languageCode, best = new CodeAndPopulation()); 1908 } else if (best.population >= languagePopulationRaw) { 1909 return; 1910 } 1911 best.population = languagePopulationRaw; 1912 best.code = code; 1913 } 1914 1915 static class CodeAndPopulation { 1916 String code = null; 1917 double population = Double.NaN; 1918 1919 @Override toString()1920 public String toString() { 1921 return "{" + code + "," + population + "}"; 1922 } 1923 } 1924 1925 public static class GeneralCollator implements Comparator<String> { 1926 static UTF16.StringComparator cpCompare = new UTF16.StringComparator(true, false, 0); 1927 static RuleBasedCollator UCA = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 1928 1929 static { 1930 UCA.setNumericCollation(true); 1931 } 1932 1933 @Override compare(String s1, String s2)1934 public int compare(String s1, String s2) { 1935 if (s1 == null) { 1936 return s2 == null ? 0 : -1; 1937 } else if (s2 == null) { 1938 return 1; 1939 } 1940 int result = UCA.compare(s1, s2); 1941 if (result != 0) return result; 1942 return cpCompare.compare(s1, s2); 1943 } 1944 } 1945 1946 public static class InverseComparator<T> implements Comparator<T> { 1947 private Comparator<T> other; 1948 InverseComparator()1949 public InverseComparator() { 1950 this.other = null; 1951 } 1952 InverseComparator(Comparator<T> other)1953 public InverseComparator(Comparator<T> other) { 1954 this.other = other; 1955 } 1956 1957 @Override compare(T a, T b)1958 public int compare(T a, T b) { 1959 return other == null ? ((Comparable) b).compareTo(a) : other.compare(b, a); 1960 } 1961 } 1962 1963 static Set<String> languagesNeeded = 1964 new TreeSet<>( 1965 // TODO: what is this list? 1966 Arrays.asList( 1967 "ab ba bh bi bo fj fy gd ha ht ik iu ks ku ky lg mi na no rm sa sd sg si sm sn su tg tk to tw vo yi za lb dv chr syr kha sco gv" 1968 .split("\\s"))); 1969 1970 /** Not called? */ 1971 @Deprecated generateIso639_2Data(PrintWriter out)1972 private static void generateIso639_2Data(PrintWriter out) { 1973 for (String languageSubtag : sc.getAvailableCodes("language")) { 1974 String alpha3 = Iso639Data.toAlpha3(languageSubtag); 1975 Type type = Iso639Data.getType(languageSubtag); 1976 Scope scope = Iso639Data.getScope(languageSubtag); 1977 if (type != null || alpha3 != null || scope != null) { 1978 out.println( 1979 "\t\t<languageCode type=\"" 1980 + languageSubtag 1981 + "\"" 1982 + (alpha3 == null ? "" : " iso639Alpha3=\"" + alpha3 + "\"") 1983 + (type == null ? "" : " iso639Type=\"" + type + "\"") 1984 + (scope == null ? "" : " iso639Scope=\"" + scope + "\"") 1985 + "/>"); 1986 } 1987 } 1988 } 1989 1990 static Relation<String, BasicLanguageData> language2BasicLanguageData = 1991 Relation.of(new TreeMap<String, Set<BasicLanguageData>>(), TreeSet.class); 1992 1993 static Map<String, Relation<BasicLanguageData.Type, String>> language_status_scripts; 1994 static Map<Pair<String, String>, String> language_script_references = new TreeMap<>(); 1995 1996 static final Map<String, Map<String, R2<List<String>, String>>> LOCALE_ALIAS_INFO = 1997 SupplementalDataInfo.getInstance().getLocaleAliasInfo(); 1998 getLanguage2Scripts(Set<RowData> sortedInput)1999 static void getLanguage2Scripts(Set<RowData> sortedInput) throws IOException { 2000 language_status_scripts = new TreeMap<>(); 2001 2002 // // get current scripts 2003 // Relation<String,String> languageToDefaultScript = new Relation(new TreeMap(), 2004 // TreeSet.class); 2005 // Relation<String,String> secondaryLanguageToDefaultScript = new Relation(new TreeMap(), 2006 // TreeSet.class); 2007 // for (String languageSubtag : language2BasicLanguageData.keySet()) { 2008 // for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 2009 // for (String script : item.getScripts()) { 2010 // addLanguage2Script(languageSubtag, item.getType(), script); 2011 // } 2012 // } 2013 // } 2014 // System.out.println("Language 2 scripts: " + language_status_scripts); 2015 2016 // #Lcode LanguageName Status Scode ScriptName References 2017 List<List<String>> input = 2018 SpreadSheet.convert(CldrUtility.getUTF8Data("language_script.tsv")); 2019 System.out.println( 2020 CldrUtility.LINE_SEPARATOR 2021 + "# Problems in language_script.tsv" 2022 + CldrUtility.LINE_SEPARATOR); 2023 // int count = -1; 2024 for (List<String> row : input) { 2025 try { 2026 if (row.size() == 0) continue; 2027 // ++count; 2028 String language = row.get(0).trim(); 2029 if (language.length() == 0 || language.startsWith("#")) continue; 2030 BasicLanguageData.Type status = BasicLanguageData.Type.valueOf(row.get(2)); 2031 String scripts = row.get(3); 2032 if (!checkCode(LstrType.language, language, row)) continue; 2033 for (String script : scripts.split("\\s+")) { 2034 if (!checkCode(LstrType.script, script, row)) continue; 2035 // if the script is not modern, demote 2036 Info scriptInfo = ScriptMetadata.getInfo(script); 2037 if (scriptInfo == null) { 2038 BadItem.ERROR.toString( 2039 "illegal script; must be represented in Unicode, remove line or fix", 2040 script, 2041 row); 2042 continue; 2043 } 2044 IdUsage idUsage = scriptInfo.idUsage; 2045 if (status == BasicLanguageData.Type.primary 2046 && idUsage != IdUsage.RECOMMENDED) { 2047 if (idUsage == IdUsage.ASPIRATIONAL || idUsage == IdUsage.LIMITED_USE) { 2048 BadItem.WARNING.toString( 2049 "Script has unexpected usage; make secondary if a Recommended script is used widely for the langauge", 2050 idUsage + ", " + script + "=" + getULocaleScriptName(script), 2051 row); 2052 } else { 2053 BadItem.ERROR.toString( 2054 "Script is not modern; make secondary", 2055 idUsage + ", " + script + "=" + getULocaleScriptName(script), 2056 row); 2057 status = BasicLanguageData.Type.secondary; 2058 } 2059 } 2060 2061 // if the language is not modern, demote 2062 if (LOCALE_ALIAS_INFO.get("language").containsKey(language)) { 2063 BadItem.ERROR.toString( 2064 "Remove/Change deprecated language", 2065 language 2066 + " " 2067 + getLanguageName(language) 2068 + "; " 2069 + LOCALE_ALIAS_INFO.get("language").get(language), 2070 row); 2071 continue; 2072 } 2073 if (status == BasicLanguageData.Type.primary 2074 && !sc.isModernLanguage(language)) { 2075 BadItem.ERROR.toString( 2076 "Should be secondary, language is not modern", 2077 language + " " + getLanguageName(language), 2078 row); 2079 status = BasicLanguageData.Type.secondary; 2080 } 2081 2082 addLanguage2Script(language, status, script); 2083 if (row.size() > 5) { 2084 String reference = row.get(5); 2085 if (reference != null && reference.length() == 0) { 2086 language_script_references.put(new Pair<>(language, script), reference); 2087 } 2088 } 2089 } 2090 } catch (RuntimeException e) { 2091 System.err.println(row); 2092 throw e; 2093 } 2094 } 2095 2096 // System.out.println("Language 2 scripts: " + language_status_scripts); 2097 2098 for (String language : sc.getGoodAvailableCodes("language")) { 2099 if (supplementalData.getDeprecatedInfo("language", language) != null) { 2100 continue; 2101 } 2102 Map<String, String> registryData = sc.getLangData("language", language); 2103 if (registryData != null) { 2104 String suppressScript = registryData.get("Suppress-Script"); 2105 if (suppressScript == null) continue; 2106 if (ScriptMetadata.getInfo(suppressScript) == null) { 2107 // skip, not represented in Unicode 2108 continue; 2109 } 2110 // if there is something already there, we have a problem. 2111 Relation<BasicLanguageData.Type, String> status_scripts = 2112 language_status_scripts.get(language); 2113 if (status_scripts == null) { 2114 System.out.println( 2115 "Missing Suppress-Script: " 2116 + language 2117 + "\tSuppress-Script:\t" 2118 + suppressScript); 2119 } else if (!status_scripts.values().contains(suppressScript)) { 2120 System.out.println( 2121 "Missing Suppress-Script: " 2122 + language 2123 + "\tSuppress-Script:\t" 2124 + suppressScript 2125 + "\tall:\t" 2126 + status_scripts.values()); 2127 } else { 2128 // at this point, the suppressScript is in the union of the primary and 2129 // secondary. 2130 Set<String> primaryScripts = 2131 status_scripts.getAll(BasicLanguageData.Type.primary); 2132 if (primaryScripts != null && !primaryScripts.contains(suppressScript)) { 2133 System.out.println( 2134 "Suppress-Script is not in primary: " 2135 + language 2136 + "\tSuppress-Script:\t" 2137 + suppressScript 2138 + "\tprimary:\t" 2139 + primaryScripts); 2140 } 2141 } 2142 addLanguage2Script(language, BasicLanguageData.Type.primary, suppressScript); 2143 } 2144 } 2145 2146 // remove primaries from secondaries 2147 // check for primaries for scripts 2148 for (String language : language_status_scripts.keySet()) { 2149 Relation<BasicLanguageData.Type, String> status_scripts = 2150 language_status_scripts.get(language); 2151 Set<String> secondaryScripts = status_scripts.getAll(BasicLanguageData.Type.secondary); 2152 if (secondaryScripts == null) continue; 2153 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 2154 if (primaryScripts == null) { 2155 // status_scripts.putAll(BasicLanguageData.Type.primary, secondaryScripts); 2156 // status_scripts.removeAll(BasicLanguageData.Type.secondary); 2157 if (sc.isModernLanguage(language)) { 2158 BadItem.ERROR.show( 2159 "modern language without primary script, might need to edit moribund_languages.txt", 2160 language + " " + getLanguageName(language)); 2161 } 2162 } else { 2163 status_scripts.removeAll(BasicLanguageData.Type.secondary, primaryScripts); 2164 } 2165 } 2166 2167 // check that every living language in the row data has a script 2168 Set<String> livingLanguagesWithTerritories = new TreeSet<>(); 2169 for (RowData rowData : sortedInput) { 2170 String language = rowData.languageCode; 2171 if (sc.isModernLanguage(language) 2172 && Iso639Data.getSource(language) != Iso639Data.Source.ISO_639_3) { 2173 livingLanguagesWithTerritories.add(language); 2174 } 2175 } 2176 for (String language : livingLanguagesWithTerritories) { 2177 Relation<BasicLanguageData.Type, String> status_scripts = 2178 language_status_scripts.get(language); 2179 if (status_scripts != null) { 2180 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 2181 if (primaryScripts != null && primaryScripts.size() > 0) { 2182 continue; 2183 } 2184 } 2185 if (language.equals("tw")) continue; // TODO load aliases and check... 2186 BadItem.WARNING.show( 2187 "ISO 639-1/2 language in language-territory list without primary script", 2188 language + "\t" + getLanguageName(language)); 2189 } 2190 2191 // System.out.println("Language 2 scripts: " + language_status_scripts); 2192 } 2193 checkScript(String script)2194 private static boolean checkScript(String script) { 2195 // TODO Auto-generated method stub 2196 return false; 2197 } 2198 2199 static Validity VALIDITY = Validity.getInstance(); 2200 checkCode(LstrType type, String code, List<String> sourceLine)2201 private static boolean checkCode(LstrType type, String code, List<String> sourceLine) { 2202 Status validity = VALIDITY.getCodeToStatus(type).get(code); 2203 if (validity == Status.regular) { 2204 return true; 2205 } else if (validity == Status.unknown && type == LstrType.region) { 2206 return true; 2207 } 2208 BadItem.ERROR.show("Illegitimate Code", type + ": " + code + " = " + validity, sourceLine); 2209 return false; 2210 } 2211 addLanguage2Script( String language, BasicLanguageData.Type type, String script)2212 private static void addLanguage2Script( 2213 String language, BasicLanguageData.Type type, String script) { 2214 Relation<BasicLanguageData.Type, String> status_scripts = 2215 language_status_scripts.get(language); 2216 if (status_scripts == null) 2217 language_status_scripts.put( 2218 language, 2219 status_scripts = 2220 Relation.of( 2221 new TreeMap<BasicLanguageData.Type, Set<String>>(), 2222 TreeSet.class)); 2223 status_scripts.put(type, script); 2224 } 2225 addLanguageScriptData()2226 static void addLanguageScriptData() throws IOException { 2227 // check to make sure that every language subtag is in 639-3 2228 Set<String> langRegistryCodes = sc.getGoodAvailableCodes("language"); 2229 // Set<String> iso639_2_missing = new TreeSet(langRegistryCodes); 2230 // iso639_2_missing.removeAll(Iso639Data.getAvailable()); 2231 // iso639_2_missing.remove("root"); 2232 // if (iso639_2_missing.size() != 0) { 2233 // for (String missing : iso639_2_missing){ 2234 // System.out.println("*ERROR in StandardCodes* Missing Lang/Script data:\t" + missing + ", 2235 // " + 2236 // sc.getData("language", missing)); 2237 // } 2238 // } 2239 2240 // Map<String, String> nameToTerritoryCode = new TreeMap(); 2241 // for (String territoryCode : sc.getGoodAvailableCodes("territory")) { 2242 // nameToTerritoryCode.put(sc.getData("territory", territoryCode).toLowerCase(), 2243 // territoryCode); 2244 // } 2245 // nameToTerritoryCode.put("iran", nameToTerritoryCode.get("iran, islamic republic of")); // 2246 2247 // BasicLanguageData languageData = new BasicLanguageData(); 2248 2249 BufferedReader in = CldrUtility.getUTF8Data("extraLanguagesAndScripts.txt"); 2250 while (true) { 2251 String line = in.readLine(); 2252 if (line == null) break; 2253 String[] parts = line.split("\\t"); 2254 String alpha3 = parts[0]; 2255 alpha3 = stripBrackets(alpha3); 2256 String languageSubtag = Iso639Data.fromAlpha3(alpha3); 2257 if (languageSubtag == null) { 2258 if (langRegistryCodes.contains(alpha3)) { 2259 languageSubtag = alpha3; 2260 } else { 2261 BadItem.WARNING.show("Language subtag not found on line", alpha3, line); 2262 continue; 2263 } 2264 } 2265 // String name = parts[1]; 2266 Set<String> names = Iso639Data.getNames(languageSubtag); 2267 if (names == null) { 2268 Map<String, String> name2 = sc.getLangData("language", languageSubtag); 2269 if (name2 != null) { 2270 String name3 = name2.get("Description"); 2271 if (name3 != null) { 2272 names = new TreeSet<>(); 2273 names.add(name3); 2274 } 2275 } 2276 } 2277 // if (names == null || !names.contains(name)) { 2278 // System.out.println("Name <" + name + "> for <" + languageSubtag + "> not found in " + 2279 // names); 2280 // } 2281 2282 // names all straight, now get scripts and territories 2283 // [Cyrl]; [Latn] 2284 Set<String> fullScriptList = sc.getGoodAvailableCodes("script"); 2285 2286 String[] scriptList = parts[2].split("[;,]\\s*"); 2287 Set<String> scripts = new TreeSet<>(); 2288 Set<String> scriptsAlt = new TreeSet<>(); 2289 for (String script : scriptList) { 2290 if (script.length() == 0) continue; 2291 boolean alt = false; 2292 if (script.endsWith("*")) { 2293 alt = true; 2294 script = script.substring(0, script.length() - 1); 2295 } 2296 script = stripBrackets(script); 2297 if (!fullScriptList.contains(script)) { 2298 System.out.println( 2299 "Script <" 2300 + script 2301 + "> for <" 2302 + languageSubtag 2303 + "> not found in " 2304 + fullScriptList); 2305 } else if (alt) { 2306 scriptsAlt.add(script); 2307 } else { 2308 scripts.add(script); 2309 } 2310 } 2311 // now territories 2312 Set<String> territories = new TreeSet<>(); 2313 if (parts.length > 4) { 2314 String[] territoryList = parts[4].split("\\s*[;,-]\\s*"); 2315 for (String territoryName : territoryList) { 2316 if (territoryName.equals("ISO/DIS 639") || territoryName.equals("3")) continue; 2317 String territoryCode = 2318 CountryCodeConverter.getCodeFromName(territoryName, true); 2319 if (territoryCode == null) { 2320 BadItem.ERROR.show( 2321 "no name found for territory", 2322 "<" + territoryName + ">", 2323 languageSubtag); 2324 } else { 2325 territories.add(territoryCode); 2326 } 2327 } 2328 } 2329 // <language type="de" scripts="Latn" territories="IT" alt="secondary"/> 2330 // we're going to go ahead and set these all to secondary. 2331 if (scripts.size() != 0) { 2332 language2BasicLanguageData.put( 2333 languageSubtag, 2334 new BasicLanguageData() 2335 .setType(BasicLanguageData.Type.secondary) 2336 .setScripts(scripts) 2337 .setTerritories(territories)); 2338 } 2339 if (scriptsAlt.size() != 0) { 2340 language2BasicLanguageData.put( 2341 languageSubtag, 2342 new BasicLanguageData() 2343 .setType(BasicLanguageData.Type.secondary) 2344 .setScripts(scriptsAlt) 2345 .setTerritories(territories)); 2346 } 2347 } 2348 in.close(); 2349 2350 // add other data 2351 for (String languageSubtag : supplementalData.getBasicLanguageDataLanguages()) { 2352 Set<BasicLanguageData> otherData = 2353 supplementalData.getBasicLanguageData(languageSubtag); 2354 language2BasicLanguageData.putAll(languageSubtag, otherData); 2355 } 2356 } 2357 2358 // private static void showAllBasicLanguageData(Relation<String, BasicLanguageData> 2359 // language2basicData, String 2360 // comment) { 2361 // // now print 2362 // Relation<String, String> primaryCombos = new Relation(new TreeMap(), TreeSet.class); 2363 // Relation<String, String> secondaryCombos = new Relation(new TreeMap(), TreeSet.class); 2364 // 2365 // Log.println("\t<languageData>" + (comment == null ? "" : " <!-- " + comment + " -->")); 2366 // 2367 // for (String languageSubtag : language2basicData.keySet()) { 2368 // String duplicate = ""; 2369 // // script,territory 2370 // primaryCombos.clear(); 2371 // secondaryCombos.clear(); 2372 // 2373 // for (BasicLanguageData item : language2basicData.getAll(languageSubtag)) { 2374 // Set<String> scripts = item.getScripts(); 2375 // if (scripts.size() == 0) scripts = new TreeSet(Arrays.asList(new String[] { "Zzzz" })); 2376 // for (String script : scripts) { 2377 // Set<String> territories = item.getTerritories(); 2378 // if (territories.size() == 0) territories = new TreeSet(Arrays.asList(new String[] { "ZZ" })); 2379 // for (String territory : territories) { 2380 // if (item.getType().equals(BasicLanguageData.Type.primary)) { 2381 // primaryCombos.put(script, territory); 2382 // } else { 2383 // secondaryCombos.put(script, territory); 2384 // } 2385 // } 2386 // } 2387 // } 2388 // secondaryCombos.removeAll(primaryCombos); 2389 // showBasicLanguageData(languageSubtag, primaryCombos, null, BasicLanguageData.Type.primary); 2390 // showBasicLanguageData(languageSubtag, secondaryCombos, primaryCombos.keySet(), 2391 // BasicLanguageData.Type.secondary); 2392 // // System.out.println(item.toString(languageSubtag) + duplicate); 2393 // // duplicate = " <!-- " + "**" + " -->"; 2394 // } 2395 // Log.println("\t</languageData>"); 2396 // } 2397 showBasicLanguageData( PrintWriter out, String languageSubtag, Relation<String, String> primaryCombos, Set<String> suppressEmptyScripts, BasicLanguageData.Type type)2398 private static void showBasicLanguageData( 2399 PrintWriter out, 2400 String languageSubtag, 2401 Relation<String, String> primaryCombos, 2402 Set<String> suppressEmptyScripts, 2403 BasicLanguageData.Type type) { 2404 Set<String> scriptsWithSameTerritories = new TreeSet<>(); 2405 Set<String> lastTerritories = Collections.emptySet(); 2406 for (String script : primaryCombos.keySet()) { 2407 Set<String> territories = primaryCombos.getAll(script); 2408 if (lastTerritories == Collections.EMPTY_SET) { 2409 // skip first 2410 } else if (lastTerritories.equals(territories)) { 2411 scriptsWithSameTerritories.add(script); 2412 } else { 2413 showBasicLanguageData2( 2414 out, 2415 languageSubtag, 2416 scriptsWithSameTerritories, 2417 suppressEmptyScripts, 2418 lastTerritories, 2419 type); 2420 scriptsWithSameTerritories.clear(); 2421 } 2422 lastTerritories = territories; 2423 scriptsWithSameTerritories.add(script); 2424 } 2425 showBasicLanguageData2( 2426 out, 2427 languageSubtag, 2428 scriptsWithSameTerritories, 2429 suppressEmptyScripts, 2430 lastTerritories, 2431 type); 2432 } 2433 showBasicLanguageData2( PrintWriter out, String languageSubtag, Set<String> scripts, Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type)2434 private static void showBasicLanguageData2( 2435 PrintWriter out, 2436 String languageSubtag, 2437 Set<String> scripts, 2438 Set<String> suppressEmptyScripts, 2439 Set<String> territories, 2440 BasicLanguageData.Type type) { 2441 scripts.remove("Zzzz"); 2442 territories.remove("ZZ"); 2443 if (territories.size() == 0 && suppressEmptyScripts != null) { 2444 scripts.removeAll(suppressEmptyScripts); 2445 } 2446 if (scripts.size() == 0 && territories.size() == 0) return; 2447 out.println( 2448 "\t\t<language type=\"" 2449 + languageSubtag 2450 + "\"" 2451 + (scripts.size() == 0 2452 ? "" 2453 : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") 2454 + (territories.size() == 0 2455 ? "" 2456 : " territories=\"" + CldrUtility.join(territories, " ") + "\"") 2457 + (type == BasicLanguageData.Type.primary ? "" : " alt=\"" + type + "\"") 2458 + "/>"); 2459 } 2460 2461 /* 2462 * System.out.println( 2463 * "\t\t<language type=\"" + languageSubtag + "\"" + 2464 * " scripts=\"" + Utility.join(scripts," ") + "\"" + 2465 * (territories.size() == 0 ? "" : " territories=\"" + Utility.join(territories," ") + "\"") + 2466 * "/>" 2467 * ); 2468 */ 2469 stripBrackets(String alpha3)2470 private static String stripBrackets(String alpha3) { 2471 if (alpha3.startsWith("[") && alpha3.endsWith("]")) { 2472 alpha3 = alpha3.substring(1, alpha3.length() - 1); 2473 } 2474 return alpha3; 2475 } 2476 2477 static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH); 2478 static NumberFormat nf_no_comma = NumberFormat.getInstance(ULocale.ENGLISH); 2479 2480 static { 2481 nf_no_comma.setGroupingUsed(false); 2482 } 2483 2484 static NumberFormat pf = NumberFormat.getPercentInstance(ULocale.ENGLISH); 2485 formatNumber(double original, int roundDigits, boolean xml)2486 public static String formatNumber(double original, int roundDigits, boolean xml) { 2487 double d = original; 2488 if (roundDigits != 0) { 2489 d = CldrUtility.roundToDecimals(original, roundDigits); 2490 } 2491 if (Double.isNaN(d)) { 2492 d = CldrUtility.roundToDecimals(original, roundDigits); 2493 throw new IllegalArgumentException("Double is NaN"); 2494 } 2495 if (xml) { 2496 return nf_no_comma.format(d); 2497 } 2498 return nf.format(d); 2499 } 2500 formatPercent(double d, int roundDigits, boolean xml)2501 public static String formatPercent(double d, int roundDigits, boolean xml) { 2502 if (roundDigits != 0) { 2503 d = CldrUtility.roundToDecimals(d, roundDigits); 2504 } 2505 if (xml) { 2506 nf_no_comma.setMaximumFractionDigits(roundDigits + 2); 2507 return nf_no_comma.format(d * 100.0); 2508 } 2509 pf.setMaximumFractionDigits(roundDigits + 2); 2510 return pf.format(d); 2511 } 2512 2513 static final LanguageTagCanonicalizer languageTagCanonicalizer = new LanguageTagCanonicalizer(); 2514 fixLanguageCode(String languageCodeRaw, List<String> row)2515 private static String fixLanguageCode(String languageCodeRaw, List<String> row) { 2516 String languageCode = languageTagCanonicalizer.transform(languageCodeRaw); 2517 if (DEBUG && !languageCode.equals(languageCodeRaw)) { 2518 System.out.println("## " + languageCodeRaw + " => " + languageCode); 2519 } 2520 int bar = languageCode.indexOf('_'); 2521 String script = ""; 2522 if (bar >= 0) { 2523 script = languageCode.substring(bar); 2524 languageCode = languageCode.substring(0, bar); 2525 } 2526 R2<List<String>, String> replacement = 2527 supplementalData.getLocaleAliasInfo().get("language").get(languageCode); 2528 if (replacement != null) { 2529 String replacementCode = replacement.get0().get(0); 2530 BadItem.ERROR.show( 2531 "deprecated language code", languageCode + " => " + replacementCode, row); 2532 languageCode = replacementCode; 2533 } 2534 if (!sc.getAvailableCodes("language").contains(languageCode)) { 2535 BadItem.ERROR.show("bad language code", languageCode, row); 2536 } 2537 return languageCode + script; 2538 } 2539 2540 enum BadItem { 2541 ERROR, 2542 WARNING, 2543 DETAIL; 2544 show(String problem, String details, String... items)2545 void show(String problem, String details, String... items) { 2546 System.out.println(toString(problem, details, items)); 2547 } 2548 show(String problem, String details, List<String> row)2549 void show(String problem, String details, List<String> row) { 2550 System.out.println(toString(problem, details, row)); 2551 } 2552 toString(String problem, String details, String... items)2553 private String toString(String problem, String details, String... items) { 2554 return toString(problem, details, Arrays.asList(items)); 2555 } 2556 toString(String problem, String details, List<String> row)2557 private String toString(String problem, String details, List<String> row) { 2558 return "* " 2559 + this 2560 + " *\t" 2561 + problem 2562 + ":" 2563 + "\t" 2564 + details 2565 + (row != null && row.size() > 0 ? "\t" + Joiner.on("\t").join(row) : ""); 2566 } 2567 } 2568 fixCountryCode(String countryCode, List<String> row)2569 private static String fixCountryCode(String countryCode, List<String> row) { 2570 R2<List<String>, String> replacement = 2571 supplementalData.getLocaleAliasInfo().get("territory").get(countryCode); 2572 if (replacement != null) { 2573 String replacementCode = replacement.get0().get(0); 2574 BadItem.ERROR.show( 2575 "deprecated territory code", countryCode + " => " + replacementCode, row); 2576 countryCode = replacementCode; 2577 } 2578 if (!sc.getAvailableCodes("territory").contains(countryCode)) { 2579 BadItem.ERROR.show("bad territory code", countryCode, row); 2580 } 2581 return countryCode; 2582 } 2583 getULocaleLocaleName(String languageCode)2584 private static String getULocaleLocaleName(String languageCode) { 2585 return english.getName(languageCode, true); 2586 // return new ULocale(languageCode).getDisplayName(); 2587 } 2588 getULocaleScriptName(String scriptCode)2589 private static String getULocaleScriptName(String scriptCode) { 2590 return english.getName(CLDRFile.SCRIPT_NAME, scriptCode); 2591 // return ULocale.getDisplayScript("und_" + scriptCode, ULocale.ENGLISH); 2592 } 2593 getULocaleCountryName(String countryCode)2594 private static String getULocaleCountryName(String countryCode) { 2595 return english.getName(CLDRFile.TERRITORY_NAME, countryCode); 2596 // return ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH); 2597 } 2598 } 2599