package org.unicode.cldr.tool;

import com.google.common.base.Joiner;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.LocaleDisplayNames;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.util.ULocale;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.unicode.cldr.tool.GenerateSubdivisions.SubdivisionInfo;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.ChainedMap;
import org.unicode.cldr.util.ChainedMap.M3;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrField;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;
import org.unicode.cldr.util.XMLFileReader;
import org.unicode.cldr.util.XPathParts;
import org.unicode.cldr.util.XPathParts.Comments.CommentType;

/**
 * One node in a containment tree of regions and subdivisions.
 *
 * <p>The tree is owned by a {@link SubdivisionSet}: its root ({@link SubdivisionSet#BASE}) has the
 * code {@code "001"}, the root's children are region nodes, and deeper nodes are subdivisions. Each
 * node registers itself in the owning set's {@code ID_TO_NODE} map when it is constructed.
 */
public class SubdivisionNode {
    private static final Comparator<String> COMPARATOR_ROOT =
            CLDRConfig.getInstance().getComparatorRoot();

    static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
    static final Map<String, R2<List<String>, String>> territoryAliases =
            SDI.getLocaleAliasInfo().get("territory");
    static final Set<String> containment = SDI.getContainers();
    static final Map<String, Map<LstrField, String>> codeToData =
            StandardCodes.getEnumLstreg().get(LstrType.region);

    static LocaleDisplayNames ENGLISH_ICU = LocaleDisplayNames.getInstance(ULocale.ENGLISH);

    static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE =
            CaseMap.toTitle().wholeString().noLowercase();
    static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
    static final CLDRFile ENGLISH_CLDR = CLDR_CONFIG.getEnglish();
    static final Normalizer2 nfc = Normalizer2.getNFCInstance();

    /**
     * Converts a code to the form used in the generated CLDR data.
     *
     * @param regionOrSubdivision a region code or a subdivision code (for example {@code NZ-AUK})
     * @return the code upper-cased if {@link SubdivisionNames#isRegionCode} accepts it; otherwise
     *     the code with every {@code '-'} removed and lower-cased
     */
    public static String convertToCldr(String regionOrSubdivision) {
        return SubdivisionNames.isRegionCode(regionOrSubdivision)
                ? regionOrSubdivision.toUpperCase(Locale.ROOT)
                : regionOrSubdivision.replace("-", "").toLowerCase(Locale.ROOT);
    }

    /** The set that owns this node and stores its names. */
    final SubdivisionSet sset;

    /** The code of this node, as passed to the constructor. */
    final String code;

    /** Depth in the tree: -1 for a node without parent, otherwise the parent's level plus one. */
    final int level;

    /** The containing node, or {@code null} for the root. */
    final SubdivisionNode parent;

    /** Direct children keyed by code, ordered with the CLDR root comparator. */
    final Map<String, SubdivisionNode> children = new TreeMap<>(COMPARATOR_ROOT);

    /**
     * Creates a node and registers it in {@code sset.ID_TO_NODE}. The node is NOT added to {@code
     * parent.children}; callers do that themselves.
     *
     * @param code the code of the new node
     * @param parent the containing node, or {@code null} for the root
     * @param sset the owning set
     */
    public SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset) {
        this.code = code;
        this.level = parent == null ? -1 : parent.level + 1;
        this.parent = parent;
        this.sset = sset;
        sset.ID_TO_NODE.put(code, this);
    }

    /**
     * Records a name for this node directly in the owning set's name table (without the clean-up
     * done by {@link SubdivisionSet#addName(String, String, String)}).
     *
     * @param lang language tag of the name
     * @param value the name
     * @return this node, to allow chaining
     */
    public SubdivisionNode addName(String lang, String value) {
        sset.NAMES.put(code, lang, value);
        return this;
    }

    /** The data read from one subdivision source XML file: the node tree plus lookup tables. */
    static class SubdivisionSet {

        // NOTE: the field order matters. BASE's initializer constructs a SubdivisionNode, which
        // writes to ID_TO_NODE and NAMES, so those must be initialized before BASE.

        /** code (or category id) → language → name. */
        final M3<String, String, String> NAMES =
                ChainedMap.of(
                        new TreeMap<String, Object>(), new TreeMap<String, Object>(), String.class);

        /** subdivision code → related country code (from {@code subdivision-related-country}). */
        final Map<String, String> TO_COUNTRY_CODE = new TreeMap<>();

        /** category name (as returned by {@link #getIsoName}) → subdivision codes in it. */
        final Relation<String, String> ID_SAMPLE =
                Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);

        /** subdivision code → category id. */
        final Map<String, String> SUB_TO_CAT = new TreeMap<>();

        /** region code → all subdivision codes added for that region. */
        final Relation<String, String> REGION_CONTAINS =
                Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);

        /** code → node, for every node created for this set. */
        final Map<String, SubdivisionNode> ID_TO_NODE = new HashMap<>();

        /** Root of the tree. */
        final SubdivisionNode BASE = new SubdivisionNode("001", null, this).addName("en", "World");

        /**
         * Records a name for a code after light clean-up: a trailing "(see also separate country
         * ...)" remark is cut off and every {@code '*'} is removed.
         *
         * @param code subdivision code or category id
         * @param lang language tag of the name
         * @param value the raw name from the source file
         */
        public void addName(String code, String lang, String value) {
            int parenPos = value.indexOf("(see also separate country");
            if (parenPos >= 0) {
                /*
                Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ1: expected "Caribbean Netherlands", got "Bonaire"
                Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ2: expected "Caribbean Netherlands", got "Saba"
                Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ3: expected "Caribbean Netherlands", got "Sint Eustatius"
                Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-21: expected "Svalbard & Jan Mayen", got "Svalbard"
                Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-22: expected "Svalbard & Jan Mayen", got "Jan Mayen"
                */
                // OLD code to guess country from comment
                // String paren = value.substring(value.length() - 3, value.length() - 1);
                // if (!paren.equals("BQ") && !paren.equals("SJ")) {
                //     String old = TO_COUNTRY_CODE.get(code);
                //     if (old != null) {
                //         System.err.println("Duplicate: " + code + "\t" + old + "\t" + paren);
                //     }
                //     TO_COUNTRY_CODE.put(code, paren);
                // }
                value = value.substring(0, parenPos).trim();
            }
            value = value.replace("*", "");
            NAMES.put(code, lang, value);
        }

        /**
         * Generic words and phrases that {@link #clean} strips from names. Longer phrases must
         * come before their prefixes (e.g. "Province of" before "Province") because the regex
         * alternation takes the first alternative that matches.
         */
        static final String[] CRUFT = {
            "Emirate",
            "Parish",
            "County",
            "District",
            "Region",
            "Province of",
            "Province",
            "Republic",
            ", Barbados",
            ", Burkina Faso",
            "Governorate",
            "Department",
            "Canton of",
            "(Région des)",
            "(Région du)",
            "(Région de la)",
            "Autonomous",
            "Archipelago of",
            "Canton",
            "kanton",
            ", Bahamas",
            "province",
            "(Région)",
            "(Région de l')",
            ", Cameroon",
            "State of",
            "State",
            "Metropolitan Borough of",
            "London Borough of",
            "Royal Borough of",
            "Borough of",
            "Borough",
            "Council of",
            "Council",
            "City of",
            ", The",
            "prefecture",
            "Prefecture",
            "municipality"
        };

        static final Pattern CRUFT_PATTERN = PatternCache.get(buildCruftRegex());
        static final Pattern BRACKETED = PatternCache.get("\\[.*\\]");

        /**
         * Builds the case-insensitive regex matching any {@link #CRUFT} entry.
         *
         * <p>Each entry is quoted so that entries such as "(Région des)" match literally instead of
         * being parsed as a capture group, and the alternation is grouped so that the word-boundary
         * checks apply to every entry rather than only the first and last. A {@code \b} is added
         * only at an edge that is a letter or digit, because {@code \b} next to punctuation (such
         * as the leading "(" or ",") would demand an adjacent word character.
         */
        private static String buildCruftRegex() {
            StringBuilder regex = new StringBuilder("(?i)(?:");
            boolean first = true;
            for (String cruft : CRUFT) {
                if (first) {
                    first = false;
                } else {
                    regex.append('|');
                }
                if (Character.isLetterOrDigit(cruft.charAt(0))) {
                    regex.append("\\b");
                }
                regex.append(Pattern.quote(cruft));
                if (Character.isLetterOrDigit(cruft.charAt(cruft.length() - 1))) {
                    regex.append("\\b");
                }
            }
            return regex.append(')').toString();
        }

        /**
         * Quick and dirty simplification of a name: removes bracketed text and {@link #CRUFT}
         * phrases, collapses the double spaces this leaves behind, drops one trailing comma, and
         * applies {@link #fixName}.
         *
         * @param input the name, may be {@code null}
         * @return the simplified name, or {@code null} if {@code input} is {@code null}
         */
        static String clean(String input) {
            if (input == null) {
                return input;
            }
            input = BRACKETED.matcher(input).replaceAll("");
            input = CRUFT_PATTERN.matcher(input).replaceAll("");
            // Removing a phrase from the middle of a name leaves two adjacent spaces.
            input = input.replace("  ", " ");
            if (input.endsWith(",")) {
                input = input.substring(0, input.length() - 1);
            }
            return fixName(input);
        }

        /**
         * Adds a subdivision name to {@code fileSubdivisions} unless the file already has a value
         * for that subdivision.
         *
         * @param fileSubdivisions the (thawed) file to add to
         * @param sdCode the subdivision code; converted with {@link #convertToCldr}
         * @param name the name to add; nothing is done if {@code null}
         * @param level optional line comment to attach to the new path, or {@code null}
         */
        private static void appendName(
                CLDRFile fileSubdivisions, final String sdCode, String name, String level)
                throws IOException {
            if (name == null) {
                return;
            }
            String cldrCode = convertToCldr(sdCode);
            String path =
                    "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\""
                            + cldrCode
                            + "\"]";
            String oldValue = fileSubdivisions.getStringValue(path);
            if (oldValue != null) {
                return; // don't override old values
            }
            fileSubdivisions.add(path, name);
            if (level != null) {
                fileSubdivisions.addComment(path, level, CommentType.LINE);
            }
        }

        /**
         * Tells whether a region code should get subdivision names.
         *
         * @return {@code false} for "001", for aliased territories, for containers and for
         *     private-use codes; {@code true} otherwise
         * @throws IllegalArgumentException if a rejected code nevertheless has subdivisions in
         *     {@link #REGION_CONTAINS}
         */
        private boolean isKosher(String regionCode) {
            if (regionCode.equals("001")) {
                return false;
            }
            if (territoryAliases.containsKey(regionCode)
                    || containment.contains(regionCode)
                    || isPrivateUse(regionCode)) {
                Set<String> rc = REGION_CONTAINS.get(regionCode);
                if (rc != null) {
                    throw new IllegalArgumentException("? " + regionCode + ": " + rc);
                }
                return false;
            }
            return true;
        }

        /**
         * Null-safe check of the registry description. Callers pass codes (e.g. regions that only
         * occur in former subdivision data) that need not be present in {@code codeToData}; such
         * codes are treated as not private-use instead of causing a NullPointerException.
         */
        private static boolean isPrivateUse(String regionCode) {
            Map<LstrField, String> data = codeToData.get(regionCode);
            String description = data == null ? null : data.get(LstrField.Description);
            return description != null && description.contains("Private use");
        }

        /**
         * Appends all descendants in {@code children2} to {@code ordered}: first every node of one
         * level (sorted by the root comparator), then, for each of those nodes in turn, its own
         * descendants.
         */
        private static void addChildren(
                Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2) {
            TreeMap<String, SubdivisionNode> temp = new TreeMap<>(COMPARATOR_ROOT);
            temp.putAll(children2);
            ordered.addAll(temp.values());
            for (SubdivisionNode n : temp.values()) {
                if (!n.children.isEmpty()) {
                    addChildren(ordered, n.children);
                }
            }
        }

        /** Manual overrides consulted first by {@link #getBestName}; empty unless filled. */
        static Map<String, String> NAME_CORRECTIONS = new HashMap<>();

        /**
         * Finds the preferred English name for a subdivision code, trying in order: {@link
         * #NAME_CORRECTIONS}; the territory name of the first replacement listed in the former
         * subdivision aliases; the former English subdivision name; the former English name of any
         * old alias of the code; and finally (only if {@code useIso}) the name from the source
         * file.
         *
         * @param value the subdivision code
         * @param useIso whether to fall back to {@link #getIsoName}, and to "UNKNOWN" after that
         * @return the name passed through {@link #fixName}; {@code null} only if nothing was found
         *     and {@code useIso} is {@code false}
         */
        private String getBestName(String value, boolean useIso) {
            String cldrName = NAME_CORRECTIONS.get(value);
            if (cldrName != null) {
                return fixName(cldrName);
            }
            R2<List<String>, String> subdivisionAlias =
                    SubdivisionInfo.SUBDIVISION_ALIASES_FORMER.get(value);
            if (subdivisionAlias != null) {
                String country = subdivisionAlias.get0().get(0);
                cldrName = ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, country);
                if (cldrName != null) {
                    return fixName(cldrName);
                }
            }

            cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(value);
            if (cldrName != null) {
                return fixName(cldrName);
            }

            Collection<String> oldAliases = SubdivisionInfo.subdivisionIdToOld.get(value);
            if (oldAliases != null) {
                for (String oldAlias : oldAliases) {
                    cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(oldAlias);
                    if (cldrName != null) {
                        return fixName(cldrName);
                    }
                }
            }

            if (useIso) {
                cldrName = getIsoName(value);
                if (cldrName == null) {
                    cldrName = "UNKNOWN";
                    // throw new IllegalArgumentException("Failed to find name: " + value);
                }
                return fixName(cldrName);
            }
            return null;
        }

        /**
         * Normalizes a name: ASCII apostrophes become ’ (U+2019), double spaces are collapsed,
         * the result is trimmed and put into NFC.
         *
         * @return the normalized name, or {@code null} if {@code name} is {@code null}
         */
        private static String fixName(String name) {
            return name == null
                    ? null
                    : nfc.normalize(name.replace('\'', '’').replace("  ", " ").trim());
        }

        /**
         * Reads a subdivision source XML file and builds the node tree and lookup tables.
         *
         * @param sourceFile path of the XML file, as accepted by {@link
         *     XMLFileReader#loadPathValues}
         */
        public SubdivisionSet(String sourceFile) {

            // <country id="AD" version="16">
            //   <subdivision-code footnote="*">AD-02</subdivision-code>
            //   <subdivision-locale lang3code="eng" xml:lang="en">
            //     <subdivision-locale-name>Otago</subdivision-locale-name>

            List<Pair<String, String>> pathValues =
                    XMLFileReader.loadPathValues(
                            sourceFile, new ArrayList<Pair<String, String>>(), false);
            SubdivisionNode lastNode = null;
            String lastCode = null;
            Set<String> conflictingTargetCountries = new HashSet<>();

            for (Pair<String, String> pair : pathValues) {
                String path = pair.getFirst();
                boolean code = path.contains("/subdivision-code");
                boolean name = path.contains("/subdivision-locale-name");
                boolean nameCat = path.contains("/category-name");
                boolean relatedCountry = path.contains("/subdivision-related-country");

                // <country id="AD" version="16">
                //   <category id="262">
                //     <category-name lang3code="fra" xml:lang="fr">paroisse</category-name>
                //     <category-name lang3code="eng" xml:lang="en">parish</category-name>
                // also languages in region...

                // new XML from ISO, so we don't have to guess the country code:
                // <subdivision-code footnote="*">NL-BQ1</subdivision-code>
                // <subdivision-related-country country-id="BQ" xml:lang="en">BONAIRE,
                // SINT EUSTATIUS AND SABA</subdivision-related-country>

                if (!code && !name && !nameCat && !relatedCountry) {
                    continue;
                }
                XPathParts parts = XPathParts.getFrozenInstance(path);
                String value = pair.getSecond();
                if (relatedCountry) {
                    String target = parts.getAttributeValue(-1, "country-id");
                    // Remove conflicting target countries: a country may be the target of at most
                    // one subdivision; once a second one shows up, neither is kept.
                    for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
                        if (entry.getValue().equals(target)) {
                            conflictingTargetCountries.add(target);
                            // Safe although we are iterating: we break out immediately.
                            TO_COUNTRY_CODE.remove(
                                    entry.getKey(), target); // there can be at most one
                            break;
                        }
                    }
                    if (!conflictingTargetCountries.contains(target)) {
                        TO_COUNTRY_CODE.put(lastCode, target);
                    }
                } else if (name) {
                    // The language attributes are on the parent of subdivision-locale-name.
                    int elementNum = -2;
                    String lang = parts.getAttributeValue(elementNum, "xml:lang");
                    if (lang == null) {
                        lang = parts.getAttributeValue(elementNum, "lang3code");
                    }
                    addName(lastCode, lang, value);
                } else if (nameCat) {
                    // country-codes[@generated="2015-05-04T15:40:13.424465+02:00"]/country[@id="AD"][@version="16"]/category[@id="262"]/category-name[@lang3code="fra"][@xml:lang="fr"]
                    int elementNum = -1;
                    String lang = parts.getAttributeValue(elementNum, "xml:lang");
                    if (lang == null) {
                        lang = parts.getAttributeValue(elementNum, "lang3code");
                    }
                    String category = parts.getAttributeValue(-2, "id");
                    addName(category, lang, value);
                } else {
                    // A subdivision-code element; its nesting depth is the number of enclosing
                    // <subdivision> elements.
                    int countSubdivision = 0;
                    for (int i = 0; i < parts.size(); ++i) {
                        if (parts.getElement(i).equals("subdivision")) {
                            ++countSubdivision;
                        }
                    }
                    value = convertToCldr(value);
                    if (countSubdivision == 1) {
                        lastNode = addNode(null, value);
                    } else {
                        lastNode = addNode(lastNode, value);
                    }
                    lastCode = value;
                    int subdivisionElement = parts.findElement("subdivision");
                    String id = parts.getAttributeValue(subdivisionElement, "category-id");
                    addIdSample(id, value);
                    // <subdivision category-id="262">
                    //   <subdivision-code footnote="*">AD-06</subdivision-code>
                }
            }
        }

        /**
         * Records the category of a subdivision, and the subdivision as a sample of its category.
         *
         * @param id the category id
         * @param value the subdivision code
         */
        public void addIdSample(String id, String value) {
            SUB_TO_CAT.put(value, id);
            ID_SAMPLE.put(getIsoName(id), value);
        }

        /**
         * Adds a subdivision to the tree and to {@link #REGION_CONTAINS}.
         *
         * @param lastSubdivision the node to add under, or {@code null} to add directly under the
         *     subdivision's region node (which is created on demand)
         * @param subdivision the subdivision code
         * @return the new (or existing) subdivision node if {@code lastSubdivision} was {@code
         *     null}; otherwise {@code lastSubdivision} itself, i.e. the parent
         */
        final SubdivisionNode addNode(SubdivisionNode lastSubdivision, String subdivision) {
            // "NZ-S", x
            String region = SubdivisionNames.getRegionFromSubdivision(subdivision);
            REGION_CONTAINS.put(region, subdivision);
            if (lastSubdivision == null) {
                lastSubdivision = BASE.children.get(region);
                if (lastSubdivision == null) {
                    lastSubdivision =
                            new SubdivisionNode(region, BASE, this)
                                    .addName("en", ENGLISH_ICU.regionDisplayName(region));
                    BASE.children.put(region, lastSubdivision);
                }
                return add(lastSubdivision, subdivision);
            }
            add(lastSubdivision, subdivision);
            return lastSubdivision;
        }

        /** Returns the child of {@code subdivisionNode1} with the given code, creating it if needed. */
        private SubdivisionNode add(SubdivisionNode subdivisionNode1, String subdivision2) {
            SubdivisionNode subdivisionNode2 = subdivisionNode1.children.get(subdivision2);
            if (subdivisionNode2 == null) {
                subdivisionNode2 = new SubdivisionNode(subdivision2, subdivisionNode1, this);
            }
            subdivisionNode1.children.put(subdivision2, subdivisionNode2);
            return subdivisionNode2;
        }

        private String getName(SubdivisionNode base2) {
            return getIsoName(base2.code);
        }

        /**
         * Returns the name recorded for a code in {@link #NAMES}, preferring "en", then "es", then
         * "fr", then the first remaining language.
         *
         * @return {@code null} if {@code code} is {@code null}; "???" if no names are recorded
         */
        private String getIsoName(String code) {
            if (code == null) {
                return null;
            }
            Map<String, String> map = NAMES.get(code);
            if (map == null) {
                return "???";
            }
            for (String lang : new String[] {"en", "es", "fr"}) {
                String name = map.get(lang);
                if (name != null) {
                    return name;
                }
            }
            return map.entrySet().iterator().next().getValue();
        }

        /**
         * Prints the tree (one tab-separated line per node) followed by the {@link
         * #TO_COUNTRY_CODE} entries.
         */
        public void print(PrintWriter out) {
            print(out, 0, "", BASE);
            for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
                out.println(entry.getKey() + "\t" + entry.getValue());
            }
        }

        /** Prints {@code base2} as "ancestor codes, own code, padding tabs, name", then recurses. */
        private void print(PrintWriter out, int indent, String prefix, SubdivisionNode base2) {
            if (!prefix.isEmpty()) {
                prefix += "\t";
            }
            prefix += base2.code;
            final String indentString = Utility.repeat("\t", 4 - indent);
            out.println(prefix + indentString + getName(base2));
            if (base2.children.isEmpty()) {
                return;
            }
            for (SubdivisionNode child : base2.children.values()) {
                print(out, indent + 1, prefix, child);
            }
        }
    }

    /** Writes the various output files derived from a {@link SubdivisionSet}. */
    static class SubDivisionExtractor {
        final SubdivisionSet sdset;
        final Validity validityFormer;
        final Map<String, R2<List<String>, String>> subdivisionAliasesFormer;
        final Relation<String, String> formerRegionToSubdivisions;

        public SubDivisionExtractor(
                SubdivisionSet sdset,
                Validity validityFormer,
                Map<String, R2<List<String>, String>> subdivisionAliasesFormer,
                Relation<String, String> formerRegionToSubdivisions) {
            this.sdset = sdset;
            this.validityFormer = validityFormer;
            this.subdivisionAliasesFormer = subdivisionAliasesFormer;
            this.formerRegionToSubdivisions = formerRegionToSubdivisions;
        }

        /** Writes the complete {@code subdivisionContainment} supplemental data file. */
        void printXml(Appendable output) throws IOException {

            /*
            <subdivisionContainment>
            <group type="NZ" category="island" contains="NZ-N NZ-S"/> <!-- New Zealand -->
            <group type="NZ" category="special island authority" contains="NZ-CIT"/> <!-- New Zealand -->
            <group type="NZ-N" contains="NZ-AUK NZ-BOP NZ-GIS NZ-HKB NZ-MWT NZ-NTL NZ-AUK NZ-TKI NZ-WGN NZ-WKO"/> <!-- North Island -->
            <group type="NZ-S" contains="NZ-CAN NZ-MBH NZ-STL NZ-NSN NZ-OTA NZ-TAS NZ-WTC"/> <!-- South Island -->
            </subdivisionContainment>
            */
            output.append(
                    DtdType.supplementalData.header(MethodHandles.lookup().lookupClass())
                            + "\t<version number=\"$Revision"
                            + "$\"/>\n"
                            + "\t<subdivisionContainment>\n");
            printXml(output, sdset.BASE, 0);
            output.append("\t</subdivisionContainment>\n</supplementalData>\n");
        }

        /**
         * Writes {@code subdivisionAlias} elements: first for every subdivision that has a related
         * country, then for every formerly valid subdivision code (any status except {@code
         * unknown}) that no longer has a node.
         */
        void printAliases(Appendable output) throws IOException {
            addAliases(output, sdset.TO_COUNTRY_CODE.keySet());

            // Get the old validity data
            Map<Status, Set<String>> oldSubdivisionData =
                    validityFormer.getStatusToCodes(LstrType.subdivision);
            Set<String> missing = new TreeSet<>(COMPARATOR_ROOT);
            missing.addAll(sdset.TO_COUNTRY_CODE.keySet());
            Set<String> nowValid = sdset.ID_TO_NODE.keySet();
            for (Entry<Status, Set<String>> e : oldSubdivisionData.entrySet()) {
                Status v = e.getKey();
                if (v == Status.unknown) {
                    continue;
                }
                Set<String> set = e.getValue();
                for (String sdcodeRaw : set) {
                    String sdcode = sdcodeRaw; // .toUpperCase(Locale.ROOT);
                    // sdcode = sdcode.substring(0,2) + "-" + sdcode.substring(2);
                    if (!nowValid.contains(sdcode)) {
                        missing.add(sdcode);
                    }
                }
            }
            // Those were already written by the first addAliases call above.
            missing.removeAll(sdset.TO_COUNTRY_CODE.keySet());
            addAliases(output, missing);
        }

        /**
         * Writes one alias line per code, using the former alias data if available, otherwise the
         * related country (reason "overlong"), otherwise no replacement at all.
         */
        private void addAliases(Appendable output, Set<String> missing) throws IOException {
            for (String toReplace : missing) {
                List<String> replaceBy = null;
                String reason = "deprecated";
                R2<List<String>, String> aliasInfo = subdivisionAliasesFormer.get(toReplace);
                if (aliasInfo != null) {
                    replaceBy = aliasInfo.get0();
                    reason = aliasInfo.get1();
                    System.out.println("Adding former alias: " + toReplace + " => " + replaceBy);
                } else {
                    String replacement = sdset.TO_COUNTRY_CODE.get(toReplace);
                    if (replacement != null) {
                        replaceBy = Collections.singletonList(replacement);
                        reason = "overlong";
                        System.out.println(
                                "Adding country code alias: " + toReplace + " => " + replaceBy);
                    }
                }
                addAlias(output, toReplace, replaceBy, reason);
            }
        }

        /**
         * Writes a single {@code subdivisionAlias} element followed by a comment with the names.
         * If {@code replaceBy} is {@code null} the whole element is written inside an XML comment;
         * the inner comment opener is then deliberately written as {@code <!- -} because XML
         * comments cannot nest.
         */
        private void addAlias(
                Appendable output,
                final String toReplace,
                final List<String> replaceBy,
                final String reason)
                throws IOException {
            // <languageAlias type="art_lojban" replacement="jbo" reason="deprecated"/> <!-- Lojban
            // -->
            output.append("\t\t\t");
            if (replaceBy == null) {
                output.append("<!-- ");
            }
            output.append(
                    "<subdivisionAlias"
                            + " type=\""
                            + toReplace
                            + "\""
                            + " replacement=\""
                            + (replaceBy == null
                                    ? toReplace.substring(0, 2) + "?"
                                    : Joiner.on(" ").join(replaceBy))
                            + "\""
                            + " reason=\""
                            + reason
                            + "\"/>"
                            + (replaceBy == null ? " <!- - " : " <!-- ")
                            + sdset.getBestName(toReplace, true)
                            + " => "
                            + (replaceBy == null ? "??" : getBestName(replaceBy, true))
                            + " -->"
                            + "\n");
        }

        /** Returns the comma-separated English names of the given region/subdivision codes. */
        private String getBestName(List<String> replaceBy, boolean useIso) {
            StringBuilder result = new StringBuilder();
            for (String s : replaceBy) {
                if (result.length() != 0) {
                    result.append(", ");
                }
                if (SubdivisionNames.isRegionCode(s)) {
                    result.append(ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, s));
                } else {
                    result.append(sdset.getBestName(s, useIso));
                }
            }
            return result.toString();
        }

        /**
         * Writes a {@code subgroup} element for {@code base2} (unless it is the root or a leaf)
         * and recurses into its children.
         *
         * @param indent currently unused; passed through unchanged
         */
        private void printXml(Appendable output, SubdivisionNode base2, int indent)
                throws IOException {
            if (base2.children.isEmpty()) {
                return;
            }
            String type = base2.code;
            if (base2 != sdset.BASE) {
                type = convertToCldr(type);
                output.append("\t\t" + "<subgroup" + " type=\"" + type + "\"" + " contains=\"");
                boolean first = true;
                for (String child : base2.children.keySet()) {
                    if (first) {
                        first = false;
                    } else {
                        output.append(' ');
                    }
                    String subregion = convertToCldr(child);
                    output.append(subregion);
                }
                output.append("\"/>\n");
            }
            for (SubdivisionNode child : base2.children.values()) {
                printXml(output, child, indent);
            }
        }

        /**
         * Writes one line per category name, listing one sample subdivision for each distinct
         * two-letter code prefix.
         */
        public void printSamples(Appendable pw) throws IOException {
            Set<String> seen = new HashSet<>();
            for (Entry<String, Set<String>> entry : sdset.ID_SAMPLE.keyValuesSet()) {
                pw.append(entry.getKey());
                // int max = 10;
                seen.clear();
                for (String sample : entry.getValue()) {
                    String region = sample.substring(0, 2);
                    if (seen.contains(region)) {
                        continue;
                    }
                    seen.add(region);
                    pw.append(
                            ";\t"
                                    + ENGLISH_ICU.regionDisplayName(region)
                                    + ": "
                                    + sdset.getIsoName(sample)
                                    + " ("
                                    + sample
                                    + ")");
                    // if (--max < 0) break;
                }
                pw.append(System.lineSeparator());
            }
        }

        /**
         * Writes a tab-separated comparison of CLDR, ISO and Wikidata names, listing only the
         * subdivisions whose ISO and Wikidata names differ; the equal ones are summarized in one
         * "Equals:" line per country.
         *
         * <p>NOTE(review): the header row lists "MID" and puts "Subdivision" before "CLDR", but the
         * data rows have no MID column and write the CLDR name before the subdivision code. Left
         * unchanged because consumers of this output are not visible here.
         */
        public void printEnglishComp(Appendable output) throws IOException {
            Set<String> countEqual = new TreeSet<>();
            String lastCC = null;
            output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\tEqual\n");
            for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
                final String countryCode = entry.getKey();
                if (!countryCode.equals(lastCC)) {
                    if (lastCC != null && countEqual.size() != 0) {
                        output.append(
                                ENGLISH_ICU.regionDisplayName(lastCC)
                                        + "\t\t\tEquals:\t"
                                        + countEqual.size()
                                        + "\t"
                                        + countEqual
                                        + "\n");
                    }
                    countEqual.clear();

                    lastCC = countryCode;
                }
                for (String value : entry.getValue()) {
                    String cldrName = sdset.getBestName(value, false);
                    String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
                    final String iso = sdset.getIsoName(value);
                    if (iso.equals(wiki)) {
                        countEqual.add(iso);
                        continue;
                    }
                    output.append(
                            ENGLISH_ICU.regionDisplayName(countryCode)
                                    // + "\t" +
                                    // WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value)
                                    + "\t"
                                    + cldrName
                                    + "\t"
                                    + value
                                    + "\t"
                                    + iso
                                    + "\t"
                                    + wiki
                                    + "\n");
                }
            }
            if (countEqual.size() != 0) {
                output.append(
                        ENGLISH_ICU.regionDisplayName(lastCC)
                                + "\t\t\tEquals:\t"
                                + countEqual.size()
                                + "\t"
                                + countEqual
                                + "\n");
            }
        }

        /** Writes a tab-separated comparison of CLDR, ISO and Wikidata names for every subdivision. */
        public void printEnglishCompFull(Appendable output) throws IOException {
            output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\n");
            for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
                final String countryCode = entry.getKey();
                for (String value : entry.getValue()) {
                    String cldrName = sdset.getBestName(value, false);
                    String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
                    final String iso = sdset.getIsoName(value);
                    output.append(
                            ENGLISH_ICU.regionDisplayName(countryCode)
                                    // + "\t" +
                                    // WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value)
                                    + "\t"
                                    + value
                                    + "\t"
                                    + cldrName
                                    + "\t"
                                    + iso
                                    + "\t"
                                    + wiki
                                    + "\n");
                }
            }
        }

        /**
         * Writes the English subdivision names file: the existing "en" subdivisions file plus a
         * name for every current subdivision that has none yet, plus names (commented as
         * deprecated) for former subdivisions of the region that are no longer in the tree.
         */
        public void printEnglish(PrintWriter output) throws IOException {
            TreeSet<String> allRegions = new TreeSet<>();
            allRegions.addAll(codeToData.keySet());
            allRegions.addAll(formerRegionToSubdivisions.keySet()); // override

            Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
            CLDRFile oldFileSubdivisions = cldrFactorySubdivisions.make("en", false);
            CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();

            Set<String> skipped = new LinkedHashSet<>();

            for (String regionCode : allRegions) {
                if (!sdset.isKosher(regionCode)) {
                    if (regionCode.length() != 3) {
                        skipped.add(regionCode);
                    }
                    continue;
                }
                Set<String> remainder = formerRegionToSubdivisions.get(regionCode);
                remainder =
                        remainder == null
                                ? Collections.emptySet()
                                : new LinkedHashSet<>(remainder);

                SubdivisionNode regionNode = sdset.ID_TO_NODE.get(regionCode);
                if (regionNode == null) {
                    continue;
                }

                Set<SubdivisionNode> ordered = new LinkedHashSet<>();
                SubdivisionSet.addChildren(ordered, regionNode.children);

                for (SubdivisionNode node : ordered) {
                    final String sdCode = node.code;
                    String name = sdset.getBestName(sdCode, true);
                    String upper = UCharacter.toUpperCase(name);
                    String title =
                            SubdivisionNode.TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(
                                    Locale.ROOT, null, name);
                    // All-caps names and names not starting with a capital are reported.
                    if (name.equals(upper) || !name.equals(title)) {
                        System.out.println("Suspicious name: " + name);
                    }
                    SubdivisionSet.appendName(fileSubdivisions, sdCode, name, null);
                    remainder.remove(sdCode);
                }
                for (String sdCode : remainder) {
                    String name = sdset.getBestName(sdCode, true);
                    if (!name.equals("???")) {
                        SubdivisionSet.appendName(
                                fileSubdivisions, sdCode, name, "\t<!-- deprecated -->");
                    }
                }
            }
            System.out.println("Skipping: " + skipped);
            fileSubdivisions.write(output);
        }

        /** Currently does nothing; the former implementation is kept below, commented out. */
        public void printMissingMIDs(PrintWriter pw) {
            // for (Entry<String, String> entry :
            //         WikiSubdivisionLanguages.WIKIDATA_TO_MID.entrySet()) {
            //     String mid = entry.getValue();
            //     if (!mid.isEmpty()) {
            //         continue;
            //     }
            //     String subCode = entry.getKey();
            //     String wiki = clean(getWikiName(subCode));
            //     String iso = clean(getIsoName(subCode));
            //     String countryCode = subCode.substring(0, 2);
            //     String cat = SUB_TO_CAT.get(subCode);
            //     String catName = getIsoName(cat);
            //     pw.append(
            //             ENGLISH_ICU.regionDisplayName(countryCode)
            //                     + "\t" + mid
            //                     + "\t" + subCode
            //                     + "\t" + catName
            //                     + "\t" + wiki
            //                     + "\t" + iso
            //                     + "\n"
            //     );
            // }
        }
    }
}