xref: /aosp_15_r20/external/cldr/tools/cldr-rdf/src/main/java/org/unicode/cldr/tool/SubdivisionNode.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.google.common.base.Joiner;
4 import com.ibm.icu.impl.Relation;
5 import com.ibm.icu.impl.Row.R2;
6 import com.ibm.icu.impl.Utility;
7 import com.ibm.icu.lang.UCharacter;
8 import com.ibm.icu.text.CaseMap;
9 import com.ibm.icu.text.LocaleDisplayNames;
10 import com.ibm.icu.text.Normalizer2;
11 import com.ibm.icu.util.ULocale;
12 import java.io.IOException;
13 import java.io.PrintWriter;
14 import java.lang.invoke.MethodHandles;
15 import java.util.ArrayList;
16 import java.util.Collection;
17 import java.util.Collections;
18 import java.util.Comparator;
19 import java.util.HashMap;
20 import java.util.HashSet;
21 import java.util.LinkedHashSet;
22 import java.util.List;
23 import java.util.Locale;
24 import java.util.Map;
25 import java.util.Map.Entry;
26 import java.util.Set;
27 import java.util.TreeMap;
28 import java.util.TreeSet;
29 import java.util.regex.Pattern;
30 import org.unicode.cldr.tool.GenerateSubdivisions.SubdivisionInfo;
31 import org.unicode.cldr.util.CLDRConfig;
32 import org.unicode.cldr.util.CLDRFile;
33 import org.unicode.cldr.util.CLDRPaths;
34 import org.unicode.cldr.util.ChainedMap;
35 import org.unicode.cldr.util.ChainedMap.M3;
36 import org.unicode.cldr.util.DtdType;
37 import org.unicode.cldr.util.Factory;
38 import org.unicode.cldr.util.Pair;
39 import org.unicode.cldr.util.PatternCache;
40 import org.unicode.cldr.util.StandardCodes;
41 import org.unicode.cldr.util.StandardCodes.LstrField;
42 import org.unicode.cldr.util.StandardCodes.LstrType;
43 import org.unicode.cldr.util.SupplementalDataInfo;
44 import org.unicode.cldr.util.Validity;
45 import org.unicode.cldr.util.Validity.Status;
46 import org.unicode.cldr.util.XMLFileReader;
47 import org.unicode.cldr.util.XPathParts;
48 import org.unicode.cldr.util.XPathParts.Comments.CommentType;
49 
50 public class SubdivisionNode {
51     private static final Comparator<String> COMPARATOR_ROOT =
52             CLDRConfig.getInstance().getComparatorRoot();
53     static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
54     static final Map<String, R2<List<String>, String>> territoryAliases =
55             SDI.getLocaleAliasInfo().get("territory");
56     static final Set<String> containment = SDI.getContainers();
57     static final Map<String, Map<LstrField, String>> codeToData =
58             StandardCodes.getEnumLstreg().get(LstrType.region);
59 
60     static LocaleDisplayNames ENGLISH_ICU = LocaleDisplayNames.getInstance(ULocale.ENGLISH);
61 
62     static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE =
63             CaseMap.toTitle().wholeString().noLowercase();
64     static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
65     static final CLDRFile ENGLISH_CLDR = CLDR_CONFIG.getEnglish();
66     static final Normalizer2 nfc = Normalizer2.getNFCInstance();
67 
convertToCldr(String regionOrSubdivision)68     public static String convertToCldr(String regionOrSubdivision) {
69         return SubdivisionNames.isRegionCode(regionOrSubdivision)
70                 ? regionOrSubdivision.toUpperCase(Locale.ROOT)
71                 : regionOrSubdivision.replace("-", "").toLowerCase(Locale.ROOT);
72     }
73 
74     final SubdivisionSet sset;
75     final String code;
76     final int level;
77     final SubdivisionNode parent;
78     final Map<String, SubdivisionNode> children = new TreeMap<>(COMPARATOR_ROOT);
79 
SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset)80     public SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset) {
81         this.code = code;
82         this.level = parent == null ? -1 : parent.level + 1;
83         this.parent = parent;
84         this.sset = sset;
85         sset.ID_TO_NODE.put(code, this);
86     }
87 
addName(String lang, String value)88     public SubdivisionNode addName(String lang, String value) {
89         sset.NAMES.put(code, lang, value);
90         return this;
91     }
92 
93     static class SubdivisionSet {
94 
95         final M3<String, String, String> NAMES =
96                 ChainedMap.of(
97                         new TreeMap<String, Object>(), new TreeMap<String, Object>(), String.class);
98         final Map<String, String> TO_COUNTRY_CODE = new TreeMap<>();
99         final Relation<String, String> ID_SAMPLE =
100                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
101         final Map<String, String> SUB_TO_CAT = new TreeMap<>();
102         final Relation<String, String> REGION_CONTAINS =
103                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
104         final Map<String, SubdivisionNode> ID_TO_NODE = new HashMap<>();
105 
106         final SubdivisionNode BASE = new SubdivisionNode("001", null, this).addName("en", "World");
107 
addName(String code, String lang, String value)108         public void addName(String code, String lang, String value) {
109             int parenPos = value.indexOf("(see also separate country");
110             if (parenPos >= 0) {
111                 /*
112                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ1: expected "Caribbean Netherlands", got "Bonaire"
113                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ2: expected "Caribbean Netherlands", got "Saba"
114                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ3: expected "Caribbean Netherlands", got "Sint Eustatius"
115                 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-21: expected "Svalbard & Jan Mayen", got "Svalbard"
116                 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-22: expected "Svalbard & Jan Mayen", got "Jan Mayen"
117                  */
118                 // OLD code to guess country from comment
119                 //              String paren = value.substring(value.length() - 3, value.length() -
120                 // 1);
121                 //                if (!paren.equals("BQ") && !paren.equals("SJ")) {
122                 //                    String old = TO_COUNTRY_CODE.get(code);
123                 //                    if (old != null) {
124                 //                        System.err.println("Duplicate: " + code + "\t" + old +
125                 // "\t" + paren);
126                 //                    }
127                 //                    TO_COUNTRY_CODE.put(code, paren);
128                 //                }
129                 value = value.substring(0, parenPos).trim();
130             }
131             value = value.replace("*", "");
132             NAMES.put(code, lang, value);
133         }
134 
135         static final String[] CRUFT = {
136             "Emirate",
137             "Parish",
138             "County",
139             "District",
140             "Region",
141             "Province of",
142             "Province",
143             "Republic",
144             ", Barbados",
145             ", Burkina Faso",
146             "Governorate",
147             "Department",
148             "Canton of",
149             "(Région des)",
150             "(Région du)",
151             "(Région de la)",
152             "Autonomous",
153             "Archipelago of",
154             "Canton",
155             "kanton",
156             ", Bahamas",
157             "province",
158             "(Région)",
159             "(Région de l')",
160             ", Cameroon",
161             "State of",
162             "State",
163             "Metropolitan Borough of",
164             "London Borough of",
165             "Royal Borough of",
166             "Borough of",
167             "Borough",
168             "Council of",
169             "Council",
170             "City of",
171             ", The",
172             "prefecture",
173             "Prefecture",
174             "municipality"
175         };
176 
177         static final Pattern CRUFT_PATTERN =
178                 PatternCache.get("(?i)\\b" + String.join("|", CRUFT) + "\\b");
179         static final Pattern BRACKETED = PatternCache.get("\\[.*\\]");
180 
clean(String input)181         static String clean(String input) {
182             if (input == null) {
183                 return input;
184             }
185             // Quick & dirty
186             input = BRACKETED.matcher(input).replaceAll("");
187             input = CRUFT_PATTERN.matcher(input).replaceAll("");
188             //            for (String cruft : CRUFT) {
189             //                int pos = input.indexOf(cruft);
190             //                if (pos >= 0) {
191             //                    input = input.substring(0,pos) + input.substring(pos +
192             // cruft.length());
193             //                }
194             //            }
195             input = input.replace("  ", " ");
196             if (input.endsWith(",")) {
197                 input = input.substring(0, input.length() - 1);
198             }
199             return fixName(input);
200         }
201 
appendName( CLDRFile fileSubdivisions, final String sdCode, String name, String level)202         private static void appendName(
203                 CLDRFile fileSubdivisions, final String sdCode, String name, String level)
204                 throws IOException {
205             if (name == null) {
206                 return;
207             }
208             String cldrCode = convertToCldr(sdCode);
209             String path =
210                     "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\""
211                             + cldrCode
212                             + "\"]";
213             String oldValue = fileSubdivisions.getStringValue(path);
214             if (oldValue != null) {
215                 return; // don't override old values
216             }
217             fileSubdivisions.add(path, name);
218             if (level != null) {
219                 fileSubdivisions.addComment(path, level, CommentType.LINE);
220             }
221         }
222 
isKosher(String regionCode)223         private boolean isKosher(String regionCode) {
224             if (regionCode.equals("001")) {
225                 return false;
226             }
227             if (territoryAliases.containsKey(regionCode)
228                     || containment.contains(regionCode)
229                     || codeToData
230                             .get(regionCode)
231                             .get(LstrField.Description)
232                             .contains("Private use")) {
233                 Set<String> rc = REGION_CONTAINS.get(regionCode);
234                 if (rc != null) {
235                     throw new IllegalArgumentException("? " + regionCode + ": " + rc);
236                 }
237                 return false;
238             }
239             return true;
240         }
241 
addChildren( Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2)242         private static void addChildren(
243                 Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2) {
244             TreeMap<String, SubdivisionNode> temp = new TreeMap<>(COMPARATOR_ROOT);
245             temp.putAll(children2);
246             ordered.addAll(temp.values());
247             for (SubdivisionNode n : temp.values()) {
248                 if (!n.children.isEmpty()) {
249                     addChildren(ordered, n.children);
250                 }
251             }
252         }
253 
254         static Map<String, String> NAME_CORRECTIONS = new HashMap<>();
255 
getBestName(String value, boolean useIso)256         private String getBestName(String value, boolean useIso) {
257             String cldrName = null;
258             cldrName = NAME_CORRECTIONS.get(value);
259             if (cldrName != null) {
260                 return fixName(cldrName);
261             }
262             R2<List<String>, String> subdivisionAlias =
263                     SubdivisionInfo.SUBDIVISION_ALIASES_FORMER.get(value);
264             if (subdivisionAlias != null) {
265                 String country = subdivisionAlias.get0().get(0);
266                 cldrName = ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, country);
267                 if (cldrName != null) {
268                     return fixName(cldrName);
269                 }
270             }
271 
272             cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(value);
273             if (cldrName != null) {
274                 return fixName(cldrName);
275             }
276 
277             Collection<String> oldAliases = SubdivisionInfo.subdivisionIdToOld.get(value);
278             if (oldAliases != null) {
279                 for (String oldAlias : oldAliases) {
280                     cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(oldAlias);
281                     if (cldrName != null) {
282                         return fixName(cldrName);
283                     }
284                 }
285             }
286 
287             if (useIso) {
288                 cldrName = getIsoName(value);
289                 if (cldrName == null) {
290                     cldrName = "UNKNOWN";
291                     // throw new IllegalArgumentException("Failed to find name: " + value);
292                 }
293                 return fixName(cldrName);
294             }
295             return null;
296         }
297 
fixName(String name)298         private static String fixName(String name) {
299             return name == null
300                     ? null
301                     : nfc.normalize(name.replace('\'', '’').replace("  ", " ").trim());
302         }
303 
SubdivisionSet(String sourceFile)304         public SubdivisionSet(String sourceFile) {
305 
306             //    <country id="AD" version="16">
307             //           <subdivision-code footnote="*">AD-02</subdivision-code>
308             //             <subdivision-locale lang3code="eng" xml:lang="en">
309             //                  <subdivision-locale-name>Otago</subdivision-locale-name>
310 
311             List<Pair<String, String>> pathValues =
312                     XMLFileReader.loadPathValues(
313                             sourceFile, new ArrayList<Pair<String, String>>(), false);
314             int maxIndent = 0;
315             SubdivisionNode lastNode = null;
316             String lastCode = null;
317             Set<String> conflictingTargetCountries = new HashSet<>();
318 
319             for (Pair<String, String> pair : pathValues) {
320                 String path = pair.getFirst();
321                 boolean code = path.contains("/subdivision-code");
322                 boolean name = path.contains("/subdivision-locale-name");
323                 boolean nameCat = path.contains("/category-name");
324                 boolean relatedCountry = path.contains("/subdivision-related-country");
325 
326                 //    <country id="AD" version="16">
327                 //       <category id="262">
328                 //  <category-name lang3code="fra" xml:lang="fr">paroisse</category-name>
329                 //  <category-name lang3code="eng" xml:lang="en">parish</category-name>
330                 // also languages in region...
331 
332                 // new XML from ISO, so we don't have to guess the country code:
333                 //            <subdivision-code footnote="*">NL-BQ1</subdivision-code>
334                 //            <subdivision-related-country country-id="BQ" xml:lang="en">BONAIRE,
335                 // SINT EUSTATIUS AND SABA</subdivision-related-country>
336 
337                 if (!code && !name && !nameCat && !relatedCountry) {
338                     continue;
339                 }
340                 XPathParts parts = XPathParts.getFrozenInstance(path);
341                 String value = pair.getSecond();
342                 if (relatedCountry) {
343                     String target = parts.getAttributeValue(-1, "country-id");
344                     // remove conflicting target countries
345                     for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
346                         if (entry.getValue().equals(target)) {
347                             conflictingTargetCountries.add(target);
348                             TO_COUNTRY_CODE.remove(
349                                     entry.getKey(), target); // there can be at most one
350                             break;
351                         }
352                     }
353                     if (!conflictingTargetCountries.contains(target)) {
354                         TO_COUNTRY_CODE.put(lastCode, target);
355                         // System.out.println(lastCode + " => " + target);
356                     }
357                 } else if (name) {
358                     int elementNum = -2;
359                     String lang = parts.getAttributeValue(elementNum, "xml:lang");
360                     if (lang == null) {
361                         lang = parts.getAttributeValue(elementNum, "lang3code");
362                     }
363                     addName(lastCode, lang, value);
364                     // output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang +
365                     // ":\t«" + value + "»\t");
366                 } else if (nameCat) {
367                     // country-codes[@generated="2015-05-04T15:40:13.424465+02:00"]/country[@id="AD"][@version="16"]/category[@id="262"]/category-name[@lang3code="fra"][@xml:lang="fr"]
368                     int elementNum = -1;
369                     String lang = parts.getAttributeValue(elementNum, "xml:lang");
370                     if (lang == null) {
371                         lang = parts.getAttributeValue(elementNum, "lang3code");
372                     }
373                     String category = parts.getAttributeValue(-2, "id");
374                     addName(category, lang, value);
375                     // output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang +
376                     // ":\t«" + value + "»\t");
377                 } else {
378                     int countSubdivision = 0;
379                     for (int i = 0; i < parts.size(); ++i) {
380                         if (parts.getElement(i).equals("subdivision")) {
381                             ++countSubdivision;
382                         }
383                     }
384                     if (maxIndent < countSubdivision) {
385                         maxIndent = countSubdivision;
386                     }
387                     value = convertToCldr(value);
388                     if (countSubdivision == 1) {
389                         lastNode = addNode(null, value);
390                     } else {
391                         lastNode = addNode(lastNode, value);
392                     }
393                     lastCode = value;
394                     int subdivisionElement = parts.findElement("subdivision");
395                     String id = parts.getAttributeValue(subdivisionElement, "category-id");
396                     addIdSample(id, value);
397                     // <subdivision category-id="262">//<subdivision-code
398                     // footnote="*">AD-06</subdivision-code>
399                     // <subdivision category-id="262">
400                     // output.println(++count + Utility.repeat("\t", indent) + "code=" + value);
401                 }
402             }
403         }
404 
addIdSample(String id, String value)405         public void addIdSample(String id, String value) {
406             SUB_TO_CAT.put(value, id);
407             ID_SAMPLE.put(getIsoName(id), value);
408         }
409 
addNode(SubdivisionNode lastSubdivision, String subdivision)410         final SubdivisionNode addNode(SubdivisionNode lastSubdivision, String subdivision) {
411             // "NZ-S", x
412             String region = SubdivisionNames.getRegionFromSubdivision(subdivision);
413             REGION_CONTAINS.put(region, subdivision);
414             if (lastSubdivision == null) {
415                 lastSubdivision = BASE.children.get(region);
416                 if (lastSubdivision == null) {
417                     lastSubdivision =
418                             new SubdivisionNode(region, BASE, this)
419                                     .addName("en", ENGLISH_ICU.regionDisplayName(region));
420                     BASE.children.put(region, lastSubdivision);
421                 }
422                 return add(lastSubdivision, subdivision);
423             }
424             add(lastSubdivision, subdivision);
425             return lastSubdivision;
426         }
427 
add(SubdivisionNode subdivisionNode1, String subdivision2)428         private SubdivisionNode add(SubdivisionNode subdivisionNode1, String subdivision2) {
429             SubdivisionNode subdivisionNode2 = subdivisionNode1.children.get(subdivision2);
430             if (subdivisionNode2 == null) {
431                 subdivisionNode2 = new SubdivisionNode(subdivision2, subdivisionNode1, this);
432             }
433             subdivisionNode1.children.put(subdivision2, subdivisionNode2);
434             return subdivisionNode2;
435         }
436 
getName(SubdivisionNode base2)437         private String getName(SubdivisionNode base2) {
438             return getIsoName(base2.code);
439         }
440 
getIsoName(String code)441         private String getIsoName(String code) {
442             if (code == null) {
443                 return null;
444             }
445             Map<String, String> map = NAMES.get(code);
446             if (map == null) {
447                 return "???";
448             }
449             String name = map.get("en");
450             if (name != null) {
451                 return name;
452             }
453             name = map.get("es");
454             if (name != null) {
455                 return name;
456             }
457             name = map.get("fr");
458             if (name != null) {
459                 return name;
460             }
461             if (name == null) {
462                 name = map.entrySet().iterator().next().getValue();
463             }
464             return name;
465         }
466 
print(PrintWriter out)467         public void print(PrintWriter out) {
468             print(out, 0, "", BASE);
469             for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
470                 out.println(entry.getKey() + "\t" + entry.getValue());
471             }
472         }
473 
print(PrintWriter out, int indent, String prefix, SubdivisionNode base2)474         private void print(PrintWriter out, int indent, String prefix, SubdivisionNode base2) {
475             if (!prefix.isEmpty()) {
476                 prefix += "\t";
477             }
478             prefix += base2.code;
479             final String indentString = Utility.repeat("\t", 4 - indent);
480             out.println(prefix + indentString + getName(base2));
481             if (base2.children.isEmpty()) {
482                 return;
483             }
484             for (SubdivisionNode child : base2.children.values()) {
485                 print(out, indent + 1, prefix, child);
486             }
487         }
488     }
489 
490     static class SubDivisionExtractor {
491         final SubdivisionSet sdset;
492         final Validity validityFormer;
493         final Map<String, R2<List<String>, String>> subdivisionAliasesFormer;
494         final Relation<String, String> formerRegionToSubdivisions;
495 
SubDivisionExtractor( SubdivisionSet sdset, Validity validityFormer, Map<String, R2<List<String>, String>> subdivisionAliasesFormer, Relation<String, String> formerRegionToSubdivisions)496         public SubDivisionExtractor(
497                 SubdivisionSet sdset,
498                 Validity validityFormer,
499                 Map<String, R2<List<String>, String>> subdivisionAliasesFormer,
500                 Relation<String, String> formerRegionToSubdivisions) {
501             this.sdset = sdset;
502             this.validityFormer = validityFormer;
503             this.subdivisionAliasesFormer = subdivisionAliasesFormer;
504             this.formerRegionToSubdivisions = formerRegionToSubdivisions;
505         }
506 
printXml(Appendable output)507         void printXml(Appendable output) throws IOException {
508 
509             /*
510             <subdivisionContainment>
511             <group type="NZ" category="island" contains="NZ-N NZ-S"/> <!-- New Zealand -->
512             <group type="NZ" category="special island authority" contains="NZ-CIT"/> <!-- New Zealand -->
513             <group type="NZ-N" contains="NZ-AUK NZ-BOP NZ-GIS NZ-HKB NZ-MWT NZ-NTL NZ-AUK NZ-TKI NZ-WGN NZ-WKO"/> <!-- North Island -->
514             <group type="NZ-S" contains="NZ-CAN NZ-MBH NZ-STL NZ-NSN NZ-OTA NZ-TAS NZ-WTC"/> <!-- South Island -->
515             </subdivisionContainment>
516              */
517             output.append(
518                     DtdType.supplementalData.header(MethodHandles.lookup().lookupClass())
519                             + "\t<version number=\"$Revision"
520                             + "$\"/>\n"
521                             + "\t<subdivisionContainment>\n");
522             printXml(output, sdset.BASE, 0);
523             output.append("\t</subdivisionContainment>\n</supplementalData>\n");
524         }
525 
printAliases(Appendable output)526         void printAliases(Appendable output) throws IOException {
527             addAliases(output, sdset.TO_COUNTRY_CODE.keySet());
528 
529             // Get the old validity data
530             Map<Status, Set<String>> oldSubdivisionData =
531                     validityFormer.getStatusToCodes(LstrType.subdivision);
532             Set<String> missing = new TreeSet<>(COMPARATOR_ROOT);
533             missing.addAll(sdset.TO_COUNTRY_CODE.keySet());
534             Set<String> nowValid = sdset.ID_TO_NODE.keySet();
535             for (Entry<Status, Set<String>> e : oldSubdivisionData.entrySet()) {
536                 Status v = e.getKey();
537                 if (v == Status.unknown) {
538                     continue;
539                 }
540                 Set<String> set = e.getValue();
541                 for (String sdcodeRaw : set) {
542                     String sdcode = sdcodeRaw; // .toUpperCase(Locale.ROOT);
543                     //                  sdcode = sdcode.substring(0,2) + "-" + sdcode.substring(2);
544                     if (!nowValid.contains(sdcode)) {
545                         missing.add(sdcode);
546                     }
547                 }
548             }
549             missing.removeAll(sdset.TO_COUNTRY_CODE.keySet());
550             addAliases(output, missing);
551         }
552 
addAliases(Appendable output, Set<String> missing)553         private void addAliases(Appendable output, Set<String> missing) throws IOException {
554             for (String toReplace : missing) {
555                 List<String> replaceBy = null;
556                 String reason = "deprecated";
557                 R2<List<String>, String> aliasInfo = subdivisionAliasesFormer.get(toReplace);
558                 if (aliasInfo != null) {
559                     replaceBy = aliasInfo.get0();
560                     reason = aliasInfo.get1();
561                     System.out.println("Adding former alias: " + toReplace + " => " + replaceBy);
562                 } else {
563                     String replacement = sdset.TO_COUNTRY_CODE.get(toReplace);
564                     if (replacement != null) {
565                         replaceBy = Collections.singletonList(replacement);
566                         reason = "overlong";
567                         System.out.println(
568                                 "Adding country code alias: " + toReplace + " => " + replaceBy);
569                     }
570                 }
571                 addAlias(output, toReplace, replaceBy, reason);
572             }
573         }
574 
addAlias( Appendable output, final String toReplace, final List<String> replaceBy, final String reason)575         private void addAlias(
576                 Appendable output,
577                 final String toReplace,
578                 final List<String> replaceBy,
579                 final String reason)
580                 throws IOException {
581             // <languageAlias type="art_lojban" replacement="jbo" reason="deprecated"/> <!-- Lojban
582             // -->
583             output.append("\t\t\t");
584             if (replaceBy == null) {
585                 output.append("<!-- ");
586             }
587             output.append(
588                     "<subdivisionAlias"
589                             + " type=\""
590                             + toReplace
591                             + "\""
592                             + " replacement=\""
593                             + (replaceBy == null
594                                     ? toReplace.substring(0, 2) + "?"
595                                     : Joiner.on(" ").join(replaceBy))
596                             + "\""
597                             + " reason=\""
598                             + reason
599                             + "\"/>"
600                             + (replaceBy == null ? " <!- - " : " <!-- ")
601                             + sdset.getBestName(toReplace, true)
602                             + " => "
603                             + (replaceBy == null ? "??" : getBestName(replaceBy, true))
604                             + " -->"
605                             + "\n");
606         }
607 
getBestName(List<String> replaceBy, boolean useIso)608         private String getBestName(List<String> replaceBy, boolean useIso) {
609             StringBuilder result = new StringBuilder();
610             for (String s : replaceBy) {
611                 if (result.length() != 0) {
612                     result.append(", ");
613                 }
614                 if (SubdivisionNames.isRegionCode(s)) {
615                     result.append(ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, s));
616                 } else {
617                     result.append(sdset.getBestName(s, useIso));
618                 }
619             }
620             return result.toString();
621         }
622 
printXml(Appendable output, SubdivisionNode base2, int indent)623         private void printXml(Appendable output, SubdivisionNode base2, int indent)
624                 throws IOException {
625             if (base2.children.isEmpty()) {
626                 return;
627             }
628             String type = base2.code;
629             if (base2 != sdset.BASE) {
630                 type = convertToCldr(type);
631                 output.append("\t\t" + "<subgroup" + " type=\"" + type + "\"" + " contains=\"");
632                 boolean first = true;
633                 for (String child : base2.children.keySet()) {
634                     if (first) {
635                         first = false;
636                     } else {
637                         output.append(' ');
638                     }
639                     String subregion = convertToCldr(child);
640                     output.append(subregion);
641                 }
642                 output.append("\"/>\n");
643             }
644             for (SubdivisionNode child : base2.children.values()) {
645                 printXml(output, child, indent);
646             }
647         }
648 
printSamples(Appendable pw)649         public void printSamples(Appendable pw) throws IOException {
650             Set<String> seen = new HashSet<>();
651             for (Entry<String, Set<String>> entry : sdset.ID_SAMPLE.keyValuesSet()) {
652                 pw.append(entry.getKey());
653                 // int max = 10;
654                 seen.clear();
655                 for (String sample : entry.getValue()) {
656                     String region = sample.substring(0, 2);
657                     if (seen.contains(region)) {
658                         continue;
659                     }
660                     seen.add(region);
661                     pw.append(
662                             ";\t"
663                                     + ENGLISH_ICU.regionDisplayName(region)
664                                     + ": "
665                                     + sdset.getIsoName(sample)
666                                     + " ("
667                                     + sample
668                                     + ")");
669                     // if (--max < 0) break;
670                 }
671                 pw.append(System.lineSeparator());
672             }
673         }
674 
printEnglishComp(Appendable output)675         public void printEnglishComp(Appendable output) throws IOException {
676             Set<String> countEqual = new TreeSet<>();
677             String lastCC = null;
678             output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\tEqual\n");
679             for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
680                 final String countryCode = entry.getKey();
681                 if (!countryCode.equals(lastCC)) {
682                     if (lastCC != null && countEqual.size() != 0) {
683                         output.append(
684                                 ENGLISH_ICU.regionDisplayName(lastCC)
685                                         + "\t\t\tEquals:\t"
686                                         + countEqual.size()
687                                         + "\t"
688                                         + countEqual
689                                         + "\n");
690                     }
691                     countEqual.clear();
692 
693                     lastCC = countryCode;
694                 }
695                 for (String value : entry.getValue()) {
696                     String cldrName = sdset.getBestName(value, false);
697                     String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
698                     final String iso = sdset.getIsoName(value);
699                     if (iso.equals(wiki)) {
700                         countEqual.add(iso);
701                         continue;
702                     }
703                     output.append(
704                             ENGLISH_ICU.regionDisplayName(countryCode)
705                                     //                        + "\t" +
706                                     // WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value)
707                                     + "\t"
708                                     + cldrName
709                                     + "\t"
710                                     + value
711                                     + "\t"
712                                     + iso
713                                     + "\t"
714                                     + wiki
715                                     + "\n");
716                 }
717             }
718             if (countEqual.size() != 0) {
719                 output.append(
720                         ENGLISH_ICU.regionDisplayName(lastCC)
721                                 + "\t\t\tEquals:\t"
722                                 + countEqual.size()
723                                 + "\t"
724                                 + countEqual
725                                 + "\n");
726             }
727         }
728 
printEnglishCompFull(Appendable output)729         public void printEnglishCompFull(Appendable output) throws IOException {
730             output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\n");
731             for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
732                 final String countryCode = entry.getKey();
733                 for (String value : entry.getValue()) {
734                     String cldrName = sdset.getBestName(value, false);
735                     // getBestName(value);
736                     String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
737                     final String iso = sdset.getIsoName(value);
738                     output.append(
739                             ENGLISH_ICU.regionDisplayName(countryCode)
740                                     //                        + "\t" +
741                                     // WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value)
742                                     + "\t"
743                                     + value
744                                     + "\t"
745                                     + cldrName
746                                     + "\t"
747                                     + iso
748                                     + "\t"
749                                     + wiki
750                                     + "\n");
751                 }
752             }
753         }
754 
printEnglish(PrintWriter output)755         public void printEnglish(PrintWriter output) throws IOException {
756             TreeSet<String> allRegions = new TreeSet<>();
757             allRegions.addAll(codeToData.keySet());
758             allRegions.addAll(formerRegionToSubdivisions.keySet()); // override
759 
760             Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
761             CLDRFile oldFileSubdivisions = cldrFactorySubdivisions.make("en", false);
762             CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();
763 
764             Set<String> skipped = new LinkedHashSet<>();
765 
766             for (String regionCode : allRegions) {
767                 if (!sdset.isKosher(regionCode)) {
768                     if (regionCode.length() != 3) {
769                         skipped.add(regionCode);
770                     }
771                     continue;
772                 }
773                 Set<String> remainder = formerRegionToSubdivisions.get(regionCode);
774                 remainder =
775                         remainder == null ? Collections.emptySet() : new LinkedHashSet<>(remainder);
776 
777                 SubdivisionNode regionNode = sdset.ID_TO_NODE.get(regionCode);
778                 if (regionNode == null) {
779                     continue;
780                 }
781 
782                 Set<SubdivisionNode> ordered = new LinkedHashSet<>();
783                 SubdivisionSet.addChildren(ordered, regionNode.children);
784 
785                 for (SubdivisionNode node : ordered) {
786                     final String sdCode = node.code;
787                     String name = sdset.getBestName(sdCode, true);
788                     String upper = UCharacter.toUpperCase(name);
789                     String title =
790                             SubdivisionNode.TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(
791                                     Locale.ROOT, null, name);
792                     if (name.equals(upper) || !name.equals(title)) {
793                         System.out.println("Suspicious name: " + name);
794                     }
795                     SubdivisionSet.appendName(fileSubdivisions, sdCode, name, null);
796                     remainder.remove(sdCode);
797                 }
798                 for (String sdCode : remainder) {
799                     String name = sdset.getBestName(sdCode, true);
800                     if (!name.equals("???")) {
801                         SubdivisionSet.appendName(
802                                 fileSubdivisions, sdCode, name, "\t<!-- deprecated -->");
803                     }
804                 }
805             }
806             System.out.println("Skipping: " + skipped);
807             fileSubdivisions.write(output);
808         }
809 
printMissingMIDs(PrintWriter pw)810         public void printMissingMIDs(PrintWriter pw) {
811             //          for (Entry<String, String> entry :
812             // WikiSubdivisionLanguages.WIKIDATA_TO_MID.entrySet()) {
813             //              String mid = entry.getValue();
814             //              if (!mid.isEmpty()) {
815             //                  continue;
816             //              }
817             //              String subCode = entry.getKey();
818             //              String wiki = clean(getWikiName(subCode));
819             //              String iso = clean(getIsoName(subCode));
820             //              String countryCode = subCode.substring(0, 2);
821             //              String cat = SUB_TO_CAT.get(subCode);
822             //              String catName = getIsoName(cat);
823             //              pw.append(
824             //                  ENGLISH_ICU.regionDisplayName(countryCode)
825             //                  + "\t" + mid
826             //                  + "\t" + subCode
827             //                  + "\t" + catName
828             //                  + "\t" + wiki
829             //                  + "\t" + iso
830             //                  + "\n"
831             //                  );
832             //          }
833         }
834     }
835 }
836