xref: /aosp_15_r20/external/cldr/tools/cldr-rdf/src/main/java/org/unicode/cldr/tool/WikiSubdivisionLanguages.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.google.common.collect.LinkedHashMultimap;
4 import com.google.common.collect.Multimap;
5 import com.google.common.collect.TreeMultimap;
6 import com.ibm.icu.impl.Row.R2;
7 import com.ibm.icu.impl.Row.R3;
8 import com.ibm.icu.impl.Row.R4;
9 import com.ibm.icu.impl.Utility;
10 import com.ibm.icu.lang.UProperty;
11 import com.ibm.icu.lang.UScript;
12 import com.ibm.icu.text.Normalizer2;
13 import com.ibm.icu.text.UTF16;
14 import com.ibm.icu.text.UnicodeSet;
15 import com.ibm.icu.util.ICUUncheckedIOException;
16 import com.ibm.icu.util.ULocale;
17 import java.io.IOException;
18 import java.io.PrintWriter;
19 import java.util.Arrays;
20 import java.util.Collection;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.Iterator;
24 import java.util.List;
25 import java.util.Locale;
26 import java.util.Map;
27 import java.util.Map.Entry;
28 import java.util.Set;
29 import java.util.TreeMap;
30 import java.util.TreeSet;
31 import org.apache.jena.query.QuerySolution;
32 import org.apache.jena.query.ResultSet;
33 import org.unicode.cldr.draft.FileUtilities;
34 import org.unicode.cldr.rdf.QueryClient;
35 import org.unicode.cldr.rdf.TsvWriter;
36 import org.unicode.cldr.test.DisplayAndInputProcessor;
37 import org.unicode.cldr.util.CLDRConfig;
38 import org.unicode.cldr.util.CLDRFile;
39 import org.unicode.cldr.util.CLDRFile.NumberingSystem;
40 import org.unicode.cldr.util.CLDRFile.WinningChoice;
41 import org.unicode.cldr.util.CLDRPaths;
42 import org.unicode.cldr.util.ChainedMap;
43 import org.unicode.cldr.util.ChainedMap.M4;
44 import org.unicode.cldr.util.CldrUtility;
45 import org.unicode.cldr.util.Counter;
46 import org.unicode.cldr.util.Factory;
47 import org.unicode.cldr.util.SimpleXMLSource;
48 import org.unicode.cldr.util.StandardCodes.LstrType;
49 import org.unicode.cldr.util.SupplementalDataInfo;
50 import org.unicode.cldr.util.Validity;
51 import org.unicode.cldr.util.Validity.Status;
52 import org.unicode.cldr.util.XPathParts;
53 
54 public final class WikiSubdivisionLanguages {
55     private static final String WIKI_SUBDIVISION_LANGUAGES_TSV = "wikiSubdivisionLanguages.tsv";
56     static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
57     static final Set<String> regularSubdivisions =
58             Validity.getInstance().getStatusToCodes(LstrType.subdivision).get(Status.regular);
59 
60     static final Map<String, R2<List<String>, String>> SUBDIVISION_ALIASES =
61             SDI.getLocaleAliasInfo().get("subdivision");
62 
63     private static final boolean DEBUG_CONSOLE = false;
64     private static final String DEBUG_LANG_FILTER = null; // "az";
65 
66     private static final String BEFORE_TYPE =
67             "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"";
68 
69     private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
70     private static final Normalizer2 NFC = Normalizer2.getNFCInstance();
71 
72     private static ChainedMap.M3<String, String, String> SUB_LANG_NAME =
73             ChainedMap.of(
74                     new TreeMap<String, Object>(), new TreeMap<String, Object>(), String.class);
75     private static ChainedMap.M3<String, String, String> LANG_SUB_NAME =
76             ChainedMap.of(
77                     new TreeMap<String, Object>(), new TreeMap<String, Object>(), String.class);
78     private static Set<String> bogus = new TreeSet<>();
79     private static Multimap<Status, String> bogusStatus = TreeMultimap.create();
80 
getSubdivisionName(String subdivisionId, String languageId)81     public static String getSubdivisionName(String subdivisionId, String languageId) {
82         return WikiSubdivisionLanguages.LANG_SUB_NAME.get(languageId, subdivisionId);
83     }
84 
getBestWikiEnglishName(String subdivisionId)85     public static String getBestWikiEnglishName(String subdivisionId) {
86         String languageId = "en";
87         String name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, languageId);
88         if (name != null) {
89             return name;
90         }
91         name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "es");
92         if (name != null) {
93             return name;
94         }
95         name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "fr");
96         if (name != null) {
97             return name;
98         }
99         Map<String, String> data = WikiSubdivisionLanguages.SUB_LANG_NAME.get(subdivisionId);
100         // try Spanish, then French, then first other
101         if (data != null) {
102             return data.entrySet().iterator().next().getValue(); // get first
103         }
104         return null;
105     }
106 
107     private static final String QUERY_NAME = "wikidata-wikisubdivisionLanguages";
108 
109     // static Map<String, String> WIKIDATA_TO_MID = new TreeMap<>();
init()110     static void init() throws IOException {
111 
112         QueryClient queryClient = QueryClient.getInstance();
113 
114         System.out.println("QUERY: " + QUERY_NAME);
115         ResultSet rs =
116                 queryClient.execSelectFromSparql(QUERY_NAME, QueryClient.WIKIDATA_SPARQL_SERVER);
117 
118         Map<String, Status> codeToStatus =
119                 Validity.getInstance().getCodeToStatus(LstrType.subdivision);
120         try (PrintWriter tsv =
121                 FileUtilities.openUTF8Writer(
122                         TsvWriter.getTsvDir(), WIKI_SUBDIVISION_LANGUAGES_TSV)) {
123             TsvWriter.writeRow(tsv, "item", "label", "code", "codeLabel");
124             for (; rs.hasNext(); ) {
125                 final QuerySolution qs = rs.next();
126 
127                 String item = QueryClient.getResourceOrNull(qs, "item");
128                 String label = NFC.normalize(QueryClient.getStringOrNull(qs, "label"));
129                 String code = QueryClient.getStringOrNull(qs, "code");
130                 String codeLabel = QueryClient.getStringOrNull(qs, "codeLabel");
131 
132                 TsvWriter.writeRow(tsv, item, label, code, codeLabel);
133 
134                 String subdivision = SubdivisionNode.convertToCldr(code);
135                 if (!regularSubdivisions.contains(subdivision)) {
136                     Status status = codeToStatus.get(subdivision);
137                     if (status == null) {
138                         bogus.add(subdivision);
139                     } else {
140                         bogusStatus.put(status, subdivision);
141                     }
142                     continue;
143                 }
144                 if (DEBUG_LANG_FILTER != null && !DEBUG_LANG_FILTER.equals(codeLabel)) {
145                     continue;
146                 }
147                 SUB_LANG_NAME.put(subdivision, codeLabel, label);
148                 //                WIKIDATA_TO_MID.put(subdivision, data.get(2));
149                 LANG_SUB_NAME.put(codeLabel, subdivision, label);
150             }
151             System.out.println("Queried " + QUERY_NAME + " at row count " + rs.getRowNumber());
152         }
153         System.out.println("Wrote to " + WIKI_SUBDIVISION_LANGUAGES_TSV);
154         // postprocess
155         String oldLang = null;
156         DisplayAndInputProcessor daip = null;
157         Exception[] internalException = {null};
158 
159         for (R3<String, String, String> row : LANG_SUB_NAME.rows()) {
160             String lang = row.get0();
161             String subdivision = row.get1();
162             String name = row.get2();
163             if (!lang.equals(oldLang)) {
164                 oldLang = lang;
165                 daip = new DisplayAndInputProcessor(new ULocale(lang));
166             }
167             String path = getSubdivisionPath(subdivision);
168             String name2 = daip.processInput(path, name.replace("\u00AD", ""), internalException);
169             if (name2.contains("'")) {
170                 int debug = 0;
171             }
172             // TODO remove soft hyphen in DAIP
173             if (internalException[0] != null) {
174                 throw new IllegalArgumentException(
175                         lang + "\t" + subdivision + "\t" + name, internalException[0]);
176             } else if (!name.equals(name2)) {
177                 // System.out.println(lang + "\t" + subdivision + "\t" + name + "\t" + name2);
178                 SUB_LANG_NAME.put(subdivision, lang, name2);
179                 LANG_SUB_NAME.put(lang, subdivision, name2);
180             }
181         }
182     }
183 
getSubdivisionPath(String subdivision)184     private static String getSubdivisionPath(String subdivision) {
185         return BEFORE_TYPE + subdivision + "\"][@draft=\"contributed\"]";
186     }
187 
getSubdivisionFromPath(String path)188     private static String getSubdivisionFromPath(String path) {
189         return path.substring(BEFORE_TYPE.length(), path.indexOf('"', BEFORE_TYPE.length()));
190     }
191 
main(String[] args)192     public static void main(String[] args) throws IOException {
193         init();
194 
195         Counter<String> counter = new Counter<>();
196         Factory cldrFactory = CLDR_CONFIG.getCldrFactory();
197         Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
198         CLDRFile file = null;
199         UnicodeSet exemplars = null;
200 
201         ChainedMap.M4<Integer, String, String, String> exemplarFailureLangSubdivisionName =
202                 ChainedMap.of(
203                         new TreeMap<Integer, Object>(),
204                         new TreeMap<String, Object>(),
205                         new TreeMap<String, Object>(),
206                         String.class);
207 
208         for (Entry<String, Map<String, String>> entry : LANG_SUB_NAME) {
209             String lang = entry.getKey();
210             file = cldrFactory.make(lang, true);
211 
212             CLDRFile oldFileSubdivisions;
213             try {
214                 oldFileSubdivisions = cldrFactorySubdivisions.make(lang, false);
215             } catch (Exception e) {
216                 oldFileSubdivisions = new CLDRFile(new SimpleXMLSource(lang)).freeze();
217             }
218 
219             Multimap<String, String> inverse = LinkedHashMultimap.create();
220             CLDRFile fileSubdivisions = fixedFile(oldFileSubdivisions, inverse);
221 
222             UnicodeSet main = file.getExemplarSet("", WinningChoice.WINNING, 0);
223             UnicodeSet auxiliary = file.getExemplarSet("auxiliary", WinningChoice.WINNING);
224             UnicodeSet punctuation = file.getExemplarSet("punctuation", WinningChoice.WINNING);
225             UnicodeSet numbers = file.getExemplarsNumeric(NumberingSystem.defaultSystem);
226             exemplars =
227                     new UnicodeSet()
228                             .addAll(main)
229                             .addAll(auxiliary)
230                             .addAll(scriptsFor(main)) // broad test,...
231                             .addAll(punctuation)
232                             .addAll(numbers)
233                             .addAll(new UnicodeSet("[\\ ]"))
234                             .freeze();
235 
236             for (Entry<String, String> entry2 : entry.getValue().entrySet()) {
237                 String subdivision = entry2.getKey();
238                 String name = entry2.getValue();
239                 if (name.equals("Böyük Britaniya")) {
240                     int debug = 0;
241                 }
242                 String path = getSubdivisionPath(subdivision);
243                 String oldName = fileSubdivisions.getStringValue(path);
244                 if (oldName != null) {
245                     if (!oldName.equals(name)) {
246                         // System.out.println("Already has translation\t" + lang + "\t" +
247                         // subdivision + "\t" + name + "\t" + oldName);
248                     }
249                     continue;
250                 }
251                 if (!exemplars.containsAll(name)) {
252                     UnicodeSet exemplarFailures =
253                             new UnicodeSet().addAll(name).removeAll(exemplars);
254                     addExemplarFailures(
255                             exemplarFailureLangSubdivisionName,
256                             exemplarFailures,
257                             lang,
258                             subdivision,
259                             name);
260                     continue;
261                 }
262                 fileSubdivisions.add(path, name);
263                 inverse.put(name, path);
264                 counter.add(lang, 1);
265             }
266 
267             // We now fix collisions
268             for (Entry<String, Collection<String>> entry3 : inverse.asMap().entrySet()) {
269                 String name = entry3.getKey();
270                 if (name.isEmpty()) {
271                     continue;
272                 }
273                 if (name.equals("Böyük Britaniya")) {
274                     int debug = 0;
275                 }
276                 Collection<String> paths = entry3.getValue();
277                 if (paths.size() <= 1) {
278                     continue;
279                 }
280                 if (paths.size() > 3) {
281                     int debug = 0;
282                 }
283                 // we only care about collisions *within* a region.
284                 // so group them together
285                 Multimap<String, String> regionToPaths = LinkedHashMultimap.create();
286                 for (String path : paths) {
287                     String sdId = getSubdivisionFromPath(path);
288                     String region = sdId.substring(0, 2).toUpperCase(Locale.ROOT);
289                     regionToPaths.put(region, path);
290                 }
291 
292                 // Now fix as necessary
293                 for (Entry<String, Collection<String>> regionAndPaths :
294                         regionToPaths.asMap().entrySet()) {
295                     Collection<String> paths2 = regionAndPaths.getValue();
296                     int markerIndex = 0;
297                     if (paths2.size() <= 1) {
298                         continue;
299                     }
300 
301                     // find if any of the paths are deprecated
302                     for (Iterator<String> it = paths2.iterator(); it.hasNext(); ) {
303                         String path = it.next();
304                         String sdId = getSubdivisionFromPath(path);
305                         if (!regularSubdivisions.contains(sdId)) { // deprecated
306                             fileSubdivisions.remove(path);
307                             it.remove();
308                             fail(
309                                     "Duplicate, not regular ",
310                                     lang,
311                                     getSubdivisionFromPath(path),
312                                     "REMOVING",
313                                     -1);
314                         }
315                     }
316                     if (paths2.size() <= 1) {
317                         continue;
318                     }
319 
320                     String otherId = null;
321                     for (String path : paths2) {
322                         //                    if (nuke) {
323                         //                        if (oldFileSubdivisions.getStringValue(path) ==
324                         // null) {
325                         //                            fileSubdivisions.remove(path); // get rid of
326                         // new ones
327                         //                            System.out.println("Removing colliding " +
328                         // lang + "\t" + path + "\t" + name);
329                         //                        }
330                         if (markerIndex == 0) {
331                             otherId = getSubdivisionFromPath(path);
332                         } else {
333                             String fixedName = name + MARKERS.get(markerIndex);
334                             fail(
335                                     "Superscripting ",
336                                     lang + "\t(" + otherId + ")",
337                                     getSubdivisionFromPath(path),
338                                     fixedName,
339                                     -1);
340                             // System.out.println("Superscripting colliding:\t" + lang + "\t" + path
341                             // + "\t" + fixedName);
342                             fileSubdivisions.add(path, fixedName); // overwrite with superscripted
343                         }
344                         ++markerIndex;
345                     }
346                 }
347             }
348 
349             if (DEBUG_CONSOLE) {
350                 PrintWriter pw = new PrintWriter(System.out);
351                 fileSubdivisions.write(new PrintWriter(System.out));
352                 pw.flush();
353             } else {
354                 try (PrintWriter out =
355                         FileUtilities.openUTF8Writer(
356                                 CLDRPaths.SUBDIVISIONS_DIRECTORY, lang + ".xml")) {
357                     fileSubdivisions.write(out);
358                 } catch (Exception e) {
359                     throw new ICUUncheckedIOException(e);
360                 }
361             }
362         }
363         fail("ExemplarFailures", exemplarFailureLangSubdivisionName);
364 
365         for (String lang : counter.getKeysetSortedByKey()) {
366             fail("Superscripting", lang, String.valueOf(counter.get(lang)), null, -1);
367         }
368         System.out.println("Bogus subdivisionIds:\t" + "*" + "\t" + bogus.size() + "\t" + bogus);
369         for (Entry<Status, Collection<String>> entry : bogusStatus.asMap().entrySet()) {
370             System.out.println(
371                     "SubdivisionId:\t\t"
372                             + ":\t"
373                             + entry.getKey()
374                             + "\t"
375                             + entry.getValue().size()
376                             + "\t"
377                             + entry.getValue());
378         }
379     }
380 
fixedFile( CLDRFile oldFileSubdivisions, Multimap<String, String> inverse)381     private static CLDRFile fixedFile(
382             CLDRFile oldFileSubdivisions, Multimap<String, String> inverse) {
383         CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();
384 
385         // for fixing collisions
386         // we first add existing items
387         Set<String> toRemove = new HashSet<>();
388         Map<String, String> toAdd = new HashMap<>();
389 
390         for (String path : fileSubdivisions) {
391             XPathParts parts = XPathParts.getFrozenInstance(path);
392             if (!"subdivision".equals(parts.getElement(-1))) {
393                 continue;
394             }
395             String name = fileSubdivisions.getStringValue(path);
396             if (name.equals("Böyük Britaniya")) {
397                 int debug = 0;
398             }
399             // handle aliases also
400             String type = parts.getAttributeValue(-1, "type");
401             R2<List<String>, String> replacement = SUBDIVISION_ALIASES.get(type);
402             if (replacement != null) {
403                 String fullPath = oldFileSubdivisions.getFullXPath(path);
404                 XPathParts parts2 = XPathParts.getFrozenInstance(fullPath).cloneAsThawed();
405                 for (String replacementType : replacement.get0()) {
406                     parts2.setAttribute(-1, "type", replacementType);
407                     toRemove.add(path);
408                     path = parts2.toString();
409                     toAdd.put(path, name);
410                     System.out.println("Adding alias: " + replacementType + "«" + name + "»");
411                     break;
412                 }
413             }
414             inverse.put(name, path);
415         }
416         fileSubdivisions.removeAll(toRemove, false);
417         for (Entry<String, String> entry2 : toAdd.entrySet()) {
418             fileSubdivisions.add(entry2.getKey(), entry2.getValue());
419         }
420         return fileSubdivisions;
421     }
422 
addExemplarFailures( M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures, String language, String subdivision, String name)423     private static void addExemplarFailures(
424             M4<Integer, String, String, String> exemplarFailureLangSubdivisionName,
425             UnicodeSet exemplarFailures,
426             String language,
427             String subdivision,
428             String name) {
429         for (String s : exemplarFailures) {
430             exemplarFailureLangSubdivisionName.put(s.codePointAt(0), language, subdivision, name);
431         }
432     }
433 
fail( String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName)434     private static void fail(
435             String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName) {
436         for (R4<Integer, String, String, String> entry :
437                 exemplarFailureLangSubdivisionName.rows()) {
438             fail(title, entry.get1(), entry.get2(), entry.get3(), entry.get0());
439         }
440     }
441 
fail( String title, String lang, String subdivision, String name, int exemplarFailure)442     private static void fail(
443             String title, String lang, String subdivision, String name, int exemplarFailure) {
444         System.out.println(
445                 title
446                         + ":\t"
447                         + lang
448                         + "\t"
449                         + subdivision
450                         + "\t"
451                         + (exemplarFailure < 0 ? "" : "«" + UTF16.valueOf(exemplarFailure) + "»")
452                         + "\t"
453                         + (exemplarFailure < 0 ? "" : "U+" + Utility.hex(exemplarFailure))
454                         + "\t"
455                         + CldrUtility.ifNull(getBestWikiEnglishName(subdivision), "")
456                         + "\t"
457                         + CldrUtility.ifNull(name, "").replace("\"", "&quot;"));
458     }
459 
460     static final List<String> MARKERS =
461             Arrays.asList(
462                     "¹", "²", "³"); // if there are more than 3 of the same kind, throw exception
463 
scriptsFor(UnicodeSet main)464     private static UnicodeSet scriptsFor(UnicodeSet main) {
465         UnicodeSet result = UnicodeSet.EMPTY;
466         for (String s : main) {
467             int scriptCode = UScript.getScript(s.codePointAt(0));
468             if (scriptCode != UScript.COMMON || scriptCode != UScript.INHERITED) {
469                 result = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, scriptCode);
470                 if (scriptCode == UScript.LATIN) {
471                     result.addAll("ʻ’&");
472                 }
473                 break;
474             }
475         }
476         return result;
477     }
478 }
479