xref: /aosp_15_r20/external/cldr/tools/cldr-rdf/src/main/java/org/unicode/cldr/tool/GenerateLanguageContainment.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.base.Splitter;
5 import com.google.common.collect.ImmutableMap;
6 import com.google.common.collect.ImmutableMultimap;
7 import com.google.common.collect.ImmutableSet;
8 import com.google.common.collect.ImmutableSet.Builder;
9 import com.google.common.collect.LinkedHashMultimap;
10 import com.google.common.collect.Multimap;
11 import com.google.common.collect.Multimaps;
12 import com.google.common.collect.SortedSetMultimap;
13 import com.google.common.collect.TreeMultimap;
14 import com.ibm.icu.impl.Row.R2;
15 import com.ibm.icu.util.ICUUncheckedIOException;
16 import java.io.IOException;
17 import java.io.PrintWriter;
18 import java.io.Writer;
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collection;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.LinkedHashSet;
25 import java.util.List;
26 import java.util.Map;
27 import java.util.Map.Entry;
28 import java.util.Set;
29 import java.util.TreeMap;
30 import java.util.TreeSet;
31 import java.util.function.Consumer;
32 import java.util.function.Function;
33 import org.apache.jena.query.QuerySolution;
34 import org.apache.jena.query.ResultSet;
35 import org.unicode.cldr.draft.FileUtilities;
36 import org.unicode.cldr.rdf.QueryClient;
37 import org.unicode.cldr.rdf.TsvWriter;
38 import org.unicode.cldr.util.*;
39 import org.unicode.cldr.util.Iso639Data.Type;
40 import org.unicode.cldr.util.StandardCodes.LstrField;
41 import org.unicode.cldr.util.StandardCodes.LstrType;
42 import org.unicode.cldr.util.Validity.Status;
43 
44 /**
45  * This code generates language group containment based on Wikidata. For example, it finds: root >
46  * Indo-European [Other] (ine) > Germanic [Other] (gem) > West Germanic languages (gmw) > English
47  * (en)
48  *
49  * <p>To do this, it reads three tables from Wikidata, and combines them. The combination is not
50  * trivial, because wikidata offers multiple "parents" for the same language, and many of the
51  * parents do not have ISO codes. For the first problem, the software computes the possible parent
52  * chains and picks among them. For the second problem, any parents without ISO codes are skipped
53  * (after forming the chains, so the ultimate ancestors are still found). <br>
54  * A number of debugging files are written to the external directory.
55  *
56  * <p>Some failures will be exposed by running this tool. Examples: <br>
57  * <b>wikidata-entityToCode Multiple values:</b> Cebaara [Q1097512] [sef, sev]. <br>
58  * If these are not CLDR languages then they do not need to be fixed. <br>
59  * <b>wikidata-childToParent Multiple values:</b> Q118712 [Q118712] [German [de, Q18], English [en,
60  * Q186]] <br>
61  * Normally these don't need to be fixed; the generation code works around them. <br>
62  * <b>Cycle in [dng, zhx]</b> from [[http://www.wikidata.org/entity/Q33050, <br>
63  * These indicate that the Wikidata has a cycle in it. A => B => C => A. Ignore these unless the
64  * cases are worth investigating.
65  *
66  * <p>Others are exposed by running TestLanguageGroup.java <br>
67  * Error: (TestLanguageGroup.java:55) Single ancestor but not in ISOLATES: ce [Chechen] [ce] <br>
68  * Check to see if the language has a language group (in this case not, so add to
69  * TestLanguageGroup.ISOLATEs). <br>
70  * For kea [Kabuverdianu] [kea], you can add cpp as the parent, as follows. <br>
71  * <b>Missing.</b> If a child-parent relation is missing, you can add it to EXTRA_PARENT_CHILDREN so
72  * that it shows up. For example, .put("gmw", "lb") says that West Germanic is the parent of
73  * Luxembourgish. <br>
74  * <b>Extra.</b> Sometimes wikidata has conflicting or erroneous entries. Those can be fixed by
75  * adding to REMOVE_PARENT_CHILDREN. Use * to remove all children, such as .put("crp", "*") <br>
76  * Sometimes the tool fails with JsonParseExceptions, but works if you rerun. <br>
77  * Cycle in [dng, zhx] from ... Will be fixed by giving the language 'no parent' (mul)
78  *
79  * <p>
80  */
81 public class GenerateLanguageContainment {
    // Runs once when this class is initialized: points the user at the class-level Javadoc,
    // which describes how to deal with the problems this tool reports.
    static {
        System.out.println(
                "See the class description for GenerateLanguageContainment.java about fixing problems.");
    }
86 
    /** When true, run() skips every language code whose Iso639Data type is not Living. */
    private static final boolean ONLY_LIVING = false;

    /** Shared CLDR configuration; source of the English file and the supplemental data below. */
    private static final CLDRConfig CONFIG = CLDRConfig.getInstance();

    /** Client used to execute the SPARQL queries against the Wikidata server. */
    private static final QueryClient queryClient = QueryClient.getInstance();

    /** Splits on tab characters and trims each resulting piece. */
    static final Splitter TAB = Splitter.on('\t').trimResults();

    /** English CLDR file, used to look up display names for language codes. */
    static final CLDRFile ENGLISH = CONFIG.getEnglish();

    static final String relDir = "../util/data/languages/";

    /**
     * Locale alias info for "language", keyed by language code. The first entry of the list in
     * each value is used as the replacement code when mapping Wikidata codes.
     */
    static final Map<String, R2<List<String>, String>> ALIAS_MAP =
            CONFIG.getSupplementalDataInfo().getLocaleAliasInfo().get("language");
96 
97     /** We load the SparQL queries using this helper object, to be able to catch exceptions… */
98     static final class QueryHelper {
99         public final Map<String, String> entityToLabel;
100         public final Map<String, String> entityToCode;
101         public final ImmutableMultimap<String, String> codeToEntity;
102         public final Multimap<String, String> childToParent;
103 
QueryHelper()104         QueryHelper() {
105             try {
106                 entityToLabel =
107                         loadQueryPairsUnique(
108                                 GenerateLanguageContainment.class,
109                                 "wikidata-entityToLabel",
110                                 null,
111                                 null,
112                                 null);
113 
114                 entityToCode =
115                         loadQueryPairsUnique(
116                                 GenerateLanguageContainment.class,
117                                 "wikidata-entityToCode",
118                                 code -> {
119                                     code = code.replace("\"", "");
120                                     R2<List<String>, String> v = ALIAS_MAP.get(code);
121                                     String result = v == null ? code : v.get0().get(0);
122                                     result = result.contains("_") ? code : result;
123                                     return result;
124                                 },
125                                 code -> showNameAndCode(code),
126                                 NAME);
127 
128                 codeToEntity =
129                         ImmutableMultimap.copyOf(
130                                 Multimaps.invertFrom(
131                                         Multimaps.forMap(entityToCode),
132                                         LinkedHashMultimap.create()));
133 
134                 childToParent =
135                         loadQueryPairs(
136                                 GenerateLanguageContainment.class,
137                                 "wikidata-childToParent",
138                                 code -> showNameAndCode(code),
139                                 code -> showNameAndCode(code));
140 
141             } catch (Throwable t) {
142                 t.printStackTrace();
143                 throw new RuntimeException(t);
144             }
145         }
146 
getEntityName(String key)147         String getEntityName(String key) {
148             String code = getEntityCode(key);
149             if (code != null) {
150                 try {
151                     String name = NAME.apply(code);
152                     if (name != null) {
153                         return name;
154                     }
155                 } catch (Exception e) {
156                     // TODO: Why would NAME.apply throw?
157                     // TODO: Need better handling here?
158                 }
159             }
160             String name = entityToLabel.get(key);
161             if (name != null) {
162                 return name;
163             }
164             return afterLastSlash(key);
165         }
166 
getEntityCode(String key)167         private String getEntityCode(String key) {
168             return entityToCode == null ? null : entityToCode.get(key);
169         }
170 
afterLastSlash(String key)171         private String afterLastSlash(String key) {
172             return key.substring(key.lastIndexOf('/') + 1, key.length() - 1);
173         }
174 
writeTsvs()175         public void writeTsvs() throws IOException {
176             TsvWriter.writeTsv("childToParent.tsv", childToParent, "child", "parent");
177             TsvWriter.writeTsv("entityToCode.tsv", entityToCode, "lang", "langCode");
178             TsvWriter.writeTsv("entityToLabel.tsv", entityToLabel, "lang", "langLabel");
179             SortedSetMultimap<String, String> childToParentWithCodes = TreeMultimap.create();
180             for (Entry<String, String> entry : childToParent.entries()) {
181                 String child = entry.getKey();
182                 String parent = entry.getValue();
183                 childToParentWithCodes.put(showNameAndCode(child), showNameAndCode(parent));
184             }
185             TsvWriter.writeTsv(
186                     "childToParentWithCodes.tsv",
187                     childToParentWithCodes,
188                     "childCode\tLabel",
189                     "parentCode\tLabel");
190         }
191 
showNameAndCode(String qid)192         public String showNameAndCode(String qid) {
193             return getEntityName(qid)
194                     + " ("
195                     + (getEntityCode(qid) == null ? "" : getEntityCode(qid) + ", ")
196                     + afterLastSlash(qid)
197                     + ")";
198         }
199 
showNameAndCode(T qids)200         public <T extends Iterable<String>> String showNameAndCode(T qids) {
201             StringBuilder b = new StringBuilder();
202             qids.forEach(
203                     qid -> {
204                         if (b.length() != 0) b.append(", ");
205                         b.append(showNameAndCode(qid));
206                     });
207             return b.toString();
208         }
209 
showNameAndCode2(U qids)210         public <T extends Iterable<String>, U extends Iterable<T>> String showNameAndCode2(U qids) {
211             StringBuilder b = new StringBuilder();
212             qids.forEach(
213                     qid -> {
214                         if (b.length() != 0) b.append("; ");
215                         b.append(showNameAndCode(qid));
216                     });
217             return b.toString();
218         }
219     }
220 
221     static final QueryHelper QUERY_HELPER = new QueryHelper();
222 
223     static final Function<String, String> NAME =
224             code ->
225                     code.equals(LocaleNames.MUL)
226                             ? LocaleNames.ROOT
227                             : ENGLISH.getName(code) + " (" + code + ")";
228 
229     static final Set<String> COLLECTIONS;
230 
231     static {
232         Map<String, Map<LstrField, String>> languages =
233                 StandardCodes.getEnumLstreg().get(LstrType.language);
234         Builder<String> _collections = ImmutableSet.<String>builder();
235         for (Entry<String, Map<LstrField, String>> e : languages.entrySet()) {
236             String scope = e.getValue().get(LstrField.Scope);
237             if (scope != null && "Collection".equalsIgnoreCase(scope)) {
e.getKey()238                 _collections.add(e.getKey());
239             }
240         }
241         COLLECTIONS = _collections.build();
242     }
243 
244     static class Tree {
245         Set<String> leaves = new LinkedHashSet<>();
246 
add(List<String> chain)247         void add(List<String> chain) {
248             Collections.reverse(chain);
249         }
250     }
251 
252     /** To add parent-child relations to Wikidata */
253     static final Multimap<String, String> EXTRA_PARENT_CHILDREN =
254             ImmutableMultimap.<String, String>builder()
255                     .put("alv", "agq")
256                     .put("alv", "cch") // Atlantic–Congo <= cch [Atsam]
257                     .put("alv", "kcg") // Atlantic–Congo <= kcg [Tyap]
258                     .put("alv", "ken") // Atlantic–Congo <= ken [Kenyang]
259                     .put("alv", "ngb")
260                     .put("alv", "yav")
261                     .put("ber", "zgh")
262                     .put("bnt", "asa")
263                     .put("bnt", "bez")
264                     .put("bnt", "cgg")
265                     .put("bnt", "ebu")
266                     .put("bnt", "jmc")
267                     .put("bnt", "ksb")
268                     .put("bnt", "lag")
269                     .put("bnt", "mer")
270                     .put("bnt", "mgh")
271                     .put("bnt", "nmg")
272                     .put("bnt", "rof")
273                     .put("bnt", "rwk")
274                     .put("bnt", "sbp")
275                     .put("bnt", "seh")
276                     .put("bnt", "vun")
277                     .put("bnt", "xog")
278                     .put("cpp", "kea")
279                     .put("euq", "eu")
280                     // gmw = West Germanic
281                     .put("gmw", "ksh")
282                     .put("gmw", "lb")
283                     .put("gmw", "wae")
284                     .put("grk", "el")
285                     .put("grk", "gmy")
286                     .put("grk", "grc")
287                     .put("ira", "lrc")
288                     .put("ira", "bgn") // Iranian <= Western Balochi
289                     .put("inc", "trw") // Indo-Aryan <= Torwali
290                     .put("jpx", "ja")
291                     .put(LocaleNames.MUL, "art")
292                     .put(LocaleNames.MUL, "euq")
293                     .put(LocaleNames.MUL, "jpx")
294                     .put(LocaleNames.MUL, "tai")
295                     .put("ngb", "sg")
296                     .put("roa", "cpf")
297                     .put("roa", "cpp")
298                     .put("roa", "cpp")
299                     .put("sdv", "saq")
300                     .put("son", "khq")
301                     .put("sw", "swc")
302                     .put("tai", "blt") // tai [Tai] <= blt [Tai Dam]
303                     .put("tai", "lo")
304                     .put("tai", "th")
305                     .put("zlw", "szl") // West Slavic <= Silesian
306                     .build();
307 
    /**
     * Parent => child relations to remove from what was derived from Wikidata, e.g. when a child
     * has two parents and that causes problems. The key is the parent code and the value is the
     * child code; a value of "*" removes every child of that parent. run() applies these
     * removals before it adds EXTRA_PARENT_CHILDREN.
     */
    static final Multimap<String, String> REMOVE_PARENT_CHILDREN =
            ImmutableMultimap.<String, String>builder()
                    .put("alv", "ukg") // ngf [Trans-New Guinea languages] <= ukg [Ukuriguma]
                    .put(
                            "crp",
                            "*") // general Creole group interferes with French/Spanish/... language
                    // grouping
                    .put("cus", "mhd") // bnt [Bantu] <= mhd [Mbugu] (not cus [Cushitic])
                    .put("gmw", "pih") // cpe [Creoles and pidgins, English based] <= pih
                    // [Pitcairn-Norfolk]
                    .put("inc", "rmg")
                    // Indo-European
                    .put("ine", "el")
                    .put("ine", "gmy")
                    .put("ine", "grc")
                    .put("ine", "trw") // inc [Indic] <= trw [Torwali]
                    .put(LocaleNames.MUL, "crp")
                    .put(LocaleNames.MUL, "cpp") // Creoles and pidgins, Portuguese-based
                    .put(LocaleNames.MUL, LocaleNames.UND) // anomaly
                    .put("nic", "kcp") // ssa [Nilo-Saharan] <= kcp [Kanga]
                    .put("nic", "kec") // ssa [Nilo-Saharan] <= kec [Keiga]
                    .put("nic", "kgo") // ssa [Nilo-Saharan] <= kgo [Krongo]
                    .put("nic", "rof") // ssa [Nilo-Saharan] <= rof [Rombo]
                    .put("nic", "tbr") // ssa [Nilo-Saharan] <= tbr [Tumtum]
                    .put("nic", "tey") // ssa [Nilo-Saharan] <= tey [Tulishi]
                    .put("sit", "th") // sit <= tbq <= th
                    .put("sit", "dz") // sit <= tbq <= dz
                    .put("sit", "zh")
                    .put("sla", "cu")
                    .put("tbq", "psq") // psq [Pasi] belongs under paa [Papuan], not tbq
                    // [Tibeto-Burman languages]. (There is also a variety of the Sino-Tibetan
                    // Adi language called Pasi.)
                    .build();
345 
main(String[] args)346     public static void main(String[] args) throws IOException {
347         new GenerateLanguageContainment().run(args);
348         if (Containment.hadErrors) {
349             System.err.println("ERROR: Containment Errors detected, see errors above.");
350             System.exit(1);
351         }
352     }
353 
run(String[] args)354     void run(String[] args) throws IOException {
355         if (true) {
356             // check on items
357             for (String check : Arrays.asList("sw", "km", "ksh", "wae", "kea", "mfe", "th", "lo")) {
358                 System.out.println("Checking " + ENGLISH.getName(check) + "[" + check + "]");
359                 Collection<String> entities = QUERY_HELPER.codeToEntity.get(check);
360                 if (entities.isEmpty()) {
361                     System.out.println("no code for " + check + ": " + entities);
362                     continue;
363                 }
364                 for (String entity : entities) {
365                     Set<List<String>> ancestors = getAllAncestors(entity);
366                     showEntityLists(entity + " parents ", ancestors);
367                     System.out.println();
368                 }
369             }
370         }
371 
372         Map<Status, Set<String>> table = Validity.getInstance().getStatusToCodes(LstrType.language);
373         TreeMultimap<String, String> _parentToChild = TreeMultimap.create();
374         TreeSet<String> missing = new TreeSet<>(table.get(Status.regular));
375         _parentToChild.put(LocaleNames.MUL, LocaleNames.UND);
376         Set<String> skipping = new LinkedHashSet<>();
377         for (String code : table.get(Status.regular)) {
378             if (ONLY_LIVING) {
379                 Type type = Iso639Data.getType(code);
380                 if (type != Type.Living) {
381                     continue;
382                 }
383             }
384             if (code.compareTo("hdz") > 0) {
385                 int debug = 0;
386             }
387             //            if (COLLECTIONS.contains(code)) {
388             //                continue;
389             //            }
390             Collection<String> entities = QUERY_HELPER.codeToEntity.get(code);
391             if (entities.isEmpty()) {
392                 continue;
393             }
394             for (String entity : entities) {
395                 if (QUERY_HELPER.childToParent.get(entity).isEmpty()) {
396                     continue;
397                 }
398                 Set<Set<String>> chains = getAncestors(entity, skipping);
399                 if (chains.size() > 1) {
400                     int debug = 0;
401                 }
402                 for (Set<String> chain : chains) {
403                     String last = null;
404                     for (String link : chain) {
405                         if (last != null) {
406                             _parentToChild.put(link, last);
407                         }
408                         last = link;
409                     }
410                 }
411             }
412         }
413         System.out.println("Writing " + "skippingCodes.tsv");
414         try (PrintWriter w =
415                 FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "skippingCodes.tsv")) {
416             // TsvWriter.writeRow(w, "childCode\tLabel", "parentCode\tLabel"); // header
417             skipping.forEach(e -> w.println(e));
418         }
419 
420         for (Entry<String, Collection<String>> entity : REMOVE_PARENT_CHILDREN.asMap().entrySet()) {
421             String key = entity.getKey();
422             for (String value : entity.getValue()) {
423                 if (value.equals("*")) {
424                     _parentToChild.removeAll(key);
425                 } else {
426                     _parentToChild.remove(key, value);
427                 }
428             }
429         }
430 
431         _parentToChild.putAll(EXTRA_PARENT_CHILDREN);
432 
433         // special code for artificial
434         for (String code : Iso639Data.getAvailable()) {
435             Type type = Iso639Data.getType(code);
436             if (type == Type.Constructed) {
437                 _parentToChild.put("art", code);
438             }
439         }
440 
441         Multimap<String, String> parentToChild = ImmutableMultimap.copyOf(_parentToChild);
442         Multimap<String, String> childToParent =
443                 ImmutableMultimap.copyOf(
444                         Multimaps.invertFrom(parentToChild, TreeMultimap.create()));
445         System.out.println(
446                 "Checking " + "he" + "\t" + Containment.getAllDirected(childToParent, "he"));
447 
448         try (PrintWriter w =
449                 FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "RawLanguageContainment.txt")) {
450             print(w, parentToChild, new ArrayList<>(Arrays.asList(LocaleNames.MUL)));
451         }
452         SimpleXMLSource xmlSource = new SimpleXMLSource("languageGroup");
453         xmlSource.setNonInheriting(true); // should be gotten from DtdType...
454         CLDRFile newFile = new CLDRFile(xmlSource);
455         newFile.setDtdType(DtdType.supplementalData);
456         newFile.add("//" + DtdType.supplementalData + "/version[@number='$Revision$']", "");
457         printXML(newFile, parentToChild);
458 
459         try (PrintWriter outFile =
460                 FileUtilities.openUTF8Writer(
461                         CLDRPaths.SUPPLEMENTAL_DIRECTORY, "languageGroup.xml")) {
462             newFile.write(outFile);
463         } catch (IOException e1) {
464             throw new ICUUncheckedIOException("Can't write to languageGroup.xml", e1);
465         }
466 
467         //        for (Entry<String,String> entry : childToParent.entries()) {
468         //            String childNames = getName(entityToCode, entityToLabel, entry.getKey());
469         //            String parentNames = getName(entityToCode, entityToLabel, entry.getValue());
470         //            System.out.println(entry.getKey() + "\t" + entry.getValue() + "\t" +
471         // childNames + "\t" + parentNames);
472         //        }
473         QUERY_HELPER.writeTsvs();
474     }
475 
showEntityLists(String title, Set<List<String>> ancestors)476     private static void showEntityLists(String title, Set<List<String>> ancestors) {
477         ancestors.forEach(
478                 new Consumer<List<String>>() {
479                     @Override
480                     public void accept(List<String> item) {
481                         item.forEach(
482                                 new Consumer<String>() {
483                                     @Override
484                                     public void accept(String t) {
485                                         System.out.println(
486                                                 t
487                                                         + "\t"
488                                                         + QUERY_HELPER.entityToCode.get(t)
489                                                         + "\t"
490                                                         + QUERY_HELPER.entityToLabel.get(t));
491                                     }
492                                 });
493                         System.out.println();
494                     }
495                 });
496     }
497 
    /**
     * Adds the languageGroup entries for the whole containment tree to {@code newFile}, starting
     * at the root ({@code LocaleNames.MUL}).
     */
    private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild) {
        printXML(newFile, parentToChild, LocaleNames.MUL);
    }
501 
printXML( CLDRFile newFile, Multimap<String, String> parentToChild, String base)502     private static void printXML(
503             CLDRFile newFile, Multimap<String, String> parentToChild, String base) {
504         Collection<String> children = parentToChild.get(base);
505         if (children.isEmpty()) {
506             return;
507         }
508         if (base.equals(LocaleNames.UND)) {
509             // skip, no good info
510         } else {
511             newFile.add(
512                     "//"
513                             + DtdType.supplementalData
514                             + "/languageGroups/languageGroup[@parent=\""
515                             + base
516                             + "\"]",
517                     Joiner.on(" ").join(children));
518         }
519         for (String child : children) {
520             printXML(newFile, parentToChild, child);
521         }
522     }
523 
print( Writer out, Multimap<String, String> parentToChild, List<String> line)524     private static void print(
525             Writer out, Multimap<String, String> parentToChild, List<String> line) {
526         String current = line.get(line.size() - 1);
527         Collection<String> children = parentToChild.get(current);
528         if (children.isEmpty()) {
529             try {
530                 String sep = "";
531                 for (String item : line) {
532                     out.append(sep).append(NAME.apply(item));
533                     sep = " > ";
534                 }
535                 out.append('\n');
536                 out.flush();
537             } catch (IOException e) {
538             }
539         } else {
540             for (String child : children) {
541                 line.add(child);
542                 print(out, parentToChild, line);
543                 line.remove(line.size() - 1);
544             }
545         }
546     }
547 
    /**
     * Computes the chains of language codes leading from a Wikidata entity up to the root.
     *
     * <p>Each chain from Containment.getAllDirected is converted from entities to codes, in the
     * same child-first order. Entities without a code are dropped. After the first code, only
     * codes in COLLECTIONS are kept (with "zh" rewritten to "zhx"). A chain that repeats a code
     * is logged as a cycle and discarded. Chains with more than one code get {@code
     * LocaleNames.MUL} appended as the root. Finally, any chain whose codes are a strict subset
     * of another chain's codes is removed.
     *
     * @param leaf the Wikidata entity to start from
     * @param skipping receives one message per code dropped for not being a collection
     * @return the surviving chains, each ordered child-first and ending in the root
     */
    private static Set<Set<String>> getAncestors(String leaf, Set<String> skipping) {
        Set<List<String>> items = Containment.getAllDirected(QUERY_HELPER.childToParent, leaf);
        Set<Set<String>> itemsFixed = new LinkedHashSet<>();
        main:
        for (List<String> item : items) {
            // LinkedHashSet keeps the child-first order and detects repeated codes.
            Set<String> chain = new LinkedHashSet<>();
            for (String id : item) {
                String code = QUERY_HELPER.entityToCode.get(id);
                if (code == null) {
                    // entity has no language code; leave it out of the chain
                    continue;
                }

                // After the first code in the chain, keep only collection codes: any other
                // code is skipped (and recorded), except "zh", which becomes "zhx".

                if (!chain.isEmpty() && !COLLECTIONS.contains(code)) {
                    if (code.equals("zh")) {
                        code = "zhx"; // rewrite collections usage
                    } else {
                        skipping.add(
                                "Skipping inheritance from\t"
                                        + chain
                                        + "\t"
                                        + code
                                        + "\tfrom\t"
                                        + QUERY_HELPER.showNameAndCode2(items));
                        continue;
                    }
                }

                // check for cycle, and skip if we have one: add() returns false when the code
                // is already in the chain, in which case the whole chain is abandoned

                boolean changed = chain.add(code);
                if (!changed) {
                    log("Cycle in\t" + chain + "\tfrom\t" + QUERY_HELPER.showNameAndCode2(items));
                    continue main;
                }
            }
            // A chain with a single code carries no parent information, so it is not kept.
            if (chain.size() > 1) {
                chain.add(LocaleNames.MUL); // root
                itemsFixed.add(chain);
            }
        }
        // remove subsets
        // eg [[smp, he, mul], [smp, he, sem, afa, mul]]
        // => [[smp, he, sem, afa, mul]]
        if (itemsFixed.size() > 1) {
            // Collect first and remove afterwards, so itemsFixed is not modified while iterating.
            Set<Set<String>> removals = new HashSet<>();
            for (Set<String> chain1 : itemsFixed) {
                for (Set<String> chain2 : itemsFixed) {
                    // chain2 is a strict subset of chain1
                    if (chain1.containsAll(chain2) && !chain2.containsAll(chain1)) {
                        removals.add(chain2);
                    }
                }
            }
            itemsFixed.removeAll(removals);
        }
        return itemsFixed;
        // TODO: delete this commented-out code?
        //        while (true) {
        //            String code = entityToCode.get(leaf);
        //            if (code != null) {
        //                chain.add(code);
        //            }
        //            Collection<String> parents = childToParent.get(leaf);
        //            if (parents.isEmpty()) {
        //                // clean up duplicates
        //                chain = new ArrayList<>(new LinkedHashSet<>(chain));
        //                // wikipedia has non-collections as parents. Remove those if they are not
        // first.
        //                break;
        //            }
        //            leaf = getBest(parents);
        //        }
        //        String last = chain.get(0);
        //        for (int i = 1; i < chain.size(); ++i) {
        //            String item = chain.get(i);
        //            if (!COLLECTIONS.contains(item)) {
        //                chain.set(i, item.equals("zh") ? "zhx" : "");
        //                DROPPED_PARENTS_TO_CHILDREN.put(item, last);
        //            } else {
        //                last = item;
        //            }
        //        }
        //        chain.removeIf(x -> x.isEmpty());
        //        if ("zh".equals(chain.get(0))) {
        //            chain.add(1,"zhx");
        //        }
        //        last = chain.get(chain.size()-1);
        //        if (!LocaleNames.MUL.equals(last)) {
        //            chain.add(LocaleNames.MUL); // make sure we have root.
        //        }
        //        if (chain.size() == 2) {
        //            chain.add(1,LocaleNames.UND);
        //        }
        //        return chain;
    }
644 
    /** Prints a diagnostic message to standard output. */
    private static void log(String string) {
        System.out.println(string);
        //        for (Entry<String, String> e : DROPPED_PARENTS_TO_CHILDREN.entries()) {
        //            System.out.println(NAME.apply(e.getKey()) + "\t" + NAME.apply(e.getValue())
        //                );
        //        }
    }
652 
653     // TODO: This function is only called by other commented-out code above.
654     //    private static String getBest(Collection<String> parents) {
655     //        for (String parent : parents) {
656     //            String code = QUERY_HELPER.entityToCode.get(parent);
657     //            if (code == null) continue;
658     //            Type type = Iso639Data.getType(code);
659     //            if (type != Type.Living) {
660     //                continue;
661     //            }
662     //            return parent;
663     //        }
664     //        // failed
665     //        return parents.iterator().next();
666     //    }
667 
loadQueryPairs( Class<?> class1, String file, Function<String, String> keyMapper, Function<String, String> valueMapper)668     private static Multimap<String, String> loadQueryPairs(
669             Class<?> class1,
670             String file,
671             Function<String, String> keyMapper,
672             Function<String, String> valueMapper)
673             throws IOException {
674         System.out.println("QUERY: " + file);
675         ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER);
676         // the query must return exactly two variables.
677         List<String> resultVars = rs.getResultVars();
678         assertTwoVars(resultVars);
679         final String keyName = resultVars.get(0);
680         final String valueName = resultVars.get(1);
681 
682         ImmutableMultimap.Builder<String, String> _keyToValues = ImmutableMultimap.builder();
683         for (; rs.hasNext(); ) {
684             final QuerySolution qs = rs.next();
685             String key = QueryClient.getStringOrNull(qs, keyName);
686             String value = QueryClient.getStringOrNull(qs, valueName);
687             _keyToValues.put(key, value);
688         }
689         ImmutableMultimap<String, String> result = _keyToValues.build();
690         showDups(file, result, keyMapper, valueMapper);
691         System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber());
692         return result;
693     }
694 
695     /**
696      * Assuming that the SPARQL query returns exactly 2 results, treat them as Key=Value.
697      *
698      * @param class1
699      * @param file name of a sparql query, such as 'wikidata-childToParent'
700      * @param fixValue
701      * @param keyMapper
702      * @param valueMapper
703      * @return
704      * @throws IOException
705      */
loadQueryPairsUnique( Class<?> class1, String file, Function<String, String> fixValue, Function<String, String> keyMapper, Function<String, String> valueMapper)706     private static Map<String, String> loadQueryPairsUnique(
707             Class<?> class1,
708             String file,
709             Function<String, String> fixValue,
710             Function<String, String> keyMapper,
711             Function<String, String> valueMapper)
712             throws IOException {
713 
714         System.out.println("QUERY: " + file);
715         ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER);
716 
717         // the query must return exactly two variables.
718         List<String> resultVars = rs.getResultVars();
719         assertTwoVars(resultVars);
720         final String keyName = resultVars.get(0);
721         final String valueName = resultVars.get(1);
722 
723         Map<String, String> _keyToValue = new TreeMap<>();
724         Multimap<String, String> _keyToValues = TreeMultimap.create();
725         for (; rs.hasNext(); ) {
726             final QuerySolution qs = rs.next();
727             String key = QueryClient.getStringOrNull(qs, keyName);
728             String value = QueryClient.getStringOrNull(qs, valueName);
729             if (fixValue != null) {
730                 value = fixValue.apply(value);
731             }
732             _keyToValues.put(key, value);
733             String oldValue = _keyToValue.get(key);
734             if (oldValue == null || oldValue.equals("kxm")) {
735                 _keyToValue.put(key, value);
736             }
737         }
738         _keyToValue = ImmutableMap.copyOf(_keyToValue);
739         showDups(file, _keyToValues, keyMapper, valueMapper);
740         System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber());
741         return _keyToValue;
742     }
743 
assertTwoVars(List<String> resultVars)744     private static void assertTwoVars(List<String> resultVars) {
745         if (resultVars.size() != 2) {
746             throw new IllegalArgumentException(
747                     "expected 2 result vars but got " + resultVars.size() + ": " + resultVars);
748         }
749     }
750 
showDups( String file, Multimap<String, String> _keyToValues, Function<String, String> keyMapper, Function<String, String> valueMapper)751     private static void showDups(
752             String file,
753             Multimap<String, String> _keyToValues,
754             Function<String, String> keyMapper,
755             Function<String, String> valueMapper) {
756         for (Entry<String, Collection<String>> entry : _keyToValues.asMap().entrySet()) {
757             Collection<String> valueSet = entry.getValue();
758             if (valueSet.size() > 1) {
759                 String key = entry.getKey();
760                 key = keyMapper == null ? key : keyMapper.apply(key);
761                 if (valueMapper != null) {
762                     Set<String> result = new LinkedHashSet<>();
763                     valueSet.stream().map(valueMapper).forEach(x -> result.add(x));
764                     valueSet = result;
765                 }
766                 log(file + "\tMultiple values: " + key + "\t" + valueSet);
767             }
768         }
769     }
770 
getAllAncestors(String lang)771     static Set<List<String>> getAllAncestors(String lang) {
772         return Containment.getAllDirected(QUERY_HELPER.childToParent, lang);
773     }
774 }
775