1 package org.unicode.cldr.tool; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.base.Splitter; 5 import com.google.common.collect.ImmutableMap; 6 import com.google.common.collect.ImmutableMultimap; 7 import com.google.common.collect.ImmutableSet; 8 import com.google.common.collect.ImmutableSet.Builder; 9 import com.google.common.collect.LinkedHashMultimap; 10 import com.google.common.collect.Multimap; 11 import com.google.common.collect.Multimaps; 12 import com.google.common.collect.SortedSetMultimap; 13 import com.google.common.collect.TreeMultimap; 14 import com.ibm.icu.impl.Row.R2; 15 import com.ibm.icu.util.ICUUncheckedIOException; 16 import java.io.IOException; 17 import java.io.PrintWriter; 18 import java.io.Writer; 19 import java.util.ArrayList; 20 import java.util.Arrays; 21 import java.util.Collection; 22 import java.util.Collections; 23 import java.util.HashSet; 24 import java.util.LinkedHashSet; 25 import java.util.List; 26 import java.util.Map; 27 import java.util.Map.Entry; 28 import java.util.Set; 29 import java.util.TreeMap; 30 import java.util.TreeSet; 31 import java.util.function.Consumer; 32 import java.util.function.Function; 33 import org.apache.jena.query.QuerySolution; 34 import org.apache.jena.query.ResultSet; 35 import org.unicode.cldr.draft.FileUtilities; 36 import org.unicode.cldr.rdf.QueryClient; 37 import org.unicode.cldr.rdf.TsvWriter; 38 import org.unicode.cldr.util.*; 39 import org.unicode.cldr.util.Iso639Data.Type; 40 import org.unicode.cldr.util.StandardCodes.LstrField; 41 import org.unicode.cldr.util.StandardCodes.LstrType; 42 import org.unicode.cldr.util.Validity.Status; 43 44 /** 45 * This code generates language group containment based on Wikidata. For example, it finds: root > 46 * Indo-European [Other] (ine) > Germanic [Other] (gem) > West Germanic languages (gmw) > English 47 * (en) 48 * 49 * <p>To do this, it reads three tables from Wikidata, and combines them. The combination is not 50 * trivial, because wikidata offers multiple "parents" for the same language, and many of the 51 * parents do not have ISO codes. For the first problem, the software computes the possible parent 52 * chains and picks among them. For the second problem, any parents without ISO codes are skipped 53 * (after forming the chains, so the ultimate ancestors are still found). <br> 54 * A number of debugging files are written to the external directory. 55 * 56 * <p>Some failures will be exposed by running this tool. Examples: <br> 57 * <b>wikidata-entityToCode Multiple values:</b> Cebaara [Q1097512] [sef, sev]. <br> 58 * If these are not CLDR languages then they do not need to be fixed. <br> 59 * <b>wikidata-childToParent Multiple values:</b> Q118712 [Q118712] [German [de, Q18], English [en, 60 * Q186]] <br> 61 * Normally these don't need to be fixed; the generation code works around them. <br> 62 * <b>Cycle in [dng, zhx]</b> from [[http://www.wikidata.org/entity/Q33050, <br> 63 * These indicate that the Wikidata has a cycle in it. A => B => C => A. Ignore these unless the 64 * cases are worth investigating. 65 * 66 * <p>Others are exposed by running TestLanguageGroup.java <br> 67 * Error: (TestLanguageGroup.java:55) Single ancestor but not in ISOLATES: ce [Chechen] [ce] <br> 68 * Check to see if the language has a language group (in this case not, so add to 69 * TestLanguageGroup.ISOLATEs). <br> 70 * For kea [Kabuverdianu] [kea], you can add cpp as the parent, as follows. <br> 71 * <b>Missing.</b> If a child-parent relation is missing, you can add it to EXTRA_PARENT_CHILDREN so 72 * that it shows up. For example, .put("gmw", "lb") says that West Germanic is the parent of 73 * Luxembourgish. <br> 74 * <b>Extra.</b> Sometimes wikidata has conflicting or erroneous entries. Those can be fixed by 75 * adding to REMOVE_PARENT_CHILDREN. Use * to remove all children, such as .put("crp", "*") <br> 76 * Sometimes the tool fails with JsonParseExceptions, but works if you rerun. <br> 77 * Cycle in [dng, zhx] from ... Will be fixed by giving the language 'no parent' (mul) 78 * 79 * <p> 80 */ 81 public class GenerateLanguageContainment { 82 static { 83 System.out.println( 84 "See the class description for GenerateLanguageContainment.java about fixing problems."); 85 } 86 87 private static final boolean ONLY_LIVING = false; 88 private static final CLDRConfig CONFIG = CLDRConfig.getInstance(); 89 private static final QueryClient queryClient = QueryClient.getInstance(); 90 91 static final Splitter TAB = Splitter.on('\t').trimResults(); 92 static final CLDRFile ENGLISH = CONFIG.getEnglish(); 93 static final String relDir = "../util/data/languages/"; 94 static final Map<String, R2<List<String>, String>> ALIAS_MAP = 95 CONFIG.getSupplementalDataInfo().getLocaleAliasInfo().get("language"); 96 97 /** We load the SparQL queries using this helper object, to be able to catch exceptions… */ 98 static final class QueryHelper { 99 public final Map<String, String> entityToLabel; 100 public final Map<String, String> entityToCode; 101 public final ImmutableMultimap<String, String> codeToEntity; 102 public final Multimap<String, String> childToParent; 103 QueryHelper()104 QueryHelper() { 105 try { 106 entityToLabel = 107 loadQueryPairsUnique( 108 GenerateLanguageContainment.class, 109 "wikidata-entityToLabel", 110 null, 111 null, 112 null); 113 114 entityToCode = 115 loadQueryPairsUnique( 116 GenerateLanguageContainment.class, 117 "wikidata-entityToCode", 118 code -> { 119 code = code.replace("\"", ""); 120 R2<List<String>, String> v = ALIAS_MAP.get(code); 121 String result = v == null ? code : v.get0().get(0); 122 result = result.contains("_") ? code : result; 123 return result; 124 }, 125 code -> showNameAndCode(code), 126 NAME); 127 128 codeToEntity = 129 ImmutableMultimap.copyOf( 130 Multimaps.invertFrom( 131 Multimaps.forMap(entityToCode), 132 LinkedHashMultimap.create())); 133 134 childToParent = 135 loadQueryPairs( 136 GenerateLanguageContainment.class, 137 "wikidata-childToParent", 138 code -> showNameAndCode(code), 139 code -> showNameAndCode(code)); 140 141 } catch (Throwable t) { 142 t.printStackTrace(); 143 throw new RuntimeException(t); 144 } 145 } 146 getEntityName(String key)147 String getEntityName(String key) { 148 String code = getEntityCode(key); 149 if (code != null) { 150 try { 151 String name = NAME.apply(code); 152 if (name != null) { 153 return name; 154 } 155 } catch (Exception e) { 156 // TODO: Why would NAME.apply throw? 157 // TODO: Need better handling here? 158 } 159 } 160 String name = entityToLabel.get(key); 161 if (name != null) { 162 return name; 163 } 164 return afterLastSlash(key); 165 } 166 getEntityCode(String key)167 private String getEntityCode(String key) { 168 return entityToCode == null ? null : entityToCode.get(key); 169 } 170 afterLastSlash(String key)171 private String afterLastSlash(String key) { 172 return key.substring(key.lastIndexOf('/') + 1, key.length() - 1); 173 } 174 writeTsvs()175 public void writeTsvs() throws IOException { 176 TsvWriter.writeTsv("childToParent.tsv", childToParent, "child", "parent"); 177 TsvWriter.writeTsv("entityToCode.tsv", entityToCode, "lang", "langCode"); 178 TsvWriter.writeTsv("entityToLabel.tsv", entityToLabel, "lang", "langLabel"); 179 SortedSetMultimap<String, String> childToParentWithCodes = TreeMultimap.create(); 180 for (Entry<String, String> entry : childToParent.entries()) { 181 String child = entry.getKey(); 182 String parent = entry.getValue(); 183 childToParentWithCodes.put(showNameAndCode(child), showNameAndCode(parent)); 184 } 185 TsvWriter.writeTsv( 186 "childToParentWithCodes.tsv", 187 childToParentWithCodes, 188 "childCode\tLabel", 189 "parentCode\tLabel"); 190 } 191 showNameAndCode(String qid)192 public String showNameAndCode(String qid) { 193 return getEntityName(qid) 194 + " (" 195 + (getEntityCode(qid) == null ? "" : getEntityCode(qid) + ", ") 196 + afterLastSlash(qid) 197 + ")"; 198 } 199 showNameAndCode(T qids)200 public <T extends Iterable<String>> String showNameAndCode(T qids) { 201 StringBuilder b = new StringBuilder(); 202 qids.forEach( 203 qid -> { 204 if (b.length() != 0) b.append(", "); 205 b.append(showNameAndCode(qid)); 206 }); 207 return b.toString(); 208 } 209 showNameAndCode2(U qids)210 public <T extends Iterable<String>, U extends Iterable<T>> String showNameAndCode2(U qids) { 211 StringBuilder b = new StringBuilder(); 212 qids.forEach( 213 qid -> { 214 if (b.length() != 0) b.append("; "); 215 b.append(showNameAndCode(qid)); 216 }); 217 return b.toString(); 218 } 219 } 220 221 static final QueryHelper QUERY_HELPER = new QueryHelper(); 222 223 static final Function<String, String> NAME = 224 code -> 225 code.equals(LocaleNames.MUL) 226 ? LocaleNames.ROOT 227 : ENGLISH.getName(code) + " (" + code + ")"; 228 229 static final Set<String> COLLECTIONS; 230 231 static { 232 Map<String, Map<LstrField, String>> languages = 233 StandardCodes.getEnumLstreg().get(LstrType.language); 234 Builder<String> _collections = ImmutableSet.<String>builder(); 235 for (Entry<String, Map<LstrField, String>> e : languages.entrySet()) { 236 String scope = e.getValue().get(LstrField.Scope); 237 if (scope != null && "Collection".equalsIgnoreCase(scope)) { e.getKey()238 _collections.add(e.getKey()); 239 } 240 } 241 COLLECTIONS = _collections.build(); 242 } 243 244 static class Tree { 245 Set<String> leaves = new LinkedHashSet<>(); 246 add(List<String> chain)247 void add(List<String> chain) { 248 Collections.reverse(chain); 249 } 250 } 251 252 /** To add parent-child relations to Wikidata */ 253 static final Multimap<String, String> EXTRA_PARENT_CHILDREN = 254 ImmutableMultimap.<String, String>builder() 255 .put("alv", "agq") 256 .put("alv", "cch") // Atlantic–Congo <= cch [Atsam] 257 .put("alv", "kcg") // Atlantic–Congo <= kcg [Tyap] 258 .put("alv", "ken") // Atlantic–Congo <= ken [Kenyang] 259 .put("alv", "ngb") 260 .put("alv", "yav") 261 .put("ber", "zgh") 262 .put("bnt", "asa") 263 .put("bnt", "bez") 264 .put("bnt", "cgg") 265 .put("bnt", "ebu") 266 .put("bnt", "jmc") 267 .put("bnt", "ksb") 268 .put("bnt", "lag") 269 .put("bnt", "mer") 270 .put("bnt", "mgh") 271 .put("bnt", "nmg") 272 .put("bnt", "rof") 273 .put("bnt", "rwk") 274 .put("bnt", "sbp") 275 .put("bnt", "seh") 276 .put("bnt", "vun") 277 .put("bnt", "xog") 278 .put("cpp", "kea") 279 .put("euq", "eu") 280 // gmw = West Germanic 281 .put("gmw", "ksh") 282 .put("gmw", "lb") 283 .put("gmw", "wae") 284 .put("grk", "el") 285 .put("grk", "gmy") 286 .put("grk", "grc") 287 .put("ira", "lrc") 288 .put("ira", "bgn") // Iranian <= Western Balochi 289 .put("inc", "trw") // Indo-Aryan <= Torwali 290 .put("jpx", "ja") 291 .put(LocaleNames.MUL, "art") 292 .put(LocaleNames.MUL, "euq") 293 .put(LocaleNames.MUL, "jpx") 294 .put(LocaleNames.MUL, "tai") 295 .put("ngb", "sg") 296 .put("roa", "cpf") 297 .put("roa", "cpp") 298 .put("roa", "cpp") 299 .put("sdv", "saq") 300 .put("son", "khq") 301 .put("sw", "swc") 302 .put("tai", "blt") // tai [Tai] <= blt [Tai Dam] 303 .put("tai", "lo") 304 .put("tai", "th") 305 .put("zlw", "szl") // West Slavic <= Silesian 306 .build(); 307 308 /** 309 * To remove parent-child relations from Wikidata, eg if a child has two parents (where that 310 * causes problems) 311 */ 312 static final Multimap<String, String> REMOVE_PARENT_CHILDREN = 313 ImmutableMultimap.<String, String>builder() 314 .put("alv", "ukg") // ngf [Trans-New Guinea languages] <= ukg [Ukuriguma] 315 .put( 316 "crp", 317 "*") // general Creole group interferes with French/Spanish/... language 318 // grouping 319 .put("cus", "mhd") // bnt [Bantu] <= mhd [Mbugu] (not cus [Cushitic]) 320 .put("gmw", "pih") // cpe [Creoles and pidgins, English based] <= pih 321 // [Pitcairn-Norfolk] 322 .put("inc", "rmg") 323 // Indo-European 324 .put("ine", "el") 325 .put("ine", "gmy") 326 .put("ine", "grc") 327 .put("ine", "trw") // inc [Indic] <= trw [Torwali] 328 .put(LocaleNames.MUL, "crp") 329 .put(LocaleNames.MUL, "cpp") // Creoles and pidgins, Portuguese-based 330 .put(LocaleNames.MUL, LocaleNames.UND) // anomaly 331 .put("nic", "kcp") // ssa [Nilo-Saharan] <= kcp [Kanga] 332 .put("nic", "kec") // ssa [Nilo-Saharan] <= kec [Keiga] 333 .put("nic", "kgo") // ssa [Nilo-Saharan] <= kgo [Krongo] 334 .put("nic", "rof") // ssa [Nilo-Saharan] <= rof [Rombo] 335 .put("nic", "tbr") // ssa [Nilo-Saharan] <= tbr [Tumtum] 336 .put("nic", "tey") // ssa [Nilo-Saharan] <= tey [Tulishi] 337 .put("sit", "th") // sit <= tbq <= th 338 .put("sit", "dz") // sit <= tbq <= dz 339 .put("sit", "zh") 340 .put("sla", "cu") 341 .put("tbq", "psq") // paa [Papuan]; for psq [Pasi] - not tbq [Tibeto-Burman 342 // languages]; (There is also a variety of the Sino-Tibetan Adi 343 // language called Pasi. 344 .build(); 345 main(String[] args)346 public static void main(String[] args) throws IOException { 347 new GenerateLanguageContainment().run(args); 348 if (Containment.hadErrors) { 349 System.err.println("ERROR: Containment Errors detected, see errors above."); 350 System.exit(1); 351 } 352 } 353 run(String[] args)354 void run(String[] args) throws IOException { 355 if (true) { 356 // check on items 357 for (String check : Arrays.asList("sw", "km", "ksh", "wae", "kea", "mfe", "th", "lo")) { 358 System.out.println("Checking " + ENGLISH.getName(check) + "[" + check + "]"); 359 Collection<String> entities = QUERY_HELPER.codeToEntity.get(check); 360 if (entities.isEmpty()) { 361 System.out.println("no code for " + check + ": " + entities); 362 continue; 363 } 364 for (String entity : entities) { 365 Set<List<String>> ancestors = getAllAncestors(entity); 366 showEntityLists(entity + " parents ", ancestors); 367 System.out.println(); 368 } 369 } 370 } 371 372 Map<Status, Set<String>> table = Validity.getInstance().getStatusToCodes(LstrType.language); 373 TreeMultimap<String, String> _parentToChild = TreeMultimap.create(); 374 TreeSet<String> missing = new TreeSet<>(table.get(Status.regular)); 375 _parentToChild.put(LocaleNames.MUL, LocaleNames.UND); 376 Set<String> skipping = new LinkedHashSet<>(); 377 for (String code : table.get(Status.regular)) { 378 if (ONLY_LIVING) { 379 Type type = Iso639Data.getType(code); 380 if (type != Type.Living) { 381 continue; 382 } 383 } 384 if (code.compareTo("hdz") > 0) { 385 int debug = 0; 386 } 387 // if (COLLECTIONS.contains(code)) { 388 // continue; 389 // } 390 Collection<String> entities = QUERY_HELPER.codeToEntity.get(code); 391 if (entities.isEmpty()) { 392 continue; 393 } 394 for (String entity : entities) { 395 if (QUERY_HELPER.childToParent.get(entity).isEmpty()) { 396 continue; 397 } 398 Set<Set<String>> chains = getAncestors(entity, skipping); 399 if (chains.size() > 1) { 400 int debug = 0; 401 } 402 for (Set<String> chain : chains) { 403 String last = null; 404 for (String link : chain) { 405 if (last != null) { 406 _parentToChild.put(link, last); 407 } 408 last = link; 409 } 410 } 411 } 412 } 413 System.out.println("Writing " + "skippingCodes.tsv"); 414 try (PrintWriter w = 415 FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "skippingCodes.tsv")) { 416 // TsvWriter.writeRow(w, "childCode\tLabel", "parentCode\tLabel"); // header 417 skipping.forEach(e -> w.println(e)); 418 } 419 420 for (Entry<String, Collection<String>> entity : REMOVE_PARENT_CHILDREN.asMap().entrySet()) { 421 String key = entity.getKey(); 422 for (String value : entity.getValue()) { 423 if (value.equals("*")) { 424 _parentToChild.removeAll(key); 425 } else { 426 _parentToChild.remove(key, value); 427 } 428 } 429 } 430 431 _parentToChild.putAll(EXTRA_PARENT_CHILDREN); 432 433 // special code for artificial 434 for (String code : Iso639Data.getAvailable()) { 435 Type type = Iso639Data.getType(code); 436 if (type == Type.Constructed) { 437 _parentToChild.put("art", code); 438 } 439 } 440 441 Multimap<String, String> parentToChild = ImmutableMultimap.copyOf(_parentToChild); 442 Multimap<String, String> childToParent = 443 ImmutableMultimap.copyOf( 444 Multimaps.invertFrom(parentToChild, TreeMultimap.create())); 445 System.out.println( 446 "Checking " + "he" + "\t" + Containment.getAllDirected(childToParent, "he")); 447 448 try (PrintWriter w = 449 FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "RawLanguageContainment.txt")) { 450 print(w, parentToChild, new ArrayList<>(Arrays.asList(LocaleNames.MUL))); 451 } 452 SimpleXMLSource xmlSource = new SimpleXMLSource("languageGroup"); 453 xmlSource.setNonInheriting(true); // should be gotten from DtdType... 454 CLDRFile newFile = new CLDRFile(xmlSource); 455 newFile.setDtdType(DtdType.supplementalData); 456 newFile.add("//" + DtdType.supplementalData + "/version[@number='$Revision$']", ""); 457 printXML(newFile, parentToChild); 458 459 try (PrintWriter outFile = 460 FileUtilities.openUTF8Writer( 461 CLDRPaths.SUPPLEMENTAL_DIRECTORY, "languageGroup.xml")) { 462 newFile.write(outFile); 463 } catch (IOException e1) { 464 throw new ICUUncheckedIOException("Can't write to languageGroup.xml", e1); 465 } 466 467 // for (Entry<String,String> entry : childToParent.entries()) { 468 // String childNames = getName(entityToCode, entityToLabel, entry.getKey()); 469 // String parentNames = getName(entityToCode, entityToLabel, entry.getValue()); 470 // System.out.println(entry.getKey() + "\t" + entry.getValue() + "\t" + 471 // childNames + "\t" + parentNames); 472 // } 473 QUERY_HELPER.writeTsvs(); 474 } 475 showEntityLists(String title, Set<List<String>> ancestors)476 private static void showEntityLists(String title, Set<List<String>> ancestors) { 477 ancestors.forEach( 478 new Consumer<List<String>>() { 479 @Override 480 public void accept(List<String> item) { 481 item.forEach( 482 new Consumer<String>() { 483 @Override 484 public void accept(String t) { 485 System.out.println( 486 t 487 + "\t" 488 + QUERY_HELPER.entityToCode.get(t) 489 + "\t" 490 + QUERY_HELPER.entityToLabel.get(t)); 491 } 492 }); 493 System.out.println(); 494 } 495 }); 496 } 497 printXML(CLDRFile newFile, Multimap<String, String> parentToChild)498 private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild) { 499 printXML(newFile, parentToChild, LocaleNames.MUL); 500 } 501 printXML( CLDRFile newFile, Multimap<String, String> parentToChild, String base)502 private static void printXML( 503 CLDRFile newFile, Multimap<String, String> parentToChild, String base) { 504 Collection<String> children = parentToChild.get(base); 505 if (children.isEmpty()) { 506 return; 507 } 508 if (base.equals(LocaleNames.UND)) { 509 // skip, no good info 510 } else { 511 newFile.add( 512 "//" 513 + DtdType.supplementalData 514 + "/languageGroups/languageGroup[@parent=\"" 515 + base 516 + "\"]", 517 Joiner.on(" ").join(children)); 518 } 519 for (String child : children) { 520 printXML(newFile, parentToChild, child); 521 } 522 } 523 print( Writer out, Multimap<String, String> parentToChild, List<String> line)524 private static void print( 525 Writer out, Multimap<String, String> parentToChild, List<String> line) { 526 String current = line.get(line.size() - 1); 527 Collection<String> children = parentToChild.get(current); 528 if (children.isEmpty()) { 529 try { 530 String sep = ""; 531 for (String item : line) { 532 out.append(sep).append(NAME.apply(item)); 533 sep = " > "; 534 } 535 out.append('\n'); 536 out.flush(); 537 } catch (IOException e) { 538 } 539 } else { 540 for (String child : children) { 541 line.add(child); 542 print(out, parentToChild, line); 543 line.remove(line.size() - 1); 544 } 545 } 546 } 547 getAncestors(String leaf, Set<String> skipping)548 private static Set<Set<String>> getAncestors(String leaf, Set<String> skipping) { 549 Set<List<String>> items = Containment.getAllDirected(QUERY_HELPER.childToParent, leaf); 550 Set<Set<String>> itemsFixed = new LinkedHashSet<>(); 551 main: 552 for (List<String> item : items) { 553 Set<String> chain = new LinkedHashSet<>(); 554 for (String id : item) { 555 String code = QUERY_HELPER.entityToCode.get(id); 556 if (code == null) { 557 continue; 558 } 559 560 // skip leaf nodes after the first 561 562 if (!chain.isEmpty() && !COLLECTIONS.contains(code)) { 563 if (code.equals("zh")) { 564 code = "zhx"; // rewrite collections usage 565 } else { 566 skipping.add( 567 "Skipping inheritance from\t" 568 + chain 569 + "\t" 570 + code 571 + "\tfrom\t" 572 + QUERY_HELPER.showNameAndCode2(items)); 573 continue; 574 } 575 } 576 577 // check for cycle, and skip if we have one 578 579 boolean changed = chain.add(code); 580 if (!changed) { 581 log("Cycle in\t" + chain + "\tfrom\t" + QUERY_HELPER.showNameAndCode2(items)); 582 continue main; 583 } 584 } 585 if (chain.size() > 1) { 586 chain.add(LocaleNames.MUL); // root 587 itemsFixed.add(chain); 588 } 589 } 590 // remove subsets 591 // eg [[smp, he, mul], [smp, he, sem, afa, mul]] 592 // => [[smp, he, sem, afa, mul]] 593 if (itemsFixed.size() > 1) { 594 Set<Set<String>> removals = new HashSet<>(); 595 for (Set<String> chain1 : itemsFixed) { 596 for (Set<String> chain2 : itemsFixed) { 597 if (chain1.containsAll(chain2) && !chain2.containsAll(chain1)) { 598 removals.add(chain2); 599 } 600 } 601 } 602 itemsFixed.removeAll(removals); 603 } 604 return itemsFixed; 605 // TODO: delete this commented-out code? 606 // while (true) { 607 // String code = entityToCode.get(leaf); 608 // if (code != null) { 609 // chain.add(code); 610 // } 611 // Collection<String> parents = childToParent.get(leaf); 612 // if (parents.isEmpty()) { 613 // // clean up duplicates 614 // chain = new ArrayList<>(new LinkedHashSet<>(chain)); 615 // // wikipedia has non-collections as parents. Remove those if they are not 616 // first. 617 // break; 618 // } 619 // leaf = getBest(parents); 620 // } 621 // String last = chain.get(0); 622 // for (int i = 1; i < chain.size(); ++i) { 623 // String item = chain.get(i); 624 // if (!COLLECTIONS.contains(item)) { 625 // chain.set(i, item.equals("zh") ? "zhx" : ""); 626 // DROPPED_PARENTS_TO_CHILDREN.put(item, last); 627 // } else { 628 // last = item; 629 // } 630 // } 631 // chain.removeIf(x -> x.isEmpty()); 632 // if ("zh".equals(chain.get(0))) { 633 // chain.add(1,"zhx"); 634 // } 635 // last = chain.get(chain.size()-1); 636 // if (!LocaleNames.MUL.equals(last)) { 637 // chain.add(LocaleNames.MUL); // make sure we have root. 638 // } 639 // if (chain.size() == 2) { 640 // chain.add(1,LocaleNames.UND); 641 // } 642 // return chain; 643 } 644 log(String string)645 private static void log(String string) { 646 System.out.println(string); 647 // for (Entry<String, String> e : DROPPED_PARENTS_TO_CHILDREN.entries()) { 648 // System.out.println(NAME.apply(e.getKey()) + "\t" + NAME.apply(e.getValue()) 649 // ); 650 // } 651 } 652 653 // TODO: This function is only called by other commented-out code above. 654 // private static String getBest(Collection<String> parents) { 655 // for (String parent : parents) { 656 // String code = QUERY_HELPER.entityToCode.get(parent); 657 // if (code == null) continue; 658 // Type type = Iso639Data.getType(code); 659 // if (type != Type.Living) { 660 // continue; 661 // } 662 // return parent; 663 // } 664 // // failed 665 // return parents.iterator().next(); 666 // } 667 loadQueryPairs( Class<?> class1, String file, Function<String, String> keyMapper, Function<String, String> valueMapper)668 private static Multimap<String, String> loadQueryPairs( 669 Class<?> class1, 670 String file, 671 Function<String, String> keyMapper, 672 Function<String, String> valueMapper) 673 throws IOException { 674 System.out.println("QUERY: " + file); 675 ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER); 676 // the query must return exactly two variables. 677 List<String> resultVars = rs.getResultVars(); 678 assertTwoVars(resultVars); 679 final String keyName = resultVars.get(0); 680 final String valueName = resultVars.get(1); 681 682 ImmutableMultimap.Builder<String, String> _keyToValues = ImmutableMultimap.builder(); 683 for (; rs.hasNext(); ) { 684 final QuerySolution qs = rs.next(); 685 String key = QueryClient.getStringOrNull(qs, keyName); 686 String value = QueryClient.getStringOrNull(qs, valueName); 687 _keyToValues.put(key, value); 688 } 689 ImmutableMultimap<String, String> result = _keyToValues.build(); 690 showDups(file, result, keyMapper, valueMapper); 691 System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber()); 692 return result; 693 } 694 695 /** 696 * Assuming that the SPARQL query returns exactly 2 results, treat them as Key=Value. 697 * 698 * @param class1 699 * @param file name of a sparql query, such as 'wikidata-childToParent' 700 * @param fixValue 701 * @param keyMapper 702 * @param valueMapper 703 * @return 704 * @throws IOException 705 */ loadQueryPairsUnique( Class<?> class1, String file, Function<String, String> fixValue, Function<String, String> keyMapper, Function<String, String> valueMapper)706 private static Map<String, String> loadQueryPairsUnique( 707 Class<?> class1, 708 String file, 709 Function<String, String> fixValue, 710 Function<String, String> keyMapper, 711 Function<String, String> valueMapper) 712 throws IOException { 713 714 System.out.println("QUERY: " + file); 715 ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER); 716 717 // the query must return exactly two variables. 718 List<String> resultVars = rs.getResultVars(); 719 assertTwoVars(resultVars); 720 final String keyName = resultVars.get(0); 721 final String valueName = resultVars.get(1); 722 723 Map<String, String> _keyToValue = new TreeMap<>(); 724 Multimap<String, String> _keyToValues = TreeMultimap.create(); 725 for (; rs.hasNext(); ) { 726 final QuerySolution qs = rs.next(); 727 String key = QueryClient.getStringOrNull(qs, keyName); 728 String value = QueryClient.getStringOrNull(qs, valueName); 729 if (fixValue != null) { 730 value = fixValue.apply(value); 731 } 732 _keyToValues.put(key, value); 733 String oldValue = _keyToValue.get(key); 734 if (oldValue == null || oldValue.equals("kxm")) { 735 _keyToValue.put(key, value); 736 } 737 } 738 _keyToValue = ImmutableMap.copyOf(_keyToValue); 739 showDups(file, _keyToValues, keyMapper, valueMapper); 740 System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber()); 741 return _keyToValue; 742 } 743 assertTwoVars(List<String> resultVars)744 private static void assertTwoVars(List<String> resultVars) { 745 if (resultVars.size() != 2) { 746 throw new IllegalArgumentException( 747 "expected 2 result vars but got " + resultVars.size() + ": " + resultVars); 748 } 749 } 750 showDups( String file, Multimap<String, String> _keyToValues, Function<String, String> keyMapper, Function<String, String> valueMapper)751 private static void showDups( 752 String file, 753 Multimap<String, String> _keyToValues, 754 Function<String, String> keyMapper, 755 Function<String, String> valueMapper) { 756 for (Entry<String, Collection<String>> entry : _keyToValues.asMap().entrySet()) { 757 Collection<String> valueSet = entry.getValue(); 758 if (valueSet.size() > 1) { 759 String key = entry.getKey(); 760 key = keyMapper == null ? key : keyMapper.apply(key); 761 if (valueMapper != null) { 762 Set<String> result = new LinkedHashSet<>(); 763 valueSet.stream().map(valueMapper).forEach(x -> result.add(x)); 764 valueSet = result; 765 } 766 log(file + "\tMultiple values: " + key + "\t" + valueSet); 767 } 768 } 769 } 770 getAllAncestors(String lang)771 static Set<List<String>> getAllAncestors(String lang) { 772 return Containment.getAllDirected(QUERY_HELPER.childToParent, lang); 773 } 774 } 775