1 package org.unicode.cldr.tool; 2 3 import com.google.common.collect.LinkedHashMultimap; 4 import com.google.common.collect.Multimap; 5 import com.google.common.collect.TreeMultimap; 6 import com.ibm.icu.impl.Row.R2; 7 import com.ibm.icu.impl.Row.R3; 8 import com.ibm.icu.impl.Row.R4; 9 import com.ibm.icu.impl.Utility; 10 import com.ibm.icu.lang.UProperty; 11 import com.ibm.icu.lang.UScript; 12 import com.ibm.icu.text.Normalizer2; 13 import com.ibm.icu.text.UTF16; 14 import com.ibm.icu.text.UnicodeSet; 15 import com.ibm.icu.util.ICUUncheckedIOException; 16 import com.ibm.icu.util.ULocale; 17 import java.io.IOException; 18 import java.io.PrintWriter; 19 import java.util.Arrays; 20 import java.util.Collection; 21 import java.util.HashMap; 22 import java.util.HashSet; 23 import java.util.Iterator; 24 import java.util.List; 25 import java.util.Locale; 26 import java.util.Map; 27 import java.util.Map.Entry; 28 import java.util.Set; 29 import java.util.TreeMap; 30 import java.util.TreeSet; 31 import org.apache.jena.query.QuerySolution; 32 import org.apache.jena.query.ResultSet; 33 import org.unicode.cldr.draft.FileUtilities; 34 import org.unicode.cldr.rdf.QueryClient; 35 import org.unicode.cldr.rdf.TsvWriter; 36 import org.unicode.cldr.test.DisplayAndInputProcessor; 37 import org.unicode.cldr.util.CLDRConfig; 38 import org.unicode.cldr.util.CLDRFile; 39 import org.unicode.cldr.util.CLDRFile.NumberingSystem; 40 import org.unicode.cldr.util.CLDRFile.WinningChoice; 41 import org.unicode.cldr.util.CLDRPaths; 42 import org.unicode.cldr.util.ChainedMap; 43 import org.unicode.cldr.util.ChainedMap.M4; 44 import org.unicode.cldr.util.CldrUtility; 45 import org.unicode.cldr.util.Counter; 46 import org.unicode.cldr.util.Factory; 47 import org.unicode.cldr.util.SimpleXMLSource; 48 import org.unicode.cldr.util.StandardCodes.LstrType; 49 import org.unicode.cldr.util.SupplementalDataInfo; 50 import org.unicode.cldr.util.Validity; 51 import org.unicode.cldr.util.Validity.Status; 52 import org.unicode.cldr.util.XPathParts; 53 54 public final class WikiSubdivisionLanguages { 55 private static final String WIKI_SUBDIVISION_LANGUAGES_TSV = "wikiSubdivisionLanguages.tsv"; 56 static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance(); 57 static final Set<String> regularSubdivisions = 58 Validity.getInstance().getStatusToCodes(LstrType.subdivision).get(Status.regular); 59 60 static final Map<String, R2<List<String>, String>> SUBDIVISION_ALIASES = 61 SDI.getLocaleAliasInfo().get("subdivision"); 62 63 private static final boolean DEBUG_CONSOLE = false; 64 private static final String DEBUG_LANG_FILTER = null; // "az"; 65 66 private static final String BEFORE_TYPE = 67 "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\""; 68 69 private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); 70 private static final Normalizer2 NFC = Normalizer2.getNFCInstance(); 71 72 private static ChainedMap.M3<String, String, String> SUB_LANG_NAME = 73 ChainedMap.of( 74 new TreeMap<String, Object>(), new TreeMap<String, Object>(), String.class); 75 private static ChainedMap.M3<String, String, String> LANG_SUB_NAME = 76 ChainedMap.of( 77 new TreeMap<String, Object>(), new TreeMap<String, Object>(), String.class); 78 private static Set<String> bogus = new TreeSet<>(); 79 private static Multimap<Status, String> bogusStatus = TreeMultimap.create(); 80 getSubdivisionName(String subdivisionId, String languageId)81 public static String getSubdivisionName(String subdivisionId, String languageId) { 82 return WikiSubdivisionLanguages.LANG_SUB_NAME.get(languageId, subdivisionId); 83 } 84 getBestWikiEnglishName(String subdivisionId)85 public static String getBestWikiEnglishName(String subdivisionId) { 86 String languageId = "en"; 87 String name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, languageId); 88 if (name != null) { 89 return name; 90 } 91 name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "es"); 92 if (name != null) { 93 return name; 94 } 95 name = WikiSubdivisionLanguages.getSubdivisionName(subdivisionId, "fr"); 96 if (name != null) { 97 return name; 98 } 99 Map<String, String> data = WikiSubdivisionLanguages.SUB_LANG_NAME.get(subdivisionId); 100 // try Spanish, then French, then first other 101 if (data != null) { 102 return data.entrySet().iterator().next().getValue(); // get first 103 } 104 return null; 105 } 106 107 private static final String QUERY_NAME = "wikidata-wikisubdivisionLanguages"; 108 109 // static Map<String, String> WIKIDATA_TO_MID = new TreeMap<>(); init()110 static void init() throws IOException { 111 112 QueryClient queryClient = QueryClient.getInstance(); 113 114 System.out.println("QUERY: " + QUERY_NAME); 115 ResultSet rs = 116 queryClient.execSelectFromSparql(QUERY_NAME, QueryClient.WIKIDATA_SPARQL_SERVER); 117 118 Map<String, Status> codeToStatus = 119 Validity.getInstance().getCodeToStatus(LstrType.subdivision); 120 try (PrintWriter tsv = 121 FileUtilities.openUTF8Writer( 122 TsvWriter.getTsvDir(), WIKI_SUBDIVISION_LANGUAGES_TSV)) { 123 TsvWriter.writeRow(tsv, "item", "label", "code", "codeLabel"); 124 for (; rs.hasNext(); ) { 125 final QuerySolution qs = rs.next(); 126 127 String item = QueryClient.getResourceOrNull(qs, "item"); 128 String label = NFC.normalize(QueryClient.getStringOrNull(qs, "label")); 129 String code = QueryClient.getStringOrNull(qs, "code"); 130 String codeLabel = QueryClient.getStringOrNull(qs, "codeLabel"); 131 132 TsvWriter.writeRow(tsv, item, label, code, codeLabel); 133 134 String subdivision = SubdivisionNode.convertToCldr(code); 135 if (!regularSubdivisions.contains(subdivision)) { 136 Status status = codeToStatus.get(subdivision); 137 if (status == null) { 138 bogus.add(subdivision); 139 } else { 140 bogusStatus.put(status, subdivision); 141 } 142 continue; 143 } 144 if (DEBUG_LANG_FILTER != null && !DEBUG_LANG_FILTER.equals(codeLabel)) { 145 continue; 146 } 147 SUB_LANG_NAME.put(subdivision, codeLabel, label); 148 // WIKIDATA_TO_MID.put(subdivision, data.get(2)); 149 LANG_SUB_NAME.put(codeLabel, subdivision, label); 150 } 151 System.out.println("Queried " + QUERY_NAME + " at row count " + rs.getRowNumber()); 152 } 153 System.out.println("Wrote to " + WIKI_SUBDIVISION_LANGUAGES_TSV); 154 // postprocess 155 String oldLang = null; 156 DisplayAndInputProcessor daip = null; 157 Exception[] internalException = {null}; 158 159 for (R3<String, String, String> row : LANG_SUB_NAME.rows()) { 160 String lang = row.get0(); 161 String subdivision = row.get1(); 162 String name = row.get2(); 163 if (!lang.equals(oldLang)) { 164 oldLang = lang; 165 daip = new DisplayAndInputProcessor(new ULocale(lang)); 166 } 167 String path = getSubdivisionPath(subdivision); 168 String name2 = daip.processInput(path, name.replace("\u00AD", ""), internalException); 169 if (name2.contains("'")) { 170 int debug = 0; 171 } 172 // TODO remove soft hyphen in DAIP 173 if (internalException[0] != null) { 174 throw new IllegalArgumentException( 175 lang + "\t" + subdivision + "\t" + name, internalException[0]); 176 } else if (!name.equals(name2)) { 177 // System.out.println(lang + "\t" + subdivision + "\t" + name + "\t" + name2); 178 SUB_LANG_NAME.put(subdivision, lang, name2); 179 LANG_SUB_NAME.put(lang, subdivision, name2); 180 } 181 } 182 } 183 getSubdivisionPath(String subdivision)184 private static String getSubdivisionPath(String subdivision) { 185 return BEFORE_TYPE + subdivision + "\"][@draft=\"contributed\"]"; 186 } 187 getSubdivisionFromPath(String path)188 private static String getSubdivisionFromPath(String path) { 189 return path.substring(BEFORE_TYPE.length(), path.indexOf('"', BEFORE_TYPE.length())); 190 } 191 main(String[] args)192 public static void main(String[] args) throws IOException { 193 init(); 194 195 Counter<String> counter = new Counter<>(); 196 Factory cldrFactory = CLDR_CONFIG.getCldrFactory(); 197 Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*"); 198 CLDRFile file = null; 199 UnicodeSet exemplars = null; 200 201 ChainedMap.M4<Integer, String, String, String> exemplarFailureLangSubdivisionName = 202 ChainedMap.of( 203 new TreeMap<Integer, Object>(), 204 new TreeMap<String, Object>(), 205 new TreeMap<String, Object>(), 206 String.class); 207 208 for (Entry<String, Map<String, String>> entry : LANG_SUB_NAME) { 209 String lang = entry.getKey(); 210 file = cldrFactory.make(lang, true); 211 212 CLDRFile oldFileSubdivisions; 213 try { 214 oldFileSubdivisions = cldrFactorySubdivisions.make(lang, false); 215 } catch (Exception e) { 216 oldFileSubdivisions = new CLDRFile(new SimpleXMLSource(lang)).freeze(); 217 } 218 219 Multimap<String, String> inverse = LinkedHashMultimap.create(); 220 CLDRFile fileSubdivisions = fixedFile(oldFileSubdivisions, inverse); 221 222 UnicodeSet main = file.getExemplarSet("", WinningChoice.WINNING, 0); 223 UnicodeSet auxiliary = file.getExemplarSet("auxiliary", WinningChoice.WINNING); 224 UnicodeSet punctuation = file.getExemplarSet("punctuation", WinningChoice.WINNING); 225 UnicodeSet numbers = file.getExemplarsNumeric(NumberingSystem.defaultSystem); 226 exemplars = 227 new UnicodeSet() 228 .addAll(main) 229 .addAll(auxiliary) 230 .addAll(scriptsFor(main)) // broad test,... 231 .addAll(punctuation) 232 .addAll(numbers) 233 .addAll(new UnicodeSet("[\\ ]")) 234 .freeze(); 235 236 for (Entry<String, String> entry2 : entry.getValue().entrySet()) { 237 String subdivision = entry2.getKey(); 238 String name = entry2.getValue(); 239 if (name.equals("Böyük Britaniya")) { 240 int debug = 0; 241 } 242 String path = getSubdivisionPath(subdivision); 243 String oldName = fileSubdivisions.getStringValue(path); 244 if (oldName != null) { 245 if (!oldName.equals(name)) { 246 // System.out.println("Already has translation\t" + lang + "\t" + 247 // subdivision + "\t" + name + "\t" + oldName); 248 } 249 continue; 250 } 251 if (!exemplars.containsAll(name)) { 252 UnicodeSet exemplarFailures = 253 new UnicodeSet().addAll(name).removeAll(exemplars); 254 addExemplarFailures( 255 exemplarFailureLangSubdivisionName, 256 exemplarFailures, 257 lang, 258 subdivision, 259 name); 260 continue; 261 } 262 fileSubdivisions.add(path, name); 263 inverse.put(name, path); 264 counter.add(lang, 1); 265 } 266 267 // We now fix collisions 268 for (Entry<String, Collection<String>> entry3 : inverse.asMap().entrySet()) { 269 String name = entry3.getKey(); 270 if (name.isEmpty()) { 271 continue; 272 } 273 if (name.equals("Böyük Britaniya")) { 274 int debug = 0; 275 } 276 Collection<String> paths = entry3.getValue(); 277 if (paths.size() <= 1) { 278 continue; 279 } 280 if (paths.size() > 3) { 281 int debug = 0; 282 } 283 // we only care about collisions *within* a region. 284 // so group them together 285 Multimap<String, String> regionToPaths = LinkedHashMultimap.create(); 286 for (String path : paths) { 287 String sdId = getSubdivisionFromPath(path); 288 String region = sdId.substring(0, 2).toUpperCase(Locale.ROOT); 289 regionToPaths.put(region, path); 290 } 291 292 // Now fix as necessary 293 for (Entry<String, Collection<String>> regionAndPaths : 294 regionToPaths.asMap().entrySet()) { 295 Collection<String> paths2 = regionAndPaths.getValue(); 296 int markerIndex = 0; 297 if (paths2.size() <= 1) { 298 continue; 299 } 300 301 // find if any of the paths are deprecated 302 for (Iterator<String> it = paths2.iterator(); it.hasNext(); ) { 303 String path = it.next(); 304 String sdId = getSubdivisionFromPath(path); 305 if (!regularSubdivisions.contains(sdId)) { // deprecated 306 fileSubdivisions.remove(path); 307 it.remove(); 308 fail( 309 "Duplicate, not regular ", 310 lang, 311 getSubdivisionFromPath(path), 312 "REMOVING", 313 -1); 314 } 315 } 316 if (paths2.size() <= 1) { 317 continue; 318 } 319 320 String otherId = null; 321 for (String path : paths2) { 322 // if (nuke) { 323 // if (oldFileSubdivisions.getStringValue(path) == 324 // null) { 325 // fileSubdivisions.remove(path); // get rid of 326 // new ones 327 // System.out.println("Removing colliding " + 328 // lang + "\t" + path + "\t" + name); 329 // } 330 if (markerIndex == 0) { 331 otherId = getSubdivisionFromPath(path); 332 } else { 333 String fixedName = name + MARKERS.get(markerIndex); 334 fail( 335 "Superscripting ", 336 lang + "\t(" + otherId + ")", 337 getSubdivisionFromPath(path), 338 fixedName, 339 -1); 340 // System.out.println("Superscripting colliding:\t" + lang + "\t" + path 341 // + "\t" + fixedName); 342 fileSubdivisions.add(path, fixedName); // overwrite with superscripted 343 } 344 ++markerIndex; 345 } 346 } 347 } 348 349 if (DEBUG_CONSOLE) { 350 PrintWriter pw = new PrintWriter(System.out); 351 fileSubdivisions.write(new PrintWriter(System.out)); 352 pw.flush(); 353 } else { 354 try (PrintWriter out = 355 FileUtilities.openUTF8Writer( 356 CLDRPaths.SUBDIVISIONS_DIRECTORY, lang + ".xml")) { 357 fileSubdivisions.write(out); 358 } catch (Exception e) { 359 throw new ICUUncheckedIOException(e); 360 } 361 } 362 } 363 fail("ExemplarFailures", exemplarFailureLangSubdivisionName); 364 365 for (String lang : counter.getKeysetSortedByKey()) { 366 fail("Superscripting", lang, String.valueOf(counter.get(lang)), null, -1); 367 } 368 System.out.println("Bogus subdivisionIds:\t" + "*" + "\t" + bogus.size() + "\t" + bogus); 369 for (Entry<Status, Collection<String>> entry : bogusStatus.asMap().entrySet()) { 370 System.out.println( 371 "SubdivisionId:\t\t" 372 + ":\t" 373 + entry.getKey() 374 + "\t" 375 + entry.getValue().size() 376 + "\t" 377 + entry.getValue()); 378 } 379 } 380 fixedFile( CLDRFile oldFileSubdivisions, Multimap<String, String> inverse)381 private static CLDRFile fixedFile( 382 CLDRFile oldFileSubdivisions, Multimap<String, String> inverse) { 383 CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed(); 384 385 // for fixing collisions 386 // we first add existing items 387 Set<String> toRemove = new HashSet<>(); 388 Map<String, String> toAdd = new HashMap<>(); 389 390 for (String path : fileSubdivisions) { 391 XPathParts parts = XPathParts.getFrozenInstance(path); 392 if (!"subdivision".equals(parts.getElement(-1))) { 393 continue; 394 } 395 String name = fileSubdivisions.getStringValue(path); 396 if (name.equals("Böyük Britaniya")) { 397 int debug = 0; 398 } 399 // handle aliases also 400 String type = parts.getAttributeValue(-1, "type"); 401 R2<List<String>, String> replacement = SUBDIVISION_ALIASES.get(type); 402 if (replacement != null) { 403 String fullPath = oldFileSubdivisions.getFullXPath(path); 404 XPathParts parts2 = XPathParts.getFrozenInstance(fullPath).cloneAsThawed(); 405 for (String replacementType : replacement.get0()) { 406 parts2.setAttribute(-1, "type", replacementType); 407 toRemove.add(path); 408 path = parts2.toString(); 409 toAdd.put(path, name); 410 System.out.println("Adding alias: " + replacementType + "«" + name + "»"); 411 break; 412 } 413 } 414 inverse.put(name, path); 415 } 416 fileSubdivisions.removeAll(toRemove, false); 417 for (Entry<String, String> entry2 : toAdd.entrySet()) { 418 fileSubdivisions.add(entry2.getKey(), entry2.getValue()); 419 } 420 return fileSubdivisions; 421 } 422 addExemplarFailures( M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, UnicodeSet exemplarFailures, String language, String subdivision, String name)423 private static void addExemplarFailures( 424 M4<Integer, String, String, String> exemplarFailureLangSubdivisionName, 425 UnicodeSet exemplarFailures, 426 String language, 427 String subdivision, 428 String name) { 429 for (String s : exemplarFailures) { 430 exemplarFailureLangSubdivisionName.put(s.codePointAt(0), language, subdivision, name); 431 } 432 } 433 fail( String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName)434 private static void fail( 435 String title, M4<Integer, String, String, String> exemplarFailureLangSubdivisionName) { 436 for (R4<Integer, String, String, String> entry : 437 exemplarFailureLangSubdivisionName.rows()) { 438 fail(title, entry.get1(), entry.get2(), entry.get3(), entry.get0()); 439 } 440 } 441 fail( String title, String lang, String subdivision, String name, int exemplarFailure)442 private static void fail( 443 String title, String lang, String subdivision, String name, int exemplarFailure) { 444 System.out.println( 445 title 446 + ":\t" 447 + lang 448 + "\t" 449 + subdivision 450 + "\t" 451 + (exemplarFailure < 0 ? "" : "«" + UTF16.valueOf(exemplarFailure) + "»") 452 + "\t" 453 + (exemplarFailure < 0 ? "" : "U+" + Utility.hex(exemplarFailure)) 454 + "\t" 455 + CldrUtility.ifNull(getBestWikiEnglishName(subdivision), "") 456 + "\t" 457 + CldrUtility.ifNull(name, "").replace("\"", """)); 458 } 459 460 static final List<String> MARKERS = 461 Arrays.asList( 462 "¹", "²", "³"); // if there are more than 3 of the same kind, throw exception 463 scriptsFor(UnicodeSet main)464 private static UnicodeSet scriptsFor(UnicodeSet main) { 465 UnicodeSet result = UnicodeSet.EMPTY; 466 for (String s : main) { 467 int scriptCode = UScript.getScript(s.codePointAt(0)); 468 if (scriptCode != UScript.COMMON || scriptCode != UScript.INHERITED) { 469 result = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, scriptCode); 470 if (scriptCode == UScript.LATIN) { 471 result.addAll("ʻ’&"); 472 } 473 break; 474 } 475 } 476 return result; 477 } 478 } 479