1 /** */ 2 package org.unicode.cldr.util; 3 4 import com.google.common.collect.BiMap; 5 import com.google.common.collect.HashBiMap; 6 import com.google.common.collect.HashMultimap; 7 import com.google.common.collect.ImmutableSet; 8 import com.google.common.collect.Multimap; 9 import com.google.common.collect.Multimaps; 10 import com.google.common.collect.TreeMultimap; 11 import com.ibm.icu.lang.UScript; 12 import com.ibm.icu.text.RuleBasedTransliterator; 13 import com.ibm.icu.text.Transliterator; 14 import com.ibm.icu.text.UnicodeFilter; 15 import com.ibm.icu.util.ICUUncheckedIOException; 16 import java.io.File; 17 import java.io.IOException; 18 import java.io.UncheckedIOException; 19 import java.io.Writer; 20 import java.util.Arrays; 21 import java.util.Collection; 22 import java.util.Collections; 23 import java.util.HashSet; 24 import java.util.LinkedHashSet; 25 import java.util.List; 26 import java.util.Locale; 27 import java.util.Map; 28 import java.util.Map.Entry; 29 import java.util.Set; 30 import java.util.TreeMap; 31 import java.util.TreeSet; 32 import java.util.regex.Matcher; 33 import java.util.regex.Pattern; 34 import java.util.stream.Collectors; 35 import org.unicode.cldr.tool.LikelySubtags; 36 import org.unicode.cldr.util.DiscreteComparator.Builder; 37 38 public class CLDRTransforms { 39 40 public static final String TRANSFORM_DIR = (CLDRPaths.COMMON_DIRECTORY + "transforms/"); 41 42 static final CLDRTransforms SINGLETON = new CLDRTransforms(); 43 44 private static final boolean PARANOID = true; 45 getInstance()46 public static CLDRTransforms getInstance() { 47 return SINGLETON; 48 } 49 getShowProgress()50 public Appendable getShowProgress() { 51 return showProgress; 52 } 53 setShowProgress(Appendable showProgress)54 public CLDRTransforms setShowProgress(Appendable showProgress) { 55 this.showProgress = showProgress; 56 return this; 57 } 58 59 final Set<String> overridden = new HashSet<>(); 60 // final DependencyOrder dependencyOrder = new 
DependencyOrder(); 61 62 // static public class RegexFindFilenameFilter implements FilenameFilter { 63 // Matcher matcher; 64 // 65 // public RegexFindFilenameFilter(Matcher filter) { 66 // matcher = filter; 67 // } 68 // 69 // @Override 70 // public boolean accept(File dir, String name) { 71 // return matcher.reset(name).find(); 72 // } 73 // } 74 75 /** 76 * @param dir TODO 77 * @param namesMatchingRegex TODO 78 * @param showProgress null if no progress needed 79 * @param skipDashTIds TODO 80 * @return 81 */ registerCldrTransforms( String dir, String namesMatchingRegex, Appendable showProgress, boolean keepDashTIds)82 public static void registerCldrTransforms( 83 String dir, String namesMatchingRegex, Appendable showProgress, boolean keepDashTIds) { 84 CLDRTransforms r = getInstance(); 85 if (dir == null) { 86 dir = TRANSFORM_DIR; 87 } 88 // reorder to preload some 89 r.showProgress = showProgress; 90 Set<String> ordered = getFileRegistrationOrder(dir); 91 92 if (namesMatchingRegex != null) { 93 Matcher filter = PatternCache.get(namesMatchingRegex).matcher(""); 94 ordered = 95 ordered.stream() 96 .filter(x -> filter.reset(x).matches()) 97 .collect(Collectors.toCollection(LinkedHashSet::new)); 98 // r.deregisterIcuTransliterators(filter); 99 // files = Arrays.asList(new File(TRANSFORM_DIR).list(new 100 // RegexFindFilenameFilter(filter))); 101 // ordered = r.dependencyOrder.getOrderedItems(files, filter, true); 102 } 103 104 // System.out.println(ordered); 105 for (String cldrFileName : ordered) { 106 r.registerTransliteratorsFromXML( 107 dir, cldrFileName, Collections.emptySet(), keepDashTIds); 108 } 109 Transliterator.registerAny(); // do this last! 
110 } 111 getAvailableIds()112 public static List<String> getAvailableIds() { 113 return Arrays.asList(new File(TRANSFORM_DIR).list()); 114 } 115 getOverriddenTransliterators()116 public Set<String> getOverriddenTransliterators() { 117 return Collections.unmodifiableSet(overridden); 118 } 119 120 static Transliterator fixup = Transliterator.getInstance("[:Mn:]any-hex/java"); 121 getInstance(String id)122 public Transliterator getInstance(String id) { 123 if (!overridden.contains(id)) { 124 throw new IllegalArgumentException("No overriden transform for " + id); 125 } 126 return Transliterator.getInstance(id); 127 } 128 129 public static Pattern TRANSFORM_ID_PATTERN = PatternCache.get("(.+)-([^/]+)(/(.*))?"); 130 getReverseInstance(String id)131 public Transliterator getReverseInstance(String id) { 132 Matcher matcher = TRANSFORM_ID_PATTERN.matcher(id); 133 if (!matcher.matches()) { 134 throw new IllegalArgumentException("**No transform for " + id); 135 } 136 return getInstance( 137 matcher.group(2) 138 + "-" 139 + matcher.group(1) 140 + (matcher.group(4) == null ? 
"" : "/" + matcher.group(4))); 141 } 142 143 private BiMap<String, String> displayNameToId = HashBiMap.create(); 144 getDisplayNameToId()145 public BiMap<String, String> getDisplayNameToId() { 146 return displayNameToId; 147 } 148 addDisplayNameToId(Map<String, String> ids2, ParsedTransformID directionInfo)149 private void addDisplayNameToId(Map<String, String> ids2, ParsedTransformID directionInfo) { 150 displayNameToId.put(directionInfo.getDisplayId(), directionInfo.toString()); 151 } 152 registerTransliteratorsFromXML( String dir, String cldrFileName, Set<String> cantSkip, boolean keepDashTIds)153 public String registerTransliteratorsFromXML( 154 String dir, String cldrFileName, Set<String> cantSkip, boolean keepDashTIds) { 155 ParsedTransformID directionInfo = new ParsedTransformID(); 156 String ruleString = getIcuRulesFromXmlFile(dir, cldrFileName, directionInfo); 157 158 String id = directionInfo.getId(); 159 addDisplayNameToId(displayNameToId, directionInfo); 160 161 if (directionInfo.getDirection() == Direction.both 162 || directionInfo.getDirection() == Direction.forward) { 163 for (String alias : directionInfo.getAliases()) { 164 if (!keepDashTIds && alias.contains("-t-")) { 165 continue; 166 } 167 Transliterator.unregister(alias); 168 Transliterator.registerAlias(alias, id); 169 } 170 internalRegister(id, ruleString, Transliterator.FORWARD); 171 } 172 if (directionInfo.getDirection() == Direction.both 173 || directionInfo.getDirection() == Direction.backward) { 174 for (String alias : directionInfo.getBackwardAliases()) { 175 if (!keepDashTIds && alias.contains("-t-")) { 176 continue; 177 } 178 Transliterator.unregister(alias); 179 Transliterator.registerAlias(alias, directionInfo.getBackwardId()); 180 } 181 internalRegister(id, ruleString, Transliterator.REVERSE); 182 } 183 return id; 184 } 185 186 /** 187 * Return Icu rules, and the direction info 188 * 189 * @param dir TODO 190 * @param cldrFileName 191 * @param directionInfo 192 * @return 193 */ 
getIcuRulesFromXmlFile( String dir, String cldrFileName, ParsedTransformID directionInfo)194 public static String getIcuRulesFromXmlFile( 195 String dir, String cldrFileName, ParsedTransformID directionInfo) { 196 final MyHandler myHandler = new MyHandler(cldrFileName, directionInfo); 197 XMLFileReader xfr = new XMLFileReader().setHandler(myHandler); 198 xfr.read( 199 dir + cldrFileName, 200 XMLFileReader.CONTENT_HANDLER | XMLFileReader.ERROR_HANDLER, 201 true); 202 return myHandler.getRules(); 203 } 204 internalRegister(String id, String ruleString, int direction)205 private void internalRegister(String id, String ruleString, int direction) { 206 if (direction == Transliterator.REVERSE) { 207 id = ParsedTransformID.reverse(id); 208 } 209 internalRegisterNoReverseId(id, ruleString, direction); 210 } 211 internalRegisterNoReverseId(String id, String ruleString, int direction)212 private void internalRegisterNoReverseId(String id, String ruleString, int direction) { 213 try { 214 Transliterator t = Transliterator.createFromRules(id, ruleString, direction); 215 overridden.add(id); 216 Transliterator oldTranslit = null; 217 if (showProgress != null) { 218 try { 219 oldTranslit = Transliterator.getInstance(id); 220 } catch (Exception e) { 221 } 222 } 223 Transliterator.unregister(id); 224 Transliterator.registerInstance(t); 225 226 if (PARANOID) { // for paranoid testing 227 String r1 = 228 CLDRTransforms.showTransliterator("", t, 9999, new StringBuilder()) 229 .toString(); 230 Transliterator t2 = Transliterator.getInstance(id); 231 String r2 = 232 CLDRTransforms.showTransliterator("", t2, 9999, new StringBuilder()) 233 .toString(); 234 if (!r1.equals(r2)) { 235 throw new IllegalArgumentException( 236 "Rules unequal\n" + ruleString + "$$$\n$$$" + r2); 237 } 238 } 239 // verifyNullFilter("halfwidth-fullwidth"); 240 if (showProgress != null) { 241 append( 242 "Registered new Transliterator: " 243 + id 244 + (oldTranslit == null ? 
"" : "\told:\t" + oldTranslit.getID()) 245 + '\n'); 246 if (id.startsWith("el-")) { 247 CLDRTransforms.showTransliterator("", t, 999); 248 Transliterator t2 = Transliterator.getInstance(id); 249 CLDRTransforms.showTransliterator("", t2, 999); 250 } 251 } 252 } catch (RuntimeException e) { 253 if (showProgress != null) { 254 e.printStackTrace(); 255 append( 256 "Couldn't register new Transliterator: " 257 + id 258 + "\t" 259 + e.getMessage() 260 + '\n'); 261 } else { 262 throw (IllegalArgumentException) 263 new IllegalArgumentException("Couldn't register new Transliterator: " + id) 264 .initCause(e); 265 } 266 } 267 } 268 269 Appendable showProgress; 270 append(String string)271 private void append(String string) { 272 try { 273 if (showProgress == null) { 274 return; 275 } 276 showProgress.append(string); 277 if (showProgress instanceof Writer) { 278 ((Writer) showProgress).flush(); 279 } 280 } catch (IOException e) { 281 throw new ICUUncheckedIOException(e); 282 } 283 } 284 appendln(String s)285 private void appendln(String s) { 286 append(s + "\n"); 287 } 288 289 // =================================== 290 291 // @SuppressWarnings("deprecation") 292 // public void registerFromIcuFormatFiles(String directory) throws IOException { 293 // 294 //// deregisterIcuTransliterators((Matcher) null); 295 // 296 // Matcher getId = PatternCache.get("\\s*(\\S*)\\s*\\{\\s*").matcher(""); 297 // Matcher getSource = 298 // PatternCache.get("\\s*(\\S*)\\s*\\{\\s*\\\"(.*)\\\".*").matcher(""); 299 // Matcher translitID = PatternCache.get("([^-]+)-([^/]+)+(?:[/](.+))?").matcher(""); 300 // 301 // Map<String, String> fixedIDs = new TreeMap<>(); 302 // Set<String> oddIDs = new TreeSet<>(); 303 // 304 // File dir = new File(directory); 305 // // get the list of files to take, and their directions 306 // BufferedReader input = FileUtilities.openUTF8Reader(directory, "root.txt"); 307 // String id = null; 308 // String filename = null; 309 // Map<String, String> aliasMap = new 
// LinkedHashMap<>();
    //
    //        // deregisterIcuTransliterators();
    //
    //        // do first, since others depend on theseregisterFromIcuFile
    //        /**
    //         * Special aliases.
    //         * Tone-Digit {
    //         * alias {"Pinyin-NumericPinyin"}
    //         * }
    //         * Digit-Tone {
    //         * alias {"NumericPinyin-Pinyin"}
    //         * }
    //         */
    //        // registerFromIcuFile("Latin-ConjoiningJamo", directory, null);
    //        // registerFromIcuFile("Pinyin-NumericPinyin", directory, null);
    //        // Transliterator.registerAlias("Tone-Digit", "Pinyin-NumericPinyin");
    //        // Transliterator.registerAlias("Digit-Tone", "NumericPinyin-Pinyin");
    //        // registerFromIcuFile("Fullwidth-Halfwidth", directory, null);
    //        // registerFromIcuFile("Hiragana-Katakana", directory, null);
    //        // registerFromIcuFile("Latin-Katakana", directory, null);
    //        // registerFromIcuFile("Hiragana-Latin", directory, null);
    //
    //        while (true) {
    //            String line = input.readLine();
    //            if (line == null) break;
    //            line = line.trim();
    //            if (line.startsWith("\uFEFF")) {
    //                line = line.substring(1);
    //            }
    //            if (line.startsWith("TransliteratorNamePattern")) break; // done
    //            // if (line.indexOf("Ethiopic") >= 0) {
    //            // appendln("Skipping Ethiopic");
    //            // continue;
    //            // }
    //            if (getId.reset(line).matches()) {
    //                String temp = getId.group(1);
    //                if (!temp.equals("file") && !temp.equals("internal")) id = temp;
    //                continue;
    //            }
    //            if (getSource.reset(line).matches()) {
    //                String operation = getSource.group(1);
    //                String source = getSource.group(2);
    //                if (operation.equals("alias")) {
    //                    aliasMap.put(id, source);
    //                    checkIdFix(id, fixedIDs, oddIDs, translitID);
    //                    id = null;
    //                } else if (operation.equals("resource:process(transliterator)")) {
    //                    filename = source;
    //                } else if (operation.equals("direction")) {
    //                    try {
    //                        if (id == null || filename == null) {
    //                            appendln("skipping: " + line);
    //                            continue;
    //                        }
    //                        if (filename.indexOf("InterIndic") >= 0 && filename.indexOf("Latin")
    // >= 0) {
    //                            // append("**" + id);
    //                        }
    //                        checkIdFix(id, fixedIDs, oddIDs, translitID);
    //
    //                        final int direction = source.equals("FORWARD") ?
    // Transliterator.FORWARD
    //                            : Transliterator.REVERSE;
    //                        registerFromIcuFile(id, directory, filename, direction);
    //
    //                        verifyNullFilter("halfwidth-fullwidth");
    //
    //                        id = null;
    //                        filename = null;
    //                    } catch (RuntimeException e) {
    //                        throw (RuntimeException) new IllegalArgumentException("Failed with " +
    // filename + ", " + source)
    //                            .initCause(e);
    //                    }
    //                } else {
    //                    append(dir + "root.txt unhandled line:" + line);
    //                }
    //                continue;
    //            }
    //            String trimmed = line.trim();
    //            if (trimmed.equals("")) continue;
    //            if (trimmed.equals("}")) continue;
    //            if (trimmed.startsWith("//")) continue;
    //            throw new IllegalArgumentException("Unhandled:" + line);
    //        }
    //
    //        final Set<String> rawIds = idToRules.keySet();
    //        Set<String> ordered = dependencyOrder.getOrderedItems(rawIds, null, false);
    //        ordered.retainAll(rawIds); // since we are in ID space, kick out anything that isn't
    //
    //        for (String id2 : ordered) {
    //            RuleDirection stuff = idToRules.get(id2);
    //            internalRegisterNoReverseId(id2, stuff.ruleString, stuff.direction);
    //            verifyNullFilter("halfwidth-fullwidth"); // TESTING
    //        }
    //
    //        for (Iterator<String> it = aliasMap.keySet().iterator(); it.hasNext();) {
    //            id = it.next();
    //            String source = aliasMap.get(id);
    //            Transliterator.unregister(id);
    //            Transliterator t = Transliterator.createFromRules(id, "::" + source + ";",
    // Transliterator.FORWARD);
    //            Transliterator.registerInstance(t);
    //            // verifyNullFilter("halfwidth-fullwidth");
    //            appendln("Registered new Transliterator Alias: " + id);
    //
416 // } 417 // appendln("Fixed IDs"); 418 // for (Iterator<String> it = fixedIDs.keySet().iterator(); it.hasNext();) { 419 // String id2 = it.next(); 420 // appendln("\t" + id2 + "\t" + fixedIDs.get(id2)); 421 // } 422 // appendln("Odd IDs"); 423 // for (Iterator<String> it = oddIDs.iterator(); it.hasNext();) { 424 // String id2 = it.next(); 425 // appendln("\t" + id2); 426 // } 427 // Transliterator.registerAny(); // do this last! 428 // } 429 430 Map<String, RuleDirection> idToRules = new TreeMap<>(); 431 432 private class RuleDirection { 433 String ruleString; 434 int direction; 435 RuleDirection(String ruleString, int direction)436 public RuleDirection(String ruleString, int direction) { 437 super(); 438 this.ruleString = ruleString; 439 this.direction = direction; 440 } 441 } 442 registerFromIcuFile(String id, String directory, String filename, int direction)443 private void registerFromIcuFile(String id, String directory, String filename, int direction) { 444 if (filename == null) { 445 filename = id.replace("-", "_").replace("/", "_") + ".txt"; 446 } 447 String ruleString = CldrUtility.getText(directory, filename); 448 idToRules.put(id, new RuleDirection(ruleString, direction)); 449 } 450 451 // private void registerFromIcuFile(String id, String dir, String filename) { 452 // registerFromIcuFile(id, dir, filename, Transliterator.FORWARD); 453 // registerFromIcuFile(id, dir, filename, Transliterator.REVERSE); 454 // } 455 checkIdFix( String id, Map<String, String> fixedIDs, Set<String> oddIDs, Matcher translitID)456 public void checkIdFix( 457 String id, Map<String, String> fixedIDs, Set<String> oddIDs, Matcher translitID) { 458 if (fixedIDs.containsKey(id)) return; 459 if (!translitID.reset(id).matches()) { 460 appendln("Can't fix: " + id); 461 fixedIDs.put(id, "?" 
+ id); 462 return; 463 } 464 String source1 = translitID.group(1); 465 String target1 = translitID.group(2); 466 String variant = translitID.group(3); 467 String source = fixID(source1); 468 String target = fixID(target1); 469 if (!source1.equals(source)) { 470 fixedIDs.put(source1, source); 471 } 472 if (!target1.equals(target)) { 473 fixedIDs.put(target1, target); 474 } 475 if (variant != null) { 476 oddIDs.add("variant: " + variant); 477 } 478 } 479 fixID(String source)480 static String fixID(String source) { 481 return source; // for now 482 } 483 484 // public void deregisterIcuTransliterators(Matcher filter) { 485 // // Remove all of the current registrations 486 // // first load into array, so we don't get sync problems. 487 // List<String> rawAvailable = new ArrayList<>(); 488 // for (Enumeration<String> en = Transliterator.getAvailableIDs(); en.hasMoreElements();) 489 // { 490 // final String id = en.nextElement(); 491 // if (filter != null && !filter.reset(id).matches()) { 492 // continue; 493 // } 494 // rawAvailable.add(id); 495 // } 496 // 497 // // deregisterIcuTransliterators(rawAvailable); 498 // 499 // Set<String> available = dependencyOrder.getOrderedItems(rawAvailable, filter, false); 500 // List<String> reversed = new LinkedList<>(); 501 // for (String item : available) { 502 // reversed.add(0, item); 503 // } 504 // // available.retainAll(rawAvailable); // remove the items we won't touch anyway 505 // // rawAvailable.removeAll(available); // now the ones whose order doesn't matter 506 // // deregisterIcuTransliterators(rawAvailable); 507 // deregisterIcuTransliterators(reversed); 508 // 509 // for (Enumeration<String> en = Transliterator.getAvailableIDs(); en.hasMoreElements();) 510 // { 511 // String oldId = en.nextElement(); 512 // append("Retaining: " + oldId + "\n"); 513 // } 514 // } 515 // 516 // public void deregisterIcuTransliterators(Collection<String> available) { 517 // for (String oldId : available) { 518 // Transliterator t; 519 // 
try { 520 // t = Transliterator.getInstance(oldId); 521 // } catch (IllegalArgumentException e) { 522 // if (e.getMessage().startsWith("Illegal ID")) { 523 // continue; 524 // } 525 // append("Failure with: " + oldId); 526 // t = Transliterator.getInstance(oldId); 527 // throw e; 528 // } catch (RuntimeException e) { 529 // append("Failure with: " + oldId); 530 // t = Transliterator.getInstance(oldId); 531 // throw e; 532 // } 533 // String className = t.getClass().getName(); 534 // if (className.endsWith(".CompoundTransliterator") 535 // || className.endsWith(".RuleBasedTransliterator") 536 // || className.endsWith(".AnyTransliterator")) { 537 // appendln("REMOVING: " + oldId); 538 // Transliterator.unregister(oldId); 539 // } else { 540 // appendln("Retaining: " + oldId + "\t\t" + className); 541 // } 542 // } 543 // } 544 545 public enum Direction { 546 backward, 547 both, 548 forward 549 } 550 551 public enum Visibility { 552 external, 553 internal 554 } 555 556 public static class ParsedTransformID { 557 public String source = "Any"; 558 public String target = "Any"; 559 public String variant; 560 protected String[] aliases = {}; 561 protected String[] backwardAliases = {}; 562 protected Direction direction = null; 563 protected Visibility visibility; 564 getId()565 public String getId() { 566 return getSource() 567 + "-" 568 + getTarget() 569 + (getVariant() == null ? "" : "/" + getVariant()); 570 } 571 getDisplayId()572 public String getDisplayId() { 573 return getDisplaySource() 574 + "-" 575 + getDisplayTarget() 576 + (getVariant() == null ? 
"" : "/" + getDisplayVariant()); 577 } 578 getDisplayVariant()579 private String getDisplayVariant() { 580 return getVariant(); 581 } 582 getDisplayTarget()583 private String getDisplayTarget() { 584 return getDisplaySourceOrTarget(getTarget()); 585 } 586 getDisplaySource()587 private String getDisplaySource() { 588 return getDisplaySourceOrTarget(getSource()); 589 } 590 getDisplaySourceOrTarget(String sourceOrTarget)591 private String getDisplaySourceOrTarget(String sourceOrTarget) { 592 int uscript = UScript.getCodeFromName(sourceOrTarget); 593 if (uscript >= 0) { 594 return UScript.getName(uscript); 595 } 596 if (sourceOrTarget.contains("FONIPA")) { 597 return "IPA"; 598 } 599 if (sourceOrTarget.equals("InterIndic")) { 600 return "Indic"; 601 } 602 try { 603 String name = CLDRConfig.getInstance().getEnglish().getName(sourceOrTarget); 604 return name; 605 } catch (Exception e) { 606 return sourceOrTarget; 607 } 608 } 609 610 static final LikelySubtags likely = new LikelySubtags(); 611 getScriptCode(String sourceOrTarget)612 public static String getScriptCode(String sourceOrTarget) { 613 int uscript = UScript.getCodeFromName(sourceOrTarget); 614 if (uscript >= 0) { 615 return UScript.getShortName(uscript); 616 } 617 if (sourceOrTarget.contains("FONIPA")) { 618 return "Ipa0"; 619 } 620 if (sourceOrTarget.equals("InterIndic")) { 621 return "Ind0"; 622 } 623 try { 624 String max = likely.maximize(sourceOrTarget); 625 return max == null ? null : new LanguageTagParser().set(max).getScript(); 626 } catch (Exception e) { 627 return null; 628 } 629 } 630 getBackwardId()631 public String getBackwardId() { 632 return getTarget() 633 + "-" 634 + getSource() 635 + (getVariant() == null ? 
"" : "/" + getVariant()); 636 } 637 ParsedTransformID()638 public ParsedTransformID() {} 639 set( String source, String target, String variant, Direction direction)640 public ParsedTransformID set( 641 String source, String target, String variant, Direction direction) { 642 this.source = source; 643 this.target = target; 644 this.variant = variant; 645 this.direction = direction; 646 return this; 647 } 648 set(String id)649 public ParsedTransformID set(String id) { 650 variant = null; 651 int pos = id.indexOf('-'); 652 if (pos < 0) { 653 source = "Any"; 654 target = id; 655 return this; 656 } 657 source = id.substring(0, pos); 658 int pos2 = id.indexOf('/', pos); 659 if (pos2 < 0) { 660 target = id.substring(pos + 1); 661 return this; 662 } 663 target = id.substring(pos + 1, pos2); 664 variant = id.substring(pos2 + 1); 665 return this; 666 } 667 reverse()668 public ParsedTransformID reverse() { 669 String temp = source; 670 source = target; 671 target = temp; 672 return this; 673 } 674 getTargetVariant()675 public String getTargetVariant() { 676 return target + (variant == null ? "" : "/" + variant); 677 } 678 getSourceVariant()679 public String getSourceVariant() { 680 return source + (variant == null ? 
"" : "/" + variant); 681 } 682 setDirection(Direction direction)683 protected void setDirection(Direction direction) { 684 this.direction = direction; 685 } 686 getDirection()687 public Direction getDirection() { 688 return direction; 689 } 690 setVariant(String variant)691 public void setVariant(String variant) { 692 this.variant = variant; 693 } 694 getVariant()695 protected String getVariant() { 696 return variant; 697 } 698 setTarget(String target)699 public void setTarget(String target) { 700 this.target = target; 701 } 702 getTarget()703 public String getTarget() { 704 return target; 705 } 706 setSource(String source)707 public void setSource(String source) { 708 this.source = source; 709 } 710 getSource()711 public String getSource() { 712 return source; 713 } 714 715 @Override toString()716 public String toString() { 717 return source + "-" + getTargetVariant(); 718 } 719 getId(String source, String target, String variant)720 public static String getId(String source, String target, String variant) { 721 String id = source + '-' + target; 722 if (variant != null) id += "/" + variant; 723 return id; 724 } 725 reverse(String id)726 public static String reverse(String id) { 727 return new ParsedTransformID().set(id).getBackwardId(); 728 } 729 setAliases(String[] aliases)730 public void setAliases(String[] aliases) { 731 this.aliases = aliases; 732 } 733 getAliases()734 public String[] getAliases() { 735 return aliases; 736 } 737 setBackwardAliases(String[] backwardAliases)738 public void setBackwardAliases(String[] backwardAliases) { 739 this.backwardAliases = backwardAliases; 740 } 741 getBackwardAliases()742 public String[] getBackwardAliases() { 743 return backwardAliases; 744 } 745 setVisibility(String string)746 protected void setVisibility(String string) { 747 visibility = Visibility.valueOf(string); 748 } 749 getVisibility()750 public Visibility getVisibility() { 751 return visibility; 752 } 753 } 754 755 /** 756 * Verify that if the transliterator 
exists, it has a null filter 757 * 758 * @param id 759 */ verifyNullFilter(String id)760 public static void verifyNullFilter(String id) { 761 Transliterator widen; 762 try { 763 widen = Transliterator.getInstance(id); 764 } catch (Exception e) { 765 return; 766 } 767 UnicodeFilter filter = widen.getFilter(); 768 if (filter != null) { 769 throw new IllegalArgumentException(id + " has non-empty filter: " + filter); 770 } 771 } 772 773 public static class MyHandler extends XMLFileReader.SimpleHandler { 774 boolean first = true; 775 ParsedTransformID directionInfo; 776 String cldrFileName; 777 StringBuilder rules = new StringBuilder(); 778 getRules()779 public String getRules() { 780 return rules.toString(); 781 } 782 MyHandler(String cldrFileName, ParsedTransformID directionInfo)783 public MyHandler(String cldrFileName, ParsedTransformID directionInfo) { 784 super(); 785 this.cldrFileName = cldrFileName; 786 this.directionInfo = directionInfo; 787 } 788 789 @Override handlePathValue(String path, String value)790 public void handlePathValue(String path, String value) { 791 if (first) { 792 if (path.startsWith("//supplementalData/version")) { 793 return; 794 } else if (path.startsWith("//supplementalData/generation")) { 795 return; 796 } 797 XPathParts parts = XPathParts.getFrozenInstance(path); 798 Map<String, String> attributes = parts.findAttributes("transform"); 799 if (attributes == null) { 800 throw new IllegalArgumentException( 801 "Not an XML transform file: " + cldrFileName + "\t" + path); 802 } 803 directionInfo.setSource(attributes.get("source")); 804 directionInfo.setTarget(attributes.get("target")); 805 directionInfo.setVariant(attributes.get("variant")); 806 directionInfo.setDirection( 807 Direction.valueOf(attributes.get("direction").toLowerCase(Locale.ENGLISH))); 808 809 String alias = attributes.get("alias"); 810 if (alias != null) { 811 directionInfo.setAliases(alias.trim().split("\\s+")); 812 } 813 814 String backwardAlias = 
attributes.get("backwardAlias"); 815 if (backwardAlias != null) { 816 directionInfo.setBackwardAliases(backwardAlias.trim().split("\\s+")); 817 } 818 819 directionInfo.setVisibility(attributes.get("visibility")); 820 first = false; 821 } 822 if (path.indexOf("/comment") >= 0) { 823 // skip 824 } else if (path.indexOf("/tRule") >= 0) { 825 value = fixup.transliterate(value); 826 rules.append(value).append(CldrUtility.LINE_SEPARATOR); 827 } else { 828 throw new IllegalArgumentException("Unknown element: " + path + "\t " + value); 829 } 830 } 831 } 832 833 static boolean ALREADY_REGISTERED = false; 834 /** 835 * Register just those transliterators that are different than ICU. TODO: check against the file 836 * system to make sure the list is accurate. 837 */ registerModified()838 public void registerModified() { 839 synchronized (CLDRTransforms.class) { 840 if (ALREADY_REGISTERED) { 841 return; 842 } 843 // NEW 844 registerTranslit("Lao-Latin", "ບ", "b"); 845 registerTranslit("Khmer-Latin", "ឥ", "ĕ"); 846 registerTranslit("Sinhala-Latin", "ක", "ka"); 847 registerTranslit("Japn-Latn", "譆", "aa"); 848 849 // MODIFIED 850 registerTranslit("Han-SpacedHan", "《", "«"); 851 registerTranslit("Greek-Latin", "΄", "´"); 852 registerTranslit("Hebrew-Latin", "־", "-"); 853 registerTranslit("Cyrillic-Latin", "ө", "ö"); 854 registerTranslit("Myanmar-Latin", "ဿ", "s"); 855 registerTranslit("Latin-Armenian", "’", "՚"); 856 857 registerTranslit("Interindic-Latin", "\uE070", ".", "\uE03C", "\u0323", "\uE04D", ""); 858 859 registerTranslit("Malayalam-Interindic", "ൺ", ""); 860 registerTranslit("Interindic-Malayalam", "", "ണ്"); 861 registerTranslit("Malayalam-Latin", "ൺ", "ṇ"); 862 863 registerTranslit("Devanagari-Interindic", "ॲ", "\uE084"); 864 registerTranslit("Devanagari-Latin", "ॲ", "æ"); 865 866 registerTranslit("Arabic-Latin", "؉", "‰"); 867 ALREADY_REGISTERED = true; 868 } 869 } 870 871 private static final ImmutableSet<String> noSkip = ImmutableSet.of(); 872 873 private static 
final boolean SHOW = false; 874 private static final boolean SHOW_FAILED_MATCHES = false; 875 876 /** Register a transliterator and verify that a sample changed value is accurate */ registerTranslit(String ID, String... sourcePairs)877 public void registerTranslit(String ID, String... sourcePairs) { 878 String internalId = registerTransliteratorsFromXML(TRANSFORM_DIR, ID, noSkip, true); 879 Transliterator.registerAny(); // do this last! 880 Transliterator t = null; 881 try { 882 t = Transliterator.getInstance(internalId); 883 } catch (Exception e) { 884 System.out.println("For " + ID + " (" + internalId + ")"); 885 e.printStackTrace(); 886 return; 887 } 888 testSourceTarget(t, sourcePairs); 889 } 890 showTransliterator(String prefix, Transliterator t, int limit)891 public static void showTransliterator(String prefix, Transliterator t, int limit) { 892 showTransliterator(prefix, t, limit, System.out); 893 System.out.flush(); 894 } 895 showTransliterator( String prefix, Transliterator t, int limit, T output)896 public static <T extends Appendable> T showTransliterator( 897 String prefix, Transliterator t, int limit, T output) { 898 if (!prefix.isEmpty()) { 899 prefix += " "; 900 } 901 try { 902 output.append(prefix + "ID:\t" + t.getID() + "\n"); 903 output.append(prefix + "Class:\t" + t.getClass().getName() + "\n"); 904 if (t.getFilter() != null) { 905 output.append(prefix + "Filter:\t" + t.getFilter().toPattern(false) + "\n"); 906 } 907 if (t instanceof RuleBasedTransliterator) { 908 RuleBasedTransliterator rbt = (RuleBasedTransliterator) t; 909 String[] rules = rbt.toRules(true).split("\n"); 910 int length = rules.length; 911 if (limit >= 0 && limit < length) length = limit; 912 output.append(prefix + "Rules:\n"); 913 prefix += "\t"; 914 for (int i = 0; i < length; ++i) { 915 output.append(prefix + rules[i] + "\n"); 916 } 917 } else { 918 Transliterator[] elements = t.getElements(); 919 if (elements[0] == t) { 920 output.append(prefix + "\tNonRuleBased\n"); 921 
return output; 922 } else { 923 prefix += "\t"; 924 for (int i = 0; i < elements.length; ++i) { 925 showTransliterator(prefix, elements[i], limit, output); 926 } 927 } 928 } 929 } catch (IOException e) { 930 throw new UncheckedIOException(e); 931 } 932 return output; 933 } 934 testSourceTarget(Transliterator t, String... sourcePairs)935 public static void testSourceTarget(Transliterator t, String... sourcePairs) { 936 for (int i = 0; i < sourcePairs.length; i += 2) { 937 String sourceTest = sourcePairs[i]; 938 String targetTest = sourcePairs[i + 1]; 939 String target = t.transform(sourceTest); 940 if (!target.equals(targetTest)) { 941 throw new IllegalArgumentException( 942 t.getID() 943 + " For " 944 + sourceTest 945 + ", expected " 946 + targetTest 947 + ", got " 948 + target); 949 } 950 } 951 } 952 953 /** 954 * Gets a transform from a script to Latin. for testing For a locale, use 955 * ExemplarUtilities.getScript(locale) to get the script 956 */ getTestingLatinScriptTransform(final String script)957 public static Transliterator getTestingLatinScriptTransform(final String script) { 958 String id; 959 960 switch (script) { 961 case "Latn": 962 return null; 963 case "Khmr": 964 id = "Khmr-Latn/UNGEGN"; 965 break; 966 case "Laoo": 967 id = "Laoo-Latn/UNGEGN"; 968 break; 969 case "Sinh": 970 id = "Sinh-Latn/UNGEGN"; 971 break; 972 case "Japn": 973 id = "Jpan-Latn"; 974 break; 975 case "Kore": 976 id = "Hangul-Latn"; 977 break; 978 case "Hant": 979 case "Hans": 980 id = "Han-Latn"; 981 break; 982 case "Olck": 983 id = "sat_Olck-sat_FONIPA"; // Latin IPA 984 break; 985 case "Cher": 986 id = "chr-chr_FONIPA"; 987 break; 988 default: 989 id = script + "-Latn"; 990 } 991 return Transliterator.getInstance(id); 992 } 993 994 /** 995 * Returns the set of all files that can be registered, in an order that makes sure that all 996 * dependencies are handled. That is, if X uses Y in its rules, then Y has to come before X. 
997 * 998 * <p>The problem is that when you build a transliterator from rules, and one of those rules is 999 * to call another transliterator X, it inserts the <b>currently</b> registered transliterator 1000 * into the transliterator being built. So whenever a transliterator X is changed, you have to 1001 * reregister every transliterator that calls X. Otherwise the old version of X sticks around in 1002 * those calling transliterators. So the order that you register transliterators is important! 1003 */ getFileRegistrationOrder(String dir)1004 public static Set<String> getFileRegistrationOrder(String dir) { 1005 if (dir == null) { 1006 dir = TRANSFORM_DIR; 1007 } 1008 List<String> files = getAvailableIds(); 1009 Multimap<String, String> fileToAliases = HashMultimap.create(); 1010 Multimap<String, String> fileToDependencies = TreeMultimap.create(); 1011 for (String file : files) { 1012 // Very simple test that depends on standard format 1013 // eg 1014 // ::[॑ ॒ ॔ ॓ ़ ँ-ः । ॥ ॰ ०-९ ॐ ॲ ऄ-ऋ ॠ ऌ ॡ ऍ-कक़ खख़ गग़ घ-जज़ झ-डड़ ढढ़ ण-फफ़ ब-यय़ 1015 // र-ह ऽ ॽ ा-ॄ ॢ ॣ ॅ-्]; 1016 // ::NFD; 1017 // ::Devanagari-InterIndic; 1018 // ::InterIndic-Latin; 1019 // ::NFC; 1020 ParsedTransformID directionInfo = new ParsedTransformID(); 1021 String ruleString = getIcuRulesFromXmlFile(dir, file, directionInfo); 1022 Set<String> others = new LinkedHashSet<>(); 1023 Set<String> order = 1024 ruleString 1025 .lines() 1026 .map(x -> x.trim()) 1027 .filter(x -> x.contains("::") && !x.trim().startsWith("#")) 1028 .map(x -> parseDoubleColon(x, others)) 1029 .collect(Collectors.toCollection(LinkedHashSet::new)); 1030 order.addAll(others); 1031 if (SHOW) { 1032 System.out.println(file + "=>" + order); 1033 } 1034 if (!order.isEmpty()) { 1035 fileToDependencies.putAll(file, order); 1036 } 1037 if (directionInfo.direction != Direction.backward) { // that is, forward or both 1038 fileToAliases.put(file, directionInfo.getId()); 1039 fileToAliases.putAll(file, 
Arrays.asList(directionInfo.getAliases())); 1040 if (SHOW) { 1041 System.out.println( 1042 "\t" 1043 + directionInfo.getId() 1044 + "\t" 1045 + Arrays.asList(directionInfo.getAliases())); 1046 } 1047 } 1048 if (directionInfo.direction != Direction.forward) { // that is, backward or both 1049 fileToAliases.put(file, directionInfo.getBackwardId()); 1050 fileToAliases.putAll(file, Arrays.asList(directionInfo.getBackwardAliases())); 1051 if (SHOW) { 1052 System.out.println( 1053 "\t" 1054 + directionInfo.getBackwardId() 1055 + "\t" 1056 + Arrays.asList(directionInfo.getBackwardAliases())); 1057 } 1058 } 1059 } 1060 TreeMultimap<String, String> aliasesToFile = 1061 Multimaps.invertFrom(fileToAliases, TreeMultimap.create()); 1062 Multimap<String, String> fileToDependentFiles = TreeMultimap.create(); 1063 1064 for (Entry<String, Collection<String>> entry : fileToDependencies.asMap().entrySet()) { 1065 Set<String> v = 1066 entry.getValue().stream() 1067 .filter(x -> aliasesToFile.containsKey(x)) 1068 .map(y -> aliasesToFile.get(y).first()) 1069 .collect(Collectors.toSet()); 1070 fileToDependentFiles.putAll(entry.getKey(), v); 1071 } 1072 Builder<String> comp = new DiscreteComparator.Builder<>(null); 1073 fileToDependentFiles.forEach( 1074 (x, y) -> { 1075 if (SHOW) { 1076 System.out.println(x + "=" + y); 1077 } 1078 comp.add(y, x); // put dependent earlier 1079 }); 1080 // .add("c", "d", "b", "a").add("m", "n", "d").get(); 1081 1082 DiscreteComparator<String> comp2 = comp.get(); 1083 Set<String> orderedDependents = new LinkedHashSet<>(comp2.getOrdering()); 1084 orderedDependents.retainAll( 1085 fileToDependentFiles.values()); // remove files that are not dependents 1086 Set<String> remainingFiles = new TreeSet<>(files); 1087 remainingFiles.removeAll(orderedDependents); 1088 orderedDependents.addAll(remainingFiles); 1089 if (SHOW_FAILED_MATCHES) { 1090 System.out.println(orderedDependents); 1091 } 1092 return ImmutableSet.copyOf(orderedDependents); 1093 } 1094 // fails 
// match: :: [:Latin:] fullwidth-halfwidth ();

    /**
     * Matches one complete {@code ::} rule line: an optional bracketed filter, an ID (group 1),
     * an optional parenthesized part holding its own optional filter and ID (group 2), the
     * terminating semicolon, and an optional trailing {@code #} comment.
     *
     * <p>The ID character class lists {@code /} several times; the repeats are redundant, so the
     * class accepts ASCII letters, digits, {@code /}, {@code _} and {@code -}.
     */
    static final Pattern TRANSLIT_FINDER =
            Pattern.compile(
                    "\\s*::\\s*" // leading "::"
                            + "(?:\\[[^\\]]+\\]\\s*)?" // optional [filter]
                            + "([A-Za-z0-9////_//-]*)?" // group 1: ID outside the parentheses
                            + "(?:" // optional "( ... )" part
                            + "\\s*\\("
                            + "(?:\\[[^\\]]+\\]\\s*)?" // optional [filter] inside the parentheses
                            + "([A-Za-z0-9////_//-]*)?" // group 2: ID inside the parentheses
                            + "\\s*\\)"
                            + ")?"
                            + "\\s*;\\s*(#.*)?"); // terminator and optional comment

    // Disabled manual check of the pattern:
    // static {
    // Matcher matcher = TRANSLIT_FINDER.matcher("::[:Latin:] fullwidth-halfwidth();");
    // System.out.println(matcher.matches());
    // }

    /**
     * Extracts the transform IDs named on a single {@code ::} rule line.
     *
     * @param x the rule line to examine
     * @param others receives the ID found inside the parentheses, when it is present and not
     *     blank
     * @return the ID found outside the parentheses, or the empty string when the line does not
     *     match {@link #TRANSLIT_FINDER} or that ID is missing or blank
     */
    static String parseDoubleColon(String x, Set<String> others) {
        final Matcher m = TRANSLIT_FINDER.matcher(x);
        if (!m.matches()) {
            // Not a line this pattern understands; optionally report it and yield no ID.
            if (SHOW_FAILED_MATCHES) {
                System.out.println("fails match: " + x);
            }
            return "";
        }

        final String idOutsideParens = m.group(1);
        final String idInsideParens = m.group(2);
        if (SHOW) {
            System.out.println("1: " + idOutsideParens + "\t2:" + idInsideParens);
        }

        final boolean hasInsideId = idInsideParens != null && !idInsideParens.isBlank();
        if (hasInsideId) {
            others.add(idInsideParens);
        }

        if (idOutsideParens == null || idOutsideParens.isBlank()) {
            return "";
        }
        return idOutsideParens;
    }
}