1 /* 2 ********************************************************************** 3 * Copyright (c) 2006-2007, Google and others. All Rights Reserved. 4 ********************************************************************** 5 * Author: Mark Davis 6 ********************************************************************** 7 */ 8 package org.unicode.cldr.util; 9 10 import com.ibm.icu.dev.util.UnicodeMap; 11 import com.ibm.icu.impl.Utility; 12 import com.ibm.icu.lang.UCharacter; 13 import com.ibm.icu.text.CanonicalIterator; 14 import com.ibm.icu.text.CollationElementIterator; 15 import com.ibm.icu.text.Collator; 16 import com.ibm.icu.text.Normalizer; 17 import com.ibm.icu.text.RawCollationKey; 18 import com.ibm.icu.text.RuleBasedCollator; 19 import com.ibm.icu.text.Transliterator; 20 import com.ibm.icu.text.UTF16; 21 import com.ibm.icu.text.UnicodeSet; 22 import com.ibm.icu.text.UnicodeSetIterator; 23 import com.ibm.icu.util.ULocale; 24 import java.util.ArrayList; 25 import java.util.Collection; 26 import java.util.Comparator; 27 import java.util.HashSet; 28 import java.util.Iterator; 29 import java.util.List; 30 import java.util.Map; 31 import java.util.Map.Entry; 32 import java.util.Set; 33 import java.util.TreeMap; 34 import java.util.TreeSet; 35 import org.unicode.cldr.util.VariantFolder.CaseVariantFolder; 36 37 public class CollationMapMaker { 38 39 public static final boolean SHOW_DEBUG = false; 40 41 /** 42 * Used to pick the "best" sample from a set for using as the canonical form. 43 * 44 * @author markdavis 45 */ 46 static class ExemplarComparator implements java.util.Comparator { 47 Comparator comp; 48 49 @Override compare(Object o1, Object o2)50 public int compare(Object o1, Object o2) { 51 String s1 = (String) o1; 52 String s2 = (String) o2; 53 int result; 54 55 // sort normalized first 56 int n1 = Normalizer.isNormalized(s1, Normalizer.DECOMP_COMPAT, 0) ? 0 : 1; 57 int n2 = Normalizer.isNormalized(s2, Normalizer.DECOMP_COMPAT, 0) ? 0 : 1; 58 if ((result = n1 - n2) != 0) { 59 return result; 60 } 61 62 // choose the shortest 63 result = UTF16.countCodePoint(s2) - UTF16.countCodePoint(s1); 64 if (result != 0) { // special fix to make zero be first 65 if (s1.length() == 0) return -1; 66 if (s2.length() == 0) return 1; 67 return result; 68 } 69 result = comp.compare(s1, s2); 70 if (result != 0) return result; 71 return s1.compareTo(s2); 72 } 73 ExemplarComparator(Comparator comp)74 public ExemplarComparator(Comparator comp) { 75 this.comp = comp; 76 } 77 } 78 79 public static class Folder implements Cloneable { 80 private UnicodeMap m = new UnicodeMap(); 81 82 @Override clone()83 public Object clone() { 84 try { 85 Folder result = (Folder) super.clone(); 86 result.m = m.cloneAsThawed(); 87 return result; 88 } catch (CloneNotSupportedException e) { 89 throw new InternalError("Clone problem"); 90 } 91 } 92 getValue(int i)93 public Object getValue(int i) { 94 return m.getValue(i); 95 } 96 keySet()97 public UnicodeSet keySet() { 98 return m.keySet(); 99 } 100 put(int cp, String result)101 public Folder put(int cp, String result) { 102 m.put(cp, result); 103 return this; 104 } 105 putAll(UnicodeSet set, String result)106 public Folder putAll(UnicodeSet set, String result) { 107 m.putAll(set, result); 108 return this; 109 } 110 getCharactersMapped()111 public UnicodeSet getCharactersMapped() { 112 return m.keySet(); 113 } 114 minimalize()115 public void minimalize() { 116 UnicodeMap newMap = (m.cloneAsThawed()); 117 118 for (UnicodeSetIterator i = new UnicodeSetIterator(m.keySet()); i.next(); ) { 119 String s = (String) m.getValue(i.codepoint); 120 newMap.put(i.codepoint, null); 121 String t = newMap.transform(newMap.transform(i.getString())); 122 if (!t.equals(s)) { // restore 123 newMap.put(i.codepoint, s); 124 } 125 } 126 } 127 complete()128 public void complete() { 129 while (fixNeeded()) 130 ; 131 } 132 fixNeeded()133 public boolean fixNeeded() { 134 UnicodeMap newMap = new UnicodeMap(); 135 UnicodeSet values = m.keySet(); 136 boolean changed = false; 137 for (UnicodeSetIterator i = new UnicodeSetIterator(values); i.next(); ) { 138 String result = (String) m.getValue(i.codepoint); 139 String newResult = fold(result); 140 if (!newResult.equals(result)) { 141 changed = true; 142 if (SHOW_DEBUG) { 143 System.out.println(i.getString()); 144 System.out.println("->\t" + result); 145 System.out.println("=>\t" + newResult); 146 } 147 } 148 newMap.put(i.codepoint, newResult); 149 } 150 m = newMap; 151 return changed; 152 } 153 fold(String s)154 public String fold(String s) { 155 return m.transform(s); 156 } 157 158 /** 159 * Re 160 * 161 * @param toRemove 162 * @param addIdentity TODO 163 * @param old 164 * @return 165 */ removeEquals(Folder toRemove, boolean addIdentity)166 Folder removeEquals(Folder toRemove, boolean addIdentity) { 167 UnicodeMap result = new UnicodeMap(); 168 for (int i = 0; i <= 0x10FFFF; ++i) { 169 Object x = m.getValue(i); 170 Object y = toRemove.m.getValue(i); 171 if (!UnicodeMap.areEqual(x, y)) { 172 if (x != null) { 173 result.put(i, x); 174 } else if (addIdentity) { 175 result.put(i, UTF16.valueOf(i)); // have to add mapping 176 } 177 } 178 } 179 m = result; 180 return this; 181 } 182 183 static Transliterator FullWidthKana = 184 Transliterator.getInstance( 185 "fullwidth-halfwidth; [[:script=katakana:][:script=hangul:]] halfwidth-fullwidth; katakana-hiragana"); 186 getSpecialFolded(String a)187 static String getSpecialFolded(String a) { 188 String result = a; 189 result = Normalizer.normalize(result, Normalizer.NFKC); 190 result = FullWidthKana.transliterate(result); 191 result = UCharacter.foldCase(result, true); 192 result = Normalizer.normalize(result, Normalizer.NFKC); 193 return result; 194 } 195 getUnicodeMap()196 public UnicodeMap getUnicodeMap() { 197 return m.cloneAsThawed(); 198 } 199 } 200 201 static final boolean showDetails = false; 202 static final RuleBasedCollator uca = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 203 static final UnicodeSet filteredChars = 204 new UnicodeSet( 205 "[{ss}[^[:Co:][:Cf:][:Cc:][:Cn:][:Cs:][:script=Han:][:script=Hangul:]-[:nfkcquickcheck=no:]]]") 206 .freeze(); // skip 207 // a 208 // bunch 209 // of 210 // stuff, 211 // but 212 // include 213 // the 214 // items 215 // that 216 // are 217 // not 218 // nfkc 219 220 RuleBasedCollator equivalenceClassCollator; 221 Comparator exemplarComparator; 222 Map<String, String> reasonMap = new TreeMap(); 223 XEquivalenceMap equivMap = new XEquivalenceMap(); 224 generateCollatorFolding( RuleBasedCollator equivalenceClassCollator, Map<CharSequence, String> mapping)225 public Map<CharSequence, String> generateCollatorFolding( 226 RuleBasedCollator equivalenceClassCollator, Map<CharSequence, String> mapping) { 227 this.equivalenceClassCollator = equivalenceClassCollator; 228 try { 229 RuleBasedCollator exemplarCollator = 230 (RuleBasedCollator) equivalenceClassCollator.clone(); 231 exemplarCollator.setStrength(Collator.IDENTICAL); 232 exemplarCollator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); 233 exemplarCollator.setUpperCaseFirst(false); 234 exemplarCollator.setAlternateHandlingShifted(false); 235 exemplarCollator.setNumericCollation(false); 236 exemplarCollator.setCaseLevel(false); 237 exemplarCollator.setFrenchCollation(false); 238 exemplarCollator.setHiraganaQuaternary(false); 239 exemplarComparator = new ExemplarComparator(exemplarCollator); 240 } catch (CloneNotSupportedException e) { 241 } // will never happen 242 243 UnicodeSet expansions = new UnicodeSet(); 244 UnicodeSet contractions = new UnicodeSet(); 245 try { 246 equivalenceClassCollator.getContractionsAndExpansions(contractions, expansions, true); 247 } catch (Exception e) { 248 } 249 250 UnicodeSet trialCharacters = 251 new UnicodeSet(filteredChars) 252 .addAll(equivalenceClassCollator.getTailoredSet()) 253 .addAll(contractions) 254 .addAll(expansions); 255 256 for (UnicodeSetIterator i = new UnicodeSetIterator(trialCharacters); i.next(); ) { 257 String item = i.getString(); 258 addItems(item); 259 } 260 261 for (UnicodeSetIterator i = new UnicodeSetIterator(getFullExpansions()); i.next(); ) { 262 String item = i.getString(); 263 addItems(item); 264 } 265 266 closeUnderFolding(); 267 268 if (showDetails) Log.getLog().println("Printing Values: " + equivMap.size()); 269 int count = 0; 270 int countMapped = 0; 271 272 // Now process the results to figure out what we need to keep 273 Set<String> values = new TreeSet(exemplarComparator); 274 275 for (Iterator it = equivMap.iterator(); it.hasNext(); ) { 276 Set<String> values1 = (Set) it.next(); 277 // if there is only one result, drop it 278 if (values1.size() == 1) { 279 if (false && SHOW_DEBUG) { 280 String item = values1.iterator().next(); 281 System.out.println( 282 "Skipping: " 283 + item 284 + "\t" 285 + equivalenceClassCollator.getRawCollationKey(item, null)); 286 } 287 continue; 288 } 289 values.clear(); 290 values.addAll(values1); 291 // if (showDetails) Log.getLog().println(bf.showSetNames(values)); 292 Iterator chars = values.iterator(); 293 // the lowest value is the exemplar value, so use it as the base 294 String target = (String) chars.next(); 295 if (SHOW_DEBUG) { 296 System.out.println( 297 "Target: <" 298 + target 299 + ">\t " 300 + Utility.hex(target) 301 + "\t" 302 + equivalenceClassCollator.getRawCollationKey(target, null)); 303 } 304 while (chars.hasNext()) { 305 String source = (String) chars.next(); 306 mapping.put(source, target); 307 if (SHOW_DEBUG) { 308 System.out.println( 309 "\tSource: <" 310 + source 311 + ">\t " 312 + Utility.hex(source) 313 + "\t" 314 + equivalenceClassCollator.getRawCollationKey(source, null)); 315 } 316 } 317 // for (Iterator it2 = values.iterator(); it.hasNext();) { 318 // bf. 319 // } 320 count++; 321 countMapped += values.size() - 1; 322 } 323 return mapping; 324 // // convert mapping to UnicodeMap 325 // Set problems = new TreeSet(); 326 // Folder folder = new Folder(); 327 // //folder.put('\u2215', "/"); 328 // 329 // for (Iterator it = mapping.keySet().iterator(); it.hasNext();) { 330 // String source = (String) it.next(); 331 // folder.put(UTF16.charAt(source, 0), (String) mapping.get(source)); 332 // } 333 // // redo folder until it stabilizes 334 // // folder.complete(); 335 // 336 // for (Iterator it = problems.iterator(); it.hasNext();) { 337 // String source = (String) it.next(); 338 // String target = folder.fold((String) mapping.get(source)); 339 // String other = folder.fold(source); 340 // if (target.equals(other)) { 341 // it.remove(); 342 // } else { 343 // if (showDetails) Log.getLog().println("Couldn't map source " + Utility.escape(source) 344 // + "\t" + Utility.escape(target) + "\t" + Utility.escape(other)); 345 // } 346 // } 347 // if (showDetails && problems.size() != 0) { 348 // Log.getLog().println("Problems"); 349 // for (Iterator it = problems.iterator(); it.hasNext();) { 350 // String item = (String) it.next(); 351 // //Log.getLog().println(bf.showSetNames(item)); 352 // //Log.getLog().println("\t-" + bf.showSetNames(reasonMap.get(item))); 353 // } 354 // } 355 // 356 // if (showDetails) Log.getLog().println( "\tEquivalence Classes:\t" + count + "\tMapped 357 // characters:\t" + 358 // countMapped + "\tProblems:\t" + problems.size()); 359 // return folder; 360 } 361 362 VariantFolder caseFolder = new VariantFolder(new CaseVariantFolder()); 363 364 VariantFolder.AlternateFetcher COLLATION_FETCHER = 365 new VariantFolder.AlternateFetcher() { 366 @Override 367 public Set<String> getAlternates(String item, Set<String> output) { 368 output.add(item); 369 Set set = equivMap.getEquivalences(item); 370 if (set != null) { 371 output.addAll(set); 372 } 373 return output; 374 } 375 }; 376 closeUnderFolding()377 private void closeUnderFolding() { 378 if (false) return; 379 // TODO Generalize 380 Set<String> others = new HashSet<>(); 381 List<Collection<String>> input = new ArrayList<>(); 382 VariantFolder recursiveFolder = new VariantFolder(COLLATION_FETCHER); 383 TreeSet<CharSequence> hack = new TreeSet(); 384 hack.add("aa"); 385 386 while (true) { 387 others.clear(); 388 for (CharSequence item : hack) { // seenSoFar 389 if (item.length() == 1) { 390 continue; 391 } 392 String str = item.toString(); 393 if (UTF16.countCodePoint(str) <= 1) { 394 continue; 395 } 396 Set<String> results = recursiveFolder.getClosure(item.toString()); 397 results.removeAll(seenSoFar); 398 Log.logln(item + "\t" + results); 399 others.addAll(results); 400 } 401 if (others.size() == 0) { 402 break; 403 } 404 for (String item : others) { 405 addToEquiv(item, item); 406 } 407 } 408 } 409 410 private static UnicodeSet fullExpansions = null; 411 getFullExpansions()412 static UnicodeSet getFullExpansions() { 413 if (fullExpansions == null) addExpansionResults(fullExpansions = new UnicodeSet()); 414 return fullExpansions; 415 } 416 addExpansionResults(UnicodeSet fullExpansions)417 private static UnicodeSet addExpansionResults(UnicodeSet fullExpansions) { 418 StringBuffer trialString = new StringBuffer(); 419 Map stringToKey = new TreeMap(); 420 Map keyToString = new TreeMap(); 421 Set nfkc = new HashSet(); 422 for (int i = 0; i < 0x10FFFF; ++i) { 423 int cat = UCharacter.getType(i); 424 if (cat == UCharacter.UNASSIGNED 425 || cat == UCharacter.PRIVATE_USE 426 || cat == UCharacter.SURROGATE) continue; 427 String source = UTF16.valueOf(i); 428 nfkc.add(Normalizer.compose(source, true)); 429 430 CollationElementIterator x = uca.getCollationElementIterator(source); 431 trialString.setLength(0); 432 while (true) { 433 int element = x.next(); 434 if (element == CollationElementIterator.NULLORDER) break; 435 char primaryOrder = (char) CollationElementIterator.primaryOrder(element); 436 if (primaryOrder == 0) continue; 437 trialString.append(primaryOrder); 438 } 439 if (trialString.length() == 0) continue; 440 String key = trialString.toString(); 441 stringToKey.put(source, key); 442 String newSource = (String) keyToString.get(key); 443 if (newSource == null) { 444 keyToString.put(key, source); 445 } 446 } 447 UnicodeSet expansions = new UnicodeSet(); 448 UnicodeSet contractions = new UnicodeSet(); 449 try { 450 uca.getContractionsAndExpansions(contractions, expansions, true); 451 } catch (Exception e1) { 452 throw new IllegalArgumentException(e1); 453 } 454 455 fullExpansions = new UnicodeSet(); 456 global: 457 for (UnicodeSetIterator usi = new UnicodeSetIterator(expansions); usi.next(); ) { 458 trialString.setLength(0); 459 String source = usi.getString(); 460 String key = (String) stringToKey.get(source); 461 if (key == null || key.length() == 1) continue; 462 main: 463 while (key.length() > 0) { 464 for (Iterator it = keyToString.entrySet().iterator(); it.hasNext(); ) { 465 Entry e = (Entry) it.next(); 466 String otherKey = (String) e.getKey(); 467 if (key.startsWith(otherKey)) { 468 trialString.append((String) e.getValue()); 469 key = key.substring(otherKey.length()); 470 continue main; 471 } 472 } 473 System.out.println("Failed with: " + source); 474 continue global; 475 } 476 String result = trialString.toString(); 477 if (contractions.contains(result) || nfkc.contains(result)) { 478 continue global; 479 } 480 if (SHOW_DEBUG & false) { 481 System.out.println("Adding: " + usi.getString() + "\t=>\t" + trialString); 482 } 483 fullExpansions.add(result); 484 } 485 fullExpansions.freeze(); 486 return fullExpansions; 487 } 488 489 CanonicalIterator canonicalIterator = new CanonicalIterator(""); 490 491 /** 492 * Adds items, looking for all canonically equivalent strings as well. 493 * 494 * @param item 495 */ addItems(String item)496 private void addItems(String item) { 497 addItems2(item, item); 498 String minNFKD = getMinimalNKFD(item, equivalenceClassCollator); 499 if (!minNFKD.equals(item)) { 500 addItems2(minNFKD, item); 501 } 502 canonicalIterator.setSource(item); 503 for (String nextItem = canonicalIterator.next(); 504 nextItem != null; 505 nextItem = canonicalIterator.next()) { 506 addItems2(nextItem, item); 507 } 508 } 509 510 /** 511 * Adds items, looking for all case-equivalent strings as well. 512 * 513 * @param item 514 * @param original 515 */ addItems2(String item, String original)516 private void addItems2(String item, String original) { 517 addItems3(item, original); 518 for (String nextItem : caseFolder.getClosure(item)) { 519 addItems3(nextItem, original); 520 } 521 } 522 addItems3(String item, String original)523 private void addItems3(String item, String original) { 524 addToEquiv(item, original); 525 canonicalIterator.setSource(item); 526 for (String newItem = canonicalIterator.next(); 527 newItem != null; 528 newItem = canonicalIterator.next()) { 529 addToEquiv(newItem, original); 530 } 531 } 532 533 Set<CharSequence> seenSoFar = new TreeSet<>(); 534 addToEquiv(String item, String original)535 private void addToEquiv(String item, String original) { 536 if (item.equals("aA")) { 537 System.out.println("ouch"); 538 } 539 if (seenSoFar.contains(item)) { 540 return; 541 } 542 seenSoFar.add(item); 543 // String norm = Normalizer.compose(item, true); 544 // if (UTF16.countCodePoint(norm) < UTF16.countCodePoint(item)) { 545 // item = norm; 546 // } 547 RawCollationKey k = equivalenceClassCollator.getRawCollationKey(item, null); 548 equivMap.add(item, k); 549 reasonMap.put(item, original); 550 } 551 552 static UnicodeSet spaceTatweelAndNSM = new UnicodeSet("[\\u0020\\u0640[:Mn:][:Me:]]"); 553 static UnicodeSet NSM = new UnicodeSet("[[:Mn:][:Me:]]"); 554 555 /** 556 * Return the minimal NFKD string that has the same uca key 557 * 558 * @param item 559 * @param k 560 * @param ucaWeak 561 * @return 562 */ getMinimalNKFD(String item, Collator ucaWeak)563 private String getMinimalNKFD(String item, Collator ucaWeak) { 564 String nfkd = com.ibm.icu.text.Normalizer.decompose(item, true); 565 if (nfkd.startsWith(" ")) { 566 if (spaceTatweelAndNSM.containsAll(nfkd)) { 567 return item; // fails 568 } 569 } 570 String start = ""; 571 String end = nfkd; 572 while (end.length() != 0) { 573 int cp = UTF16.charAt(end, 0); 574 String tryEnd = end.substring(UTF16.getCharCount(cp)); 575 String trial = start + tryEnd; 576 if (!ucaWeak.equals(trial, item)) { // retain character 577 start += UTF16.valueOf(cp); 578 } 579 end = tryEnd; 580 } 581 return start; 582 } 583 } 584