xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/CollationMapMaker.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 /*
2  **********************************************************************
3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
4  **********************************************************************
5  * Author: Mark Davis
6  **********************************************************************
7  */
8 package org.unicode.cldr.util;
9 
10 import com.ibm.icu.dev.util.UnicodeMap;
11 import com.ibm.icu.impl.Utility;
12 import com.ibm.icu.lang.UCharacter;
13 import com.ibm.icu.text.CanonicalIterator;
14 import com.ibm.icu.text.CollationElementIterator;
15 import com.ibm.icu.text.Collator;
16 import com.ibm.icu.text.Normalizer;
17 import com.ibm.icu.text.RawCollationKey;
18 import com.ibm.icu.text.RuleBasedCollator;
19 import com.ibm.icu.text.Transliterator;
20 import com.ibm.icu.text.UTF16;
21 import com.ibm.icu.text.UnicodeSet;
22 import com.ibm.icu.text.UnicodeSetIterator;
23 import com.ibm.icu.util.ULocale;
24 import java.util.ArrayList;
25 import java.util.Collection;
26 import java.util.Comparator;
27 import java.util.HashSet;
28 import java.util.Iterator;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.Map.Entry;
32 import java.util.Set;
33 import java.util.TreeMap;
34 import java.util.TreeSet;
35 import org.unicode.cldr.util.VariantFolder.CaseVariantFolder;
36 
37 public class CollationMapMaker {
38 
39     public static final boolean SHOW_DEBUG = false;
40 
41     /**
42      * Used to pick the "best" sample from a set for using as the canonical form.
43      *
44      * @author markdavis
45      */
46     static class ExemplarComparator implements java.util.Comparator {
47         Comparator comp;
48 
49         @Override
compare(Object o1, Object o2)50         public int compare(Object o1, Object o2) {
51             String s1 = (String) o1;
52             String s2 = (String) o2;
53             int result;
54 
55             // sort normalized first
56             int n1 = Normalizer.isNormalized(s1, Normalizer.DECOMP_COMPAT, 0) ? 0 : 1;
57             int n2 = Normalizer.isNormalized(s2, Normalizer.DECOMP_COMPAT, 0) ? 0 : 1;
58             if ((result = n1 - n2) != 0) {
59                 return result;
60             }
61 
62             // choose the shortest
63             result = UTF16.countCodePoint(s2) - UTF16.countCodePoint(s1);
64             if (result != 0) { // special fix to make zero be first
65                 if (s1.length() == 0) return -1;
66                 if (s2.length() == 0) return 1;
67                 return result;
68             }
69             result = comp.compare(s1, s2);
70             if (result != 0) return result;
71             return s1.compareTo(s2);
72         }
73 
ExemplarComparator(Comparator comp)74         public ExemplarComparator(Comparator comp) {
75             this.comp = comp;
76         }
77     }
78 
79     public static class Folder implements Cloneable {
80         private UnicodeMap m = new UnicodeMap();
81 
82         @Override
clone()83         public Object clone() {
84             try {
85                 Folder result = (Folder) super.clone();
86                 result.m = m.cloneAsThawed();
87                 return result;
88             } catch (CloneNotSupportedException e) {
89                 throw new InternalError("Clone problem");
90             }
91         }
92 
getValue(int i)93         public Object getValue(int i) {
94             return m.getValue(i);
95         }
96 
keySet()97         public UnicodeSet keySet() {
98             return m.keySet();
99         }
100 
put(int cp, String result)101         public Folder put(int cp, String result) {
102             m.put(cp, result);
103             return this;
104         }
105 
putAll(UnicodeSet set, String result)106         public Folder putAll(UnicodeSet set, String result) {
107             m.putAll(set, result);
108             return this;
109         }
110 
getCharactersMapped()111         public UnicodeSet getCharactersMapped() {
112             return m.keySet();
113         }
114 
minimalize()115         public void minimalize() {
116             UnicodeMap newMap = (m.cloneAsThawed());
117 
118             for (UnicodeSetIterator i = new UnicodeSetIterator(m.keySet()); i.next(); ) {
119                 String s = (String) m.getValue(i.codepoint);
120                 newMap.put(i.codepoint, null);
121                 String t = newMap.transform(newMap.transform(i.getString()));
122                 if (!t.equals(s)) { // restore
123                     newMap.put(i.codepoint, s);
124                 }
125             }
126         }
127 
complete()128         public void complete() {
129             while (fixNeeded())
130                 ;
131         }
132 
fixNeeded()133         public boolean fixNeeded() {
134             UnicodeMap newMap = new UnicodeMap();
135             UnicodeSet values = m.keySet();
136             boolean changed = false;
137             for (UnicodeSetIterator i = new UnicodeSetIterator(values); i.next(); ) {
138                 String result = (String) m.getValue(i.codepoint);
139                 String newResult = fold(result);
140                 if (!newResult.equals(result)) {
141                     changed = true;
142                     if (SHOW_DEBUG) {
143                         System.out.println(i.getString());
144                         System.out.println("->\t" + result);
145                         System.out.println("=>\t" + newResult);
146                     }
147                 }
148                 newMap.put(i.codepoint, newResult);
149             }
150             m = newMap;
151             return changed;
152         }
153 
fold(String s)154         public String fold(String s) {
155             return m.transform(s);
156         }
157 
158         /**
159          * Re
160          *
161          * @param toRemove
162          * @param addIdentity TODO
163          * @param old
164          * @return
165          */
removeEquals(Folder toRemove, boolean addIdentity)166         Folder removeEquals(Folder toRemove, boolean addIdentity) {
167             UnicodeMap result = new UnicodeMap();
168             for (int i = 0; i <= 0x10FFFF; ++i) {
169                 Object x = m.getValue(i);
170                 Object y = toRemove.m.getValue(i);
171                 if (!UnicodeMap.areEqual(x, y)) {
172                     if (x != null) {
173                         result.put(i, x);
174                     } else if (addIdentity) {
175                         result.put(i, UTF16.valueOf(i)); // have to add mapping
176                     }
177                 }
178             }
179             m = result;
180             return this;
181         }
182 
183         static Transliterator FullWidthKana =
184                 Transliterator.getInstance(
185                         "fullwidth-halfwidth; [[:script=katakana:][:script=hangul:]] halfwidth-fullwidth; katakana-hiragana");
186 
getSpecialFolded(String a)187         static String getSpecialFolded(String a) {
188             String result = a;
189             result = Normalizer.normalize(result, Normalizer.NFKC);
190             result = FullWidthKana.transliterate(result);
191             result = UCharacter.foldCase(result, true);
192             result = Normalizer.normalize(result, Normalizer.NFKC);
193             return result;
194         }
195 
getUnicodeMap()196         public UnicodeMap getUnicodeMap() {
197             return m.cloneAsThawed();
198         }
199     }
200 
201     static final boolean showDetails = false;
202     static final RuleBasedCollator uca = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
203     static final UnicodeSet filteredChars =
204             new UnicodeSet(
205                             "[{ss}[^[:Co:][:Cf:][:Cc:][:Cn:][:Cs:][:script=Han:][:script=Hangul:]-[:nfkcquickcheck=no:]]]")
206                     .freeze(); // skip
207     // a
208     // bunch
209     // of
210     // stuff,
211     // but
212     // include
213     // the
214     // items
215     // that
216     // are
217     // not
218     // nfkc
219 
220     RuleBasedCollator equivalenceClassCollator;
221     Comparator exemplarComparator;
222     Map<String, String> reasonMap = new TreeMap();
223     XEquivalenceMap equivMap = new XEquivalenceMap();
224 
generateCollatorFolding( RuleBasedCollator equivalenceClassCollator, Map<CharSequence, String> mapping)225     public Map<CharSequence, String> generateCollatorFolding(
226             RuleBasedCollator equivalenceClassCollator, Map<CharSequence, String> mapping) {
227         this.equivalenceClassCollator = equivalenceClassCollator;
228         try {
229             RuleBasedCollator exemplarCollator =
230                     (RuleBasedCollator) equivalenceClassCollator.clone();
231             exemplarCollator.setStrength(Collator.IDENTICAL);
232             exemplarCollator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
233             exemplarCollator.setUpperCaseFirst(false);
234             exemplarCollator.setAlternateHandlingShifted(false);
235             exemplarCollator.setNumericCollation(false);
236             exemplarCollator.setCaseLevel(false);
237             exemplarCollator.setFrenchCollation(false);
238             exemplarCollator.setHiraganaQuaternary(false);
239             exemplarComparator = new ExemplarComparator(exemplarCollator);
240         } catch (CloneNotSupportedException e) {
241         } // will never happen
242 
243         UnicodeSet expansions = new UnicodeSet();
244         UnicodeSet contractions = new UnicodeSet();
245         try {
246             equivalenceClassCollator.getContractionsAndExpansions(contractions, expansions, true);
247         } catch (Exception e) {
248         }
249 
250         UnicodeSet trialCharacters =
251                 new UnicodeSet(filteredChars)
252                         .addAll(equivalenceClassCollator.getTailoredSet())
253                         .addAll(contractions)
254                         .addAll(expansions);
255 
256         for (UnicodeSetIterator i = new UnicodeSetIterator(trialCharacters); i.next(); ) {
257             String item = i.getString();
258             addItems(item);
259         }
260 
261         for (UnicodeSetIterator i = new UnicodeSetIterator(getFullExpansions()); i.next(); ) {
262             String item = i.getString();
263             addItems(item);
264         }
265 
266         closeUnderFolding();
267 
268         if (showDetails) Log.getLog().println("Printing Values: " + equivMap.size());
269         int count = 0;
270         int countMapped = 0;
271 
272         // Now process the results to figure out what we need to keep
273         Set<String> values = new TreeSet(exemplarComparator);
274 
275         for (Iterator it = equivMap.iterator(); it.hasNext(); ) {
276             Set<String> values1 = (Set) it.next();
277             // if there is only one result, drop it
278             if (values1.size() == 1) {
279                 if (false && SHOW_DEBUG) {
280                     String item = values1.iterator().next();
281                     System.out.println(
282                             "Skipping: "
283                                     + item
284                                     + "\t"
285                                     + equivalenceClassCollator.getRawCollationKey(item, null));
286                 }
287                 continue;
288             }
289             values.clear();
290             values.addAll(values1);
291             // if (showDetails) Log.getLog().println(bf.showSetNames(values));
292             Iterator chars = values.iterator();
293             // the lowest value is the exemplar value, so use it as the base
294             String target = (String) chars.next();
295             if (SHOW_DEBUG) {
296                 System.out.println(
297                         "Target: <"
298                                 + target
299                                 + ">\t "
300                                 + Utility.hex(target)
301                                 + "\t"
302                                 + equivalenceClassCollator.getRawCollationKey(target, null));
303             }
304             while (chars.hasNext()) {
305                 String source = (String) chars.next();
306                 mapping.put(source, target);
307                 if (SHOW_DEBUG) {
308                     System.out.println(
309                             "\tSource: <"
310                                     + source
311                                     + ">\t "
312                                     + Utility.hex(source)
313                                     + "\t"
314                                     + equivalenceClassCollator.getRawCollationKey(source, null));
315                 }
316             }
317             // for (Iterator it2 = values.iterator(); it.hasNext();) {
318             // bf.
319             // }
320             count++;
321             countMapped += values.size() - 1;
322         }
323         return mapping;
324         // // convert mapping to UnicodeMap
325         // Set problems = new TreeSet();
326         // Folder folder = new Folder();
327         // //folder.put('\u2215', "/");
328         //
329         // for (Iterator it = mapping.keySet().iterator(); it.hasNext();) {
330         // String source = (String) it.next();
331         // folder.put(UTF16.charAt(source, 0), (String) mapping.get(source));
332         // }
333         // // redo folder until it stabilizes
334         // // folder.complete();
335         //
336         // for (Iterator it = problems.iterator(); it.hasNext();) {
337         // String source = (String) it.next();
338         // String target = folder.fold((String) mapping.get(source));
339         // String other = folder.fold(source);
340         // if (target.equals(other)) {
341         // it.remove();
342         // } else {
343         // if (showDetails) Log.getLog().println("Couldn't map source " + Utility.escape(source)
344         // + "\t" + Utility.escape(target) + "\t" + Utility.escape(other));
345         // }
346         // }
347         // if (showDetails && problems.size() != 0) {
348         // Log.getLog().println("Problems");
349         // for (Iterator it = problems.iterator(); it.hasNext();) {
350         // String item = (String) it.next();
351         // //Log.getLog().println(bf.showSetNames(item));
352         // //Log.getLog().println("\t-" + bf.showSetNames(reasonMap.get(item)));
353         // }
354         // }
355         //
356         // if (showDetails) Log.getLog().println( "\tEquivalence Classes:\t" + count + "\tMapped
357         // characters:\t" +
358         // countMapped + "\tProblems:\t" + problems.size());
359         // return folder;
360     }
361 
362     VariantFolder caseFolder = new VariantFolder(new CaseVariantFolder());
363 
364     VariantFolder.AlternateFetcher COLLATION_FETCHER =
365             new VariantFolder.AlternateFetcher() {
366                 @Override
367                 public Set<String> getAlternates(String item, Set<String> output) {
368                     output.add(item);
369                     Set set = equivMap.getEquivalences(item);
370                     if (set != null) {
371                         output.addAll(set);
372                     }
373                     return output;
374                 }
375             };
376 
closeUnderFolding()377     private void closeUnderFolding() {
378         if (false) return;
379         // TODO Generalize
380         Set<String> others = new HashSet<>();
381         List<Collection<String>> input = new ArrayList<>();
382         VariantFolder recursiveFolder = new VariantFolder(COLLATION_FETCHER);
383         TreeSet<CharSequence> hack = new TreeSet();
384         hack.add("aa");
385 
386         while (true) {
387             others.clear();
388             for (CharSequence item : hack) { // seenSoFar
389                 if (item.length() == 1) {
390                     continue;
391                 }
392                 String str = item.toString();
393                 if (UTF16.countCodePoint(str) <= 1) {
394                     continue;
395                 }
396                 Set<String> results = recursiveFolder.getClosure(item.toString());
397                 results.removeAll(seenSoFar);
398                 Log.logln(item + "\t" + results);
399                 others.addAll(results);
400             }
401             if (others.size() == 0) {
402                 break;
403             }
404             for (String item : others) {
405                 addToEquiv(item, item);
406             }
407         }
408     }
409 
410     private static UnicodeSet fullExpansions = null;
411 
getFullExpansions()412     static UnicodeSet getFullExpansions() {
413         if (fullExpansions == null) addExpansionResults(fullExpansions = new UnicodeSet());
414         return fullExpansions;
415     }
416 
addExpansionResults(UnicodeSet fullExpansions)417     private static UnicodeSet addExpansionResults(UnicodeSet fullExpansions) {
418         StringBuffer trialString = new StringBuffer();
419         Map stringToKey = new TreeMap();
420         Map keyToString = new TreeMap();
421         Set nfkc = new HashSet();
422         for (int i = 0; i < 0x10FFFF; ++i) {
423             int cat = UCharacter.getType(i);
424             if (cat == UCharacter.UNASSIGNED
425                     || cat == UCharacter.PRIVATE_USE
426                     || cat == UCharacter.SURROGATE) continue;
427             String source = UTF16.valueOf(i);
428             nfkc.add(Normalizer.compose(source, true));
429 
430             CollationElementIterator x = uca.getCollationElementIterator(source);
431             trialString.setLength(0);
432             while (true) {
433                 int element = x.next();
434                 if (element == CollationElementIterator.NULLORDER) break;
435                 char primaryOrder = (char) CollationElementIterator.primaryOrder(element);
436                 if (primaryOrder == 0) continue;
437                 trialString.append(primaryOrder);
438             }
439             if (trialString.length() == 0) continue;
440             String key = trialString.toString();
441             stringToKey.put(source, key);
442             String newSource = (String) keyToString.get(key);
443             if (newSource == null) {
444                 keyToString.put(key, source);
445             }
446         }
447         UnicodeSet expansions = new UnicodeSet();
448         UnicodeSet contractions = new UnicodeSet();
449         try {
450             uca.getContractionsAndExpansions(contractions, expansions, true);
451         } catch (Exception e1) {
452             throw new IllegalArgumentException(e1);
453         }
454 
455         fullExpansions = new UnicodeSet();
456         global:
457         for (UnicodeSetIterator usi = new UnicodeSetIterator(expansions); usi.next(); ) {
458             trialString.setLength(0);
459             String source = usi.getString();
460             String key = (String) stringToKey.get(source);
461             if (key == null || key.length() == 1) continue;
462             main:
463             while (key.length() > 0) {
464                 for (Iterator it = keyToString.entrySet().iterator(); it.hasNext(); ) {
465                     Entry e = (Entry) it.next();
466                     String otherKey = (String) e.getKey();
467                     if (key.startsWith(otherKey)) {
468                         trialString.append((String) e.getValue());
469                         key = key.substring(otherKey.length());
470                         continue main;
471                     }
472                 }
473                 System.out.println("Failed with: " + source);
474                 continue global;
475             }
476             String result = trialString.toString();
477             if (contractions.contains(result) || nfkc.contains(result)) {
478                 continue global;
479             }
480             if (SHOW_DEBUG & false) {
481                 System.out.println("Adding: " + usi.getString() + "\t=>\t" + trialString);
482             }
483             fullExpansions.add(result);
484         }
485         fullExpansions.freeze();
486         return fullExpansions;
487     }
488 
489     CanonicalIterator canonicalIterator = new CanonicalIterator("");
490 
491     /**
492      * Adds items, looking for all canonically equivalent strings as well.
493      *
494      * @param item
495      */
addItems(String item)496     private void addItems(String item) {
497         addItems2(item, item);
498         String minNFKD = getMinimalNKFD(item, equivalenceClassCollator);
499         if (!minNFKD.equals(item)) {
500             addItems2(minNFKD, item);
501         }
502         canonicalIterator.setSource(item);
503         for (String nextItem = canonicalIterator.next();
504                 nextItem != null;
505                 nextItem = canonicalIterator.next()) {
506             addItems2(nextItem, item);
507         }
508     }
509 
510     /**
511      * Adds items, looking for all case-equivalent strings as well.
512      *
513      * @param item
514      * @param original
515      */
addItems2(String item, String original)516     private void addItems2(String item, String original) {
517         addItems3(item, original);
518         for (String nextItem : caseFolder.getClosure(item)) {
519             addItems3(nextItem, original);
520         }
521     }
522 
addItems3(String item, String original)523     private void addItems3(String item, String original) {
524         addToEquiv(item, original);
525         canonicalIterator.setSource(item);
526         for (String newItem = canonicalIterator.next();
527                 newItem != null;
528                 newItem = canonicalIterator.next()) {
529             addToEquiv(newItem, original);
530         }
531     }
532 
533     Set<CharSequence> seenSoFar = new TreeSet<>();
534 
addToEquiv(String item, String original)535     private void addToEquiv(String item, String original) {
536         if (item.equals("aA")) {
537             System.out.println("ouch");
538         }
539         if (seenSoFar.contains(item)) {
540             return;
541         }
542         seenSoFar.add(item);
543         // String norm = Normalizer.compose(item, true);
544         // if (UTF16.countCodePoint(norm) < UTF16.countCodePoint(item)) {
545         // item = norm;
546         // }
547         RawCollationKey k = equivalenceClassCollator.getRawCollationKey(item, null);
548         equivMap.add(item, k);
549         reasonMap.put(item, original);
550     }
551 
552     static UnicodeSet spaceTatweelAndNSM = new UnicodeSet("[\\u0020\\u0640[:Mn:][:Me:]]");
553     static UnicodeSet NSM = new UnicodeSet("[[:Mn:][:Me:]]");
554 
555     /**
556      * Return the minimal NFKD string that has the same uca key
557      *
558      * @param item
559      * @param k
560      * @param ucaWeak
561      * @return
562      */
getMinimalNKFD(String item, Collator ucaWeak)563     private String getMinimalNKFD(String item, Collator ucaWeak) {
564         String nfkd = com.ibm.icu.text.Normalizer.decompose(item, true);
565         if (nfkd.startsWith(" ")) {
566             if (spaceTatweelAndNSM.containsAll(nfkd)) {
567                 return item; // fails
568             }
569         }
570         String start = "";
571         String end = nfkd;
572         while (end.length() != 0) {
573             int cp = UTF16.charAt(end, 0);
574             String tryEnd = end.substring(UTF16.getCharCount(cp));
575             String trial = start + tryEnd;
576             if (!ucaWeak.equals(trial, item)) { // retain character
577                 start += UTF16.valueOf(cp);
578             }
579             end = tryEnd;
580         }
581         return start;
582     }
583 }
584