xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/Emoji.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.util;
2 
3 import com.google.common.base.CharMatcher;
4 import com.google.common.base.Splitter;
5 import com.google.common.collect.ImmutableMap;
6 import com.google.common.collect.ImmutableSet;
7 import com.ibm.icu.dev.util.UnicodeMap;
8 import com.ibm.icu.impl.Utility;
9 import com.ibm.icu.lang.CharSequences;
10 import com.ibm.icu.text.UnicodeSet;
11 import com.ibm.icu.util.ICUException;
12 import java.util.ArrayList;
13 import java.util.HashMap;
14 import java.util.Iterator;
15 import java.util.LinkedHashMap;
16 import java.util.LinkedHashSet;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.Map.Entry;
20 import java.util.Set;
21 import org.unicode.cldr.draft.FileUtilities;
22 import org.unicode.cldr.util.PathHeader.PageId;
23 
24 public class Emoji {
/** U+FE0F (variation selector-16); removed from sequences in this class to form their "minimal" spelling. */
25     public static final String EMOJI_VARIANT = "\uFE0F";
/** U+20E3; any sequence containing it is skipped as a constructed value by the emoji-test.txt loader. */
26     public static final String COMBINING_ENCLOSING_KEYCAP = "\u20E3";
/** U+200D (zero width joiner); sequences containing it are only kept selectively by the loader. */
27     public static final String ZWJ = "\u200D";
/** Frozen set of the code points U+1F1E6..U+1F1FF; sequences containing any of them are skipped as constructed. */
28     public static final UnicodeSet REGIONAL_INDICATORS = new UnicodeSet(0x1F1E6, 0x1F1FF).freeze();
29     public static final UnicodeSet MODIFIERS = new UnicodeSet("[��-��]").freeze();
30     public static final UnicodeSet TAGS = new UnicodeSet(0xE0000, 0xE007F).freeze();
31     public static final UnicodeSet FAMILY = new UnicodeSet("[\u200D ��-�� �� ❤]").freeze();
32     public static final UnicodeSet GENDER = new UnicodeSet().add(0x2640).add(0x2642).freeze();
33     public static final UnicodeSet SPECIALS =
34             new UnicodeSet(
35                             "["
36                                     + "{��‍⬛}{��‍❄}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��‍��}{��‍��} {��‍��} {��‍☠} {��‍��} {��‍��} {��‍��} {��‍��} {��‍��} {��‍��} {��‍��}"
37                                     + "{��‍⚧}{��‍⚕}{��‍⚖}{��‍✈}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}{��‍��}"
38                                     + "{❤‍��}, {❤‍��}, {��‍��}, {��‍��}" // #E13.1
39                                     + "]")
40                     .freeze();
41     // May have to add from above, if there is a failure in testAnnotationPaths. Failure will be
42     // like:
43     // got java.util.TreeSet<[//ldml/annotations/annotation[@cp="��‍⚧"][@type="tts"],
44     // //ldml/annotations/annotation[@cp="��‍⚕"][@type="tts"], ...
45     // just extract the items in "...", and change into {...} for adding above.
46     // Example: //ldml/annotations/annotation[@cp="��‍⚕"] ==> {��‍⚕}
47     public static final UnicodeSet MAN_WOMAN = new UnicodeSet("[�� ��]").freeze();
48     public static final UnicodeSet OBJECT =
49             new UnicodeSet("[�� �� �� �� �� �� �� �� ✈ �� �� �� �� �� �� ⚖ ⚕]").freeze();
50 
// Emoji string -> name of the "# group:" header in effect when its emoji-test.txt line was read.
51     static final UnicodeMap<String> emojiToMajorCategory = new UnicodeMap<>();
// Emoji string -> name of the "# subgroup:" header in effect when its emoji-test.txt line was read.
52     static final UnicodeMap<String> emojiToMinorCategory = new UnicodeMap<>();
// Emoji string -> name taken from the comment field of its emoji-test.txt line.
53     static final UnicodeMap<String> toName = new UnicodeMap<>();
54 
// Runs before the loader below fills the three maps.
// NOTE(review): setErrorOnReset(true) presumably makes a second put for an already-mapped key
// fail instead of silently replacing the value — confirm against the UnicodeMap documentation.
55     static {
56         emojiToMajorCategory.setErrorOnReset(true);
57         emojiToMinorCategory.setErrorOnReset(true);
58         toName.setErrorOnReset(true);
59     }
60     /**
61      * A mapping from a majorCategory to a unique ordering number, based on the first time it is
62      * encountered.
63      */
64     static final Map<String, Long> majorToOrder = new HashMap<>();
65     /**
66      * A mapping from a minorCategory to a unique ordering number, based on the first time it is
67      * encountered.
68      */
69     static final Map<String, Long> minorToOrder = new HashMap<>();
70 
// Emoji string -> order value. The loader assigns (current map size * 100) to each new key, and
// registers both the spelling read from the file and its spelling without EMOJI_VARIANT.
71     static final Map<String, Long> emojiToOrder = new LinkedHashMap<>();
// Minimal spellings the loader did not classify as constructed, plus MODIFIERS; frozen after loading.
72     static final UnicodeSet nonConstructed = new UnicodeSet();
// Strings from emoji-test.txt lines whose status field starts with "fully-qualified"; frozen after loading.
73     static final UnicodeSet allRgi = new UnicodeSet();
// Same entries as allRgi, each with every EMOJI_VARIANT removed; frozen after loading.
74     static final UnicodeSet allRgiNoES = new UnicodeSet();
75 
76     static {
77         /*
78          * Example from emoji-test.txt:
79          *   # group: Smileys & Emotion
80          *   # subgroup: face-smiling
81          *   1F600 ; fully-qualified # �� grinning face
82          */
83         Splitter semi = Splitter.on(CharMatcher.anyOf(";#")).trimResults();
84         String majorCategory = null;
85         String minorCategory = null;
86         for (String line : FileUtilities.in(Emoji.class, "data/emoji/emoji-test.txt")) {
87             if (line.startsWith("#")) {
88                 line = line.substring(1).trim();
89                 if (line.startsWith("group:")) {
90                     majorCategory = line.substring("group:".length()).trim();
majorToOrder.computeIfAbsent(majorCategory, k -> (long) majorToOrder.size())91                     majorToOrder.computeIfAbsent(majorCategory, k -> (long) majorToOrder.size());
92                 } else if (line.startsWith("subgroup:")) {
93                     minorCategory = line.substring("subgroup:".length()).trim();
minorToOrder.computeIfAbsent(minorCategory, k -> (long) minorToOrder.size())94                     minorToOrder.computeIfAbsent(minorCategory, k -> (long) minorToOrder.size());
95                 }
96                 continue;
97             }
98             line = line.trim();
99             if (line.isEmpty()) {
100                 continue;
101             }
102             Iterator<String> it = semi.split(line).iterator();
103 
104             String emojiHex = it.next();
105             String original = Utility.fromHex(emojiHex, 4, " ");
106             String type = it.next();
107             if (type.startsWith("fully-qualified")) {
108                 allRgi.add(original);
original.replace(Emoji.EMOJI_VARIANT, "")109                 allRgiNoES.add(original.replace(Emoji.EMOJI_VARIANT, ""));
110             }
emojiToMajorCategory.put(original, majorCategory)111             emojiToMajorCategory.put(original, majorCategory);
emojiToMinorCategory.put(original, minorCategory)112             emojiToMinorCategory.put(original, minorCategory);
113             String comment = it.next();
114             // The comment is now of the form:  # �� E0.6 beaming face with smiling eyes
115             int spacePos = comment.indexOf(' ');
116             // The format changed in v15.1, so there is no version number.
117             // Thus the following is commented out:
118             // spacePos = comment.indexOf(' ', spacePos + 1); // get second space
119             String name = comment.substring(spacePos + 1).trim();
toName.put(original, name)120             toName.put(original, name);
121 
122             // add all the non-constructed values to a set for annotations
123 
124             String minimal = original.replace(EMOJI_VARIANT, "");
125 
126             // Add the order. If it is not minimal, add that also.
127             if (!emojiToOrder.containsKey(original)) {
putUnique(emojiToOrder, original, emojiToOrder.size() * 100L)128                 putUnique(emojiToOrder, original, emojiToOrder.size() * 100L);
129             }
130             if (!emojiToOrder.containsKey(minimal)) {
putUnique(emojiToOrder, minimal, emojiToOrder.size() * 100L)131                 putUnique(emojiToOrder, minimal, emojiToOrder.size() * 100L);
132             }
133 
134             boolean singleton = CharSequences.getSingleCodePoint(minimal) != Integer.MAX_VALUE;
135 
136             // skip constructed values
137             if (minimal.contains(COMBINING_ENCLOSING_KEYCAP)
138                     || REGIONAL_INDICATORS.containsSome(minimal)
139                     || TAGS.containsSome(minimal)
140                     || !singleton && MODIFIERS.containsSome(minimal)
141                     || !singleton && FAMILY.containsAll(minimal)) {
142                 // do nothing
143             } else if (minimal.contains(ZWJ)) { // only do certain ZWJ sequences
144                 if (SPECIALS.contains(minimal)
145                         || GENDER.containsSome(minimal)
146                         || MAN_WOMAN.contains(minimal.codePointAt(0))
147                                 && OBJECT.contains(minimal.codePointBefore(minimal.length()))) {
148                     nonConstructed.add(minimal);
149                 }
150             } else if (!minimal.contains("��")) {
151                 nonConstructed.add(minimal);
152             }
153         }
emojiToMajorCategory.freeze()154         emojiToMajorCategory.freeze();
emojiToMinorCategory.freeze()155         emojiToMinorCategory.freeze();
156         nonConstructed.add(MODIFIERS); // needed for names
nonConstructed.freeze()157         nonConstructed.freeze();
toName.freeze()158         toName.freeze();
allRgi.freeze()159         allRgi.freeze();
allRgiNoES.freeze()160         allRgiNoES.freeze();
161     }
162 
putUnique(Map<K, V> map, K key, V value)163     private static <K, V> void putUnique(Map<K, V> map, K key, V value) {
164         V oldValue = map.put(key, value);
165         if (oldValue != null) {
166             throw new ICUException(
167                     "Attempt to change value of "
168                             + map
169                             + " for "
170                             + key
171                             + " from "
172                             + oldValue
173                             + " to "
174                             + value);
175         }
176     }
177 
getAllRgi()178     public static UnicodeSet getAllRgi() {
179         return allRgi;
180     }
181 
getAllRgiNoES()182     public static UnicodeSet getAllRgiNoES() {
183         return allRgiNoES;
184     }
185 
/** Non-emoji symbol -> minor category; filled and frozen by the static initializer that follows. */
186     public static final UnicodeMap<String> EXTRA_SYMBOL_MINOR_CATEGORIES = new UnicodeMap<>();
/** Non-emoji symbol -> order value; assigned once, as an immutable copy, by the static initializer that follows. */
187     public static final Map<String, Long> EXTRA_SYMBOL_ORDER;
// When true, the static initializer that follows prints its intermediate maps to System.out.
188     private static final boolean DEBUG = false;
189 
static {
    // Each row: { minor category, the symbols to add to it }. Spaces inside the symbol string
    // are separators only and are skipped below.
    final String[][] extraSymbolRows = {
        {"arrow", "→ ↓ ↑ ← ↔ ↕ ⇆ ⇅"},
        {"alphanum", "© ® ℗ ™ µ"},
        {"geometric", "▼ ▶ ▲ ◀ ● ○ ◯ ◊"},
        {"math", "× ÷ √ ∞ ∆ ∇ ⁻ ¹ ² ³ ≡ ∈ ⊂ ∩ ∪ ° + ± − = ≈ ≠ > < ≤ ≥ ¬ | ~"},
        {
            "punctuation",
            "§ † ‡ \\u0020  , 、 ، ; : ؛ ! ¡ ? ¿ ؟ ¶ ※ / \\ & # % ‰ ′ ″ ‴ @ * ♪ ♭ ♯ ` ´ ^ ¨ ‐ ― _ - – — • · . … 。 ‧ ・ ‘ ’ ‚ ' “ ” „ » « ( ) [ ] { } 〔 〕 〈 〉 《 》 「 」 『 』 〖 〗 【 】"
        },
        {"currency", "€ £ ¥ ₹ ₽ $ ¢ ฿ ₪ ₺ ₫ ₱ ₩ ₡ ₦ ₮ ৳ ₴ ₸ ₲ ₵ ៛ ₭ ֏ ₥ ₾ ₼ ₿ ؋ ₧ ¤"},
        {
            "other-symbol",
            "‾‽‸⁂↚↛↮↙↜↝↞↟↠↡↢↣↤↥↦↧↨↫↬↭↯↰↱↲↳↴↵↶↷↸↹↺↻↼↽↾↿⇀⇁⇂⇃⇄⇇⇈⇉⇊⇋⇌⇐⇍⇑⇒⇏⇓⇔⇎⇖⇗⇘⇙⇚⇛⇜⇝⇞⇟⇠⇡⇢⇣⇤⇥⇦⇧⇨⇩⇪⇵∀∂∃∅∉∋∎∏∑≮≯∓∕⁄∗∘∙∝∟∠∣∥∧∫∬∮∴∵∶∷∼∽∾≃≅≌≒≖≣≦≧≪≫≬≳≺≻⊁⊃⊆⊇⊕⊖⊗⊘⊙⊚⊛⊞⊟⊥⊮⊰⊱⋭⊶⊹⊿⋁⋂⋃⋅⋆⋈⋒⋘⋙⋮⋯⋰⋱■□▢▣▤▥▦▧▨▩▬▭▮▰△▴▵▷▸▹►▻▽▾▿◁◂◃◄◅◆◇◈◉◌◍◎◐◑◒◓◔◕◖◗◘◙◜◝◞◟◠◡◢◣◤◥◦◳◷◻◽◿⨧⨯⨼⩣⩽⪍⪚⪺₢₣₤₰₳₶₷₨﷼"
        },
    };

    // Pass 1: for each listed subcategory, find the largest order value already given to an
    // emoji of that subcategory.
    final Map<String, Long> maxOrderBySubcategory = new HashMap<>();
    for (String[] row : extraSymbolRows) {
        final String subcategory = row[0];
        for (Entry<String, String> mapping : emojiToMinorCategory.entrySet()) {
            if (!mapping.getValue().equals(subcategory)) {
                continue;
            }
            final Long candidate = emojiToOrder.get(mapping.getKey());
            final Long best = maxOrderBySubcategory.get(subcategory);
            if (best == null || best < candidate) {
                maxOrderBySubcategory.put(subcategory, candidate);
            }
        }
    }
    if (DEBUG) {
        System.out.println(maxOrderBySubcategory);
    }

    // Pass 2: register every extra symbol under its subcategory and give it the next order
    // values after the subcategory's current maximum.
    final Map<String, Long> extraOrder = new LinkedHashMap<>();
    for (String[] row : extraSymbolRows) {
        final String subcategory = row[0];

        final List<String> symbols = new ArrayList<>();
        for (int codePoint : With.codePointArray(row[1])) {
            if (codePoint == ' ') {
                continue;
            }
            symbols.add(With.fromCodePoint(codePoint));
        }

        // A symbol may belong to only one subcategory.
        final UnicodeSet symbolSet = new UnicodeSet().addAll(symbols);
        if (symbolSet.containsSome(EXTRA_SYMBOL_MINOR_CATEGORIES.keySet())) {
            throw new IllegalArgumentException(
                    "Duplicate values in " + EXTRA_SYMBOL_MINOR_CATEGORIES);
        }
        EXTRA_SYMBOL_MINOR_CATEGORIES.putAll(symbolSet, subcategory);

        long nextOrder = maxOrderBySubcategory.get(subcategory);
        for (String symbol : symbols) {
            extraOrder.put(symbol, ++nextOrder);
        }
        maxOrderBySubcategory.put(subcategory, nextOrder);
    }
    if (DEBUG) {
        System.out.println(extraOrder);
    }

    EXTRA_SYMBOL_MINOR_CATEGORIES.freeze();
    EXTRA_SYMBOL_ORDER = ImmutableMap.copyOf(extraOrder);
}
250 
getMinorCategory(String emoji)251     public static String getMinorCategory(String emoji) {
252         String minorCat = emojiToMinorCategory.get(emoji);
253         if (minorCat == null) {
254             minorCat = EXTRA_SYMBOL_MINOR_CATEGORIES.get(emoji);
255             if (minorCat == null) {
256                 throw new InternalCldrException(
257                         "No minor category (aka subgroup) found for "
258                                 + emoji
259                                 + ". Update emoji-test.txt to latest, and adjust PathHeader.. functionMap.put(\"minor\", ...");
260             }
261         }
262         return minorCat;
263     }
264 
getName(String emoji)265     public static String getName(String emoji) {
266         return toName.get(emoji);
267     }
268 
getEmojiToOrder(String emoji)269     public static long getEmojiToOrder(String emoji) {
270         Long result = emojiToOrder.get(emoji);
271         if (result == null) {
272             result = EXTRA_SYMBOL_ORDER.get(emoji);
273             if (result == null) {
274                 throw new InternalCldrException(
275                         "No Order found for "
276                                 + emoji
277                                 + ". Update emoji-test.txt to latest, and adjust PathHeader.. functionMap.put(\"minor\", ...");
278             }
279         }
280         return result;
281     }
282 
getEmojiMinorOrder(String minor)283     public static long getEmojiMinorOrder(String minor) {
284         Long result = minorToOrder.get(minor);
285         if (result == null) {
286             throw new InternalCldrException(
287                     "No minor category (aka subgroup) found for "
288                             + minor
289                             + ". Update emoji-test.txt to latest, and adjust PathHeader.. functionMap.put(\"minor\", ...");
290         }
291         return result;
292     }
293 
getMajorCategory(String emoji)294     public static String getMajorCategory(String emoji) {
295         String majorCat = emojiToMajorCategory.get(emoji);
296         if (majorCat == null) {
297             if (EXTRA_SYMBOL_MINOR_CATEGORIES.containsKey(emoji)) {
298                 majorCat = "Symbols";
299             } else {
300                 throw new InternalCldrException(
301                         "No minor category (aka subgroup) found for "
302                                 + emoji
303                                 + ". Update emoji-test.txt to latest, and adjust PathHeader.. functionMap.put(\"major\", ...");
304             }
305         }
306         return majorCat;
307     }
308 
getMinorCategoriesWithExtras()309     public static Set<String> getMinorCategoriesWithExtras() {
310         Set<String> result = new LinkedHashSet<>(emojiToMinorCategory.values());
311         result.addAll(EXTRA_SYMBOL_MINOR_CATEGORIES.getAvailableValues());
312         return ImmutableSet.copyOf(result);
313     }
314 
getEmojiInMinorCategoriesWithExtras(String minorCategory)315     public static UnicodeSet getEmojiInMinorCategoriesWithExtras(String minorCategory) {
316         return new UnicodeSet(emojiToMinorCategory.getSet(minorCategory))
317                 .addAll(EXTRA_SYMBOL_MINOR_CATEGORIES.getSet(minorCategory))
318                 .freeze();
319     }
320 
getNonConstructed()321     public static UnicodeSet getNonConstructed() {
322         return nonConstructed;
323     }
324 
// Cache for getNamePaths(); stays null until that method is first called.
325     private static Set<String> NAME_PATHS = null;
/** Attribute suffix appended to an annotation path to form its "tts" variant. */
326     public static final String TYPE_TTS = "[@type=\"tts\"]";
327 
getNamePaths()328     public static synchronized Set<String> getNamePaths() {
329         return NAME_PATHS != null ? NAME_PATHS : (NAME_PATHS = buildPaths(TYPE_TTS));
330     }
331 
buildPaths(String suffix)332     private static ImmutableSet<String> buildPaths(String suffix) {
333         ImmutableSet.Builder<String> builder = ImmutableSet.builder();
334         for (String s : Emoji.getNonConstructed()) {
335             String base = "//ldml/annotations/annotation[@cp=\"" + s + "\"]" + suffix;
336             builder.add(base);
337         }
338         return builder.build();
339     }
340 
341     /**
342      * Return the PageId for the given emoji, making adjustments for pages that are united in
343      * emoji-test.txt but divided in Survey Tool, such as Symbols, Symbols2, and Symbols3
344      *
345      * @param emoji the emoji as a string
346      * @return the adjusted PageId
347      */
getPageId(String emoji)348     public static PageId getPageId(String emoji) {
349         final String major = getMajorCategory(emoji);
350         final String minor = getMinorCategory(emoji);
351         final PageId pageId = PageId.forString(major);
352         final Long minorOrder = minorToOrder.get(minor);
353         switch (pageId) {
354             case Objects:
355                 return (minorOrder < minorToOrder.get("money")) ? PageId.Objects : PageId.Objects2;
356             case People:
357                 return (minorOrder < minorToOrder.get("person-fantasy"))
358                         ? PageId.People
359                         : PageId.People2;
360             case Symbols:
361                 return (minorOrder < minorToOrder.get("transport-sign"))
362                         ? PageId.Symbols
363                         : PageId.EmojiSymbols;
364             case Travel_Places:
365                 return (minorOrder < minorToOrder.get("transport-ground"))
366                         ? PageId.Travel_Places
367                         : PageId.Travel_Places2;
368             default:
369                 return pageId;
370         }
371     }
372 }
373