xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ScriptMetadata.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.draft;
2 
3 import com.google.common.base.Joiner;
4 import com.ibm.icu.impl.Relation;
5 import com.ibm.icu.lang.UScript;
6 import com.ibm.icu.text.Transform;
7 import com.ibm.icu.text.UTF16;
8 import com.ibm.icu.util.ICUException;
9 import com.ibm.icu.util.VersionInfo;
10 import java.util.Arrays;
11 import java.util.Collections;
12 import java.util.HashMap;
13 import java.util.HashSet;
14 import java.util.LinkedHashSet;
15 import java.util.List;
16 import java.util.Locale;
17 import java.util.Map;
18 import java.util.Map.Entry;
19 import java.util.Set;
20 import java.util.TreeSet;
21 import org.unicode.cldr.tool.CountryCodeConverter;
22 import org.unicode.cldr.util.CldrUtility;
23 import org.unicode.cldr.util.Containment;
24 import org.unicode.cldr.util.SemiFileReader;
25 import org.unicode.cldr.util.StandardCodes;
26 import org.unicode.cldr.util.With;
27 
28 public class ScriptMetadata {
29     private static final int MAX_RANK = 33;
30     private static final String DATA_FILE = "/org/unicode/cldr/util/data/Script_Metadata.csv";
31     private static final VersionInfo UNICODE_VERSION =
32             VersionInfo.getInstance(CldrUtility.getProperty("SCRIPT_UNICODE_VERSION", "15"));
33 
34     // To get the data, go do the Script MetaData spreadsheet
35     // Download As Comma Separated Items into DATA_FILE
36     // Set the last string in the UNICODE_VERSION line above to the right Unicode Version (for
37     // Unicode beta).
38     // Run TestScriptMetadata.
39     // Then run GenerateScriptMetadata.
40     // See http://cldr.unicode.org/development/updating-codes/updating-script-metadata
41     private enum Column {
42         // must match the spreadsheet header (caseless compare) or have the alternate header as an
43         // argument.
44         // doesn't have to be in order
45         WR,
46         AGE,
47         SAMPLE_CODE,
48         ID_USAGE("ID Usage (UAX31)"),
49         RTL("RTL?"),
50         LB_LETTERS("LB letters?"),
51         SHAPING_REQ("Shaping Req?"),
52         IME("IME?"),
53         ORIGIN_COUNTRY("Origin Country"),
54         DENSITY("~Density"),
55         LANG_CODE,
56         HAS_CASE("Has Case?");
57 
58         int columnNumber = -1;
59         final Set<String> names = new HashSet<>();
60 
Column(String... alternateNames)61         Column(String... alternateNames) {
62             names.add(this.name());
63             for (String name : alternateNames) {
64                 names.add(name.toUpperCase(Locale.ENGLISH));
65             }
66         }
67 
setColumns(String[] headers)68         static void setColumns(String[] headers) {
69             for (int i = 0; i < headers.length; ++i) {
70                 String header = headers[i].toUpperCase(Locale.ENGLISH);
71                 for (Column v : values()) {
72                     if (v.names.contains(header)) {
73                         v.columnNumber = i;
74                     }
75                 }
76             }
77             for (Column v : values()) {
78                 if (v.columnNumber == -1) {
79                     throw new IllegalArgumentException(
80                             "Missing field for " + v + ", may need to add additional column alias");
81                 }
82             }
83         }
84 
getItem(String[] items)85         String getItem(String[] items) {
86             return items[columnNumber];
87         }
88 
getInt(String[] items, int defaultValue)89         int getInt(String[] items, int defaultValue) {
90             final String item = getItem(items);
91             return item.isEmpty() || item.equalsIgnoreCase("n/a")
92                     ? defaultValue
93                     : Integer.parseInt(item);
94         }
95     }
96 
97     public enum IdUsage {
98         UNKNOWN("Other"),
99         EXCLUSION("Historic"),
100         LIMITED_USE("Limited Use"),
101         ASPIRATIONAL("Aspirational"),
102         RECOMMENDED("Major Use");
103 
104         public final String name;
105 
IdUsage(String name)106         private IdUsage(String name) {
107             this.name = name;
108         }
109     }
110 
111     public enum Trinary {
112         UNKNOWN,
113         NO,
114         YES
115     }
116 
117     public enum Shaping {
118         UNKNOWN,
119         NO,
120         MIN,
121         YES
122     }
123 
124     static StandardCodes SC = StandardCodes.make();
125     static EnumLookup<Shaping> shapingLookup =
126             EnumLookup.of(Shaping.class, null, "n/a", Shaping.UNKNOWN);
127     static EnumLookup<Trinary> trinaryLookup =
128             EnumLookup.of(Trinary.class, null, "n/a", Trinary.UNKNOWN);
129     static EnumLookup<IdUsage> idUsageLookup =
130             EnumLookup.of(IdUsage.class, null, "n/a", IdUsage.UNKNOWN);
131 
addNameToCode(String type, Map<String, String> hashMap)132     public static void addNameToCode(String type, Map<String, String> hashMap) {
133         for (String language : SC.getAvailableCodes(type)) {
134             Map<String, String> fullData = StandardCodes.getLStreg().get(type).get(language);
135             String name = fullData.get("Description");
136             hashMap.put(name.toUpperCase(Locale.ENGLISH), language);
137         }
138     }
139 
140     public static final class SkipNewUnicodeException extends ICUException {}
141 
142     public static class Info implements Comparable<Info> {
143         public final int rank;
144         public final VersionInfo age;
145         public final String sampleChar;
146         public final IdUsage idUsage;
147         public final Trinary rtl;
148         public final Trinary lbLetters;
149         public final Trinary hasCase;
150         public final Shaping shapingReq;
151         public final Trinary ime;
152         public final int density;
153         public final String originCountry;
154         public final String likelyLanguage;
155 
Info(String[] items)156         private Info(String[] items) {
157             // 3,Han,Hani,1.1,"75,963",字,5B57,China,3,Chinese,zh,Recommended,no,Yes,no,Yes,no
158             rank = Math.min(Column.WR.getInt(items, 999), MAX_RANK);
159             age = VersionInfo.getInstance(Column.AGE.getItem(items));
160             if (age.compareTo(UNICODE_VERSION) > 0) {
161                 throw new SkipNewUnicodeException();
162             }
163             // Parse the code point of the sample character, rather than the sample character
164             // itself.
165             // The code point is more reliable, especially when the spreadsheet has a bug
166             // for supplementary characters.
167             int sampleCode = Integer.parseInt(Column.SAMPLE_CODE.getItem(items), 16);
168             sampleChar = UTF16.valueOf(sampleCode);
169             idUsage = idUsageLookup.forString(Column.ID_USAGE.getItem(items));
170             rtl = trinaryLookup.forString(Column.RTL.getItem(items));
171             lbLetters = trinaryLookup.forString(Column.LB_LETTERS.getItem(items));
172             shapingReq = shapingLookup.forString(Column.SHAPING_REQ.getItem(items));
173             ime = trinaryLookup.forString(Column.IME.getItem(items));
174             hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items));
175             density = Column.DENSITY.getInt(items, -1);
176 
177             final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items);
178             String country = CountryCodeConverter.getCodeFromName(countryRaw, false);
179             if (country == null) {
180                 // Give context when throwing an error. Because this is run in a static init
181                 // context, the stack trace is typically incorrect when something goes wrong.
182                 errors.add(
183                         "ScriptMetadata.java: Can't map "
184                                 + countryRaw
185                                 + " to country/region. Try updating external/alternate_country_names.txt");
186             }
187             originCountry = country == null ? "ZZ" : country;
188 
189             String langCode = Column.LANG_CODE.getItem(items);
190             if (langCode.equals("n/a")) {
191                 langCode = null;
192             }
193             likelyLanguage = langCode == null ? "und" : langCode;
194         }
195 
Info(Info other, String string, String sampleCharacter)196         public Info(Info other, String string, String sampleCharacter) {
197             rank = other.rank;
198             age = other.age;
199             sampleChar = sampleCharacter == null ? other.sampleChar : sampleCharacter;
200             idUsage = other.idUsage;
201             rtl = other.rtl;
202             lbLetters = other.lbLetters;
203             hasCase = other.hasCase;
204             shapingReq = other.shapingReq;
205             ime = "IME:YES".equals(string) ? Trinary.YES : other.ime;
206             density = other.density;
207             originCountry = other.originCountry;
208             likelyLanguage = other.likelyLanguage;
209         }
210 
211         // public Trinary parseTrinary(Column title, String[] items) {
212         // return Trinary.valueOf(fix(title.getItem(items)).toUpperCase(Locale.ENGLISH));
213         // }
fix(String in)214         String fix(String in) {
215             return in.toUpperCase(Locale.ENGLISH)
216                     .replace("N/A", "UNKNOWN")
217                     .replace("?", "UNKNOWN")
218                     .replace("RTL", "YES");
219         }
220 
221         @Override
toString()222         public String toString() {
223             return rank
224                     + "\tSample: "
225                     + sampleChar
226                     + "\tCountry: "
227                     + getName("territory", originCountry)
228                     + " ("
229                     + originCountry
230                     + ")"
231                     + "\tLanguage: "
232                     + getName("language", likelyLanguage)
233                     + " ("
234                     + likelyLanguage
235                     + ")"
236                     + "\tId: "
237                     + idUsage
238                     + "\tRtl: "
239                     + rtl
240                     + "\tLb: "
241                     + lbLetters
242                     + "\tShape: "
243                     + shapingReq
244                     + "\tIme: "
245                     + ime
246                     + "\tCase: "
247                     + hasCase
248                     + "\tDensity: "
249                     + density;
250         }
251 
getName(String type, String code)252         public Object getName(String type, String code) {
253             List<String> fullData = SC.getFullData(type, code);
254             if (fullData == null) {
255                 return "unavailable";
256             }
257             return fullData.get(0);
258         }
259 
260         @Override
compareTo(Info o)261         public int compareTo(Info o) {
262             // we don't actually care what the comparison value is, as long as it is transitive and
263             // consistent with equals.
264             return toString().compareTo(o.toString());
265         }
266     }
267 
268     public static Set<String> errors = new LinkedHashSet<>();
269     static HashMap<String, Integer> titleToColumn = new HashMap<>();
270 
271     private static class MyFileReader extends SemiFileReader {
272         private Map<String, Info> data = new HashMap<>();
273 
274         @Override
isCodePoint()275         protected boolean isCodePoint() {
276             return false;
277         }
278 
279         @Override
splitLine(String line)280         protected String[] splitLine(String line) {
281             return CldrUtility.splitCommaSeparated(line);
282         }
283 
284         @Override
handleLine(int lineCount, int start, int end, String[] items)285         protected boolean handleLine(int lineCount, int start, int end, String[] items) {
286             if (items[0].startsWith("For help") || items[0].isEmpty()) {
287                 return true; // header lines
288             }
289             if (items[0].equals("WR")) {
290                 Column.setColumns(items);
291                 return true;
292             }
293             Info info;
294             try {
295                 info = new Info(items);
296             } catch (SkipNewUnicodeException e) {
297                 return true;
298             } catch (Exception e) {
299                 errors.add(
300                         e.getClass().getName()
301                                 + "\t"
302                                 + e.getMessage()
303                                 + "\t"
304                                 + Arrays.asList(items));
305                 return true;
306             }
307 
308             String script = items[2];
309             data.put(script, info);
310             Set<String> extras = EXTRAS.get(script);
311             if (extras != null) {
312                 for (String script2 : extras) {
313                     Info info2 = info;
314                     if (script2.equals("Jpan")) {
315                         // HACK
316                         info2 = new Info(info, "IME:YES", null);
317                     } else if (script2.equals("Jamo")) {
318                         info2 = new Info(info, null, "ᄒ");
319                     }
320                     data.put(script2, info2);
321                 }
322             }
323             return true;
324         }
325 
326         @Override
process(Class<?> classLocation, String fileName)327         public MyFileReader process(Class<?> classLocation, String fileName) {
328             super.process(classLocation, fileName);
329             return this;
330         }
331 
getData()332         private Map<String, Info> getData() {
333             if (!errors.isEmpty()) {
334                 throw new RuntimeException(Joiner.on("\n\t").join(errors));
335             }
336             return Collections.unmodifiableMap(data);
337         }
338     }
339 
340     public enum Groupings {
341         EUROPEAN("150"),
342         MIDDLE_EASTERN("145"),
343         CENTRAL_ASIAN("143"),
344         SOUTH_ASIAN("034"),
345         SOUTHEAST_ASIAN("035"),
346         EAST_ASIAN("030"),
347         AFRICAN("002"),
348         AMERICAN("019"),
349         ;
350         public final Set<String> scripts;
351 
Groupings(String... regions)352         private Groupings(String... regions) {
353             scripts =
354                     With.in(getScripts())
355                             .toUnmodifiableCollection(
356                                     new ScriptMetadata.RegionFilter(regions),
357                                     new TreeSet<String>());
358         }
359     }
360 
361     static class RegionFilter implements com.ibm.icu.text.Transform<String, String> {
362         final String[] containingRegion;
363 
RegionFilter(String... containingRegion)364         RegionFilter(String... containingRegion) {
365             this.containingRegion = containingRegion;
366         }
367 
368         @Override
transform(String script)369         public String transform(String script) {
370             String currentRegion = getInfo(script).originCountry;
371             while (true) {
372                 for (String s : containingRegion) {
373                     if (s.equals(currentRegion)) {
374                         return script;
375                     }
376                 }
377                 if (currentRegion.equals("001") || currentRegion.equals("ZZ")) {
378                     return null;
379                 }
380                 currentRegion = Containment.getContainer(currentRegion);
381             }
382         }
383     }
384 
385     static Relation<String, String> EXTRAS =
386             Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
387 
388     static {
389         EXTRAS.put("Hani", "Hans");
390         EXTRAS.put("Hani", "Hant");
391         EXTRAS.put("Hani", "Hanb");
392         EXTRAS.put("Hang", "Kore");
393         EXTRAS.put("Hang", "Jamo");
394         EXTRAS.put("Hira", "Jpan");
EXTRAS.freeze()395         EXTRAS.freeze();
396     }
397 
398     static final Map<String, Info> data =
399             new MyFileReader().process(ScriptMetadata.class, DATA_FILE).getData();
400 
getInfo(String s)401     public static Info getInfo(String s) {
402         Info result = data.get(s);
403         if (result == null) {
404             try {
405                 String name2 = UScript.getShortName(UScript.getCodeFromName(s));
406                 result = data.get(name2);
407             } catch (Exception e) {
408             }
409         }
410         return result;
411     }
412 
getScripts()413     public static Set<String> getScripts() {
414         return data.keySet();
415     }
416 
getInfo(int i)417     public static Info getInfo(int i) {
418         return data.get(UScript.getShortName(i));
419     }
420 
iterable()421     public static Set<Entry<String, Info>> iterable() {
422         return data.entrySet();
423     }
424 
425     /**
426      * Specialized scripts
427      *
428      * @return
429      */
getExtras()430     public static Set<String> getExtras() {
431         return EXTRAS.values();
432     }
433 
434     public static Transform<String, String> TO_SHORT_SCRIPT =
435             new Transform<String, String>() {
436                 @Override
437                 public String transform(String source) {
438                     return UScript.getShortName(UScript.getCodeFromName(source));
439                 }
440             };
441     public static Transform<String, String> TO_LONG_SCRIPT =
442             new Transform<String, String>() {
443                 @Override
444                 public String transform(String source) {
445                     return UScript.getName(UScript.getCodeFromName(source));
446                 }
447             };
448 }
449