package org.unicode.cldr.draft;

import com.google.common.base.Joiner;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Transform;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.ICUException;
import com.ibm.icu.util.VersionInfo;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import org.unicode.cldr.tool.CountryCodeConverter;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Containment;
import org.unicode.cldr.util.SemiFileReader;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.With;

/**
 * Per-script metadata read from the {@code Script_Metadata.csv} resource.
 *
 * <p>The data file is parsed once, during class initialization, into an unmodifiable map from
 * script code to {@link Info}. If any row fails to parse, the collected messages in {@link
 * #errors} are thrown as a single {@link RuntimeException} from the static initializer.
 */
public class ScriptMetadata {
    /** Upper bound applied to the value of the WR column when computing {@link Info#rank}. */
    private static final int MAX_RANK = 33;

    /** Classpath location of the CSV data, resolved relative to this class. */
    private static final String DATA_FILE = "/org/unicode/cldr/util/data/Script_Metadata.csv";

    /**
     * Rows whose AGE column is greater than this version are skipped. Can be overridden with the
     * {@code SCRIPT_UNICODE_VERSION} property.
     */
    private static final VersionInfo UNICODE_VERSION =
            VersionInfo.getInstance(CldrUtility.getProperty("SCRIPT_UNICODE_VERSION", "15"));

    // To get the data, go to the Script MetaData spreadsheet
    // Download As Comma Separated Items into DATA_FILE
    // Set the last string in the UNICODE_VERSION line above to the right Unicode Version (for
    // Unicode beta).
    // Run TestScriptMetadata.
    // Then run GenerateScriptMetadata.
    // See http://cldr.unicode.org/development/updating-codes/updating-script-metadata

    /** The spreadsheet columns that are read, located by header name rather than by position. */
    private enum Column {
        // must match the spreadsheet header (caseless compare) or have the alternate header as an
        // argument.
        // doesn't have to be in order
        WR,
        AGE,
        SAMPLE_CODE,
        ID_USAGE("ID Usage (UAX31)"),
        RTL("RTL?"),
        LB_LETTERS("LB letters?"),
        SHAPING_REQ("Shaping Req?"),
        IME("IME?"),
        ORIGIN_COUNTRY("Origin Country"),
        DENSITY("~Density"),
        LANG_CODE,
        HAS_CASE("Has Case?");

        /** Index of this column in a data row; -1 until {@link #setColumns} has found it. */
        int columnNumber = -1;

        /** Upper-cased header strings that identify this column. */
        final Set<String> names = new HashSet<>();

        Column(String... alternateNames) {
            names.add(this.name());
            for (String name : alternateNames) {
                names.add(name.toUpperCase(Locale.ENGLISH));
            }
        }

        /**
         * Records, for every column, the index of the header cell that names it.
         *
         * @param headers the cells of the header row
         * @throws IllegalArgumentException if any column has no matching header
         */
        static void setColumns(String[] headers) {
            for (int i = 0; i < headers.length; ++i) {
                String header = headers[i].toUpperCase(Locale.ENGLISH);
                for (Column v : values()) {
                    if (v.names.contains(header)) {
                        v.columnNumber = i;
                    }
                }
            }
            for (Column v : values()) {
                if (v.columnNumber == -1) {
                    throw new IllegalArgumentException(
                            "Missing field for " + v + ", may need to add additional column alias");
                }
            }
        }

        /** Returns the cell of {@code items} that belongs to this column. */
        String getItem(String[] items) {
            return items[columnNumber];
        }

        /**
         * Returns this column's cell parsed as a decimal integer.
         *
         * @param items the cells of a data row
         * @param defaultValue the value to return when the cell is empty or "n/a" (any case)
         * @throws NumberFormatException if the cell is present but is not an integer
         */
        int getInt(String[] items, int defaultValue) {
            final String item = getItem(items);
            return item.isEmpty() || item.equalsIgnoreCase("n/a")
                    ? defaultValue
                    : Integer.parseInt(item);
        }
    }

    public enum IdUsage {
        UNKNOWN("Other"),
        EXCLUSION("Historic"),
        LIMITED_USE("Limited Use"),
        ASPIRATIONAL("Aspirational"),
        RECOMMENDED("Major Use");

        /** Label associated with this usage level. */
        public final String name;

        private IdUsage(String name) {
            this.name = name;
        }
    }

    public enum Trinary {
        UNKNOWN,
        NO,
        YES
    }

    public enum Shaping {
        UNKNOWN,
        NO,
        MIN,
        YES
    }

    static StandardCodes SC = StandardCodes.make();
    static EnumLookup<Shaping> shapingLookup =
            EnumLookup.of(Shaping.class, null, "n/a", Shaping.UNKNOWN);
    static EnumLookup<Trinary> trinaryLookup =
            EnumLookup.of(Trinary.class, null, "n/a", Trinary.UNKNOWN);
    static EnumLookup<IdUsage> idUsageLookup =
            EnumLookup.of(IdUsage.class, null, "n/a", IdUsage.UNKNOWN);

    /**
     * For every available code of the given type, adds an entry to {@code hashMap} from the
     * upper-cased "Description" value in the data returned by {@code StandardCodes.getLStreg()}
     * to the code itself.
     *
     * @param type the code type passed to {@code StandardCodes}
     * @param hashMap the map that receives the entries
     */
    public static void addNameToCode(String type, Map<String, String> hashMap) {
        for (String language : SC.getAvailableCodes(type)) {
            Map<String, String> fullData = StandardCodes.getLStreg().get(type).get(language);
            String name = fullData.get("Description");
            hashMap.put(name.toUpperCase(Locale.ENGLISH), language);
        }
    }

    /** Thrown while parsing a row whose AGE is newer than {@link #UNICODE_VERSION}. */
    public static final class SkipNewUnicodeException extends ICUException {
        private static final long serialVersionUID = 1L;
    }

    /** The metadata of a single script, taken from one row of the data file. */
    public static class Info implements Comparable<Info> {
        public final int rank;
        public final VersionInfo age;
        public final String sampleChar;
        public final IdUsage idUsage;
        public final Trinary rtl;
        public final Trinary lbLetters;
        public final Trinary hasCase;
        public final Shaping shapingReq;
        public final Trinary ime;
        public final int density;
        public final String originCountry;
        public final String likelyLanguage;

        /**
         * Builds the metadata from the cells of one data row.
         *
         * @param items the cells of the row; {@link Column#setColumns} must already have run
         * @throws SkipNewUnicodeException if the row's AGE is greater than UNICODE_VERSION
         */
        private Info(String[] items) {
            // 3,Han,Hani,1.1,"75,963",字,5B57,China,3,Chinese,zh,Recommended,no,Yes,no,Yes,no
            rank = Math.min(Column.WR.getInt(items, 999), MAX_RANK);
            age = VersionInfo.getInstance(Column.AGE.getItem(items));
            if (age.compareTo(UNICODE_VERSION) > 0) {
                throw new SkipNewUnicodeException();
            }
            // Parse the code point of the sample character, rather than the sample character
            // itself.
            // The code point is more reliable, especially when the spreadsheet has a bug
            // for supplementary characters.
            int sampleCode = Integer.parseInt(Column.SAMPLE_CODE.getItem(items), 16);
            sampleChar = UTF16.valueOf(sampleCode);
            idUsage = idUsageLookup.forString(Column.ID_USAGE.getItem(items));
            rtl = trinaryLookup.forString(Column.RTL.getItem(items));
            lbLetters = trinaryLookup.forString(Column.LB_LETTERS.getItem(items));
            shapingReq = shapingLookup.forString(Column.SHAPING_REQ.getItem(items));
            ime = trinaryLookup.forString(Column.IME.getItem(items));
            hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items));
            density = Column.DENSITY.getInt(items, -1);

            final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items);
            String country = CountryCodeConverter.getCodeFromName(countryRaw, false);
            if (country == null) {
                // Give context when throwing an error. Because this is run in a static init
                // context, the stack trace is typically incorrect when something goes wrong.
                errors.add(
                        "ScriptMetadata.java: Can't map "
                                + countryRaw
                                + " to country/region. Try updating external/alternate_country_names.txt");
            }
            originCountry = country == null ? "ZZ" : country;

            // Treat a missing language code the same way Column.getInt treats a missing number:
            // an empty cell or "n/a" in any letter case means "no value", which maps to "und".
            final String langCode = Column.LANG_CODE.getItem(items);
            final boolean langMissing = langCode.isEmpty() || langCode.equalsIgnoreCase("n/a");
            likelyLanguage = langMissing ? "und" : langCode;
        }

        /**
         * Copies {@code other}, optionally overriding the IME flag and the sample character.
         *
         * @param other the metadata to copy
         * @param string if equal to "IME:YES", {@link #ime} is set to {@link Trinary#YES};
         *     otherwise the value of {@code other} is kept
         * @param sampleCharacter replacement for {@link #sampleChar}, or null to keep the value
         *     of {@code other}
         */
        public Info(Info other, String string, String sampleCharacter) {
            rank = other.rank;
            age = other.age;
            sampleChar = sampleCharacter == null ? other.sampleChar : sampleCharacter;
            idUsage = other.idUsage;
            rtl = other.rtl;
            lbLetters = other.lbLetters;
            hasCase = other.hasCase;
            shapingReq = other.shapingReq;
            ime = "IME:YES".equals(string) ? Trinary.YES : other.ime;
            density = other.density;
            originCountry = other.originCountry;
            likelyLanguage = other.likelyLanguage;
        }

        // public Trinary parseTrinary(Column title, String[] items) {
        // return Trinary.valueOf(fix(title.getItem(items)).toUpperCase(Locale.ENGLISH));
        // }

        /** Upper-cases {@code in} and rewrites "N/A" and "?" to "UNKNOWN" and "RTL" to "YES". */
        String fix(String in) {
            return in.toUpperCase(Locale.ENGLISH)
                    .replace("N/A", "UNKNOWN")
                    .replace("?", "UNKNOWN")
                    .replace("RTL", "YES");
        }

        /** Returns all fields as one tab-separated line, with names for country and language. */
        @Override
        public String toString() {
            return rank
                    + "\tSample: "
                    + sampleChar
                    + "\tCountry: "
                    + getName("territory", originCountry)
                    + " ("
                    + originCountry
                    + ")"
                    + "\tLanguage: "
                    + getName("language", likelyLanguage)
                    + " ("
                    + likelyLanguage
                    + ")"
                    + "\tId: "
                    + idUsage
                    + "\tRtl: "
                    + rtl
                    + "\tLb: "
                    + lbLetters
                    + "\tShape: "
                    + shapingReq
                    + "\tIme: "
                    + ime
                    + "\tCase: "
                    + hasCase
                    + "\tDensity: "
                    + density;
        }

        /**
         * Returns the first element of the {@code StandardCodes} full data for the code, or the
         * string "unavailable" when there is no data for it.
         */
        public Object getName(String type, String code) {
            List<String> fullData = SC.getFullData(type, code);
            if (fullData == null) {
                return "unavailable";
            }
            return fullData.get(0);
        }

        @Override
        public int compareTo(Info o) {
            // we don't actually care what the comparison value is, as long as it is transitive and
            // consistent with equals.
            return toString().compareTo(o.toString());
        }
    }

    /** Messages for rows that could not be parsed; must be initialized before {@link #data}. */
    public static Set<String> errors = new LinkedHashSet<>();

    static HashMap<String, Integer> titleToColumn = new HashMap<>();

    /** Reads the CSV data file into a map from script code to {@link Info}. */
    private static class MyFileReader extends SemiFileReader {
        private final Map<String, Info> data = new HashMap<>();

        @Override
        protected boolean isCodePoint() {
            return false;
        }

        @Override
        protected String[] splitLine(String line) {
            return CldrUtility.splitCommaSeparated(line);
        }

        /**
         * Handles one line of the file: skips help and empty lines, uses the row starting with
         * "WR" to locate the columns, and turns every other row into an {@link Info}. Rows that
         * fail to parse are recorded in {@link #errors} instead of aborting the read.
         *
         * @return always true
         */
        @Override
        protected boolean handleLine(int lineCount, int start, int end, String[] items) {
            if (items[0].startsWith("For help") || items[0].isEmpty()) {
                return true; // header lines
            }
            if (items[0].equals("WR")) {
                Column.setColumns(items);
                return true;
            }
            Info info;
            try {
                info = new Info(items);
            } catch (SkipNewUnicodeException e) {
                // The script is newer than UNICODE_VERSION: leave it out without reporting it.
                return true;
            } catch (Exception e) {
                errors.add(
                        e.getClass().getName()
                                + "\t"
                                + e.getMessage()
                                + "\t"
                                + Arrays.asList(items));
                return true;
            }

            String script = items[2];
            data.put(script, info);
            // Register the same metadata under the specialized script codes derived from it.
            Set<String> extras = EXTRAS.get(script);
            if (extras != null) {
                for (String script2 : extras) {
                    Info info2 = info;
                    if (script2.equals("Jpan")) {
                        // HACK
                        info2 = new Info(info, "IME:YES", null);
                    } else if (script2.equals("Jamo")) {
                        info2 = new Info(info, null, "ᄒ");
                    }
                    data.put(script2, info2);
                }
            }
            return true;
        }

        @Override
        public MyFileReader process(Class<?> classLocation, String fileName) {
            super.process(classLocation, fileName);
            return this;
        }

        /**
         * Returns an unmodifiable view of the parsed data.
         *
         * @throws RuntimeException listing every recorded error, if {@link #errors} is not empty
         */
        private Map<String, Info> getData() {
            if (!errors.isEmpty()) {
                throw new RuntimeException(Joiner.on("\n\t").join(errors));
            }
            return Collections.unmodifiableMap(data);
        }
    }

    /**
     * Groups of scripts selected by region code. Each constant's {@link #scripts} is built by
     * running a {@link RegionFilter} for its region codes over {@link #getScripts()}.
     */
    public enum Groupings {
        EUROPEAN("150"),
        MIDDLE_EASTERN("145"),
        CENTRAL_ASIAN("143"),
        SOUTH_ASIAN("034"),
        SOUTHEAST_ASIAN("035"),
        EAST_ASIAN("030"),
        AFRICAN("002"),
        AMERICAN("019"),
        ;
        public final Set<String> scripts;

        private Groupings(String... regions) {
            scripts =
                    With.in(getScripts())
                            .toUnmodifiableCollection(
                                    new ScriptMetadata.RegionFilter(regions),
                                    new TreeSet<String>());
        }
    }

    /**
     * Maps a script to itself when its origin country is one of the given regions or is reached
     * from them by following {@code Containment.getContainer}, and to null otherwise.
     */
    static class RegionFilter implements com.ibm.icu.text.Transform<String, String> {
        final String[] containingRegion;

        RegionFilter(String... containingRegion) {
            this.containingRegion = containingRegion;
        }

        @Override
        public String transform(String script) {
            String currentRegion = getInfo(script).originCountry;
            // Walk up the containment chain until a wanted region is hit or the chain ends.
            while (true) {
                for (String s : containingRegion) {
                    if (s.equals(currentRegion)) {
                        return script;
                    }
                }
                if (currentRegion.equals("001") || currentRegion.equals("ZZ")) {
                    return null;
                }
                currentRegion = Containment.getContainer(currentRegion);
            }
        }
    }

    /** Specialized script codes that share the metadata of a base script (base -> extras). */
    static Relation<String, String> EXTRAS =
            Relation.of(new HashMap<String, Set<String>>(), HashSet.class);

    static {
        EXTRAS.put("Hani", "Hans");
        EXTRAS.put("Hani", "Hant");
        EXTRAS.put("Hani", "Hanb");
        EXTRAS.put("Hang", "Kore");
        EXTRAS.put("Hang", "Jamo");
        EXTRAS.put("Hira", "Jpan");
        EXTRAS.freeze();
    }

    // Must stay below errors and EXTRAS: reading the file uses both.
    static final Map<String, Info> data =
            new MyFileReader().process(ScriptMetadata.class, DATA_FILE).getData();

    /**
     * Returns the metadata for a script.
     *
     * @param s a key of the data map; if it is not one, it is passed to {@code
     *     UScript.getCodeFromName} and the resulting short name is looked up instead
     * @return the metadata, or null if neither lookup finds an entry
     */
    public static Info getInfo(String s) {
        Info result = data.get(s);
        if (result == null) {
            try {
                String name2 = UScript.getShortName(UScript.getCodeFromName(s));
                result = data.get(name2);
            } catch (RuntimeException ignored) {
                // Deliberately ignored: this second lookup is best-effort, so a name that ICU
                // cannot resolve is reported to the caller as "no data" (null).
            }
        }
        return result;
    }

    /** Returns the script codes that have metadata, i.e. the keys of the data map. */
    public static Set<String> getScripts() {
        return data.keySet();
    }

    /**
     * Returns the metadata stored under the short name of the given {@code UScript} code.
     *
     * @param i a script code, passed to {@code UScript.getShortName}
     * @return the metadata, or null if there is no entry for that short name
     */
    public static Info getInfo(int i) {
        return data.get(UScript.getShortName(i));
    }

    /** Returns the entries of the data map (script code to metadata). */
    public static Set<Entry<String, Info>> iterable() {
        return data.entrySet();
    }

    /**
     * Specialized scripts
     *
     * @return the values of {@link #EXTRAS}, i.e. every script code registered as an extra of
     *     some base script
     */
    public static Set<String> getExtras() {
        return EXTRAS.values();
    }

    /** Converts a script name or code accepted by {@code UScript} to its short name. */
    public static Transform<String, String> TO_SHORT_SCRIPT =
            new Transform<String, String>() {
                @Override
                public String transform(String source) {
                    return UScript.getShortName(UScript.getCodeFromName(source));
                }
            };

    /** Converts a script name or code accepted by {@code UScript} to its long name. */
    public static Transform<String, String> TO_LONG_SCRIPT =
            new Transform<String, String>() {
                @Override
                public String transform(String source) {
                    return UScript.getName(UScript.getCodeFromName(source));
                }
            };
}