package org.unicode.cldr.tool;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.unicode.cldr.util.LocaleNames;
import org.unicode.cldr.util.PatternCache;

/**
 * Parse Locales, extended to BCP 47 and CLDR. Also normalizes the case of the results. Only does
 * syntactic parse: does not replace deprecated elements; does not check for validity. Will throw
 * IllegalArgumentException for duplicate variants and extensions.
 *
 * <p>An instance is reusable: each successful call to {@link #set(String)} replaces the previously
 * parsed fields. The fields have no initial values, so the getters return {@code null} (and {@link
 * #getVariant()} fails) until {@code set} has succeeded once.
 *
 * @author markdavis
 */
class SimpleLocaleParser {
    // mechanically generated regex -- don't worry about trying to read it!
    // if we want to allow multiple --, change [-_] into [-_]+
    //
    // Compiled with Pattern.COMMENTS, so the blanks inside the pattern are layout only.
    // Capture groups: 1 = language, 2 = script, 3 = region, 4 = variant list, 5 = extensions,
    // 6 = private use following other subtags, 7 = tag that is entirely private use,
    // 8 = legacy (irregular) tag, 9 = CLDR/ICU keyword list after '@'.
    private static final Pattern rootPattern =
            Pattern.compile(
                    "(?:"
                            + " (?: ( [a-z]{2,8} )" // language
                            + " (?: [-_] ( [a-z]{4} ) )?" // script
                            + " (?: [-_] ( [a-z]{2} | [0-9]{3} ) )?" // region
                            + " (?: [-_] ( (?: [a-z 0-9]{5,8} | [0-9] [a-z 0-9]{3} ) (?: [-_] (?: [a-z 0-9]{5,8} | [0-9] [a-z 0-9]{3} ) )* ) )?" // variant(s)
                            + " (?: [-_] ( [a-w y-z] (?: [-_] [a-z 0-9]{2,8} )+ (?: [-_] [a-w y-z] (?: [-_] [a-z 0-9]{2,8} )+ )* ) )?" // extensions
                            + " (?: [-_] ( x (?: [-_] [a-z 0-9]{1,8} )+ ) )? )" // private use
                            + " | ( x (?: [-_] [a-z 0-9]{1,8} )+ )" // private use
                            + " | ( en [-_] GB [-_] oed" // legacy gorp
                            + " | i [-_] (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )"
                            + " | no [-_] (?: bok | nyn )"
                            + " | sgn [-_] (?: BE [-_] (?: fr | nl) | CH [-_] de )"
                            + " | zh [-_] (?: cmn (?: [-_] Hans | [-_] Hant )? | gan | min (?: [-_] nan)? | wuu | yue ) ) )"
                            + " (?: \\@ ((?: [a-z 0-9]+ \\= [a-z 0-9]+) (?: \\; (?: [a-z 0-9]+ \\= [a-z 0-9]+))*))?", // CLDR/ICU keywords
                    // TODO change above to be lowercase, since source is already when we compare
                    Pattern.COMMENTS | Pattern.CASE_INSENSITIVE);

    // Other regex patterns for splitting apart lists of items detected above.
    private static final Pattern variantSeparatorPattern = PatternCache.get("[-_]");
    private static final Pattern extensionPattern =
            Pattern.compile(
                    "([a-z]) [-_] ( [a-z 0-9]{2,8} (?:[-_] [a-z 0-9]{2,8})* )", Pattern.COMMENTS);
    private static final Pattern privateUsePattern =
            Pattern.compile(
                    "(x) [-_] ( [a-z 0-9]{1,8} (?:[-_] [a-z 0-9]{1,8})* )", Pattern.COMMENTS);
    private static final Pattern keywordPattern =
            Pattern.compile("([a-z 0-9]+) \\= ([a-z 0-9]+)", Pattern.COMMENTS);

    /** The fields set by set(). */
    private String language;

    private String script;
    private String region;
    private List<String> variants;
    private Map<String, String> extensions;

    /**
     * Set the object to the source.
     *
     * <p>Example (artificially complicated):
     *
     * <pre>
     * myParser.set("zh-Hans-HK-SCOUSE-a-foobar-x-a-en@collation=phonebook;calendar=islamic");
     * String language = myParser.getLanguage();
     * </pre>
     *
     * <p>The source is parsed into temporaries first and the fields are only replaced once the
     * whole parse has succeeded. If this method returns {@code false} or throws, the result of
     * the previous successful call (if any) is left untouched.
     *
     * @param source locale identifier to parse; it is lowercased before matching, and either
     *     {@code -} or {@code _} may separate subtags
     * @return {@code true} if the source matched the locale syntax and the fields were updated,
     *     {@code false} if it did not match
     * @throws IllegalArgumentException if the source contains a duplicate variant or a duplicate
     *     extension/keyword key
     * @throws NullPointerException if {@code source} is null
     */
    public boolean set(String source) {
        final Matcher root = rootPattern.matcher(source.toLowerCase(Locale.ENGLISH));
        if (!root.matches()) {
            return false;
        }

        String parsedLanguage = root.group(1);
        if (parsedLanguage == null) {
            parsedLanguage = root.group(8); // marked as "Type: grandfathered" in BCP 47
            if (parsedLanguage == null) {
                parsedLanguage = LocaleNames.UND; // placeholder for completely private use
            }
        }

        // Script is titlecased: the input is already lowercase, so only the first letter changes.
        final String rawScript = root.group(2);
        final String parsedScript =
                rawScript == null
                        ? ""
                        : rawScript.substring(0, 1).toUpperCase(Locale.ENGLISH)
                                + rawScript.substring(1);

        final String rawRegion = root.group(3);
        final String parsedRegion = rawRegion == null ? "" : rawRegion.toUpperCase(Locale.ENGLISH);

        final List<String> parsedVariants = parseVariants(root.group(4));

        // Insertion order is kept: extensions, then private use, then keywords.
        final Map<String, String> parsedExtensions = new LinkedHashMap<>();
        addExtensions(parsedExtensions, root.group(5), extensionPattern);
        addExtensions(parsedExtensions, root.group(6), privateUsePattern);
        addExtensions(parsedExtensions, root.group(7), privateUsePattern);
        addExtensions(parsedExtensions, root.group(9), keywordPattern);

        // Nothing above can throw any more: commit everything at once.
        language = parsedLanguage;
        script = parsedScript;
        region = parsedRegion;
        variants = parsedVariants;
        extensions = Collections.unmodifiableMap(parsedExtensions);
        return true;
    }

    /**
     * Splits the matched variant list into an unmodifiable list of uppercased variants.
     *
     * @param variantList the variant portion matched by the root pattern, or null if absent
     * @return the variants in source order; empty if {@code variantList} is null
     * @throws IllegalArgumentException if the same variant occurs more than once
     */
    private static List<String> parseVariants(String variantList) {
        if (variantList == null) {
            return Collections.emptyList();
        }
        // make uppercase for compatibility with CLDR.
        final List<String> parsed =
                Arrays.asList(
                        variantSeparatorPattern.split(variantList.toUpperCase(Locale.ENGLISH)));
        // check for duplicate variants
        if (new HashSet<>(parsed).size() != parsed.size()) {
            throw new IllegalArgumentException("Duplicate variants");
        }
        // Arrays.asList permits element replacement; wrap it so the getter's
        // "immutable list" contract actually holds.
        return Collections.unmodifiableList(parsed);
    }

    /**
     * Adds every key/value pair that {@code pattern} finds in {@code item} to {@code target}.
     * Group 1 of the pattern is the key, group 2 the value.
     *
     * @param target map receiving the pairs
     * @param item text to scan, or null to add nothing
     * @param pattern pattern with two capture groups (key, value)
     * @throws IllegalArgumentException if a key is already present in {@code target}
     */
    private static void addExtensions(Map<String, String> target, String item, Pattern pattern) {
        if (item == null) {
            return;
        }
        final Matcher extension = pattern.matcher(item);
        while (extension.find()) {
            final String key = extension.group(1);
            // check for duplicate keys
            if (target.containsKey(key)) {
                throw new IllegalArgumentException("duplicate key: " + key);
            }
            target.put(key, extension.group(2));
        }
    }

    /**
     * Return BCP 47 language subtag (may be ISO registered code). If the language tag is irregular,
     * then the entire tag is in the language field. If the entire code is private use, then the
     * language code is "und". Examples:
     *
     * <table style="border-width:1; border-style:collapse">
     * <tr>
     * <th>Input String</th>
     * <th>Parsed</th>
     * </tr>
     * <tr>
     * <td>zh-cmn-Hans</td>
     * <td>{language=zh-cmn-hans, script=, country=, variants=[], keywords={}}</td>
     * </tr>
     * <tr>
     * <td>i-default@abc=def</td>
     * <td>{language=i-default, script=, country=, variants=[], keywords={abc=def}}</td>
     * </tr>
     * <tr>
     * <td>x-foobar@abc=def</td>
     * <td>{language=und, script=, country=, variants=[], keywords={x=foobar, abc=def}}</td>
     * </tr>
     * </table>
     *
     * @return language subtag, lowercased.
     */
    public String getLanguage() {
        return language;
    }

    /**
     * Return BCP 47 script subtag (may be ISO or UN)
     *
     * @return script subtag, titlecased; empty string if the source had none.
     */
    public String getScript() {
        return script;
    }

    /**
     * Return BCP 47 region subtag (may be ISO or UN)
     *
     * @return country (region) subtag, uppercased; empty string if the source had none.
     */
    public String getCountry() {
        return region;
    }

    /**
     * Return immutable list of BCP 47 variants
     *
     * @return list of uppercased variants, in source order; empty if the source had none.
     */
    public List<String> getVariants() {
        return variants;
    }

    /**
     * Return the first variant, for compatibility
     *
     * @return first (uppercased) variant, or the empty string if there are no variants
     */
    public String getVariant() {
        return variants.isEmpty() ? "" : variants.get(0);
    }

    /**
     * Return immutable map of key/value extensions. Includes BCP 47 extensions and private use,
     * also locale keyword extensions. If the entire code is private use, then the language is set
     * to "und" for consistency.
     *
     * <p>Example:
     *
     * <table style="border-width:1; border-style:collapse">
     * <tr>
     * <th>Input String</th>
     * <th>Parsed</th>
     * </tr>
     * <tr>
     * <td>zh-Hans-HK-SCOUSE-a-foobar-x-a-en@collation=phonebook;calendar=islamic</td>
     * <td>{language=zh, script=Hans, country=HK, variants=[SCOUSE], keywords={a=foobar, x=a-en, collation=phonebook,
     * calendar=islamic}}</td>
     * </tr>
     * </table>
     *
     * @return map of key/value pairs, lowercased.
     */
    public Map<String, String> getExtensions() {
        return extensions;
    }

    @Override
    public String toString() {
        return "{language="
                + language
                + ", script="
                + script
                + ", country="
                + region
                + ", variants="
                + variants
                + ", keywords="
                + extensions
                + "}";
    }
}