1 /* 2 ********************************************************************** 3 * Copyright (c) 2002-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Author: Mark Davis 7 ********************************************************************** 8 */ 9 package org.unicode.cldr.util; 10 11 import com.google.common.base.CharMatcher; 12 import com.google.common.base.Joiner; 13 import com.google.common.base.Splitter; 14 import com.google.common.collect.ImmutableList; 15 import com.google.common.collect.ImmutableMap; 16 import com.ibm.icu.impl.Row.R2; 17 import com.ibm.icu.text.UnicodeSet; 18 import java.util.ArrayList; 19 import java.util.Collection; 20 import java.util.Collections; 21 import java.util.Comparator; 22 import java.util.EnumSet; 23 import java.util.Iterator; 24 import java.util.List; 25 import java.util.Locale; 26 import java.util.Map; 27 import java.util.Map.Entry; 28 import java.util.NoSuchElementException; 29 import java.util.Set; 30 import java.util.StringTokenizer; 31 import java.util.TreeMap; 32 import java.util.TreeSet; 33 import java.util.regex.Pattern; 34 import org.unicode.cldr.tool.LikelySubtags; 35 36 public class LanguageTagParser { 37 38 private static final Joiner HYPHEN_JOINER = Joiner.on('-'); 39 40 private static final Comparator<? super String> EXTENSION_ORDER = 41 new Comparator<String>() { 42 43 @Override 44 public int compare(String o1, String o2) { 45 int diff = getBucket(o1) - getBucket(o2); 46 if (diff != 0) { 47 return diff; 48 } 49 return o1.compareTo(o2); 50 } 51 52 private int getBucket(String o1) { 53 switch (o1.length()) { 54 case 1: 55 return o1.charAt(0) == 't' ? 0 : 2; 56 case 2: 57 return o1.charAt(1) <= '9' ? 1 : 3; 58 default: 59 throw new IllegalArgumentException(); 60 } 61 } 62 }; 63 64 /** 65 * @return Returns the language, or "" if none. 66 */ getLanguage()67 public String getLanguage() { 68 return language; 69 } 70 71 /** 72 * @return Returns the script, or "" if none. 73 */ getScript()74 public String getScript() { 75 return script; 76 } 77 78 /** 79 * @return Returns the region, or "" if none. 80 */ getRegion()81 public String getRegion() { 82 return region; 83 } 84 85 /** 86 * @return Returns the variants. 87 */ getVariants()88 public List<String> getVariants() { 89 return ImmutableList.copyOf(variants); 90 } 91 92 /** 93 * @return True if the language tag is marked as “Type: grandfathered” in BCP 47. 94 */ isLegacy()95 public boolean isLegacy() { 96 return legacy; 97 } 98 99 /** 100 * @return Returns the extensions. 101 */ 102 @Deprecated getExtensions()103 public Map<String, String> getExtensions() { 104 return OutputOption.ICU.convert(extensions); 105 } 106 107 /** 108 * @return Returns the localeExtensions. 109 */ 110 @Deprecated getLocaleExtensions()111 public Map<String, String> getLocaleExtensions() { 112 return OutputOption.ICU.convert(localeExtensions); 113 } 114 115 /** 116 * @return Returns the extensions. 117 */ getExtensionsDetailed()118 public Map<String, List<String>> getExtensionsDetailed() { 119 return ImmutableMap.copyOf(extensions); 120 } 121 122 /** 123 * @return Returns the localeExtensions. 124 */ getLocaleExtensionsDetailed()125 public Map<String, List<String>> getLocaleExtensionsDetailed() { 126 return ImmutableMap.copyOf(localeExtensions); 127 } 128 129 /** 130 * @return Returns the original, preparsed language tag 131 */ getOriginal()132 public String getOriginal() { 133 return original; 134 } 135 136 /** 137 * @return Returns the language-script (or language) part of a tag. 138 */ getLanguageScript()139 public String getLanguageScript() { 140 if (script.length() != 0) return language + "_" + script; 141 return language; 142 } 143 144 /** 145 * @param in Collection of language tag strings 146 * @return Returns each of the language-script tags in the collection. 147 */ getLanguageScript(Collection<String> in)148 public static Set<String> getLanguageScript(Collection<String> in) { 149 return getLanguageAndScript(in, null); 150 } 151 152 /** 153 * @param in Collection of language tag strings 154 * @return Returns each of the language-script tags in the collection. 155 */ getLanguageAndScript(Collection<String> in, Set<String> output)156 public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) { 157 if (output == null) output = new TreeSet<>(); 158 LanguageTagParser lparser = new LanguageTagParser(); 159 for (Iterator<String> it = in.iterator(); it.hasNext(); ) { 160 output.add(lparser.set(it.next()).getLanguageScript()); 161 } 162 return output; 163 } 164 165 // private fields 166 167 private String original; 168 private boolean legacy = false; 169 private String language; 170 private String script; 171 private String region; 172 private Set<String> variants = new TreeSet<>(); 173 private Map<String, List<String>> extensions = new TreeMap<>(); // use tree map 174 private Map<String, List<String>> localeExtensions = new TreeMap<>(EXTENSION_ORDER); 175 176 private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze(); 177 private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze(); 178 private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze(); 179 private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze(); 180 private static final UnicodeSet X = new UnicodeSet("[xX]").freeze(); 181 private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze(); 182 private static StandardCodes standardCodes = StandardCodes.make(); 183 private static final Set<String> legacyCodes = standardCodes.getAvailableCodes("legacy"); 184 private static final String separator = "-_"; // '-' alone for 3066bis language tags 185 private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze(); 186 private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator)); 187 private static final Splitter SPLIT_COLON = Splitter.on(';'); 188 private static final Splitter SPLIT_EQUAL = Splitter.on('='); 189 private static SupplementalDataInfo SDI = 190 null; // postpone assignment to avoid re-entrance of SupplementalDataInfo.getInstance 191 192 /** 193 * Parses out a language tag, setting a number of fields that can subsequently be retrieved. If 194 * a private-use field is found, it is returned as the last extension.<br> 195 * This only checks for well-formedness (syntax), not for validity (subtags in registry). For 196 * the latter, see isValid. 197 * 198 * @param languageTag 199 * @return 200 */ set(String languageTag)201 public LanguageTagParser set(String languageTag) { 202 if (languageTag.length() == 0 || languageTag.equals("root")) { 203 // throw new IllegalArgumentException("Language tag cannot be empty"); 204 // 205 // With ICU 64 the language tag for root is normalized to empty string so we 206 // cannot throw for empty string as above. However, code here and in clients 207 // assumes a non-empty language tag, so for now just map "" or "root" to "und". 208 languageTag = "und"; 209 } else if (languageTag.startsWith("_") || languageTag.startsWith("-")) { 210 languageTag = "und" + languageTag; 211 } 212 languageTag = languageTag.toLowerCase(Locale.ROOT); 213 214 // clear everything out 215 language = region = script = ""; 216 legacy = false; 217 variants.clear(); 218 extensions.clear(); 219 localeExtensions.clear(); 220 original = languageTag; 221 int atPosition = languageTag.indexOf('@'); 222 if (atPosition >= 0) { 223 final String extensionsString = 224 languageTag.substring(atPosition + 1).toLowerCase(Locale.ROOT); 225 for (String keyValue : SPLIT_COLON.split(extensionsString)) { 226 final Iterator<String> keyValuePair = SPLIT_EQUAL.split(keyValue).iterator(); 227 final String key = keyValuePair.next(); 228 final String value = keyValuePair.next(); 229 if (keyValuePair.hasNext() 230 || !ALPHANUM.containsAll(key) 231 || !EXTENSION_VALUE.containsAll(value)) { 232 throwError(keyValue, "Invalid key/value pair"); 233 } 234 List<String> valueList = SPLIT_BAR.splitToList(value); 235 switch (key.length()) { 236 case 1: 237 extensions.put(key, valueList); 238 break; 239 case 2: 240 localeExtensions.put(key, valueList); 241 break; 242 default: 243 throwError(keyValue, "Invalid key/value pair"); 244 break; 245 } 246 } 247 languageTag = languageTag.substring(0, atPosition); 248 } 249 250 if (legacyCodes.contains(languageTag)) { 251 language = languageTag; 252 legacy = true; 253 return this; 254 } 255 256 // each time we fetch a token, we check for length from 1..8, and all alphanum 257 StringTokenizer st = new StringTokenizer(languageTag, separator); 258 String subtag; 259 try { 260 subtag = getSubtag(st); 261 } catch (Exception e1) { 262 throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1); 263 } 264 265 // check for private use (x-...) and return if so 266 if (subtag.equalsIgnoreCase("x")) { 267 getExtension(subtag, st, 1); 268 return this; 269 } 270 271 // check that language subtag is valid 272 if (!ALPHA.containsAll(subtag) || subtag.length() < 2) { 273 throwError(subtag, "Invalid language subtag"); 274 } 275 try { // The try block is to catch the out-of-tokens case. Easier than checking each time. 276 language = subtag; 277 subtag = getSubtag(st); // prepare for next 278 279 // check for script, 4 letters 280 if (subtag.length() == 4 && ALPHA.containsAll(subtag)) { 281 script = subtag; 282 script = script.substring(0, 1).toUpperCase(Locale.ROOT) + script.substring(1); 283 subtag = getSubtag(st); // prepare for next 284 } 285 286 // check for region, 2 letters or 3 digits 287 if (subtag.length() == 2 && ALPHA.containsAll(subtag) 288 || subtag.length() == 3 && DIGIT.containsAll(subtag)) { 289 region = subtag.toUpperCase(Locale.ENGLISH); 290 subtag = getSubtag(st); // prepare for next 291 } 292 293 // get variants: length > 4 or len=4 & starts with digit 294 while (isValidVariant(subtag)) { 295 variants.add(subtag); 296 subtag = getSubtag(st); // prepare for next 297 } 298 299 // get extensions: singleton '-' subtag (2-8 long) 300 while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) { 301 subtag = getExtension(subtag, st, 2); 302 if (subtag == null) return this; // done 303 } 304 305 if (subtag.equalsIgnoreCase("x")) { 306 getExtension(subtag, st, 1); 307 return this; 308 } 309 310 // if we make it to this point, then we have an error 311 throwError(subtag, "Illegal subtag"); 312 313 } catch (NoSuchElementException e) { 314 // this exception just means we ran out of tokens. That's ok, so we just return. 315 } 316 return this; 317 } 318 isValidVariant(String subtag)319 private boolean isValidVariant(String subtag) { 320 return subtag != null 321 && ALPHANUM.containsAll(subtag) 322 && (subtag.length() > 4 323 || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0))); 324 } 325 326 /** 327 * @return true iff the language tag validates 328 */ isValid()329 public boolean isValid() { 330 return LocaleValidator.isValid(this, null, null); 331 } 332 333 public enum Status { 334 WELL_FORMED, 335 VALID, 336 CANONICAL, 337 MINIMAL 338 } 339 getStatus(Set<String> errors)340 public Status getStatus(Set<String> errors) { 341 return getStatus(errors, Collections.emptySet()); 342 } 343 getStatus(Set<String> errors, Set<Validity.Status> allowed)344 public Status getStatus(Set<String> errors, Set<Validity.Status> allowed) { 345 errors.clear(); 346 if (!isValid()) { 347 return Status.WELL_FORMED; 348 // TODO, check the bcp47 extension codes also 349 } 350 351 if (SDI == null) { 352 SDI = SupplementalDataInfo.getInstance(); 353 } 354 Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo(); 355 Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language"); 356 357 if (aliasInfo.get("language").containsKey(language)) { 358 errors.add("Non-canonical language: " + language); 359 } 360 Map<String, String> lstrInfo = languageInfo.get(language); 361 if (lstrInfo != null) { 362 String scope = lstrInfo.get("Scope"); 363 if ("collection".equals(scope)) { 364 errors.add("Collection language: " + language); 365 } 366 } 367 if (aliasInfo.get("script").containsKey(script)) { 368 errors.add("Non-canonical script: " + script); 369 } 370 if (aliasInfo.get("territory").containsKey(region)) { 371 errors.add("Non-canonical region: " + region); 372 } 373 if (!errors.isEmpty()) { 374 return Status.VALID; 375 } 376 String tag = 377 language 378 + (script.isEmpty() ? "" : "_" + script) 379 + (region.isEmpty() ? "" : "_" + region); 380 String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false); 381 if (minimized == null) { 382 errors.add("No minimal data for:" + tag); 383 if (script.isEmpty() && region.isEmpty()) { 384 return Status.MINIMAL; 385 } else { 386 return Status.CANONICAL; 387 } 388 } 389 if (!tag.equals(minimized)) { 390 errors.add("Not minimal:" + tag + "-->" + minimized); 391 return Status.CANONICAL; 392 } 393 return Status.MINIMAL; 394 } 395 396 /** 397 * Internal method 398 * 399 * @param minLength TODO 400 */ getExtension(String subtag, StringTokenizer st, int minLength)401 private String getExtension(String subtag, StringTokenizer st, int minLength) { 402 String base = subtag; 403 final char extension = subtag.charAt(0); 404 if (extensions.containsKey(subtag)) { 405 throwError(subtag, "Can't have two extensions with the same key"); 406 } 407 if (!st.hasMoreElements()) { 408 throwError(subtag, "Private Use / Extension requires subsequent subtag"); 409 } 410 boolean takesSubkeys = extension == 'u' || extension == 't'; 411 boolean firstT = extension == 't'; 412 boolean haveContents = false; 413 List<String> result = new ArrayList<>(); 414 try { 415 while (st.hasMoreElements()) { 416 subtag = getSubtag(st); 417 if (subtag.length() < minLength) { 418 return subtag; 419 } 420 if (takesSubkeys 421 && subtag.length() == 2 422 && (!firstT || isTKey(subtag))) { // start new key-value pair 423 if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u- 424 localeExtensions.put(base, ImmutableList.copyOf(result)); 425 haveContents = true; 426 result.clear(); 427 } 428 base = subtag; 429 continue; 430 } 431 firstT = false; 432 result.add(subtag); 433 } 434 return null; 435 } finally { 436 if (takesSubkeys) { 437 if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u- 438 localeExtensions.put(base, ImmutableList.copyOf(result)); 439 haveContents = true; 440 } 441 if (!haveContents) { 442 throw new IllegalArgumentException("extension must not be empty: " + base); 443 } 444 } else { 445 if (result.isEmpty()) { 446 throw new IllegalArgumentException("extension must not be empty: " + base); 447 } 448 extensions.put(base, ImmutableList.copyOf(result)); 449 } 450 } 451 } 452 453 /** Internal method */ getSubtag(StringTokenizer st)454 private String getSubtag(StringTokenizer st) { 455 String result = st.nextToken(); 456 if (result.length() < 1 || result.length() > 8) { 457 throwError(result, "Illegal length (must be 1..8)"); 458 } 459 if (!ALPHANUM.containsAll(result)) { 460 throwError( 461 result, 462 "Illegal characters (" 463 + new UnicodeSet().addAll(result).removeAll(ALPHANUM) 464 + ")"); 465 } 466 return result; 467 } 468 469 /** Internal method */ throwError(String subtag, String errorText)470 private void throwError(String subtag, String errorText) { 471 throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original); 472 } 473 setRegion(String region)474 public LanguageTagParser setRegion(String region) { 475 this.region = region; 476 return this; 477 } 478 setScript(String script)479 public LanguageTagParser setScript(String script) { 480 this.script = script; 481 return this; 482 } 483 484 public enum OutputOption { 485 ICU('_'), 486 ICU_LCVARIANT('_'), 487 BCP47('-'); 488 final char separator; 489 final Joiner joiner; 490 OutputOption(char separator)491 private OutputOption(char separator) { 492 this.separator = separator; 493 joiner = Joiner.on(separator); 494 } 495 convert(Map<String, List<String>> mapToList)496 public Map<String, String> convert(Map<String, List<String>> mapToList) { 497 if (mapToList.isEmpty()) { 498 return Collections.emptyMap(); 499 } 500 ImmutableMap.Builder<String, String> builder = ImmutableMap.builder(); 501 for (Entry<String, List<String>> entry : mapToList.entrySet()) { 502 builder.put(entry.getKey(), joiner.join(entry.getValue())); 503 } 504 return builder.build(); 505 } 506 } 507 508 @Override toString()509 public String toString() { 510 return toString(OutputOption.ICU); 511 } 512 toString(OutputOption oo)513 public String toString(OutputOption oo) { 514 StringBuilder result = new StringBuilder(language); // optimize for the simple cases 515 if (this.script.length() != 0) result.append(oo.separator).append(script); 516 if (this.region.length() != 0) result.append(oo.separator).append(region); 517 if (this.variants.size() != 0) { 518 for (String variant : variants) { 519 result.append(oo.separator) 520 .append( 521 oo != OutputOption.ICU 522 ? variant 523 : variant.toUpperCase(Locale.ROOT)); 524 } 525 } 526 boolean haveAt = false; 527 boolean needSep = false; 528 529 StringBuilder extensionsAfterU = null; 530 StringBuilder extensionX = null; 531 if (this.extensions.size() != 0) { 532 StringBuilder target = result; 533 for (Entry<String, List<String>> extension : extensions.entrySet()) { 534 String key = extension.getKey(); 535 String value = oo.joiner.join(extension.getValue()); 536 switch (key) { 537 case "v": 538 case "w": 539 case "y": 540 case "z": 541 if (extensionsAfterU == null) { 542 extensionsAfterU = new StringBuilder(); 543 } 544 target = extensionsAfterU; 545 break; 546 case "x": 547 if (extensionX == null) { 548 extensionX = new StringBuilder(); 549 } 550 target = extensionX; 551 break; 552 default: 553 // no action; we already have target set right for earlier items. 554 } 555 if (oo == OutputOption.BCP47) { 556 target.append(oo.separator).append(key).append(oo.separator).append(value); 557 } else { 558 if (!haveAt) { 559 target.append('@'); 560 haveAt = true; 561 } 562 if (needSep) { 563 target.append(";"); 564 } else { 565 needSep = true; 566 } 567 target.append(key).append('=').append(value); 568 } 569 } 570 } 571 if (this.localeExtensions.size() != 0) { 572 if (oo == OutputOption.BCP47) { 573 List<String> tValue = localeExtensions.get("t"); 574 if (tValue != null) { 575 result.append(oo.separator) 576 .append('t') 577 .append(oo.separator) 578 .append(oo.joiner.join(tValue)); 579 for (Entry<String, List<String>> extension : localeExtensions.entrySet()) { 580 String key = extension.getKey(); 581 if (isTKey(key)) { 582 String value = oo.joiner.join(extension.getValue()); 583 result.append(oo.separator) 584 .append(key) 585 .append(oo.separator) 586 .append(value); 587 } 588 } 589 } 590 boolean haveU = false; 591 for (Entry<String, List<String>> extension : localeExtensions.entrySet()) { 592 if (!haveU) { 593 List<String> uValue = localeExtensions.get("u"); 594 result.append(oo.separator).append('u'); 595 if (uValue != null) { 596 result.append(oo.separator).append(oo.joiner.join(uValue)); 597 } 598 haveU = true; 599 } 600 String key = extension.getKey(); 601 if (key.length() == 2 && key.charAt(1) >= 'a') { 602 String value = oo.joiner.join(extension.getValue()); 603 result.append(oo.separator).append(key).append(oo.separator).append(value); 604 } 605 } 606 } else { 607 if (!haveAt) { 608 result.append('@'); 609 } 610 for (Entry<String, List<String>> extension : localeExtensions.entrySet()) { 611 if (needSep) { 612 result.append(";"); 613 } else { 614 needSep = true; 615 } 616 String key = extension.getKey(); 617 String value = oo.joiner.join(extension.getValue()); 618 result.append(key.toUpperCase(Locale.ROOT)) 619 .append('=') 620 .append(value.toUpperCase(Locale.ROOT)); 621 } 622 } 623 } 624 // do extensions after u, with x last 625 if (extensionsAfterU != null) { 626 result.append(extensionsAfterU); 627 } 628 if (extensionX != null) { 629 result.append(extensionX); 630 } 631 return result.toString(); 632 } 633 isTKey(String key)634 public static boolean isTKey(String key) { 635 return key.length() == 2 && key.charAt(1) < 'a'; 636 } 637 hasT()638 public boolean hasT() { 639 for (String key : localeExtensions.keySet()) { 640 if (key.equals("t") || isTKey(key)) { 641 return true; 642 } 643 } 644 return false; 645 } 646 647 /** 648 * Return just the language, script, and region (no variants or extensions) 649 * 650 * @return 651 */ toLSR()652 public String toLSR() { 653 String result = language; // optimize for the simple cases 654 if (this.script.length() != 0) result += "_" + script; 655 if (this.region.length() != 0) result += "_" + region; 656 return result; 657 } 658 659 public enum Fields { 660 LANGUAGE, 661 SCRIPT, 662 REGION, 663 VARIANTS 664 } 665 666 public static Set<Fields> LANGUAGE_SCRIPT = 667 Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT)); 668 public static Set<Fields> LANGUAGE_REGION = 669 Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION)); 670 public static Set<Fields> LANGUAGE_SCRIPT_REGION = 671 Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT, Fields.REGION)); 672 toString(Set<Fields> selection)673 public String toString(Set<Fields> selection) { 674 String result = language; 675 if (selection.contains(Fields.SCRIPT) && script.length() != 0) result += "_" + script; 676 if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region; 677 if (selection.contains(Fields.VARIANTS) && variants.size() != 0) { 678 for (String variant : (Collection<String>) variants) { 679 result += "_" + variant; 680 } 681 } 682 return result; 683 } 684 setLanguage(String language)685 public LanguageTagParser setLanguage(String language) { 686 if (SEPARATORS.containsSome(language)) { 687 String oldScript = script; 688 String oldRegion = region; 689 Set<String> oldVariants = variants; 690 set(language); 691 if (script.length() == 0) { 692 script = oldScript; 693 } 694 if (region.length() == 0) { 695 region = oldRegion; 696 } 697 if (oldVariants.size() != 0) { 698 variants = oldVariants; 699 } 700 } else { 701 this.language = language; 702 } 703 return this; 704 } 705 setLocaleExtensions(Map<String, String> localeExtensions)706 public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) { 707 this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE); 708 return this; 709 } 710 setVariants(Collection<String> newVariants)711 public LanguageTagParser setVariants(Collection<String> newVariants) { 712 for (String variant : newVariants) { 713 if (!isValidVariant(variant)) { 714 throw new IllegalArgumentException("Illegal variant: " + variant); 715 } 716 } 717 variants.clear(); 718 variants.addAll(newVariants); 719 return this; 720 } 721 722 static final Pattern EXTENSION_PATTERN = 723 PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?"); 724 setExtensions(Map<String, String> newExtensions)725 public LanguageTagParser setExtensions(Map<String, String> newExtensions) { 726 this.extensions = expandMap(newExtensions, 2, 8); 727 return this; 728 } 729 getSimpleParent(String s)730 public static String getSimpleParent(String s) { 731 int lastBar = s.lastIndexOf('_'); 732 return lastBar >= 0 ? s.substring(0, lastBar) : ""; 733 } 734 expandMap( Map<String, String> newLocaleExtensions, int minLength, int maxLength)735 private Map<String, List<String>> expandMap( 736 Map<String, String> newLocaleExtensions, int minLength, int maxLength) { 737 if (newLocaleExtensions.isEmpty()) { 738 return Collections.emptyMap(); 739 } 740 ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder(); 741 for (Entry<String, String> entry : newLocaleExtensions.entrySet()) { 742 result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength)); 743 } 744 return result.build(); 745 } 746 split(String value, int minLength, int maxLength)747 private List<String> split(String value, int minLength, int maxLength) { 748 List<String> values = SPLIT_BAR.splitToList(value); 749 for (String s : values) { 750 if (s.length() < minLength || s.length() > maxLength) { 751 throw new IllegalArgumentException("Illegal subtag length for: " + s); 752 } 753 if (!ALPHANUM.containsAll(s)) { 754 throw new IllegalArgumentException("Illegal locale character in: " + s); 755 } 756 } 757 return values; 758 } 759 760 public enum Format { 761 icu("_", "_"), 762 bcp47("-", "-"), 763 structure("; ", "="); 764 public final String separator; 765 public final String separator2; 766 Format(String separator, String separator2)767 private Format(String separator, String separator2) { 768 this.separator = separator; 769 this.separator2 = separator2; 770 } 771 } 772 toString(Format format)773 public String toString(Format format) { 774 StringBuilder result = new StringBuilder(); 775 if (format == Format.structure) { 776 result.append("["); 777 } 778 appendField(format, result, "language", language); 779 appendField(format, result, "script", script); 780 appendField(format, result, "region", region); 781 appendField(format, result, "variants", variants); 782 appendField(format, result, "extensions", extensions, new UnicodeSet('a', 's')); 783 appendField(format, result, "localeX", localeExtensions, null); 784 appendField(format, result, "extensions", extensions, new UnicodeSet('v', 'w', 'y', 'z')); 785 appendField(format, result, "extensions", extensions, new UnicodeSet('x', 'x')); 786 if (format == Format.structure) { 787 result.append("]"); 788 } 789 // if (script.length() != 0) { 790 // result. += "_" + script; 791 // } 792 // if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + 793 // region; 794 // if (selection.contains(Fields.VARIANTS) && variants.size() != 0) { 795 // for (String variant : (Collection<String>) variants) { 796 // result += "_" + variant; 797 // } 798 // } 799 return result.toString(); 800 } 801 appendField( Format format, StringBuilder result, String fieldName, String fieldValue)802 private void appendField( 803 Format format, StringBuilder result, String fieldName, String fieldValue) { 804 if (!fieldValue.isEmpty()) { 805 if (result.length() > 1) { 806 result.append(format.separator); 807 } 808 if (format == Format.structure) { 809 result.append(fieldName).append("="); 810 } 811 result.append(fieldValue); 812 } 813 } 814 appendFieldKey( Format format, StringBuilder result, String fieldName, String fieldValue)815 private void appendFieldKey( 816 Format format, StringBuilder result, String fieldName, String fieldValue) { 817 result.append(format.separator) 818 .append(fieldName) 819 .append(format.separator2) 820 .append(fieldValue); 821 } 822 appendField( Format format, StringBuilder result, String fieldName, Collection<String> fieldValues)823 private void appendField( 824 Format format, StringBuilder result, String fieldName, Collection<String> fieldValues) { 825 if (!fieldValues.isEmpty()) { 826 appendField(format, result, fieldName, Joiner.on(",").join(fieldValues)); 827 } 828 } 829 830 /** null match means it is -t- or -u- */ appendField( Format format, StringBuilder result, String fieldName, Map<String, List<String>> fieldValues, UnicodeSet match)831 private void appendField( 832 Format format, 833 StringBuilder result, 834 String fieldName, 835 Map<String, List<String>> fieldValues, 836 UnicodeSet match) { 837 if (match == null && format != Format.structure) { 838 List<String> tLang = fieldValues.get("t"); 839 List<String> uSpecial = fieldValues.get("u"); 840 boolean haveTLang = tLang != null; 841 boolean haveUSpecial = uSpecial != null; 842 843 // do all the keys ending with digits first 844 boolean haveT = false; 845 boolean haveU = false; 846 StringBuilder result2 = new StringBuilder(); // put -u- at end 847 for (Entry<String, List<String>> entry : fieldValues.entrySet()) { 848 String key = entry.getKey(); 849 if (key.length() < 2) { 850 continue; 851 } 852 int lastChar = key.codePointBefore(key.length()); 853 if (lastChar < 'a') { 854 if (!haveT) { 855 result.append(format.separator).append('t'); 856 if (haveTLang) { // empty is illegal, but just in case 857 result.append(format.separator) 858 .append(Joiner.on(format.separator).join(tLang)); 859 haveTLang = false; 860 } 861 haveT = true; 862 } 863 appendFieldKey( 864 format, 865 result, 866 entry.getKey(), 867 Joiner.on(format.separator).join(entry.getValue())); 868 } else { 869 if (!haveU) { 870 result2.append(format.separator).append('u'); 871 if (haveUSpecial) { // not yet valid, but just in case 872 result2.append(format.separator) 873 .append(Joiner.on(format.separator).join(uSpecial)); 874 haveUSpecial = false; 875 } 876 haveU = true; 877 } 878 appendFieldKey( 879 format, 880 result2, 881 entry.getKey(), 882 Joiner.on(format.separator).join(entry.getValue())); 883 } 884 } 885 if (haveTLang) { 886 result.append(format.separator) 887 .append('t') 888 .append(format.separator) 889 .append(Joiner.on(format.separator).join(tLang)); 890 } 891 if (haveUSpecial) { 892 result2.append(format.separator) 893 .append('u') 894 .append(format.separator) 895 .append(Joiner.on(format.separator).join(uSpecial)); 896 } 897 result.append(result2); // put in right order 898 } else { 899 for (Entry<String, List<String>> entry : fieldValues.entrySet()) { 900 if (match == null || match.contains(entry.getKey())) { 901 appendFieldKey( 902 format, 903 result, 904 entry.getKey(), 905 Joiner.on(format.separator).join(entry.getValue())); 906 } 907 } 908 } 909 } 910 /** 911 * Return the script of the locale (without creating a CLDRFile). Note that for ja, the script 912 * is Jpan; for ko, Kore; and zh/yue, either Hant or Hans. <br> 913 * TODO optimize if needed 914 */ getResolvedScript()915 public String getResolvedScript() { 916 if (!script.isEmpty()) { 917 return script; 918 } 919 LanguageTagParser ltp2 = new LanguageTagParser().set(toLSR()); 920 new LikelySubtags().maximize(ltp2); 921 return ltp2.script; 922 } 923 } 924