xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/LanguageTagParser.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 /*
2  **********************************************************************
3  * Copyright (c) 2002-2011, International Business Machines
4  * Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  * Author: Mark Davis
7  **********************************************************************
8  */
9 package org.unicode.cldr.util;
10 
11 import com.google.common.base.CharMatcher;
12 import com.google.common.base.Joiner;
13 import com.google.common.base.Splitter;
14 import com.google.common.collect.ImmutableList;
15 import com.google.common.collect.ImmutableMap;
16 import com.ibm.icu.impl.Row.R2;
17 import com.ibm.icu.text.UnicodeSet;
18 import java.util.ArrayList;
19 import java.util.Collection;
20 import java.util.Collections;
21 import java.util.Comparator;
22 import java.util.EnumSet;
23 import java.util.Iterator;
24 import java.util.List;
25 import java.util.Locale;
26 import java.util.Map;
27 import java.util.Map.Entry;
28 import java.util.NoSuchElementException;
29 import java.util.Set;
30 import java.util.StringTokenizer;
31 import java.util.TreeMap;
32 import java.util.TreeSet;
33 import java.util.regex.Pattern;
34 import org.unicode.cldr.tool.LikelySubtags;
35 
36 public class LanguageTagParser {
37 
    /** Joins strings with a single hyphen ({@code '-'}) between elements. */
    private static final Joiner HYPHEN_JOINER = Joiner.on('-');
39 
40     private static final Comparator<? super String> EXTENSION_ORDER =
41             new Comparator<String>() {
42 
43                 @Override
44                 public int compare(String o1, String o2) {
45                     int diff = getBucket(o1) - getBucket(o2);
46                     if (diff != 0) {
47                         return diff;
48                     }
49                     return o1.compareTo(o2);
50                 }
51 
52                 private int getBucket(String o1) {
53                     switch (o1.length()) {
54                         case 1:
55                             return o1.charAt(0) == 't' ? 0 : 2;
56                         case 2:
57                             return o1.charAt(1) <= '9' ? 1 : 3;
58                         default:
59                             throw new IllegalArgumentException();
60                     }
61                 }
62             };
63 
64     /**
65      * @return Returns the language, or "" if none.
66      */
getLanguage()67     public String getLanguage() {
68         return language;
69     }
70 
71     /**
72      * @return Returns the script, or "" if none.
73      */
getScript()74     public String getScript() {
75         return script;
76     }
77 
78     /**
79      * @return Returns the region, or "" if none.
80      */
getRegion()81     public String getRegion() {
82         return region;
83     }
84 
85     /**
86      * @return Returns the variants.
87      */
getVariants()88     public List<String> getVariants() {
89         return ImmutableList.copyOf(variants);
90     }
91 
92     /**
93      * @return True if the language tag is marked as “Type: grandfathered” in BCP 47.
94      */
isLegacy()95     public boolean isLegacy() {
96         return legacy;
97     }
98 
99     /**
100      * @return Returns the extensions.
101      */
102     @Deprecated
getExtensions()103     public Map<String, String> getExtensions() {
104         return OutputOption.ICU.convert(extensions);
105     }
106 
107     /**
108      * @return Returns the localeExtensions.
109      */
110     @Deprecated
getLocaleExtensions()111     public Map<String, String> getLocaleExtensions() {
112         return OutputOption.ICU.convert(localeExtensions);
113     }
114 
115     /**
116      * @return Returns the extensions.
117      */
getExtensionsDetailed()118     public Map<String, List<String>> getExtensionsDetailed() {
119         return ImmutableMap.copyOf(extensions);
120     }
121 
122     /**
123      * @return Returns the localeExtensions.
124      */
getLocaleExtensionsDetailed()125     public Map<String, List<String>> getLocaleExtensionsDetailed() {
126         return ImmutableMap.copyOf(localeExtensions);
127     }
128 
129     /**
130      * @return Returns the original, preparsed language tag
131      */
getOriginal()132     public String getOriginal() {
133         return original;
134     }
135 
136     /**
137      * @return Returns the language-script (or language) part of a tag.
138      */
getLanguageScript()139     public String getLanguageScript() {
140         if (script.length() != 0) return language + "_" + script;
141         return language;
142     }
143 
144     /**
145      * @param in Collection of language tag strings
146      * @return Returns each of the language-script tags in the collection.
147      */
getLanguageScript(Collection<String> in)148     public static Set<String> getLanguageScript(Collection<String> in) {
149         return getLanguageAndScript(in, null);
150     }
151 
152     /**
153      * @param in Collection of language tag strings
154      * @return Returns each of the language-script tags in the collection.
155      */
getLanguageAndScript(Collection<String> in, Set<String> output)156     public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) {
157         if (output == null) output = new TreeSet<>();
158         LanguageTagParser lparser = new LanguageTagParser();
159         for (Iterator<String> it = in.iterator(); it.hasNext(); ) {
160             output.add(lparser.set(it.next()).getLanguageScript());
161         }
162         return output;
163     }
164 
    // private fields

    // Tag text recorded by set(); also used by throwError() in its messages.
    private String original;
    // True when the last parsed tag was found in legacyCodes.
    private boolean legacy = false;
    // Core subtags. set() resets language, script and region to "" before parsing.
    private String language;
    private String script;
    private String region;
    // Variant subtags, kept sorted.
    private Set<String> variants = new TreeSet<>();
    // Extensions keyed by singleton; each value is the list of subtags for that key.
    private Map<String, List<String>> extensions = new TreeMap<>(); // use tree map
    // Locale extension keys and values, iterated in EXTENSION_ORDER.
    private Map<String, List<String>> localeExtensions = new TreeMap<>(EXTENSION_ORDER);

    // Character classes used for syntax checks.
    private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze();
    private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze();
    private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze();
    // Characters allowed in the value of an "@key=value" keyword.
    private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze();
    private static final UnicodeSet X = new UnicodeSet("[xX]").freeze();
    // Letters other than x/X.
    private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze();
    private static StandardCodes standardCodes = StandardCodes.make();
    // Codes reported by StandardCodes under the "legacy" type.
    private static final Set<String> legacyCodes = standardCodes.getAvailableCodes("legacy");
    private static final String separator = "-_"; // '-' alone for 3066bis language tags
    private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze();
    // Despite the name, splits on any character of `separator` ('-' or '_').
    private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator));
    // Despite the name, splits on ';'.
    private static final Splitter SPLIT_COLON = Splitter.on(';');
    private static final Splitter SPLIT_EQUAL = Splitter.on('=');
    // Assigned on first use in getStatus().
    private static SupplementalDataInfo SDI =
            null; // postpone assignment to avoid re-entrance of SupplementalDataInfo.getInstance
191 
192     /**
193      * Parses out a language tag, setting a number of fields that can subsequently be retrieved. If
194      * a private-use field is found, it is returned as the last extension.<br>
195      * This only checks for well-formedness (syntax), not for validity (subtags in registry). For
196      * the latter, see isValid.
197      *
198      * @param languageTag
199      * @return
200      */
set(String languageTag)201     public LanguageTagParser set(String languageTag) {
202         if (languageTag.length() == 0 || languageTag.equals("root")) {
203             // throw new IllegalArgumentException("Language tag cannot be empty");
204             //
205             // With ICU 64 the language tag for root is normalized to empty string so we
206             // cannot throw for empty string as above. However, code here and in clients
207             // assumes a non-empty language tag, so for now just map "" or "root" to "und".
208             languageTag = "und";
209         } else if (languageTag.startsWith("_") || languageTag.startsWith("-")) {
210             languageTag = "und" + languageTag;
211         }
212         languageTag = languageTag.toLowerCase(Locale.ROOT);
213 
214         // clear everything out
215         language = region = script = "";
216         legacy = false;
217         variants.clear();
218         extensions.clear();
219         localeExtensions.clear();
220         original = languageTag;
221         int atPosition = languageTag.indexOf('@');
222         if (atPosition >= 0) {
223             final String extensionsString =
224                     languageTag.substring(atPosition + 1).toLowerCase(Locale.ROOT);
225             for (String keyValue : SPLIT_COLON.split(extensionsString)) {
226                 final Iterator<String> keyValuePair = SPLIT_EQUAL.split(keyValue).iterator();
227                 final String key = keyValuePair.next();
228                 final String value = keyValuePair.next();
229                 if (keyValuePair.hasNext()
230                         || !ALPHANUM.containsAll(key)
231                         || !EXTENSION_VALUE.containsAll(value)) {
232                     throwError(keyValue, "Invalid key/value pair");
233                 }
234                 List<String> valueList = SPLIT_BAR.splitToList(value);
235                 switch (key.length()) {
236                     case 1:
237                         extensions.put(key, valueList);
238                         break;
239                     case 2:
240                         localeExtensions.put(key, valueList);
241                         break;
242                     default:
243                         throwError(keyValue, "Invalid key/value pair");
244                         break;
245                 }
246             }
247             languageTag = languageTag.substring(0, atPosition);
248         }
249 
250         if (legacyCodes.contains(languageTag)) {
251             language = languageTag;
252             legacy = true;
253             return this;
254         }
255 
256         // each time we fetch a token, we check for length from 1..8, and all alphanum
257         StringTokenizer st = new StringTokenizer(languageTag, separator);
258         String subtag;
259         try {
260             subtag = getSubtag(st);
261         } catch (Exception e1) {
262             throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1);
263         }
264 
265         // check for private use (x-...) and return if so
266         if (subtag.equalsIgnoreCase("x")) {
267             getExtension(subtag, st, 1);
268             return this;
269         }
270 
271         // check that language subtag is valid
272         if (!ALPHA.containsAll(subtag) || subtag.length() < 2) {
273             throwError(subtag, "Invalid language subtag");
274         }
275         try { // The try block is to catch the out-of-tokens case. Easier than checking each time.
276             language = subtag;
277             subtag = getSubtag(st); // prepare for next
278 
279             // check for script, 4 letters
280             if (subtag.length() == 4 && ALPHA.containsAll(subtag)) {
281                 script = subtag;
282                 script = script.substring(0, 1).toUpperCase(Locale.ROOT) + script.substring(1);
283                 subtag = getSubtag(st); // prepare for next
284             }
285 
286             // check for region, 2 letters or 3 digits
287             if (subtag.length() == 2 && ALPHA.containsAll(subtag)
288                     || subtag.length() == 3 && DIGIT.containsAll(subtag)) {
289                 region = subtag.toUpperCase(Locale.ENGLISH);
290                 subtag = getSubtag(st); // prepare for next
291             }
292 
293             // get variants: length > 4 or len=4 & starts with digit
294             while (isValidVariant(subtag)) {
295                 variants.add(subtag);
296                 subtag = getSubtag(st); // prepare for next
297             }
298 
299             // get extensions: singleton '-' subtag (2-8 long)
300             while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) {
301                 subtag = getExtension(subtag, st, 2);
302                 if (subtag == null) return this; // done
303             }
304 
305             if (subtag.equalsIgnoreCase("x")) {
306                 getExtension(subtag, st, 1);
307                 return this;
308             }
309 
310             // if we make it to this point, then we have an error
311             throwError(subtag, "Illegal subtag");
312 
313         } catch (NoSuchElementException e) {
314             // this exception just means we ran out of tokens. That's ok, so we just return.
315         }
316         return this;
317     }
318 
isValidVariant(String subtag)319     private boolean isValidVariant(String subtag) {
320         return subtag != null
321                 && ALPHANUM.containsAll(subtag)
322                 && (subtag.length() > 4
323                         || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0)));
324     }
325 
326     /**
327      * @return true iff the language tag validates
328      */
isValid()329     public boolean isValid() {
330         return LocaleValidator.isValid(this, null, null);
331     }
332 
    /** Result levels reported by {@code getStatus}, from weakest to strongest. */
    public enum Status {
        /** Returned by getStatus when isValid() is false. */
        WELL_FORMED,
        /** Valid, but getStatus recorded a non-canonical or collection-language error. */
        VALID,
        /** No such errors, but the language/script/region tag is not in minimized form. */
        CANONICAL,
        /** The language/script/region tag equals its minimized form. */
        MINIMAL
    }
339 
getStatus(Set<String> errors)340     public Status getStatus(Set<String> errors) {
341         return getStatus(errors, Collections.emptySet());
342     }
343 
getStatus(Set<String> errors, Set<Validity.Status> allowed)344     public Status getStatus(Set<String> errors, Set<Validity.Status> allowed) {
345         errors.clear();
346         if (!isValid()) {
347             return Status.WELL_FORMED;
348             // TODO, check the bcp47 extension codes also
349         }
350 
351         if (SDI == null) {
352             SDI = SupplementalDataInfo.getInstance();
353         }
354         Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo();
355         Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language");
356 
357         if (aliasInfo.get("language").containsKey(language)) {
358             errors.add("Non-canonical language: " + language);
359         }
360         Map<String, String> lstrInfo = languageInfo.get(language);
361         if (lstrInfo != null) {
362             String scope = lstrInfo.get("Scope");
363             if ("collection".equals(scope)) {
364                 errors.add("Collection language: " + language);
365             }
366         }
367         if (aliasInfo.get("script").containsKey(script)) {
368             errors.add("Non-canonical script: " + script);
369         }
370         if (aliasInfo.get("territory").containsKey(region)) {
371             errors.add("Non-canonical region: " + region);
372         }
373         if (!errors.isEmpty()) {
374             return Status.VALID;
375         }
376         String tag =
377                 language
378                         + (script.isEmpty() ? "" : "_" + script)
379                         + (region.isEmpty() ? "" : "_" + region);
380         String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false);
381         if (minimized == null) {
382             errors.add("No minimal data for:" + tag);
383             if (script.isEmpty() && region.isEmpty()) {
384                 return Status.MINIMAL;
385             } else {
386                 return Status.CANONICAL;
387             }
388         }
389         if (!tag.equals(minimized)) {
390             errors.add("Not minimal:" + tag + "-->" + minimized);
391             return Status.CANONICAL;
392         }
393         return Status.MINIMAL;
394     }
395 
    /**
     * Internal method. Consumes the subtags that follow an extension singleton and stores them.
     * For the {@code u} and {@code t} singletons, two-character subtags start new key/value
     * entries in {@code localeExtensions}; for any other singleton all subtags are collected as
     * one list in {@code extensions}.
     *
     * @param subtag the singleton that introduces the extension
     * @param st tokenizer positioned just after the singleton
     * @param minLength a subtag shorter than this ends the extension and is returned to the caller;
     *     set() passes 1 for private use (x) and 2 for other singletons
     * @return the subtag that ended the extension, or null if the tokens ran out
     * @throws IllegalArgumentException if the singleton is already present in {@code extensions},
     *     no subtag follows it, or the extension turns out to be empty
     */
    private String getExtension(String subtag, StringTokenizer st, int minLength) {
        // Key under which the values collected so far will be stored; starts as the singleton.
        String base = subtag;
        final char extension = subtag.charAt(0);
        if (extensions.containsKey(subtag)) {
            throwError(subtag, "Can't have two extensions with the same key");
        }
        if (!st.hasMoreElements()) {
            throwError(subtag, "Private Use / Extension requires subsequent subtag");
        }
        boolean takesSubkeys = extension == 'u' || extension == 't';
        // While true (t extension only), a two-character subtag starts a key only if isTKey()
        // accepts it; cleared once a subtag has been added as a value.
        boolean firstT = extension == 't';
        boolean haveContents = false;
        List<String> result = new ArrayList<>();
        try {
            while (st.hasMoreElements()) {
                subtag = getSubtag(st);
                if (subtag.length() < minLength) {
                    // Too short to belong to this extension: hand it back to the caller.
                    return subtag;
                }
                if (takesSubkeys
                        && subtag.length() == 2
                        && (!firstT || isTKey(subtag))) { // start new key-value pair
                    if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u-
                        localeExtensions.put(base, ImmutableList.copyOf(result));
                        haveContents = true;
                        result.clear();
                    }
                    base = subtag;
                    continue;
                }
                firstT = false;
                result.add(subtag);
            }
            return null;
        } finally {
            // Runs on every exit path (both returns and exceptions): stores the pending entry.
            // An exception thrown here replaces any exception already propagating from the try.
            if (takesSubkeys) {
                if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u-
                    localeExtensions.put(base, ImmutableList.copyOf(result));
                    haveContents = true;
                }
                if (!haveContents) {
                    throw new IllegalArgumentException("extension must not be empty: " + base);
                }
            } else {
                if (result.isEmpty()) {
                    throw new IllegalArgumentException("extension must not be empty: " + base);
                }
                extensions.put(base, ImmutableList.copyOf(result));
            }
        }
    }
452 
453     /** Internal method */
getSubtag(StringTokenizer st)454     private String getSubtag(StringTokenizer st) {
455         String result = st.nextToken();
456         if (result.length() < 1 || result.length() > 8) {
457             throwError(result, "Illegal length (must be 1..8)");
458         }
459         if (!ALPHANUM.containsAll(result)) {
460             throwError(
461                     result,
462                     "Illegal characters ("
463                             + new UnicodeSet().addAll(result).removeAll(ALPHANUM)
464                             + ")");
465         }
466         return result;
467     }
468 
469     /** Internal method */
throwError(String subtag, String errorText)470     private void throwError(String subtag, String errorText) {
471         throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original);
472     }
473 
setRegion(String region)474     public LanguageTagParser setRegion(String region) {
475         this.region = region;
476         return this;
477     }
478 
setScript(String script)479     public LanguageTagParser setScript(String script) {
480         this.script = script;
481         return this;
482     }
483 
484     public enum OutputOption {
485         ICU('_'),
486         ICU_LCVARIANT('_'),
487         BCP47('-');
488         final char separator;
489         final Joiner joiner;
490 
OutputOption(char separator)491         private OutputOption(char separator) {
492             this.separator = separator;
493             joiner = Joiner.on(separator);
494         }
495 
convert(Map<String, List<String>> mapToList)496         public Map<String, String> convert(Map<String, List<String>> mapToList) {
497             if (mapToList.isEmpty()) {
498                 return Collections.emptyMap();
499             }
500             ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
501             for (Entry<String, List<String>> entry : mapToList.entrySet()) {
502                 builder.put(entry.getKey(), joiner.join(entry.getValue()));
503             }
504             return builder.build();
505         }
506     }
507 
508     @Override
toString()509     public String toString() {
510         return toString(OutputOption.ICU);
511     }
512 
toString(OutputOption oo)513     public String toString(OutputOption oo) {
514         StringBuilder result = new StringBuilder(language); // optimize for the simple cases
515         if (this.script.length() != 0) result.append(oo.separator).append(script);
516         if (this.region.length() != 0) result.append(oo.separator).append(region);
517         if (this.variants.size() != 0) {
518             for (String variant : variants) {
519                 result.append(oo.separator)
520                         .append(
521                                 oo != OutputOption.ICU
522                                         ? variant
523                                         : variant.toUpperCase(Locale.ROOT));
524             }
525         }
526         boolean haveAt = false;
527         boolean needSep = false;
528 
529         StringBuilder extensionsAfterU = null;
530         StringBuilder extensionX = null;
531         if (this.extensions.size() != 0) {
532             StringBuilder target = result;
533             for (Entry<String, List<String>> extension : extensions.entrySet()) {
534                 String key = extension.getKey();
535                 String value = oo.joiner.join(extension.getValue());
536                 switch (key) {
537                     case "v":
538                     case "w":
539                     case "y":
540                     case "z":
541                         if (extensionsAfterU == null) {
542                             extensionsAfterU = new StringBuilder();
543                         }
544                         target = extensionsAfterU;
545                         break;
546                     case "x":
547                         if (extensionX == null) {
548                             extensionX = new StringBuilder();
549                         }
550                         target = extensionX;
551                         break;
552                     default:
553                         // no action; we already have target set right for earlier items.
554                 }
555                 if (oo == OutputOption.BCP47) {
556                     target.append(oo.separator).append(key).append(oo.separator).append(value);
557                 } else {
558                     if (!haveAt) {
559                         target.append('@');
560                         haveAt = true;
561                     }
562                     if (needSep) {
563                         target.append(";");
564                     } else {
565                         needSep = true;
566                     }
567                     target.append(key).append('=').append(value);
568                 }
569             }
570         }
571         if (this.localeExtensions.size() != 0) {
572             if (oo == OutputOption.BCP47) {
573                 List<String> tValue = localeExtensions.get("t");
574                 if (tValue != null) {
575                     result.append(oo.separator)
576                             .append('t')
577                             .append(oo.separator)
578                             .append(oo.joiner.join(tValue));
579                     for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
580                         String key = extension.getKey();
581                         if (isTKey(key)) {
582                             String value = oo.joiner.join(extension.getValue());
583                             result.append(oo.separator)
584                                     .append(key)
585                                     .append(oo.separator)
586                                     .append(value);
587                         }
588                     }
589                 }
590                 boolean haveU = false;
591                 for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
592                     if (!haveU) {
593                         List<String> uValue = localeExtensions.get("u");
594                         result.append(oo.separator).append('u');
595                         if (uValue != null) {
596                             result.append(oo.separator).append(oo.joiner.join(uValue));
597                         }
598                         haveU = true;
599                     }
600                     String key = extension.getKey();
601                     if (key.length() == 2 && key.charAt(1) >= 'a') {
602                         String value = oo.joiner.join(extension.getValue());
603                         result.append(oo.separator).append(key).append(oo.separator).append(value);
604                     }
605                 }
606             } else {
607                 if (!haveAt) {
608                     result.append('@');
609                 }
610                 for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
611                     if (needSep) {
612                         result.append(";");
613                     } else {
614                         needSep = true;
615                     }
616                     String key = extension.getKey();
617                     String value = oo.joiner.join(extension.getValue());
618                     result.append(key.toUpperCase(Locale.ROOT))
619                             .append('=')
620                             .append(value.toUpperCase(Locale.ROOT));
621                 }
622             }
623         }
624         // do extensions after u, with x last
625         if (extensionsAfterU != null) {
626             result.append(extensionsAfterU);
627         }
628         if (extensionX != null) {
629             result.append(extensionX);
630         }
631         return result.toString();
632     }
633 
isTKey(String key)634     public static boolean isTKey(String key) {
635         return key.length() == 2 && key.charAt(1) < 'a';
636     }
637 
hasT()638     public boolean hasT() {
639         for (String key : localeExtensions.keySet()) {
640             if (key.equals("t") || isTKey(key)) {
641                 return true;
642             }
643         }
644         return false;
645     }
646 
647     /**
648      * Return just the language, script, and region (no variants or extensions)
649      *
650      * @return
651      */
toLSR()652     public String toLSR() {
653         String result = language; // optimize for the simple cases
654         if (this.script.length() != 0) result += "_" + script;
655         if (this.region.length() != 0) result += "_" + region;
656         return result;
657     }
658 
    /**
     * Parts of a tag that {@code toString(Set<Fields>)} can be asked to include. That method
     * always writes the language and tests the set only for SCRIPT, REGION and VARIANTS.
     */
    public enum Fields {
        LANGUAGE,
        SCRIPT,
        REGION,
        VARIANTS
    }
665 
    // Ready-made, unmodifiable field selections for toString(Set<Fields>).
    // NOTE(review): these are public and non-final, so they can be reassigned from outside the
    // class — confirm no caller relies on that before making them final.
    public static Set<Fields> LANGUAGE_SCRIPT =
            Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT));
    public static Set<Fields> LANGUAGE_REGION =
            Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION));
    public static Set<Fields> LANGUAGE_SCRIPT_REGION =
            Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT, Fields.REGION));
672 
toString(Set<Fields> selection)673     public String toString(Set<Fields> selection) {
674         String result = language;
675         if (selection.contains(Fields.SCRIPT) && script.length() != 0) result += "_" + script;
676         if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region;
677         if (selection.contains(Fields.VARIANTS) && variants.size() != 0) {
678             for (String variant : (Collection<String>) variants) {
679                 result += "_" + variant;
680             }
681         }
682         return result;
683     }
684 
setLanguage(String language)685     public LanguageTagParser setLanguage(String language) {
686         if (SEPARATORS.containsSome(language)) {
687             String oldScript = script;
688             String oldRegion = region;
689             Set<String> oldVariants = variants;
690             set(language);
691             if (script.length() == 0) {
692                 script = oldScript;
693             }
694             if (region.length() == 0) {
695                 region = oldRegion;
696             }
697             if (oldVariants.size() != 0) {
698                 variants = oldVariants;
699             }
700         } else {
701             this.language = language;
702         }
703         return this;
704     }
705 
setLocaleExtensions(Map<String, String> localeExtensions)706     public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) {
707         this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE);
708         return this;
709     }
710 
setVariants(Collection<String> newVariants)711     public LanguageTagParser setVariants(Collection<String> newVariants) {
712         for (String variant : newVariants) {
713             if (!isValidVariant(variant)) {
714                 throw new IllegalArgumentException("Illegal variant: " + variant);
715             }
716         }
717         variants.clear();
718         variants.addAll(newVariants);
719         return this;
720     }
721 
    /**
     * Matches either the empty string or one or more hyphen-separated alphanumeric subtags, each
     * 2 to 8 characters long.
     */
    static final Pattern EXTENSION_PATTERN =
            PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?");
724 
setExtensions(Map<String, String> newExtensions)725     public LanguageTagParser setExtensions(Map<String, String> newExtensions) {
726         this.extensions = expandMap(newExtensions, 2, 8);
727         return this;
728     }
729 
getSimpleParent(String s)730     public static String getSimpleParent(String s) {
731         int lastBar = s.lastIndexOf('_');
732         return lastBar >= 0 ? s.substring(0, lastBar) : "";
733     }
734 
expandMap( Map<String, String> newLocaleExtensions, int minLength, int maxLength)735     private Map<String, List<String>> expandMap(
736             Map<String, String> newLocaleExtensions, int minLength, int maxLength) {
737         if (newLocaleExtensions.isEmpty()) {
738             return Collections.emptyMap();
739         }
740         ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder();
741         for (Entry<String, String> entry : newLocaleExtensions.entrySet()) {
742             result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength));
743         }
744         return result.build();
745     }
746 
split(String value, int minLength, int maxLength)747     private List<String> split(String value, int minLength, int maxLength) {
748         List<String> values = SPLIT_BAR.splitToList(value);
749         for (String s : values) {
750             if (s.length() < minLength || s.length() > maxLength) {
751                 throw new IllegalArgumentException("Illegal subtag length for: " + s);
752             }
753             if (!ALPHANUM.containsAll(s)) {
754                 throw new IllegalArgumentException("Illegal locale character in: " + s);
755             }
756         }
757         return values;
758     }
759 
760     public enum Format {
761         icu("_", "_"),
762         bcp47("-", "-"),
763         structure("; ", "=");
764         public final String separator;
765         public final String separator2;
766 
Format(String separator, String separator2)767         private Format(String separator, String separator2) {
768             this.separator = separator;
769             this.separator2 = separator2;
770         }
771     }
772 
toString(Format format)773     public String toString(Format format) {
774         StringBuilder result = new StringBuilder();
775         if (format == Format.structure) {
776             result.append("[");
777         }
778         appendField(format, result, "language", language);
779         appendField(format, result, "script", script);
780         appendField(format, result, "region", region);
781         appendField(format, result, "variants", variants);
782         appendField(format, result, "extensions", extensions, new UnicodeSet('a', 's'));
783         appendField(format, result, "localeX", localeExtensions, null);
784         appendField(format, result, "extensions", extensions, new UnicodeSet('v', 'w', 'y', 'z'));
785         appendField(format, result, "extensions", extensions, new UnicodeSet('x', 'x'));
786         if (format == Format.structure) {
787             result.append("]");
788         }
789         //            if (script.length() != 0) {
790         //                result. += "_" + script;
791         //            }
792         //            if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" +
793         // region;
794         //            if (selection.contains(Fields.VARIANTS) && variants.size() != 0) {
795         //                for (String variant : (Collection<String>) variants) {
796         //                    result += "_" + variant;
797         //                }
798         //            }
799         return result.toString();
800     }
801 
appendField( Format format, StringBuilder result, String fieldName, String fieldValue)802     private void appendField(
803             Format format, StringBuilder result, String fieldName, String fieldValue) {
804         if (!fieldValue.isEmpty()) {
805             if (result.length() > 1) {
806                 result.append(format.separator);
807             }
808             if (format == Format.structure) {
809                 result.append(fieldName).append("=");
810             }
811             result.append(fieldValue);
812         }
813     }
814 
appendFieldKey( Format format, StringBuilder result, String fieldName, String fieldValue)815     private void appendFieldKey(
816             Format format, StringBuilder result, String fieldName, String fieldValue) {
817         result.append(format.separator)
818                 .append(fieldName)
819                 .append(format.separator2)
820                 .append(fieldValue);
821     }
822 
appendField( Format format, StringBuilder result, String fieldName, Collection<String> fieldValues)823     private void appendField(
824             Format format, StringBuilder result, String fieldName, Collection<String> fieldValues) {
825         if (!fieldValues.isEmpty()) {
826             appendField(format, result, fieldName, Joiner.on(",").join(fieldValues));
827         }
828     }
829 
830     /** null match means it is -t- or -u- */
appendField( Format format, StringBuilder result, String fieldName, Map<String, List<String>> fieldValues, UnicodeSet match)831     private void appendField(
832             Format format,
833             StringBuilder result,
834             String fieldName,
835             Map<String, List<String>> fieldValues,
836             UnicodeSet match) {
837         if (match == null && format != Format.structure) {
838             List<String> tLang = fieldValues.get("t");
839             List<String> uSpecial = fieldValues.get("u");
840             boolean haveTLang = tLang != null;
841             boolean haveUSpecial = uSpecial != null;
842 
843             // do all the keys ending with digits first
844             boolean haveT = false;
845             boolean haveU = false;
846             StringBuilder result2 = new StringBuilder(); // put -u- at end
847             for (Entry<String, List<String>> entry : fieldValues.entrySet()) {
848                 String key = entry.getKey();
849                 if (key.length() < 2) {
850                     continue;
851                 }
852                 int lastChar = key.codePointBefore(key.length());
853                 if (lastChar < 'a') {
854                     if (!haveT) {
855                         result.append(format.separator).append('t');
856                         if (haveTLang) { // empty is illegal, but just in case
857                             result.append(format.separator)
858                                     .append(Joiner.on(format.separator).join(tLang));
859                             haveTLang = false;
860                         }
861                         haveT = true;
862                     }
863                     appendFieldKey(
864                             format,
865                             result,
866                             entry.getKey(),
867                             Joiner.on(format.separator).join(entry.getValue()));
868                 } else {
869                     if (!haveU) {
870                         result2.append(format.separator).append('u');
871                         if (haveUSpecial) { // not yet valid, but just in case
872                             result2.append(format.separator)
873                                     .append(Joiner.on(format.separator).join(uSpecial));
874                             haveUSpecial = false;
875                         }
876                         haveU = true;
877                     }
878                     appendFieldKey(
879                             format,
880                             result2,
881                             entry.getKey(),
882                             Joiner.on(format.separator).join(entry.getValue()));
883                 }
884             }
885             if (haveTLang) {
886                 result.append(format.separator)
887                         .append('t')
888                         .append(format.separator)
889                         .append(Joiner.on(format.separator).join(tLang));
890             }
891             if (haveUSpecial) {
892                 result2.append(format.separator)
893                         .append('u')
894                         .append(format.separator)
895                         .append(Joiner.on(format.separator).join(uSpecial));
896             }
897             result.append(result2); // put in right order
898         } else {
899             for (Entry<String, List<String>> entry : fieldValues.entrySet()) {
900                 if (match == null || match.contains(entry.getKey())) {
901                     appendFieldKey(
902                             format,
903                             result,
904                             entry.getKey(),
905                             Joiner.on(format.separator).join(entry.getValue()));
906                 }
907             }
908         }
909     }
910     /**
911      * Return the script of the locale (without creating a CLDRFile). Note that for ja, the script
912      * is Jpan; for ko, Kore; and zh/yue, either Hant or Hans. <br>
913      * TODO optimize if needed
914      */
getResolvedScript()915     public String getResolvedScript() {
916         if (!script.isEmpty()) {
917             return script;
918         }
919         LanguageTagParser ltp2 = new LanguageTagParser().set(toLSR());
920         new LikelySubtags().maximize(ltp2);
921         return ltp2.script;
922     }
923 }
924