xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/SimpleLocaleParser.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import java.util.Arrays;
4 import java.util.Collections;
5 import java.util.HashSet;
6 import java.util.LinkedHashMap;
7 import java.util.List;
8 import java.util.Locale;
9 import java.util.Map;
10 import java.util.regex.Matcher;
11 import java.util.regex.Pattern;
12 import org.unicode.cldr.util.LocaleNames;
13 import org.unicode.cldr.util.PatternCache;
14 
15 /**
16  * Parse Locales, extended to BCP 47 and CLDR. Also normalizes the case of the results. Only does
17  * syntactic parse: does not replace deprecated elements; does not check for validity. Will throw
18  * IllegalArgumentException for duplicate variants and extensions.
19  *
20  * @author markdavis
21  */
22 class SimpleLocaleParser {
23     // mechanically generated regex -- don't worry about trying to read it!
24     // if we want to allow multiple --, change [-_] into [-_]+
25     private static final Pattern rootPattern =
26             Pattern.compile(
27                     "(?:"
28                             + " (?: ( [a-z]{2,8} )"
29                             + // language
30                             "   (?: [-_] ( [a-z]{4} ) )?"
31                             + // script
32                             "   (?: [-_] ( [a-z]{2} | [0-9]{3} ) )?"
33                             + // region
34                             "   (?: [-_] ( (?: [a-z 0-9]{5,8} | [0-9] [a-z 0-9]{3} ) (?: [-_] (?: [a-z 0-9]{5,8} | [0-9] [a-z 0-9]{3} ) )* ) )?"
35                             + // variant(s)
36                             "   (?: [-_] ( [a-w y-z] (?: [-_] [a-z 0-9]{2,8} )+ (?: [-_] [a-w y-z] (?: [-_] [a-z 0-9]{2,8} )+ )* ) )?"
37                             + // extensions
38                             "   (?: [-_] ( x (?: [-_] [a-z 0-9]{1,8} )+ ) )? )"
39                             + // private use
40                             " | ( x (?: [-_] [a-z 0-9]{1,8} )+ )"
41                             + // private use
42                             " | ( en [-_] GB [-_] oed"
43                             + // legacy gorp
44                             "   | i [-_] (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )"
45                             + "   | no [-_] (?: bok | nyn )"
46                             + "   | sgn [-_] (?: BE [-_] (?: fr | nl) | CH [-_] de )"
47                             + "   | zh [-_] (?: cmn (?: [-_] Hans | [-_] Hant )? | gan | min (?: [-_] nan)? | wuu | yue ) ) )"
48                             + " (?: \\@ ((?: [a-z 0-9]+ \\= [a-z 0-9]+) (?: \\; (?: [a-z 0-9]+ \\= [a-z 0-9]+))*))?"
49                             + // CLDR/ICU
50                             // keywords
51                             "",
52                     Pattern.COMMENTS
53                             | Pattern.CASE_INSENSITIVE); // TODO change above to be lowercase, since
54     // source is
55     // already when we compare
56     // Other regex patterns for splitting apart lists of items detected above.
57     private static final Pattern variantSeparatorPattern = PatternCache.get("[-_]");
58     private static final Pattern extensionPattern =
59             Pattern.compile(
60                     "([a-z]) [-_] ( [a-z 0-9]{2,8} (?:[-_] [a-z 0-9]{2,8})* )", Pattern.COMMENTS);
61     private static final Pattern privateUsePattern =
62             Pattern.compile(
63                     "(x) [-_] ( [a-z 0-9]{1,8} (?:[-_] [a-z 0-9]{1,8})* )", Pattern.COMMENTS);
64     private static final Pattern keywordPattern =
65             Pattern.compile("([a-z 0-9]+) \\= ([a-z 0-9]+)", Pattern.COMMENTS);
66 
67     /** The fields set by set(). */
68     private String language;
69 
70     private String script;
71     private String region;
72     private List<String> variants;
73     private Map<String, String> extensions;
74 
75     /**
76      * Set the object to the source.
77      *
78      * <p>Example (artificially complicated):
79      *
80      * <pre>
81      * myParser.set(&quot;zh-Hans-HK-SCOUSE-a-foobar-x-a-en@collation=phonebook;calendar=islamic&quot;);
82      * String language = myParser.getLanguage();
83      * </pre>
84      *
85      * @param source
86      * @return
87      */
set(String source)88     public boolean set(String source) {
89         source = source.toLowerCase(Locale.ENGLISH);
90         Matcher root = rootPattern.matcher(source);
91         if (!root.matches()) {
92             return false;
93         }
94         language = root.group(1);
95         if (language == null) {
96             language = root.group(8); // marked as “Type: grandfathered” in BCP 47
97             if (language == null) {
98                 language = LocaleNames.UND; // placeholder for completely private use
99             }
100         }
101         script = root.group(2);
102         if (script == null) {
103             script = "";
104         } else {
105             script = script.substring(0, 1).toUpperCase(Locale.ENGLISH) + script.substring(1);
106         }
107         region = root.group(3);
108         if (region == null) {
109             region = "";
110         } else {
111             region = region.toUpperCase(Locale.ENGLISH);
112         }
113         final String variantList = root.group(4);
114         if (variantList == null) {
115             variants = Collections.emptyList();
116         } else {
117             // make uppercase for compatibility with CLDR.
118             variants =
119                     Arrays.asList(
120                             variantSeparatorPattern.split(variantList.toUpperCase(Locale.ENGLISH)));
121             // check for duplicate variants
122             if (new HashSet<>(variants).size() != variants.size()) {
123                 throw new IllegalArgumentException("Duplicate variants");
124             }
125         }
126         extensions = new LinkedHashMap<>(); // group 5 are extensions, 6 is private use
127         // extensions are a bit more complicated
128         addExtensions(root.group(5), extensionPattern);
129         addExtensions(root.group(6), privateUsePattern);
130         addExtensions(root.group(7), privateUsePattern);
131         addExtensions(root.group(9), keywordPattern);
132         extensions = Collections.unmodifiableMap(extensions);
133         return true;
134     }
135 
addExtensions(String item, Pattern pattern)136     private void addExtensions(String item, Pattern pattern) {
137         if (item != null) {
138             Matcher extension = pattern.matcher(item);
139             while (extension.find()) {
140                 final String key = extension.group(1);
141                 // check for duplicate keys
142                 if (extensions.containsKey(key)) {
143                     throw new IllegalArgumentException("duplicate key: " + key);
144                 }
145                 extensions.put(key, extension.group(2));
146             }
147         }
148     }
149 
150     /**
151      * Return BCP 47 language subtag (may be ISO registered code). If the language tag is irregular,
152      * then the entire tag is in the language field. If the entire code is private use, then the
153      * language code is "und". Examples:
154      *
155      * <table style="border-width:1; border-style:collapse">
156      * <tr>
157      * <th>Input String</th>
158      * <th>Parsed</th>
159      * </tr>
160      * <tr>
161      * <td>zh-cmn-Hans</td>
162      * <td>{language=zh-cmn-hans, script=, country=, variants=[], keywords={}}</td>
163      * </tr>
164      * <tr>
165      * <td>i-default@abc=def</td>
166      * <td>{language=i-default, script=, country=, variants=[], keywords={abc=def}}</td>
167      * </tr>
168      * <tr>
169      * <td>x-foobar@abc=def</td>
170      * <td>{language=und, script=, country=, variants=[], keywords={x=foobar, abc=def}}</td>
171      * </tr>
172      * </table>
173      *
174      * @return language subtag, lowercased.
175      */
getLanguage()176     public String getLanguage() {
177         return language;
178     }
179 
180     /**
181      * Return BCP 47 script subtag (may be ISO or UN)
182      *
183      * @return script subtag, titlecased.
184      */
getScript()185     public String getScript() {
186         return script;
187     }
188 
189     /**
190      * Return BCP 47 region subtag (may be ISO or UN)
191      *
192      * @return country (region) subtag, uppercased.
193      */
getCountry()194     public String getCountry() {
195         return region;
196     }
197 
198     /**
199      * Return immutable list of BCP 47 variants
200      *
201      * @return list of uppercased variants.
202      */
getVariants()203     public List<String> getVariants() {
204         return variants;
205     }
206 
207     /**
208      * Return the first variant, for compatibility
209      *
210      * @return first (uppercased) variant
211      */
getVariant()212     public String getVariant() {
213         return variants.size() == 0 ? "" : variants.iterator().next();
214     }
215 
216     /**
217      * Return immutable map of key/value extensions. Includes BCP 47 extensions and private use,
218      * also locale keyword extensions. If the entire code is private use, then the language is set
219      * to "und" for consistency.
220      *
221      * <p>Example:
222      *
223      * <table style="border-width:1; border-style:collapse">
224      * <tr>
225      * <th>Input String</th>
226      * <th>Parsed</th>
227      * </tr>
228      * <tr>
229      * <td>zh-Hans-HK-SCOUSE-a-foobar-x-a-en@collation=phonebook;calendar=islamic</td>
230      * <td>{language=zh, script=Hans, country=HK, variants=[SCOUSE], keywords={a=foobar, x=a-en, collation=phonebook,
231      * calendar=islamic}}</td>
232      * </tr>
233      * </table>
234      *
235      * @return map of key/value pairs, lowercased.
236      */
getExtensions()237     public Map<String, String> getExtensions() {
238         return extensions;
239     }
240 
241     @Override
toString()242     public String toString() {
243         return "{language="
244                 + language
245                 + ", script="
246                 + script
247                 + ", country="
248                 + region
249                 + ", variants="
250                 + variants
251                 + ", keywords="
252                 + extensions
253                 + "}";
254     }
255 }
256