1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1997-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 04/01/97 aliu Creation.
15 * 08/21/98 stephen JDK 1.2 sync
16 * 12/08/98 rtg New Locale implementation and C API
17 * 03/15/99 damiba overhaul.
18 * 04/06/99 stephen changed setDefault() to realloc and copy
19 * 06/14/99 stephen Changed calls to ures_open for new params
20 * 07/21/99 stephen Modified setDefault() to propagate to C++
21 * 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22 * brought canonicalization code into line with spec
23 *****************************************************************************/
24
25 /*
26 POSIX's locale format, from putil.c: [no spaces]
27
28 ll [ _CC ] [ . MM ] [ @ VV]
29
30 l = lang, C = ctry, M = charmap, V = variant
31 */
32
33 #include "unicode/bytestream.h"
34 #include "unicode/errorcode.h"
35 #include "unicode/stringpiece.h"
36 #include "unicode/utypes.h"
37 #include "unicode/ustring.h"
38 #include "unicode/uloc.h"
39
40 #include "bytesinkutil.h"
41 #include "putilimp.h"
42 #include "ustr_imp.h"
43 #include "ulocimp.h"
44 #include "umutex.h"
45 #include "cstring.h"
46 #include "cmemory.h"
47 #include "locmap.h"
48 #include "uarrsort.h"
49 #include "uenumimp.h"
50 #include "uassert.h"
51 #include "charstr.h"
52
53 U_NAMESPACE_USE
54
55 /* ### Declarations **************************************************/
56
57 /* Locale stuff from locid.cpp */
58 U_CFUNC void locale_set_default(const char *id);
59 U_CFUNC const char *locale_get_default();
60
61 /* ### Data tables **************************************************/
62
63 /**
64 * Table of language codes, both 2- and 3-letter, with preference
65 * given to 2-letter codes where possible. Includes 3-letter codes
66 * that lack a 2-letter equivalent.
67 *
68 * This list must be in sorted order. This list is returned directly
69 * to the user by some API.
70 *
71 * This list must be kept in sync with LANGUAGES_3, with corresponding
72 * entries matched.
73 *
74 * This table should be terminated with a nullptr entry, followed by a
75 * second list, and another nullptr entry. The first list is visible to
76 * user code when this array is returned by API. The second list
77 * contains codes we support, but do not expose through user API.
78 *
79 * Notes
80 *
81 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
82 * include the revisions up to 2001/7/27 *CWB*
83 *
84 * The 3 character codes are the terminology codes like RFC 3066. This
85 * is compatible with prior ICU codes
86 *
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn. They are still in the
 * table, but now in the second list at the end of the table, because
 * their 3 character codes are duplicates. This avoids bad searches
 * going from 3 to 2 character codes.
91 *
92 * The range qaa-qtz is reserved for local use
93 */
94 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
95 /* ISO639 table version is 20150505 */
96 /* Subsequent hand addition of selected languages */
/*
 * Layout: <primary list>, nullptr, <obsolete list>, nullptr.
 * Entries are grouped below by first letter; each position must match the
 * entry at the same position in LANGUAGES_3.
 */
static const char * const LANGUAGES[] = {
    "aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb",
    "af", "afh", "agq", "ain", "ak", "akk", "akz", "ale",
    "aln", "alt", "am", "an", "ang", "anp", "ar", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av", "avk", "awa", "ay", "az",
    "ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgc", "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
    "blo", "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
    "brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
    "ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg",
    "ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
    "cs", "csb", "csw", "cu", "cv", "cy",
    "da", "dak", "dar", "dav", "de", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz", "dzg",
    "ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx",
    "en", "enm", "eo", "es", "esu", "et", "eu", "ewo",
    "ext",
    "fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj",
    "fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc",
    "gur", "guz", "gv", "gwi",
    "ha", "hai", "hak", "haw", "he", "hi", "hif", "hil",
    "hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu",
    "hup", "hy", "hz",
    "ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik",
    "ilo", "inh", "io", "is", "it", "iu", "izh",
    "ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp",
    "kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk",
    "kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
    "kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
    "kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
    "kv", "kw", "kxv", "ky",
    "la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
    "lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
    "lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
    "lun", "luo", "lus", "luy", "lv", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga",
    "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk",
    "ml", "mn", "mnc", "mni",
    "moh", "mos", "mr", "mrj",
    "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my", "mye", "myv", "mzn",
    "na", "nan", "nap", "naq", "nb", "nd", "nds", "ne",
    "new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn",
    "nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso",
    "nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi",
    "oc", "oj", "om", "or", "os", "osa", "ota",
    "pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt",
    "pon", "prg", "pro", "ps", "pt",
    "qu", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro",
    "rof", "rom", "rtm", "ru", "rue", "rug", "rup",
    "rw", "rwk",
    "sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh",
    "se", "see", "seh", "sei", "sel", "ses", "sg", "sga",
    "sgs", "shi", "shn", "shu", "si", "sid", "sk",
    "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms",
    "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr",
    "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux",
    "sv", "sw", "swb", "syc", "syr", "szl",
    "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
    "th", "ti", "tig", "tiv", "tk", "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tok", "tpi",
    "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
    "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
    "udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
    "vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vmw",
    "vo", "vot", "vro", "vun",
    "wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
    "xal", "xh", "xmf", "xnr", "xog",
    "yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
    "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
    "zun", "zxx", "zza",
nullptr, /* end of the primary list */
    "in", "iw", "ji", "jw", "mo", "sh", "swc", "tl", /* obsolete language codes */
nullptr /* end of the obsolete list */
};
187
/*
 * Deprecated 2-letter language codes.
 * NOTE(review): entries appear to be paired by index with
 * REPLACEMENT_LANGUAGES (e.g. "in" -> "id"); the lookup code is not in this
 * part of the file, so confirm there. The table ends with two nullptr entries.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", "mo", nullptr, nullptr
};
/*
 * Current language codes that supersede deprecated ones.
 * NOTE(review): entries appear to be paired by index with
 * DEPRECATED_LANGUAGES, so both tables must keep the same length and order;
 * confirm against the lookup code. The table ends with two nullptr entries.
 */
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", "ro", nullptr, nullptr
};
194
195 /**
196 * Table of 3-letter language codes.
197 *
198 * This is a lookup table used to convert 3-letter language codes to
199 * their 2-letter equivalent, where possible. It must be kept in sync
200 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
 * same language as LANGUAGES_3[i]. The commented-out lines are
 * copied from LANGUAGES to make the two tables easier to compare by eye.
203 *
204 * Where a 3-letter language code has no 2-letter equivalent, the
205 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
206 *
207 * This table should be terminated with a nullptr entry, followed by a
208 * second list, and another nullptr entry. The two lists correspond to
209 * the two lists in LANGUAGES.
210 */
211 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
212 /* ISO639 table version is 20150505 */
213 /* Subsequent hand addition of selected languages */
/*
 * Layout: <primary list>, nullptr, <obsolete list>, nullptr.
 * Entry i is the 3-letter code for LANGUAGES[i]; where LANGUAGES[i] is
 * already a 3-letter code, the same string appears here.
 */
static const char * const LANGUAGES_3[] = {
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "csw", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kxv", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw",
    "vol", "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xnr", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
nullptr, /* end of the primary list */
/*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
nullptr /* end of the obsolete list */
};
305
306 /**
307 * Table of 2-letter country codes.
308 *
309 * This list must be in sorted order. This list is returned directly
310 * to the user by some API.
311 *
312 * This list must be kept in sync with COUNTRIES_3, with corresponding
313 * entries matched.
314 *
315 * This table should be terminated with a nullptr entry, followed by a
316 * second list, and another nullptr entry. The first list is visible to
317 * user code when this array is returned by API. The second list
318 * contains codes we support, but do not expose through user API.
319 *
320 * Notes:
321 *
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html
 * Added new codes, keeping the old ones for compatibility. Updated to
 * include the 1999/12/03 revisions. *CWB*
326 *
327 * RO(ROM) is now RO(ROU) according to
328 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
329 */
/*
 * Layout: <primary list>, nullptr, <obsolete list>, nullptr.
 * Each position must match the entry at the same position in COUNTRIES_3.
 */
static const char * const COUNTRIES[] = {
    "AD", "AE", "AF", "AG", "AI", "AL", "AM",
    "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
    "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
    "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV",
    "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG",
    "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CQ", "CR",
    "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK",
    "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER",
    "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
    "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL",
    "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU",
    "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU",
    "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS",
    "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
    "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA",
    "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU",
    "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
    "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS",
    "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
    "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP",
    "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
    "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT",
    "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA",
    "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
    "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV",
    "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ",
    "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV",
    "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ",
    "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF",
    "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW",
nullptr, /* end of the primary list */
    "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */
nullptr /* end of the obsolete list */
};
365
/*
 * Deprecated 2-letter country codes.
 * NOTE(review): entries appear to be paired by index with
 * REPLACEMENT_COUNTRIES (e.g. "AN" -> "CW"); the lookup code is not in this
 * part of the file, so confirm there. The table ends with two nullptr entries.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", nullptr, nullptr /* deprecated country list */
};
/*
 * Current country codes that supersede deprecated ones. The commented-out
 * row lists the DEPRECATED_COUNTRIES entries in the same order, so each
 * replacement sits directly under the code it replaces.
 * The table ends with two nullptr entries.
 */
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", nullptr, nullptr /* replacement country codes */
};
373
374 /**
375 * Table of 3-letter country codes.
376 *
377 * This is a lookup table used to convert 3-letter country codes to
378 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
379 * For all valid i, COUNTRIES[i] must refer to the same country as
380 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
381 * to make eyeballing this baby easier.
382 *
383 * This table should be terminated with a nullptr entry, followed by a
384 * second list, and another nullptr entry. The two lists correspond to
385 * the two lists in COUNTRIES.
386 */
/*
 * Layout: <primary list>, nullptr, <obsolete list>, nullptr.
 * Each commented-out row repeats the matching COUNTRIES row; the row below
 * it gives the 3-letter codes in the same order.
 */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
nullptr, /* end of the primary list */
/*  "AN",  "BU",  "CS",  "FX",  "RO",  "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
nullptr /* end of the obsolete list */
};
453
/* One row of a locale-ID canonicalization table: an input ID and the ID it maps to. */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;
458
459 /**
460 * A map to canonicalize locale IDs. This handles a variety of
461 * different semantic kinds of transformations.
462 */
/* Each row is { input ID, canonical ID }. The array has no sentinel entry. */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "art__LOJBAN",    "jbo" }, /* registered name */
    { "hy__AREVELA",    "hy" }, /* Registered IANA variant */
    { "hy__AREVMDA",    "hyw" }, /* Registered IANA variant */
    { "zh__GUOYU",      "zh" }, /* registered name */
    { "zh__HAKKA",      "hak" }, /* registered name */
    { "zh__XIANG",      "hsn" }, /* registered name */
    // subtags with 3 chars won't be treated as variants.
    { "zh_GAN",         "gan" }, /* registered name */
    { "zh_MIN_NAN",     "nan" }, /* registered name */
    { "zh_WUU",         "wuu" }, /* registered name */
    { "zh_YUE",         "yue" }, /* registered name */
};
476
477 /* ### BCP47 Conversion *******************************************/
/*
 * Test whether a locale ID looks like it carries a BCP47 extension: the ID is
 * non-null, contains no '@', and getShortestSubtagLength() reports a
 * one-character subtag.
 *
 * The macro now uses its own argument throughout; it previously passed a
 * variable named `localeID` from the caller's scope to
 * getShortestSubtagLength(), which only worked when the caller's variable
 * happened to have that name. The argument is evaluated more than once, so
 * pass an expression without side effects.
 */
#define _hasBCP47Extension(id) ((id) != nullptr && uprv_strstr((id), "@") == nullptr && getShortestSubtagLength(id) == 1)
480 /* Gets the size of the shortest subtag in the given localeID. */
getShortestSubtagLength(const char * localeID)481 static int32_t getShortestSubtagLength(const char *localeID) {
482 int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
483 int32_t length = localeIDLength;
484 int32_t tmpLength = 0;
485 int32_t i;
486 UBool reset = true;
487
488 for (i = 0; i < localeIDLength; i++) {
489 if (localeID[i] != '_' && localeID[i] != '-') {
490 if (reset) {
491 tmpLength = 0;
492 reset = false;
493 }
494 tmpLength++;
495 } else {
496 if (tmpLength != 0 && tmpLength < length) {
497 length = tmpLength;
498 }
499 reset = true;
500 }
501 }
502
503 return length;
504 }
505
/* ### Keywords **************************************************/
/* True if c is an ASCII decimal digit. */
#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
/* True if c is an ASCII letter or an ASCII decimal digit. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size in bytes of the fixed buffers that hold a keyword name, terminator
   included, so a name can be at most ULOC_KEYWORD_BUFFER_LEN - 1 characters. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Capacity of the keyword list built by ulocimp_getKeywords(). */
#define ULOC_MAX_NO_KEYWORDS 25
514
515 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)516 locale_getKeywordsStart(const char *localeID) {
517 const char *result = nullptr;
518 if((result = uprv_strchr(localeID, '@')) != nullptr) {
519 return result;
520 }
521 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
522 else {
523 /* We do this because the @ sign is variant, and the @ sign used on one
524 EBCDIC machine won't be compiled the same way on other EBCDIC based
525 machines. */
526 static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
527 const uint8_t *charToFind = ebcdicSigns;
528 while(*charToFind) {
529 if((result = uprv_strchr(localeID, *charToFind)) != nullptr) {
530 return result;
531 }
532 charToFind++;
533 }
534 }
535 #endif
536 return nullptr;
537 }
538
539 /**
540 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
541 * @param keywordName incoming name to be canonicalized
542 * @param status return status (keyword too long)
543 * @return length of the keyword name
544 */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)545 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
546 {
547 int32_t keywordNameLen = 0;
548
549 for (; *keywordName != 0; keywordName++) {
550 if (!UPRV_ISALPHANUM(*keywordName)) {
551 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
552 return 0;
553 }
554 if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
555 buf[keywordNameLen++] = uprv_tolower(*keywordName);
556 } else {
557 /* keyword name too long for internal buffer */
558 *status = U_INTERNAL_PROGRAM_ERROR;
559 return 0;
560 }
561 }
562 if (keywordNameLen == 0) {
563 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
564 return 0;
565 }
566 buf[keywordNameLen] = 0; /* terminate */
567
568 return keywordNameLen;
569 }
570
/* One "keyword=value" pair as parsed by ulocimp_getKeywords(). */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* lowercased keyword name with spaces removed, NUL-terminated */
    int32_t keywordLen;                    /* length of keyword, terminator excluded */
    const char *valueStart;                /* first non-space character of the value, inside the parsed string; not NUL-terminated */
    int32_t valueLen;                      /* length of the value, trailing spaces excluded */
} KeywordStruct;
577
578 static int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)579 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
580 const char* leftString = ((const KeywordStruct *)left)->keyword;
581 const char* rightString = ((const KeywordStruct *)right)->keyword;
582 return uprv_strcmp(leftString, rightString);
583 }
584
585 U_CFUNC void
ulocimp_getKeywords(const char * localeID,char prev,ByteSink & sink,UBool valuesToo,UErrorCode * status)586 ulocimp_getKeywords(const char *localeID,
587 char prev,
588 ByteSink& sink,
589 UBool valuesToo,
590 UErrorCode *status)
591 {
592 KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
593
594 int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
595 int32_t numKeywords = 0;
596 const char* pos = localeID;
597 const char* equalSign = nullptr;
598 const char* semicolon = nullptr;
599 int32_t i = 0, j, n;
600
601 if(prev == '@') { /* start of keyword definition */
602 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
603 do {
604 UBool duplicate = false;
605 /* skip leading spaces */
606 while(*pos == ' ') {
607 pos++;
608 }
609 if (!*pos) { /* handle trailing "; " */
610 break;
611 }
612 if(numKeywords == maxKeywords) {
613 *status = U_INTERNAL_PROGRAM_ERROR;
614 return;
615 }
616 equalSign = uprv_strchr(pos, '=');
617 semicolon = uprv_strchr(pos, ';');
618 /* lack of '=' [foo@currency] is illegal */
619 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
620 if(!equalSign || (semicolon && semicolon<equalSign)) {
621 *status = U_INVALID_FORMAT_ERROR;
622 return;
623 }
624 /* need to normalize both keyword and keyword name */
625 if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
626 /* keyword name too long for internal buffer */
627 *status = U_INTERNAL_PROGRAM_ERROR;
628 return;
629 }
630 for(i = 0, n = 0; i < equalSign - pos; ++i) {
631 if (pos[i] != ' ') {
632 keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
633 }
634 }
635
636 /* zero-length keyword is an error. */
637 if (n == 0) {
638 *status = U_INVALID_FORMAT_ERROR;
639 return;
640 }
641
642 keywordList[numKeywords].keyword[n] = 0;
643 keywordList[numKeywords].keywordLen = n;
644 /* now grab the value part. First we skip the '=' */
645 equalSign++;
646 /* then we leading spaces */
647 while(*equalSign == ' ') {
648 equalSign++;
649 }
650
651 /* Premature end or zero-length value */
652 if (!*equalSign || equalSign == semicolon) {
653 *status = U_INVALID_FORMAT_ERROR;
654 return;
655 }
656
657 keywordList[numKeywords].valueStart = equalSign;
658
659 pos = semicolon;
660 i = 0;
661 if(pos) {
662 while(*(pos - i - 1) == ' ') {
663 i++;
664 }
665 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
666 pos++;
667 } else {
668 i = (int32_t)uprv_strlen(equalSign);
669 while(i && equalSign[i-1] == ' ') {
670 i--;
671 }
672 keywordList[numKeywords].valueLen = i;
673 }
674 /* If this is a duplicate keyword, then ignore it */
675 for (j=0; j<numKeywords; ++j) {
676 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
677 duplicate = true;
678 break;
679 }
680 }
681 if (!duplicate) {
682 ++numKeywords;
683 }
684 } while(pos);
685
686 /* now we have a list of keywords */
687 /* we need to sort it */
688 uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, status);
689
690 /* Now construct the keyword part */
691 for(i = 0; i < numKeywords; i++) {
692 sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
693 if(valuesToo) {
694 sink.Append("=", 1);
695 sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
696 if(i < numKeywords - 1) {
697 sink.Append(";", 1);
698 }
699 } else {
700 sink.Append("\0", 1);
701 }
702 }
703 }
704 }
705
706 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)707 uloc_getKeywordValue(const char* localeID,
708 const char* keywordName,
709 char* buffer, int32_t bufferCapacity,
710 UErrorCode* status)
711 {
712 if (U_FAILURE(*status)) {
713 return 0;
714 }
715
716 CheckedArrayByteSink sink(buffer, bufferCapacity);
717 ulocimp_getKeywordValue(localeID, keywordName, sink, status);
718
719 int32_t reslen = sink.NumberOfBytesAppended();
720
721 if (U_FAILURE(*status)) {
722 return reslen;
723 }
724
725 if (sink.Overflowed()) {
726 *status = U_BUFFER_OVERFLOW_ERROR;
727 } else {
728 u_terminateChars(buffer, bufferCapacity, reslen, status);
729 }
730
731 return reslen;
732 }
733
734 U_CAPI void U_EXPORT2
ulocimp_getKeywordValue(const char * localeID,const char * keywordName,icu::ByteSink & sink,UErrorCode * status)735 ulocimp_getKeywordValue(const char* localeID,
736 const char* keywordName,
737 icu::ByteSink& sink,
738 UErrorCode* status)
739 {
740 const char* startSearchHere = nullptr;
741 const char* nextSeparator = nullptr;
742 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
743 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
744
745 if(status && U_SUCCESS(*status) && localeID) {
746 CharString tempBuffer;
747 const char* tmpLocaleID;
748
749 if (keywordName == nullptr || keywordName[0] == 0) {
750 *status = U_ILLEGAL_ARGUMENT_ERROR;
751 return;
752 }
753
754 locale_canonKeywordName(keywordNameBuffer, keywordName, status);
755 if(U_FAILURE(*status)) {
756 return;
757 }
758
759 if (_hasBCP47Extension(localeID)) {
760 CharStringByteSink sink(&tempBuffer);
761 ulocimp_forLanguageTag(localeID, -1, sink, nullptr, status);
762 tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
763 } else {
764 tmpLocaleID=localeID;
765 }
766
767 startSearchHere = locale_getKeywordsStart(tmpLocaleID);
768 if(startSearchHere == nullptr) {
769 /* no keywords, return at once */
770 return;
771 }
772
773 /* find the first keyword */
774 while(startSearchHere) {
775 const char* keyValueTail;
776 int32_t keyValueLen;
777
778 startSearchHere++; /* skip @ or ; */
779 nextSeparator = uprv_strchr(startSearchHere, '=');
780 if(!nextSeparator) {
781 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
782 return;
783 }
784 /* strip leading & trailing spaces (TC decided to tolerate these) */
785 while(*startSearchHere == ' ') {
786 startSearchHere++;
787 }
788 keyValueTail = nextSeparator;
789 while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
790 keyValueTail--;
791 }
792 /* now keyValueTail points to first char after the keyName */
793 /* copy & normalize keyName from locale */
794 if (startSearchHere == keyValueTail) {
795 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
796 return;
797 }
798 keyValueLen = 0;
799 while (startSearchHere < keyValueTail) {
800 if (!UPRV_ISALPHANUM(*startSearchHere)) {
801 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
802 return;
803 }
804 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
805 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
806 } else {
807 /* keyword name too long for internal buffer */
808 *status = U_INTERNAL_PROGRAM_ERROR;
809 return;
810 }
811 }
812 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
813
814 startSearchHere = uprv_strchr(nextSeparator, ';');
815
816 if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
817 /* current entry matches the keyword. */
818 nextSeparator++; /* skip '=' */
819 /* First strip leading & trailing spaces (TC decided to tolerate these) */
820 while(*nextSeparator == ' ') {
821 nextSeparator++;
822 }
823 keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
824 while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
825 keyValueTail--;
826 }
827 /* Now copy the value, but check well-formedness */
828 if (nextSeparator == keyValueTail) {
829 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
830 return;
831 }
832 while (nextSeparator < keyValueTail) {
833 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
834 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
835 return;
836 }
837 /* Should we lowercase value to return here? Tests expect as-is. */
838 sink.Append(nextSeparator++, 1);
839 }
840 return;
841 }
842 }
843 }
844 }
845
846 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)847 uloc_setKeywordValue(const char* keywordName,
848 const char* keywordValue,
849 char* buffer, int32_t bufferCapacity,
850 UErrorCode* status)
851 {
852 /* TODO: sorting. removal. */
853 int32_t keywordNameLen;
854 int32_t keywordValueLen;
855 int32_t bufLen;
856 int32_t needLen = 0;
857 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
858 char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
859 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
860 int32_t rc;
861 char* nextSeparator = nullptr;
862 char* nextEqualsign = nullptr;
863 char* startSearchHere = nullptr;
864 char* keywordStart = nullptr;
865 CharString updatedKeysAndValues;
866 UBool handledInputKeyAndValue = false;
867 char keyValuePrefix = '@';
868
869 if(U_FAILURE(*status)) {
870 return -1;
871 }
872 if (*status == U_STRING_NOT_TERMINATED_WARNING) {
873 *status = U_ZERO_ERROR;
874 }
875 if (keywordName == nullptr || keywordName[0] == 0 || bufferCapacity <= 1) {
876 *status = U_ILLEGAL_ARGUMENT_ERROR;
877 return 0;
878 }
879 bufLen = (int32_t)uprv_strlen(buffer);
880 if(bufferCapacity<bufLen) {
881 /* The capacity is less than the length?! Is this NUL terminated? */
882 *status = U_ILLEGAL_ARGUMENT_ERROR;
883 return 0;
884 }
885 keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
886 if(U_FAILURE(*status)) {
887 return 0;
888 }
889
890 keywordValueLen = 0;
891 if(keywordValue) {
892 while (*keywordValue != 0) {
893 if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
894 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
895 return 0;
896 }
897 if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
898 /* Should we force lowercase in value to set? */
899 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
900 } else {
901 /* keywordValue too long for internal buffer */
902 *status = U_INTERNAL_PROGRAM_ERROR;
903 return 0;
904 }
905 }
906 }
907 keywordValueBuffer[keywordValueLen] = 0; /* terminate */
908
909 startSearchHere = (char*)locale_getKeywordsStart(buffer);
910 if(startSearchHere == nullptr || (startSearchHere[1]==0)) {
911 if(keywordValueLen == 0) { /* no keywords = nothing to remove */
912 U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
913 return bufLen;
914 }
915
916 needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
917 if(startSearchHere) { /* had a single @ */
918 needLen--; /* already had the @ */
919 /* startSearchHere points at the @ */
920 } else {
921 startSearchHere=buffer+bufLen;
922 }
923 if(needLen >= bufferCapacity) {
924 *status = U_BUFFER_OVERFLOW_ERROR;
925 return needLen; /* no change */
926 }
927 *startSearchHere++ = '@';
928 uprv_strcpy(startSearchHere, keywordNameBuffer);
929 startSearchHere += keywordNameLen;
930 *startSearchHere++ = '=';
931 uprv_strcpy(startSearchHere, keywordValueBuffer);
932 U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
933 return needLen;
934 } /* end shortcut - no @ */
935
936 keywordStart = startSearchHere;
937 /* search for keyword */
938 while(keywordStart) {
939 const char* keyValueTail;
940 int32_t keyValueLen;
941
942 keywordStart++; /* skip @ or ; */
943 nextEqualsign = uprv_strchr(keywordStart, '=');
944 if (!nextEqualsign) {
945 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
946 return 0;
947 }
948 /* strip leading & trailing spaces (TC decided to tolerate these) */
949 while(*keywordStart == ' ') {
950 keywordStart++;
951 }
952 keyValueTail = nextEqualsign;
953 while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
954 keyValueTail--;
955 }
956 /* now keyValueTail points to first char after the keyName */
957 /* copy & normalize keyName from locale */
958 if (keywordStart == keyValueTail) {
959 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
960 return 0;
961 }
962 keyValueLen = 0;
963 while (keywordStart < keyValueTail) {
964 if (!UPRV_ISALPHANUM(*keywordStart)) {
965 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
966 return 0;
967 }
968 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
969 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
970 } else {
971 /* keyword name too long for internal buffer */
972 *status = U_INTERNAL_PROGRAM_ERROR;
973 return 0;
974 }
975 }
976 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
977
978 nextSeparator = uprv_strchr(nextEqualsign, ';');
979
980 /* start processing the value part */
981 nextEqualsign++; /* skip '=' */
982 /* First strip leading & trailing spaces (TC decided to tolerate these) */
983 while(*nextEqualsign == ' ') {
984 nextEqualsign++;
985 }
986 keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
987 while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
988 keyValueTail--;
989 }
990 if (nextEqualsign == keyValueTail) {
991 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
992 return 0;
993 }
994
995 rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
996 if(rc == 0) {
997 /* Current entry matches the input keyword. Update the entry */
998 if(keywordValueLen > 0) { /* updating a value */
999 updatedKeysAndValues.append(keyValuePrefix, *status);
1000 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1001 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1002 updatedKeysAndValues.append('=', *status);
1003 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1004 } /* else removing this entry, don't emit anything */
1005 handledInputKeyAndValue = true;
1006 } else {
1007 /* input keyword sorts earlier than current entry, add before current entry */
1008 if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1009 /* insert new entry at this location */
1010 updatedKeysAndValues.append(keyValuePrefix, *status);
1011 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1012 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1013 updatedKeysAndValues.append('=', *status);
1014 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1015 handledInputKeyAndValue = true;
1016 }
1017 /* copy the current entry */
1018 updatedKeysAndValues.append(keyValuePrefix, *status);
1019 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1020 updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1021 updatedKeysAndValues.append('=', *status);
1022 updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1023 }
1024 if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1025 /* append new entry at the end, it sorts later than existing entries */
1026 updatedKeysAndValues.append(keyValuePrefix, *status);
1027 /* skip keyValuePrefix update, no subsequent key-value pair */
1028 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1029 updatedKeysAndValues.append('=', *status);
1030 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1031 handledInputKeyAndValue = true;
1032 }
1033 keywordStart = nextSeparator;
1034 } /* end loop searching */
1035
1036 /* Any error from updatedKeysAndValues.append above would be internal and not due to
1037 * problems with the passed-in locale. So if we did encounter problems with the
1038 * passed-in locale above, those errors took precedence and overrode any error
1039 * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1040 * are errors here they are from updatedKeysAndValues.append; they do cause an
1041 * error return but the passed-in locale is unmodified and the original bufLen is
1042 * returned.
1043 */
1044 if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1045 /* if input key/value specified removal of a keyword not present in locale, or
1046 * there was an error in CharString.append, leave original locale alone. */
1047 U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
1048 return bufLen;
1049 }
1050
1051 // needLen = length of the part before '@'
1052 needLen = (int32_t)(startSearchHere - buffer);
1053 // Check to see can we fit the startSearchHere, if not, return
1054 // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it.
1055 // We do this because this API function does not behave like most others:
1056 // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
1057 // When the contents fits but without the terminating NUL, in this case we need to not change
1058 // the buffer contents and return with a buffer overflow error.
1059 int32_t appendLength = updatedKeysAndValues.length();
1060 if (appendLength >= bufferCapacity - needLen) {
1061 *status = U_BUFFER_OVERFLOW_ERROR;
1062 return needLen + appendLength;
1063 }
1064 needLen += updatedKeysAndValues.extract(
1065 startSearchHere, bufferCapacity - needLen, *status);
1066 U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
1067 return needLen;
1068 }
1069
1070 /* ### ID parsing implementation **************************************************/
1071
/* True if 'a' is one of the letters that can start a special 'x-' / 'i-'
 * prefix (either case).  The argument is parenthesized so that compound
 * expressions (e.g. a conditional) are compared as a whole; note that it is
 * still evaluated more than once, so it must not have side effects. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns true if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 *
 * The argument is parenthesized for the same reason as in _isPrefixLetter and
 * is likewise evaluated more than once.
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1082
1083 /**
1084 * Lookup 'key' in the array 'list'. The array 'list' should contain
1085 * a nullptr entry, followed by more entries, and a second nullptr entry.
1086 *
1087 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1088 * COUNTRIES_3.
1089 */
_findIndex(const char * const * list,const char * key)1090 static int16_t _findIndex(const char* const* list, const char* key)
1091 {
1092 const char* const* anchor = list;
1093 int32_t pass = 0;
1094
1095 /* Make two passes through two nullptr-terminated arrays at 'list' */
1096 while (pass++ < 2) {
1097 while (*list) {
1098 if (uprv_strcmp(key, *list) == 0) {
1099 return (int16_t)(list - anchor);
1100 }
1101 list++;
1102 }
1103 ++list; /* skip final nullptr *CWB*/
1104 }
1105 return -1;
1106 }
1107
1108 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1109 uloc_getCurrentCountryID(const char* oldID){
1110 int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1111 if (offset >= 0) {
1112 return REPLACEMENT_COUNTRIES[offset];
1113 }
1114 return oldID;
1115 }
1116 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1117 uloc_getCurrentLanguageID(const char* oldID){
1118 int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1119 if (offset >= 0) {
1120 return REPLACEMENT_LANGUAGES[offset];
1121 }
1122 return oldID;
1123 }
1124 /*
1125 * the internal functions _getLanguage(), _getCountry(), _getVariant()
1126 * avoid duplicating code to handle the earlier locale ID pieces
1127 * in the functions for the later ones by
1128 * setting the *pEnd pointer to where they stopped parsing
1129 *
1130 * TODO try to use this in Locale
1131 */
1132 CharString U_EXPORT2
ulocimp_getLanguage(const char * localeID,const char ** pEnd,UErrorCode & status)1133 ulocimp_getLanguage(const char *localeID,
1134 const char **pEnd,
1135 UErrorCode &status) {
1136 CharString result;
1137
1138 if (uprv_stricmp(localeID, "root") == 0) {
1139 localeID += 4;
1140 } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1141 (localeID[3] == '\0' ||
1142 localeID[3] == '-' ||
1143 localeID[3] == '_' ||
1144 localeID[3] == '@')) {
1145 localeID += 3;
1146 }
1147
1148 /* if it starts with i- or x- then copy that prefix */
1149 if(_isIDPrefix(localeID)) {
1150 result.append((char)uprv_tolower(*localeID), status);
1151 result.append('-', status);
1152 localeID+=2;
1153 }
1154
1155 /* copy the language as far as possible and count its length */
1156 while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1157 result.append((char)uprv_tolower(*localeID), status);
1158 localeID++;
1159 }
1160
1161 if(result.length()==3) {
1162 /* convert 3 character code to 2 character code if possible *CWB*/
1163 int32_t offset = _findIndex(LANGUAGES_3, result.data());
1164 if(offset>=0) {
1165 result.clear();
1166 result.append(LANGUAGES[offset], status);
1167 }
1168 }
1169
1170 if(pEnd!=nullptr) {
1171 *pEnd=localeID;
1172 }
1173
1174 return result;
1175 }
1176
1177 CharString U_EXPORT2
ulocimp_getScript(const char * localeID,const char ** pEnd,UErrorCode & status)1178 ulocimp_getScript(const char *localeID,
1179 const char **pEnd,
1180 UErrorCode &status) {
1181 CharString result;
1182 int32_t idLen = 0;
1183
1184 if (pEnd != nullptr) {
1185 *pEnd = localeID;
1186 }
1187
1188 /* copy the second item as far as possible and count its length */
1189 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1190 && uprv_isASCIILetter(localeID[idLen])) {
1191 idLen++;
1192 }
1193
1194 /* If it's exactly 4 characters long, then it's a script and not a country. */
1195 if (idLen == 4) {
1196 int32_t i;
1197 if (pEnd != nullptr) {
1198 *pEnd = localeID+idLen;
1199 }
1200 if (idLen >= 1) {
1201 result.append((char)uprv_toupper(*(localeID++)), status);
1202 }
1203 for (i = 1; i < idLen; i++) {
1204 result.append((char)uprv_tolower(*(localeID++)), status);
1205 }
1206 }
1207
1208 return result;
1209 }
1210
1211 CharString U_EXPORT2
ulocimp_getCountry(const char * localeID,const char ** pEnd,UErrorCode & status)1212 ulocimp_getCountry(const char *localeID,
1213 const char **pEnd,
1214 UErrorCode &status) {
1215 CharString result;
1216 int32_t idLen=0;
1217
1218 /* copy the country as far as possible and count its length */
1219 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1220 result.append((char)uprv_toupper(localeID[idLen]), status);
1221 idLen++;
1222 }
1223
1224 /* the country should be either length 2 or 3 */
1225 if (idLen == 2 || idLen == 3) {
1226 /* convert 3 character code to 2 character code if possible *CWB*/
1227 if(idLen==3) {
1228 int32_t offset = _findIndex(COUNTRIES_3, result.data());
1229 if(offset>=0) {
1230 result.clear();
1231 result.append(COUNTRIES[offset], status);
1232 }
1233 }
1234 localeID+=idLen;
1235 } else {
1236 result.clear();
1237 }
1238
1239 if(pEnd!=nullptr) {
1240 *pEnd=localeID;
1241 }
1242
1243 return result;
1244 }
1245
1246 /**
1247 * @param needSeparator if true, then add leading '_' if any variants
1248 * are added to 'variant'
1249 */
1250 static void
_getVariant(const char * localeID,char prev,ByteSink & sink,UBool needSeparator)1251 _getVariant(const char *localeID,
1252 char prev,
1253 ByteSink& sink,
1254 UBool needSeparator) {
1255 UBool hasVariant = false;
1256
1257 /* get one or more variant tags and separate them with '_' */
1258 if(_isIDSeparator(prev)) {
1259 /* get a variant string after a '-' or '_' */
1260 while(!_isTerminator(*localeID)) {
1261 if (needSeparator) {
1262 sink.Append("_", 1);
1263 needSeparator = false;
1264 }
1265 char c = (char)uprv_toupper(*localeID);
1266 if (c == '-') c = '_';
1267 sink.Append(&c, 1);
1268 hasVariant = true;
1269 localeID++;
1270 }
1271 }
1272
1273 /* if there is no variant tag after a '-' or '_' then look for '@' */
1274 if(!hasVariant) {
1275 if(prev=='@') {
1276 /* keep localeID */
1277 } else if((localeID=locale_getKeywordsStart(localeID))!=nullptr) {
1278 ++localeID; /* point after the '@' */
1279 } else {
1280 return;
1281 }
1282 while(!_isTerminator(*localeID)) {
1283 if (needSeparator) {
1284 sink.Append("_", 1);
1285 needSeparator = false;
1286 }
1287 char c = (char)uprv_toupper(*localeID);
1288 if (c == '-' || c == ',') c = '_';
1289 sink.Append(&c, 1);
1290 localeID++;
1291 }
1292 }
1293 }
1294
1295 /* Keyword enumeration */
1296
/* Iteration state stored in the 'context' of an enumeration created by
 * uloc_openKeywordList(). */
typedef struct UKeywordsContext {
    /* Heap copy of the keyword list: NUL-terminated names packed back to
     * back, with an empty string marking the end.  Freed by
     * uloc_kw_closeKeywords(). */
    char* keywords;
    /* Name that uloc_kw_nextKeyword() will return next; points into
     * 'keywords' and is rewound by uloc_kw_resetKeywords(). */
    char* current;
} UKeywordsContext;
1301
1302 U_CDECL_BEGIN
1303
1304 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1305 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1306 uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1307 uprv_free(enumerator->context);
1308 uprv_free(enumerator);
1309 }
1310
1311 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1312 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1313 char *kw = ((UKeywordsContext *)en->context)->keywords;
1314 int32_t result = 0;
1315 while(*kw) {
1316 result++;
1317 kw += uprv_strlen(kw)+1;
1318 }
1319 return result;
1320 }
1321
1322 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1323 uloc_kw_nextKeyword(UEnumeration* en,
1324 int32_t* resultLength,
1325 UErrorCode* /*status*/) {
1326 const char* result = ((UKeywordsContext *)en->context)->current;
1327 int32_t len = 0;
1328 if(*result) {
1329 len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1330 ((UKeywordsContext *)en->context)->current += len+1;
1331 } else {
1332 result = nullptr;
1333 }
1334 if (resultLength) {
1335 *resultLength = len;
1336 }
1337 return result;
1338 }
1339
1340 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1341 uloc_kw_resetKeywords(UEnumeration* en,
1342 UErrorCode* /*status*/) {
1343 ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1344 }
1345
1346 U_CDECL_END
1347
1348
/*
 * Template for keyword enumerations: uloc_openKeywordList() copies this
 * object into every UEnumeration it creates and then sets that copy's
 * 'context' to a freshly allocated UKeywordsContext.
 * NOTE(review): the meaning of each positional slot depends on the field
 * order of UEnumeration, which is declared outside this file -- confirm
 * against its declaration before reordering anything here.
 */
static const UEnumeration gKeywordsEnum = {
    nullptr,
    nullptr,
    uloc_kw_closeKeywords,
    uloc_kw_countKeywords,
    uenum_unextDefault,
    uloc_kw_nextKeyword,
    uloc_kw_resetKeywords
};
1358
1359 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1360 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1361 {
1362 LocalMemory<UKeywordsContext> myContext;
1363 LocalMemory<UEnumeration> result;
1364
1365 if (U_FAILURE(*status)) {
1366 return nullptr;
1367 }
1368 myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1369 result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1370 if (myContext.isNull() || result.isNull()) {
1371 *status = U_MEMORY_ALLOCATION_ERROR;
1372 return nullptr;
1373 }
1374 uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1375 myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1376 if (myContext->keywords == nullptr) {
1377 *status = U_MEMORY_ALLOCATION_ERROR;
1378 return nullptr;
1379 }
1380 uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1381 myContext->keywords[keywordListSize] = 0;
1382 myContext->current = myContext->keywords;
1383 result->context = myContext.orphan();
1384 return result.orphan();
1385 }
1386
1387 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1388 uloc_openKeywords(const char* localeID,
1389 UErrorCode* status)
1390 {
1391 CharString tempBuffer;
1392 const char* tmpLocaleID;
1393
1394 if(status==nullptr || U_FAILURE(*status)) {
1395 return 0;
1396 }
1397
1398 if (_hasBCP47Extension(localeID)) {
1399 CharStringByteSink sink(&tempBuffer);
1400 ulocimp_forLanguageTag(localeID, -1, sink, nullptr, status);
1401 tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
1402 } else {
1403 if (localeID==nullptr) {
1404 localeID=uloc_getDefault();
1405 }
1406 tmpLocaleID=localeID;
1407 }
1408
1409 /* Skip the language */
1410 ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *status);
1411 if (U_FAILURE(*status)) {
1412 return 0;
1413 }
1414
1415 if(_isIDSeparator(*tmpLocaleID)) {
1416 const char *scriptID;
1417 /* Skip the script if available */
1418 ulocimp_getScript(tmpLocaleID+1, &scriptID, *status);
1419 if (U_FAILURE(*status)) {
1420 return 0;
1421 }
1422 if(scriptID != tmpLocaleID+1) {
1423 /* Found optional script */
1424 tmpLocaleID = scriptID;
1425 }
1426 /* Skip the Country */
1427 if (_isIDSeparator(*tmpLocaleID)) {
1428 ulocimp_getCountry(tmpLocaleID+1, &tmpLocaleID, *status);
1429 if (U_FAILURE(*status)) {
1430 return 0;
1431 }
1432 }
1433 }
1434
1435 /* keywords are located after '@' */
1436 if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != nullptr) {
1437 CharString keywords;
1438 CharStringByteSink sink(&keywords);
1439 ulocimp_getKeywords(tmpLocaleID+1, '@', sink, false, status);
1440 if (U_FAILURE(*status)) {
1441 return nullptr;
1442 }
1443 return uloc_openKeywordList(keywords.data(), keywords.length(), status);
1444 }
1445 return nullptr;
1446 }
1447
1448
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* True if any bit of 'mask' is set in 'options'.  Both parameters are
 * parenthesized so that a combined mask such as (A | B) is applied as a
 * whole instead of being split by operator precedence. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* "i-default" is deliberately not NUL-terminated: it is only ever compared
 * with uprv_strncmp over I_DEFAULT_LENGTH bytes. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1457
/**
 * Canonicalize the given localeID, to level 1 or to level 2,
 * depending on the options.  To specify level 1, pass in options=0.
 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
 *
 * This is the code underlying uloc_getName and uloc_canonicalize.
 *
 * The language, script, country and variant pieces are collected in a
 * temporary 'tag' joined with '_' and written to 'sink' in one go; keywords
 * are appended afterwards unless _ULOC_STRIP_KEYWORDS is set.
 *
 * @param localeID the ID to process; nullptr selects uloc_getDefault()
 * @param sink     receives the resulting locale ID
 * @param options  bit set of _ULOC_CANONICALIZE and _ULOC_STRIP_KEYWORDS
 * @param err      in/out error code; nothing is done if it is already a failure
 */
static void
_canonicalize(const char* localeID,
              ByteSink& sink,
              uint32_t options,
              UErrorCode* err) {
    if (U_FAILURE(*err)) {
        return;
    }

    /* fieldCount counts the '_'-separated fields emitted so far; it decides
     * below how many '_' must be added in front of a POSIX @variant. */
    int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
    CharString tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
    CharString localeIDWithHyphens;  // if localeID has a BCP47 extension and has _, tmpLocaleID points to this
    const char* origLocaleID;
    const char* tmpLocaleID;
    const char* keywordAssign = nullptr;       /* first '=' at or after the '@', if any */
    const char* separatorIndicator = nullptr;  /* first ';' at or after the '@', if any */

    if (_hasBCP47Extension(localeID)) {
        const char* localeIDPtr = localeID;

        // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
        if (uprv_strchr(localeID, '_') != nullptr && localeID[1] != '-' && localeID[1] != '_') {
            localeIDWithHyphens.append(localeID, -1, *err);
            if (U_SUCCESS(*err)) {
                for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
                    if (*p == '_') {
                        *p = '-';
                    }
                }
                localeIDPtr = localeIDWithHyphens.data();
            }
        }

        CharStringByteSink tempSink(&tempBuffer);
        ulocimp_forLanguageTag(localeIDPtr, -1, tempSink, nullptr, err);
        /* fall back to the (possibly hyphenated) input if the conversion failed or produced nothing */
        tmpLocaleID = U_SUCCESS(*err) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeIDPtr;
    } else {
        if (localeID==nullptr) {
           localeID=uloc_getDefault();
        }
        tmpLocaleID=localeID;
    }

    origLocaleID=tmpLocaleID;

    /* get all pieces, one after another, and separate with '_' */
    CharString tag = ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);

    if (tag.length() == I_DEFAULT_LENGTH &&
            uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
        /* "i-default" stands for the default locale */
        tag.clear();
        tag.append(uloc_getDefault(), *err);
    } else if(_isIDSeparator(*tmpLocaleID)) {
        const char *scriptID;

        ++fieldCount;
        tag.append('_', *err);

        CharString script = ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
        tag.append(script, *err);
        scriptSize = script.length();
        if(scriptSize > 0) {
            /* Found optional script */
            tmpLocaleID = scriptID;
            ++fieldCount;
            if (_isIDSeparator(*tmpLocaleID)) {
                /* If there is something else, then we add the _ */
                tag.append('_', *err);
            }
        }

        if (_isIDSeparator(*tmpLocaleID)) {
            const char *cntryID;

            CharString country = ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
            tag.append(country, *err);
            if (!country.isEmpty()) {
                /* Found optional country */
                tmpLocaleID = cntryID;
            }
            if(_isIDSeparator(*tmpLocaleID)) {
                /* If there is something else, then we add the _  if we found country before. */
                if (!_isIDSeparator(*(tmpLocaleID+1))) {
                    ++fieldCount;
                    tag.append('_', *err);
                }

                /* measure the variant as the growth of 'tag' across the call */
                variantSize = -tag.length();
                {
                    CharStringByteSink s(&tag);
                    _getVariant(tmpLocaleID+1, *tmpLocaleID, s, false);
                }
                variantSize += tag.length();
                if (variantSize > 0) {
                    tmpLocaleID += variantSize + 1; /* skip '_' and variant */
                }
            }
        }
    }

    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
        UBool done = false;
        do {
            char c = *tmpLocaleID;
            switch (c) {
            case 0:
            case '@':
                done = true;
                break;
            default:
                tag.append(c, *err);
                ++tmpLocaleID;
                break;
            }
        } while (!done);
    }

    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
       After this, tmpLocaleID either points to '@' or is nullptr */
    if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=nullptr) {
        keywordAssign = uprv_strchr(tmpLocaleID, '=');
        separatorIndicator = uprv_strchr(tmpLocaleID, ';');
    }

    /* Copy POSIX-style variant, if any [mr@FOO] */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
        tmpLocaleID != nullptr && keywordAssign == nullptr) {
        /* level 1: the "@FOO" text is copied verbatim, including the '@' */
        for (;;) {
            char c = *tmpLocaleID;
            if (c == 0) {
                break;
            }
            tag.append(c, *err);
            ++tmpLocaleID;
        }
    }

    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
        /* Handle @FOO variant if @ is present and not followed by = */
        if (tmpLocaleID!=nullptr && keywordAssign==nullptr) {
            /* Add missing '_' if needed */
            if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
                do {
                    tag.append('_', *err);
                    ++fieldCount;
                } while(fieldCount<2);
            }

            /* again measured as the growth of 'tag'; a separating '_' is
             * requested when a variant was already emitted above */
            int32_t posixVariantSize = -tag.length();
            {
                CharStringByteSink s(&tag);
                _getVariant(tmpLocaleID+1, '@', s, (UBool)(variantSize > 0));
            }
            posixVariantSize += tag.length();
            if (posixVariantSize > 0) {
                variantSize += posixVariantSize;
            }
        }

        /* Look up the ID in the canonicalization map */
        for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
            StringPiece id(CANONICALIZE_MAP[j].id);
            if (tag == id) {
                if (id.empty() && tmpLocaleID != nullptr) {
                    break; /* Don't remap "" if keywords present */
                }
                tag.clear();
                tag.append(CANONICALIZE_MAP[j].canonicalID, *err);
                break;
            }
        }
    }

    sink.Append(tag.data(), tag.length());

    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
        /* keywords are emitted only when an '=' follows the '@' and no ';'
         * appears before that first '=' */
        if (tmpLocaleID!=nullptr && keywordAssign!=nullptr &&
            (!separatorIndicator || separatorIndicator > keywordAssign)) {
            sink.Append("@", 1);
            ++fieldCount;
            ulocimp_getKeywords(tmpLocaleID+1, '@', sink, true, err);
        }
    }
}
1650
1651 /* ### ID parsing API **************************************************/
1652
1653 U_CAPI int32_t U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1654 uloc_getParent(const char* localeID,
1655 char* parent,
1656 int32_t parentCapacity,
1657 UErrorCode* err)
1658 {
1659 if (U_FAILURE(*err)) {
1660 return 0;
1661 }
1662
1663 CheckedArrayByteSink sink(parent, parentCapacity);
1664 ulocimp_getParent(localeID, sink, err);
1665
1666 int32_t reslen = sink.NumberOfBytesAppended();
1667
1668 if (U_FAILURE(*err)) {
1669 return reslen;
1670 }
1671
1672 if (sink.Overflowed()) {
1673 *err = U_BUFFER_OVERFLOW_ERROR;
1674 } else {
1675 u_terminateChars(parent, parentCapacity, reslen, err);
1676 }
1677
1678 return reslen;
1679 }
1680
1681 U_CAPI void U_EXPORT2
ulocimp_getParent(const char * localeID,icu::ByteSink & sink,UErrorCode * err)1682 ulocimp_getParent(const char* localeID,
1683 icu::ByteSink& sink,
1684 UErrorCode* err)
1685 {
1686 const char *lastUnderscore;
1687 int32_t i;
1688
1689 if (U_FAILURE(*err))
1690 return;
1691
1692 if (localeID == nullptr)
1693 localeID = uloc_getDefault();
1694
1695 lastUnderscore=uprv_strrchr(localeID, '_');
1696 if(lastUnderscore!=nullptr) {
1697 i=(int32_t)(lastUnderscore-localeID);
1698 } else {
1699 i=0;
1700 }
1701
1702 if (i > 0) {
1703 if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1704 localeID += 3;
1705 i -= 3;
1706 }
1707 sink.Append(localeID, i);
1708 }
1709 }
1710
1711 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1712 uloc_getLanguage(const char* localeID,
1713 char* language,
1714 int32_t languageCapacity,
1715 UErrorCode* err)
1716 {
1717 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1718
1719 if (err==nullptr || U_FAILURE(*err)) {
1720 return 0;
1721 }
1722
1723 if(localeID==nullptr) {
1724 localeID=uloc_getDefault();
1725 }
1726
1727 return ulocimp_getLanguage(localeID, nullptr, *err).extract(language, languageCapacity, *err);
1728 }
1729
1730 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1731 uloc_getScript(const char* localeID,
1732 char* script,
1733 int32_t scriptCapacity,
1734 UErrorCode* err)
1735 {
1736 if(err==nullptr || U_FAILURE(*err)) {
1737 return 0;
1738 }
1739
1740 if(localeID==nullptr) {
1741 localeID=uloc_getDefault();
1742 }
1743
1744 /* skip the language */
1745 ulocimp_getLanguage(localeID, &localeID, *err);
1746 if (U_FAILURE(*err)) {
1747 return 0;
1748 }
1749
1750 if(_isIDSeparator(*localeID)) {
1751 return ulocimp_getScript(localeID+1, nullptr, *err).extract(script, scriptCapacity, *err);
1752 }
1753 return u_terminateChars(script, scriptCapacity, 0, err);
1754 }
1755
1756 U_CAPI int32_t U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1757 uloc_getCountry(const char* localeID,
1758 char* country,
1759 int32_t countryCapacity,
1760 UErrorCode* err)
1761 {
1762 if(err==nullptr || U_FAILURE(*err)) {
1763 return 0;
1764 }
1765
1766 if(localeID==nullptr) {
1767 localeID=uloc_getDefault();
1768 }
1769
1770 /* Skip the language */
1771 ulocimp_getLanguage(localeID, &localeID, *err);
1772 if (U_FAILURE(*err)) {
1773 return 0;
1774 }
1775
1776 if(_isIDSeparator(*localeID)) {
1777 const char *scriptID;
1778 /* Skip the script if available */
1779 ulocimp_getScript(localeID+1, &scriptID, *err);
1780 if (U_FAILURE(*err)) {
1781 return 0;
1782 }
1783 if(scriptID != localeID+1) {
1784 /* Found optional script */
1785 localeID = scriptID;
1786 }
1787 if(_isIDSeparator(*localeID)) {
1788 return ulocimp_getCountry(localeID+1, nullptr, *err).extract(country, countryCapacity, *err);
1789 }
1790 }
1791 return u_terminateChars(country, countryCapacity, 0, err);
1792 }
1793
1794 U_CAPI int32_t U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1795 uloc_getVariant(const char* localeID,
1796 char* variant,
1797 int32_t variantCapacity,
1798 UErrorCode* err)
1799 {
1800 CharString tempBuffer;
1801 const char* tmpLocaleID;
1802 int32_t i=0;
1803
1804 if(err==nullptr || U_FAILURE(*err)) {
1805 return 0;
1806 }
1807
1808 if (_hasBCP47Extension(localeID)) {
1809 CharStringByteSink sink(&tempBuffer);
1810 ulocimp_forLanguageTag(localeID, -1, sink, nullptr, err);
1811 tmpLocaleID = U_SUCCESS(*err) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
1812 } else {
1813 if (localeID==nullptr) {
1814 localeID=uloc_getDefault();
1815 }
1816 tmpLocaleID=localeID;
1817 }
1818
1819 /* Skip the language */
1820 ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1821 if (U_FAILURE(*err)) {
1822 return 0;
1823 }
1824
1825 if(_isIDSeparator(*tmpLocaleID)) {
1826 const char *scriptID;
1827 /* Skip the script if available */
1828 ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
1829 if (U_FAILURE(*err)) {
1830 return 0;
1831 }
1832 if(scriptID != tmpLocaleID+1) {
1833 /* Found optional script */
1834 tmpLocaleID = scriptID;
1835 }
1836 /* Skip the Country */
1837 if (_isIDSeparator(*tmpLocaleID)) {
1838 const char *cntryID;
1839 ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
1840 if (U_FAILURE(*err)) {
1841 return 0;
1842 }
1843 if (cntryID != tmpLocaleID+1) {
1844 /* Found optional country */
1845 tmpLocaleID = cntryID;
1846 }
1847 if(_isIDSeparator(*tmpLocaleID)) {
1848 /* If there was no country ID, skip a possible extra IDSeparator */
1849 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
1850 tmpLocaleID++;
1851 }
1852
1853 CheckedArrayByteSink sink(variant, variantCapacity);
1854 _getVariant(tmpLocaleID+1, *tmpLocaleID, sink, false);
1855
1856 i = sink.NumberOfBytesAppended();
1857
1858 if (U_FAILURE(*err)) {
1859 return i;
1860 }
1861
1862 if (sink.Overflowed()) {
1863 *err = U_BUFFER_OVERFLOW_ERROR;
1864 return i;
1865 }
1866 }
1867 }
1868 }
1869
1870 return u_terminateChars(variant, variantCapacity, i, err);
1871 }
1872
1873 U_CAPI int32_t U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1874 uloc_getName(const char* localeID,
1875 char* name,
1876 int32_t nameCapacity,
1877 UErrorCode* err)
1878 {
1879 if (U_FAILURE(*err)) {
1880 return 0;
1881 }
1882
1883 CheckedArrayByteSink sink(name, nameCapacity);
1884 ulocimp_getName(localeID, sink, err);
1885
1886 int32_t reslen = sink.NumberOfBytesAppended();
1887
1888 if (U_FAILURE(*err)) {
1889 return reslen;
1890 }
1891
1892 if (sink.Overflowed()) {
1893 *err = U_BUFFER_OVERFLOW_ERROR;
1894 } else {
1895 u_terminateChars(name, nameCapacity, reslen, err);
1896 }
1897
1898 return reslen;
1899 }
1900
1901 U_CAPI void U_EXPORT2
ulocimp_getName(const char * localeID,ByteSink & sink,UErrorCode * err)1902 ulocimp_getName(const char* localeID,
1903 ByteSink& sink,
1904 UErrorCode* err)
1905 {
1906 _canonicalize(localeID, sink, 0, err);
1907 }
1908
1909 U_CAPI int32_t U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1910 uloc_getBaseName(const char* localeID,
1911 char* name,
1912 int32_t nameCapacity,
1913 UErrorCode* err)
1914 {
1915 if (U_FAILURE(*err)) {
1916 return 0;
1917 }
1918
1919 CheckedArrayByteSink sink(name, nameCapacity);
1920 ulocimp_getBaseName(localeID, sink, err);
1921
1922 int32_t reslen = sink.NumberOfBytesAppended();
1923
1924 if (U_FAILURE(*err)) {
1925 return reslen;
1926 }
1927
1928 if (sink.Overflowed()) {
1929 *err = U_BUFFER_OVERFLOW_ERROR;
1930 } else {
1931 u_terminateChars(name, nameCapacity, reslen, err);
1932 }
1933
1934 return reslen;
1935 }
1936
1937 U_CAPI void U_EXPORT2
ulocimp_getBaseName(const char * localeID,ByteSink & sink,UErrorCode * err)1938 ulocimp_getBaseName(const char* localeID,
1939 ByteSink& sink,
1940 UErrorCode* err)
1941 {
1942 _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
1943 }
1944
1945 U_CAPI int32_t U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1946 uloc_canonicalize(const char* localeID,
1947 char* name,
1948 int32_t nameCapacity,
1949 UErrorCode* err)
1950 {
1951 if (U_FAILURE(*err)) {
1952 return 0;
1953 }
1954
1955 CheckedArrayByteSink sink(name, nameCapacity);
1956 ulocimp_canonicalize(localeID, sink, err);
1957
1958 int32_t reslen = sink.NumberOfBytesAppended();
1959
1960 if (U_FAILURE(*err)) {
1961 return reslen;
1962 }
1963
1964 if (sink.Overflowed()) {
1965 *err = U_BUFFER_OVERFLOW_ERROR;
1966 } else {
1967 u_terminateChars(name, nameCapacity, reslen, err);
1968 }
1969
1970 return reslen;
1971 }
1972
1973 U_CAPI void U_EXPORT2
ulocimp_canonicalize(const char * localeID,ByteSink & sink,UErrorCode * err)1974 ulocimp_canonicalize(const char* localeID,
1975 ByteSink& sink,
1976 UErrorCode* err)
1977 {
1978 _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
1979 }
1980
1981 U_CAPI const char* U_EXPORT2
uloc_getISO3Language(const char * localeID)1982 uloc_getISO3Language(const char* localeID)
1983 {
1984 int16_t offset;
1985 char lang[ULOC_LANG_CAPACITY];
1986 UErrorCode err = U_ZERO_ERROR;
1987
1988 if (localeID == nullptr)
1989 {
1990 localeID = uloc_getDefault();
1991 }
1992 uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
1993 if (U_FAILURE(err))
1994 return "";
1995 offset = _findIndex(LANGUAGES, lang);
1996 if (offset < 0)
1997 return "";
1998 return LANGUAGES_3[offset];
1999 }
2000
2001 U_CAPI const char* U_EXPORT2
uloc_getISO3Country(const char * localeID)2002 uloc_getISO3Country(const char* localeID)
2003 {
2004 int16_t offset;
2005 char cntry[ULOC_LANG_CAPACITY];
2006 UErrorCode err = U_ZERO_ERROR;
2007
2008 if (localeID == nullptr)
2009 {
2010 localeID = uloc_getDefault();
2011 }
2012 uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2013 if (U_FAILURE(err))
2014 return "";
2015 offset = _findIndex(COUNTRIES, cntry);
2016 if (offset < 0)
2017 return "";
2018
2019 return COUNTRIES_3[offset];
2020 }
2021
2022 U_CAPI uint32_t U_EXPORT2
uloc_getLCID(const char * localeID)2023 uloc_getLCID(const char* localeID)
2024 {
2025 UErrorCode status = U_ZERO_ERROR;
2026 char langID[ULOC_FULLNAME_CAPACITY];
2027 uint32_t lcid = 0;
2028
2029 /* Check for incomplete id. */
2030 if (!localeID || uprv_strlen(localeID) < 2) {
2031 return 0;
2032 }
2033
2034 // First, attempt Windows platform lookup if available, but fall
2035 // through to catch any special cases (ICU vs Windows name differences).
2036 lcid = uprv_convertToLCIDPlatform(localeID, &status);
2037 if (U_FAILURE(status)) {
2038 return 0;
2039 }
2040 if (lcid > 0) {
2041 // Windows found an LCID, return that
2042 return lcid;
2043 }
2044
2045 uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2046 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
2047 return 0;
2048 }
2049
2050 if (uprv_strchr(localeID, '@')) {
2051 // uprv_convertToLCID does not support keywords other than collation.
2052 // Remove all keywords except collation.
2053 int32_t len;
2054 char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2055
2056 CharString collVal;
2057 {
2058 CharStringByteSink sink(&collVal);
2059 ulocimp_getKeywordValue(localeID, "collation", sink, &status);
2060 }
2061
2062 if (U_SUCCESS(status) && !collVal.isEmpty()) {
2063 len = uloc_getBaseName(localeID, tmpLocaleID,
2064 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2065
2066 if (U_SUCCESS(status) && len > 0) {
2067 tmpLocaleID[len] = 0;
2068
2069 len = uloc_setKeywordValue("collation", collVal.data(), tmpLocaleID,
2070 UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2071
2072 if (U_SUCCESS(status) && len > 0) {
2073 tmpLocaleID[len] = 0;
2074 return uprv_convertToLCID(langID, tmpLocaleID, &status);
2075 }
2076 }
2077 }
2078
2079 // fall through - all keywords are simply ignored
2080 status = U_ZERO_ERROR;
2081 }
2082
2083 return uprv_convertToLCID(langID, localeID, &status);
2084 }
2085
2086 U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,char * locale,int32_t localeCapacity,UErrorCode * status)2087 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2088 UErrorCode *status)
2089 {
2090 return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2091 }
2092
2093 /* ### Default locale **************************************************/
2094
2095 U_CAPI const char* U_EXPORT2
uloc_getDefault()2096 uloc_getDefault()
2097 {
2098 return locale_get_default();
2099 }
2100
2101 U_CAPI void U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2102 uloc_setDefault(const char* newDefaultLocale,
2103 UErrorCode* err)
2104 {
2105 if (U_FAILURE(*err))
2106 return;
2107 /* the error code isn't currently used for anything by this function*/
2108
2109 /* propagate change to C++ */
2110 locale_set_default(newDefaultLocale);
2111 }
2112
2113 /**
2114 * Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
2115 * to an array of pointers to arrays of char. All of these pointers are owned
2116 * by ICU-- do not delete them, and do not write through them. The array is
2117 * terminated with a null pointer.
2118 */
2119 U_CAPI const char* const* U_EXPORT2
uloc_getISOLanguages()2120 uloc_getISOLanguages()
2121 {
2122 return LANGUAGES;
2123 }
2124
2125 /**
2126 * Returns a list of all 2-letter country codes defined in ISO 639. This is a
2127 * pointer to an array of pointers to arrays of char. All of these pointers are
2128 * owned by ICU-- do not delete them, and do not write through them. The array is
2129 * terminated with a null pointer.
2130 */
2131 U_CAPI const char* const* U_EXPORT2
uloc_getISOCountries()2132 uloc_getISOCountries()
2133 {
2134 return COUNTRIES;
2135 }
2136
2137 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2138 uloc_toUnicodeLocaleKey(const char* keyword)
2139 {
2140 const char* bcpKey = ulocimp_toBcpKey(keyword);
2141 if (bcpKey == nullptr && ultag_isUnicodeLocaleKey(keyword, -1)) {
2142 // unknown keyword, but syntax is fine..
2143 return keyword;
2144 }
2145 return bcpKey;
2146 }
2147
2148 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2149 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2150 {
2151 const char* bcpType = ulocimp_toBcpType(keyword, value, nullptr, nullptr);
2152 if (bcpType == nullptr && ultag_isUnicodeLocaleType(value, -1)) {
2153 // unknown keyword, but syntax is fine..
2154 return value;
2155 }
2156 return bcpType;
2157 }
2158
2159 static UBool
isWellFormedLegacyKey(const char * legacyKey)2160 isWellFormedLegacyKey(const char* legacyKey)
2161 {
2162 const char* p = legacyKey;
2163 while (*p) {
2164 if (!UPRV_ISALPHANUM(*p)) {
2165 return false;
2166 }
2167 p++;
2168 }
2169 return true;
2170 }
2171
2172 static UBool
isWellFormedLegacyType(const char * legacyType)2173 isWellFormedLegacyType(const char* legacyType)
2174 {
2175 const char* p = legacyType;
2176 int32_t alphaNumLen = 0;
2177 while (*p) {
2178 if (*p == '_' || *p == '/' || *p == '-') {
2179 if (alphaNumLen == 0) {
2180 return false;
2181 }
2182 alphaNumLen = 0;
2183 } else if (UPRV_ISALPHANUM(*p)) {
2184 alphaNumLen++;
2185 } else {
2186 return false;
2187 }
2188 p++;
2189 }
2190 return (alphaNumLen != 0);
2191 }
2192
2193 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2194 uloc_toLegacyKey(const char* keyword)
2195 {
2196 const char* legacyKey = ulocimp_toLegacyKey(keyword);
2197 if (legacyKey == nullptr) {
2198 // Checks if the specified locale key is well-formed with the legacy locale syntax.
2199 //
2200 // Note:
2201 // LDML/CLDR provides some definition of keyword syntax in
2202 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2203 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2204 // Keys can only consist of [0-9a-zA-Z].
2205 if (isWellFormedLegacyKey(keyword)) {
2206 return keyword;
2207 }
2208 }
2209 return legacyKey;
2210 }
2211
2212 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2213 uloc_toLegacyType(const char* keyword, const char* value)
2214 {
2215 const char* legacyType = ulocimp_toLegacyType(keyword, value, nullptr, nullptr);
2216 if (legacyType == nullptr) {
2217 // Checks if the specified locale type is well-formed with the legacy locale syntax.
2218 //
2219 // Note:
2220 // LDML/CLDR provides some definition of keyword syntax in
2221 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2222 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2223 // Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2224 // we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2225 if (isWellFormedLegacyType(value)) {
2226 return value;
2227 }
2228 }
2229 return legacyType;
2230 }
2231
2232 /*eof*/
2233