1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1997-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 04/01/97 aliu Creation.
15 * 08/21/98 stephen JDK 1.2 sync
16 * 12/08/98 rtg New Locale implementation and C API
17 * 03/15/99 damiba overhaul.
18 * 04/06/99 stephen changed setDefault() to realloc and copy
19 * 06/14/99 stephen Changed calls to ures_open for new params
20 * 07/21/99 stephen Modified setDefault() to propagate to C++
21 * 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22 * brought canonicalization code into line with spec
23 *****************************************************************************/
24
25 /*
26 POSIX's locale format, from putil.c: [no spaces]
27
28 ll [ _CC ] [ . MM ] [ @ VV]
29
30 l = lang, C = ctry, M = charmap, V = variant
31 */
32
33 #include <optional>
34
35 #include "unicode/bytestream.h"
36 #include "unicode/errorcode.h"
37 #include "unicode/stringpiece.h"
38 #include "unicode/utypes.h"
39 #include "unicode/ustring.h"
40 #include "unicode/uloc.h"
41
42 #include "bytesinkutil.h"
43 #include "putilimp.h"
44 #include "ustr_imp.h"
45 #include "ulocimp.h"
46 #include "umutex.h"
47 #include "cstring.h"
48 #include "cmemory.h"
49 #include "locmap.h"
50 #include "uarrsort.h"
51 #include "uenumimp.h"
52 #include "uassert.h"
53 #include "charstr.h"
54
55 U_NAMESPACE_USE
56
57 /* ### Declarations **************************************************/
58
59 /* Locale stuff from locid.cpp */
60 U_CFUNC void locale_set_default(const char *id);
61 U_CFUNC const char *locale_get_default();
62
63 namespace {
64
65 /* ### Data tables **************************************************/
66
67 /**
68 * Table of language codes, both 2- and 3-letter, with preference
69 * given to 2-letter codes where possible. Includes 3-letter codes
70 * that lack a 2-letter equivalent.
71 *
72 * This list must be in sorted order. This list is returned directly
73 * to the user by some API.
74 *
75 * This list must be kept in sync with LANGUAGES_3, with corresponding
76 * entries matched.
77 *
78 * This table should be terminated with a nullptr entry, followed by a
79 * second list, and another nullptr entry. The first list is visible to
80 * user code when this array is returned by API. The second list
81 * contains codes we support, but do not expose through user API.
82 *
83 * Notes
84 *
85 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
86 * include the revisions up to 2001/7/27 *CWB*
87 *
88 * The 3 character codes are the terminology codes like RFC 3066. This
89 * is compatible with prior ICU codes
90 *
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn. They are kept in the
 * table, but moved to the end, because their 3-character codes
 * duplicate those of other entries. This avoids bad searches going
 * from 3- to 2-character codes.
95 *
96 * The range qaa-qtz is reserved for local use
97 */
98 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
99 /* ISO639 table version is 20150505 */
100 /* Subsequent hand addition of selected languages */
constexpr const char* LANGUAGES[] = {
    /* First list. Entry i here corresponds to LANGUAGES_3[i]; any       */
    /* insertion or removal must be mirrored at the same index there.    */
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgc", "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "blo", "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "csw", "cu",  "cv",  "cy",
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "kxv", "ky",
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni",
    "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    "qu",  "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "syc", "syr", "szl",
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tok", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vmw",
    "vo",  "vot", "vro", "vun",
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    "xal", "xh",  "xmf", "xnr", "xog",
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
nullptr, /* terminates the first list */
    /* Second list: the codes in it pair with the second list of LANGUAGES_3. */
    "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  /* obsolete language codes */
nullptr /* terminates the second list */
};
191
/* Deprecated language codes. The entries line up index-by-index with
 * REPLACEMENT_LANGUAGES (in/id, iw/he, ji/yi, jw/jv, mo/ro).
 * NOTE(review): the code that searches this table is outside this chunk;
 * the two trailing nullptr entries presumably mirror the double-terminated
 * layout of LANGUAGES - confirm before changing them. */
constexpr const char* DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", "mo", nullptr, nullptr
};
/* Current language codes, positioned to line up index-by-index with
 * DEPRECATED_LANGUAGES (id/in, he/iw, yi/ji, jv/jw, ro/mo). The two tables
 * must keep the same length and order.
 * NOTE(review): the lookup code is outside this chunk - confirm usage. */
constexpr const char* REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", "ro", nullptr, nullptr
};
198
199 /**
200 * Table of 3-letter language codes.
201 *
202 * This is a lookup table used to convert 3-letter language codes to
203 * their 2-letter equivalent, where possible. It must be kept in sync
204 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
205 * same language as LANGUAGES_3[i]. The commented-out lines are
206 * copied from LANGUAGES to make eyeballing this baby easier.
207 *
208 * Where a 3-letter language code has no 2-letter equivalent, the
209 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
210 *
211 * This table should be terminated with a nullptr entry, followed by a
212 * second list, and another nullptr entry. The two lists correspond to
213 * the two lists in LANGUAGES.
214 */
215 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
216 /* ISO639 table version is 20150505 */
217 /* Subsequent hand addition of selected languages */
constexpr const char* LANGUAGES_3[] = {
    /* First list. Entry i here corresponds to LANGUAGES[i]; any         */
    /* insertion or removal must be mirrored at the same index there.    */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "csw", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kxv", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw",
    "vol", "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xnr", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
nullptr, /* terminates the first list */
    /* Second list: 3-letter forms of the codes in the second list of LANGUAGES. */
/*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
nullptr /* terminates the second list */
};
309
310 /**
311 * Table of 2-letter country codes.
312 *
313 * This list must be in sorted order. This list is returned directly
314 * to the user by some API.
315 *
316 * This list must be kept in sync with COUNTRIES_3, with corresponding
317 * entries matched.
318 *
319 * This table should be terminated with a nullptr entry, followed by a
320 * second list, and another nullptr entry. The first list is visible to
321 * user code when this array is returned by API. The second list
322 * contains codes we support, but do not expose through user API.
323 *
324 * Notes:
325 *
326 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
327 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
328 * new codes keeping the old ones for compatibility updated to include
329 * 1999/12/03 revisions *CWB*
330 *
331 * RO(ROM) is now RO(ROU) according to
332 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
333 */
constexpr const char* COUNTRIES[] = {
    /* First list. Entry i here corresponds to COUNTRIES_3[i]; any       */
    /* insertion or removal must be mirrored at the same index there.    */
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
nullptr, /* terminates the first list */
    /* Second list: the codes in it pair with the second list of COUNTRIES_3. */
    "AN",  "BU", "CS", "FX", "RO" ,"SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
nullptr /* terminates the second list */
};
369
/* Deprecated country codes. The entries line up index-by-index with
 * REPLACEMENT_COUNTRIES (e.g. "AN"/"CW", "ZR"/"CD"); both tables must keep
 * the same length and order.
 * NOTE(review): the code that searches this table is outside this chunk;
 * the two trailing nullptr entries presumably mirror the double-terminated
 * layout of COUNTRIES - confirm before changing them. */
constexpr const char* DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", nullptr, nullptr /* deprecated country list */
};
/* Current country codes, positioned to line up index-by-index with
 * DEPRECATED_COUNTRIES; the commented row below repeats that table so the
 * pairing can be checked by eye. Note that "RS" appears twice (for both
 * "CS" and "YU"). */
constexpr const char* REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", nullptr, nullptr  /* replacement country codes */
};
377
378 /**
379 * Table of 3-letter country codes.
380 *
381 * This is a lookup table used to convert 3-letter country codes to
382 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
383 * For all valid i, COUNTRIES[i] must refer to the same country as
384 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
385 * to make eyeballing this baby easier.
386 *
387 * This table should be terminated with a nullptr entry, followed by a
388 * second list, and another nullptr entry. The two lists correspond to
389 * the two lists in COUNTRIES.
390 */
constexpr const char* COUNTRIES_3[] = {
    /* First list. Each commented row repeats the matching COUNTRIES row; */
    /* the row beneath it holds the 3-letter codes at the same indexes.   */
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
nullptr, /* terminates the first list */
    /* Second list: 3-letter forms of the codes in the second list of COUNTRIES. */
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
nullptr /* terminates the second list */
};
457
/* One row of a locale-ID canonicalization table: an input ID and the ID it is rewritten to. */
struct CanonicalizationMap {
    const char *id;          /* locale ID as it may be supplied by a caller */
    const char *canonicalID; /* ID that replaces it after canonicalization */
};
462
/**
 * A map to canonicalize locale IDs. This handles a variety of
 * different semantic kinds of transformations.
 *
 * Each row pairs an input ID (first string) with the ID that replaces it
 * (second string). Rows written with a double underscore ("zh__HAKKA")
 * leave the field between language and the trailing subtag empty; the
 * remaining rows use a single underscore (see the note inside the table).
 */
constexpr CanonicalizationMap CANONICALIZE_MAP[] = {
    { "art__LOJBAN",    "jbo" }, /* registered name */
    { "hy__AREVELA",    "hy" }, /* Registered IANA variant */
    { "hy__AREVMDA",    "hyw" }, /* Registered IANA variant */
    { "zh__GUOYU",      "zh" }, /* registered name */
    { "zh__HAKKA",      "hak" }, /* registered name */
    { "zh__XIANG",      "hsn" }, /* registered name */
    // subtags with 3 chars won't be treated as variants.
    { "zh_GAN",         "gan" }, /* registered name */
    { "zh_MIN_NAN",     "nan" }, /* registered name */
    { "zh_WUU",         "wuu" }, /* registered name */
    { "zh_YUE",         "yue" }, /* registered name */
};
480
481 /* ### BCP47 Conversion *******************************************/
482 /* Gets the size of the shortest subtag in the given localeID. */
getShortestSubtagLength(const char * localeID)483 int32_t getShortestSubtagLength(const char *localeID) {
484 int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
485 int32_t length = localeIDLength;
486 int32_t tmpLength = 0;
487 int32_t i;
488 bool reset = true;
489
490 for (i = 0; i < localeIDLength; i++) {
491 if (localeID[i] != '_' && localeID[i] != '-') {
492 if (reset) {
493 tmpLength = 0;
494 reset = false;
495 }
496 tmpLength++;
497 } else {
498 if (tmpLength != 0 && tmpLength < length) {
499 length = tmpLength;
500 }
501 reset = true;
502 }
503 }
504
505 return length;
506 }
507 /* Test if the locale id has BCP47 u extension and does not have '@' */
_hasBCP47Extension(const char * id)508 inline bool _hasBCP47Extension(const char *id) {
509 return id != nullptr && uprv_strstr(id, "@") == nullptr && getShortestSubtagLength(id) == 1;
510 }
511
512 /* ### Keywords **************************************************/
/* True for the decimal digit characters '0' through '9'. */
inline bool UPRV_ISDIGIT(char c) {
    return !(c < '0' || '9' < c);
}
UPRV_ISALPHANUM(char c)514 inline bool UPRV_ISALPHANUM(char c) { return uprv_isASCIILetter(c) || UPRV_ISDIGIT(c); }
/* Punctuation/symbols allowed in legacy key values: '_', '-', '+' and '/'. */
inline bool UPRV_OK_VALUE_PUNCTUATION(char c) {
    switch (c) {
    case '_':
    case '-':
    case '+':
    case '/':
        return true;
    default:
        return false;
    }
}
517
518 } // namespace
519
/* Size of the KeywordStruct::keyword buffer, terminating NUL included.
 * ulocimp_getKeywords() rejects a keyword name whose raw text before '='
 * is this long or longer. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Number of slots in the keyword list that ulocimp_getKeywords() builds;
 * a further distinct keyword makes it fail with U_INTERNAL_PROGRAM_ERROR. */
#define ULOC_MAX_NO_KEYWORDS 25
522
523 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)524 locale_getKeywordsStart(const char *localeID) {
525 const char *result = nullptr;
526 if((result = uprv_strchr(localeID, '@')) != nullptr) {
527 return result;
528 }
529 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
530 else {
531 /* We do this because the @ sign is variant, and the @ sign used on one
532 EBCDIC machine won't be compiled the same way on other EBCDIC based
533 machines. */
534 static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
535 const uint8_t *charToFind = ebcdicSigns;
536 while(*charToFind) {
537 if((result = uprv_strchr(localeID, *charToFind)) != nullptr) {
538 return result;
539 }
540 charToFind++;
541 }
542 }
543 #endif
544 return nullptr;
545 }
546
547 namespace {
548
549 /**
550 * @param keywordName incoming name to be canonicalized
551 * @param status return status (keyword too long)
552 * @return the keyword name
553 */
locale_canonKeywordName(const char * keywordName,UErrorCode & status)554 CharString locale_canonKeywordName(const char* keywordName, UErrorCode& status)
555 {
556 if (U_FAILURE(status)) { return {}; }
557 CharString result;
558
559 for (; *keywordName != 0; keywordName++) {
560 if (!UPRV_ISALPHANUM(*keywordName)) {
561 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
562 return {};
563 }
564 result.append(uprv_tolower(*keywordName), status);
565 }
566 if (result.isEmpty()) {
567 status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
568 return {};
569 }
570
571 return result;
572 }
573
574 typedef struct {
575 char keyword[ULOC_KEYWORD_BUFFER_LEN];
576 int32_t keywordLen;
577 const char *valueStart;
578 int32_t valueLen;
579 } KeywordStruct;
580
581 int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)582 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
583 const char* leftString = ((const KeywordStruct *)left)->keyword;
584 const char* rightString = ((const KeywordStruct *)right)->keyword;
585 return uprv_strcmp(leftString, rightString);
586 }
587
588 } // namespace
589
590 U_EXPORT CharString
ulocimp_getKeywords(const char * localeID,char prev,bool valuesToo,UErrorCode & status)591 ulocimp_getKeywords(const char* localeID,
592 char prev,
593 bool valuesToo,
594 UErrorCode& status)
595 {
596 return ByteSinkUtil::viaByteSinkToCharString(
597 [&](ByteSink& sink, UErrorCode& status) {
598 ulocimp_getKeywords(localeID,
599 prev,
600 sink,
601 valuesToo,
602 status);
603 },
604 status);
605 }
606
/*
 * Parses a "key=value;key=value" keyword list and writes it to sink in
 * normalized form: keyword names lowercased with spaces removed, values
 * trimmed of leading/trailing spaces, duplicates dropped (the first
 * occurrence wins) and the entries sorted by keyword name.
 *
 * @param localeID  text to parse. NOTE(review): presumably this points just
 *                  past the '@' of a locale ID - confirm against callers.
 * @param prev      parsing only happens when this is '@'; for any other
 *                  value the function writes nothing and leaves status alone
 * @param sink      receives the output. With valuesToo the output is
 *                  "key=value" entries joined by ';'; without it, each
 *                  keyword name followed by a single NUL byte.
 * @param valuesToo whether values are emitted along with the keyword names
 * @param status    in/out error code. Nothing is done if it already
 *                  indicates failure. U_INVALID_FORMAT_ERROR: missing '=',
 *                  ';' before '=', empty keyword name or empty value.
 *                  U_INTERNAL_PROGRAM_ERROR: too many keywords or a keyword
 *                  name too long for the fixed-size buffers.
 */
U_EXPORT void
ulocimp_getKeywords(const char* localeID,
                    char prev,
                    ByteSink& sink,
                    bool valuesToo,
                    UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }

    /* Fixed-size scratch list; values are referenced in place, not copied. */
    KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];

    int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
    int32_t numKeywords = 0;
    const char* pos = localeID;
    const char* equalSign = nullptr;
    const char* semicolon = nullptr;
    int32_t i = 0, j, n;

    if(prev == '@') { /* start of keyword definition */
        /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
        do {
            bool duplicate = false;
            /* skip leading spaces */
            while(*pos == ' ') {
                pos++;
            }
            if (!*pos) { /* handle trailing "; " */
                break;
            }
            /* All slots are taken and there is still another pair to read. */
            if(numKeywords == maxKeywords) {
                status = U_INTERNAL_PROGRAM_ERROR;
                return;
            }
            equalSign = uprv_strchr(pos, '=');
            semicolon = uprv_strchr(pos, ';');
            /* lack of '=' [foo@currency] is illegal */
            /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
            if(!equalSign || (semicolon && semicolon<equalSign)) {
                status = U_INVALID_FORMAT_ERROR;
                return;
            }
            /* need to normalize the keyword name; check it fits first */
            /* (the check uses the raw length, embedded spaces included) */
            if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
                /* keyword name too long for internal buffer */
                status = U_INTERNAL_PROGRAM_ERROR;
                return;
            }
            /* Copy the name lowercased, dropping every space; n counts the kept chars. */
            for(i = 0, n = 0; i < equalSign - pos; ++i) {
                if (pos[i] != ' ') {
                    keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
                }
            }

            /* zero-length keyword is an error. */
            if (n == 0) {
                status = U_INVALID_FORMAT_ERROR;
                return;
            }

            keywordList[numKeywords].keyword[n] = 0;
            keywordList[numKeywords].keywordLen = n;
            /* now grab the value part. First we skip the '=' */
            equalSign++;
            /* then we skip leading spaces */
            while(*equalSign == ' ') {
                equalSign++;
            }

            /* Premature end or zero-length value */
            if (!*equalSign || equalSign == semicolon) {
                status = U_INVALID_FORMAT_ERROR;
                return;
            }

            /* From here on equalSign points at the first character of the value. */
            keywordList[numKeywords].valueStart = equalSign;

            pos = semicolon;
            i = 0;
            if(pos) {
                /* Value ends at the ';': count trailing spaces to trim. The scan */
                /* stops at the value's first character, which is not a space.    */
                while(*(pos - i - 1) == ' ') {
                    i++;
                }
                keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
                pos++;
            } else {
                /* Last pair: value runs to the end of the string, minus trailing spaces. */
                i = (int32_t)uprv_strlen(equalSign);
                while(i && equalSign[i-1] == ' ') {
                    i--;
                }
                keywordList[numKeywords].valueLen = i;
            }
            /* If this is a duplicate keyword, then ignore it */
            for (j=0; j<numKeywords; ++j) {
                if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
                    duplicate = true;
                    break;
                }
            }
            /* A duplicate leaves numKeywords unchanged, so its slot is reused by the next pair. */
            if (!duplicate) {
                ++numKeywords;
            }
        } while(pos); /* pos is null once the pair without a trailing ';' has been read */

        /* now we have a list of keywords */
        /* we need to sort it */
        uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, &status);

        /* Now construct the keyword part */
        for(i = 0; i < numKeywords; i++) {
            sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
            if(valuesToo) {
                sink.Append("=", 1);
                sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
                /* ';' goes between entries only, not after the last one */
                if(i < numKeywords - 1) {
                    sink.Append(";", 1);
                }
            } else {
                /* names only: each name is followed by one NUL byte */
                sink.Append("\0", 1);
            }
        }
    }
}
729
730 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)731 uloc_getKeywordValue(const char* localeID,
732 const char* keywordName,
733 char* buffer, int32_t bufferCapacity,
734 UErrorCode* status)
735 {
736 return ByteSinkUtil::viaByteSinkToTerminatedChars(
737 buffer, bufferCapacity,
738 [&](ByteSink& sink, UErrorCode& status) {
739 ulocimp_getKeywordValue(localeID, keywordName, sink, status);
740 },
741 *status);
742 }
743
744 U_EXPORT CharString
ulocimp_getKeywordValue(const char * localeID,const char * keywordName,UErrorCode & status)745 ulocimp_getKeywordValue(const char* localeID,
746 const char* keywordName,
747 UErrorCode& status)
748 {
749 return ByteSinkUtil::viaByteSinkToCharString(
750 [&](ByteSink& sink, UErrorCode& status) {
751 ulocimp_getKeywordValue(localeID, keywordName, sink, status);
752 },
753 status);
754 }
755
756 U_EXPORT void
ulocimp_getKeywordValue(const char * localeID,const char * keywordName,icu::ByteSink & sink,UErrorCode & status)757 ulocimp_getKeywordValue(const char* localeID,
758 const char* keywordName,
759 icu::ByteSink& sink,
760 UErrorCode& status)
761 {
762 if (U_FAILURE(status)) { return; }
763
764 if (localeID == nullptr || keywordName == nullptr || keywordName[0] == 0) {
765 status = U_ILLEGAL_ARGUMENT_ERROR;
766 return;
767 }
768
769 const char* startSearchHere = nullptr;
770 const char* nextSeparator = nullptr;
771
772 CharString tempBuffer;
773 const char* tmpLocaleID;
774
775 CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
776 if (U_FAILURE(status)) {
777 return;
778 }
779
780 if (_hasBCP47Extension(localeID)) {
781 tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, status);
782 tmpLocaleID = U_SUCCESS(status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
783 } else {
784 tmpLocaleID=localeID;
785 }
786
787 startSearchHere = locale_getKeywordsStart(tmpLocaleID);
788 if(startSearchHere == nullptr) {
789 /* no keywords, return at once */
790 return;
791 }
792
793 /* find the first keyword */
794 while(startSearchHere) {
795 const char* keyValueTail;
796
797 startSearchHere++; /* skip @ or ; */
798 nextSeparator = uprv_strchr(startSearchHere, '=');
799 if(!nextSeparator) {
800 status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
801 return;
802 }
803 /* strip leading & trailing spaces (TC decided to tolerate these) */
804 while(*startSearchHere == ' ') {
805 startSearchHere++;
806 }
807 keyValueTail = nextSeparator;
808 while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
809 keyValueTail--;
810 }
811 /* now keyValueTail points to first char after the keyName */
812 /* copy & normalize keyName from locale */
813 if (startSearchHere == keyValueTail) {
814 status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
815 return;
816 }
817 CharString localeKeywordName;
818 while (startSearchHere < keyValueTail) {
819 if (!UPRV_ISALPHANUM(*startSearchHere)) {
820 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
821 return;
822 }
823 localeKeywordName.append(uprv_tolower(*startSearchHere++), status);
824 }
825 if (U_FAILURE(status)) {
826 return;
827 }
828
829 startSearchHere = uprv_strchr(nextSeparator, ';');
830
831 if (canonKeywordName == localeKeywordName) {
832 /* current entry matches the keyword. */
833 nextSeparator++; /* skip '=' */
834 /* First strip leading & trailing spaces (TC decided to tolerate these) */
835 while(*nextSeparator == ' ') {
836 nextSeparator++;
837 }
838 keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
839 while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
840 keyValueTail--;
841 }
842 /* Now copy the value, but check well-formedness */
843 if (nextSeparator == keyValueTail) {
844 status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
845 return;
846 }
847 while (nextSeparator < keyValueTail) {
848 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
849 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
850 return;
851 }
852 /* Should we lowercase value to return here? Tests expect as-is. */
853 sink.Append(nextSeparator++, 1);
854 }
855 return;
856 }
857 }
858 }
859
860 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)861 uloc_setKeywordValue(const char* keywordName,
862 const char* keywordValue,
863 char* buffer, int32_t bufferCapacity,
864 UErrorCode* status)
865 {
866 if (U_FAILURE(*status)) { return 0; }
867
868 if (bufferCapacity <= 1) {
869 *status = U_ILLEGAL_ARGUMENT_ERROR;
870 return 0;
871 }
872
873 int32_t bufLen = (int32_t)uprv_strlen(buffer);
874 if(bufferCapacity<bufLen) {
875 /* The capacity is less than the length?! Is this NUL terminated? */
876 *status = U_ILLEGAL_ARGUMENT_ERROR;
877 return 0;
878 }
879
880 char* keywords = const_cast<char*>(locale_getKeywordsStart(buffer));
881 int32_t baseLen = keywords == nullptr ? bufLen : keywords - buffer;
882 // Remove -1 from the capacity so that this function can guarantee NUL termination.
883 CheckedArrayByteSink sink(keywords == nullptr ? buffer + bufLen : keywords,
884 bufferCapacity - baseLen - 1);
885 int32_t reslen = ulocimp_setKeywordValue(
886 keywords, keywordName, keywordValue, sink, *status);
887
888 if (U_FAILURE(*status)) {
889 return *status == U_BUFFER_OVERFLOW_ERROR ? reslen + baseLen : 0;
890 }
891
892 // See the documentation for this function, it's guaranteed to never
893 // overflow the buffer but instead abort with BUFFER_OVERFLOW_ERROR.
894 // In this case, nothing has been written to the sink, so it cannot have Overflowed().
895 U_ASSERT(!sink.Overflowed());
896 U_ASSERT(reslen >= 0);
897 return u_terminateChars(buffer, bufferCapacity, reslen + baseLen, status);
898 }
899
900 U_EXPORT void
ulocimp_setKeywordValue(const char * keywordName,const char * keywordValue,CharString & localeID,UErrorCode & status)901 ulocimp_setKeywordValue(const char* keywordName,
902 const char* keywordValue,
903 CharString& localeID,
904 UErrorCode& status)
905 {
906 if (U_FAILURE(status)) { return; }
907 // This is safe because CharString::truncate() doesn't actually erase any
908 // data, but simply sets the position for where new data will be written.
909 const char* keywords = locale_getKeywordsStart(localeID.data());
910 if (keywords != nullptr) localeID.truncate(keywords - localeID.data());
911 CharStringByteSink sink(&localeID);
912 ulocimp_setKeywordValue(keywords, keywordName, keywordValue, sink, status);
913 }
914
915 U_EXPORT int32_t
ulocimp_setKeywordValue(const char * keywords,const char * keywordName,const char * keywordValue,ByteSink & sink,UErrorCode & status)916 ulocimp_setKeywordValue(const char* keywords,
917 const char* keywordName,
918 const char* keywordValue,
919 ByteSink& sink,
920 UErrorCode& status)
921 {
922 if (U_FAILURE(status)) { return 0; }
923
924 /* TODO: sorting. removal. */
925 int32_t needLen = 0;
926 int32_t rc;
927 const char* nextSeparator = nullptr;
928 const char* nextEqualsign = nullptr;
929 const char* keywordStart = nullptr;
930 CharString updatedKeysAndValues;
931 bool handledInputKeyAndValue = false;
932 char keyValuePrefix = '@';
933
934 if (status == U_STRING_NOT_TERMINATED_WARNING) {
935 status = U_ZERO_ERROR;
936 }
937 if (keywordName == nullptr || keywordName[0] == 0) {
938 status = U_ILLEGAL_ARGUMENT_ERROR;
939 return 0;
940 }
941 CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
942 if (U_FAILURE(status)) {
943 return 0;
944 }
945
946 CharString canonKeywordValue;
947 if(keywordValue) {
948 while (*keywordValue != 0) {
949 if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
950 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
951 return 0;
952 }
953 /* Should we force lowercase in value to set? */
954 canonKeywordValue.append(*keywordValue++, status);
955 }
956 }
957 if (U_FAILURE(status)) {
958 return 0;
959 }
960
961 if (keywords == nullptr || keywords[1] == '\0') {
962 if (canonKeywordValue.isEmpty()) { /* no keywords = nothing to remove */
963 U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
964 return 0;
965 }
966
967 needLen = 1 + canonKeywordName.length() + 1 + canonKeywordValue.length();
968 int32_t capacity = 0;
969 char* buffer = sink.GetAppendBuffer(
970 needLen, needLen, nullptr, needLen, &capacity);
971 if (capacity < needLen || buffer == nullptr) {
972 status = U_BUFFER_OVERFLOW_ERROR;
973 return needLen; /* no change */
974 }
975 char* it = buffer;
976
977 *it++ = '@';
978 uprv_memcpy(it, canonKeywordName.data(), canonKeywordName.length());
979 it += canonKeywordName.length();
980 *it++ = '=';
981 uprv_memcpy(it, canonKeywordValue.data(), canonKeywordValue.length());
982 sink.Append(buffer, needLen);
983 U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
984 return needLen;
985 } /* end shortcut - no @ */
986
987 keywordStart = keywords;
988 /* search for keyword */
989 while(keywordStart) {
990 const char* keyValueTail;
991
992 keywordStart++; /* skip @ or ; */
993 nextEqualsign = uprv_strchr(keywordStart, '=');
994 if (!nextEqualsign) {
995 status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
996 return 0;
997 }
998 /* strip leading & trailing spaces (TC decided to tolerate these) */
999 while(*keywordStart == ' ') {
1000 keywordStart++;
1001 }
1002 keyValueTail = nextEqualsign;
1003 while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
1004 keyValueTail--;
1005 }
1006 /* now keyValueTail points to first char after the keyName */
1007 /* copy & normalize keyName from locale */
1008 if (keywordStart == keyValueTail) {
1009 status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
1010 return 0;
1011 }
1012 CharString localeKeywordName;
1013 while (keywordStart < keyValueTail) {
1014 if (!UPRV_ISALPHANUM(*keywordStart)) {
1015 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1016 return 0;
1017 }
1018 localeKeywordName.append(uprv_tolower(*keywordStart++), status);
1019 }
1020 if (U_FAILURE(status)) {
1021 return 0;
1022 }
1023
1024 nextSeparator = uprv_strchr(nextEqualsign, ';');
1025
1026 /* start processing the value part */
1027 nextEqualsign++; /* skip '=' */
1028 /* First strip leading & trailing spaces (TC decided to tolerate these) */
1029 while(*nextEqualsign == ' ') {
1030 nextEqualsign++;
1031 }
1032 keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1033 while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1034 keyValueTail--;
1035 }
1036 if (nextEqualsign == keyValueTail) {
1037 status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1038 return 0;
1039 }
1040
1041 rc = uprv_strcmp(canonKeywordName.data(), localeKeywordName.data());
1042 if(rc == 0) {
1043 /* Current entry matches the input keyword. Update the entry */
1044 if (!canonKeywordValue.isEmpty()) { /* updating a value */
1045 updatedKeysAndValues.append(keyValuePrefix, status);
1046 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1047 updatedKeysAndValues.append(canonKeywordName, status);
1048 updatedKeysAndValues.append('=', status);
1049 updatedKeysAndValues.append(canonKeywordValue, status);
1050 } /* else removing this entry, don't emit anything */
1051 handledInputKeyAndValue = true;
1052 } else {
1053 /* input keyword sorts earlier than current entry, add before current entry */
1054 if (rc < 0 && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) {
1055 /* insert new entry at this location */
1056 updatedKeysAndValues.append(keyValuePrefix, status);
1057 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1058 updatedKeysAndValues.append(canonKeywordName, status);
1059 updatedKeysAndValues.append('=', status);
1060 updatedKeysAndValues.append(canonKeywordValue, status);
1061 handledInputKeyAndValue = true;
1062 }
1063 /* copy the current entry */
1064 updatedKeysAndValues.append(keyValuePrefix, status);
1065 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1066 updatedKeysAndValues.append(localeKeywordName, status);
1067 updatedKeysAndValues.append('=', status);
1068 updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), status);
1069 }
1070 if (!nextSeparator && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) {
1071 /* append new entry at the end, it sorts later than existing entries */
1072 updatedKeysAndValues.append(keyValuePrefix, status);
1073 /* skip keyValuePrefix update, no subsequent key-value pair */
1074 updatedKeysAndValues.append(canonKeywordName, status);
1075 updatedKeysAndValues.append('=', status);
1076 updatedKeysAndValues.append(canonKeywordValue, status);
1077 handledInputKeyAndValue = true;
1078 }
1079 keywordStart = nextSeparator;
1080 } /* end loop searching */
1081
1082 /* Any error from updatedKeysAndValues.append above would be internal and not due to
1083 * problems with the passed-in locale. So if we did encounter problems with the
1084 * passed-in locale above, those errors took precedence and overrode any error
1085 * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1086 * are errors here they are from updatedKeysAndValues.append; they do cause an
1087 * error return but the passed-in locale is unmodified and the original bufLen is
1088 * returned.
1089 */
1090 if (!handledInputKeyAndValue || U_FAILURE(status)) {
1091 /* if input key/value specified removal of a keyword not present in locale, or
1092 * there was an error in CharString.append, leave original locale alone. */
1093 U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
1094 return (int32_t)uprv_strlen(keywords);
1095 }
1096
1097 needLen = updatedKeysAndValues.length();
1098 // Check to see can we fit the updatedKeysAndValues, if not, return
1099 // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it.
1100 // We do this because this API function does not behave like most others:
1101 // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
1102 // When the contents fits but without the terminating NUL, in this case we need to not change
1103 // the buffer contents and return with a buffer overflow error.
1104 if (needLen > 0) {
1105 int32_t capacity = 0;
1106 char* buffer = sink.GetAppendBuffer(
1107 needLen, needLen, nullptr, needLen, &capacity);
1108 if (capacity < needLen || buffer == nullptr) {
1109 status = U_BUFFER_OVERFLOW_ERROR;
1110 return needLen;
1111 }
1112 uprv_memcpy(buffer, updatedKeysAndValues.data(), needLen);
1113 sink.Append(buffer, needLen);
1114 }
1115 U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
1116 return needLen;
1117 }
1118
1119 /* ### ID parsing implementation **************************************************/
1120
1121 namespace {
1122
/* Returns true if a is 'x' or 'i' in either case. */
inline bool _isPrefixLetter(char a) {
    switch (a) {
    case 'x':
    case 'X':
    case 'i':
    case 'I':
        return true;
    default:
        return false;
    }
}
1124
1125 /*returns true if one of the special prefixes is here (s=string)
1126 'x-' or 'i-' */
_isIDPrefix(const char * s)1127 inline bool _isIDPrefix(const char *s) { return _isPrefixLetter(s[0]) && _isIDSeparator(s[1]); }
1128
/* Returns true for the characters that end the subtag part of a locale ID:
 * NUL, '@', and '.'. The dot is included because in the POSIX form it
 * precedes the codepage (except for variant).
 */
inline bool _isTerminator(char a) {
    switch (a) {
    case '\0':
    case '.':
    case '@':
        return true;
    default:
        return false;
    }
}
1133
/* Returns true if p points at "-t-", "-u-" or "-x-" (letter in either case).
 * Characters are examined left to right and the check stops at the first
 * mismatch, so nothing past a terminating NUL is read. */
inline bool _isBCP47Extension(const char* p) {
    if (p[0] != '-') {
        return false;
    }
    switch (p[1]) {
    case 't': case 'T':
    case 'u': case 'U':
    case 'x': case 'X':
        break;
    default:
        return false;
    }
    return p[2] == '-';
}
1141
1142 /**
1143 * Lookup 'key' in the array 'list'. The array 'list' should contain
1144 * a nullptr entry, followed by more entries, and a second nullptr entry.
1145 *
1146 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1147 * COUNTRIES_3.
1148 */
_findIndex(const char * const * list,const char * key)1149 std::optional<int16_t> _findIndex(const char* const* list, const char* key)
1150 {
1151 const char* const* anchor = list;
1152 int32_t pass = 0;
1153
1154 /* Make two passes through two nullptr-terminated arrays at 'list' */
1155 while (pass++ < 2) {
1156 while (*list) {
1157 if (uprv_strcmp(key, *list) == 0) {
1158 return (int16_t)(list - anchor);
1159 }
1160 list++;
1161 }
1162 ++list; /* skip final nullptr *CWB*/
1163 }
1164 return std::nullopt;
1165 }
1166
1167 } // namespace
1168
1169 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1170 uloc_getCurrentCountryID(const char* oldID){
1171 std::optional<int16_t> offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1172 return offset.has_value() ? REPLACEMENT_COUNTRIES[*offset] : oldID;
1173 }
1174 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1175 uloc_getCurrentLanguageID(const char* oldID){
1176 std::optional<int16_t> offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1177 return offset.has_value() ? REPLACEMENT_LANGUAGES[*offset] : oldID;
1178 }
1179
1180 namespace {
1181
1182 /*
1183 * the internal functions _getLanguage(), _getScript(), _getRegion(), _getVariant()
1184 * avoid duplicating code to handle the earlier locale ID pieces
1185 * in the functions for the later ones by
1186 * setting the *pEnd pointer to where they stopped parsing
1187 *
1188 * TODO try to use this in Locale
1189 */
1190
1191 void
_getLanguage(const char * localeID,ByteSink * sink,const char ** pEnd,UErrorCode & status)1192 _getLanguage(const char* localeID,
1193 ByteSink* sink,
1194 const char** pEnd,
1195 UErrorCode& status) {
1196 U_ASSERT(pEnd != nullptr);
1197 *pEnd = localeID;
1198
1199 if (uprv_stricmp(localeID, "root") == 0) {
1200 localeID += 4;
1201 } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1202 (localeID[3] == '\0' ||
1203 localeID[3] == '-' ||
1204 localeID[3] == '_' ||
1205 localeID[3] == '@')) {
1206 localeID += 3;
1207 }
1208
1209 constexpr int32_t MAXLEN = ULOC_LANG_CAPACITY - 1; // Minus NUL.
1210
1211 /* if it starts with i- or x- then copy that prefix */
1212 int32_t len = _isIDPrefix(localeID) ? 2 : 0;
1213 while (!_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
1214 if (len == MAXLEN) {
1215 status = U_ILLEGAL_ARGUMENT_ERROR;
1216 return;
1217 }
1218 len++;
1219 }
1220
1221 *pEnd = localeID + len;
1222 if (sink == nullptr || len == 0) { return; }
1223
1224 int32_t minCapacity = uprv_max(len, 4); // Minimum 3 letters plus NUL.
1225 char scratch[MAXLEN];
1226 int32_t capacity = 0;
1227 char* buffer = sink->GetAppendBuffer(
1228 minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity);
1229
1230 for (int32_t i = 0; i < len; ++i) {
1231 buffer[i] = uprv_tolower(localeID[i]);
1232 }
1233 if (_isIDSeparator(localeID[1])) {
1234 buffer[1] = '-';
1235 }
1236
1237 if (len == 3) {
1238 /* convert 3 character code to 2 character code if possible *CWB*/
1239 U_ASSERT(capacity >= 4);
1240 buffer[3] = '\0';
1241 std::optional<int16_t> offset = _findIndex(LANGUAGES_3, buffer);
1242 if (offset.has_value()) {
1243 const char* const alias = LANGUAGES[*offset];
1244 sink->Append(alias, (int32_t)uprv_strlen(alias));
1245 return;
1246 }
1247 }
1248
1249 sink->Append(buffer, len);
1250 }
1251
1252 void
_getScript(const char * localeID,ByteSink * sink,const char ** pEnd)1253 _getScript(const char* localeID,
1254 ByteSink* sink,
1255 const char** pEnd) {
1256 U_ASSERT(pEnd != nullptr);
1257 *pEnd = localeID;
1258
1259 constexpr int32_t LENGTH = 4;
1260
1261 int32_t len = 0;
1262 while (!_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len]) &&
1263 uprv_isASCIILetter(localeID[len])) {
1264 if (len == LENGTH) { return; }
1265 len++;
1266 }
1267 if (len != LENGTH) { return; }
1268
1269 *pEnd = localeID + LENGTH;
1270 if (sink == nullptr) { return; }
1271
1272 char scratch[LENGTH];
1273 int32_t capacity = 0;
1274 char* buffer = sink->GetAppendBuffer(
1275 LENGTH, LENGTH, scratch, UPRV_LENGTHOF(scratch), &capacity);
1276
1277 buffer[0] = uprv_toupper(localeID[0]);
1278 for (int32_t i = 1; i < LENGTH; ++i) {
1279 buffer[i] = uprv_tolower(localeID[i]);
1280 }
1281
1282 sink->Append(buffer, LENGTH);
1283 }
1284
1285 void
_getRegion(const char * localeID,ByteSink * sink,const char ** pEnd)1286 _getRegion(const char* localeID,
1287 ByteSink* sink,
1288 const char** pEnd) {
1289 U_ASSERT(pEnd != nullptr);
1290 *pEnd = localeID;
1291
1292 constexpr int32_t MINLEN = 2;
1293 constexpr int32_t MAXLEN = ULOC_COUNTRY_CAPACITY - 1; // Minus NUL.
1294
1295 int32_t len = 0;
1296 while (!_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
1297 if (len == MAXLEN) { return; }
1298 len++;
1299 }
1300 if (len < MINLEN) { return; }
1301
1302 *pEnd = localeID + len;
1303 if (sink == nullptr) { return; }
1304
1305 char scratch[ULOC_COUNTRY_CAPACITY];
1306 int32_t capacity = 0;
1307 char* buffer = sink->GetAppendBuffer(
1308 ULOC_COUNTRY_CAPACITY,
1309 ULOC_COUNTRY_CAPACITY,
1310 scratch,
1311 UPRV_LENGTHOF(scratch),
1312 &capacity);
1313
1314 for (int32_t i = 0; i < len; ++i) {
1315 buffer[i] = uprv_toupper(localeID[i]);
1316 }
1317
1318 if (len == 3) {
1319 /* convert 3 character code to 2 character code if possible *CWB*/
1320 U_ASSERT(capacity >= 4);
1321 buffer[3] = '\0';
1322 std::optional<int16_t> offset = _findIndex(COUNTRIES_3, buffer);
1323 if (offset.has_value()) {
1324 const char* const alias = COUNTRIES[*offset];
1325 sink->Append(alias, (int32_t)uprv_strlen(alias));
1326 return;
1327 }
1328 }
1329
1330 sink->Append(buffer, len);
1331 }
1332
1333 /**
1334 * @param needSeparator if true, then add leading '_' if any variants
1335 * are added to 'variant'
1336 */
1337 void
_getVariant(const char * localeID,char prev,ByteSink * sink,const char ** pEnd,bool needSeparator,UErrorCode & status)1338 _getVariant(const char* localeID,
1339 char prev,
1340 ByteSink* sink,
1341 const char** pEnd,
1342 bool needSeparator,
1343 UErrorCode& status) {
1344 if (U_FAILURE(status)) return;
1345 if (pEnd != nullptr) { *pEnd = localeID; }
1346
1347 // Reasonable upper limit for variants
1348 // There are no strict limitation of the syntax of variant in the legacy
1349 // locale format. If the locale is constructed from unicode_locale_id
1350 // as defined in UTS35, then we know each unicode_variant_subtag
1351 // could have max length of 8 ((alphanum{5,8} | digit alphanum{3})
1352 // 179 would allow 20 unicode_variant_subtag with sep in the
1353 // unicode_locale_id
1354 // 8*20 + 1*(20-1) = 179
1355 constexpr int32_t MAX_VARIANTS_LENGTH = 179;
1356
1357 /* get one or more variant tags and separate them with '_' */
1358 int32_t index = 0;
1359 if (_isIDSeparator(prev)) {
1360 /* get a variant string after a '-' or '_' */
1361 for (index=0; !_isTerminator(localeID[index]); index++) {
1362 if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
1363 status = U_ILLEGAL_ARGUMENT_ERROR;
1364 return;
1365 }
1366 if (needSeparator) {
1367 if (sink != nullptr) {
1368 sink->Append("_", 1);
1369 }
1370 needSeparator = false;
1371 }
1372 if (sink != nullptr) {
1373 char c = (char)uprv_toupper(localeID[index]);
1374 if (c == '-') c = '_';
1375 sink->Append(&c, 1);
1376 }
1377 }
1378 if (pEnd != nullptr) { *pEnd = localeID+index; }
1379 }
1380
1381 /* if there is no variant tag after a '-' or '_' then look for '@' */
1382 if (index == 0) {
1383 if (prev=='@') {
1384 /* keep localeID */
1385 } else if((localeID=locale_getKeywordsStart(localeID))!=nullptr) {
1386 ++localeID; /* point after the '@' */
1387 } else {
1388 return;
1389 }
1390 for(; !_isTerminator(localeID[index]); index++) {
1391 if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
1392 status = U_ILLEGAL_ARGUMENT_ERROR;
1393 return;
1394 }
1395 if (needSeparator) {
1396 if (sink != nullptr) {
1397 sink->Append("_", 1);
1398 }
1399 needSeparator = false;
1400 }
1401 if (sink != nullptr) {
1402 char c = (char)uprv_toupper(localeID[index]);
1403 if (c == '-' || c == ',') c = '_';
1404 sink->Append(&c, 1);
1405 }
1406 }
1407 if (pEnd != nullptr) { *pEnd = localeID + index; }
1408 }
1409 }
1410
1411 } // namespace
1412
1413 U_EXPORT CharString
ulocimp_getLanguage(const char * localeID,UErrorCode & status)1414 ulocimp_getLanguage(const char* localeID, UErrorCode& status) {
1415 return ByteSinkUtil::viaByteSinkToCharString(
1416 [&](ByteSink& sink, UErrorCode& status) {
1417 ulocimp_getSubtags(
1418 localeID,
1419 &sink,
1420 nullptr,
1421 nullptr,
1422 nullptr,
1423 nullptr,
1424 status);
1425 },
1426 status);
1427 }
1428
1429 U_EXPORT CharString
ulocimp_getScript(const char * localeID,UErrorCode & status)1430 ulocimp_getScript(const char* localeID, UErrorCode& status) {
1431 return ByteSinkUtil::viaByteSinkToCharString(
1432 [&](ByteSink& sink, UErrorCode& status) {
1433 ulocimp_getSubtags(
1434 localeID,
1435 nullptr,
1436 &sink,
1437 nullptr,
1438 nullptr,
1439 nullptr,
1440 status);
1441 },
1442 status);
1443 }
1444
1445 U_EXPORT CharString
ulocimp_getRegion(const char * localeID,UErrorCode & status)1446 ulocimp_getRegion(const char* localeID, UErrorCode& status) {
1447 return ByteSinkUtil::viaByteSinkToCharString(
1448 [&](ByteSink& sink, UErrorCode& status) {
1449 ulocimp_getSubtags(
1450 localeID,
1451 nullptr,
1452 nullptr,
1453 &sink,
1454 nullptr,
1455 nullptr,
1456 status);
1457 },
1458 status);
1459 }
1460
1461 U_EXPORT CharString
ulocimp_getVariant(const char * localeID,UErrorCode & status)1462 ulocimp_getVariant(const char* localeID, UErrorCode& status) {
1463 return ByteSinkUtil::viaByteSinkToCharString(
1464 [&](ByteSink& sink, UErrorCode& status) {
1465 ulocimp_getSubtags(
1466 localeID,
1467 nullptr,
1468 nullptr,
1469 nullptr,
1470 &sink,
1471 nullptr,
1472 status);
1473 },
1474 status);
1475 }
1476
1477 U_EXPORT void
ulocimp_getSubtags(const char * localeID,CharString * language,CharString * script,CharString * region,CharString * variant,const char ** pEnd,UErrorCode & status)1478 ulocimp_getSubtags(
1479 const char* localeID,
1480 CharString* language,
1481 CharString* script,
1482 CharString* region,
1483 CharString* variant,
1484 const char** pEnd,
1485 UErrorCode& status) {
1486 if (U_FAILURE(status)) { return; }
1487
1488 std::optional<CharStringByteSink> languageSink;
1489 std::optional<CharStringByteSink> scriptSink;
1490 std::optional<CharStringByteSink> regionSink;
1491 std::optional<CharStringByteSink> variantSink;
1492
1493 if (language != nullptr) { languageSink.emplace(language); }
1494 if (script != nullptr) { scriptSink.emplace(script); }
1495 if (region != nullptr) { regionSink.emplace(region); }
1496 if (variant != nullptr) { variantSink.emplace(variant); }
1497
1498 ulocimp_getSubtags(
1499 localeID,
1500 languageSink.has_value() ? &*languageSink : nullptr,
1501 scriptSink.has_value() ? &*scriptSink : nullptr,
1502 regionSink.has_value() ? &*regionSink : nullptr,
1503 variantSink.has_value() ? &*variantSink : nullptr,
1504 pEnd,
1505 status);
1506 }
1507
1508 U_EXPORT void
ulocimp_getSubtags(const char * localeID,ByteSink * language,ByteSink * script,ByteSink * region,ByteSink * variant,const char ** pEnd,UErrorCode & status)1509 ulocimp_getSubtags(
1510 const char* localeID,
1511 ByteSink* language,
1512 ByteSink* script,
1513 ByteSink* region,
1514 ByteSink* variant,
1515 const char** pEnd,
1516 UErrorCode& status) {
1517 if (U_FAILURE(status)) { return; }
1518
1519 if (pEnd != nullptr) {
1520 *pEnd = localeID;
1521 } else if (language == nullptr &&
1522 script == nullptr &&
1523 region == nullptr &&
1524 variant == nullptr) {
1525 return;
1526 }
1527
1528 bool hasRegion = false;
1529
1530 if (localeID == nullptr) {
1531 localeID = uloc_getDefault();
1532 }
1533
1534 _getLanguage(localeID, language, &localeID, status);
1535 if (U_FAILURE(status)) { return; }
1536 U_ASSERT(localeID != nullptr);
1537
1538 if (pEnd != nullptr) {
1539 *pEnd = localeID;
1540 } else if (script == nullptr &&
1541 region == nullptr &&
1542 variant == nullptr) {
1543 return;
1544 }
1545
1546 if (_isIDSeparator(*localeID)) {
1547 const char* begin = localeID + 1;
1548 const char* end = nullptr;
1549 _getScript(begin, script, &end);
1550 U_ASSERT(end != nullptr);
1551 if (end != begin) {
1552 localeID = end;
1553 if (pEnd != nullptr) { *pEnd = localeID; }
1554 }
1555 }
1556
1557 if (region == nullptr && variant == nullptr && pEnd == nullptr) { return; }
1558
1559 if (_isIDSeparator(*localeID)) {
1560 const char* begin = localeID + 1;
1561 const char* end = nullptr;
1562 _getRegion(begin, region, &end);
1563 U_ASSERT(end != nullptr);
1564 if (end != begin) {
1565 hasRegion = true;
1566 localeID = end;
1567 if (pEnd != nullptr) { *pEnd = localeID; }
1568 }
1569 }
1570
1571 if (variant == nullptr && pEnd == nullptr) { return; }
1572
1573 if (_isIDSeparator(*localeID) && !_isBCP47Extension(localeID)) {
1574 /* If there was no country ID, skip a possible extra IDSeparator */
1575 if (!hasRegion && _isIDSeparator(localeID[1])) {
1576 localeID++;
1577 }
1578 const char* begin = localeID + 1;
1579 const char* end = nullptr;
1580 _getVariant(begin, *localeID, variant, &end, false, status);
1581 if (U_FAILURE(status)) { return; }
1582 U_ASSERT(end != nullptr);
1583 if (end != begin && pEnd != nullptr) { *pEnd = end; }
1584 }
1585 }
1586
1587 /* Keyword enumeration */
1588
/* State of a keyword enumeration (stored in UEnumeration::context). */
struct UKeywordsContext {
    /* Start of the keyword list: a sequence of NUL-terminated strings
     * ended by an empty string. Freed when the enumeration is closed. */
    char* keywords;
    /* Next keyword to hand out; points into the keywords buffer. */
    char* current;
};
1593
1594 U_CDECL_BEGIN
1595
1596 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1597 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1598 uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1599 uprv_free(enumerator->context);
1600 uprv_free(enumerator);
1601 }
1602
1603 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1604 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1605 char *kw = ((UKeywordsContext *)en->context)->keywords;
1606 int32_t result = 0;
1607 while(*kw) {
1608 result++;
1609 kw += uprv_strlen(kw)+1;
1610 }
1611 return result;
1612 }
1613
1614 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1615 uloc_kw_nextKeyword(UEnumeration* en,
1616 int32_t* resultLength,
1617 UErrorCode* /*status*/) {
1618 const char* result = ((UKeywordsContext *)en->context)->current;
1619 int32_t len = 0;
1620 if(*result) {
1621 len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1622 ((UKeywordsContext *)en->context)->current += len+1;
1623 } else {
1624 result = nullptr;
1625 }
1626 if (resultLength) {
1627 *resultLength = len;
1628 }
1629 return result;
1630 }
1631
1632 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1633 uloc_kw_resetKeywords(UEnumeration* en,
1634 UErrorCode* /*status*/) {
1635 ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1636 }
1637
1638 U_CDECL_END
1639
1640
1641 static const UEnumeration gKeywordsEnum = {
1642 nullptr,
1643 nullptr,
1644 uloc_kw_closeKeywords,
1645 uloc_kw_countKeywords,
1646 uenum_unextDefault,
1647 uloc_kw_nextKeyword,
1648 uloc_kw_resetKeywords
1649 };
1650
1651 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1652 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1653 {
1654 if (U_FAILURE(*status)) { return nullptr; }
1655
1656 LocalMemory<UKeywordsContext> myContext;
1657 LocalMemory<UEnumeration> result;
1658
1659 myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1660 result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1661 if (myContext.isNull() || result.isNull()) {
1662 *status = U_MEMORY_ALLOCATION_ERROR;
1663 return nullptr;
1664 }
1665 uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1666 myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1667 if (myContext->keywords == nullptr) {
1668 *status = U_MEMORY_ALLOCATION_ERROR;
1669 return nullptr;
1670 }
1671 uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1672 myContext->keywords[keywordListSize] = 0;
1673 myContext->current = myContext->keywords;
1674 result->context = myContext.orphan();
1675 return result.orphan();
1676 }
1677
1678 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1679 uloc_openKeywords(const char* localeID,
1680 UErrorCode* status)
1681 {
1682 if(status==nullptr || U_FAILURE(*status)) {
1683 return nullptr;
1684 }
1685
1686 CharString tempBuffer;
1687 const char* tmpLocaleID;
1688
1689 if (_hasBCP47Extension(localeID)) {
1690 tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, *status);
1691 tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
1692 } else {
1693 if (localeID==nullptr) {
1694 localeID=uloc_getDefault();
1695 }
1696 tmpLocaleID=localeID;
1697 }
1698
1699 ulocimp_getSubtags(
1700 tmpLocaleID,
1701 nullptr,
1702 nullptr,
1703 nullptr,
1704 nullptr,
1705 &tmpLocaleID,
1706 *status);
1707 if (U_FAILURE(*status)) {
1708 return nullptr;
1709 }
1710
1711 /* keywords are located after '@' */
1712 if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != nullptr) {
1713 CharString keywords = ulocimp_getKeywords(tmpLocaleID + 1, '@', false, *status);
1714 if (U_FAILURE(*status)) {
1715 return nullptr;
1716 }
1717 return uloc_openKeywordList(keywords.data(), keywords.length(), status);
1718 }
1719 return nullptr;
1720 }
1721
1722
/*
 * Bit flags for the 'options' parameter of _canonicalize().
 * Each flag occupies its own bit and is tested individually with OPTION_SET().
 */
/* Canonicalize to level 2 (passing no flag at all selects level 1). */
#define _ULOC_CANONICALIZE 0x1
/* Do not copy the keyword list to the output. */
#define _ULOC_STRIP_KEYWORDS 0x2
1726
1727 namespace {
1728
OPTION_SET(uint32_t options,uint32_t mask)1729 inline bool OPTION_SET(uint32_t options, uint32_t mask) { return (options & mask) != 0; }
1730
1731 constexpr char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
1732 constexpr int32_t I_DEFAULT_LENGTH = UPRV_LENGTHOF(i_default);
1733
1734 /**
1735 * Canonicalize the given localeID, to level 1 or to level 2,
1736 * depending on the options. To specify level 1, pass in options=0.
1737 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1738 *
1739 * This is the code underlying uloc_getName and uloc_canonicalize.
1740 */
1741 void
_canonicalize(const char * localeID,ByteSink & sink,uint32_t options,UErrorCode & err)1742 _canonicalize(const char* localeID,
1743 ByteSink& sink,
1744 uint32_t options,
1745 UErrorCode& err) {
1746 if (U_FAILURE(err)) {
1747 return;
1748 }
1749
1750 int32_t j, fieldCount=0;
1751 CharString tempBuffer; // if localeID has a BCP47 extension, tmpLocaleID points to this
1752 CharString localeIDWithHyphens; // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
1753 const char* origLocaleID;
1754 const char* tmpLocaleID;
1755 const char* keywordAssign = nullptr;
1756 const char* separatorIndicator = nullptr;
1757
1758 if (_hasBCP47Extension(localeID)) {
1759 const char* localeIDPtr = localeID;
1760
1761 // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
1762 if (uprv_strchr(localeID, '_') != nullptr && localeID[1] != '-' && localeID[1] != '_') {
1763 localeIDWithHyphens.append(localeID, -1, err);
1764 if (U_SUCCESS(err)) {
1765 for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
1766 if (*p == '_') {
1767 *p = '-';
1768 }
1769 }
1770 localeIDPtr = localeIDWithHyphens.data();
1771 }
1772 }
1773
1774 tempBuffer = ulocimp_forLanguageTag(localeIDPtr, -1, nullptr, err);
1775 tmpLocaleID = U_SUCCESS(err) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeIDPtr;
1776 } else {
1777 if (localeID==nullptr) {
1778 localeID=uloc_getDefault();
1779 }
1780 tmpLocaleID=localeID;
1781 }
1782
1783 origLocaleID=tmpLocaleID;
1784
1785 /* get all pieces, one after another, and separate with '_' */
1786 CharString tag;
1787 CharString script;
1788 CharString country;
1789 CharString variant;
1790 ulocimp_getSubtags(
1791 tmpLocaleID,
1792 &tag,
1793 &script,
1794 &country,
1795 &variant,
1796 &tmpLocaleID,
1797 err);
1798 if (U_FAILURE(err)) {
1799 return;
1800 }
1801
1802 if (tag.length() == I_DEFAULT_LENGTH &&
1803 uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
1804 tag.clear();
1805 tag.append(uloc_getDefault(), err);
1806 } else {
1807 if (!script.isEmpty()) {
1808 ++fieldCount;
1809 tag.append('_', err);
1810 tag.append(script, err);
1811 }
1812 if (!country.isEmpty()) {
1813 ++fieldCount;
1814 tag.append('_', err);
1815 tag.append(country, err);
1816 }
1817 if (!variant.isEmpty()) {
1818 ++fieldCount;
1819 if (country.isEmpty()) {
1820 tag.append('_', err);
1821 }
1822 tag.append('_', err);
1823 tag.append(variant, err);
1824 }
1825 }
1826
1827 /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1828 if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1829 tag.append('.', err);
1830 ++tmpLocaleID;
1831 const char *atPos = nullptr;
1832 size_t length;
1833 if((atPos = uprv_strchr(tmpLocaleID, '@')) != nullptr) {
1834 length = atPos - tmpLocaleID;
1835 } else {
1836 length = uprv_strlen(tmpLocaleID);
1837 }
1838 // The longest charset name we found in IANA charset registry
1839 // https://www.iana.org/assignments/character-sets/ is
1840 // "Extended_UNIX_Code_Packed_Format_for_Japanese" in length 45.
1841 // we therefore restrict the length here to be 64 which is a power of 2
1842 // number that is longer than 45.
1843 constexpr size_t kMaxCharsetLength = 64;
1844 if (length > kMaxCharsetLength) {
1845 err = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1846 return;
1847 }
1848 tag.append(tmpLocaleID, static_cast<int32_t>(length), err);
1849 tmpLocaleID += length;
1850 }
1851
1852 /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1853 After this, tmpLocaleID either points to '@' or is nullptr */
1854 if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=nullptr) {
1855 keywordAssign = uprv_strchr(tmpLocaleID, '=');
1856 separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1857 }
1858
1859 /* Copy POSIX-style variant, if any [mr@FOO] */
1860 if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1861 tmpLocaleID != nullptr && keywordAssign == nullptr) {
1862 for (;;) {
1863 char c = *tmpLocaleID;
1864 if (c == 0) {
1865 break;
1866 }
1867 tag.append(c, err);
1868 ++tmpLocaleID;
1869 }
1870 }
1871
1872 if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1873 /* Handle @FOO variant if @ is present and not followed by = */
1874 if (tmpLocaleID!=nullptr && keywordAssign==nullptr) {
1875 /* Add missing '_' if needed */
1876 if (fieldCount < 2 || (fieldCount < 3 && !script.isEmpty())) {
1877 do {
1878 tag.append('_', err);
1879 ++fieldCount;
1880 } while(fieldCount<2);
1881 }
1882
1883 CharStringByteSink s(&tag);
1884 _getVariant(tmpLocaleID+1, '@', &s, nullptr, !variant.isEmpty(), err);
1885 if (U_FAILURE(err)) { return; }
1886 }
1887
1888 /* Look up the ID in the canonicalization map */
1889 for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1890 StringPiece id(CANONICALIZE_MAP[j].id);
1891 if (tag == id) {
1892 if (id.empty() && tmpLocaleID != nullptr) {
1893 break; /* Don't remap "" if keywords present */
1894 }
1895 tag.clear();
1896 tag.append(CANONICALIZE_MAP[j].canonicalID, err);
1897 break;
1898 }
1899 }
1900 }
1901
1902 sink.Append(tag.data(), tag.length());
1903
1904 if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1905 if (tmpLocaleID!=nullptr && keywordAssign!=nullptr &&
1906 (!separatorIndicator || separatorIndicator > keywordAssign)) {
1907 sink.Append("@", 1);
1908 ++fieldCount;
1909 ulocimp_getKeywords(tmpLocaleID+1, '@', sink, true, err);
1910 }
1911 }
1912 }
1913
1914 } // namespace
1915
1916 /* ### ID parsing API **************************************************/
1917
1918 U_CAPI int32_t U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1919 uloc_getParent(const char* localeID,
1920 char* parent,
1921 int32_t parentCapacity,
1922 UErrorCode* err)
1923 {
1924 return ByteSinkUtil::viaByteSinkToTerminatedChars(
1925 parent, parentCapacity,
1926 [&](ByteSink& sink, UErrorCode& status) {
1927 ulocimp_getParent(localeID, sink, status);
1928 },
1929 *err);
1930 }
1931
1932 U_EXPORT CharString
ulocimp_getParent(const char * localeID,UErrorCode & err)1933 ulocimp_getParent(const char* localeID,
1934 UErrorCode& err)
1935 {
1936 return ByteSinkUtil::viaByteSinkToCharString(
1937 [&](ByteSink& sink, UErrorCode& status) {
1938 ulocimp_getParent(localeID, sink, status);
1939 },
1940 err);
1941 }
1942
1943 U_EXPORT void
ulocimp_getParent(const char * localeID,icu::ByteSink & sink,UErrorCode & err)1944 ulocimp_getParent(const char* localeID,
1945 icu::ByteSink& sink,
1946 UErrorCode& err)
1947 {
1948 if (U_FAILURE(err)) { return; }
1949
1950 const char *lastUnderscore;
1951 int32_t i;
1952
1953 if (localeID == nullptr)
1954 localeID = uloc_getDefault();
1955
1956 lastUnderscore=uprv_strrchr(localeID, '_');
1957 if(lastUnderscore!=nullptr) {
1958 i=(int32_t)(lastUnderscore-localeID);
1959 } else {
1960 i=0;
1961 }
1962
1963 if (i > 0) {
1964 if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1965 localeID += 3;
1966 i -= 3;
1967 }
1968 sink.Append(localeID, i);
1969 }
1970 }
1971
1972 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1973 uloc_getLanguage(const char* localeID,
1974 char* language,
1975 int32_t languageCapacity,
1976 UErrorCode* err)
1977 {
1978 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1979 return ByteSinkUtil::viaByteSinkToTerminatedChars(
1980 language, languageCapacity,
1981 [&](ByteSink& sink, UErrorCode& status) {
1982 ulocimp_getSubtags(
1983 localeID,
1984 &sink,
1985 nullptr,
1986 nullptr,
1987 nullptr,
1988 nullptr,
1989 status);
1990 },
1991 *err);
1992 }
1993
1994 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1995 uloc_getScript(const char* localeID,
1996 char* script,
1997 int32_t scriptCapacity,
1998 UErrorCode* err)
1999 {
2000 return ByteSinkUtil::viaByteSinkToTerminatedChars(
2001 script, scriptCapacity,
2002 [&](ByteSink& sink, UErrorCode& status) {
2003 ulocimp_getSubtags(
2004 localeID,
2005 nullptr,
2006 &sink,
2007 nullptr,
2008 nullptr,
2009 nullptr,
2010 status);
2011 },
2012 *err);
2013 }
2014
2015 U_CAPI int32_t U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)2016 uloc_getCountry(const char* localeID,
2017 char* country,
2018 int32_t countryCapacity,
2019 UErrorCode* err)
2020 {
2021 return ByteSinkUtil::viaByteSinkToTerminatedChars(
2022 country, countryCapacity,
2023 [&](ByteSink& sink, UErrorCode& status) {
2024 ulocimp_getSubtags(
2025 localeID,
2026 nullptr,
2027 nullptr,
2028 &sink,
2029 nullptr,
2030 nullptr,
2031 status);
2032 },
2033 *err);
2034 }
2035
2036 U_CAPI int32_t U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)2037 uloc_getVariant(const char* localeID,
2038 char* variant,
2039 int32_t variantCapacity,
2040 UErrorCode* err)
2041 {
2042 return ByteSinkUtil::viaByteSinkToTerminatedChars(
2043 variant, variantCapacity,
2044 [&](ByteSink& sink, UErrorCode& status) {
2045 ulocimp_getSubtags(
2046 localeID,
2047 nullptr,
2048 nullptr,
2049 nullptr,
2050 &sink,
2051 nullptr,
2052 status);
2053 },
2054 *err);
2055 }
2056
2057 U_CAPI int32_t U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2058 uloc_getName(const char* localeID,
2059 char* name,
2060 int32_t nameCapacity,
2061 UErrorCode* err)
2062 {
2063 return ByteSinkUtil::viaByteSinkToTerminatedChars(
2064 name, nameCapacity,
2065 [&](ByteSink& sink, UErrorCode& status) {
2066 ulocimp_getName(localeID, sink, status);
2067 },
2068 *err);
2069 }
2070
2071 U_EXPORT CharString
ulocimp_getName(const char * localeID,UErrorCode & err)2072 ulocimp_getName(const char* localeID,
2073 UErrorCode& err)
2074 {
2075 return ByteSinkUtil::viaByteSinkToCharString(
2076 [&](ByteSink& sink, UErrorCode& status) {
2077 ulocimp_getName(localeID, sink, status);
2078 },
2079 err);
2080 }
2081
2082 U_EXPORT void
ulocimp_getName(const char * localeID,ByteSink & sink,UErrorCode & err)2083 ulocimp_getName(const char* localeID,
2084 ByteSink& sink,
2085 UErrorCode& err)
2086 {
2087 _canonicalize(localeID, sink, 0, err);
2088 }
2089
2090 U_CAPI int32_t U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2091 uloc_getBaseName(const char* localeID,
2092 char* name,
2093 int32_t nameCapacity,
2094 UErrorCode* err)
2095 {
2096 return ByteSinkUtil::viaByteSinkToTerminatedChars(
2097 name, nameCapacity,
2098 [&](ByteSink& sink, UErrorCode& status) {
2099 ulocimp_getBaseName(localeID, sink, status);
2100 },
2101 *err);
2102 }
2103
2104 U_EXPORT CharString
ulocimp_getBaseName(const char * localeID,UErrorCode & err)2105 ulocimp_getBaseName(const char* localeID,
2106 UErrorCode& err)
2107 {
2108 return ByteSinkUtil::viaByteSinkToCharString(
2109 [&](ByteSink& sink, UErrorCode& status) {
2110 ulocimp_getBaseName(localeID, sink, status);
2111 },
2112 err);
2113 }
2114
2115 U_EXPORT void
ulocimp_getBaseName(const char * localeID,ByteSink & sink,UErrorCode & err)2116 ulocimp_getBaseName(const char* localeID,
2117 ByteSink& sink,
2118 UErrorCode& err)
2119 {
2120 _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
2121 }
2122
2123 U_CAPI int32_t U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2124 uloc_canonicalize(const char* localeID,
2125 char* name,
2126 int32_t nameCapacity,
2127 UErrorCode* err)
2128 {
2129 return ByteSinkUtil::viaByteSinkToTerminatedChars(
2130 name, nameCapacity,
2131 [&](ByteSink& sink, UErrorCode& status) {
2132 ulocimp_canonicalize(localeID, sink, status);
2133 },
2134 *err);
2135 }
2136
2137 U_EXPORT CharString
ulocimp_canonicalize(const char * localeID,UErrorCode & err)2138 ulocimp_canonicalize(const char* localeID,
2139 UErrorCode& err)
2140 {
2141 return ByteSinkUtil::viaByteSinkToCharString(
2142 [&](ByteSink& sink, UErrorCode& status) {
2143 ulocimp_canonicalize(localeID, sink, status);
2144 },
2145 err);
2146 }
2147
2148 U_EXPORT void
ulocimp_canonicalize(const char * localeID,ByteSink & sink,UErrorCode & err)2149 ulocimp_canonicalize(const char* localeID,
2150 ByteSink& sink,
2151 UErrorCode& err)
2152 {
2153 _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
2154 }
2155
2156 U_CAPI const char* U_EXPORT2
uloc_getISO3Language(const char * localeID)2157 uloc_getISO3Language(const char* localeID)
2158 {
2159 UErrorCode err = U_ZERO_ERROR;
2160
2161 if (localeID == nullptr)
2162 {
2163 localeID = uloc_getDefault();
2164 }
2165 CharString lang = ulocimp_getLanguage(localeID, err);
2166 if (U_FAILURE(err))
2167 return "";
2168 std::optional<int16_t> offset = _findIndex(LANGUAGES, lang.data());
2169 return offset.has_value() ? LANGUAGES_3[*offset] : "";
2170 }
2171
2172 U_CAPI const char* U_EXPORT2
uloc_getISO3Country(const char * localeID)2173 uloc_getISO3Country(const char* localeID)
2174 {
2175 UErrorCode err = U_ZERO_ERROR;
2176
2177 if (localeID == nullptr)
2178 {
2179 localeID = uloc_getDefault();
2180 }
2181 CharString cntry = ulocimp_getRegion(localeID, err);
2182 if (U_FAILURE(err))
2183 return "";
2184 std::optional<int16_t> offset = _findIndex(COUNTRIES, cntry.data());
2185 return offset.has_value() ? COUNTRIES_3[*offset] : "";
2186 }
2187
2188 U_CAPI uint32_t U_EXPORT2
uloc_getLCID(const char * localeID)2189 uloc_getLCID(const char* localeID)
2190 {
2191 UErrorCode status = U_ZERO_ERROR;
2192 uint32_t lcid = 0;
2193
2194 /* Check for incomplete id. */
2195 if (!localeID || uprv_strlen(localeID) < 2) {
2196 return 0;
2197 }
2198
2199 // First, attempt Windows platform lookup if available, but fall
2200 // through to catch any special cases (ICU vs Windows name differences).
2201 lcid = uprv_convertToLCIDPlatform(localeID, &status);
2202 if (U_FAILURE(status)) {
2203 return 0;
2204 }
2205 if (lcid > 0) {
2206 // Windows found an LCID, return that
2207 return lcid;
2208 }
2209
2210 CharString langID = ulocimp_getLanguage(localeID, status);
2211 if (U_FAILURE(status)) {
2212 return 0;
2213 }
2214
2215 if (uprv_strchr(localeID, '@')) {
2216 // uprv_convertToLCID does not support keywords other than collation.
2217 // Remove all keywords except collation.
2218 CharString collVal = ulocimp_getKeywordValue(localeID, "collation", status);
2219 if (U_SUCCESS(status) && !collVal.isEmpty()) {
2220 CharString tmpLocaleID = ulocimp_getBaseName(localeID, status);
2221 ulocimp_setKeywordValue("collation", collVal.data(), tmpLocaleID, status);
2222 if (U_SUCCESS(status)) {
2223 return uprv_convertToLCID(langID.data(), tmpLocaleID.data(), &status);
2224 }
2225 }
2226
2227 // fall through - all keywords are simply ignored
2228 status = U_ZERO_ERROR;
2229 }
2230
2231 return uprv_convertToLCID(langID.data(), localeID, &status);
2232 }
2233
2234 U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,char * locale,int32_t localeCapacity,UErrorCode * status)2235 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2236 UErrorCode *status)
2237 {
2238 return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2239 }
2240
2241 /* ### Default locale **************************************************/
2242
2243 U_CAPI const char* U_EXPORT2
uloc_getDefault()2244 uloc_getDefault()
2245 {
2246 return locale_get_default();
2247 }
2248
2249 U_CAPI void U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2250 uloc_setDefault(const char* newDefaultLocale,
2251 UErrorCode* err)
2252 {
2253 if (U_FAILURE(*err))
2254 return;
2255 /* the error code isn't currently used for anything by this function*/
2256
2257 /* propagate change to C++ */
2258 locale_set_default(newDefaultLocale);
2259 }
2260
2261 /**
2262 * Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
2263 * to an array of pointers to arrays of char. All of these pointers are owned
2264 * by ICU-- do not delete them, and do not write through them. The array is
2265 * terminated with a null pointer.
2266 */
2267 U_CAPI const char* const* U_EXPORT2
uloc_getISOLanguages()2268 uloc_getISOLanguages()
2269 {
2270 return LANGUAGES;
2271 }
2272
2273 /**
2274 * Returns a list of all 2-letter country codes defined in ISO 639. This is a
2275 * pointer to an array of pointers to arrays of char. All of these pointers are
2276 * owned by ICU-- do not delete them, and do not write through them. The array is
2277 * terminated with a null pointer.
2278 */
2279 U_CAPI const char* const* U_EXPORT2
uloc_getISOCountries()2280 uloc_getISOCountries()
2281 {
2282 return COUNTRIES;
2283 }
2284
2285 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2286 uloc_toUnicodeLocaleKey(const char* keyword)
2287 {
2288 const char* bcpKey = ulocimp_toBcpKey(keyword);
2289 if (bcpKey == nullptr && ultag_isUnicodeLocaleKey(keyword, -1)) {
2290 // unknown keyword, but syntax is fine..
2291 return keyword;
2292 }
2293 return bcpKey;
2294 }
2295
2296 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2297 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2298 {
2299 const char* bcpType = ulocimp_toBcpType(keyword, value, nullptr, nullptr);
2300 if (bcpType == nullptr && ultag_isUnicodeLocaleType(value, -1)) {
2301 // unknown keyword, but syntax is fine..
2302 return value;
2303 }
2304 return bcpType;
2305 }
2306
2307 namespace {
2308
2309 bool
isWellFormedLegacyKey(const char * legacyKey)2310 isWellFormedLegacyKey(const char* legacyKey)
2311 {
2312 const char* p = legacyKey;
2313 while (*p) {
2314 if (!UPRV_ISALPHANUM(*p)) {
2315 return false;
2316 }
2317 p++;
2318 }
2319 return true;
2320 }
2321
2322 bool
isWellFormedLegacyType(const char * legacyType)2323 isWellFormedLegacyType(const char* legacyType)
2324 {
2325 const char* p = legacyType;
2326 int32_t alphaNumLen = 0;
2327 while (*p) {
2328 if (*p == '_' || *p == '/' || *p == '-') {
2329 if (alphaNumLen == 0) {
2330 return false;
2331 }
2332 alphaNumLen = 0;
2333 } else if (UPRV_ISALPHANUM(*p)) {
2334 alphaNumLen++;
2335 } else {
2336 return false;
2337 }
2338 p++;
2339 }
2340 return (alphaNumLen != 0);
2341 }
2342
2343 } // namespace
2344
2345 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2346 uloc_toLegacyKey(const char* keyword)
2347 {
2348 const char* legacyKey = ulocimp_toLegacyKey(keyword);
2349 if (legacyKey == nullptr) {
2350 // Checks if the specified locale key is well-formed with the legacy locale syntax.
2351 //
2352 // Note:
2353 // LDML/CLDR provides some definition of keyword syntax in
2354 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2355 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2356 // Keys can only consist of [0-9a-zA-Z].
2357 if (isWellFormedLegacyKey(keyword)) {
2358 return keyword;
2359 }
2360 }
2361 return legacyKey;
2362 }
2363
2364 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2365 uloc_toLegacyType(const char* keyword, const char* value)
2366 {
2367 const char* legacyType = ulocimp_toLegacyType(keyword, value, nullptr, nullptr);
2368 if (legacyType == nullptr) {
2369 // Checks if the specified locale type is well-formed with the legacy locale syntax.
2370 //
2371 // Note:
2372 // LDML/CLDR provides some definition of keyword syntax in
2373 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2374 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2375 // Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2376 // we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2377 if (isWellFormedLegacyType(value)) {
2378 return value;
2379 }
2380 }
2381 return legacyType;
2382 }
2383
2384 /*eof*/
2385