xref: /aosp_15_r20/external/icu/icu4c/source/common/uloc.cpp (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1997-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   04/01/97    aliu        Creation.
15 *   08/21/98    stephen     JDK 1.2 sync
16 *   12/08/98    rtg         New Locale implementation and C API
17 *   03/15/99    damiba      overhaul.
18 *   04/06/99    stephen     changed setDefault() to realloc and copy
19 *   06/14/99    stephen     Changed calls to ures_open for new params
20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22 *                           brought canonicalization code into line with spec
23 *****************************************************************************/
24 
25 /*
26    POSIX's locale format, from putil.c: [no spaces]
27 
28      ll [ _CC ] [ . MM ] [ @ VV]
29 
30      l = lang, C = ctry, M = charmap, V = variant
31 */
32 
33 #include <optional>
34 
35 #include "unicode/bytestream.h"
36 #include "unicode/errorcode.h"
37 #include "unicode/stringpiece.h"
38 #include "unicode/utypes.h"
39 #include "unicode/ustring.h"
40 #include "unicode/uloc.h"
41 
42 #include "bytesinkutil.h"
43 #include "putilimp.h"
44 #include "ustr_imp.h"
45 #include "ulocimp.h"
46 #include "umutex.h"
47 #include "cstring.h"
48 #include "cmemory.h"
49 #include "locmap.h"
50 #include "uarrsort.h"
51 #include "uenumimp.h"
52 #include "uassert.h"
53 #include "charstr.h"
54 
55 U_NAMESPACE_USE
56 
57 /* ### Declarations **************************************************/
58 
/*
 * Locale stuff from locid.cpp: only the declarations appear here, the
 * definitions are expected to live in locid.cpp.
 * NOTE(review): judging by the names these set and get the default locale
 * ID -- see locid.cpp for the actual contract.
 */
U_CFUNC void locale_set_default(const char *id);
U_CFUNC const char *locale_get_default();
62 
63 namespace {
64 
65 /* ### Data tables **************************************************/
66 
/**
 * Table of language codes, both 2- and 3-letter, with preference
 * given to 2-letter codes where possible.  Includes 3-letter codes
 * that lack a 2-letter equivalent.
 *
 * Layout: two lists stored back to back, each terminated by a nullptr
 * entry.  The first list must be in sorted order and is visible to
 * user code when this array is returned by API.  The second list
 * contains codes we support, but do not expose through user API.
 *
 * This table must be kept in sync with LANGUAGES_3, with corresponding
 * entries matched index for index.
 *
 * Notes
 *
 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
 * include the revisions up to 2001/7/27 *CWB*
 *
 * The 3-character codes are the terminology codes, as in RFC 3066.
 * This is compatible with prior ICU codes.
 *
 * "in", "iw", "ji", "jw" and "sh" have been withdrawn.  They are still
 * in the table, but in the second list at its end, because their
 * 3-character codes are duplicates.  This avoids bad searches going
 * from 3- to 2-character codes.
 *
 * The range qaa-qtz is reserved for local use.
 */
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/* Subsequent hand addition of selected languages */
constexpr const char* LANGUAGES[] = {
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgc", "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "blo", "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "csw", "cu",  "cv",  "cy",
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "kxv", "ky",
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni",
    "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    "qu",  "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "syc", "syr", "szl",
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tok", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vmw",
    "vo", "vot", "vro", "vun",
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    "xal", "xh",  "xmf", "xnr", "xog",
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
nullptr,
    "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  /* obsolete language codes */
nullptr
};
191 
/**
 * Deprecated language codes and their replacements, kept as two
 * parallel arrays: REPLACEMENT_LANGUAGES[i] is the current code for
 * DEPRECATED_LANGUAGES[i].
 *
 * Each array ends with two nullptr entries.
 * NOTE(review): the second nullptr presumably mirrors the two-list
 * layout of LANGUAGES so the same lookup helper can scan these arrays
 * -- confirm against the search code.
 */
constexpr const char* DEPRECATED_LANGUAGES[] = {
    "in",
    "iw",
    "ji",
    "jw",
    "mo",
    nullptr,
    nullptr
};
constexpr const char* REPLACEMENT_LANGUAGES[] = {
    "id",  // replaces "in"
    "he",  // replaces "iw"
    "yi",  // replaces "ji"
    "jv",  // replaces "jw"
    "ro",  // replaces "mo"
    nullptr,
    nullptr
};
198 
/**
 * Table of 3-letter language codes.
 *
 * This is a lookup table used to convert 3-letter language codes to
 * their 2-letter equivalent, where possible.  It must be kept in sync
 * with LANGUAGES: for all valid i, LANGUAGES[i] must refer to the
 * same language as LANGUAGES_3[i].  Where a commented-out line
 * appears, it is copied from LANGUAGES so that the two tables can be
 * compared by eye.
 *
 * Where a 3-letter language code has no 2-letter equivalent, the
 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 *
 * Like LANGUAGES, this table consists of two lists, each terminated
 * by a nullptr entry.  The two lists correspond to the two lists in
 * LANGUAGES.
 */
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/* Subsequent hand addition of selected languages */
constexpr const char* LANGUAGES_3[] = {
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "csw", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kxv", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw",
    "vol", "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xnr", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
nullptr,
/*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
nullptr
};
309 
/**
 * Table of 2-letter country codes.
 *
 * Layout: two lists stored back to back, each terminated by a nullptr
 * entry.  The first list must be in sorted order and is visible to
 * user code when this array is returned by API.  The second list
 * contains codes we support, but do not expose through user API.
 *
 * This table must be kept in sync with COUNTRIES_3, with corresponding
 * entries matched index for index.
 *
 * Notes:
 *
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html
 * New codes were added while keeping the old ones for compatibility;
 * updated to include the 1999/12/03 revisions. *CWB*
 * NOTE(review): REPLACEMENT_COUNTRIES maps FX to FR, not PS, so the
 * FX remark above may be outdated -- confirm.
 *
 * RO(ROM) is now RO(ROU) according to
 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 */
constexpr const char* COUNTRIES[] = {
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
nullptr,
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
nullptr
};
369 
/**
 * Deprecated country codes and their replacements, kept as two
 * parallel arrays: REPLACEMENT_COUNTRIES[i] is the current code for
 * DEPRECATED_COUNTRIES[i].
 *
 * Each array ends with two nullptr entries.
 * NOTE(review): the second nullptr presumably mirrors the two-list
 * layout of COUNTRIES so the same lookup helper can scan these arrays
 * -- confirm against the search code.
 */
constexpr const char* DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD",
    "DY", "FX", "HV", "NH",
    "RH", "SU", "TP", "UK",
    "VD", "YD", "YU", "ZR",
    nullptr, nullptr
};
constexpr const char* REPLACEMENT_COUNTRIES[] = {
    "CW", "MM", "RS", "DE",  // AN, BU, CS, DD
    "BJ", "FR", "BF", "VU",  // DY, FX, HV, NH
    "ZW", "RU", "TL", "GB",  // RH, SU, TP, UK
    "VN", "YE", "RS", "CD",  // VD, YD, YU, ZR
    nullptr, nullptr
};
377 
/**
 * Table of 3-letter country codes.
 *
 * This is a lookup table used to convert 3-letter country codes to
 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES:
 * for all valid i, COUNTRIES[i] must refer to the same country as
 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 * so that the two tables can be compared by eye.
 *
 * Like COUNTRIES, this table consists of two lists, each terminated
 * by a nullptr entry.  The two lists correspond to the two lists in
 * COUNTRIES.
 */
constexpr const char* COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
nullptr,
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
nullptr
};
457 
/**
 * One row of the locale ID canonicalization table: an input ID paired
 * with the canonical ID it is rewritten to.
 */
struct CanonicalizationMap {
    const char *id;          // input ID
    const char *canonicalID; // canonicalized output ID
};

/**
 * A map to canonicalize locale IDs.  This handles a variety of
 * different semantic kinds of transformations.
 */
constexpr CanonicalizationMap CANONICALIZE_MAP[] = {
    // Registered names and registered IANA variants.
    { "art__LOJBAN", "jbo" },  // registered name
    { "hy__AREVELA", "hy"  },  // registered IANA variant
    { "hy__AREVMDA", "hyw" },  // registered IANA variant
    { "zh__GUOYU",   "zh"  },  // registered name
    { "zh__HAKKA",   "hak" },  // registered name
    { "zh__XIANG",   "hsn" },  // registered name
    // Subtags with 3 chars won't be treated as variants, hence the
    // single underscore in the IDs below.
    { "zh_GAN",      "gan" },  // registered name
    { "zh_MIN_NAN",  "nan" },  // registered name
    { "zh_WUU",      "wuu" },  // registered name
    { "zh_YUE",      "yue" },  // registered name
};
480 
481 /* ### BCP47 Conversion *******************************************/
482 /* Gets the size of the shortest subtag in the given localeID. */
getShortestSubtagLength(const char * localeID)483 int32_t getShortestSubtagLength(const char *localeID) {
484     int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
485     int32_t length = localeIDLength;
486     int32_t tmpLength = 0;
487     int32_t i;
488     bool reset = true;
489 
490     for (i = 0; i < localeIDLength; i++) {
491         if (localeID[i] != '_' && localeID[i] != '-') {
492             if (reset) {
493                 tmpLength = 0;
494                 reset = false;
495             }
496             tmpLength++;
497         } else {
498             if (tmpLength != 0 && tmpLength < length) {
499                 length = tmpLength;
500             }
501             reset = true;
502         }
503     }
504 
505     return length;
506 }
507 /* Test if the locale id has BCP47 u extension and does not have '@' */
_hasBCP47Extension(const char * id)508 inline bool _hasBCP47Extension(const char *id) {
509     return id != nullptr && uprv_strstr(id, "@") == nullptr && getShortestSubtagLength(id) == 1;
510 }
511 
512 /* ### Keywords **************************************************/
/* True when c is one of the decimal digits '0' through '9'. */
inline bool UPRV_ISDIGIT(char c) {
    return '0' <= c && c <= '9';
}
UPRV_ISALPHANUM(char c)514 inline bool UPRV_ISALPHANUM(char c) { return uprv_isASCIILetter(c) || UPRV_ISDIGIT(c); }
/* Punctuation/symbols allowed in legacy key values: '_', '-', '+' and '/'. */
inline bool UPRV_OK_VALUE_PUNCTUATION(char c) {
    switch (c) {
    case '_':
    case '-':
    case '+':
    case '/':
        return true;
    default:
        return false;
    }
}
517 
518 }  // namespace
519 
/*
 * Size of KeywordStruct::keyword.  ulocimp_getKeywords() rejects a
 * keyword whose text before '=' is this long or longer, which leaves
 * room for the terminating NUL.
 */
#define ULOC_KEYWORD_BUFFER_LEN 25
/*
 * Capacity of the on-stack keyword list in ulocimp_getKeywords();
 * parsing fails with U_INTERNAL_PROGRAM_ERROR once the list is full and
 * another keyword follows.
 */
#define ULOC_MAX_NO_KEYWORDS 25
522 
523 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)524 locale_getKeywordsStart(const char *localeID) {
525     const char *result = nullptr;
526     if((result = uprv_strchr(localeID, '@')) != nullptr) {
527         return result;
528     }
529 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
530     else {
531         /* We do this because the @ sign is variant, and the @ sign used on one
532         EBCDIC machine won't be compiled the same way on other EBCDIC based
533         machines. */
534         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
535         const uint8_t *charToFind = ebcdicSigns;
536         while(*charToFind) {
537             if((result = uprv_strchr(localeID, *charToFind)) != nullptr) {
538                 return result;
539             }
540             charToFind++;
541         }
542     }
543 #endif
544     return nullptr;
545 }
546 
547 namespace {
548 
549 /**
550  * @param keywordName incoming name to be canonicalized
551  * @param status return status (keyword too long)
552  * @return the keyword name
553  */
locale_canonKeywordName(const char * keywordName,UErrorCode & status)554 CharString locale_canonKeywordName(const char* keywordName, UErrorCode& status)
555 {
556   if (U_FAILURE(status)) { return {}; }
557   CharString result;
558 
559   for (; *keywordName != 0; keywordName++) {
560     if (!UPRV_ISALPHANUM(*keywordName)) {
561       status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
562       return {};
563     }
564     result.append(uprv_tolower(*keywordName), status);
565   }
566   if (result.isEmpty()) {
567     status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
568     return {};
569   }
570 
571   return result;
572 }
573 
574 typedef struct {
575     char keyword[ULOC_KEYWORD_BUFFER_LEN];
576     int32_t keywordLen;
577     const char *valueStart;
578     int32_t valueLen;
579 } KeywordStruct;
580 
581 int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)582 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
583     const char* leftString = ((const KeywordStruct *)left)->keyword;
584     const char* rightString = ((const KeywordStruct *)right)->keyword;
585     return uprv_strcmp(leftString, rightString);
586 }
587 
588 }  // namespace
589 
590 U_EXPORT CharString
ulocimp_getKeywords(const char * localeID,char prev,bool valuesToo,UErrorCode & status)591 ulocimp_getKeywords(const char* localeID,
592                     char prev,
593                     bool valuesToo,
594                     UErrorCode& status)
595 {
596     return ByteSinkUtil::viaByteSinkToCharString(
597         [&](ByteSink& sink, UErrorCode& status) {
598             ulocimp_getKeywords(localeID,
599                                 prev,
600                                 sink,
601                                 valuesToo,
602                                 status);
603         },
604         status);
605 }
606 
607 U_EXPORT void
ulocimp_getKeywords(const char * localeID,char prev,ByteSink & sink,bool valuesToo,UErrorCode & status)608 ulocimp_getKeywords(const char* localeID,
609                     char prev,
610                     ByteSink& sink,
611                     bool valuesToo,
612                     UErrorCode& status)
613 {
614     if (U_FAILURE(status)) { return; }
615 
616     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
617 
618     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
619     int32_t numKeywords = 0;
620     const char* pos = localeID;
621     const char* equalSign = nullptr;
622     const char* semicolon = nullptr;
623     int32_t i = 0, j, n;
624 
625     if(prev == '@') { /* start of keyword definition */
626         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
627         do {
628             bool duplicate = false;
629             /* skip leading spaces */
630             while(*pos == ' ') {
631                 pos++;
632             }
633             if (!*pos) { /* handle trailing "; " */
634                 break;
635             }
636             if(numKeywords == maxKeywords) {
637                 status = U_INTERNAL_PROGRAM_ERROR;
638                 return;
639             }
640             equalSign = uprv_strchr(pos, '=');
641             semicolon = uprv_strchr(pos, ';');
642             /* lack of '=' [foo@currency] is illegal */
643             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
644             if(!equalSign || (semicolon && semicolon<equalSign)) {
645                 status = U_INVALID_FORMAT_ERROR;
646                 return;
647             }
648             /* need to normalize both keyword and keyword name */
649             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
650                 /* keyword name too long for internal buffer */
651                 status = U_INTERNAL_PROGRAM_ERROR;
652                 return;
653             }
654             for(i = 0, n = 0; i < equalSign - pos; ++i) {
655                 if (pos[i] != ' ') {
656                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
657                 }
658             }
659 
660             /* zero-length keyword is an error. */
661             if (n == 0) {
662                 status = U_INVALID_FORMAT_ERROR;
663                 return;
664             }
665 
666             keywordList[numKeywords].keyword[n] = 0;
667             keywordList[numKeywords].keywordLen = n;
668             /* now grab the value part. First we skip the '=' */
669             equalSign++;
670             /* then we leading spaces */
671             while(*equalSign == ' ') {
672                 equalSign++;
673             }
674 
675             /* Premature end or zero-length value */
676             if (!*equalSign || equalSign == semicolon) {
677                 status = U_INVALID_FORMAT_ERROR;
678                 return;
679             }
680 
681             keywordList[numKeywords].valueStart = equalSign;
682 
683             pos = semicolon;
684             i = 0;
685             if(pos) {
686                 while(*(pos - i - 1) == ' ') {
687                     i++;
688                 }
689                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
690                 pos++;
691             } else {
692                 i = (int32_t)uprv_strlen(equalSign);
693                 while(i && equalSign[i-1] == ' ') {
694                     i--;
695                 }
696                 keywordList[numKeywords].valueLen = i;
697             }
698             /* If this is a duplicate keyword, then ignore it */
699             for (j=0; j<numKeywords; ++j) {
700                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
701                     duplicate = true;
702                     break;
703                 }
704             }
705             if (!duplicate) {
706                 ++numKeywords;
707             }
708         } while(pos);
709 
710         /* now we have a list of keywords */
711         /* we need to sort it */
712         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, &status);
713 
714         /* Now construct the keyword part */
715         for(i = 0; i < numKeywords; i++) {
716             sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
717             if(valuesToo) {
718                 sink.Append("=", 1);
719                 sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
720                 if(i < numKeywords - 1) {
721                     sink.Append(";", 1);
722                 }
723             } else {
724                 sink.Append("\0", 1);
725             }
726         }
727     }
728 }
729 
730 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)731 uloc_getKeywordValue(const char* localeID,
732                      const char* keywordName,
733                      char* buffer, int32_t bufferCapacity,
734                      UErrorCode* status)
735 {
736     return ByteSinkUtil::viaByteSinkToTerminatedChars(
737         buffer, bufferCapacity,
738         [&](ByteSink& sink, UErrorCode& status) {
739             ulocimp_getKeywordValue(localeID, keywordName, sink, status);
740         },
741         *status);
742 }
743 
744 U_EXPORT CharString
ulocimp_getKeywordValue(const char * localeID,const char * keywordName,UErrorCode & status)745 ulocimp_getKeywordValue(const char* localeID,
746                         const char* keywordName,
747                         UErrorCode& status)
748 {
749     return ByteSinkUtil::viaByteSinkToCharString(
750         [&](ByteSink& sink, UErrorCode& status) {
751             ulocimp_getKeywordValue(localeID, keywordName, sink, status);
752         },
753         status);
754 }
755 
756 U_EXPORT void
ulocimp_getKeywordValue(const char * localeID,const char * keywordName,icu::ByteSink & sink,UErrorCode & status)757 ulocimp_getKeywordValue(const char* localeID,
758                         const char* keywordName,
759                         icu::ByteSink& sink,
760                         UErrorCode& status)
761 {
762     if (U_FAILURE(status)) { return; }
763 
764     if (localeID == nullptr || keywordName == nullptr || keywordName[0] == 0) {
765         status = U_ILLEGAL_ARGUMENT_ERROR;
766         return;
767     }
768 
769     const char* startSearchHere = nullptr;
770     const char* nextSeparator = nullptr;
771 
772     CharString tempBuffer;
773     const char* tmpLocaleID;
774 
775     CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
776     if (U_FAILURE(status)) {
777       return;
778     }
779 
780     if (_hasBCP47Extension(localeID)) {
781         tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, status);
782         tmpLocaleID = U_SUCCESS(status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
783     } else {
784         tmpLocaleID=localeID;
785     }
786 
787     startSearchHere = locale_getKeywordsStart(tmpLocaleID);
788     if(startSearchHere == nullptr) {
789         /* no keywords, return at once */
790         return;
791     }
792 
793     /* find the first keyword */
794     while(startSearchHere) {
795         const char* keyValueTail;
796 
797         startSearchHere++; /* skip @ or ; */
798         nextSeparator = uprv_strchr(startSearchHere, '=');
799         if(!nextSeparator) {
800             status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
801             return;
802         }
803         /* strip leading & trailing spaces (TC decided to tolerate these) */
804         while(*startSearchHere == ' ') {
805             startSearchHere++;
806         }
807         keyValueTail = nextSeparator;
808         while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
809             keyValueTail--;
810         }
811         /* now keyValueTail points to first char after the keyName */
812         /* copy & normalize keyName from locale */
813         if (startSearchHere == keyValueTail) {
814             status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
815             return;
816         }
817         CharString localeKeywordName;
818         while (startSearchHere < keyValueTail) {
819           if (!UPRV_ISALPHANUM(*startSearchHere)) {
820             status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
821             return;
822           }
823           localeKeywordName.append(uprv_tolower(*startSearchHere++), status);
824         }
825         if (U_FAILURE(status)) {
826             return;
827         }
828 
829         startSearchHere = uprv_strchr(nextSeparator, ';');
830 
831         if (canonKeywordName == localeKeywordName) {
832              /* current entry matches the keyword. */
833            nextSeparator++; /* skip '=' */
834             /* First strip leading & trailing spaces (TC decided to tolerate these) */
835             while(*nextSeparator == ' ') {
836               nextSeparator++;
837             }
838             keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
839             while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
840               keyValueTail--;
841             }
842             /* Now copy the value, but check well-formedness */
843             if (nextSeparator == keyValueTail) {
844               status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
845               return;
846             }
847             while (nextSeparator < keyValueTail) {
848               if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
849                 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
850                 return;
851               }
852               /* Should we lowercase value to return here? Tests expect as-is. */
853               sink.Append(nextSeparator++, 1);
854             }
855             return;
856         }
857     }
858 }
859 
860 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)861 uloc_setKeywordValue(const char* keywordName,
862                      const char* keywordValue,
863                      char* buffer, int32_t bufferCapacity,
864                      UErrorCode* status)
865 {
866     if (U_FAILURE(*status)) { return 0; }
867 
868     if (bufferCapacity <= 1) {
869         *status = U_ILLEGAL_ARGUMENT_ERROR;
870         return 0;
871     }
872 
873     int32_t bufLen = (int32_t)uprv_strlen(buffer);
874     if(bufferCapacity<bufLen) {
875         /* The capacity is less than the length?! Is this NUL terminated? */
876         *status = U_ILLEGAL_ARGUMENT_ERROR;
877         return 0;
878     }
879 
880     char* keywords = const_cast<char*>(locale_getKeywordsStart(buffer));
881     int32_t baseLen = keywords == nullptr ? bufLen : keywords - buffer;
882     // Remove -1 from the capacity so that this function can guarantee NUL termination.
883     CheckedArrayByteSink sink(keywords == nullptr ? buffer + bufLen : keywords,
884                               bufferCapacity - baseLen - 1);
885     int32_t reslen = ulocimp_setKeywordValue(
886             keywords, keywordName, keywordValue, sink, *status);
887 
888     if (U_FAILURE(*status)) {
889         return *status == U_BUFFER_OVERFLOW_ERROR ? reslen + baseLen : 0;
890     }
891 
892     // See the documentation for this function, it's guaranteed to never
893     // overflow the buffer but instead abort with BUFFER_OVERFLOW_ERROR.
894     // In this case, nothing has been written to the sink, so it cannot have Overflowed().
895     U_ASSERT(!sink.Overflowed());
896     U_ASSERT(reslen >= 0);
897     return u_terminateChars(buffer, bufferCapacity, reslen + baseLen, status);
898 }
899 
900 U_EXPORT void
ulocimp_setKeywordValue(const char * keywordName,const char * keywordValue,CharString & localeID,UErrorCode & status)901 ulocimp_setKeywordValue(const char* keywordName,
902                         const char* keywordValue,
903                         CharString& localeID,
904                         UErrorCode& status)
905 {
906     if (U_FAILURE(status)) { return; }
907     // This is safe because CharString::truncate() doesn't actually erase any
908     // data, but simply sets the position for where new data will be written.
909     const char* keywords = locale_getKeywordsStart(localeID.data());
910     if (keywords != nullptr) localeID.truncate(keywords - localeID.data());
911     CharStringByteSink sink(&localeID);
912     ulocimp_setKeywordValue(keywords, keywordName, keywordValue, sink, status);
913 }
914 
915 U_EXPORT int32_t
ulocimp_setKeywordValue(const char * keywords,const char * keywordName,const char * keywordValue,ByteSink & sink,UErrorCode & status)916 ulocimp_setKeywordValue(const char* keywords,
917                         const char* keywordName,
918                         const char* keywordValue,
919                         ByteSink& sink,
920                         UErrorCode& status)
921 {
922     if (U_FAILURE(status)) { return 0; }
923 
924     /* TODO: sorting. removal. */
925     int32_t needLen = 0;
926     int32_t rc;
927     const char* nextSeparator = nullptr;
928     const char* nextEqualsign = nullptr;
929     const char* keywordStart = nullptr;
930     CharString updatedKeysAndValues;
931     bool handledInputKeyAndValue = false;
932     char keyValuePrefix = '@';
933 
934     if (status == U_STRING_NOT_TERMINATED_WARNING) {
935         status = U_ZERO_ERROR;
936     }
937     if (keywordName == nullptr || keywordName[0] == 0) {
938         status = U_ILLEGAL_ARGUMENT_ERROR;
939         return 0;
940     }
941     CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
942     if (U_FAILURE(status)) {
943         return 0;
944     }
945 
946     CharString canonKeywordValue;
947     if(keywordValue) {
948         while (*keywordValue != 0) {
949             if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
950                 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
951                 return 0;
952             }
953             /* Should we force lowercase in value to set? */
954             canonKeywordValue.append(*keywordValue++, status);
955         }
956     }
957     if (U_FAILURE(status)) {
958         return 0;
959     }
960 
961     if (keywords == nullptr || keywords[1] == '\0') {
962         if (canonKeywordValue.isEmpty()) { /* no keywords = nothing to remove */
963             U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
964             return 0;
965         }
966 
967         needLen = 1 + canonKeywordName.length() + 1 + canonKeywordValue.length();
968         int32_t capacity = 0;
969         char* buffer = sink.GetAppendBuffer(
970                 needLen, needLen, nullptr, needLen, &capacity);
971         if (capacity < needLen || buffer == nullptr) {
972             status = U_BUFFER_OVERFLOW_ERROR;
973             return needLen; /* no change */
974         }
975         char* it = buffer;
976 
977         *it++ = '@';
978         uprv_memcpy(it, canonKeywordName.data(), canonKeywordName.length());
979         it += canonKeywordName.length();
980         *it++ = '=';
981         uprv_memcpy(it, canonKeywordValue.data(), canonKeywordValue.length());
982         sink.Append(buffer, needLen);
983         U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
984         return needLen;
985     } /* end shortcut - no @ */
986 
987     keywordStart = keywords;
988     /* search for keyword */
989     while(keywordStart) {
990         const char* keyValueTail;
991 
992         keywordStart++; /* skip @ or ; */
993         nextEqualsign = uprv_strchr(keywordStart, '=');
994         if (!nextEqualsign) {
995             status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
996             return 0;
997         }
998         /* strip leading & trailing spaces (TC decided to tolerate these) */
999         while(*keywordStart == ' ') {
1000             keywordStart++;
1001         }
1002         keyValueTail = nextEqualsign;
1003         while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
1004             keyValueTail--;
1005         }
1006         /* now keyValueTail points to first char after the keyName */
1007         /* copy & normalize keyName from locale */
1008         if (keywordStart == keyValueTail) {
1009             status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
1010             return 0;
1011         }
1012         CharString localeKeywordName;
1013         while (keywordStart < keyValueTail) {
1014             if (!UPRV_ISALPHANUM(*keywordStart)) {
1015                 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1016                 return 0;
1017             }
1018             localeKeywordName.append(uprv_tolower(*keywordStart++), status);
1019         }
1020         if (U_FAILURE(status)) {
1021             return 0;
1022         }
1023 
1024         nextSeparator = uprv_strchr(nextEqualsign, ';');
1025 
1026         /* start processing the value part */
1027         nextEqualsign++; /* skip '=' */
1028         /* First strip leading & trailing spaces (TC decided to tolerate these) */
1029         while(*nextEqualsign == ' ') {
1030             nextEqualsign++;
1031         }
1032         keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1033         while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1034             keyValueTail--;
1035         }
1036         if (nextEqualsign == keyValueTail) {
1037             status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1038             return 0;
1039         }
1040 
1041         rc = uprv_strcmp(canonKeywordName.data(), localeKeywordName.data());
1042         if(rc == 0) {
1043             /* Current entry matches the input keyword. Update the entry */
1044             if (!canonKeywordValue.isEmpty()) { /* updating a value */
1045                 updatedKeysAndValues.append(keyValuePrefix, status);
1046                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1047                 updatedKeysAndValues.append(canonKeywordName, status);
1048                 updatedKeysAndValues.append('=', status);
1049                 updatedKeysAndValues.append(canonKeywordValue, status);
1050             } /* else removing this entry, don't emit anything */
1051             handledInputKeyAndValue = true;
1052         } else {
1053            /* input keyword sorts earlier than current entry, add before current entry */
1054             if (rc < 0 && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) {
1055                 /* insert new entry at this location */
1056                 updatedKeysAndValues.append(keyValuePrefix, status);
1057                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1058                 updatedKeysAndValues.append(canonKeywordName, status);
1059                 updatedKeysAndValues.append('=', status);
1060                 updatedKeysAndValues.append(canonKeywordValue, status);
1061                 handledInputKeyAndValue = true;
1062             }
1063             /* copy the current entry */
1064             updatedKeysAndValues.append(keyValuePrefix, status);
1065             keyValuePrefix = ';'; /* for any subsequent key-value pair */
1066             updatedKeysAndValues.append(localeKeywordName, status);
1067             updatedKeysAndValues.append('=', status);
1068             updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), status);
1069         }
1070         if (!nextSeparator && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) {
1071             /* append new entry at the end, it sorts later than existing entries */
1072             updatedKeysAndValues.append(keyValuePrefix, status);
1073             /* skip keyValuePrefix update, no subsequent key-value pair */
1074             updatedKeysAndValues.append(canonKeywordName, status);
1075             updatedKeysAndValues.append('=', status);
1076             updatedKeysAndValues.append(canonKeywordValue, status);
1077             handledInputKeyAndValue = true;
1078         }
1079         keywordStart = nextSeparator;
1080     } /* end loop searching */
1081 
1082     /* Any error from updatedKeysAndValues.append above would be internal and not due to
1083      * problems with the passed-in locale. So if we did encounter problems with the
1084      * passed-in locale above, those errors took precedence and overrode any error
1085      * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1086      * are errors here they are from updatedKeysAndValues.append; they do cause an
1087      * error return but the passed-in locale is unmodified and the original bufLen is
1088      * returned.
1089      */
1090     if (!handledInputKeyAndValue || U_FAILURE(status)) {
1091         /* if input key/value specified removal of a keyword not present in locale, or
1092          * there was an error in CharString.append, leave original locale alone. */
1093         U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
1094         return (int32_t)uprv_strlen(keywords);
1095     }
1096 
1097     needLen = updatedKeysAndValues.length();
1098     // Check to see can we fit the updatedKeysAndValues, if not, return
1099     // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it.
1100     // We do this because this API function does not behave like most others:
1101     // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
1102     // When the contents fits but without the terminating NUL, in this case we need to not change
1103     // the buffer contents and return with a buffer overflow error.
1104     if (needLen > 0) {
1105         int32_t capacity = 0;
1106         char* buffer = sink.GetAppendBuffer(
1107                 needLen, needLen, nullptr, needLen, &capacity);
1108         if (capacity < needLen || buffer == nullptr) {
1109             status = U_BUFFER_OVERFLOW_ERROR;
1110             return needLen;
1111         }
1112         uprv_memcpy(buffer, updatedKeysAndValues.data(), needLen);
1113         sink.Append(buffer, needLen);
1114     }
1115     U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
1116     return needLen;
1117 }
1118 
1119 /* ### ID parsing implementation **************************************************/
1120 
1121 namespace {
1122 
/* True for 'x' or 'i' in either case, the letters that may start a special
   "x-" / "i-" prefix. */
inline bool _isPrefixLetter(char a) {
    switch (a) {
    case 'x': case 'X':
    case 'i': case 'I':
        return true;
    default:
        return false;
    }
}
1124 
1125 /*returns true if one of the special prefixes is here (s=string)
1126   'x-' or 'i-' */
_isIDPrefix(const char * s)1127 inline bool _isIDPrefix(const char *s) { return _isPrefixLetter(s[0]) && _isIDSeparator(s[1]); }
1128 
/* True for the characters that end a run of subtags: NUL, '@' and '.'.
 * The dot terminates because in the POSIX form it precedes the codepage
 * (except for variant).
 */
inline bool _isTerminator(char a) {
    switch (a) {
    case '\0':
    case '.':
    case '@':
        return true;
    default:
        return false;
    }
}
1133 
/* True if p points at "-t-", "-u-" or "-x-" (letter in either case).
   Characters are examined left to right and only as far as needed, so p[1]
   and p[2] are not read once an earlier character has ruled the match out. */
inline bool _isBCP47Extension(const char* p) {
    if (p[0] != '-') { return false; }
    switch (p[1]) {
    case 't': case 'T':
    case 'u': case 'U':
    case 'x': case 'X':
        return p[2] == '-';
    default:
        return false;
    }
}
1141 
1142 /**
1143  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1144  * a nullptr entry, followed by more entries, and a second nullptr entry.
1145  *
1146  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1147  * COUNTRIES_3.
1148  */
_findIndex(const char * const * list,const char * key)1149 std::optional<int16_t> _findIndex(const char* const* list, const char* key)
1150 {
1151     const char* const* anchor = list;
1152     int32_t pass = 0;
1153 
1154     /* Make two passes through two nullptr-terminated arrays at 'list' */
1155     while (pass++ < 2) {
1156         while (*list) {
1157             if (uprv_strcmp(key, *list) == 0) {
1158                 return (int16_t)(list - anchor);
1159             }
1160             list++;
1161         }
1162         ++list;     /* skip final nullptr *CWB*/
1163     }
1164     return std::nullopt;
1165 }
1166 
1167 }  // namespace
1168 
1169 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1170 uloc_getCurrentCountryID(const char* oldID){
1171     std::optional<int16_t> offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1172     return offset.has_value() ? REPLACEMENT_COUNTRIES[*offset] : oldID;
1173 }
1174 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1175 uloc_getCurrentLanguageID(const char* oldID){
1176     std::optional<int16_t> offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1177     return offset.has_value() ? REPLACEMENT_LANGUAGES[*offset] : oldID;
1178 }
1179 
1180 namespace {
1181 
1182 /*
1183  * the internal functions _getLanguage(), _getScript(), _getRegion(), _getVariant()
1184  * avoid duplicating code to handle the earlier locale ID pieces
1185  * in the functions for the later ones by
1186  * setting the *pEnd pointer to where they stopped parsing
1187  *
1188  * TODO try to use this in Locale
1189  */
1190 
1191 void
_getLanguage(const char * localeID,ByteSink * sink,const char ** pEnd,UErrorCode & status)1192 _getLanguage(const char* localeID,
1193              ByteSink* sink,
1194              const char** pEnd,
1195              UErrorCode& status) {
1196     U_ASSERT(pEnd != nullptr);
1197     *pEnd = localeID;
1198 
1199     if (uprv_stricmp(localeID, "root") == 0) {
1200         localeID += 4;
1201     } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1202                (localeID[3] == '\0' ||
1203                 localeID[3] == '-' ||
1204                 localeID[3] == '_' ||
1205                 localeID[3] == '@')) {
1206         localeID += 3;
1207     }
1208 
1209     constexpr int32_t MAXLEN = ULOC_LANG_CAPACITY - 1;  // Minus NUL.
1210 
1211     /* if it starts with i- or x- then copy that prefix */
1212     int32_t len = _isIDPrefix(localeID) ? 2 : 0;
1213     while (!_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
1214         if (len == MAXLEN) {
1215             status = U_ILLEGAL_ARGUMENT_ERROR;
1216             return;
1217         }
1218         len++;
1219     }
1220 
1221     *pEnd = localeID + len;
1222     if (sink == nullptr || len == 0) { return; }
1223 
1224     int32_t minCapacity = uprv_max(len, 4);  // Minimum 3 letters plus NUL.
1225     char scratch[MAXLEN];
1226     int32_t capacity = 0;
1227     char* buffer = sink->GetAppendBuffer(
1228             minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity);
1229 
1230     for (int32_t i = 0; i < len; ++i) {
1231         buffer[i] = uprv_tolower(localeID[i]);
1232     }
1233     if (_isIDSeparator(localeID[1])) {
1234         buffer[1] = '-';
1235     }
1236 
1237     if (len == 3) {
1238         /* convert 3 character code to 2 character code if possible *CWB*/
1239         U_ASSERT(capacity >= 4);
1240         buffer[3] = '\0';
1241         std::optional<int16_t> offset = _findIndex(LANGUAGES_3, buffer);
1242         if (offset.has_value()) {
1243             const char* const alias = LANGUAGES[*offset];
1244             sink->Append(alias, (int32_t)uprv_strlen(alias));
1245             return;
1246         }
1247     }
1248 
1249     sink->Append(buffer, len);
1250 }
1251 
1252 void
_getScript(const char * localeID,ByteSink * sink,const char ** pEnd)1253 _getScript(const char* localeID,
1254            ByteSink* sink,
1255            const char** pEnd) {
1256     U_ASSERT(pEnd != nullptr);
1257     *pEnd = localeID;
1258 
1259     constexpr int32_t LENGTH = 4;
1260 
1261     int32_t len = 0;
1262     while (!_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len]) &&
1263             uprv_isASCIILetter(localeID[len])) {
1264         if (len == LENGTH) { return; }
1265         len++;
1266     }
1267     if (len != LENGTH) { return; }
1268 
1269     *pEnd = localeID + LENGTH;
1270     if (sink == nullptr) { return; }
1271 
1272     char scratch[LENGTH];
1273     int32_t capacity = 0;
1274     char* buffer = sink->GetAppendBuffer(
1275             LENGTH, LENGTH, scratch, UPRV_LENGTHOF(scratch), &capacity);
1276 
1277     buffer[0] = uprv_toupper(localeID[0]);
1278     for (int32_t i = 1; i < LENGTH; ++i) {
1279         buffer[i] = uprv_tolower(localeID[i]);
1280     }
1281 
1282     sink->Append(buffer, LENGTH);
1283 }
1284 
1285 void
_getRegion(const char * localeID,ByteSink * sink,const char ** pEnd)1286 _getRegion(const char* localeID,
1287            ByteSink* sink,
1288            const char** pEnd) {
1289     U_ASSERT(pEnd != nullptr);
1290     *pEnd = localeID;
1291 
1292     constexpr int32_t MINLEN = 2;
1293     constexpr int32_t MAXLEN = ULOC_COUNTRY_CAPACITY - 1;  // Minus NUL.
1294 
1295     int32_t len = 0;
1296     while (!_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
1297         if (len == MAXLEN) { return; }
1298         len++;
1299     }
1300     if (len < MINLEN) { return; }
1301 
1302     *pEnd = localeID + len;
1303     if (sink == nullptr) { return; }
1304 
1305     char scratch[ULOC_COUNTRY_CAPACITY];
1306     int32_t capacity = 0;
1307     char* buffer = sink->GetAppendBuffer(
1308             ULOC_COUNTRY_CAPACITY,
1309             ULOC_COUNTRY_CAPACITY,
1310             scratch,
1311             UPRV_LENGTHOF(scratch),
1312             &capacity);
1313 
1314     for (int32_t i = 0; i < len; ++i) {
1315         buffer[i] = uprv_toupper(localeID[i]);
1316     }
1317 
1318     if (len == 3) {
1319         /* convert 3 character code to 2 character code if possible *CWB*/
1320         U_ASSERT(capacity >= 4);
1321         buffer[3] = '\0';
1322         std::optional<int16_t> offset = _findIndex(COUNTRIES_3, buffer);
1323         if (offset.has_value()) {
1324             const char* const alias = COUNTRIES[*offset];
1325             sink->Append(alias, (int32_t)uprv_strlen(alias));
1326             return;
1327         }
1328     }
1329 
1330     sink->Append(buffer, len);
1331 }
1332 
1333 /**
1334  * @param needSeparator if true, then add leading '_' if any variants
1335  * are added to 'variant'
1336  */
1337 void
_getVariant(const char * localeID,char prev,ByteSink * sink,const char ** pEnd,bool needSeparator,UErrorCode & status)1338 _getVariant(const char* localeID,
1339             char prev,
1340             ByteSink* sink,
1341             const char** pEnd,
1342             bool needSeparator,
1343             UErrorCode& status) {
1344     if (U_FAILURE(status)) return;
1345     if (pEnd != nullptr) { *pEnd = localeID; }
1346 
1347     // Reasonable upper limit for variants
1348     // There are no strict limitation of the syntax of variant in the legacy
1349     // locale format. If the locale is constructed from unicode_locale_id
1350     // as defined in UTS35, then we know each unicode_variant_subtag
1351     // could have max length of 8 ((alphanum{5,8} | digit alphanum{3})
1352     // 179 would allow 20 unicode_variant_subtag with sep in the
1353     // unicode_locale_id
1354     // 8*20 + 1*(20-1) = 179
1355     constexpr int32_t MAX_VARIANTS_LENGTH = 179;
1356 
1357     /* get one or more variant tags and separate them with '_' */
1358     int32_t index = 0;
1359     if (_isIDSeparator(prev)) {
1360         /* get a variant string after a '-' or '_' */
1361         for (index=0; !_isTerminator(localeID[index]); index++) {
1362             if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
1363                 status = U_ILLEGAL_ARGUMENT_ERROR;
1364                 return;
1365             }
1366             if (needSeparator) {
1367                 if (sink != nullptr) {
1368                     sink->Append("_", 1);
1369                 }
1370                 needSeparator = false;
1371             }
1372             if (sink != nullptr) {
1373                 char c = (char)uprv_toupper(localeID[index]);
1374                 if (c == '-') c = '_';
1375                 sink->Append(&c, 1);
1376             }
1377         }
1378         if (pEnd != nullptr) { *pEnd = localeID+index; }
1379     }
1380 
1381     /* if there is no variant tag after a '-' or '_' then look for '@' */
1382     if (index == 0) {
1383         if (prev=='@') {
1384             /* keep localeID */
1385         } else if((localeID=locale_getKeywordsStart(localeID))!=nullptr) {
1386             ++localeID; /* point after the '@' */
1387         } else {
1388             return;
1389         }
1390         for(; !_isTerminator(localeID[index]); index++) {
1391             if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
1392                 status = U_ILLEGAL_ARGUMENT_ERROR;
1393                 return;
1394             }
1395             if (needSeparator) {
1396                 if (sink != nullptr) {
1397                     sink->Append("_", 1);
1398                 }
1399                 needSeparator = false;
1400             }
1401             if (sink != nullptr) {
1402                 char c = (char)uprv_toupper(localeID[index]);
1403                 if (c == '-' || c == ',') c = '_';
1404                 sink->Append(&c, 1);
1405             }
1406         }
1407         if (pEnd != nullptr) { *pEnd = localeID + index; }
1408     }
1409 }
1410 
1411 }  // namespace
1412 
1413 U_EXPORT CharString
ulocimp_getLanguage(const char * localeID,UErrorCode & status)1414 ulocimp_getLanguage(const char* localeID, UErrorCode& status) {
1415     return ByteSinkUtil::viaByteSinkToCharString(
1416         [&](ByteSink& sink, UErrorCode& status) {
1417             ulocimp_getSubtags(
1418                     localeID,
1419                     &sink,
1420                     nullptr,
1421                     nullptr,
1422                     nullptr,
1423                     nullptr,
1424                     status);
1425         },
1426         status);
1427 }
1428 
1429 U_EXPORT CharString
ulocimp_getScript(const char * localeID,UErrorCode & status)1430 ulocimp_getScript(const char* localeID, UErrorCode& status) {
1431     return ByteSinkUtil::viaByteSinkToCharString(
1432         [&](ByteSink& sink, UErrorCode& status) {
1433             ulocimp_getSubtags(
1434                     localeID,
1435                     nullptr,
1436                     &sink,
1437                     nullptr,
1438                     nullptr,
1439                     nullptr,
1440                     status);
1441         },
1442         status);
1443 }
1444 
1445 U_EXPORT CharString
ulocimp_getRegion(const char * localeID,UErrorCode & status)1446 ulocimp_getRegion(const char* localeID, UErrorCode& status) {
1447     return ByteSinkUtil::viaByteSinkToCharString(
1448         [&](ByteSink& sink, UErrorCode& status) {
1449             ulocimp_getSubtags(
1450                     localeID,
1451                     nullptr,
1452                     nullptr,
1453                     &sink,
1454                     nullptr,
1455                     nullptr,
1456                     status);
1457         },
1458         status);
1459 }
1460 
1461 U_EXPORT CharString
ulocimp_getVariant(const char * localeID,UErrorCode & status)1462 ulocimp_getVariant(const char* localeID, UErrorCode& status) {
1463     return ByteSinkUtil::viaByteSinkToCharString(
1464         [&](ByteSink& sink, UErrorCode& status) {
1465             ulocimp_getSubtags(
1466                     localeID,
1467                     nullptr,
1468                     nullptr,
1469                     nullptr,
1470                     &sink,
1471                     nullptr,
1472                     status);
1473         },
1474         status);
1475 }
1476 
1477 U_EXPORT void
ulocimp_getSubtags(const char * localeID,CharString * language,CharString * script,CharString * region,CharString * variant,const char ** pEnd,UErrorCode & status)1478 ulocimp_getSubtags(
1479         const char* localeID,
1480         CharString* language,
1481         CharString* script,
1482         CharString* region,
1483         CharString* variant,
1484         const char** pEnd,
1485         UErrorCode& status) {
1486     if (U_FAILURE(status)) { return; }
1487 
1488     std::optional<CharStringByteSink> languageSink;
1489     std::optional<CharStringByteSink> scriptSink;
1490     std::optional<CharStringByteSink> regionSink;
1491     std::optional<CharStringByteSink> variantSink;
1492 
1493     if (language != nullptr) { languageSink.emplace(language); }
1494     if (script != nullptr) { scriptSink.emplace(script); }
1495     if (region != nullptr) { regionSink.emplace(region); }
1496     if (variant != nullptr) { variantSink.emplace(variant); }
1497 
1498     ulocimp_getSubtags(
1499             localeID,
1500             languageSink.has_value() ? &*languageSink : nullptr,
1501             scriptSink.has_value() ? &*scriptSink : nullptr,
1502             regionSink.has_value() ? &*regionSink : nullptr,
1503             variantSink.has_value() ? &*variantSink : nullptr,
1504             pEnd,
1505             status);
1506 }
1507 
1508 U_EXPORT void
ulocimp_getSubtags(const char * localeID,ByteSink * language,ByteSink * script,ByteSink * region,ByteSink * variant,const char ** pEnd,UErrorCode & status)1509 ulocimp_getSubtags(
1510         const char* localeID,
1511         ByteSink* language,
1512         ByteSink* script,
1513         ByteSink* region,
1514         ByteSink* variant,
1515         const char** pEnd,
1516         UErrorCode& status) {
1517     if (U_FAILURE(status)) { return; }
1518 
1519     if (pEnd != nullptr) {
1520         *pEnd = localeID;
1521     } else if (language == nullptr &&
1522                script == nullptr &&
1523                region == nullptr &&
1524                variant == nullptr) {
1525         return;
1526     }
1527 
1528     bool hasRegion = false;
1529 
1530     if (localeID == nullptr) {
1531         localeID = uloc_getDefault();
1532     }
1533 
1534     _getLanguage(localeID, language, &localeID, status);
1535     if (U_FAILURE(status)) { return; }
1536     U_ASSERT(localeID != nullptr);
1537 
1538     if (pEnd != nullptr) {
1539         *pEnd = localeID;
1540     } else if (script == nullptr &&
1541                region == nullptr &&
1542                variant == nullptr) {
1543         return;
1544     }
1545 
1546     if (_isIDSeparator(*localeID)) {
1547         const char* begin = localeID + 1;
1548         const char* end = nullptr;
1549         _getScript(begin, script, &end);
1550         U_ASSERT(end != nullptr);
1551         if (end != begin) {
1552             localeID = end;
1553             if (pEnd != nullptr) { *pEnd = localeID; }
1554         }
1555     }
1556 
1557     if (region == nullptr && variant == nullptr && pEnd == nullptr) { return; }
1558 
1559     if (_isIDSeparator(*localeID)) {
1560         const char* begin = localeID + 1;
1561         const char* end = nullptr;
1562         _getRegion(begin, region, &end);
1563         U_ASSERT(end != nullptr);
1564         if (end != begin) {
1565             hasRegion = true;
1566             localeID = end;
1567             if (pEnd != nullptr) { *pEnd = localeID; }
1568         }
1569     }
1570 
1571     if (variant == nullptr && pEnd == nullptr) { return; }
1572 
1573     if (_isIDSeparator(*localeID) && !_isBCP47Extension(localeID)) {
1574         /* If there was no country ID, skip a possible extra IDSeparator */
1575         if (!hasRegion && _isIDSeparator(localeID[1])) {
1576             localeID++;
1577         }
1578         const char* begin = localeID + 1;
1579         const char* end = nullptr;
1580         _getVariant(begin, *localeID, variant, &end, false, status);
1581         if (U_FAILURE(status)) { return; }
1582         U_ASSERT(end != nullptr);
1583         if (end != begin && pEnd != nullptr) { *pEnd = end; }
1584     }
1585 }
1586 
1587 /* Keyword enumeration */
1588 
/**
 * Per-enumeration state for the keyword UEnumeration.
 *
 * The buffer is a sequence of NUL-terminated keywords followed by an empty
 * string; the enumeration callbacks walk it with strlen()+1 steps.
 */
struct UKeywordsContext {
    char* keywords;  // start of the buffer; released with uprv_free() on close
    char* current;   // next keyword to hand out; reset sets it back to keywords
};
1593 
1594 U_CDECL_BEGIN
1595 
1596 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1597 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1598     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1599     uprv_free(enumerator->context);
1600     uprv_free(enumerator);
1601 }
1602 
1603 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1604 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1605     char *kw = ((UKeywordsContext *)en->context)->keywords;
1606     int32_t result = 0;
1607     while(*kw) {
1608         result++;
1609         kw += uprv_strlen(kw)+1;
1610     }
1611     return result;
1612 }
1613 
1614 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1615 uloc_kw_nextKeyword(UEnumeration* en,
1616                     int32_t* resultLength,
1617                     UErrorCode* /*status*/) {
1618     const char* result = ((UKeywordsContext *)en->context)->current;
1619     int32_t len = 0;
1620     if(*result) {
1621         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1622         ((UKeywordsContext *)en->context)->current += len+1;
1623     } else {
1624         result = nullptr;
1625     }
1626     if (resultLength) {
1627         *resultLength = len;
1628     }
1629     return result;
1630 }
1631 
1632 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1633 uloc_kw_resetKeywords(UEnumeration* en,
1634                       UErrorCode* /*status*/) {
1635     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1636 }
1637 
1638 U_CDECL_END
1639 
1640 
1641 static const UEnumeration gKeywordsEnum = {
1642     nullptr,
1643     nullptr,
1644     uloc_kw_closeKeywords,
1645     uloc_kw_countKeywords,
1646     uenum_unextDefault,
1647     uloc_kw_nextKeyword,
1648     uloc_kw_resetKeywords
1649 };
1650 
1651 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1652 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1653 {
1654     if (U_FAILURE(*status)) { return nullptr; }
1655 
1656     LocalMemory<UKeywordsContext> myContext;
1657     LocalMemory<UEnumeration> result;
1658 
1659     myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1660     result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1661     if (myContext.isNull() || result.isNull()) {
1662         *status = U_MEMORY_ALLOCATION_ERROR;
1663         return nullptr;
1664     }
1665     uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1666     myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1667     if (myContext->keywords == nullptr) {
1668         *status = U_MEMORY_ALLOCATION_ERROR;
1669         return nullptr;
1670     }
1671     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1672     myContext->keywords[keywordListSize] = 0;
1673     myContext->current = myContext->keywords;
1674     result->context = myContext.orphan();
1675     return result.orphan();
1676 }
1677 
1678 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1679 uloc_openKeywords(const char* localeID,
1680                         UErrorCode* status)
1681 {
1682     if(status==nullptr || U_FAILURE(*status)) {
1683         return nullptr;
1684     }
1685 
1686     CharString tempBuffer;
1687     const char* tmpLocaleID;
1688 
1689     if (_hasBCP47Extension(localeID)) {
1690         tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, *status);
1691         tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
1692     } else {
1693         if (localeID==nullptr) {
1694             localeID=uloc_getDefault();
1695         }
1696         tmpLocaleID=localeID;
1697     }
1698 
1699     ulocimp_getSubtags(
1700             tmpLocaleID,
1701             nullptr,
1702             nullptr,
1703             nullptr,
1704             nullptr,
1705             &tmpLocaleID,
1706             *status);
1707     if (U_FAILURE(*status)) {
1708         return nullptr;
1709     }
1710 
1711     /* keywords are located after '@' */
1712     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != nullptr) {
1713         CharString keywords = ulocimp_getKeywords(tmpLocaleID + 1, '@', false, *status);
1714         if (U_FAILURE(*status)) {
1715             return nullptr;
1716         }
1717         return uloc_openKeywordList(keywords.data(), keywords.length(), status);
1718     }
1719     return nullptr;
1720 }
1721 
1722 
/* Bit flags accepted by the 'options' parameter of _canonicalize(). */
#define _ULOC_CANONICALIZE   0x1  /* level 2 canonicalization (see _canonicalize) */
#define _ULOC_STRIP_KEYWORDS 0x2  /* do not emit the "@keyword=value" part */
1726 
1727 namespace {
1728 
/** Returns true if at least one bit of mask is set in options. */
inline bool OPTION_SET(uint32_t options, uint32_t mask) {
    const uint32_t selected = options & mask;
    return selected != 0;
}
1730 
1731 constexpr char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
1732 constexpr int32_t I_DEFAULT_LENGTH = UPRV_LENGTHOF(i_default);
1733 
1734 /**
1735  * Canonicalize the given localeID, to level 1 or to level 2,
1736  * depending on the options.  To specify level 1, pass in options=0.
1737  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1738  *
1739  * This is the code underlying uloc_getName and uloc_canonicalize.
1740  */
1741 void
_canonicalize(const char * localeID,ByteSink & sink,uint32_t options,UErrorCode & err)1742 _canonicalize(const char* localeID,
1743               ByteSink& sink,
1744               uint32_t options,
1745               UErrorCode& err) {
1746     if (U_FAILURE(err)) {
1747         return;
1748     }
1749 
1750     int32_t j, fieldCount=0;
1751     CharString tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
1752     CharString localeIDWithHyphens;  // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
1753     const char* origLocaleID;
1754     const char* tmpLocaleID;
1755     const char* keywordAssign = nullptr;
1756     const char* separatorIndicator = nullptr;
1757 
1758     if (_hasBCP47Extension(localeID)) {
1759         const char* localeIDPtr = localeID;
1760 
1761         // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
1762         if (uprv_strchr(localeID, '_') != nullptr && localeID[1] != '-' && localeID[1] != '_') {
1763             localeIDWithHyphens.append(localeID, -1, err);
1764             if (U_SUCCESS(err)) {
1765                 for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
1766                     if (*p == '_') {
1767                         *p = '-';
1768                     }
1769                 }
1770                 localeIDPtr = localeIDWithHyphens.data();
1771             }
1772         }
1773 
1774         tempBuffer = ulocimp_forLanguageTag(localeIDPtr, -1, nullptr, err);
1775         tmpLocaleID = U_SUCCESS(err) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeIDPtr;
1776     } else {
1777         if (localeID==nullptr) {
1778            localeID=uloc_getDefault();
1779         }
1780         tmpLocaleID=localeID;
1781     }
1782 
1783     origLocaleID=tmpLocaleID;
1784 
1785     /* get all pieces, one after another, and separate with '_' */
1786     CharString tag;
1787     CharString script;
1788     CharString country;
1789     CharString variant;
1790     ulocimp_getSubtags(
1791             tmpLocaleID,
1792             &tag,
1793             &script,
1794             &country,
1795             &variant,
1796             &tmpLocaleID,
1797             err);
1798     if (U_FAILURE(err)) {
1799         return;
1800     }
1801 
1802     if (tag.length() == I_DEFAULT_LENGTH &&
1803             uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
1804         tag.clear();
1805         tag.append(uloc_getDefault(), err);
1806     } else {
1807         if (!script.isEmpty()) {
1808             ++fieldCount;
1809             tag.append('_', err);
1810             tag.append(script, err);
1811         }
1812         if (!country.isEmpty()) {
1813             ++fieldCount;
1814             tag.append('_', err);
1815             tag.append(country, err);
1816         }
1817         if (!variant.isEmpty()) {
1818             ++fieldCount;
1819             if (country.isEmpty()) {
1820                 tag.append('_', err);
1821             }
1822             tag.append('_', err);
1823             tag.append(variant, err);
1824         }
1825     }
1826 
1827     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1828     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1829         tag.append('.', err);
1830         ++tmpLocaleID;
1831         const char *atPos = nullptr;
1832         size_t length;
1833         if((atPos = uprv_strchr(tmpLocaleID, '@')) != nullptr) {
1834             length = atPos - tmpLocaleID;
1835         } else {
1836             length = uprv_strlen(tmpLocaleID);
1837         }
1838         // The longest charset name we found in IANA charset registry
1839         // https://www.iana.org/assignments/character-sets/ is
1840         // "Extended_UNIX_Code_Packed_Format_for_Japanese" in length 45.
1841         // we therefore restrict the length here to be 64 which is a power of 2
1842         // number that is longer than 45.
1843         constexpr size_t kMaxCharsetLength = 64;
1844         if (length > kMaxCharsetLength) {
1845            err = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1846            return;
1847         }
1848         tag.append(tmpLocaleID, static_cast<int32_t>(length), err);
1849         tmpLocaleID += length;
1850     }
1851 
1852     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1853        After this, tmpLocaleID either points to '@' or is nullptr */
1854     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=nullptr) {
1855         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1856         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1857     }
1858 
1859     /* Copy POSIX-style variant, if any [mr@FOO] */
1860     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1861         tmpLocaleID != nullptr && keywordAssign == nullptr) {
1862         for (;;) {
1863             char c = *tmpLocaleID;
1864             if (c == 0) {
1865                 break;
1866             }
1867             tag.append(c, err);
1868             ++tmpLocaleID;
1869         }
1870     }
1871 
1872     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1873         /* Handle @FOO variant if @ is present and not followed by = */
1874         if (tmpLocaleID!=nullptr && keywordAssign==nullptr) {
1875             /* Add missing '_' if needed */
1876             if (fieldCount < 2 || (fieldCount < 3 && !script.isEmpty())) {
1877                 do {
1878                     tag.append('_', err);
1879                     ++fieldCount;
1880                 } while(fieldCount<2);
1881             }
1882 
1883             CharStringByteSink s(&tag);
1884             _getVariant(tmpLocaleID+1, '@', &s, nullptr, !variant.isEmpty(), err);
1885             if (U_FAILURE(err)) { return; }
1886         }
1887 
1888         /* Look up the ID in the canonicalization map */
1889         for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1890             StringPiece id(CANONICALIZE_MAP[j].id);
1891             if (tag == id) {
1892                 if (id.empty() && tmpLocaleID != nullptr) {
1893                     break; /* Don't remap "" if keywords present */
1894                 }
1895                 tag.clear();
1896                 tag.append(CANONICALIZE_MAP[j].canonicalID, err);
1897                 break;
1898             }
1899         }
1900     }
1901 
1902     sink.Append(tag.data(), tag.length());
1903 
1904     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1905         if (tmpLocaleID!=nullptr && keywordAssign!=nullptr &&
1906             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1907             sink.Append("@", 1);
1908             ++fieldCount;
1909             ulocimp_getKeywords(tmpLocaleID+1, '@', sink, true, err);
1910         }
1911     }
1912 }
1913 
1914 }  // namespace
1915 
1916 /* ### ID parsing API **************************************************/
1917 
1918 U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1919 uloc_getParent(const char*    localeID,
1920                char* parent,
1921                int32_t parentCapacity,
1922                UErrorCode* err)
1923 {
1924     return ByteSinkUtil::viaByteSinkToTerminatedChars(
1925         parent, parentCapacity,
1926         [&](ByteSink& sink, UErrorCode& status) {
1927             ulocimp_getParent(localeID, sink, status);
1928         },
1929         *err);
1930 }
1931 
1932 U_EXPORT CharString
ulocimp_getParent(const char * localeID,UErrorCode & err)1933 ulocimp_getParent(const char* localeID,
1934                   UErrorCode& err)
1935 {
1936     return ByteSinkUtil::viaByteSinkToCharString(
1937         [&](ByteSink& sink, UErrorCode& status) {
1938             ulocimp_getParent(localeID, sink, status);
1939         },
1940         err);
1941 }
1942 
1943 U_EXPORT void
ulocimp_getParent(const char * localeID,icu::ByteSink & sink,UErrorCode & err)1944 ulocimp_getParent(const char* localeID,
1945                   icu::ByteSink& sink,
1946                   UErrorCode& err)
1947 {
1948     if (U_FAILURE(err)) { return; }
1949 
1950     const char *lastUnderscore;
1951     int32_t i;
1952 
1953     if (localeID == nullptr)
1954         localeID = uloc_getDefault();
1955 
1956     lastUnderscore=uprv_strrchr(localeID, '_');
1957     if(lastUnderscore!=nullptr) {
1958         i=(int32_t)(lastUnderscore-localeID);
1959     } else {
1960         i=0;
1961     }
1962 
1963     if (i > 0) {
1964         if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1965             localeID += 3;
1966             i -= 3;
1967         }
1968         sink.Append(localeID, i);
1969     }
1970 }
1971 
1972 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1973 uloc_getLanguage(const char*    localeID,
1974          char* language,
1975          int32_t languageCapacity,
1976          UErrorCode* err)
1977 {
1978     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1979     return ByteSinkUtil::viaByteSinkToTerminatedChars(
1980         language, languageCapacity,
1981         [&](ByteSink& sink, UErrorCode& status) {
1982             ulocimp_getSubtags(
1983                     localeID,
1984                     &sink,
1985                     nullptr,
1986                     nullptr,
1987                     nullptr,
1988                     nullptr,
1989                     status);
1990         },
1991         *err);
1992 }
1993 
1994 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1995 uloc_getScript(const char*    localeID,
1996          char* script,
1997          int32_t scriptCapacity,
1998          UErrorCode* err)
1999 {
2000     return ByteSinkUtil::viaByteSinkToTerminatedChars(
2001         script, scriptCapacity,
2002         [&](ByteSink& sink, UErrorCode& status) {
2003             ulocimp_getSubtags(
2004                     localeID,
2005                     nullptr,
2006                     &sink,
2007                     nullptr,
2008                     nullptr,
2009                     nullptr,
2010                     status);
2011         },
2012         *err);
2013 }
2014 
2015 U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)2016 uloc_getCountry(const char* localeID,
2017             char* country,
2018             int32_t countryCapacity,
2019             UErrorCode* err)
2020 {
2021     return ByteSinkUtil::viaByteSinkToTerminatedChars(
2022         country, countryCapacity,
2023         [&](ByteSink& sink, UErrorCode& status) {
2024             ulocimp_getSubtags(
2025                     localeID,
2026                     nullptr,
2027                     nullptr,
2028                     &sink,
2029                     nullptr,
2030                     nullptr,
2031                     status);
2032         },
2033         *err);
2034 }
2035 
2036 U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)2037 uloc_getVariant(const char* localeID,
2038                 char* variant,
2039                 int32_t variantCapacity,
2040                 UErrorCode* err)
2041 {
2042     return ByteSinkUtil::viaByteSinkToTerminatedChars(
2043         variant, variantCapacity,
2044         [&](ByteSink& sink, UErrorCode& status) {
2045             ulocimp_getSubtags(
2046                     localeID,
2047                     nullptr,
2048                     nullptr,
2049                     nullptr,
2050                     &sink,
2051                     nullptr,
2052                     status);
2053         },
2054         *err);
2055 }
2056 
2057 U_CAPI int32_t  U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2058 uloc_getName(const char* localeID,
2059              char* name,
2060              int32_t nameCapacity,
2061              UErrorCode* err)
2062 {
2063     return ByteSinkUtil::viaByteSinkToTerminatedChars(
2064         name, nameCapacity,
2065         [&](ByteSink& sink, UErrorCode& status) {
2066             ulocimp_getName(localeID, sink, status);
2067         },
2068         *err);
2069 }
2070 
2071 U_EXPORT CharString
ulocimp_getName(const char * localeID,UErrorCode & err)2072 ulocimp_getName(const char* localeID,
2073                 UErrorCode& err)
2074 {
2075     return ByteSinkUtil::viaByteSinkToCharString(
2076         [&](ByteSink& sink, UErrorCode& status) {
2077             ulocimp_getName(localeID, sink, status);
2078         },
2079         err);
2080 }
2081 
2082 U_EXPORT void
ulocimp_getName(const char * localeID,ByteSink & sink,UErrorCode & err)2083 ulocimp_getName(const char* localeID,
2084                 ByteSink& sink,
2085                 UErrorCode& err)
2086 {
2087     _canonicalize(localeID, sink, 0, err);
2088 }
2089 
2090 U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2091 uloc_getBaseName(const char* localeID,
2092                  char* name,
2093                  int32_t nameCapacity,
2094                  UErrorCode* err)
2095 {
2096     return ByteSinkUtil::viaByteSinkToTerminatedChars(
2097         name, nameCapacity,
2098         [&](ByteSink& sink, UErrorCode& status) {
2099             ulocimp_getBaseName(localeID, sink, status);
2100         },
2101         *err);
2102 }
2103 
2104 U_EXPORT CharString
ulocimp_getBaseName(const char * localeID,UErrorCode & err)2105 ulocimp_getBaseName(const char* localeID,
2106                     UErrorCode& err)
2107 {
2108     return ByteSinkUtil::viaByteSinkToCharString(
2109         [&](ByteSink& sink, UErrorCode& status) {
2110             ulocimp_getBaseName(localeID, sink, status);
2111         },
2112         err);
2113 }
2114 
2115 U_EXPORT void
ulocimp_getBaseName(const char * localeID,ByteSink & sink,UErrorCode & err)2116 ulocimp_getBaseName(const char* localeID,
2117                     ByteSink& sink,
2118                     UErrorCode& err)
2119 {
2120     _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
2121 }
2122 
2123 U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)2124 uloc_canonicalize(const char* localeID,
2125                   char* name,
2126                   int32_t nameCapacity,
2127                   UErrorCode* err)
2128 {
2129     return ByteSinkUtil::viaByteSinkToTerminatedChars(
2130         name, nameCapacity,
2131         [&](ByteSink& sink, UErrorCode& status) {
2132             ulocimp_canonicalize(localeID, sink, status);
2133         },
2134         *err);
2135 }
2136 
2137 U_EXPORT CharString
ulocimp_canonicalize(const char * localeID,UErrorCode & err)2138 ulocimp_canonicalize(const char* localeID,
2139                      UErrorCode& err)
2140 {
2141     return ByteSinkUtil::viaByteSinkToCharString(
2142         [&](ByteSink& sink, UErrorCode& status) {
2143             ulocimp_canonicalize(localeID, sink, status);
2144         },
2145         err);
2146 }
2147 
2148 U_EXPORT void
ulocimp_canonicalize(const char * localeID,ByteSink & sink,UErrorCode & err)2149 ulocimp_canonicalize(const char* localeID,
2150                      ByteSink& sink,
2151                      UErrorCode& err)
2152 {
2153     _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
2154 }
2155 
2156 U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char * localeID)2157 uloc_getISO3Language(const char* localeID)
2158 {
2159     UErrorCode err = U_ZERO_ERROR;
2160 
2161     if (localeID == nullptr)
2162     {
2163         localeID = uloc_getDefault();
2164     }
2165     CharString lang = ulocimp_getLanguage(localeID, err);
2166     if (U_FAILURE(err))
2167         return "";
2168     std::optional<int16_t> offset = _findIndex(LANGUAGES, lang.data());
2169     return offset.has_value() ? LANGUAGES_3[*offset] : "";
2170 }
2171 
2172 U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char * localeID)2173 uloc_getISO3Country(const char* localeID)
2174 {
2175     UErrorCode err = U_ZERO_ERROR;
2176 
2177     if (localeID == nullptr)
2178     {
2179         localeID = uloc_getDefault();
2180     }
2181     CharString cntry = ulocimp_getRegion(localeID, err);
2182     if (U_FAILURE(err))
2183         return "";
2184     std::optional<int16_t> offset = _findIndex(COUNTRIES, cntry.data());
2185     return offset.has_value() ? COUNTRIES_3[*offset] : "";
2186 }
2187 
2188 U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char * localeID)2189 uloc_getLCID(const char* localeID)
2190 {
2191     UErrorCode status = U_ZERO_ERROR;
2192     uint32_t   lcid = 0;
2193 
2194     /* Check for incomplete id. */
2195     if (!localeID || uprv_strlen(localeID) < 2) {
2196         return 0;
2197     }
2198 
2199     // First, attempt Windows platform lookup if available, but fall
2200     // through to catch any special cases (ICU vs Windows name differences).
2201     lcid = uprv_convertToLCIDPlatform(localeID, &status);
2202     if (U_FAILURE(status)) {
2203         return 0;
2204     }
2205     if (lcid > 0) {
2206         // Windows found an LCID, return that
2207         return lcid;
2208     }
2209 
2210     CharString langID = ulocimp_getLanguage(localeID, status);
2211     if (U_FAILURE(status)) {
2212         return 0;
2213     }
2214 
2215     if (uprv_strchr(localeID, '@')) {
2216         // uprv_convertToLCID does not support keywords other than collation.
2217         // Remove all keywords except collation.
2218         CharString collVal = ulocimp_getKeywordValue(localeID, "collation", status);
2219         if (U_SUCCESS(status) && !collVal.isEmpty()) {
2220             CharString tmpLocaleID = ulocimp_getBaseName(localeID, status);
2221             ulocimp_setKeywordValue("collation", collVal.data(), tmpLocaleID, status);
2222             if (U_SUCCESS(status)) {
2223                 return uprv_convertToLCID(langID.data(), tmpLocaleID.data(), &status);
2224             }
2225         }
2226 
2227         // fall through - all keywords are simply ignored
2228         status = U_ZERO_ERROR;
2229     }
2230 
2231     return uprv_convertToLCID(langID.data(), localeID, &status);
2232 }
2233 
2234 U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,char * locale,int32_t localeCapacity,UErrorCode * status)2235 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2236                 UErrorCode *status)
2237 {
2238     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2239 }
2240 
2241 /* ### Default locale **************************************************/
2242 
2243 U_CAPI const char*  U_EXPORT2
uloc_getDefault()2244 uloc_getDefault()
2245 {
2246     return locale_get_default();
2247 }
2248 
2249 U_CAPI void  U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2250 uloc_setDefault(const char*   newDefaultLocale,
2251              UErrorCode* err)
2252 {
2253     if (U_FAILURE(*err))
2254         return;
2255     /* the error code isn't currently used for anything by this function*/
2256 
2257     /* propagate change to C++ */
2258     locale_set_default(newDefaultLocale);
2259 }
2260 
2261 /**
2262  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2263  * to an array of pointers to arrays of char.  All of these pointers are owned
2264  * by ICU-- do not delete them, and do not write through them.  The array is
2265  * terminated with a null pointer.
2266  */
2267 U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()2268 uloc_getISOLanguages()
2269 {
2270     return LANGUAGES;
2271 }
2272 
2273 /**
2274  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2275  * pointer to an array of pointers to arrays of char.  All of these pointers are
2276  * owned by ICU-- do not delete them, and do not write through them.  The array is
2277  * terminated with a null pointer.
2278  */
2279 U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()2280 uloc_getISOCountries()
2281 {
2282     return COUNTRIES;
2283 }
2284 
2285 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2286 uloc_toUnicodeLocaleKey(const char* keyword)
2287 {
2288     const char* bcpKey = ulocimp_toBcpKey(keyword);
2289     if (bcpKey == nullptr && ultag_isUnicodeLocaleKey(keyword, -1)) {
2290         // unknown keyword, but syntax is fine..
2291         return keyword;
2292     }
2293     return bcpKey;
2294 }
2295 
2296 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2297 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2298 {
2299     const char* bcpType = ulocimp_toBcpType(keyword, value, nullptr, nullptr);
2300     if (bcpType == nullptr && ultag_isUnicodeLocaleType(value, -1)) {
2301         // unknown keyword, but syntax is fine..
2302         return value;
2303     }
2304     return bcpType;
2305 }
2306 
2307 namespace {
2308 
2309 bool
isWellFormedLegacyKey(const char * legacyKey)2310 isWellFormedLegacyKey(const char* legacyKey)
2311 {
2312     const char* p = legacyKey;
2313     while (*p) {
2314         if (!UPRV_ISALPHANUM(*p)) {
2315             return false;
2316         }
2317         p++;
2318     }
2319     return true;
2320 }
2321 
2322 bool
isWellFormedLegacyType(const char * legacyType)2323 isWellFormedLegacyType(const char* legacyType)
2324 {
2325     const char* p = legacyType;
2326     int32_t alphaNumLen = 0;
2327     while (*p) {
2328         if (*p == '_' || *p == '/' || *p == '-') {
2329             if (alphaNumLen == 0) {
2330                 return false;
2331             }
2332             alphaNumLen = 0;
2333         } else if (UPRV_ISALPHANUM(*p)) {
2334             alphaNumLen++;
2335         } else {
2336             return false;
2337         }
2338         p++;
2339     }
2340     return (alphaNumLen != 0);
2341 }
2342 
2343 }  // namespace
2344 
2345 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2346 uloc_toLegacyKey(const char* keyword)
2347 {
2348     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2349     if (legacyKey == nullptr) {
2350         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2351         //
2352         // Note:
2353         //  LDML/CLDR provides some definition of keyword syntax in
2354         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2355         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2356         //  Keys can only consist of [0-9a-zA-Z].
2357         if (isWellFormedLegacyKey(keyword)) {
2358             return keyword;
2359         }
2360     }
2361     return legacyKey;
2362 }
2363 
2364 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2365 uloc_toLegacyType(const char* keyword, const char* value)
2366 {
2367     const char* legacyType = ulocimp_toLegacyType(keyword, value, nullptr, nullptr);
2368     if (legacyType == nullptr) {
2369         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2370         //
2371         // Note:
2372         //  LDML/CLDR provides some definition of keyword syntax in
2373         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2374         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2375         //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2376         //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2377         if (isWellFormedLegacyType(value)) {
2378             return value;
2379         }
2380     }
2381     return legacyType;
2382 }
2383 
2384 /*eof*/
2385