xref: /aosp_15_r20/external/cronet/third_party/icu/source/common/uloc.cpp (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1997-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   04/01/97    aliu        Creation.
15 *   08/21/98    stephen     JDK 1.2 sync
16 *   12/08/98    rtg         New Locale implementation and C API
17 *   03/15/99    damiba      overhaul.
18 *   04/06/99    stephen     changed setDefault() to realloc and copy
19 *   06/14/99    stephen     Changed calls to ures_open for new params
20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22 *                           brought canonicalization code into line with spec
23 *****************************************************************************/
24 
25 /*
26    POSIX's locale format, from putil.c: [no spaces]
27 
28      ll [ _CC ] [ . MM ] [ @ VV]
29 
30      l = lang, C = ctry, M = charmap, V = variant
31 */
32 
33 #include "unicode/bytestream.h"
34 #include "unicode/errorcode.h"
35 #include "unicode/stringpiece.h"
36 #include "unicode/utypes.h"
37 #include "unicode/ustring.h"
38 #include "unicode/uloc.h"
39 
40 #include "bytesinkutil.h"
41 #include "putilimp.h"
42 #include "ustr_imp.h"
43 #include "ulocimp.h"
44 #include "umutex.h"
45 #include "cstring.h"
46 #include "cmemory.h"
47 #include "locmap.h"
48 #include "uarrsort.h"
49 #include "uenumimp.h"
50 #include "uassert.h"
51 #include "charstr.h"
52 
53 U_NAMESPACE_USE
54 
55 /* ### Declarations **************************************************/
56 
/* Locale stuff from locid.cpp */
/* Forward declaration only; presumably installs `id` as the default locale -- confirm against locid.cpp. */
U_CFUNC void locale_set_default(const char *id);
/* Forward declaration only; presumably returns the current default locale ID -- confirm against locid.cpp. */
U_CFUNC const char *locale_get_default();
60 
61 /* ### Data tables **************************************************/
62 
63 /**
64  * Table of language codes, both 2- and 3-letter, with preference
65  * given to 2-letter codes where possible.  Includes 3-letter codes
66  * that lack a 2-letter equivalent.
67  *
68  * This list must be in sorted order.  This list is returned directly
69  * to the user by some API.
70  *
71  * This list must be kept in sync with LANGUAGES_3, with corresponding
72  * entries matched.
73  *
74  * This table should be terminated with a nullptr entry, followed by a
75  * second list, and another nullptr entry.  The first list is visible to
76  * user code when this array is returned by API.  The second list
77  * contains codes we support, but do not expose through user API.
78  *
79  * Notes
80  *
81  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
82  * include the revisions up to 2001/7/27 *CWB*
83  *
84  * The 3 character codes are the terminology codes like RFC 3066.  This
85  * is compatible with prior ICU codes
86  *
 * "in", "iw", "ji", "jw" and "sh" have been withdrawn.  They are still
 * in the table, but are now placed at the end of it because their
 * 3-character codes are duplicates of other entries.  This avoids bad
 * searches going from 3- to 2-character codes.
91  *
92  * The range qaa-qtz is reserved for local use
93  */
94 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
95 /* ISO639 table version is 20150505 */
96 /* Subsequent hand addition of selected languages */
/* Layout: two nullptr-terminated lists stored back to back (main list, then obsolete codes). */
static const char * const LANGUAGES[] = {
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgc", "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "blo", "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "csw", "cu",  "cv",  "cy",
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "kxv", "ky",
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni",
    "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    "qu",  "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "syc", "syr", "szl",
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tok", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vmw",
    "vo", "vot", "vro", "vun",
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    "xal", "xh",  "xmf", "xnr", "xog",
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
nullptr, /* terminates the first list */
    "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  /* obsolete language codes */
nullptr /* terminates the second list */
};
187 
/*
 * Deprecated language codes and, in the same positions, the codes that
 * replace them.  Both arrays have the same number of entries and end with
 * two nullptr entries.
 * NOTE(review): presumably looked up by index (DEPRECATED_LANGUAGES[i] ->
 * REPLACEMENT_LANGUAGES[i]); the lookup code is outside this view.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", "mo", nullptr, nullptr
};
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", "ro", nullptr, nullptr
};
194 
195 /**
196  * Table of 3-letter language codes.
197  *
198  * This is a lookup table used to convert 3-letter language codes to
199  * their 2-letter equivalent, where possible.  It must be kept in sync
200  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
201  * same language as LANGUAGES_3[i].  The commented-out lines are
202  * copied from LANGUAGES to make eyeballing this baby easier.
203  *
204  * Where a 3-letter language code has no 2-letter equivalent, the
205  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
206  *
207  * This table should be terminated with a nullptr entry, followed by a
208  * second list, and another nullptr entry.  The two lists correspond to
209  * the two lists in LANGUAGES.
210  */
211 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
212 /* ISO639 table version is 20150505 */
213 /* Subsequent hand addition of selected languages */
/* Layout: two nullptr-terminated lists stored back to back, parallel to the two lists in LANGUAGES. */
static const char * const LANGUAGES_3[] = {
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "csw", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kxv", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw",
    "vol", "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xnr", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
nullptr, /* terminates the first list */
/*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
nullptr /* terminates the second list */
};
305 
306 /**
307  * Table of 2-letter country codes.
308  *
309  * This list must be in sorted order.  This list is returned directly
310  * to the user by some API.
311  *
312  * This list must be kept in sync with COUNTRIES_3, with corresponding
313  * entries matched.
314  *
315  * This table should be terminated with a nullptr entry, followed by a
316  * second list, and another nullptr entry.  The first list is visible to
317  * user code when this array is returned by API.  The second list
318  * contains codes we support, but do not expose through user API.
319  *
320  * Notes:
321  *
322  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
323  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
324  * new codes keeping the old ones for compatibility updated to include
325  * 1999/12/03 revisions *CWB*
326  *
327  * RO(ROM) is now RO(ROU) according to
328  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
329  */
/* Layout: two nullptr-terminated lists stored back to back (main list, then obsolete codes). */
static const char * const COUNTRIES[] = {
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
nullptr, /* terminates the first list */
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
nullptr /* terminates the second list */
};
365 
/*
 * Deprecated country codes and, in the same positions, the codes that
 * replace them (the commented-out row below repeats the deprecated list so
 * the pairing can be checked by eye).  Both arrays end with two nullptr
 * entries.
 * NOTE(review): presumably looked up by index; the lookup code is outside
 * this view.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", nullptr, nullptr /* deprecated country list */
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", nullptr, nullptr  /* replacement country codes */
};
373 
374 /**
375  * Table of 3-letter country codes.
376  *
377  * This is a lookup table used to convert 3-letter country codes to
378  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
379  * For all valid i, COUNTRIES[i] must refer to the same country as
380  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
381  * to make eyeballing this baby easier.
382  *
383  * This table should be terminated with a nullptr entry, followed by a
384  * second list, and another nullptr entry.  The two lists correspond to
385  * the two lists in COUNTRIES.
386  */
/* Layout: two nullptr-terminated lists stored back to back; each commented row repeats the matching COUNTRIES row. */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
nullptr, /* terminates the first list */
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
nullptr /* terminates the second list */
};
453 
/* One row of a canonicalization table: an input locale ID and the ID it maps to. */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;
458 
459 /**
460  * A map to canonicalize locale IDs.  This handles a variety of
461  * different semantic kinds of transformations.
462  */
/*
 * Each row is { input ID, canonical ID }.  The table has no sentinel row.
 * NOTE(review): the matching code is outside this view -- confirm how the
 * input ID is compared (exact match vs. prefix) before adding rows.
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "art__LOJBAN",    "jbo" }, /* registered name */
    { "hy__AREVELA",    "hy" }, /* Registered IANA variant */
    { "hy__AREVMDA",    "hyw" }, /* Registered IANA variant */
    { "zh__GUOYU",      "zh" }, /* registered name */
    { "zh__HAKKA",      "hak" }, /* registered name */
    { "zh__XIANG",      "hsn" }, /* registered name */
    // subtags with 3 chars won't be treated as variants.
    { "zh_GAN",         "gan" }, /* registered name */
    { "zh_MIN_NAN",     "nan" }, /* registered name */
    { "zh_WUU",         "wuu" }, /* registered name */
    { "zh_YUE",         "yue" }, /* registered name */
};
476 
477 /* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id may carry a BCP47 extension: the id is non-null,
 * contains no '@', and getShortestSubtagLength() reports a one-character
 * subtag for it.
 *
 * The expansion uses only the macro argument.  (It previously passed a
 * variable literally named `localeID` to getShortestSubtagLength(), so it
 * only worked when the caller happened to have that name in scope.)
 * The argument is evaluated up to three times; pass a side-effect-free
 * expression.
 */
#define _hasBCP47Extension(id) ((id) != nullptr && uprv_strstr((id), "@") == nullptr && getShortestSubtagLength(id) == 1)
/*
 * Gets the size of the shortest subtag in the given localeID.
 *
 * Only non-empty subtags that are followed by a '_' or '-' separator are
 * measured; a subtag that runs to the end of the string is not compared.
 * When no measured subtag exists, the length of the whole string is
 * returned.
 */
static int32_t getShortestSubtagLength(const char *localeID) {
    int32_t shortest = 0;   /* 0 = no separator-terminated subtag seen yet */
    int32_t run = 0;        /* length of the subtag currently being scanned */
    const char *p = localeID;

    for (; *p != '\0'; ++p) {
        if (*p == '_' || *p == '-') {
            /* A separator closes the current subtag; empty subtags are skipped. */
            if (run != 0 && (shortest == 0 || run < shortest)) {
                shortest = run;
            }
            run = 0;
        } else {
            ++run;
        }
    }

    /* Any closed subtag is shorter than the whole string, so it wins if present. */
    return shortest != 0 ? shortest : (int32_t)(p - localeID);
}
505 
506 /* ### Keywords **************************************************/
/* True if c lies in '0'..'9'.  Evaluates c twice. */
#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
/* True if uprv_isASCIILetter(c) holds or c is a digit.  Evaluates c more than once. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size in bytes of a keyword-name buffer, including the terminating NUL (see KeywordStruct::keyword). */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Capacity of the keyword list built by ulocimp_getKeywords(). */
#define ULOC_MAX_NO_KEYWORDS 25
514 
515 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)516 locale_getKeywordsStart(const char *localeID) {
517     const char *result = nullptr;
518     if((result = uprv_strchr(localeID, '@')) != nullptr) {
519         return result;
520     }
521 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
522     else {
523         /* We do this because the @ sign is variant, and the @ sign used on one
524         EBCDIC machine won't be compiled the same way on other EBCDIC based
525         machines. */
526         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
527         const uint8_t *charToFind = ebcdicSigns;
528         while(*charToFind) {
529             if((result = uprv_strchr(localeID, *charToFind)) != nullptr) {
530                 return result;
531             }
532             charToFind++;
533         }
534     }
535 #endif
536     return nullptr;
537 }
538 
539 /**
540  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
541  * @param keywordName incoming name to be canonicalized
542  * @param status return status (keyword too long)
543  * @return length of the keyword name
544  */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)545 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
546 {
547   int32_t keywordNameLen = 0;
548 
549   for (; *keywordName != 0; keywordName++) {
550     if (!UPRV_ISALPHANUM(*keywordName)) {
551       *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
552       return 0;
553     }
554     if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
555       buf[keywordNameLen++] = uprv_tolower(*keywordName);
556     } else {
557       /* keyword name too long for internal buffer */
558       *status = U_INTERNAL_PROGRAM_ERROR;
559       return 0;
560     }
561   }
562   if (keywordNameLen == 0) {
563     *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
564     return 0;
565   }
566   buf[keywordNameLen] = 0; /* terminate */
567 
568   return keywordNameLen;
569 }
570 
/* One parsed keyword=value pair, as collected by ulocimp_getKeywords(). */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* lowercased keyword name, NUL-terminated */
    int32_t keywordLen;                    /* length of keyword, excluding the NUL */
    const char *valueStart;                /* points into the caller's string; not NUL-terminated at valueLen */
    int32_t valueLen;                      /* length of the value with surrounding spaces trimmed */
} KeywordStruct;
577 
578 static int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)579 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
580     const char* leftString = ((const KeywordStruct *)left)->keyword;
581     const char* rightString = ((const KeywordStruct *)right)->keyword;
582     return uprv_strcmp(leftString, rightString);
583 }
584 
585 U_CFUNC void
ulocimp_getKeywords(const char * localeID,char prev,ByteSink & sink,UBool valuesToo,UErrorCode * status)586 ulocimp_getKeywords(const char *localeID,
587                     char prev,
588                     ByteSink& sink,
589                     UBool valuesToo,
590                     UErrorCode *status)
591 {
592     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
593 
594     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
595     int32_t numKeywords = 0;
596     const char* pos = localeID;
597     const char* equalSign = nullptr;
598     const char* semicolon = nullptr;
599     int32_t i = 0, j, n;
600 
601     if(prev == '@') { /* start of keyword definition */
602         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
603         do {
604             UBool duplicate = false;
605             /* skip leading spaces */
606             while(*pos == ' ') {
607                 pos++;
608             }
609             if (!*pos) { /* handle trailing "; " */
610                 break;
611             }
612             if(numKeywords == maxKeywords) {
613                 *status = U_INTERNAL_PROGRAM_ERROR;
614                 return;
615             }
616             equalSign = uprv_strchr(pos, '=');
617             semicolon = uprv_strchr(pos, ';');
618             /* lack of '=' [foo@currency] is illegal */
619             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
620             if(!equalSign || (semicolon && semicolon<equalSign)) {
621                 *status = U_INVALID_FORMAT_ERROR;
622                 return;
623             }
624             /* need to normalize both keyword and keyword name */
625             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
626                 /* keyword name too long for internal buffer */
627                 *status = U_INTERNAL_PROGRAM_ERROR;
628                 return;
629             }
630             for(i = 0, n = 0; i < equalSign - pos; ++i) {
631                 if (pos[i] != ' ') {
632                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
633                 }
634             }
635 
636             /* zero-length keyword is an error. */
637             if (n == 0) {
638                 *status = U_INVALID_FORMAT_ERROR;
639                 return;
640             }
641 
642             keywordList[numKeywords].keyword[n] = 0;
643             keywordList[numKeywords].keywordLen = n;
644             /* now grab the value part. First we skip the '=' */
645             equalSign++;
646             /* then we leading spaces */
647             while(*equalSign == ' ') {
648                 equalSign++;
649             }
650 
651             /* Premature end or zero-length value */
652             if (!*equalSign || equalSign == semicolon) {
653                 *status = U_INVALID_FORMAT_ERROR;
654                 return;
655             }
656 
657             keywordList[numKeywords].valueStart = equalSign;
658 
659             pos = semicolon;
660             i = 0;
661             if(pos) {
662                 while(*(pos - i - 1) == ' ') {
663                     i++;
664                 }
665                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
666                 pos++;
667             } else {
668                 i = (int32_t)uprv_strlen(equalSign);
669                 while(i && equalSign[i-1] == ' ') {
670                     i--;
671                 }
672                 keywordList[numKeywords].valueLen = i;
673             }
674             /* If this is a duplicate keyword, then ignore it */
675             for (j=0; j<numKeywords; ++j) {
676                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
677                     duplicate = true;
678                     break;
679                 }
680             }
681             if (!duplicate) {
682                 ++numKeywords;
683             }
684         } while(pos);
685 
686         /* now we have a list of keywords */
687         /* we need to sort it */
688         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, status);
689 
690         /* Now construct the keyword part */
691         for(i = 0; i < numKeywords; i++) {
692             sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
693             if(valuesToo) {
694                 sink.Append("=", 1);
695                 sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
696                 if(i < numKeywords - 1) {
697                     sink.Append(";", 1);
698                 }
699             } else {
700                 sink.Append("\0", 1);
701             }
702         }
703     }
704 }
705 
706 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)707 uloc_getKeywordValue(const char* localeID,
708                      const char* keywordName,
709                      char* buffer, int32_t bufferCapacity,
710                      UErrorCode* status)
711 {
712     if (U_FAILURE(*status)) {
713         return 0;
714     }
715 
716     CheckedArrayByteSink sink(buffer, bufferCapacity);
717     ulocimp_getKeywordValue(localeID, keywordName, sink, status);
718 
719     int32_t reslen = sink.NumberOfBytesAppended();
720 
721     if (U_FAILURE(*status)) {
722         return reslen;
723     }
724 
725     if (sink.Overflowed()) {
726         *status = U_BUFFER_OVERFLOW_ERROR;
727     } else {
728         u_terminateChars(buffer, bufferCapacity, reslen, status);
729     }
730 
731     return reslen;
732 }
733 
734 U_CAPI void U_EXPORT2
ulocimp_getKeywordValue(const char * localeID,const char * keywordName,icu::ByteSink & sink,UErrorCode * status)735 ulocimp_getKeywordValue(const char* localeID,
736                         const char* keywordName,
737                         icu::ByteSink& sink,
738                         UErrorCode* status)
739 {
740     const char* startSearchHere = nullptr;
741     const char* nextSeparator = nullptr;
742     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
743     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
744 
745     if(status && U_SUCCESS(*status) && localeID) {
746       CharString tempBuffer;
747       const char* tmpLocaleID;
748 
749       if (keywordName == nullptr || keywordName[0] == 0) {
750         *status = U_ILLEGAL_ARGUMENT_ERROR;
751         return;
752       }
753 
754       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
755       if(U_FAILURE(*status)) {
756         return;
757       }
758 
759       if (_hasBCP47Extension(localeID)) {
760         CharStringByteSink sink(&tempBuffer);
761         ulocimp_forLanguageTag(localeID, -1, sink, nullptr, status);
762         tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
763       } else {
764           tmpLocaleID=localeID;
765       }
766 
767       startSearchHere = locale_getKeywordsStart(tmpLocaleID);
768       if(startSearchHere == nullptr) {
769           /* no keywords, return at once */
770           return;
771       }
772 
773       /* find the first keyword */
774       while(startSearchHere) {
775           const char* keyValueTail;
776           int32_t keyValueLen;
777 
778           startSearchHere++; /* skip @ or ; */
779           nextSeparator = uprv_strchr(startSearchHere, '=');
780           if(!nextSeparator) {
781               *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
782               return;
783           }
784           /* strip leading & trailing spaces (TC decided to tolerate these) */
785           while(*startSearchHere == ' ') {
786               startSearchHere++;
787           }
788           keyValueTail = nextSeparator;
789           while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
790               keyValueTail--;
791           }
792           /* now keyValueTail points to first char after the keyName */
793           /* copy & normalize keyName from locale */
794           if (startSearchHere == keyValueTail) {
795               *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
796               return;
797           }
798           keyValueLen = 0;
799           while (startSearchHere < keyValueTail) {
800             if (!UPRV_ISALPHANUM(*startSearchHere)) {
801               *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
802               return;
803             }
804             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
805               localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
806             } else {
807               /* keyword name too long for internal buffer */
808               *status = U_INTERNAL_PROGRAM_ERROR;
809               return;
810             }
811           }
812           localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
813 
814           startSearchHere = uprv_strchr(nextSeparator, ';');
815 
816           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
817                /* current entry matches the keyword. */
818              nextSeparator++; /* skip '=' */
819               /* First strip leading & trailing spaces (TC decided to tolerate these) */
820               while(*nextSeparator == ' ') {
821                 nextSeparator++;
822               }
823               keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
824               while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
825                 keyValueTail--;
826               }
827               /* Now copy the value, but check well-formedness */
828               if (nextSeparator == keyValueTail) {
829                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
830                 return;
831               }
832               while (nextSeparator < keyValueTail) {
833                 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
834                   *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
835                   return;
836                 }
837                 /* Should we lowercase value to return here? Tests expect as-is. */
838                 sink.Append(nextSeparator++, 1);
839               }
840               return;
841           }
842       }
843     }
844 }
845 
846 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)847 uloc_setKeywordValue(const char* keywordName,
848                      const char* keywordValue,
849                      char* buffer, int32_t bufferCapacity,
850                      UErrorCode* status)
851 {
852     /* TODO: sorting. removal. */
853     int32_t keywordNameLen;
854     int32_t keywordValueLen;
855     int32_t bufLen;
856     int32_t needLen = 0;
857     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
858     char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
859     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
860     int32_t rc;
861     char* nextSeparator = nullptr;
862     char* nextEqualsign = nullptr;
863     char* startSearchHere = nullptr;
864     char* keywordStart = nullptr;
865     CharString updatedKeysAndValues;
866     UBool handledInputKeyAndValue = false;
867     char keyValuePrefix = '@';
868 
869     if(U_FAILURE(*status)) {
870         return -1;
871     }
872     if (*status == U_STRING_NOT_TERMINATED_WARNING) {
873         *status = U_ZERO_ERROR;
874     }
875     if (keywordName == nullptr || keywordName[0] == 0 || bufferCapacity <= 1) {
876         *status = U_ILLEGAL_ARGUMENT_ERROR;
877         return 0;
878     }
879     bufLen = (int32_t)uprv_strlen(buffer);
880     if(bufferCapacity<bufLen) {
881         /* The capacity is less than the length?! Is this NUL terminated? */
882         *status = U_ILLEGAL_ARGUMENT_ERROR;
883         return 0;
884     }
885     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
886     if(U_FAILURE(*status)) {
887         return 0;
888     }
889 
890     keywordValueLen = 0;
891     if(keywordValue) {
892         while (*keywordValue != 0) {
893             if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
894                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
895                 return 0;
896             }
897             if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
898                 /* Should we force lowercase in value to set? */
899                 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
900             } else {
901                 /* keywordValue too long for internal buffer */
902                 *status = U_INTERNAL_PROGRAM_ERROR;
903                 return 0;
904             }
905         }
906     }
907     keywordValueBuffer[keywordValueLen] = 0; /* terminate */
908 
909     startSearchHere = (char*)locale_getKeywordsStart(buffer);
910     if(startSearchHere == nullptr || (startSearchHere[1]==0)) {
911         if(keywordValueLen == 0) { /* no keywords = nothing to remove */
912             U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
913             return bufLen;
914         }
915 
916         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
917         if(startSearchHere) { /* had a single @ */
918             needLen--; /* already had the @ */
919             /* startSearchHere points at the @ */
920         } else {
921             startSearchHere=buffer+bufLen;
922         }
923         if(needLen >= bufferCapacity) {
924             *status = U_BUFFER_OVERFLOW_ERROR;
925             return needLen; /* no change */
926         }
927         *startSearchHere++ = '@';
928         uprv_strcpy(startSearchHere, keywordNameBuffer);
929         startSearchHere += keywordNameLen;
930         *startSearchHere++ = '=';
931         uprv_strcpy(startSearchHere, keywordValueBuffer);
932         U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
933         return needLen;
934     } /* end shortcut - no @ */
935 
936     keywordStart = startSearchHere;
937     /* search for keyword */
938     while(keywordStart) {
939         const char* keyValueTail;
940         int32_t keyValueLen;
941 
942         keywordStart++; /* skip @ or ; */
943         nextEqualsign = uprv_strchr(keywordStart, '=');
944         if (!nextEqualsign) {
945             *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
946             return 0;
947         }
948         /* strip leading & trailing spaces (TC decided to tolerate these) */
949         while(*keywordStart == ' ') {
950             keywordStart++;
951         }
952         keyValueTail = nextEqualsign;
953         while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
954             keyValueTail--;
955         }
956         /* now keyValueTail points to first char after the keyName */
957         /* copy & normalize keyName from locale */
958         if (keywordStart == keyValueTail) {
959             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
960             return 0;
961         }
962         keyValueLen = 0;
963         while (keywordStart < keyValueTail) {
964             if (!UPRV_ISALPHANUM(*keywordStart)) {
965                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
966                 return 0;
967             }
968             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
969                 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
970             } else {
971                 /* keyword name too long for internal buffer */
972                 *status = U_INTERNAL_PROGRAM_ERROR;
973                 return 0;
974             }
975         }
976         localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
977 
978         nextSeparator = uprv_strchr(nextEqualsign, ';');
979 
980         /* start processing the value part */
981         nextEqualsign++; /* skip '=' */
982         /* First strip leading & trailing spaces (TC decided to tolerate these) */
983         while(*nextEqualsign == ' ') {
984             nextEqualsign++;
985         }
986         keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
987         while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
988             keyValueTail--;
989         }
990         if (nextEqualsign == keyValueTail) {
991             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
992             return 0;
993         }
994 
995         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
996         if(rc == 0) {
997             /* Current entry matches the input keyword. Update the entry */
998             if(keywordValueLen > 0) { /* updating a value */
999                 updatedKeysAndValues.append(keyValuePrefix, *status);
1000                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1001                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1002                 updatedKeysAndValues.append('=', *status);
1003                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1004             } /* else removing this entry, don't emit anything */
1005             handledInputKeyAndValue = true;
1006         } else {
1007            /* input keyword sorts earlier than current entry, add before current entry */
1008             if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1009                 /* insert new entry at this location */
1010                 updatedKeysAndValues.append(keyValuePrefix, *status);
1011                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1012                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1013                 updatedKeysAndValues.append('=', *status);
1014                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1015                 handledInputKeyAndValue = true;
1016             }
1017             /* copy the current entry */
1018             updatedKeysAndValues.append(keyValuePrefix, *status);
1019             keyValuePrefix = ';'; /* for any subsequent key-value pair */
1020             updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1021             updatedKeysAndValues.append('=', *status);
1022             updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1023         }
1024         if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1025             /* append new entry at the end, it sorts later than existing entries */
1026             updatedKeysAndValues.append(keyValuePrefix, *status);
1027             /* skip keyValuePrefix update, no subsequent key-value pair */
1028             updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1029             updatedKeysAndValues.append('=', *status);
1030             updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1031             handledInputKeyAndValue = true;
1032         }
1033         keywordStart = nextSeparator;
1034     } /* end loop searching */
1035 
1036     /* Any error from updatedKeysAndValues.append above would be internal and not due to
1037      * problems with the passed-in locale. So if we did encounter problems with the
1038      * passed-in locale above, those errors took precedence and overrode any error
1039      * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1040      * are errors here they are from updatedKeysAndValues.append; they do cause an
1041      * error return but the passed-in locale is unmodified and the original bufLen is
1042      * returned.
1043      */
1044     if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1045         /* if input key/value specified removal of a keyword not present in locale, or
1046          * there was an error in CharString.append, leave original locale alone. */
1047         U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
1048         return bufLen;
1049     }
1050 
1051     // needLen = length of the part before '@'
1052     needLen = (int32_t)(startSearchHere - buffer);
1053     // Check to see can we fit the startSearchHere, if not, return
1054     // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it.
1055     // We do this because this API function does not behave like most others:
1056     // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
1057     // When the contents fits but without the terminating NUL, in this case we need to not change
1058     // the buffer contents and return with a buffer overflow error.
1059     int32_t appendLength = updatedKeysAndValues.length();
1060     if (appendLength >= bufferCapacity - needLen) {
1061         *status = U_BUFFER_OVERFLOW_ERROR;
1062         return needLen + appendLength;
1063     }
1064     needLen += updatedKeysAndValues.extract(
1065                          startSearchHere, bufferCapacity - needLen, *status);
1066     U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
1067     return needLen;
1068 }
1069 
1070 /* ### ID parsing implementation **************************************************/
1071 
/* True if 'a' is one of the letters that can start a special prefix:
 * 'x', 'X', 'i' or 'I'.
 * The argument is parenthesized at every use so that an expression argument
 * (e.g. one containing ?:) is compared as a whole; note it is still evaluated
 * more than once, so do not pass an expression with side effects. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns true if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 *
 * True if 'a' is NUL, '.' or '@'. The argument is evaluated more than once.
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1082 
1083 /**
1084  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1085  * a nullptr entry, followed by more entries, and a second nullptr entry.
1086  *
1087  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1088  * COUNTRIES_3.
1089  */
_findIndex(const char * const * list,const char * key)1090 static int16_t _findIndex(const char* const* list, const char* key)
1091 {
1092     const char* const* anchor = list;
1093     int32_t pass = 0;
1094 
1095     /* Make two passes through two nullptr-terminated arrays at 'list' */
1096     while (pass++ < 2) {
1097         while (*list) {
1098             if (uprv_strcmp(key, *list) == 0) {
1099                 return (int16_t)(list - anchor);
1100             }
1101             list++;
1102         }
1103         ++list;     /* skip final nullptr *CWB*/
1104     }
1105     return -1;
1106 }
1107 
1108 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1109 uloc_getCurrentCountryID(const char* oldID){
1110     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1111     if (offset >= 0) {
1112         return REPLACEMENT_COUNTRIES[offset];
1113     }
1114     return oldID;
1115 }
1116 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1117 uloc_getCurrentLanguageID(const char* oldID){
1118     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1119     if (offset >= 0) {
1120         return REPLACEMENT_LANGUAGES[offset];
1121     }
1122     return oldID;
1123 }
1124 /*
1125  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1126  * avoid duplicating code to handle the earlier locale ID pieces
1127  * in the functions for the later ones by
1128  * setting the *pEnd pointer to where they stopped parsing
1129  *
1130  * TODO try to use this in Locale
1131  */
1132 CharString U_EXPORT2
ulocimp_getLanguage(const char * localeID,const char ** pEnd,UErrorCode & status)1133 ulocimp_getLanguage(const char *localeID,
1134                     const char **pEnd,
1135                     UErrorCode &status) {
1136     CharString result;
1137 
1138     if (uprv_stricmp(localeID, "root") == 0) {
1139         localeID += 4;
1140     } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1141                (localeID[3] == '\0' ||
1142                 localeID[3] == '-' ||
1143                 localeID[3] == '_' ||
1144                 localeID[3] == '@')) {
1145         localeID += 3;
1146     }
1147 
1148     /* if it starts with i- or x- then copy that prefix */
1149     if(_isIDPrefix(localeID)) {
1150         result.append((char)uprv_tolower(*localeID), status);
1151         result.append('-', status);
1152         localeID+=2;
1153     }
1154 
1155     /* copy the language as far as possible and count its length */
1156     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1157         result.append((char)uprv_tolower(*localeID), status);
1158         localeID++;
1159     }
1160 
1161     if(result.length()==3) {
1162         /* convert 3 character code to 2 character code if possible *CWB*/
1163         int32_t offset = _findIndex(LANGUAGES_3, result.data());
1164         if(offset>=0) {
1165             result.clear();
1166             result.append(LANGUAGES[offset], status);
1167         }
1168     }
1169 
1170     if(pEnd!=nullptr) {
1171         *pEnd=localeID;
1172     }
1173 
1174     return result;
1175 }
1176 
1177 CharString U_EXPORT2
ulocimp_getScript(const char * localeID,const char ** pEnd,UErrorCode & status)1178 ulocimp_getScript(const char *localeID,
1179                   const char **pEnd,
1180                   UErrorCode &status) {
1181     CharString result;
1182     int32_t idLen = 0;
1183 
1184     if (pEnd != nullptr) {
1185         *pEnd = localeID;
1186     }
1187 
1188     /* copy the second item as far as possible and count its length */
1189     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1190             && uprv_isASCIILetter(localeID[idLen])) {
1191         idLen++;
1192     }
1193 
1194     /* If it's exactly 4 characters long, then it's a script and not a country. */
1195     if (idLen == 4) {
1196         int32_t i;
1197         if (pEnd != nullptr) {
1198             *pEnd = localeID+idLen;
1199         }
1200         if (idLen >= 1) {
1201             result.append((char)uprv_toupper(*(localeID++)), status);
1202         }
1203         for (i = 1; i < idLen; i++) {
1204             result.append((char)uprv_tolower(*(localeID++)), status);
1205         }
1206     }
1207 
1208     return result;
1209 }
1210 
1211 CharString U_EXPORT2
ulocimp_getCountry(const char * localeID,const char ** pEnd,UErrorCode & status)1212 ulocimp_getCountry(const char *localeID,
1213                    const char **pEnd,
1214                    UErrorCode &status) {
1215     CharString result;
1216     int32_t idLen=0;
1217 
1218     /* copy the country as far as possible and count its length */
1219     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1220         result.append((char)uprv_toupper(localeID[idLen]), status);
1221         idLen++;
1222     }
1223 
1224     /* the country should be either length 2 or 3 */
1225     if (idLen == 2 || idLen == 3) {
1226         /* convert 3 character code to 2 character code if possible *CWB*/
1227         if(idLen==3) {
1228             int32_t offset = _findIndex(COUNTRIES_3, result.data());
1229             if(offset>=0) {
1230                 result.clear();
1231                 result.append(COUNTRIES[offset], status);
1232             }
1233         }
1234         localeID+=idLen;
1235     } else {
1236         result.clear();
1237     }
1238 
1239     if(pEnd!=nullptr) {
1240         *pEnd=localeID;
1241     }
1242 
1243     return result;
1244 }
1245 
1246 /**
1247  * @param needSeparator if true, then add leading '_' if any variants
1248  * are added to 'variant'
1249  */
1250 static void
_getVariant(const char * localeID,char prev,ByteSink & sink,UBool needSeparator)1251 _getVariant(const char *localeID,
1252             char prev,
1253             ByteSink& sink,
1254             UBool needSeparator) {
1255     UBool hasVariant = false;
1256 
1257     /* get one or more variant tags and separate them with '_' */
1258     if(_isIDSeparator(prev)) {
1259         /* get a variant string after a '-' or '_' */
1260         while(!_isTerminator(*localeID)) {
1261             if (needSeparator) {
1262                 sink.Append("_", 1);
1263                 needSeparator = false;
1264             }
1265             char c = (char)uprv_toupper(*localeID);
1266             if (c == '-') c = '_';
1267             sink.Append(&c, 1);
1268             hasVariant = true;
1269             localeID++;
1270         }
1271     }
1272 
1273     /* if there is no variant tag after a '-' or '_' then look for '@' */
1274     if(!hasVariant) {
1275         if(prev=='@') {
1276             /* keep localeID */
1277         } else if((localeID=locale_getKeywordsStart(localeID))!=nullptr) {
1278             ++localeID; /* point after the '@' */
1279         } else {
1280             return;
1281         }
1282         while(!_isTerminator(*localeID)) {
1283             if (needSeparator) {
1284                 sink.Append("_", 1);
1285                 needSeparator = false;
1286             }
1287             char c = (char)uprv_toupper(*localeID);
1288             if (c == '-' || c == ',') c = '_';
1289             sink.Append(&c, 1);
1290             localeID++;
1291         }
1292     }
1293 }
1294 
1295 /* Keyword enumeration */
1296 
/* Per-enumeration state stored in UEnumeration::context by uloc_openKeywordList(). */
typedef struct UKeywordsContext {
    /* Heap copy of the keyword list: NUL-terminated names laid end to end,
     * with one extra NUL after the last name. Freed by uloc_kw_closeKeywords(). */
    char *keywords;
    /* Name that uloc_kw_nextKeyword() will return next; always points into 'keywords'. */
    char *current;
} UKeywordsContext;
1301 
1302 U_CDECL_BEGIN
1303 
1304 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1305 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1306     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1307     uprv_free(enumerator->context);
1308     uprv_free(enumerator);
1309 }
1310 
1311 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1312 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1313     char *kw = ((UKeywordsContext *)en->context)->keywords;
1314     int32_t result = 0;
1315     while(*kw) {
1316         result++;
1317         kw += uprv_strlen(kw)+1;
1318     }
1319     return result;
1320 }
1321 
1322 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1323 uloc_kw_nextKeyword(UEnumeration* en,
1324                     int32_t* resultLength,
1325                     UErrorCode* /*status*/) {
1326     const char* result = ((UKeywordsContext *)en->context)->current;
1327     int32_t len = 0;
1328     if(*result) {
1329         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1330         ((UKeywordsContext *)en->context)->current += len+1;
1331     } else {
1332         result = nullptr;
1333     }
1334     if (resultLength) {
1335         *resultLength = len;
1336     }
1337     return result;
1338 }
1339 
1340 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1341 uloc_kw_resetKeywords(UEnumeration* en,
1342                       UErrorCode* /*status*/) {
1343     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1344 }
1345 
1346 U_CDECL_END
1347 
1348 
1349 static const UEnumeration gKeywordsEnum = {
1350     nullptr,
1351     nullptr,
1352     uloc_kw_closeKeywords,
1353     uloc_kw_countKeywords,
1354     uenum_unextDefault,
1355     uloc_kw_nextKeyword,
1356     uloc_kw_resetKeywords
1357 };
1358 
1359 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1360 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1361 {
1362     LocalMemory<UKeywordsContext> myContext;
1363     LocalMemory<UEnumeration> result;
1364 
1365     if (U_FAILURE(*status)) {
1366         return nullptr;
1367     }
1368     myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1369     result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1370     if (myContext.isNull() || result.isNull()) {
1371         *status = U_MEMORY_ALLOCATION_ERROR;
1372         return nullptr;
1373     }
1374     uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1375     myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1376     if (myContext->keywords == nullptr) {
1377         *status = U_MEMORY_ALLOCATION_ERROR;
1378         return nullptr;
1379     }
1380     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1381     myContext->keywords[keywordListSize] = 0;
1382     myContext->current = myContext->keywords;
1383     result->context = myContext.orphan();
1384     return result.orphan();
1385 }
1386 
1387 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1388 uloc_openKeywords(const char* localeID,
1389                         UErrorCode* status)
1390 {
1391     CharString tempBuffer;
1392     const char* tmpLocaleID;
1393 
1394     if(status==nullptr || U_FAILURE(*status)) {
1395         return 0;
1396     }
1397 
1398     if (_hasBCP47Extension(localeID)) {
1399         CharStringByteSink sink(&tempBuffer);
1400         ulocimp_forLanguageTag(localeID, -1, sink, nullptr, status);
1401         tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
1402     } else {
1403         if (localeID==nullptr) {
1404             localeID=uloc_getDefault();
1405         }
1406         tmpLocaleID=localeID;
1407     }
1408 
1409     /* Skip the language */
1410     ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *status);
1411     if (U_FAILURE(*status)) {
1412         return 0;
1413     }
1414 
1415     if(_isIDSeparator(*tmpLocaleID)) {
1416         const char *scriptID;
1417         /* Skip the script if available */
1418         ulocimp_getScript(tmpLocaleID+1, &scriptID, *status);
1419         if (U_FAILURE(*status)) {
1420             return 0;
1421         }
1422         if(scriptID != tmpLocaleID+1) {
1423             /* Found optional script */
1424             tmpLocaleID = scriptID;
1425         }
1426         /* Skip the Country */
1427         if (_isIDSeparator(*tmpLocaleID)) {
1428             ulocimp_getCountry(tmpLocaleID+1, &tmpLocaleID, *status);
1429             if (U_FAILURE(*status)) {
1430                 return 0;
1431             }
1432         }
1433     }
1434 
1435     /* keywords are located after '@' */
1436     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != nullptr) {
1437         CharString keywords;
1438         CharStringByteSink sink(&keywords);
1439         ulocimp_getKeywords(tmpLocaleID+1, '@', sink, false, status);
1440         if (U_FAILURE(*status)) {
1441             return nullptr;
1442         }
1443         return uloc_openKeywordList(keywords.data(), keywords.length(), status);
1444     }
1445     return nullptr;
1446 }
1447 
1448 
/* Bit flags for the 'options' parameter of _canonicalize. */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* Evaluates to true when at least one bit of 'mask' is set in 'options'.
   Both arguments are parenthesized so that compound expressions such as
   OPTION_SET(a | b, m) expand with the intended precedence. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The characters of "i-default", stored without a terminating NUL so that the
   array length equals the string length. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1457 
1458 /**
1459  * Canonicalize the given localeID, to level 1 or to level 2,
1460  * depending on the options.  To specify level 1, pass in options=0.
1461  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1462  *
1463  * This is the code underlying uloc_getName and uloc_canonicalize.
1464  */
1465 static void
_canonicalize(const char * localeID,ByteSink & sink,uint32_t options,UErrorCode * err)1466 _canonicalize(const char* localeID,
1467               ByteSink& sink,
1468               uint32_t options,
1469               UErrorCode* err) {
1470     if (U_FAILURE(*err)) {
1471         return;
1472     }
1473 
1474     int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
1475     CharString tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
1476     CharString localeIDWithHyphens;  // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
1477     const char* origLocaleID;
1478     const char* tmpLocaleID;
1479     const char* keywordAssign = nullptr;
1480     const char* separatorIndicator = nullptr;
1481 
1482     if (_hasBCP47Extension(localeID)) {
1483         const char* localeIDPtr = localeID;
1484 
1485         // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
1486         if (uprv_strchr(localeID, '_') != nullptr && localeID[1] != '-' && localeID[1] != '_') {
1487             localeIDWithHyphens.append(localeID, -1, *err);
1488             if (U_SUCCESS(*err)) {
1489                 for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
1490                     if (*p == '_') {
1491                         *p = '-';
1492                     }
1493                 }
1494                 localeIDPtr = localeIDWithHyphens.data();
1495             }
1496         }
1497 
1498         CharStringByteSink tempSink(&tempBuffer);
1499         ulocimp_forLanguageTag(localeIDPtr, -1, tempSink, nullptr, err);
1500         tmpLocaleID = U_SUCCESS(*err) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeIDPtr;
1501     } else {
1502         if (localeID==nullptr) {
1503            localeID=uloc_getDefault();
1504         }
1505         tmpLocaleID=localeID;
1506     }
1507 
1508     origLocaleID=tmpLocaleID;
1509 
1510     /* get all pieces, one after another, and separate with '_' */
1511     CharString tag = ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1512 
1513     if (tag.length() == I_DEFAULT_LENGTH &&
1514             uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
1515         tag.clear();
1516         tag.append(uloc_getDefault(), *err);
1517     } else if(_isIDSeparator(*tmpLocaleID)) {
1518         const char *scriptID;
1519 
1520         ++fieldCount;
1521         tag.append('_', *err);
1522 
1523         CharString script = ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
1524         tag.append(script, *err);
1525         scriptSize = script.length();
1526         if(scriptSize > 0) {
1527             /* Found optional script */
1528             tmpLocaleID = scriptID;
1529             ++fieldCount;
1530             if (_isIDSeparator(*tmpLocaleID)) {
1531                 /* If there is something else, then we add the _ */
1532                 tag.append('_', *err);
1533             }
1534         }
1535 
1536         if (_isIDSeparator(*tmpLocaleID)) {
1537             const char *cntryID;
1538 
1539             CharString country = ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
1540             tag.append(country, *err);
1541             if (!country.isEmpty()) {
1542                 /* Found optional country */
1543                 tmpLocaleID = cntryID;
1544             }
1545             if(_isIDSeparator(*tmpLocaleID)) {
1546                 /* If there is something else, then we add the _  if we found country before. */
1547                 if (!_isIDSeparator(*(tmpLocaleID+1))) {
1548                     ++fieldCount;
1549                     tag.append('_', *err);
1550                 }
1551 
1552                 variantSize = -tag.length();
1553                 {
1554                     CharStringByteSink s(&tag);
1555                     _getVariant(tmpLocaleID+1, *tmpLocaleID, s, false);
1556                 }
1557                 variantSize += tag.length();
1558                 if (variantSize > 0) {
1559                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1560                 }
1561             }
1562         }
1563     }
1564 
1565     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1566     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1567         UBool done = false;
1568         do {
1569             char c = *tmpLocaleID;
1570             switch (c) {
1571             case 0:
1572             case '@':
1573                 done = true;
1574                 break;
1575             default:
1576                 tag.append(c, *err);
1577                 ++tmpLocaleID;
1578                 break;
1579             }
1580         } while (!done);
1581     }
1582 
1583     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1584        After this, tmpLocaleID either points to '@' or is nullptr */
1585     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=nullptr) {
1586         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1587         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1588     }
1589 
1590     /* Copy POSIX-style variant, if any [mr@FOO] */
1591     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1592         tmpLocaleID != nullptr && keywordAssign == nullptr) {
1593         for (;;) {
1594             char c = *tmpLocaleID;
1595             if (c == 0) {
1596                 break;
1597             }
1598             tag.append(c, *err);
1599             ++tmpLocaleID;
1600         }
1601     }
1602 
1603     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1604         /* Handle @FOO variant if @ is present and not followed by = */
1605         if (tmpLocaleID!=nullptr && keywordAssign==nullptr) {
1606             /* Add missing '_' if needed */
1607             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1608                 do {
1609                     tag.append('_', *err);
1610                     ++fieldCount;
1611                 } while(fieldCount<2);
1612             }
1613 
1614             int32_t posixVariantSize = -tag.length();
1615             {
1616                 CharStringByteSink s(&tag);
1617                 _getVariant(tmpLocaleID+1, '@', s, (UBool)(variantSize > 0));
1618             }
1619             posixVariantSize += tag.length();
1620             if (posixVariantSize > 0) {
1621                 variantSize += posixVariantSize;
1622             }
1623         }
1624 
1625         /* Look up the ID in the canonicalization map */
1626         for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1627             StringPiece id(CANONICALIZE_MAP[j].id);
1628             if (tag == id) {
1629                 if (id.empty() && tmpLocaleID != nullptr) {
1630                     break; /* Don't remap "" if keywords present */
1631                 }
1632                 tag.clear();
1633                 tag.append(CANONICALIZE_MAP[j].canonicalID, *err);
1634                 break;
1635             }
1636         }
1637     }
1638 
1639     sink.Append(tag.data(), tag.length());
1640 
1641     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1642         if (tmpLocaleID!=nullptr && keywordAssign!=nullptr &&
1643             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1644             sink.Append("@", 1);
1645             ++fieldCount;
1646             ulocimp_getKeywords(tmpLocaleID+1, '@', sink, true, err);
1647         }
1648     }
1649 }
1650 
1651 /* ### ID parsing API **************************************************/
1652 
1653 U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1654 uloc_getParent(const char*    localeID,
1655                char* parent,
1656                int32_t parentCapacity,
1657                UErrorCode* err)
1658 {
1659     if (U_FAILURE(*err)) {
1660         return 0;
1661     }
1662 
1663     CheckedArrayByteSink sink(parent, parentCapacity);
1664     ulocimp_getParent(localeID, sink, err);
1665 
1666     int32_t reslen = sink.NumberOfBytesAppended();
1667 
1668     if (U_FAILURE(*err)) {
1669         return reslen;
1670     }
1671 
1672     if (sink.Overflowed()) {
1673         *err = U_BUFFER_OVERFLOW_ERROR;
1674     } else {
1675         u_terminateChars(parent, parentCapacity, reslen, err);
1676     }
1677 
1678     return reslen;
1679 }
1680 
1681 U_CAPI void U_EXPORT2
ulocimp_getParent(const char * localeID,icu::ByteSink & sink,UErrorCode * err)1682 ulocimp_getParent(const char* localeID,
1683                   icu::ByteSink& sink,
1684                   UErrorCode* err)
1685 {
1686     const char *lastUnderscore;
1687     int32_t i;
1688 
1689     if (U_FAILURE(*err))
1690         return;
1691 
1692     if (localeID == nullptr)
1693         localeID = uloc_getDefault();
1694 
1695     lastUnderscore=uprv_strrchr(localeID, '_');
1696     if(lastUnderscore!=nullptr) {
1697         i=(int32_t)(lastUnderscore-localeID);
1698     } else {
1699         i=0;
1700     }
1701 
1702     if (i > 0) {
1703         if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1704             localeID += 3;
1705             i -= 3;
1706         }
1707         sink.Append(localeID, i);
1708     }
1709 }
1710 
1711 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1712 uloc_getLanguage(const char*    localeID,
1713          char* language,
1714          int32_t languageCapacity,
1715          UErrorCode* err)
1716 {
1717     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1718 
1719     if (err==nullptr || U_FAILURE(*err)) {
1720         return 0;
1721     }
1722 
1723     if(localeID==nullptr) {
1724         localeID=uloc_getDefault();
1725     }
1726 
1727     return ulocimp_getLanguage(localeID, nullptr, *err).extract(language, languageCapacity, *err);
1728 }
1729 
1730 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1731 uloc_getScript(const char*    localeID,
1732          char* script,
1733          int32_t scriptCapacity,
1734          UErrorCode* err)
1735 {
1736     if(err==nullptr || U_FAILURE(*err)) {
1737         return 0;
1738     }
1739 
1740     if(localeID==nullptr) {
1741         localeID=uloc_getDefault();
1742     }
1743 
1744     /* skip the language */
1745     ulocimp_getLanguage(localeID, &localeID, *err);
1746     if (U_FAILURE(*err)) {
1747         return 0;
1748     }
1749 
1750     if(_isIDSeparator(*localeID)) {
1751         return ulocimp_getScript(localeID+1, nullptr, *err).extract(script, scriptCapacity, *err);
1752     }
1753     return u_terminateChars(script, scriptCapacity, 0, err);
1754 }
1755 
1756 U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1757 uloc_getCountry(const char* localeID,
1758             char* country,
1759             int32_t countryCapacity,
1760             UErrorCode* err)
1761 {
1762     if(err==nullptr || U_FAILURE(*err)) {
1763         return 0;
1764     }
1765 
1766     if(localeID==nullptr) {
1767         localeID=uloc_getDefault();
1768     }
1769 
1770     /* Skip the language */
1771     ulocimp_getLanguage(localeID, &localeID, *err);
1772     if (U_FAILURE(*err)) {
1773         return 0;
1774     }
1775 
1776     if(_isIDSeparator(*localeID)) {
1777         const char *scriptID;
1778         /* Skip the script if available */
1779         ulocimp_getScript(localeID+1, &scriptID, *err);
1780         if (U_FAILURE(*err)) {
1781             return 0;
1782         }
1783         if(scriptID != localeID+1) {
1784             /* Found optional script */
1785             localeID = scriptID;
1786         }
1787         if(_isIDSeparator(*localeID)) {
1788             return ulocimp_getCountry(localeID+1, nullptr, *err).extract(country, countryCapacity, *err);
1789         }
1790     }
1791     return u_terminateChars(country, countryCapacity, 0, err);
1792 }
1793 
1794 U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1795 uloc_getVariant(const char* localeID,
1796                 char* variant,
1797                 int32_t variantCapacity,
1798                 UErrorCode* err)
1799 {
1800     CharString tempBuffer;
1801     const char* tmpLocaleID;
1802     int32_t i=0;
1803 
1804     if(err==nullptr || U_FAILURE(*err)) {
1805         return 0;
1806     }
1807 
1808     if (_hasBCP47Extension(localeID)) {
1809         CharStringByteSink sink(&tempBuffer);
1810         ulocimp_forLanguageTag(localeID, -1, sink, nullptr, err);
1811         tmpLocaleID = U_SUCCESS(*err) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
1812     } else {
1813         if (localeID==nullptr) {
1814            localeID=uloc_getDefault();
1815         }
1816         tmpLocaleID=localeID;
1817     }
1818 
1819     /* Skip the language */
1820     ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1821     if (U_FAILURE(*err)) {
1822         return 0;
1823     }
1824 
1825     if(_isIDSeparator(*tmpLocaleID)) {
1826         const char *scriptID;
1827         /* Skip the script if available */
1828         ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
1829         if (U_FAILURE(*err)) {
1830             return 0;
1831         }
1832         if(scriptID != tmpLocaleID+1) {
1833             /* Found optional script */
1834             tmpLocaleID = scriptID;
1835         }
1836         /* Skip the Country */
1837         if (_isIDSeparator(*tmpLocaleID)) {
1838             const char *cntryID;
1839             ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
1840             if (U_FAILURE(*err)) {
1841                 return 0;
1842             }
1843             if (cntryID != tmpLocaleID+1) {
1844                 /* Found optional country */
1845                 tmpLocaleID = cntryID;
1846             }
1847             if(_isIDSeparator(*tmpLocaleID)) {
1848                 /* If there was no country ID, skip a possible extra IDSeparator */
1849                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
1850                     tmpLocaleID++;
1851                 }
1852 
1853                 CheckedArrayByteSink sink(variant, variantCapacity);
1854                 _getVariant(tmpLocaleID+1, *tmpLocaleID, sink, false);
1855 
1856                 i = sink.NumberOfBytesAppended();
1857 
1858                 if (U_FAILURE(*err)) {
1859                     return i;
1860                 }
1861 
1862                 if (sink.Overflowed()) {
1863                     *err = U_BUFFER_OVERFLOW_ERROR;
1864                     return i;
1865                 }
1866             }
1867         }
1868     }
1869 
1870     return u_terminateChars(variant, variantCapacity, i, err);
1871 }
1872 
1873 U_CAPI int32_t  U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1874 uloc_getName(const char* localeID,
1875              char* name,
1876              int32_t nameCapacity,
1877              UErrorCode* err)
1878 {
1879     if (U_FAILURE(*err)) {
1880         return 0;
1881     }
1882 
1883     CheckedArrayByteSink sink(name, nameCapacity);
1884     ulocimp_getName(localeID, sink, err);
1885 
1886     int32_t reslen = sink.NumberOfBytesAppended();
1887 
1888     if (U_FAILURE(*err)) {
1889         return reslen;
1890     }
1891 
1892     if (sink.Overflowed()) {
1893         *err = U_BUFFER_OVERFLOW_ERROR;
1894     } else {
1895         u_terminateChars(name, nameCapacity, reslen, err);
1896     }
1897 
1898     return reslen;
1899 }
1900 
1901 U_CAPI void U_EXPORT2
ulocimp_getName(const char * localeID,ByteSink & sink,UErrorCode * err)1902 ulocimp_getName(const char* localeID,
1903                 ByteSink& sink,
1904                 UErrorCode* err)
1905 {
1906     _canonicalize(localeID, sink, 0, err);
1907 }
1908 
1909 U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1910 uloc_getBaseName(const char* localeID,
1911                  char* name,
1912                  int32_t nameCapacity,
1913                  UErrorCode* err)
1914 {
1915     if (U_FAILURE(*err)) {
1916         return 0;
1917     }
1918 
1919     CheckedArrayByteSink sink(name, nameCapacity);
1920     ulocimp_getBaseName(localeID, sink, err);
1921 
1922     int32_t reslen = sink.NumberOfBytesAppended();
1923 
1924     if (U_FAILURE(*err)) {
1925         return reslen;
1926     }
1927 
1928     if (sink.Overflowed()) {
1929         *err = U_BUFFER_OVERFLOW_ERROR;
1930     } else {
1931         u_terminateChars(name, nameCapacity, reslen, err);
1932     }
1933 
1934     return reslen;
1935 }
1936 
1937 U_CAPI void U_EXPORT2
ulocimp_getBaseName(const char * localeID,ByteSink & sink,UErrorCode * err)1938 ulocimp_getBaseName(const char* localeID,
1939                     ByteSink& sink,
1940                     UErrorCode* err)
1941 {
1942     _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
1943 }
1944 
1945 U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1946 uloc_canonicalize(const char* localeID,
1947                   char* name,
1948                   int32_t nameCapacity,
1949                   UErrorCode* err)
1950 {
1951     if (U_FAILURE(*err)) {
1952         return 0;
1953     }
1954 
1955     CheckedArrayByteSink sink(name, nameCapacity);
1956     ulocimp_canonicalize(localeID, sink, err);
1957 
1958     int32_t reslen = sink.NumberOfBytesAppended();
1959 
1960     if (U_FAILURE(*err)) {
1961         return reslen;
1962     }
1963 
1964     if (sink.Overflowed()) {
1965         *err = U_BUFFER_OVERFLOW_ERROR;
1966     } else {
1967         u_terminateChars(name, nameCapacity, reslen, err);
1968     }
1969 
1970     return reslen;
1971 }
1972 
1973 U_CAPI void U_EXPORT2
ulocimp_canonicalize(const char * localeID,ByteSink & sink,UErrorCode * err)1974 ulocimp_canonicalize(const char* localeID,
1975                      ByteSink& sink,
1976                      UErrorCode* err)
1977 {
1978     _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
1979 }
1980 
1981 U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char * localeID)1982 uloc_getISO3Language(const char* localeID)
1983 {
1984     int16_t offset;
1985     char lang[ULOC_LANG_CAPACITY];
1986     UErrorCode err = U_ZERO_ERROR;
1987 
1988     if (localeID == nullptr)
1989     {
1990         localeID = uloc_getDefault();
1991     }
1992     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
1993     if (U_FAILURE(err))
1994         return "";
1995     offset = _findIndex(LANGUAGES, lang);
1996     if (offset < 0)
1997         return "";
1998     return LANGUAGES_3[offset];
1999 }
2000 
2001 U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char * localeID)2002 uloc_getISO3Country(const char* localeID)
2003 {
2004     int16_t offset;
2005     char cntry[ULOC_LANG_CAPACITY];
2006     UErrorCode err = U_ZERO_ERROR;
2007 
2008     if (localeID == nullptr)
2009     {
2010         localeID = uloc_getDefault();
2011     }
2012     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2013     if (U_FAILURE(err))
2014         return "";
2015     offset = _findIndex(COUNTRIES, cntry);
2016     if (offset < 0)
2017         return "";
2018 
2019     return COUNTRIES_3[offset];
2020 }
2021 
2022 U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char * localeID)2023 uloc_getLCID(const char* localeID)
2024 {
2025     UErrorCode status = U_ZERO_ERROR;
2026     char       langID[ULOC_FULLNAME_CAPACITY];
2027     uint32_t   lcid = 0;
2028 
2029     /* Check for incomplete id. */
2030     if (!localeID || uprv_strlen(localeID) < 2) {
2031         return 0;
2032     }
2033 
2034     // First, attempt Windows platform lookup if available, but fall
2035     // through to catch any special cases (ICU vs Windows name differences).
2036     lcid = uprv_convertToLCIDPlatform(localeID, &status);
2037     if (U_FAILURE(status)) {
2038         return 0;
2039     }
2040     if (lcid > 0) {
2041         // Windows found an LCID, return that
2042         return lcid;
2043     }
2044 
2045     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2046     if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
2047         return 0;
2048     }
2049 
2050     if (uprv_strchr(localeID, '@')) {
2051         // uprv_convertToLCID does not support keywords other than collation.
2052         // Remove all keywords except collation.
2053         int32_t len;
2054         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2055 
2056         CharString collVal;
2057         {
2058             CharStringByteSink sink(&collVal);
2059             ulocimp_getKeywordValue(localeID, "collation", sink, &status);
2060         }
2061 
2062         if (U_SUCCESS(status) && !collVal.isEmpty()) {
2063             len = uloc_getBaseName(localeID, tmpLocaleID,
2064                 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2065 
2066             if (U_SUCCESS(status) && len > 0) {
2067                 tmpLocaleID[len] = 0;
2068 
2069                 len = uloc_setKeywordValue("collation", collVal.data(), tmpLocaleID,
2070                     UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2071 
2072                 if (U_SUCCESS(status) && len > 0) {
2073                     tmpLocaleID[len] = 0;
2074                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2075                 }
2076             }
2077         }
2078 
2079         // fall through - all keywords are simply ignored
2080         status = U_ZERO_ERROR;
2081     }
2082 
2083     return uprv_convertToLCID(langID, localeID, &status);
2084 }
2085 
2086 U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid,char * locale,int32_t localeCapacity,UErrorCode * status)2087 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2088                 UErrorCode *status)
2089 {
2090     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2091 }
2092 
2093 /* ### Default locale **************************************************/
2094 
2095 U_CAPI const char*  U_EXPORT2
uloc_getDefault()2096 uloc_getDefault()
2097 {
2098     return locale_get_default();
2099 }
2100 
2101 U_CAPI void  U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2102 uloc_setDefault(const char*   newDefaultLocale,
2103              UErrorCode* err)
2104 {
2105     if (U_FAILURE(*err))
2106         return;
2107     /* the error code isn't currently used for anything by this function*/
2108 
2109     /* propagate change to C++ */
2110     locale_set_default(newDefaultLocale);
2111 }
2112 
2113 /**
2114  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2115  * to an array of pointers to arrays of char.  All of these pointers are owned
2116  * by ICU-- do not delete them, and do not write through them.  The array is
2117  * terminated with a null pointer.
2118  */
2119 U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()2120 uloc_getISOLanguages()
2121 {
2122     return LANGUAGES;
2123 }
2124 
2125 /**
2126  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2127  * pointer to an array of pointers to arrays of char.  All of these pointers are
2128  * owned by ICU-- do not delete them, and do not write through them.  The array is
2129  * terminated with a null pointer.
2130  */
2131 U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()2132 uloc_getISOCountries()
2133 {
2134     return COUNTRIES;
2135 }
2136 
2137 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2138 uloc_toUnicodeLocaleKey(const char* keyword)
2139 {
2140     const char* bcpKey = ulocimp_toBcpKey(keyword);
2141     if (bcpKey == nullptr && ultag_isUnicodeLocaleKey(keyword, -1)) {
2142         // unknown keyword, but syntax is fine..
2143         return keyword;
2144     }
2145     return bcpKey;
2146 }
2147 
2148 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2149 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2150 {
2151     const char* bcpType = ulocimp_toBcpType(keyword, value, nullptr, nullptr);
2152     if (bcpType == nullptr && ultag_isUnicodeLocaleType(value, -1)) {
2153         // unknown keyword, but syntax is fine..
2154         return value;
2155     }
2156     return bcpType;
2157 }
2158 
2159 static UBool
isWellFormedLegacyKey(const char * legacyKey)2160 isWellFormedLegacyKey(const char* legacyKey)
2161 {
2162     const char* p = legacyKey;
2163     while (*p) {
2164         if (!UPRV_ISALPHANUM(*p)) {
2165             return false;
2166         }
2167         p++;
2168     }
2169     return true;
2170 }
2171 
2172 static UBool
isWellFormedLegacyType(const char * legacyType)2173 isWellFormedLegacyType(const char* legacyType)
2174 {
2175     const char* p = legacyType;
2176     int32_t alphaNumLen = 0;
2177     while (*p) {
2178         if (*p == '_' || *p == '/' || *p == '-') {
2179             if (alphaNumLen == 0) {
2180                 return false;
2181             }
2182             alphaNumLen = 0;
2183         } else if (UPRV_ISALPHANUM(*p)) {
2184             alphaNumLen++;
2185         } else {
2186             return false;
2187         }
2188         p++;
2189     }
2190     return (alphaNumLen != 0);
2191 }
2192 
2193 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2194 uloc_toLegacyKey(const char* keyword)
2195 {
2196     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2197     if (legacyKey == nullptr) {
2198         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2199         //
2200         // Note:
2201         //  LDML/CLDR provides some definition of keyword syntax in
2202         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2203         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2204         //  Keys can only consist of [0-9a-zA-Z].
2205         if (isWellFormedLegacyKey(keyword)) {
2206             return keyword;
2207         }
2208     }
2209     return legacyKey;
2210 }
2211 
2212 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2213 uloc_toLegacyType(const char* keyword, const char* value)
2214 {
2215     const char* legacyType = ulocimp_toLegacyType(keyword, value, nullptr, nullptr);
2216     if (legacyType == nullptr) {
2217         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2218         //
2219         // Note:
2220         //  LDML/CLDR provides some definition of keyword syntax in
2221         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2222         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2223         //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2224         //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2225         if (isWellFormedLegacyType(value)) {
2226             return value;
2227         }
2228     }
2229     return legacyType;
2230 }
2231 
2232 /*eof*/
2233