xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LanguageCodeConverter.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.ibm.icu.impl.Row.R2;
4 import com.ibm.icu.util.ULocale;
5 import java.util.Collections;
6 import java.util.HashMap;
7 import java.util.HashSet;
8 import java.util.LinkedHashMap;
9 import java.util.LinkedHashSet;
10 import java.util.List;
11 import java.util.Locale;
12 import java.util.Map;
13 import java.util.Map.Entry;
14 import java.util.Set;
15 import java.util.TreeMap;
16 import java.util.TreeSet;
17 import org.unicode.cldr.util.Builder;
18 import org.unicode.cldr.util.CLDRConfig;
19 import org.unicode.cldr.util.CldrUtility;
20 import org.unicode.cldr.util.LanguageTagParser;
21 import org.unicode.cldr.util.StandardCodes;
22 import org.unicode.cldr.util.StringIterables;
23 
24 public class LanguageCodeConverter {
25     private static Map<String, String> languageNameToCode = new TreeMap<>();
26     private static Set<String> exceptionCodes = new TreeSet<>();
27     private static Set<String> parseErrors = new LinkedHashSet<>();
28 
29     private static Map<String, R2<List<String>, String>> languageAliases =
30             CLDRConfig.getInstance().getSupplementalDataInfo().getLocaleAliasInfo().get("language");
31 
32     /**
33      * Public only for testing.
34      *
35      * @internal
36      */
37     public static final Map<String, String> GOOGLE_CLDR =
38             Builder.with(new LinkedHashMap<String, String>()) // preserve order
39                     .put("iw", "he")
40                     .put("jw", "jv")
41                     // .put("nb", "no")
42                     .put("tl", "fil")
43                     .put("pt-BR", "pt")
44                     .put("xx-bork", "x_bork")
45                     .put("xx-elmer", "x_elmer")
46                     .put("xx-hacker", "x_hacker")
47                     .put("xx-pirate", "x_pirate")
48                     .put("xx-klingon", "tlh")
49                     .put("zh-CN", "zh")
50                     .put("zh-TW", "zh_Hant")
51                     .put("zh-HK", "zh_Hant_HK")
52                     .put("sit-NP", "lif")
53                     .put("ut", "und")
54                     .put("un", "und")
55                     .put("xx", "und")
56 
57                     // .put("sh", "fil")
58                     .freeze();
59 
60     /**
61      * Public only for testing.
62      *
63      * @internal
64      */
65     public static final Map<String, String> CLDR_GOOGLE =
66             Builder.with(new HashMap<String, String>()).putAllTransposed(GOOGLE_CLDR).freeze();
67 
68     /**
69      * Public only for testing.
70      *
71      * @internal
72      */
73     public static final Map<String, String> EXTRA_SCRIPTS =
74             Builder.with(new HashMap<String, String>())
75                     .on("crs", "pcm", "tlh")
76                     .put("Latn")
77                     .freeze();
78 
79     static {
80         // Reads the CLDR copy of
81         // http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
82         Map<String, Map<String, Map<String, String>>> lstreg = StandardCodes.getLStreg();
83         Map<String, Map<String, String>> languages = lstreg.get("language");
84         Set<String> validCodes = new HashSet<>();
85 
86         for (Entry<String, Map<String, String>> codeInfo : languages.entrySet()) {
87             String code = codeInfo.getKey();
88             R2<List<String>, String> replacement = languageAliases.get(code);
89             // Returns "sh" -> <{"sr_Latn"}, reason>
90             if (replacement != null) {
91                 List<String> replacements = replacement.get0();
92                 if (replacements.size() != 1) {
93                     continue;
94                 }
95                 code = replacements.get(0);
96                 if (code.contains("_")) {
97                     continue;
98                 }
99             }
100             // if (languageAliases.containsKey(code)) {
101             // continue;
102             // }
103             final Map<String, String> info = codeInfo.getValue();
104             String deprecated = info.get("Deprecated");
105             if (deprecated != null) {
106                 continue;
107             }
108             String name = info.get("Description");
109             if (name.equals("Private use")) {
110                 continue;
111             }
112             validCodes.add(code);
113             if (name.contains(StandardCodes.DESCRIPTION_SEPARATOR)) {
114                 for (String namePart : name.split(StandardCodes.DESCRIPTION_SEPARATOR)) {
115                     addNameToCode("lstr", code, namePart);
116                 }
117             } else {
118                 addNameToCode("lstr", code, name);
119             }
120         }
121 
122         // CLDRFile english; // = testInfo.getEnglish();
123         for (String code : validCodes) {
124             String icuName = ULocale.getDisplayName(code, "en");
125             addNameToCode("cldr", code, icuName);
126             // if (languageAliases.containsKey(code)) {
127             // continue;
128             // }
129             // String cldrName = english.getName("language", code);
130             // if (cldrName != null && !cldrName.equals("private-use")) {
131             // addNameToCode("cldr", code, cldrName);
132             // }
133         }
134         // add exceptions
135         LanguageTagParser ltp = new LanguageTagParser();
136         for (String line :
137                 StringIterables.in(
138                         CldrUtility.getUTF8Data("external/alternate_language_names.txt"))) {
139             String[] parts = CldrUtility.cleanSemiFields(line);
140             if (parts == null || parts.length == 0) continue;
141             String code = parts[0];
142             if (!validCodes.contains(code)) {
143                 if (code.equals("*OMIT")) {
144                     parseErrors.add("Skipping " + line);
145                     continue;
146                 }
147                 String base = ltp.set(code).getLanguage();
148                 if (!validCodes.contains(base)) {
149                     R2<List<String>, String> alias = languageAliases.get(base);
150                     if (alias != null) {
151                         code = alias.get0().get(0);
152                     } else {
153                         parseErrors.add("Skipping " + line);
154                         continue;
155                     }
156                 }
157             }
toUnderbarLocale(code)158             exceptionCodes.add(toUnderbarLocale(code));
159             if (parts.length < 2) {
160                 continue;
161             }
162             String name = parts[1];
163             if (parts.length > 2) {
164                 name += ";" + parts[2]; // HACK
165             }
166             addNameToCode("exception", code, name);
167         }
168         for (String cldr : GOOGLE_CLDR.values()) {
169             String goodCode = toUnderbarLocale(cldr);
170             exceptionCodes.add(goodCode);
171         }
172         languageNameToCode = Collections.unmodifiableMap(languageNameToCode);
173         exceptionCodes = Collections.unmodifiableSet(exceptionCodes);
174         parseErrors = Collections.unmodifiableSet(parseErrors);
175     }
176 
addNameToCode(final String type, final String code, String name)177     private static void addNameToCode(final String type, final String code, String name) {
178         if (code.equals("mru") && name.equals("mru")) {
179             // mru=Mono (Cameroon)
180             // mro=Mru
181             // Ignore the CLDR mapping of the code to itself,
182             // to avoid clobbering the mapping of the real name Mru to the real code mro.
183             return;
184         }
185         name = name.toLowerCase(Locale.ENGLISH);
186         String oldCode = languageNameToCode.get(name);
187         if (oldCode != null) {
188             if (!oldCode.equals(code)) {
189                 parseErrors.add(
190                         "Name Collision! "
191                                 + type
192                                 + ": "
193                                 + name
194                                 + " <"
195                                 + oldCode
196                                 + ", "
197                                 + code
198                                 + ">");
199             } else {
200                 return;
201             }
202         }
203         languageNameToCode.put(name, code);
204     }
205 
toGoogleLocaleId(String localeId)206     public static String toGoogleLocaleId(String localeId) {
207         // TODO fix to do languages, etc. field by field
208         localeId = localeId.replace("-", "_");
209         String result = CLDR_GOOGLE.get(localeId);
210         result = result == null ? localeId : result;
211         return result.replace("_", "-");
212     }
213 
fromGoogleLocaleId(String localeId)214     public static String fromGoogleLocaleId(String localeId) {
215         localeId = localeId.replace("_", "-");
216         // TODO fix to do languages, etc. field by field
217         String result = GOOGLE_CLDR.get(localeId);
218         result = result == null ? localeId : result;
219         return result.replace("-", "_");
220     }
221 
toUnderbarLocale(String localeId)222     public static String toUnderbarLocale(String localeId) {
223         return localeId.replace("-", "_");
224     }
225 
toHyphenLocale(String localeId)226     public static String toHyphenLocale(String localeId) {
227         return localeId.replace("_", "-");
228     }
229 
getCodeForName(String languageName)230     public static String getCodeForName(String languageName) {
231         return languageNameToCode.get(languageName.toLowerCase(Locale.ENGLISH));
232     }
233 
getExceptionCodes()234     public static Set<String> getExceptionCodes() {
235         return exceptionCodes;
236     }
237 
getParseErrors()238     public static Set<String> getParseErrors() {
239         return parseErrors;
240     }
241 
getLanguageNameToCode()242     public static Map<String, String> getLanguageNameToCode() {
243         return languageNameToCode;
244     }
245 }
246