xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/CoreCoverageInfo.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.util;
2 
3 import com.google.common.collect.ImmutableMultimap;
4 import com.google.common.collect.ImmutableSet;
5 import com.google.common.collect.Multimap;
6 import com.google.common.collect.Sets;
7 import com.google.common.collect.TreeMultimap;
8 import com.ibm.icu.impl.Relation;
9 import com.ibm.icu.lang.UScript;
10 import com.ibm.icu.text.UnicodeSet;
11 import java.io.File;
12 import java.util.Arrays;
13 import java.util.EnumMap;
14 import java.util.HashMap;
15 import java.util.HashSet;
16 import java.util.LinkedHashSet;
17 import java.util.Map;
18 import java.util.Set;
19 import org.unicode.cldr.draft.ScriptMetadata;
20 import org.unicode.cldr.draft.ScriptMetadata.Info;
21 import org.unicode.cldr.draft.ScriptMetadata.Trinary;
22 import org.unicode.cldr.tool.LikelySubtags;
23 import org.unicode.cldr.util.CLDRFile.ExemplarType;
24 import org.unicode.cldr.util.Iso639Data.Type;
25 import org.unicode.cldr.util.SupplementalDataInfo.PluralType;
26 
27 public class CoreCoverageInfo {
28 
    /** Shared CLDR configuration, obtained through the {@link CLDRConfig} instance accessor. */
    private static final CLDRConfig config = CLDRConfig.getInstance();
    /** String form of the CLDR base directory; the common/ data directories are read under it. */
    private static final String CLDR_BASE_DIRECTORY = config.getCldrBaseDirectory().toString();
    /** Supplemental data used for the plural, default-content, population, time and grammar checks. */
    private static final SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
    /** Likely-subtags helper used to maximize a locale ID. */
    private static final LikelySubtags ls = new LikelySubtags();
33 
    /** When, within the vetting of a coverage level, a core item has to be present. */
    public enum Sublevel {
        /** Needs to be present at the start of that level's vetting */
        start,
        /** (default) Only needs to be present by the end of that level's vetting */
        end
    }
40 
41     public enum CoreItems {
42         default_content(Level.CORE),
43         likely_subtags(Level.CORE),
44         country_data(Level.CORE),
45         orientation(Level.CORE),
46         time_cycle(Level.CORE),
47 
48         own_language(Level.BASIC),
49         own_regions(Level.BASIC),
50 
51         casing(Level.MODERATE, Sublevel.start),
52         plurals(Level.MODERATE, Sublevel.start),
53         collation(Level.MODERATE),
54 
55         grammar(Level.MODERN, Sublevel.start),
56         ordinals(Level.MODERN),
57         romanization(Level.MODERN),
58         ;
59 
60         public static final Set<CoreItems> ALL = ImmutableSet.copyOf(CoreItems.values());
61         public static final Multimap<Level, CoreItems> LEVEL_TO_ITEMS;
62 
63         static {
64             final Multimap<Level, CoreItems> _levelToItems = TreeMultimap.create();
65             ALL.forEach(
66                     x -> {
67                         for (Level level : Level.values()) {
68                             if (level.compareTo(x.desiredLevel) <= 0) {
69                                 _levelToItems.put(x.desiredLevel, x);
70                             }
71                         }
72                     });
73             LEVEL_TO_ITEMS = ImmutableMultimap.copyOf(_levelToItems);
74         }
75 
76         public final Level desiredLevel;
77         public final Sublevel sublevel;
78 
CoreItems()79         CoreItems() {
80             this(Level.CORE);
81         }
82 
CoreItems(Level desiredLevel)83         CoreItems(Level desiredLevel) {
84             this(desiredLevel, Sublevel.end);
85         }
86 
CoreItems(Level desiredLevel, Sublevel sublevel)87         CoreItems(Level desiredLevel, Sublevel sublevel) {
88             this.desiredLevel = desiredLevel;
89             this.sublevel = sublevel;
90         }
91 
92         @Override
toString()93         public String toString() {
94             return desiredLevel.getAbbreviation() + " " + name();
95         }
96     }
97 
98     static UnicodeSet RTL = new UnicodeSet("[[:bc=R:][:bc=AL:]]").freeze();
99 
100     /** Only call on L1 locales (parent = root) */
getCoreCoverageInfo( CLDRFile resolvedFile, Multimap<CoreItems, String> detailedErrors)101     public static Set<CoreItems> getCoreCoverageInfo(
102             CLDRFile resolvedFile, Multimap<CoreItems, String> detailedErrors) {
103         detailedErrors.clear();
104         if (!resolvedFile.isResolved()) {
105             throw new IllegalArgumentException();
106         }
107         CLDRFile file = resolvedFile.getUnresolved();
108         String locale = file.getLocaleID();
109         LanguageTagParser ltp = new LanguageTagParser();
110         locale = ltp.set(locale).getLanguageScript();
111         final String baseLanguage = ltp.getLanguage();
112         final String script = ltp.getScript();
113         final String region = ltp.getRegion();
114 
115         // Set<CoreItems> result = EnumSet.noneOf(CoreItems.class);
116 
117         //      (02) Orientation (bidi writing systems only) [main/xxx.xml]
118         UnicodeSet main = file.getExemplarSet(ExemplarType.main, null);
119         boolean isRtl = main.containsSome(RTL);
120 
121         String path = "//ldml/layout/orientation/characterOrder";
122         String value = file.getStringValue(path);
123         if ("right-to-left".equals(value) != isRtl) {
124             detailedErrors.put(CoreItems.orientation, path);
125         }
126 
127         //      (01) Plural rules [supplemental/plurals.xml and ordinals.xml]
128         //      For more information, see cldr-spec/plural-rules.
129         if (!sdi.getPluralLocales(PluralType.cardinal).contains(baseLanguage)) {
130             detailedErrors.put(
131                     CoreItems.plurals,
132                     "//supplementalData/plurals[@type=\"cardinal\"]/pluralRules[@locales=\""
133                             + locale
134                             + "\"]/pluralRule[@count=\"other\"]");
135         }
136         if (!sdi.getPluralLocales(PluralType.ordinal).contains(baseLanguage)) {
137             detailedErrors.put(
138                     CoreItems.ordinals,
139                     "//supplementalData/plurals[@type=\"ordinal\"]/pluralRules[@locales=\""
140                             + locale
141                             + "\"]/pluralRule[@count=\"other\"]");
142         }
143 
144         //      (01) Default content script and region (normally: normally country with largest
145         // population using that language, and normal script for that).
146         // [supplemental/supplementalMetadata.xml]
147 
148         String defaultContent = sdi.getDefaultContentLocale(locale);
149         if (defaultContent == null) { //  || locale.equals("no")
150             detailedErrors.put(
151                     CoreItems.default_content,
152                     "//supplementalData/supplementalMetadata/defaultContent");
153         }
154         // likely subtags
155         final String max = ls.maximize(locale);
156         String maxLangScript = "";
157         String maxScript = "";
158         String maxRegion = "";
159         if (max != null) {
160             ltp.set(max);
161             maxLangScript = ltp.getLanguageScript();
162             maxScript = ltp.getScript();
163             maxRegion = ltp.getRegion();
164             if (maxRegion.equals("ZZ")
165                     || maxRegion.equals("001")
166                             && Iso639Data.getType(baseLanguage) != Type.Constructed) {
167                 maxRegion = "";
168             }
169         }
170         if (maxScript.isEmpty() || maxRegion.isEmpty()) {
171             detailedErrors.put(CoreItems.likely_subtags, "//supplementalData/likelySubtags");
172         }
173 
174         String bestScript = script.isEmpty() ? maxScript : script;
175         String bestRegion = region.isEmpty() ? maxRegion : region;
176 
177         String languagePath = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, baseLanguage);
178         String languageName = resolvedFile.getStringValue(languagePath);
179         if (languageName == null) {
180             detailedErrors.put(CoreItems.own_language, languagePath);
181         } else {
182             String localeWhereFound = resolvedFile.getSourceLocaleID(languagePath, null);
183             if ("root".equals(localeWhereFound) || "code-fallback".equals(localeWhereFound)) {
184                 detailedErrors.put(CoreItems.own_language, languagePath);
185             }
186         }
187 
188         if (bestRegion.isEmpty()) {
189             detailedErrors.put(CoreItems.own_regions, "//supplementalData/likelySubtags");
190         } else {
191             String regionPath = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, bestRegion);
192             String regionName = file.getStringValue(regionPath);
193             if (regionName == null) {
194                 detailedErrors.put(CoreItems.own_regions, regionPath);
195             } else {
196                 String localeWhereFound = resolvedFile.getSourceLocaleID(regionPath, null);
197                 if (XMLSource.ROOT_ID.equals(localeWhereFound)
198                         || XMLSource.CODE_FALLBACK_ID.equals(localeWhereFound)) {
199                     detailedErrors.put(CoreItems.own_regions, regionPath);
200                 }
201             }
202         }
203         // NOTE: other regions will be captured in the coverageLevels
204 
205         // (N) Verify the country data ( i.e. which territories in which the language is spoken
206         // enough to create a locale ) [supplemental/supplementalData.xml]
207         // we verify that there is at least one region
208         // we try 3 cases: language, locale, maxLangScript
209         Set<String> territories = sdi.getTerritoriesForPopulationData(locale);
210         if (territories == null) {
211             territories = sdi.getTerritoriesForPopulationData(baseLanguage);
212         }
213         if (territories == null && maxLangScript != null) {
214             territories = sdi.getTerritoriesForPopulationData(maxLangScript);
215         }
216         if (territories == null || territories.isEmpty()) {
217             detailedErrors.put(CoreItems.country_data, "//supplementalData/territoryInfo");
218             sdi.getTerritoriesForPopulationData(locale); // for debugging
219         }
220         //      *(N) Romanization table (non-Latin writing systems only) [spreadsheet, we'll
221         // translate into transforms/xxx-en.xml]
222         //      If a spreadsheet, for each letter (or sequence) in the exemplars, what is the
223         // corresponding Latin letter (or sequence).
224         //      More sophisticated users can do a better job, supplying a file of rules like
225         // transforms/Arabic-Latin-BGN.xml.
226 
227         if (!bestScript.equals("Latn")) {
228             boolean found = false;
229             Set<String> scriptLongCodes = getScriptNames(bestScript);
230             if (scriptLongCodes != null) {
231                 Set<String> debugErrors = new LinkedHashSet<>();
232                 for (String scriptLongCode : scriptLongCodes) {
233                     for (String[] pair : ROMANIZATION_PATHS) {
234                         String filename = pair[0] + scriptLongCode + pair[1];
235                         if (hasFile(SpecialDir.transforms, filename)) {
236                             found = true;
237                             break;
238                         } else {
239                             debugErrors.add(script);
240                         }
241                     }
242                 }
243             }
244             if (!found) {
245                 detailedErrors.put(
246                         CoreItems.romanization,
247                         "//supplementalData/transforms/transform"
248                                 + "[@source=\"und-"
249                                 + script
250                                 + "\"]"
251                                 + "[@target=\"und-Latn\"]"
252                         // + "[@direction=\"forward\"]"
253                         );
254             }
255         }
256 
257         //      (N) Casing information (cased scripts only, according to ScriptMetadata.txt)
258         //      This will be in common/casing
259         Info scriptData = ScriptMetadata.getInfo(bestScript);
260         if (scriptData != null
261                 && scriptData.hasCase == Trinary.YES
262                 && !hasFile(SpecialDir.casing, baseLanguage)) {
263             detailedErrors.put(
264                     CoreItems.casing, "//ldml/metadata/casingData/casingItem[@type=\"*\"]");
265         }
266         //      (N) Collation rules [non-Survey Tool]
267         //      For details, see cldr-spec/collation-guidelines.
268         //      The result will be a file like: common/collation/ar.xml or common/collation/da.xml.
269         //      Note that the "search" collators (which tend to be large) are not needed initially.
270 
271         // check for file cldr/collation/<language>.xml
272         if (!hasFile(SpecialDir.collation, baseLanguage)) {
273             detailedErrors.put(
274                     CoreItems.collation, "//ldml/collations/collation[@type=\"standard\"]");
275         }
276 
277         Map<String, PreferredAndAllowedHour> timeData = sdi.getTimeData();
278         if (timeData.get(bestRegion) == null) {
279             detailedErrors.put(CoreItems.time_cycle, "//supplementalData/timeData/hours");
280         }
281 
282         GrammarInfo grammarInfo = sdi.getGrammarInfo(locale);
283         if (grammarInfo == null) {
284             detailedErrors.put(
285                     CoreItems.grammar, "//supplementalData/grammaticalData/grammaticalFeatures");
286         }
287 
288         // finalize
289         return ImmutableSet.copyOf(Sets.difference(CoreItems.ALL, detailedErrors.keySet()));
290     }
291 
    /**
     * Prefix/suffix pairs used to build candidate transform file names: element 0 is placed
     * before a script name and element 1 after it (for example {@code <script>-Latin} or {@code
     * Latin-<script>}).
     */
    private static final String[][] ROMANIZATION_PATHS = {
        {"", "-Latin"},
        {"", "-Latin-BGN"},
        {"Latin-", ""},
    };
297 
298     private static final Relation SCRIPT_NAMES = Relation.of(new HashMap(), HashSet.class);
299 
300     static {
301         SCRIPT_NAMES.putAll("Arab", Arrays.asList("Arabic", "Arab"));
302         SCRIPT_NAMES.putAll("Jpan", Arrays.asList("Jpan", "Han"));
303         SCRIPT_NAMES.putAll("Hant", Arrays.asList("Hant", "Han"));
304         SCRIPT_NAMES.putAll("Hans", Arrays.asList("Hans", "Han"));
305         SCRIPT_NAMES.putAll("Kore", Arrays.asList("Hang", "Hangul"));
SCRIPT_NAMES.freeze()306         SCRIPT_NAMES.freeze();
307     }
308 
getScriptNames(String script)309     private static Set<String> getScriptNames(String script) {
310         Set<String> result = SCRIPT_NAMES.get(script);
311         if (result != null) {
312             return result;
313         }
314         result = new HashSet<>();
315         try {
316             String name = UScript.getName(UScript.getCodeFromName(script));
317             result.add(name);
318             result.add(script);
319         } catch (Exception e) {
320         }
321         return result;
322     }
323 
    /**
     * Directories under common/ whose file names are indexed; each constant's name is used
     * verbatim as the directory name.
     */
    private enum SpecialDir {
        transforms,
        collation,
        casing
    }
329 
330     private static final Relation<SpecialDir, String> SPECIAL_FILES =
331             Relation.of(new EnumMap(SpecialDir.class), HashSet.class);
332 
333     static {
334         for (SpecialDir dir : SpecialDir.values()) {
335             File realDir = new File(CLDR_BASE_DIRECTORY + "/common/" + dir);
336             for (String s : realDir.list()) {
337                 if (s.endsWith(".xml")) {
338                     s = s.substring(0, s.length() - 4);
339                 }
SPECIAL_FILES.put(dir, s)340                 SPECIAL_FILES.put(dir, s);
341             }
342         }
343     }
344 
hasFile(SpecialDir type, String filename)345     private static boolean hasFile(SpecialDir type, String filename) {
346         return SPECIAL_FILES.get(type).contains(filename);
347     }
348 }
349