1 package org.unicode.cldr.util; 2 3 import com.google.common.collect.ImmutableMultimap; 4 import com.google.common.collect.ImmutableSet; 5 import com.google.common.collect.Multimap; 6 import com.google.common.collect.Sets; 7 import com.google.common.collect.TreeMultimap; 8 import com.ibm.icu.impl.Relation; 9 import com.ibm.icu.lang.UScript; 10 import com.ibm.icu.text.UnicodeSet; 11 import java.io.File; 12 import java.util.Arrays; 13 import java.util.EnumMap; 14 import java.util.HashMap; 15 import java.util.HashSet; 16 import java.util.LinkedHashSet; 17 import java.util.Map; 18 import java.util.Set; 19 import org.unicode.cldr.draft.ScriptMetadata; 20 import org.unicode.cldr.draft.ScriptMetadata.Info; 21 import org.unicode.cldr.draft.ScriptMetadata.Trinary; 22 import org.unicode.cldr.tool.LikelySubtags; 23 import org.unicode.cldr.util.CLDRFile.ExemplarType; 24 import org.unicode.cldr.util.Iso639Data.Type; 25 import org.unicode.cldr.util.SupplementalDataInfo.PluralType; 26 27 public class CoreCoverageInfo { 28 29 private static final CLDRConfig config = CLDRConfig.getInstance(); 30 private static final String CLDR_BASE_DIRECTORY = config.getCldrBaseDirectory().toString(); 31 private static final SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); 32 private static final LikelySubtags ls = new LikelySubtags(); 33 34 public enum Sublevel { 35 /** Needs to be present at the start of that level's vetting */ 36 start, 37 /** (default) Only to be present by the end start of that level's vetting */ 38 end 39 } 40 41 public enum CoreItems { 42 default_content(Level.CORE), 43 likely_subtags(Level.CORE), 44 country_data(Level.CORE), 45 orientation(Level.CORE), 46 time_cycle(Level.CORE), 47 48 own_language(Level.BASIC), 49 own_regions(Level.BASIC), 50 51 casing(Level.MODERATE, Sublevel.start), 52 plurals(Level.MODERATE, Sublevel.start), 53 collation(Level.MODERATE), 54 55 grammar(Level.MODERN, Sublevel.start), 56 ordinals(Level.MODERN), 57 romanization(Level.MODERN), 58 ; 59 60 public static final Set<CoreItems> ALL = ImmutableSet.copyOf(CoreItems.values()); 61 public static final Multimap<Level, CoreItems> LEVEL_TO_ITEMS; 62 63 static { 64 final Multimap<Level, CoreItems> _levelToItems = TreeMultimap.create(); 65 ALL.forEach( 66 x -> { 67 for (Level level : Level.values()) { 68 if (level.compareTo(x.desiredLevel) <= 0) { 69 _levelToItems.put(x.desiredLevel, x); 70 } 71 } 72 }); 73 LEVEL_TO_ITEMS = ImmutableMultimap.copyOf(_levelToItems); 74 } 75 76 public final Level desiredLevel; 77 public final Sublevel sublevel; 78 CoreItems()79 CoreItems() { 80 this(Level.CORE); 81 } 82 CoreItems(Level desiredLevel)83 CoreItems(Level desiredLevel) { 84 this(desiredLevel, Sublevel.end); 85 } 86 CoreItems(Level desiredLevel, Sublevel sublevel)87 CoreItems(Level desiredLevel, Sublevel sublevel) { 88 this.desiredLevel = desiredLevel; 89 this.sublevel = sublevel; 90 } 91 92 @Override toString()93 public String toString() { 94 return desiredLevel.getAbbreviation() + " " + name(); 95 } 96 } 97 98 static UnicodeSet RTL = new UnicodeSet("[[:bc=R:][:bc=AL:]]").freeze(); 99 100 /** Only call on L1 locales (parent = root) */ getCoreCoverageInfo( CLDRFile resolvedFile, Multimap<CoreItems, String> detailedErrors)101 public static Set<CoreItems> getCoreCoverageInfo( 102 CLDRFile resolvedFile, Multimap<CoreItems, String> detailedErrors) { 103 detailedErrors.clear(); 104 if (!resolvedFile.isResolved()) { 105 throw new IllegalArgumentException(); 106 } 107 CLDRFile file = resolvedFile.getUnresolved(); 108 String locale = file.getLocaleID(); 109 LanguageTagParser ltp = new LanguageTagParser(); 110 locale = ltp.set(locale).getLanguageScript(); 111 final String baseLanguage = ltp.getLanguage(); 112 final String script = ltp.getScript(); 113 final String region = ltp.getRegion(); 114 115 // Set<CoreItems> result = EnumSet.noneOf(CoreItems.class); 116 117 // (02) Orientation (bidi writing systems only) [main/xxx.xml] 118 UnicodeSet main = file.getExemplarSet(ExemplarType.main, null); 119 boolean isRtl = main.containsSome(RTL); 120 121 String path = "//ldml/layout/orientation/characterOrder"; 122 String value = file.getStringValue(path); 123 if ("right-to-left".equals(value) != isRtl) { 124 detailedErrors.put(CoreItems.orientation, path); 125 } 126 127 // (01) Plural rules [supplemental/plurals.xml and ordinals.xml] 128 // For more information, see cldr-spec/plural-rules. 129 if (!sdi.getPluralLocales(PluralType.cardinal).contains(baseLanguage)) { 130 detailedErrors.put( 131 CoreItems.plurals, 132 "//supplementalData/plurals[@type=\"cardinal\"]/pluralRules[@locales=\"" 133 + locale 134 + "\"]/pluralRule[@count=\"other\"]"); 135 } 136 if (!sdi.getPluralLocales(PluralType.ordinal).contains(baseLanguage)) { 137 detailedErrors.put( 138 CoreItems.ordinals, 139 "//supplementalData/plurals[@type=\"ordinal\"]/pluralRules[@locales=\"" 140 + locale 141 + "\"]/pluralRule[@count=\"other\"]"); 142 } 143 144 // (01) Default content script and region (normally: normally country with largest 145 // population using that language, and normal script for that). 146 // [supplemental/supplementalMetadata.xml] 147 148 String defaultContent = sdi.getDefaultContentLocale(locale); 149 if (defaultContent == null) { // || locale.equals("no") 150 detailedErrors.put( 151 CoreItems.default_content, 152 "//supplementalData/supplementalMetadata/defaultContent"); 153 } 154 // likely subtags 155 final String max = ls.maximize(locale); 156 String maxLangScript = ""; 157 String maxScript = ""; 158 String maxRegion = ""; 159 if (max != null) { 160 ltp.set(max); 161 maxLangScript = ltp.getLanguageScript(); 162 maxScript = ltp.getScript(); 163 maxRegion = ltp.getRegion(); 164 if (maxRegion.equals("ZZ") 165 || maxRegion.equals("001") 166 && Iso639Data.getType(baseLanguage) != Type.Constructed) { 167 maxRegion = ""; 168 } 169 } 170 if (maxScript.isEmpty() || maxRegion.isEmpty()) { 171 detailedErrors.put(CoreItems.likely_subtags, "//supplementalData/likelySubtags"); 172 } 173 174 String bestScript = script.isEmpty() ? maxScript : script; 175 String bestRegion = region.isEmpty() ? maxRegion : region; 176 177 String languagePath = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, baseLanguage); 178 String languageName = resolvedFile.getStringValue(languagePath); 179 if (languageName == null) { 180 detailedErrors.put(CoreItems.own_language, languagePath); 181 } else { 182 String localeWhereFound = resolvedFile.getSourceLocaleID(languagePath, null); 183 if ("root".equals(localeWhereFound) || "code-fallback".equals(localeWhereFound)) { 184 detailedErrors.put(CoreItems.own_language, languagePath); 185 } 186 } 187 188 if (bestRegion.isEmpty()) { 189 detailedErrors.put(CoreItems.own_regions, "//supplementalData/likelySubtags"); 190 } else { 191 String regionPath = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, bestRegion); 192 String regionName = file.getStringValue(regionPath); 193 if (regionName == null) { 194 detailedErrors.put(CoreItems.own_regions, regionPath); 195 } else { 196 String localeWhereFound = resolvedFile.getSourceLocaleID(regionPath, null); 197 if (XMLSource.ROOT_ID.equals(localeWhereFound) 198 || XMLSource.CODE_FALLBACK_ID.equals(localeWhereFound)) { 199 detailedErrors.put(CoreItems.own_regions, regionPath); 200 } 201 } 202 } 203 // NOTE: other regions will be captured in the coverageLevels 204 205 // (N) Verify the country data ( i.e. which territories in which the language is spoken 206 // enough to create a locale ) [supplemental/supplementalData.xml] 207 // we verify that there is at least one region 208 // we try 3 cases: language, locale, maxLangScript 209 Set<String> territories = sdi.getTerritoriesForPopulationData(locale); 210 if (territories == null) { 211 territories = sdi.getTerritoriesForPopulationData(baseLanguage); 212 } 213 if (territories == null && maxLangScript != null) { 214 territories = sdi.getTerritoriesForPopulationData(maxLangScript); 215 } 216 if (territories == null || territories.isEmpty()) { 217 detailedErrors.put(CoreItems.country_data, "//supplementalData/territoryInfo"); 218 sdi.getTerritoriesForPopulationData(locale); // for debugging 219 } 220 // *(N) Romanization table (non-Latin writing systems only) [spreadsheet, we'll 221 // translate into transforms/xxx-en.xml] 222 // If a spreadsheet, for each letter (or sequence) in the exemplars, what is the 223 // corresponding Latin letter (or sequence). 224 // More sophisticated users can do a better job, supplying a file of rules like 225 // transforms/Arabic-Latin-BGN.xml. 226 227 if (!bestScript.equals("Latn")) { 228 boolean found = false; 229 Set<String> scriptLongCodes = getScriptNames(bestScript); 230 if (scriptLongCodes != null) { 231 Set<String> debugErrors = new LinkedHashSet<>(); 232 for (String scriptLongCode : scriptLongCodes) { 233 for (String[] pair : ROMANIZATION_PATHS) { 234 String filename = pair[0] + scriptLongCode + pair[1]; 235 if (hasFile(SpecialDir.transforms, filename)) { 236 found = true; 237 break; 238 } else { 239 debugErrors.add(script); 240 } 241 } 242 } 243 } 244 if (!found) { 245 detailedErrors.put( 246 CoreItems.romanization, 247 "//supplementalData/transforms/transform" 248 + "[@source=\"und-" 249 + script 250 + "\"]" 251 + "[@target=\"und-Latn\"]" 252 // + "[@direction=\"forward\"]" 253 ); 254 } 255 } 256 257 // (N) Casing information (cased scripts only, according to ScriptMetadata.txt) 258 // This will be in common/casing 259 Info scriptData = ScriptMetadata.getInfo(bestScript); 260 if (scriptData != null 261 && scriptData.hasCase == Trinary.YES 262 && !hasFile(SpecialDir.casing, baseLanguage)) { 263 detailedErrors.put( 264 CoreItems.casing, "//ldml/metadata/casingData/casingItem[@type=\"*\"]"); 265 } 266 // (N) Collation rules [non-Survey Tool] 267 // For details, see cldr-spec/collation-guidelines. 268 // The result will be a file like: common/collation/ar.xml or common/collation/da.xml. 269 // Note that the "search" collators (which tend to be large) are not needed initially. 270 271 // check for file cldr/collation/<language>.xml 272 if (!hasFile(SpecialDir.collation, baseLanguage)) { 273 detailedErrors.put( 274 CoreItems.collation, "//ldml/collations/collation[@type=\"standard\"]"); 275 } 276 277 Map<String, PreferredAndAllowedHour> timeData = sdi.getTimeData(); 278 if (timeData.get(bestRegion) == null) { 279 detailedErrors.put(CoreItems.time_cycle, "//supplementalData/timeData/hours"); 280 } 281 282 GrammarInfo grammarInfo = sdi.getGrammarInfo(locale); 283 if (grammarInfo == null) { 284 detailedErrors.put( 285 CoreItems.grammar, "//supplementalData/grammaticalData/grammaticalFeatures"); 286 } 287 288 // finalize 289 return ImmutableSet.copyOf(Sets.difference(CoreItems.ALL, detailedErrors.keySet())); 290 } 291 292 private static final String[][] ROMANIZATION_PATHS = { 293 {"", "-Latin"}, 294 {"", "-Latin-BGN"}, 295 {"Latin-", ""}, 296 }; 297 298 private static final Relation SCRIPT_NAMES = Relation.of(new HashMap(), HashSet.class); 299 300 static { 301 SCRIPT_NAMES.putAll("Arab", Arrays.asList("Arabic", "Arab")); 302 SCRIPT_NAMES.putAll("Jpan", Arrays.asList("Jpan", "Han")); 303 SCRIPT_NAMES.putAll("Hant", Arrays.asList("Hant", "Han")); 304 SCRIPT_NAMES.putAll("Hans", Arrays.asList("Hans", "Han")); 305 SCRIPT_NAMES.putAll("Kore", Arrays.asList("Hang", "Hangul")); SCRIPT_NAMES.freeze()306 SCRIPT_NAMES.freeze(); 307 } 308 getScriptNames(String script)309 private static Set<String> getScriptNames(String script) { 310 Set<String> result = SCRIPT_NAMES.get(script); 311 if (result != null) { 312 return result; 313 } 314 result = new HashSet<>(); 315 try { 316 String name = UScript.getName(UScript.getCodeFromName(script)); 317 result.add(name); 318 result.add(script); 319 } catch (Exception e) { 320 } 321 return result; 322 } 323 324 private enum SpecialDir { 325 transforms, 326 collation, 327 casing 328 } 329 330 private static final Relation<SpecialDir, String> SPECIAL_FILES = 331 Relation.of(new EnumMap(SpecialDir.class), HashSet.class); 332 333 static { 334 for (SpecialDir dir : SpecialDir.values()) { 335 File realDir = new File(CLDR_BASE_DIRECTORY + "/common/" + dir); 336 for (String s : realDir.list()) { 337 if (s.endsWith(".xml")) { 338 s = s.substring(0, s.length() - 4); 339 } SPECIAL_FILES.put(dir, s)340 SPECIAL_FILES.put(dir, s); 341 } 342 } 343 } 344 hasFile(SpecialDir type, String filename)345 private static boolean hasFile(SpecialDir type, String filename) { 346 return SPECIAL_FILES.get(type).contains(filename); 347 } 348 } 349