// © 2020 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.localedistance;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.truth.Truth.assertThat;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.DEPRECATED;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.LEGACY;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.MACRO;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.LANGUAGE;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.TERRITORY;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.alias;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.cldrData;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.deprecatedTerritory;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.languageMatch;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.likelySubtag;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.matchVariable;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.paradigms;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.territoryGroup;
import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.territoryGrouping;
import static org.unicode.icu.tool.cldrtoicu.testing.IcuDataSubjectFactory.assertThat;

import java.io.ByteArrayOutputStream;
import java.util.List;

import org.junit.Test;
import org.unicode.cldr.api.CldrData;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import org.unicode.icu.tool.cldrtoicu.RbValue;

import com.google.common.base.CharMatcher;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSetMultimap;
import com.ibm.icu.impl.locale.LSR;
import com.ibm.icu.util.BytesTrie;

/**
 * Higher level tests for {@link LocaleDistanceMapper} to demonstrate that CLDR values
 * are matched and processed, and the IcuData is written as expected.
 *
 * <p>Most of the separate parts which make up this mapper are already tested at a
 * lower level in the other tests in this package.
 */
public class LocaleDistanceMapperTest {
    @Test
    public void testEndToEnd() {
        // Language match elements are ordered, so need an incrementing sort index.
        int idx = 0;

        // A representative subset of CLDR data needed to generate the locale distance.
        // This focuses on two distinct cases:
        // 1: American vs non-American and British English
        //    This demonstrates the way that special case mappings are handled.
        // 2: Chinese, Simplified and Traditional
        //    This demonstrates languages with multiple scripts.
        CldrData testData = cldrData(
            paradigms("en", "en_GB", "es", "es_419"),
            matchVariable("$enUS", "PR+US+VI"),
            matchVariable("$cnsar", "HK+MO"),

            // The <languageMatch> element is marked "ORDERED" in the DTD, so
            // ordering of match rules can affect output (when paths are
            // otherwise equal). DTD ordering will not re-order this data.
            languageMatch("yue", "zh", 10, true, ++idx),
            languageMatch("*", "*", 80, false, ++idx),

            languageMatch("zh_Hans", "zh_Hant", 15, true, ++idx),
            languageMatch("zh_Hant", "zh_Hans", 19, true, ++idx),
            languageMatch("zh_Latn", "zh_Hans", 20, true, ++idx),
            languageMatch("*_*", "*_*", 50, false, ++idx),

            languageMatch("en_*_$enUS", "en_*_$enUS", 4, false, ++idx),
            languageMatch("en_*_$!enUS", "en_*_GB", 3, false, ++idx),
            languageMatch("en_*_$!enUS", "en_*_$!enUS", 4, false, ++idx),
            languageMatch("en_*_*", "en_*_*", 5, false, ++idx),

            languageMatch("zh_Hant_$cnsar", "zh_Hant_$cnsar", 4, false, ++idx),
            languageMatch("zh_Hant_$!cnsar", "zh_Hant_$!cnsar", 4, false, ++idx),
            languageMatch("zh_Hant_*", "zh_Hant_*", 5, false, ++idx),
            languageMatch("*_*_*", "*_*_*", 4, false, ++idx),

            // NOTE: This is deliberately NOT in DTD order to demonstrate that the
            // mapper will reorder these (putting "und" last) which means that the
            // ICU data here is NOT affected by changes in the likely subtag order.
            likelySubtag("und", "en_Latn_US"),
            likelySubtag("und_HK", "zh_Hant_HK"),
            likelySubtag("und_MO", "zh_Hant_MO"),
            likelySubtag("und_TW", "zh_Hant_TW"),
            likelySubtag("und_030", "zh_Hans_CN"),
            likelySubtag("und_142", "zh_Hans_CN"),
            likelySubtag("und_CN", "zh_Hans_CN"),
            likelySubtag("und_Hans", "zh_Hans_CN"),
            likelySubtag("und_Hant", "zh_Hant_TW"),
            likelySubtag("zh", "zh_Hans_CN"),
            likelySubtag("zh_Hant", "zh_Hant_TW"),
            likelySubtag("zh_TW", "zh_Hant_TW"),

            // NOT in DTD order (to demonstrate order invariance later).
            alias(LANGUAGE, LEGACY, "zh_SG", "zh_Hans_SG"),
            alias(LANGUAGE, LEGACY, "zh_HK", "zh_Hant_HK"),
            alias(LANGUAGE, LEGACY, "zh_TW", "zh_Hant_TW"),
            alias(LANGUAGE, LEGACY, "zh_MO", "zh_Hant_MO"),
            alias(LANGUAGE, LEGACY, "zh_CN", "zh_Hans_CN"),
            alias(LANGUAGE, MACRO, "cmn", "zh"),

            // NOT in DTD order (to demonstrate order invariance later).
            alias(TERRITORY, DEPRECATED, "UK", "GB"),
            alias(TERRITORY, DEPRECATED, "AN", "CW", "SX", "BQ"),

            // Rather trimmed down containment hierarchy. It still retains macro
            // regions and grouping to demonstrate that these work as expected.
            territoryGroup("001", "019", "142", "150"),           // World
            territoryGrouping("001", "EU"),
            territoryGroup("019", "021", "419"),                  // Americas
            territoryGroup("142", "030", "035"),                  // Asia
            territoryGroup("150", "154", "155"),                  // Europe
            territoryGrouping("EU", "DE", "FR", "IE"),            // European Union (no CH or GB)
            territoryGroup("021", "CA", "PM", "US"),              // Northern America
            territoryGroup("419", "013", "029"),                  // Latin America and the Caribbean
            territoryGroup("030", "CN", "HK", "MO", "TW"),        // Eastern Asia
            territoryGroup("035", "PH", "SG", "TH", "VN"),        // South-Eastern Asia
            territoryGroup("154", "GB", "IE"),                    // Northern Europe
            territoryGroup("155", "CH", "DE", "FR"),              // Western Europe
            territoryGroup("013", "CR", "MX", "PA"),              // Central America
            territoryGroup("029", "BQ", "CW", "PR", "SX", "VI"),  // Caribbean
            deprecatedTerritory("029", "AN"));                    // Antilles (=> BQ, CW, SX)

        IcuData icuData = LocaleDistanceMapper.process(testData);
        // Aliases come in (deprecated, replacement) pairs.
        assertThat(icuData).hasValuesFor("likely/languageAliases", "cmn", "zh");
        assertThat(icuData).hasValuesFor("likely/regionAliases", "AN", "CW", "UK", "GB");

        // LSR values come in (language, script, region) tuples. They are the mapped-to
        // values for the likely subtag mappings, ordered by the DTD order in which the
        // mapping keys were encountered.
        assertThat(icuData).hasValuesFor("likely/lsrs",
            "", "", "",
            "skip", "script", "",
            "zh", "Hans", "CN",
            "zh", "Hant", "TW",
            "en", "Latn", "US",
            "zh", "Hant", "HK",
            "zh", "Hant", "MO");

        // It's a bit easier to see how match keys are grouped against the partitions.
        ImmutableSetMultimap<Integer, String> likelyTrie =
            getTrieMap(icuData, "likely/trie:bin", "*").asMultimap().inverse();

        // Special values in the lookup table don't map from any locales directly.
        assertThat(likelyTrie).valuesForKey(0).isEmpty();
        assertThat(likelyTrie).valuesForKey(1).isEmpty();

        // Index 4: en-Latn-US (the general default and default for Latn).
        assertThat(likelyTrie).valuesForKey(4).containsExactly("*-Latn-*", "*-Latn-US", "*-*-*");

        // Index 2: zh-Hans-CN (default for zh, Hans and CN separately).
        assertThat(likelyTrie).valuesForKey(2).containsExactly(
            "*-*-030", "*-*-142",               // macro regions
            "*-*-CN", "*-Hans-*", "*-Hans-CN",  // unknown language match
            "cmn-*-*",                          // language alias
            "zh-*-*");                          // default for language

        // Index 3: zh-Hant-TW (default for zh if Hant or TW is given).
        assertThat(likelyTrie).valuesForKey(3).containsExactly(
            "*-*-TW", "*-Hant-*", "*-Hant-TW",  // unknown language match
            "cmn-*-TW", "cmn-Hant",             // language alias with specific script/region
            "zh-*-TW", "zh-Hant");              // default for script/region

        // Other zh languages (zh-Hant-HK, zh-Hant-MO) require an explicit region match.
        assertThat(likelyTrie).valuesForKey(5).containsExactly("*-*-HK", "*-Hant-HK");
        assertThat(likelyTrie).valuesForKey(6).containsExactly("*-*-MO", "*-Hant-MO");

        // Pairs of expanded paradigm locales (using LSR tuples) in declaration order.
        // This is just the list from the CLDR data with no processing.
        assertThat(icuData).hasValuesFor("match/paradigms",
            "en", "Latn", "US",
            "en", "Latn", "GB",
            "es", "Latn", "ES",
            "es", "Latn", "419");

        // See PartitionInfoTest for a description of the ordering of these strings.
        assertThat(icuData).hasValuesFor("match/partitions",
            ".", "0", "1", "2", "3", "0123", "03", "02", "01");

        ImmutableMap<String, Integer> matchTrie = getTrieMap(icuData, "match/trie:bin", "*-*");
        byte[] regionLookup = getBytes(icuData, "match/regionToPartitions:bin");
        ImmutableList<String> partitions =
            icuData.get(RbPath.parse("match/partitions")).get(0).getElements();

        // Test defaults have been trimmed.
        assertThat(matchTrie).doesNotContainKey("*-*");
        assertThat(matchTrie).doesNotContainKey("*-*-*-*");
        assertThat(matchTrie).doesNotContainKey("*-*-*-*-*-*");

        // Some zh specific tests.
        assertThat(matchTrie).containsEntry("yue-zh", 10);  // Encapsulated language
        assertThat(matchTrie).containsEntry("zh-zh-Hant-Hant-*-*", 5);

        // Special marker that means "en-en" matches don't use script information.
        // This is assumed in the distance tests below, so it's important to check.
        assertThat(matchTrie).containsEntry("en-en", 128);

        // British English is a slightly better match against non-American English.
        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "GB", 3);
        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "GB", 3);
        // "EU" works here because while it's a macro region, in this data it only
        // covers a single partition.
        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "EU", 3);

        // Pairs of non-American or American English languages get a larger distance.
        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "DE", 4);
        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "US", "PR", 4);
        // Deprecated regions (AN) are still mapped to partitions and get real distances.
        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "AN", "TW", 4);

        // Mixing American and non-American English gets the default "en-en-*-*" distance.
        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "US", 5);
        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "US", 5);
        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "US", "AN", 5);

        // Default distances for language, script and region, plus minimum region distance.
        // Minimum region distance is "en_*_$!enUS" -> "en_*_GB" (as seen above).
        assertThat(icuData).hasValuesFor("match/distances:intvector", "80", "50", "4", "3");
    }

    // Helper to make assertions about language distance a bit more readable.
    // PartitionInfoTest includes more low level tests for precise ordering etc.
    //
    // Asserts that the "en-en-<partitionA>-<partitionB>" entry in the match trie has
    // the given distance, falling back to the "en-en-*-*" entry when the specific
    // key is absent. Throws IllegalArgumentException (via checkArgument) if either
    // region resolves to a partition string that is not exactly one character long.
    private static void assertEnDistanceForRegions(
        ImmutableMap<String, Integer> matchTrie,
        byte[] regionLookup,
        ImmutableList<String> partitions,
        String regionA, String regionB,
        int distance) {
        // Three step lookup for each region:
        // 1: Find LSR index from region string.
        // 2: Lookup partition group index from region lookup table.
        // 3: Lookup partition group string from partitions table.
        String partitionA = partitions.get(regionLookup[LSR.indexForRegion(regionA)]);
        String partitionB = partitions.get(regionLookup[LSR.indexForRegion(regionB)]);

        // For now only support cases where there's a single partition ID associated
        // with the region (this is all non-macro regions and *some* macro regions).
        checkArgument(partitionA.length() == 1 && partitionB.length() == 1,
            "multiple partitions unsupported in test: %s %s", regionA, regionB);

        // This is a depth 2 key because we know that "en" skips scripts. This will
        // not work the same for "zh" because that needs scripts information.
        String key = String.format("en-en-%s-%s", partitionA, partitionB);
        if (matchTrie.containsKey(key)) {
            assertThat(matchTrie).containsEntry(key, distance);
        } else {
            assertThat(matchTrie).containsEntry("en-en-*-*", distance);
        }
    }

    // Returns the mapping for a Trie from a ":bin" suffixed resource value.
    // "star" defines what the Trie wildcard should be expanded to (for readability).
    private static ImmutableMap<String, Integer> getTrieMap(IcuData icuData, String path, String star) {
        return TestData.getTrieTable(getTrie(icuData, path), star, i -> i);
    }

    // Reads a Trie from a ":bin" suffixed resource value.
    private static BytesTrie getTrie(IcuData icuData, String path) {
        return new BytesTrie(getBytes(icuData, path), 0);
    }

    // Reads a byte array from a ":bin" suffixed resource value.
    // Throws IllegalArgumentException (via checkArgument) if the path is not a binary
    // path, if there is no value for it, or if it does not hold exactly one RbValue.
    private static byte[] getBytes(IcuData icuData, String path) {
        RbPath rbPath = RbPath.parse(path);
        checkArgument(rbPath.isBinPath(), "only binary paths (:bin) should have binary data: %s", path);
        List<RbValue> rbValues = icuData.get(rbPath);
        checkArgument(rbValues != null, "missing value for: %s", rbPath);
        checkArgument(rbValues.size() == 1, "expect single RbValue: %s", rbValues);
        // Take a sequence of hex-strings, convert each to a byte[] and collect them.
        return rbValues.get(0).getElements().stream()
            .map(LocaleDistanceMapperTest::decodeHex)
            .collect(
                ByteArrayOutputStream::new,
                (out, b) -> out.write(b, 0, b.length),
                (out, b) -> out.write(b.toByteArray(), 0, b.size()))
            .toByteArray();
    }

    // Hex chars to byte array (2 chars per byte, most significant nibble first; the
    // bytes are produced in the same order as their hex pairs appear in the string).
    // Throws IllegalArgumentException (via checkArgument) for odd-length input or
    // input containing non-hex characters.
    private static byte[] decodeHex(String s) {
        checkArgument(s.length() % 2 == 0, "binary hex strings must have an even length: %s", s);
        checkArgument(HEX.matchesAllOf(s), "invalid binary hex string: %s", s);
        byte[] bytes = new byte[s.length() / 2];
        for (int n = 0; n < bytes.length; n++) {
            bytes[n] = (byte) Integer.parseUnsignedInt(s.substring(2 * n, 2 * (n + 1)), 16);
        }
        return bytes;
    }

    // Matches the characters accepted in a binary hex string (either case).
    private static final CharMatcher HEX = CharMatcher.anyOf("0123456789abcdefABCDEF");
}