1 // © 2020 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu.localedistance;
4 
5 import static com.google.common.base.Preconditions.checkArgument;
6 import static com.google.common.truth.Truth.assertThat;
7 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.DEPRECATED;
8 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.LEGACY;
9 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.MACRO;
10 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.LANGUAGE;
11 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.TERRITORY;
12 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.alias;
13 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.cldrData;
14 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.deprecatedTerritory;
15 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.languageMatch;
16 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.likelySubtag;
17 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.matchVariable;
18 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.paradigms;
19 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.territoryGroup;
20 import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.territoryGrouping;
21 import static org.unicode.icu.tool.cldrtoicu.testing.IcuDataSubjectFactory.assertThat;
22 
23 import java.io.ByteArrayOutputStream;
24 import java.util.List;
25 
26 import org.junit.Test;
27 import org.unicode.cldr.api.CldrData;
28 import org.unicode.icu.tool.cldrtoicu.IcuData;
29 import org.unicode.icu.tool.cldrtoicu.RbPath;
30 import org.unicode.icu.tool.cldrtoicu.RbValue;
31 
32 import com.google.common.base.CharMatcher;
33 import com.google.common.collect.ImmutableList;
34 import com.google.common.collect.ImmutableMap;
35 import com.google.common.collect.ImmutableSetMultimap;
36 import com.ibm.icu.impl.locale.LSR;
37 import com.ibm.icu.util.BytesTrie;
38 
39 /**
40  * Higher level tests for {@link LocaleDistanceMapper} to demonstrate that CLDR values
41  * are matched and processed, and the IcuData is written as expected.
42  *
43  * <p>Most of the separate parts which make up this mapper are already tested at a
44  * lower level in the other tests in this package.
45  */
46 public class LocaleDistanceMapperTest {
47     @Test
testEndToEnd()48     public void testEndToEnd() {
49         // Language match elements are ordered, so need an incrementing sort index.
50         int idx = 0;
51 
52         // A representative subset of CLDR data needed to generate the locale distance.
53         // This focuses on two distinct cases:
54         // 1: American vs non-American and British English
55         //    This demonstrates the way that special case mappings are handled.
56         // 2: Chinese, Simplified and Traditional
57         //    This demonstrates languages with multiple scripts.
58         CldrData testData = cldrData(
59                 paradigms("en", "en_GB", "es", "es_419"),
60                 matchVariable("$enUS", "PR+US+VI"),
61                 matchVariable("$cnsar", "HK+MO"),
62 
63                 // The <languageMatch> element is marked "ORDERED" in the DTD, so
64                 // ordering of match rules can can affect output (when paths are
65                 // otherwise equal). DTD ordering will not re-order this data.
66                 languageMatch("yue", "zh", 10, true, ++idx),
67                 languageMatch("*", "*", 80, false, ++idx),
68 
69                 languageMatch("zh_Hans", "zh_Hant", 15, true, ++idx),
70                 languageMatch("zh_Hant", "zh_Hans", 19, true, ++idx),
71                 languageMatch("zh_Latn", "zh_Hans", 20, true, ++idx),
72                 languageMatch("*_*", "*_*", 50, false, ++idx),
73 
74                 languageMatch("en_*_$enUS", "en_*_$enUS", 4, false, ++idx),
75                 languageMatch("en_*_$!enUS", "en_*_GB", 3, false, ++idx),
76                 languageMatch("en_*_$!enUS", "en_*_$!enUS", 4, false, ++idx),
77                 languageMatch("en_*_*", "en_*_*", 5, false, ++idx),
78 
79                 languageMatch("zh_Hant_$cnsar", "zh_Hant_$cnsar", 4, false, ++idx),
80                 languageMatch("zh_Hant_$!cnsar", "zh_Hant_$!cnsar", 4, false, ++idx),
81                 languageMatch("zh_Hant_*", "zh_Hant_*", 5, false, ++idx),
82                 languageMatch("*_*_*", "*_*_*", 4, false, ++idx),
83 
84                 // NOTE: This is deliberately NOT in DTD order to demonstrate that the
85                 // mapper will reorder these (putting "und" last) which means that the
86                 // ICU data here is NOT affected by changes in the likely subtag order).
87                 likelySubtag("und", "en_Latn_US"),
88                 likelySubtag("und_HK", "zh_Hant_HK"),
89                 likelySubtag("und_MO", "zh_Hant_MO"),
90                 likelySubtag("und_TW", "zh_Hant_TW"),
91                 likelySubtag("und_030", "zh_Hans_CN"),
92                 likelySubtag("und_142", "zh_Hans_CN"),
93                 likelySubtag("und_CN", "zh_Hans_CN"),
94                 likelySubtag("und_Hans", "zh_Hans_CN"),
95                 likelySubtag("und_Hant", "zh_Hant_TW"),
96                 likelySubtag("zh", "zh_Hans_CN"),
97                 likelySubtag("zh_Hant", "zh_Hant_TW"),
98                 likelySubtag("zh_TW", "zh_Hant_TW"),
99 
100                 // NOT in DTD order (to demonstrate order invariance later).
101                 alias(LANGUAGE, LEGACY, "zh_SG", "zh_Hans_SG"),
102                 alias(LANGUAGE, LEGACY, "zh_HK", "zh_Hant_HK"),
103                 alias(LANGUAGE, LEGACY, "zh_TW", "zh_Hant_TW"),
104                 alias(LANGUAGE, LEGACY, "zh_MO", "zh_Hant_MO"),
105                 alias(LANGUAGE, LEGACY, "zh_CN", "zh_Hans_CN"),
106                 alias(LANGUAGE, MACRO, "cmn", "zh"),
107 
108                 // NOT in DTD order (to demonstrate order invariance later).
109                 alias(TERRITORY, DEPRECATED, "UK", "GB"),
110                 alias(TERRITORY, DEPRECATED, "AN", "CW", "SX", "BQ"),
111 
112                 // Rather trimmed down containment hierarchy. It still retains macro
113                 // regions and grouping to demonstrate that these work as expected.
114                 territoryGroup("001", "019", "142", "150"),          // World
115                 territoryGrouping("001", "EU"),
116                 territoryGroup("019", "021", "419"),                 // Americas
117                 territoryGroup("142", "030", "035"),                 // Asia
118                 territoryGroup("150", "154", "155"),                 // Europe
119                 territoryGrouping("EU", "DE", "FR", "IE"),           // European Union (no CH or GB)
120                 territoryGroup("021", "CA", "PM", "US"),             // Northern America
121                 territoryGroup("419", "013", "029"),                 // Latin America and the Caribbean
122                 territoryGroup("030", "CN", "HK", "MO", "TW"),       // Eastern Asia
123                 territoryGroup("035", "PH", "SG", "TH", "VN"),       // South-Eastern Asia
124                 territoryGroup("154", "GB", "IE"),                   // Northern Europe
125                 territoryGroup("155", "CH", "DE", "FR"),             // Western Europe
126                 territoryGroup("013", "CR", "MX", "PA"),             // Central America
127                 territoryGroup("029", "BQ", "CW", "PR", "SX", "VI"), // Caribbean
128                 deprecatedTerritory("029", "AN"));                   // Antilles (=> BQ, CW, SX)
129 
130         IcuData icuData = LocaleDistanceMapper.process(testData);
131         // Aliases come in (deprecated, replacement) pairs.
132         assertThat(icuData).hasValuesFor("likely/languageAliases", "cmn", "zh");
133         assertThat(icuData).hasValuesFor("likely/regionAliases", "AN", "CW", "UK", "GB");
134 
135         // LSR values come in (language, script, region) tuples. They are the mapped-to
136         // values for the likely subtag mappings, ordered by the DTD order in which the
137         // mapping keys were encountered.
138         assertThat(icuData).hasValuesFor("likely/lsrs",
139                 "", "", "",
140                 "skip", "script", "",
141                 "zh", "Hans", "CN",
142                 "zh", "Hant", "TW",
143                 "en", "Latn", "US",
144                 "zh", "Hant", "HK",
145                 "zh", "Hant", "MO");
146 
147         // It's a bit easier to see how match keys are grouped against the partitions.
148         ImmutableSetMultimap<Integer, String> likelyTrie =
149                 getTrieMap(icuData, "likely/trie:bin", "*").asMultimap().inverse();
150 
151         // Special values in the lookup table don't map from any locales directly.
152         assertThat(likelyTrie).valuesForKey(0).isEmpty();
153         assertThat(likelyTrie).valuesForKey(1).isEmpty();
154 
155         // Index 4: en-Latn-US (the general default and default for Latn).
156         assertThat(likelyTrie).valuesForKey(4).containsExactly("*-Latn-*", "*-Latn-US", "*-*-*");
157 
158         // Index 2: zh-Hans-CN (default for zh, Hans and CN separately).
159         assertThat(likelyTrie).valuesForKey(2).containsExactly(
160                 "*-*-030", "*-*-142",               // macro regions
161                 "*-*-CN", "*-Hans-*", "*-Hans-CN",  // unknown language match
162                 "cmn-*-*",                          // language alias
163                 "zh-*-*");                          // default for language
164 
165         // Index 2: zh-Hant-TW (default for zh if Hant or TW is given).
166         assertThat(likelyTrie).valuesForKey(3).containsExactly(
167                 "*-*-TW", "*-Hant-*", "*-Hant-TW",  // unknown language match
168                 "cmn-*-TW", "cmn-Hant",             // language alias with specific script/region
169                 "zh-*-TW", "zh-Hant");              // default for script/region
170 
171         // Other zh languages (zh-Hant-HK, zh-Hant-MO) require an explicit region match.
172         assertThat(likelyTrie).valuesForKey(5).containsExactly("*-*-HK", "*-Hant-HK");
173         assertThat(likelyTrie).valuesForKey(6).containsExactly("*-*-MO", "*-Hant-MO");
174 
175         // Pairs of expanded paradigm locales (using LSR tuples) in declaration order.
176         // This is just the list from the CLDR data with no processing.
177         assertThat(icuData).hasValuesFor("match/paradigms",
178                 "en", "Latn", "US",
179                 "en", "Latn", "GB",
180                 "es", "Latn", "ES",
181                 "es", "Latn", "419");
182 
183         // See PartitionInfoTest for a description of the ordering of these strings.
184         assertThat(icuData).hasValuesFor("match/partitions",
185                 ".", "0", "1", "2", "3", "0123", "03", "02", "01");
186 
187         ImmutableMap<String, Integer> matchTrie = getTrieMap(icuData, "match/trie:bin", "*-*");
188         byte[] regionLookup = getBytes(icuData, "match/regionToPartitions:bin");
189         ImmutableList<String> partitions =
190                 icuData.get(RbPath.parse("match/partitions")).get(0).getElements();
191 
192         // Test defaults have been trimmed.
193         assertThat(matchTrie).doesNotContainKey("*-*");
194         assertThat(matchTrie).doesNotContainKey("*-*-*-*");
195         assertThat(matchTrie).doesNotContainKey("*-*-*-*-*-*");
196 
197         // Some zh specific tests.
198         assertThat(matchTrie).containsEntry("yue-zh", 10);  // Encapsulated language
199         assertThat(matchTrie).containsEntry("zh-zh-Hant-Hant-*-*", 5);
200 
201         // Special marker that means "en-en" matches don't use script information.
202         // This is assumed in the distance tests below, so it's important to check.
203         assertThat(matchTrie).containsEntry("en-en", 128);
204 
205         // British English is a slightly better match against non-American English.
206         assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "GB", 3);
207         assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "GB", 3);
208         // "EU" works here because while it's a macro region, in this data it only
209         // covers a single partition.
210         assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "EU", 3);
211 
212         // Pairs of non-American or American English languages get a larger distance.
213         assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "DE", 4);
214         assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "US", "PR", 4);
215         // Deprecated regions (AN) are still mapped to partitions and get real distances.
216         assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "AN", "TW", 4);
217 
218         // Mixing American and non-American English gets the default "en-en-*-*" distance.
219         assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "US", 5);
220         assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "US", 5);
221         assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "US", "AN", 5);
222 
223         // Default distances for language, script and region, plus minimum region distance.
224         // Minimum region distance is "en_*_$!enUS" -> "en_*_GB" (as seen above).
225         assertThat(icuData).hasValuesFor("match/distances:intvector", "80", "50", "4", "3");
226     }
227 
228     // Helper to make assertions about language distance a bit more readable.
229     // PartitionInfoTest includes more low level tests for precise ordering etc.
assertEnDistanceForRegions( ImmutableMap<String, Integer> matchTrie, byte[] regionLookup, ImmutableList<String> paritions, String regionA, String regionB, int distance)230     private static void assertEnDistanceForRegions(
231             ImmutableMap<String, Integer> matchTrie,
232             byte[] regionLookup,
233             ImmutableList<String> paritions,
234             String regionA, String regionB,
235             int distance) {
236         // Three step lookup for each region:
237         // 1: Find LSR index from region string.
238         // 2: Lookup partition group index from region lookup table.
239         // 3: Lookup partition group string from partitions table.
240         String partitionA = paritions.get(regionLookup[LSR.indexForRegion(regionA)]);
241         String partitionB = paritions.get(regionLookup[LSR.indexForRegion(regionB)]);
242 
243         // For now only support cases where there's a single partition ID associated
244         // with the region (this is all non-macro regions and *some* macro regions).
245         checkArgument(partitionA.length() == 1 && partitionB.length() == 1,
246                 "multiple partitions unsupported in test: %s %s", regionA, regionB);
247 
248         // This is a depth 2 key because we know that "en" skips scripts. This will
249         // not work the same for "zh" because that needs scripts information.
250         String key = String.format("en-en-%s-%s", partitionA, partitionB);
251         if (matchTrie.containsKey(key)) {
252             assertThat(matchTrie).containsEntry(key, distance);
253         } else {
254             assertThat(matchTrie).containsEntry("en-en-*-*", distance);
255         }
256     }
257 
258     // Returns the mapping for a Trie from a ":bin" suffixed resource value.
259     // "star" defines what the Trie wildcard should be expanded to (for readability).
getTrieMap(IcuData icuData, String path, String star)260     private static ImmutableMap<String, Integer> getTrieMap(IcuData icuData, String path, String star) {
261         return TestData.getTrieTable(getTrie(icuData, path), star, i -> i);
262     }
263 
264     // Reads a Trie from a ":bin" suffixed resource value.
getTrie(IcuData icuData, String path)265     private static BytesTrie getTrie(IcuData icuData, String path) {
266         return new BytesTrie(getBytes(icuData, path), 0);
267     }
268 
269     // Reads a byte array from a ":bin" suffixed resource value.
getBytes(IcuData icuData, String path)270     private static byte[] getBytes(IcuData icuData, String path) {
271         RbPath rbPath = RbPath.parse(path);
272         checkArgument(rbPath.isBinPath(), "only binary paths (:bin) should have binary data: %s", path);
273         List<RbValue> rbValues = icuData.get(rbPath);
274         checkArgument(rbValues != null, "missing value for: %s", rbPath);
275         checkArgument(rbValues.size() == 1, "expect single RbValue: %s", rbValues);
276         // Take a sequence of hex-strings, convert each to a byte[] and collect them.
277         return rbValues.get(0).getElements().stream()
278                 .map(LocaleDistanceMapperTest::decodeHex)
279                 .collect(
280                         ByteArrayOutputStream::new,
281                         (out, b) -> out.write(b, 0, b.length),
282                         (out, b) -> out.write(b.toByteArray(), 0, b.size()))
283                 .toByteArray();
284     }
285 
286     // Hex chars to byte array (2 chars per byte, little endian).
decodeHex(String s)287     private static byte[] decodeHex(String s) {
288         checkArgument(s.length() % 2 == 0, "binary hex strings must have an even length: %s", s);
289         checkArgument(HEX.matchesAllOf(s), "invalid binary hex string: %s", s);
290         byte[] bytes = new byte[s.length() / 2];
291         for (int n = 0; n < bytes.length; n++) {
292             bytes[n] = (byte) Integer.parseUnsignedInt(s.substring(2 * n, 2 * (n + 1)), 16);
293         }
294         return bytes;
295     }
296 
297     private static final CharMatcher HEX = CharMatcher.anyOf("0123456789abcdefABCDEF");
298 }
299