1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/transform/map/normalization-map.h"
16
17 #include <cstdint>
18 #include "icing/legacy/core/icing-packed-pod.h"
19
20 namespace icing {
21 namespace lib {
22
23 namespace {
// One entry of the normalization table: the UTF-16 code unit to look for
// ('from') and the code unit that replaces it ('to').
//
// Both members are single char16_t values, so every mapped character must fit
// in 2 bytes. The struct is packed, which keeps each entry at exactly two
// code units with no alignment padding.
struct __attribute__((packed)) NormalizationPair {
  char16_t from;  // Source character.
  char16_t to;    // Normalized replacement for 'from'.
};
30
31 // The following mappings contain multiple categories:
32 // 1. Hiragana -> Katakana, listed in the order of Hiragana chart rows.
33 // All regular and small Hiragana characters are mapped to Katakana. Note
34 // that half-width Katakana characters are not handled here.
35 // 2. Common full-width characters -> ASCII characters.
36 // Full-width characters in the Unicode range of [0xff01, 0xff5e] are mapped
37 // to the corresponding ASCII forms.
38 // 3. Common ideographic punctuation marks -> ASCII characters.
39 // Ideographic characters are in the Unicode range of [0x3000, 0x303f]. Here
40 // we list two that are frequently used in CJK and can be converted to ASCII.
41 // 4. Common diacritic Latin characters -> ASCII characters.
42 // We list most diacritic Latin characters within the Unicode range of
43 // [0x00c0, 0x017e], some from [0x01a0, 0x021b], and most from [0x1e00,
44 // 0x1ef9].
45 //
46 // All the characters can be stored in a single UTF16 code unit, so we use
47 // char16_t to store them. Size of the following array is about 2.5KiB.
48 constexpr NormalizationPair kNormalizationMappings[] = {
49 // Part 1: Hiragana -> Katakana
50 // 'a' row
51 {0x3042, 0x30a2}, // Hiragana letter A -> Katakana letter A
52 {0x3044, 0x30a4}, // Hiragana letter I -> Katakana letter I
53 {0x3046, 0x30a6}, // Hiragana letter U -> Katakana letter U
54 {0x3048, 0x30a8}, // Hiragana letter E -> Katakana letter E
55 {0x304a, 0x30aa}, // Hiragana letter O -> Katakana letter O
56 {0x3041, 0x30a2}, // Hiragana letter small A -> Katakana letter A
57 {0x3043, 0x30a4}, // Hiragana letter small I -> Katakana letter I
58 {0x3045, 0x30a6}, // Hiragana letter small U -> Katakana letter U
59 {0x3047, 0x30a8}, // Hiragana letter small E -> Katakana letter E
60 {0x3049, 0x30aa}, // Hiragana letter small O -> Katakana letter O
61 // 'ka' row
62 {0x304b, 0x30ab}, // Hiragana letter KA -> Katakana letter KA
63 {0x304d, 0x30ad}, // Hiragana letter KI -> Katakana letter KI
64 {0x304f, 0x30af}, // Hiragana letter KU -> Katakana letter KU
65 {0x3051, 0x30b1}, // Hiragana letter KE -> Katakana letter KE
66 {0x3053, 0x30b3}, // Hiragana letter KO -> Katakana letter KO
67 {0x3095, 0x30ab}, // Hiragana letter small KA -> Katakana letter KA
68 {0x3096, 0x30b1}, // Hiragana letter small KE -> Katakana letter KE
69 // 'sa' row
70 {0x3055, 0x30b5}, // Hiragana letter SA -> Katakana letter SA
71 {0x3057, 0x30b7}, // Hiragana letter SI -> Katakana letter SI
72 {0x3059, 0x30b9}, // Hiragana letter SU -> Katakana letter SU
73 {0x305b, 0x30bb}, // Hiragana letter SE -> Katakana letter SE
74 {0x305d, 0x30bd}, // Hiragana letter SO -> Katakana letter SO
75 // 'ta' row
76 {0x305f, 0x30bf}, // Hiragana letter TA -> Katakana letter TA
77 {0x3061, 0x30c1}, // Hiragana letter TI -> Katakana letter TI
78 {0x3063, 0x30c4}, // Hiragana letter small TU -> Katakana letter TU
79 {0x3064, 0x30c4}, // Hiragana letter TU -> Katakana letter TU
80 {0x3066, 0x30c6}, // Hiragana letter TE -> Katakana letter TE
81 {0x3068, 0x30c8}, // Hiragana letter TO -> Katakana letter TO
82 // 'na' row
83 {0x306a, 0x30ca}, // Hiragana letter NA -> Katakana letter NA
84 {0x306b, 0x30cb}, // Hiragana letter NI -> Katakana letter NI
85 {0x306c, 0x30cc}, // Hiragana letter NU -> Katakana letter NU
86 {0x306d, 0x30cd}, // Hiragana letter NE -> Katakana letter NE
87 {0x306e, 0x30ce}, // Hiragana letter NO -> Katakana letter NO
88 // 'ha' row
89 {0x306f, 0x30cf}, // Hiragana letter HA -> Katakana letter HA
90 {0x3072, 0x30d2}, // Hiragana letter HI -> Katakana letter HI
91 {0x3075, 0x30d5}, // Hiragana letter HU -> Katakana letter HU
92 {0x3078, 0x30d8}, // Hiragana letter HE -> Katakana letter HE
93 {0x307b, 0x30db}, // Hiragana letter HO -> Katakana letter HO
94 // 'ma' row
95 {0x307e, 0x30de}, // Hiragana letter MA -> Katakana letter MA
96 {0x307f, 0x30df}, // Hiragana letter MI -> Katakana letter MI
97 {0x3080, 0x30e0}, // Hiragana letter MU -> Katakana letter MU
98 {0x3081, 0x30e1}, // Hiragana letter ME -> Katakana letter ME
99 {0x3082, 0x30e2}, // Hiragana letter MO -> Katakana letter MO
100 // 'ya' row
101 {0x3083, 0x30e4}, // Hiragana letter small YA -> Katakana letter YA
102 {0x3084, 0x30e4}, // Hiragana letter YA -> Katakana letter YA
103 {0x3085, 0x30e6}, // Hiragana letter small YU -> Katakana letter YU
104 {0x3086, 0x30e6}, // Hiragana letter YU -> Katakana letter YU
105 {0x3087, 0x30e8}, // Hiragana letter small YO -> Katakana letter YO
106 {0x3088, 0x30e8}, // Hiragana letter YO -> Katakana letter YO
107 // 'ra' row
108 {0x3089, 0x30e9}, // Hiragana letter RA -> Katakana letter RA
109 {0x308a, 0x30ea}, // Hiragana letter RI -> Katakana letter RI
110 {0x308b, 0x30eb}, // Hiragana letter RU -> Katakana letter RU
111 {0x308c, 0x30ec}, // Hiragana letter RE -> Katakana letter RE
112 {0x308d, 0x30ed}, // Hiragana letter RO -> Katakana letter RO
113 // 'wa' row
114 {0x308e, 0x30ef}, // Hiragana letter small WA -> Katakana letter WA
115 {0x308f, 0x30ef}, // Hiragana letter WA -> Katakana letter WA
116 {0x3090, 0x30f0}, // Hiragana letter WI -> Katakana letter WI
117 {0x3091, 0x30f1}, // Hiragana letter WE -> Katakana letter WE
118 {0x3092, 0x30f2}, // Hiragana letter WO -> Katakana letter WO
119 // 'n'
120 {0x3093, 0x30f3}, // Hiragana letter N -> Katakana letter N
121 // 'ga' row
122 {0x304c, 0x30ac}, // Hiragana letter GA -> Katakana letter GA
123 {0x304e, 0x30ae}, // Hiragana letter GI -> Katakana letter GI
124 {0x3050, 0x30b0}, // Hiragana letter GU -> Katakana letter GU
125 {0x3052, 0x30b2}, // Hiragana letter GE -> Katakana letter GE
126 {0x3054, 0x30b4}, // Hiragana letter GO -> Katakana letter GO
127 // 'za' row
128 {0x3056, 0x30b6}, // Hiragana letter ZA -> Katakana letter ZA
129 {0x3058, 0x30b8}, // Hiragana letter ZI -> Katakana letter ZI
130 {0x305a, 0x30ba}, // Hiragana letter ZU -> Katakana letter ZU
131 {0x305c, 0x30bc}, // Hiragana letter ZE -> Katakana letter ZE
132 {0x305e, 0x30be}, // Hiragana letter ZO -> Katakana letter ZO
133 // 'da' row
134 {0x3060, 0x30c0}, // Hiragana letter DA -> Katakana letter DA
135 {0x3062, 0x30c2}, // Hiragana letter DI -> Katakana letter DI
136 {0x3065, 0x30c5}, // Hiragana letter DU -> Katakana letter DU
137 {0x3067, 0x30c7}, // Hiragana letter DE -> Katakana letter DE
138 {0x3069, 0x30c9}, // Hiragana letter DO -> Katakana letter DO
139 // 'ba' row
140 {0x3070, 0x30d0}, // Hiragana letter BA -> Katakana letter BA
141 {0x3073, 0x30d3}, // Hiragana letter BI -> Katakana letter BI
142 {0x3076, 0x30d6}, // Hiragana letter BU -> Katakana letter BU
143 {0x3079, 0x30d9}, // Hiragana letter BE -> Katakana letter BE
144 {0x307c, 0x30dc}, // Hiragana letter BO -> Katakana letter BO
145 // 'pa' row
146 {0x3071, 0x30d1}, // Hiragana letter PA -> Katakana letter PA
147 {0x3074, 0x30d4}, // Hiragana letter PI -> Katakana letter PI
148 {0x3077, 0x30d7}, // Hiragana letter PU -> Katakana letter PU
149 {0x307a, 0x30da}, // Hiragana letter PE -> Katakana letter PE
150 {0x307d, 0x30dd}, // Hiragana letter PO -> Katakana letter PO
151 // Additional Hiragana
152 {0x3094, 0x30f4}, // Hiragana letter VU -> Katakana letter VU
153 // Part 2: Common full-width characters -> ASCII characters.
154 {0xff01, 33}, // ASCII !
155 {0xff02, 34}, // ASCII "
156 {0xff03, 35}, // ASCII #
157 {0xff04, 36}, // ASCII $
158 {0xff05, 37}, // ASCII %
159 {0xff06, 38}, // ASCII &
160 {0xff07, 39}, // ASCII '
161 {0xff08, 40}, // ASCII (
162 {0xff09, 41}, // ASCII )
163 {0xff0a, 42}, // ASCII *
164 {0xff0b, 43}, // ASCII +
165 {0xff0c, 44}, // ASCII ,
166 {0xff0d, 45}, // ASCII -
167 {0xff0e, 46}, // ASCII .
168 {0xff0f, 47}, // ASCII /
169 {0xff10, 48}, // ASCII 0
170 {0xff11, 49}, // ASCII 1
171 {0xff12, 50}, // ASCII 2
172 {0xff13, 51}, // ASCII 3
173 {0xff14, 52}, // ASCII 4
174 {0xff15, 53}, // ASCII 5
175 {0xff16, 54}, // ASCII 6
176 {0xff17, 55}, // ASCII 7
177 {0xff18, 56}, // ASCII 8
178 {0xff19, 57}, // ASCII 9
179 {0xff1a, 58}, // ASCII :
180 {0xff1b, 59}, // ASCII ;
181 {0xff1c, 60}, // ASCII <
182 {0xff1d, 61}, // ASCII =
183 {0xff1e, 62}, // ASCII >
184 {0xff1f, 63}, // ASCII ?
185 {0xff20, 64}, // ASCII @
186 {0xff21, 65}, // ASCII A
187 {0xff22, 66}, // ASCII B
188 {0xff23, 67}, // ASCII C
189 {0xff24, 68}, // ASCII D
190 {0xff25, 69}, // ASCII E
191 {0xff26, 70}, // ASCII F
192 {0xff27, 71}, // ASCII G
193 {0xff28, 72}, // ASCII H
194 {0xff29, 73}, // ASCII I
195 {0xff2a, 74}, // ASCII J
196 {0xff2b, 75}, // ASCII K
197 {0xff2c, 76}, // ASCII L
198 {0xff2d, 77}, // ASCII M
199 {0xff2e, 78}, // ASCII N
200 {0xff2f, 79}, // ASCII O
201 {0xff30, 80}, // ASCII P
202 {0xff31, 81}, // ASCII Q
203 {0xff32, 82}, // ASCII R
204 {0xff33, 83}, // ASCII S
205 {0xff34, 84}, // ASCII T
206 {0xff35, 85}, // ASCII U
207 {0xff36, 86}, // ASCII V
208 {0xff37, 87}, // ASCII W
209 {0xff38, 88}, // ASCII X
210 {0xff39, 89}, // ASCII Y
211 {0xff3a, 90}, // ASCII Z
212 {0xff3b, 91}, // ASCII [
213 {0xff3c, 92}, // ASCII forward slash
214 {0xff3d, 93}, // ASCII ]
215 {0xff3e, 94}, // ASCII ^
216 {0xff3f, 95}, // ASCII _
217 {0xff40, 96}, // ASCII `
218 {0xff41, 97}, // ASCII a
219 {0xff42, 98}, // ASCII b
220 {0xff43, 99}, // ASCII c
221 {0xff44, 100}, // ASCII d
222 {0xff45, 101}, // ASCII e
223 {0xff46, 102}, // ASCII f
224 {0xff47, 103}, // ASCII g
225 {0xff48, 104}, // ASCII h
226 {0xff49, 105}, // ASCII i
227 {0xff4a, 106}, // ASCII j
228 {0xff4b, 107}, // ASCII k
229 {0xff4c, 108}, // ASCII l
230 {0xff4d, 109}, // ASCII m
231 {0xff4e, 110}, // ASCII n
232 {0xff4f, 111}, // ASCII o
233 {0xff50, 112}, // ASCII p
234 {0xff51, 113}, // ASCII q
235 {0xff52, 114}, // ASCII r
236 {0xff53, 115}, // ASCII s
237 {0xff54, 116}, // ASCII t
238 {0xff55, 117}, // ASCII u
239 {0xff56, 118}, // ASCII v
240 {0xff57, 119}, // ASCII w
241 {0xff58, 120}, // ASCII x
242 {0xff59, 121}, // ASCII y
243 {0xff5a, 122}, // ASCII z
244 {0xff5b, 123}, // ASCII {
245 {0xff5c, 124}, // ASCII |
246 {0xff5d, 125}, // ASCII }
247 {0xff5e, 126}, // ASCII ~
248 {0x2018, 39}, // Left single quote -> ASCII apostrophe
249 {0x2019, 39}, // Right single quote -> ASCII apostrophe
250 {0x201c, 34}, // Left double quote -> ASCII quote
251 {0x201d, 34}, // Right double quote -> ASCII quote
252 // Part 3: Common ideographic punctuation marks -> ASCII.
253 // Usually used in CJK.
254 {0x3001, 44}, // ASCII ,
255 {0x3002, 46}, // ASCII .
256 // Part 4: Common diacritic Latin characters -> ASCII characters.
257 {0x00c0, 65}, // À -> A
258 {0x00c1, 65}, // Á -> A
259 {0x00c2, 65}, // Â -> A
260 {0x00c3, 65}, // Ã -> A
261 {0x00c4, 65}, // Ä -> A
262 {0x00c5, 65}, // Å -> A
263 {0x00c7, 67}, // Ç -> C
264 {0x00c8, 69}, // È -> E
265 {0x00c9, 69}, // É -> E
266 {0x00ca, 69}, // Ê -> E
267 {0x00cb, 69}, // Ë -> E
268 {0x00cc, 73}, // Ì -> I
269 {0x00cd, 73}, // Í -> I
270 {0x00ce, 73}, // Î -> I
271 {0x00cf, 73}, // Ï -> I
272 {0x00d0, 68}, // Ð -> D
273 {0x00d1, 78}, // Ñ -> N
274 {0x00d2, 79}, // Ò -> O
275 {0x00d3, 79}, // Ó -> O
276 {0x00d4, 79}, // Ô -> O
277 {0x00d5, 79}, // Õ -> O
278 {0x00d6, 79}, // Ö -> O
279 {0x00d8, 79}, // Ø -> O
280 {0x00d9, 85}, // Ù -> U
281 {0x00da, 85}, // Ú -> U
282 {0x00db, 85}, // Û -> U
283 {0x00dc, 85}, // Ü -> U
284 {0x00dd, 89}, // Ý -> Y
285 {0x00e0, 97}, // à -> a
286 {0x00e1, 97}, // á -> a
287 {0x00e2, 97}, // â -> a
288 {0x00e3, 97}, // ã -> a
289 {0x00e4, 97}, // ä -> a
290 {0x00e5, 97}, // å -> a
291 {0x00e7, 99}, // ç -> c
292 {0x00e8, 101}, // è -> e
293 {0x00e9, 101}, // é -> e
294 {0x00ea, 101}, // ê -> e
295 {0x00eb, 101}, // ë -> e
296 {0x00ec, 105}, // ì -> i
297 {0x00ed, 105}, // í -> i
298 {0x00ee, 105}, // î -> i
299 {0x00ef, 105}, // ï -> i
300 {0x00f0, 100}, // ð -> d
301 {0x00f1, 110}, // ñ -> n
302 {0x00f2, 111}, // ò -> o
303 {0x00f3, 111}, // ó -> o
304 {0x00f4, 111}, // ô -> o
305 {0x00f5, 111}, // õ -> o
306 {0x00f6, 111}, // ö -> o
307 {0x00f8, 111}, // ø -> o
308 {0x00f9, 117}, // ù -> u
309 {0x00fa, 117}, // ú -> u
310 {0x00fb, 117}, // û -> u
311 {0x00fc, 117}, // ü -> u
312 {0x00fd, 121}, // ý -> y
313 {0x00ff, 121}, // ÿ -> y
314 {0x0100, 65}, // Ā -> A
315 {0x0101, 97}, // ā -> a
316 {0x0102, 65}, // Ă -> A
317 {0x0103, 97}, // ă -> a
318 {0x0104, 65}, // Ą -> A
319 {0x0105, 97}, // ą -> a
320 {0x0106, 67}, // Ć -> C
321 {0x0107, 99}, // ć -> c
322 {0x0108, 67}, // Ĉ -> C
323 {0x0109, 99}, // ĉ -> c
324 {0x010a, 67}, // Ċ -> C
325 {0x010b, 99}, // ċ -> c
326 {0x010c, 67}, // Č -> C
327 {0x010d, 99}, // č -> c
328 {0x010e, 68}, // Ď -> D
329 {0x010f, 100}, // ď -> d
330 {0x0110, 68}, // Đ -> D
331 {0x0111, 100}, // đ -> d
332 {0x0112, 69}, // Ē -> E
333 {0x0113, 101}, // ē -> e
334 {0x0114, 69}, // Ĕ -> E
335 {0x0115, 101}, // ĕ -> e
336 {0x0116, 69}, // Ė -> E
337 {0x0117, 101}, // ė -> e
338 {0x0118, 69}, // Ę -> E
339 {0x0119, 101}, // ę -> e
340 {0x011a, 69}, // Ě -> E
341 {0x011b, 101}, // ě -> e
342 {0x011c, 71}, // Ĝ -> G
343 {0x011d, 103}, // ĝ -> g
344 {0x011e, 71}, // Ğ -> G
345 {0x011f, 103}, // ğ -> g
346 {0x0120, 71}, // Ġ -> G
347 {0x0121, 103}, // ġ -> g
348 {0x0122, 71}, // Ģ -> G
349 {0x0123, 103}, // ģ -> g
350 {0x0124, 72}, // Ĥ -> H
351 {0x0125, 104}, // ĥ -> h
352 {0x0126, 72}, // Ħ -> H
353 {0x0127, 104}, // ħ -> h
354 {0x0128, 73}, // Ĩ -> I
355 {0x0129, 105}, // ĩ -> i
356 {0x012a, 73}, // Ī -> I
357 {0x012b, 105}, // ī -> i
358 {0x012c, 73}, // Ĭ -> I
359 {0x012d, 105}, // ĭ -> i
360 {0x012e, 73}, // Į -> I
361 {0x012f, 105}, // į -> i
362 {0x0130, 73}, // İ -> I
363 {0x0131, 105}, // ı -> i
364 {0x0134, 74}, // Ĵ -> J
365 {0x0135, 106}, // ĵ -> j
366 {0x0136, 75}, // Ķ -> K
367 {0x0137, 107}, // ķ -> k
368 {0x0139, 76}, // Ĺ -> L
369 {0x013a, 108}, // ĺ -> l
370 {0x013b, 76}, // Ļ -> L
371 {0x013c, 108}, // ļ -> l
372 {0x013d, 76}, // Ľ -> L
373 {0x013e, 108}, // ľ -> l
374 {0x013f, 76}, // Ŀ -> L
375 {0x0140, 108}, // ŀ -> l
376 {0x0141, 76}, // Ł -> L
377 {0x0142, 108}, // ł -> l
378 {0x0143, 78}, // Ń -> N
379 {0x0144, 110}, // ń -> n
380 {0x0145, 78}, // Ņ -> N
381 {0x0146, 110}, // ņ -> n
382 {0x0147, 78}, // Ň -> N
383 {0x0148, 110}, // ň -> n
384 {0x014a, 78}, // Ŋ -> N
385 {0x014b, 110}, // ŋ -> n
386 {0x014c, 79}, // Ō -> O
387 {0x014d, 111}, // ō -> o
388 {0x014e, 79}, // Ŏ -> O
389 {0x014f, 111}, // ŏ -> o
390 {0x0150, 79}, // Ő -> O
391 {0x0151, 111}, // ő -> o
392 {0x0154, 82}, // Ŕ -> R
393 {0x0155, 114}, // ŕ -> r
394 {0x0156, 82}, // Ŗ -> R
395 {0x0157, 114}, // ŗ -> r
396 {0x0158, 82}, // Ř -> R
397 {0x0159, 114}, // ř -> r
398 {0x015a, 83}, // Ś -> S
399 {0x015b, 115}, // ś -> s
400 {0x015c, 83}, // Ŝ -> S
401 {0x015d, 115}, // ŝ -> s
402 {0x015e, 83}, // Ş -> S
403 {0x015f, 115}, // ş -> s
404 {0x0160, 83}, // Š -> S
405 {0x0161, 115}, // š -> s
406 {0x0162, 84}, // Ţ -> T
407 {0x0163, 116}, // ţ -> t
408 {0x0164, 84}, // Ť -> T
409 {0x0165, 116}, // ť -> t
410 {0x0166, 84}, // Ŧ -> T
411 {0x0167, 116}, // ŧ -> t
412 {0x0168, 85}, // Ũ -> U
413 {0x0169, 117}, // ũ -> u
414 {0x016a, 85}, // Ū -> U
415 {0x016b, 117}, // ū -> u
416 {0x016c, 85}, // Ŭ -> U
417 {0x016d, 117}, // ŭ -> u
418 {0x016e, 85}, // Ů -> U
419 {0x016f, 117}, // ů -> u
420 {0x0170, 85}, // Ű -> U
421 {0x0171, 117}, // ű -> u
422 {0x0172, 85}, // Ų -> U
423 {0x0173, 117}, // ų -> u
424 {0x0174, 87}, // Ŵ -> W
425 {0x0175, 119}, // ŵ -> w
426 {0x0176, 89}, // Ŷ -> Y
427 {0x0177, 121}, // ŷ -> y
428 {0x0178, 89}, // Ÿ -> Y
429 {0x0179, 90}, // Ź -> Z
430 {0x017a, 122}, // ź -> z
431 {0x017b, 90}, // Ż -> Z
432 {0x017c, 122}, // ż -> z
433 {0x017d, 90}, // Ž -> Z
434 {0x017e, 122}, // ž -> z
435 {0x01a0, 79}, // Ơ -> O
436 {0x01a1, 111}, // ơ -> o
437 {0x01af, 85}, // Ư -> U
438 {0x01b0, 117}, // ư -> u
439 {0x01b5, 90}, // Ƶ -> Z
440 {0x01b6, 122}, // ƶ -> z
441 {0x0218, 83}, // Ș -> S
442 {0x0219, 115}, // ș -> s
443 {0x021a, 84}, // Ț -> T
444 {0x021b, 116}, // ț -> t
445 {0x1e00, 65}, // Ḁ -> A
446 {0x1e01, 97}, // ḁ -> a
447 {0x1e02, 66}, // Ḃ -> B
448 {0x1e03, 98}, // ḃ -> b
449 {0x1e04, 66}, // Ḅ -> B
450 {0x1e05, 98}, // ḅ -> b
451 {0x1e06, 66}, // Ḇ -> B
452 {0x1e07, 98}, // ḇ -> b
453 {0x1e08, 67}, // Ḉ -> C
454 {0x1e09, 99}, // ḉ -> c
455 {0x1e0a, 68}, // Ḋ -> D
456 {0x1e0b, 100}, // ḋ -> d
457 {0x1e0c, 68}, // Ḍ -> D
458 {0x1e0d, 100}, // ḍ -> d
459 {0x1e0e, 68}, // Ḏ -> D
460 {0x1e0f, 100}, // ḏ -> d
461 {0x1e10, 68}, // Ḑ -> D
462 {0x1e11, 100}, // ḑ -> d
463 {0x1e12, 68}, // Ḓ -> D
464 {0x1e13, 100}, // ḓ -> d
465 {0x1e14, 69}, // Ḕ -> E
466 {0x1e15, 101}, // ḕ -> e
467 {0x1e16, 69}, // Ḗ -> E
468 {0x1e17, 101}, // ḗ -> e
469 {0x1e18, 69}, // Ḙ -> E
470 {0x1e19, 101}, // ḙ -> e
471 {0x1e1a, 69}, // Ḛ -> E
472 {0x1e1b, 101}, // ḛ -> e
473 {0x1e1c, 69}, // Ḝ -> E
474 {0x1e1d, 101}, // ḝ -> e
475 {0x1e1e, 70}, // Ḟ -> F
476 {0x1e1f, 102}, // ḟ -> f
477 {0x1e20, 71}, // Ḡ -> G
478 {0x1e21, 103}, // ḡ -> g
479 {0x1e22, 72}, // Ḣ -> H
480 {0x1e23, 104}, // ḣ -> h
481 {0x1e24, 72}, // Ḥ -> H
482 {0x1e25, 104}, // ḥ -> h
483 {0x1e26, 72}, // Ḧ -> H
484 {0x1e27, 104}, // ḧ -> h
485 {0x1e28, 72}, // Ḩ -> H
486 {0x1e29, 104}, // ḩ -> h
487 {0x1e2a, 72}, // Ḫ -> H
488 {0x1e2b, 104}, // ḫ -> h
489 {0x1e2c, 73}, // Ḭ -> I
490 {0x1e2d, 105}, // ḭ -> i
491 {0x1e2e, 73}, // Ḯ -> I
492 {0x1e2f, 105}, // ḯ -> i
493 {0x1e30, 75}, // Ḱ -> K
494 {0x1e31, 107}, // ḱ -> k
495 {0x1e32, 75}, // Ḳ -> K
496 {0x1e33, 107}, // ḳ -> k
497 {0x1e34, 75}, // Ḵ -> K
498 {0x1e35, 107}, // ḵ -> k
499 {0x1e36, 76}, // Ḷ -> L
500 {0x1e37, 108}, // ḷ -> l
501 {0x1e38, 76}, // Ḹ -> L
502 {0x1e39, 108}, // ḹ -> l
503 {0x1e3b, 108}, // ḻ -> l
504 {0x1e3c, 76}, // Ḽ -> L
505 {0x1e3d, 108}, // ḽ -> l
506 {0x1e3e, 77}, // Ḿ -> M
507 {0x1e3f, 109}, // ḿ -> m
508 {0x1e40, 77}, // Ṁ -> M
509 {0x1e41, 109}, // ṁ -> m
510 {0x1e42, 77}, // Ṃ -> M
511 {0x1e43, 109}, // ṃ -> m
512 {0x1e44, 78}, // Ṅ -> N
513 {0x1e45, 110}, // ṅ -> n
514 {0x1e46, 78}, // Ṇ -> N
515 {0x1e47, 110}, // ṇ -> n
516 {0x1e48, 78}, // Ṉ -> N
517 {0x1e49, 110}, // ṉ -> n
518 {0x1e4a, 78}, // Ṋ -> N
519 {0x1e4b, 110}, // ṋ -> n
520 {0x1e4c, 79}, // Ṍ -> O
521 {0x1e4d, 111}, // ṍ -> o
522 {0x1e4e, 79}, // Ṏ -> O
523 {0x1e4f, 111}, // ṏ -> o
524 {0x1e50, 79}, // Ṑ -> O
525 {0x1e51, 111}, // ṑ -> o
526 {0x1e52, 79}, // Ṓ -> O
527 {0x1e53, 111}, // ṓ -> o
528 {0x1e54, 80}, // Ṕ -> P
529 {0x1e55, 112}, // ṕ -> p
530 {0x1e56, 80}, // Ṗ -> P
531 {0x1e57, 112}, // ṗ -> p
532 {0x1e58, 82}, // Ṙ -> R
533 {0x1e59, 114}, // ṙ -> r
534 {0x1e5a, 82}, // Ṛ -> R
535 {0x1e5b, 114}, // ṛ -> r
536 {0x1e5c, 82}, // Ṝ -> R
537 {0x1e5d, 114}, // ṝ -> r
538 {0x1e5e, 82}, // Ṟ -> R
539 {0x1e5f, 114}, // ṟ -> r
540 {0x1e60, 83}, // Ṡ -> S
541 {0x1e61, 115}, // ṡ -> s
542 {0x1e62, 83}, // Ṣ -> S
543 {0x1e63, 115}, // ṣ -> s
544 {0x1e64, 83}, // Ṥ -> S
545 {0x1e65, 115}, // ṥ -> s
546 {0x1e66, 83}, // Ṧ -> S
547 {0x1e67, 115}, // ṧ -> s
548 {0x1e68, 83}, // Ṩ -> S
549 {0x1e69, 115}, // ṩ -> s
550 {0x1e6a, 84}, // Ṫ -> T
551 {0x1e6b, 116}, // ṫ -> t
552 {0x1e6c, 84}, // Ṭ -> T
553 {0x1e6d, 116}, // ṭ -> t
554 {0x1e6e, 84}, // Ṯ -> T
555 {0x1e6f, 116}, // ṯ -> t
556 {0x1e70, 84}, // Ṱ -> T
557 {0x1e71, 116}, // ṱ -> t
558 {0x1e72, 85}, // Ṳ -> U
559 {0x1e73, 117}, // ṳ -> u
560 {0x1e74, 85}, // Ṵ -> U
561 {0x1e75, 117}, // ṵ -> u
562 {0x1e76, 85}, // Ṷ -> U
563 {0x1e77, 117}, // ṷ -> u
564 {0x1e78, 85}, // Ṹ -> U
565 {0x1e79, 117}, // ṹ -> u
566 {0x1e7a, 85}, // Ṻ -> U
567 {0x1e7b, 117}, // ṻ -> u
568 {0x1e7c, 86}, // Ṽ -> V
569 {0x1e7d, 118}, // ṽ -> v
570 {0x1e7e, 86}, // Ṿ -> V
571 {0x1e7f, 118}, // ṿ -> v
572 {0x1e80, 87}, // Ẁ -> W
573 {0x1e81, 119}, // ẁ -> w
574 {0x1e82, 87}, // Ẃ -> W
575 {0x1e83, 119}, // ẃ -> w
576 {0x1e84, 87}, // Ẅ -> W
577 {0x1e85, 119}, // ẅ -> w
578 {0x1e86, 87}, // Ẇ -> W
579 {0x1e87, 119}, // ẇ -> w
580 {0x1e88, 87}, // Ẉ -> W
581 {0x1e89, 119}, // ẉ -> w
582 {0x1e8a, 88}, // Ẋ -> X
583 {0x1e8b, 120}, // ẋ -> x
584 {0x1e8c, 88}, // Ẍ -> X
585 {0x1e8d, 120}, // ẍ -> x
586 {0x1e8e, 89}, // Ẏ -> Y
587 {0x1e8f, 121}, // ẏ -> y
588 {0x1e90, 90}, // Ẑ -> Z
589 {0x1e91, 122}, // ẑ -> z
590 {0x1e92, 90}, // Ẓ -> Z
591 {0x1e93, 122}, // ẓ -> z
592 {0x1e94, 90}, // Ẕ -> Z
593 {0x1e95, 122}, // ẕ -> z
594 {0x1e96, 104}, // ẖ -> h
595 {0x1e97, 116}, // ẗ -> t
596 {0x1e98, 119}, // ẘ -> w
597 {0x1e99, 121}, // ẙ -> y
598 {0x1e9a, 97}, // ẚ -> a
599 {0x1e9b, 102}, // ẛ -> f
600 {0x1ea0, 65}, // Ạ -> A
601 {0x1ea1, 97}, // ạ -> a
602 {0x1ea2, 65}, // Ả -> A
603 {0x1ea3, 97}, // ả -> a
604 {0x1ea4, 65}, // Ấ -> A
605 {0x1ea5, 97}, // ấ -> a
606 {0x1ea6, 65}, // Ầ -> A
607 {0x1ea7, 97}, // ầ -> a
608 {0x1ea8, 65}, // Ẩ -> A
609 {0x1ea9, 97}, // ẩ -> a
610 {0x1eaa, 65}, // Ẫ -> A
611 {0x1eab, 97}, // ẫ -> a
612 {0x1eac, 65}, // Ậ -> A
613 {0x1ead, 97}, // ậ -> a
614 {0x1eae, 65}, // Ắ -> A
615 {0x1eaf, 97}, // ắ -> a
616 {0x1eb0, 65}, // Ằ -> A
617 {0x1eb1, 97}, // ằ -> a
618 {0x1eb2, 65}, // Ẳ -> A
619 {0x1eb3, 97}, // ẳ -> a
620 {0x1eb4, 65}, // Ẵ -> A
621 {0x1eb5, 97}, // ẵ -> a
622 {0x1eb6, 65}, // Ặ -> A
623 {0x1eb7, 97}, // ặ -> a
624 {0x1eb8, 69}, // Ẹ -> E
625 {0x1eb9, 101}, // ẹ -> e
626 {0x1eba, 69}, // Ẻ -> E
627 {0x1ebb, 101}, // ẻ -> e
628 {0x1ebc, 69}, // Ẽ -> E
629 {0x1ebd, 101}, // ẽ -> e
630 {0x1ebe, 69}, // Ế -> E
631 {0x1ebf, 101}, // ế -> e
632 {0x1ec0, 69}, // Ề -> E
633 {0x1ec1, 101}, // ề -> e
634 {0x1ec2, 69}, // Ể -> E
635 {0x1ec3, 101}, // ể -> e
636 {0x1ec4, 69}, // Ễ -> E
637 {0x1ec5, 101}, // ễ -> e
638 {0x1ec6, 69}, // Ệ -> E
639 {0x1ec7, 101}, // ệ -> e
640 {0x1ec8, 73}, // Ỉ -> I
641 {0x1ec9, 105}, // ỉ -> i
642 {0x1eca, 73}, // Ị -> I
643 {0x1ecb, 105}, // ị -> i
644 {0x1ecc, 79}, // Ọ -> O
645 {0x1ecd, 111}, // ọ -> o
646 {0x1ece, 79}, // Ỏ -> O
647 {0x1ecf, 111}, // ỏ -> o
648 {0x1ed0, 79}, // Ố -> O
649 {0x1ed1, 111}, // ố -> o
650 {0x1ed2, 79}, // Ồ -> O
651 {0x1ed3, 111}, // ồ -> o
652 {0x1ed4, 79}, // Ổ -> O
653 {0x1ed5, 111}, // ổ -> o
654 {0x1ed6, 79}, // Ỗ -> O
655 {0x1ed7, 111}, // ỗ -> o
656 {0x1ed8, 79}, // Ộ -> O
657 {0x1ed9, 111}, // ộ -> o
658 {0x1eda, 79}, // Ớ -> O
659 {0x1edb, 111}, // ớ -> o
660 {0x1edc, 79}, // Ờ -> O
661 {0x1edd, 111}, // ờ -> o
662 {0x1ede, 79}, // Ở -> O
663 {0x1edf, 111}, // ở -> o
664 {0x1ee0, 79}, // Ỡ -> O
665 {0x1ee1, 111}, // ỡ -> o
666 {0x1ee2, 79}, // Ợ -> O
667 {0x1ee3, 111}, // ợ -> o
668 {0x1ee4, 85}, // Ụ -> U
669 {0x1ee5, 117}, // ụ -> u
670 {0x1ee6, 85}, // Ủ -> U
671 {0x1ee7, 117}, // ủ -> u
672 {0x1ee8, 85}, // Ứ -> U
673 {0x1ee9, 117}, // ứ -> u
674 {0x1eea, 85}, // Ừ -> U
675 {0x1eeb, 117}, // ừ -> u
676 {0x1eec, 85}, // Ử -> U
677 {0x1eed, 117}, // ử -> u
678 {0x1eee, 85}, // Ữ -> U
679 {0x1eef, 117}, // ữ -> u
680 {0x1ef0, 85}, // Ự -> U
681 {0x1ef1, 117}, // ự -> u
682 {0x1ef2, 89}, // Ỳ -> Y
683 {0x1ef3, 121}, // ỳ -> y
684 {0x1ef4, 89}, // Ỵ -> Y
685 {0x1ef5, 121}, // ỵ -> y
686 {0x1ef6, 89}, // Ỷ -> Y
687 {0x1ef7, 121}, // ỷ -> y
688 {0x1ef8, 89}, // Ỹ -> Y
689 {0x1ef9, 121}, // ỹ -> y
690 };
691
692 } // namespace
693
GetNormalizationMap()694 const std::unordered_map<char16_t, char16_t> *GetNormalizationMap() {
695 // The map is allocated dynamically the first time this function is executed.
696 static const std::unordered_map<char16_t, char16_t> *const normalization_map =
697 [] {
698 auto *map = new std::unordered_map<char16_t, char16_t>();
699 // Size of all the mappings is about 2.5 KiB.
700 constexpr int numMappings =
701 sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
702 map->reserve(numMappings);
703 for (size_t i = 0; i < numMappings; ++i) {
704 map->emplace(kNormalizationMappings[i].from,
705 kNormalizationMappings[i].to);
706 }
707 return map;
708 }();
709
710 return normalization_map;
711 }
712
713 } // namespace lib
714 } // namespace icing
715