xref: /aosp_15_r20/frameworks/minikin/rust/hyphenator.rs (revision 834a2baab5fdfc28e9a428ee87c7ea8f6a06a53d)
1 /*
2  * Copyright 2024 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 use std::cmp;
18 
19 /// An implementation of hyphenation for Android.
20 ///
21 /// The hyphenation in Android is done with two steps: first performs the Knuth-Liang hyphenation
22 /// algorithm for populating all possible hyphenation points. Then, resolve hyphenation type from
23 /// the scripts and locales.
24 ///
25 /// The Knuth-Liang hyphenation works like as follows:
26 /// The Knuth-Liang hyphenation uses two dictionary: pattern dictionary and exception files. The
27 /// files end with ".hyp.txt" are exception files and the files end with ".pat.txt" are pattern
28 /// files. If the word is in exception file, the hyphenation is performed as it is specified in the
29 /// exception file.
30 ///
31 /// Then, if the word is not in the exception file, the Knuth-Liang hyphenation is performed with
32 /// hyphenation pattern dictionary. The hyphenation pattern dictionary is a list of sub-word with
33 /// hyphenation level as number. The level values are assigned between each letters including before
34 /// the first letter and last letter. If the value is odd number, then the position is a hyphenation
35 /// point. If the value is even number, then the position is not a hyphenation point. In the pattern
36 /// file, the 0 is dropped, so the meaning of "re4ti4z" is level values "0040040" for sub-word
37 /// "retiz". The hyphenation is performed by iterating all patterns and assigning level values to
38 /// the possible break points. If the break point can be assigned from multiple patterns, the
39 /// maximum value is used. If none of the pattern matches the break point, the level is zero,
40 /// therefore do not break. And finally the odd numbered positions are the break points.
41 ///
42 /// Here is an example how the "hyphenation" is broken into "hy-phen-ation".
43 /// The relevant patterns in the pattern dictionary are
44 /// - hy3ph
45 /// - he2n
46 /// - hena4
47 /// - hen5at
48 /// - 1na
49 /// - n2at
50 /// - 1tio
51 /// - 2io
52 /// - o2n
53 ///
54 /// Then when these patterns are applied to the word "hyphenation", it becomes like
55 ///
56 ///   h y p h e n a t i o n
57 ///  0 0 3 0 0              : hy3ph
58 ///        0 0 2 0          : he2n
59 ///        0 0 0 0 4        : hena4
60 ///        0 0 0 5 0 0      : hen5at
61 ///            1 0 0        : 1na
62 ///            0 2 0 0      : n2at
63 ///                1 0 0 0  : 1tio
64 ///                  2 0 0  : 2io
65 ///                    0 2 0: o2n
66 /// ---------------------------------
67 ///  0 0 3 0 0 2 5 4 2 0 2 0: max
68 ///
69 /// Then, the odd-numbered break points are hyphenation allowed break points, so the result is
70 /// "hy-phen-ation".
71 ///
72 /// In the Android implementation, the hyphenation pattern file is preprocessed to Trie in build
73 /// time. For the detailed file format, see comments of HyphenationData struct.
74 ///
75 /// Once the all possible hyphenation break points are collected, the decide the hyphenation break
76 /// type is determined based on the script and locale. For example, in case of Arabic, the letter
77 /// form should not be changed by hyphenation, so ZWJ can be inserted before and after hyphen
78 /// letter.
79 
80 const CHAR_SOFT_HYPHEN: u16 = 0x00AD;
81 const CHAR_MIDDLE_DOT: u16 = 0x00B7;
82 const CHAR_HYPHEN_MINUS: u16 = 0x002D;
83 const CHAR_HYPHEN: u16 = 0x2010;
84 
85 // The following U_JT_* constants must be same to the ones defined in
86 // frameworks/minikin/lib/minikin/ffi/IciBridge.h
87 // TODO: Replace with ICU4X once it becomes available in Android.
88 const U_JT_NON_JOINING: u8 = 0;
89 const U_JT_DUAL_JOINING: u8 = 1;
90 const U_JT_RIGHT_JOINING: u8 = 2;
91 const U_JT_LEFT_JOINING: u8 = 3;
92 const U_JT_JOIN_CAUSING: u8 = 4;
93 const U_JT_TRANSPARENT: u8 = 5;
94 
95 // The following USCRIPT_* constants must be same to the ones defined in
96 // frameworks/minikin/lib/minikin/ffi/IciBridge.h
97 // TODO: Replace with ICU4X once it becomes available in Android.
98 const USCRIPT_LATIN: u8 = 0;
99 const USCRIPT_ARABIC: u8 = 1;
100 const USCRIPT_KANNADA: u8 = 2;
101 const USCRIPT_MALAYALAM: u8 = 3;
102 const USCRIPT_TAMIL: u8 = 4;
103 const USCRIPT_TELUGU: u8 = 5;
104 const USCRIPT_ARMENIAN: u8 = 6;
105 const USCRIPT_CANADIAN_ABORIGINAL: u8 = 7;
106 
107 use crate::ffi::getJoiningType;
108 use crate::ffi::getScript;
109 
110 /// Hyphenation types
111 /// The following values must be equal to the ones in
112 /// frameworks/minikin/include/minikin/Hyphenator.h
113 #[repr(u8)]
114 #[derive(PartialEq, Copy, Clone)]
115 pub enum HyphenationType {
116     /// Do not break.
117     DontBreak = 0,
118     /// Break the line and insert a normal hyphen.
119     BreakAndInsertHyphen = 1,
120     /// Break the line and insert an Armenian hyphen (U+058A).
121     BreakAndInsertArmenianHyphen = 2,
122     /// Break the line and insert a Canadian Syllabics hyphen (U+1400).
123     BreakAndInsertUcasHyphen = 4,
124     /// Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen
125     /// present or the script does not use a hyphen (e.g. in Malayalam).
126     BreakAndDontInsertHyphen = 5,
127     /// Break and replace the last code unit with hyphen. Used for Catalan "l·l" which hyphenates
128     /// as "l-/l".
129     BreakAndReplaceWithHyphen = 6,
130     /// Break the line, and repeat the hyphen (which is the last character) at the beginning of the
131     /// next line. Used in Polish (where "czerwono-niebieska" should hyphenate as
132     /// "czerwono-/-niebieska") and Slovenian.
133     BreakAndInsertHyphenAtNextLine = 7,
134     /// Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line.
135     /// This is used in Arabic script, mostly for writing systems of Central Asia. It's our default
136     /// behavior when a soft hyphen is used in Arabic script.
137     BreakAndInsertHyphenAndZwj = 8,
138 }
139 
140 /// Hyphenation locale
141 #[repr(u8)]
142 #[derive(PartialEq, Copy, Clone)]
143 pub enum HyphenationLocale {
144     /// Other locale
145     Other = 0,
146     /// Catalan
147     Catalan = 1,
148     /// Polish
149     Polish = 2,
150     /// Slovenian
151     Slovenian = 3,
152     /// Portuguese
153     Portuguese = 4,
154 }
155 
156 const MAX_HYPHEN_SIZE: u32 = 64;
157 
158 struct HyphenationData<'a> {
159     bytes: &'a [u8],
160 }
161 
162 /// The Hyphenation pattern file is encoded into binary format during build time.
163 /// The hyphenation pattern file is encoded into three objects: AlphabetTable, Trie, Patterns.
164 ///
165 /// First, to avoid high value of utf16 char values in Trie object, char values are mapped to
166 /// internal alphabet codes. The AlphabetTable0 and AndroidTable1 has a map from utf16 char values
167 /// to internal u16 alphabet codes. The AlphabetTable0 is used if the min and max used code points
168 /// has less than 1024, i.e. max_codepoint - min_codepoint < 1024. The AlphabetTable1 is used
169 /// otherwise.
170 ///
171 /// Then, the pattern file is encoded with Trie and Pattern object with using internal
172 /// alphabet code. For example, in case of the entry "ef5i5nite", the hyphenation score "00550000"
173 /// is stored in the Pattern object and the subword "efinite" is stored in the Trie object.
174 ///
175 /// The Trie object is encoded as one dimensional u32 arrays. Each u32 integer contains packed
176 /// index to the Pattern object, index to the next node entry and alphabet code.
177 /// Trie Entry:
178 ///    0                   1                   2                   3
179 ///    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1  (bits)
180 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
181 ///   |   index to pattern data   |  index to the next node |   code  |
182 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
183 ///   Note: the layout is as an example of pattern_shift = 19, link_shift = 5.
184 ///
185 /// The Pattern object is encoded into two data: entry list and data payload. The entry is a packed
186 /// u32 integer that contains length of the pattern, an amount of shift of the pattern index and
187 /// an offset from the payload head.
188 ///
189 /// Pattern Entry:
190 ///    0                   1                   2                   3
191 ///    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1  (bits)
192 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
193 ///   |len of pat | pat shift |      offset to the pattern data       |
194 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
195 ///
196 /// The pattern data and related information can be obtained as follows:
197 ///
198 ///   // Pattern information
199 ///   let entry = pattern[16 + pattern_index * 4]  // read u32 as little endian.
200 ///   let pattern_length = entry >> 26
201 ///   let pattern_shift = (entry > 20) & 0x3f
202 ///
203 ///   // Pattern value retrieval: i-th offset in the word.
204 ///   let pattern_offset = pattern[8] // read u32 as little endian.
205 ///   let pattern_value = pattern[pattern_offset + (entry & 0xfffff) + i]
206 impl<'a> HyphenationData<'a> {
new(bytes: &'a [u8]) -> Self207     pub const fn new(bytes: &'a [u8]) -> Self {
208         HyphenationData { bytes }
209     }
210 
read_u32(&self, offset: u32) -> u32211     pub fn read_u32(&self, offset: u32) -> u32 {
212         let usize_offset = offset as usize;
213         self.bytes
214             .get(usize_offset..usize_offset + 4)
215             .map(|x: &[u8]| u32::from_le_bytes(x.try_into().unwrap()))
216             .unwrap()
217     }
218 }
219 
220 /// Header struct of the hyphenation pattern file.
221 /// The object layout follows:
222 ///    0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F    (bytes)
223 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
224 ///   |      magic    |     version   |alphabet offset|  trie offset  |
225 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
226 ///   |pattern offset |   file size   |
227 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
228 pub struct Header<'a> {
229     data: HyphenationData<'a>,
230 }
231 
232 /// Alphabet Table version 0 struct of the hyphenation pattern file.
233 /// The object layout follows:
234 ///    0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F    (bytes)
235 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
236 ///   |     version   | min codepoint | max codepoint |    payload
237 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
238 pub struct AlphabetTable0<'a> {
239     data: HyphenationData<'a>,
240     min_codepoint: u32,
241     max_codepoint: u32,
242 }
243 
244 /// Alphabet Table version 1 struct of the hyphenation pattern file.
245 /// The object layout follows:
246 ///    0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F    (bytes)
247 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
248 ///   |     version   | num of entries|         payload
249 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
250 pub struct AlphabetTable1<'a> {
251     data: HyphenationData<'a>,
252     num_entries: u32,
253 }
254 
255 /// An entry of alphabet table version 1 struct of the hyphenation pattern file.
256 /// The entry is packed u32 value: the high 21 bits are code point and low 11 bits
257 /// are alphabet code value.
258 ///    0                   1                   2                   3
259 ///    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1  (bits)
260 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
261 ///   |                code point               |     code value      |
262 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
263 pub struct AlphabetTable1Entry {
264     entry: u32,
265 }
266 
267 /// Trie struct of the hyphenation pattern file.
268 /// The object layout follows:
269 ///    0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F    (bytes)
270 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
271 ///   |     version   |   char mask   |  link shift   |   link mask   |
272 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
273 ///   | pattern shift |  num entries  |         payload
274 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
275 pub struct Trie<'a> {
276     data: HyphenationData<'a>,
277 }
278 
279 /// Pattern struct of the hyphenation pattern file.
280 /// The object layout follows:
281 ///    0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F    (bytes)
282 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
283 ///   |     version   | num entries   | pattern offset|  pattern size |
284 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
285 ///   | payload
286 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
287 pub struct Pattern<'a> {
288     data: HyphenationData<'a>,
289     pattern_offset: u32,
290 }
291 
292 /// An entry of pattern struct of the hyphenation pattern file.
293 /// The entry is packed u32 value: the highest 6 bits are for length, next 6 bits are amount of
294 /// shift, and lowest 20 bits are offset of the first value from the pattern offset value.
295 ///    0                   1                   2                   3
296 ///    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1  (bits)
297 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
298 ///   |   length  |   shift   |     offset of the first value         |
299 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
300 pub struct PatternEntry<'a> {
301     data: HyphenationData<'a>,
302     pattern_offset: u32,
303     entry: u32,
304 }
305 
306 impl<'a> Header<'a> {
307     /// Construct a reader of the Header struct from the byte array.
new(bytes: &'a [u8]) -> Self308     pub const fn new(bytes: &'a [u8]) -> Self {
309         Header { data: HyphenationData::new(bytes) }
310     }
311 
312     /// Returns the reader of the alphabet code.
alphabet_table(&self) -> Option<Box<dyn AlphabetLookup + 'a>>313     pub fn alphabet_table(&self) -> Option<Box<dyn AlphabetLookup + 'a>> {
314         let offset = self.data.read_u32(8);
315         let version = self.data.read_u32(offset);
316         return match version {
317             0 => Some(Box::new(AlphabetTable0::new(self.read_offset_and_slice(8)))),
318             1 => Some(Box::new(AlphabetTable1::new(self.read_offset_and_slice(8)))),
319             _ => None,
320         };
321     }
322 
323     /// Returns the reader of the trie struct.
trie_table(&self) -> Trie<'a>324     pub fn trie_table(&self) -> Trie<'a> {
325         Trie::new(self.read_offset_and_slice(12))
326     }
327 
328     /// Returns the reader of the pattern struct.
pattern_table(&self) -> Pattern<'a>329     pub fn pattern_table(&self) -> Pattern<'a> {
330         Pattern::new(self.read_offset_and_slice(16))
331     }
332 
read_offset_and_slice(&self, offset: u32) -> &'a [u8]333     fn read_offset_and_slice(&self, offset: u32) -> &'a [u8] {
334         let offset = self.data.read_u32(offset) as usize;
335         self.data.bytes.get(offset..).unwrap()
336     }
337 }
338 
339 pub trait AlphabetLookup {
340     /// Get the alphabet code for the code point.
get_at(&self, c: u32) -> Option<u16>341     fn get_at(&self, c: u32) -> Option<u16>;
342 
343     /// Lookup the internal alphabet codes from UTF-16 character codes.
lookup( &self, alpha_codes: &mut [u16; MAX_HYPHEN_SIZE as usize], word: &[u16], ) -> HyphenationType344     fn lookup(
345         &self,
346         alpha_codes: &mut [u16; MAX_HYPHEN_SIZE as usize],
347         word: &[u16],
348     ) -> HyphenationType {
349         let mut result = HyphenationType::BreakAndInsertHyphen;
350         alpha_codes[0] = 0; // word start
351         for i in 0..word.len() {
352             let c = word[i] as u32;
353             if let Some(code) = self.get_at(c) {
354                 alpha_codes[i + 1] = code;
355             } else {
356                 return HyphenationType::DontBreak;
357             }
358             if result == HyphenationType::BreakAndInsertHyphen {
359                 result = Hyphenator::hyphenation_type_based_on_script(c);
360             }
361         }
362         alpha_codes[word.len() + 1] = 0; // word termination
363         result
364     }
365 }
366 
367 /// Map from utf16 code unit to the internal alphabet code.
368 impl<'a> AlphabetTable0<'a> {
369     /// Construct a reader of the Alphabet Table version 0 struct from the byte array.
new(bytes: &'a [u8]) -> Self370     pub fn new(bytes: &'a [u8]) -> Self {
371         let data = HyphenationData::new(bytes);
372         let min_codepoint = data.read_u32(4);
373         let max_codepoint = data.read_u32(8);
374         AlphabetTable0 { data, min_codepoint, max_codepoint }
375     }
376 }
377 
378 impl<'a> AlphabetLookup for AlphabetTable0<'a> {
379     /// Returns an entry of the specified offset.
get_at(&self, offset: u32) -> Option<u16>380     fn get_at(&self, offset: u32) -> Option<u16> {
381         if offset < self.min_codepoint || offset >= self.max_codepoint {
382             None
383         } else {
384             let code = self.data.bytes[(offset - self.min_codepoint) as usize + 12] as u16;
385             if code == 0 {
386                 None
387             } else {
388                 Some(code)
389             }
390         }
391     }
392 }
393 
394 /// Map from utf16 code unit to the internal alphabet code.
395 impl<'a> AlphabetTable1<'a> {
396     /// Construct a reader of the Alphabet Table version 1 struct from the byte array.
new(bytes: &'a [u8]) -> Self397     pub fn new(bytes: &'a [u8]) -> Self {
398         let data = HyphenationData::new(bytes);
399         let num_entries = data.read_u32(4);
400         AlphabetTable1 { data, num_entries }
401     }
402 
lower_bounds(&self, value: u32) -> Option<u32>403     fn lower_bounds(&self, value: u32) -> Option<u32> {
404         let mut b = 0;
405         let mut e = self.num_entries;
406         while b != e {
407             let m = b + (e - b) / 2;
408             let c = self.data.read_u32(8 + m * 4);
409             if c >= value {
410                 e = m;
411             } else {
412                 b = m + 1;
413             }
414         }
415         if b == self.num_entries {
416             None
417         } else {
418             Some(b)
419         }
420     }
421 }
422 
423 impl<'a> AlphabetLookup for AlphabetTable1<'a> {
get_at(&self, c: u32) -> Option<u16>424     fn get_at(&self, c: u32) -> Option<u16> {
425         if let Some(r) = self.lower_bounds(c << 11) {
426             let entry = AlphabetTable1Entry::new(self.data.read_u32(8 + r * 4));
427             if entry.codepoint() == c {
428                 Some(entry.value())
429             } else {
430                 None
431             }
432         } else {
433             None
434         }
435     }
436 }
437 
438 /// A packed u32 entry of the AlphabetTable1.
439 impl AlphabetTable1Entry {
new(entry_value: u32) -> Self440     pub const fn new(entry_value: u32) -> Self {
441         AlphabetTable1Entry { entry: entry_value }
442     }
443 
444     /// Unpack code point from entry value.
codepoint(&self) -> u32445     pub fn codepoint(&self) -> u32 {
446         self.entry >> 11
447     }
448 
449     /// Unpack value from entry value.
value(&self) -> u16450     pub fn value(&self) -> u16 {
451         (self.entry & 0x7ff).try_into().unwrap()
452     }
453 }
454 
455 /// A Trie object.
456 /// See the function comment of HyphenationData for the details.
457 impl<'a> Trie<'a> {
458     /// Construct a reader of the Trie struct from the byte array.
new(bytes: &'a [u8]) -> Self459     pub const fn new(bytes: &'a [u8]) -> Self {
460         Trie { data: HyphenationData::new(bytes) }
461     }
462 
463     /// Returns an entry of at the offset.
464     /// The entry of the next alphabet code is
465     ///
466     /// let entry = trie.get_at(node + alphabet_codes[char])
get_at(&self, offset: u32) -> u32467     pub fn get_at(&self, offset: u32) -> u32 {
468         self.data.read_u32(24 + offset * 4)
469     }
470 
471     /// Returns the bit mask for the character code point of the node.
472     /// You can get node's character code point by
473     ///
474     /// let node_character = entry & char_mask.
char_mask(&self) -> u32475     pub fn char_mask(&self) -> u32 {
476         self.data.read_u32(4)
477     }
478 
479     /// Returns the amount of shift of the node index.
480     /// You can get node number as following
481     ///
482     /// let next_node = (entry & link_mask) >> link_shift
link_shift(&self) -> u32483     pub fn link_shift(&self) -> u32 {
484         self.data.read_u32(8)
485     }
486 
487     /// Returns the mask for the node index.
488     /// You can get node number as following
489     ///
490     /// let next_node = (entry & link_mask) >> link_shift
link_mask(&self) -> u32491     pub fn link_mask(&self) -> u32 {
492         self.data.read_u32(12)
493     }
494 
495     /// Returns the amount of shift of the pattern index.
496     /// You can get pattern index as following
497     ///
498     /// let pattern_index = entry >> pattern_shift
pattern_shift(&self) -> u32499     pub fn pattern_shift(&self) -> u32 {
500         self.data.read_u32(16)
501     }
502 }
503 
504 /// A Pattern object.
505 /// See the function comment of HyphenationData for the details.
506 impl<'a> Pattern<'a> {
507     /// Construct a reader of the Pattern struct from the byte array.
new(bytes: &'a [u8]) -> Self508     pub fn new(bytes: &'a [u8]) -> Self {
509         let data = HyphenationData::new(bytes);
510         let pattern_offset = data.read_u32(8);
511         Pattern { data, pattern_offset }
512     }
513 
514     /// Returns a packed u32 entry at the given offset.
entry_at(&self, offset: u32) -> PatternEntry<'a>515     pub fn entry_at(&self, offset: u32) -> PatternEntry<'a> {
516         let entry = self.data.read_u32(16 + offset * 4);
517         PatternEntry::new(self.data.bytes, self.pattern_offset, entry)
518     }
519 }
520 
521 /// An entry of the pattern object.
522 impl<'a> PatternEntry<'a> {
523     /// Construct a reader of the Pattern struct from the byte array.
new(bytes: &'a [u8], pattern_offset: u32, entry: u32) -> Self524     pub const fn new(bytes: &'a [u8], pattern_offset: u32, entry: u32) -> Self {
525         PatternEntry { data: HyphenationData::new(bytes), pattern_offset, entry }
526     }
527 
528     /// Unpack length of the pattern from the packed entry value.
len(&self) -> u32529     pub fn len(&self) -> u32 {
530         self.entry >> 26
531     }
532 
533     /// Unpack an amount of shift of the pattern data from the packed entry value.
shift(&self) -> u32534     pub fn shift(&self) -> u32 {
535         (self.entry >> 20) & 0x3f
536     }
537 
538     /// Returns a hyphenation score value at the offset in word with the entry.
value_at(&self, offset: u32) -> u8539     pub fn value_at(&self, offset: u32) -> u8 {
540         self.data.bytes[(self.pattern_offset + (self.entry & 0xfffff) + offset) as usize]
541     }
542 }
543 
544 /// Performs hyphenation
545 pub struct Hyphenator {
546     data: &'static [u8],
547     min_prefix: u32,
548     min_suffix: u32,
549     locale: HyphenationLocale,
550 }
551 
552 impl Hyphenator {
553     /// Create a new hyphenator instance
new(data: &'static [u8], min_prefix: u32, min_suffix: u32, locale: &str) -> Self554     pub fn new(data: &'static [u8], min_prefix: u32, min_suffix: u32, locale: &str) -> Self {
555         logger::init(
556             logger::Config::default()
557                 .with_tag_on_device("Minikin")
558                 .with_max_level(log::LevelFilter::Trace),
559         );
560         Self {
561             data,
562             min_prefix,
563             min_suffix,
564             locale: if locale == "pl" {
565                 HyphenationLocale::Polish
566             } else if locale == "ca" {
567                 HyphenationLocale::Catalan
568             } else if locale == "sl" {
569                 HyphenationLocale::Slovenian
570             } else if locale == "pt" {
571                 HyphenationLocale::Portuguese
572             } else {
573                 HyphenationLocale::Other
574             },
575         }
576     }
577 
578     /// Performs a hyphenation
hyphenate(&self, word: &[u16], out: &mut [u8])579     pub fn hyphenate(&self, word: &[u16], out: &mut [u8]) {
580         let len: u32 = word.len().try_into().unwrap();
581         let padded_len = len + 2;
582         if !self.data.is_empty()
583             && len >= self.min_prefix + self.min_suffix
584             && padded_len <= MAX_HYPHEN_SIZE
585         {
586             let header = Header::new(self.data);
587             let mut alpha_codes: [u16; MAX_HYPHEN_SIZE as usize] = [0; MAX_HYPHEN_SIZE as usize];
588             let hyphen_value = if let Some(alphabet) = header.alphabet_table() {
589                 alphabet.lookup(&mut alpha_codes, word)
590             } else {
591                 HyphenationType::DontBreak
592             };
593 
594             if hyphen_value != HyphenationType::DontBreak {
595                 self.hyphenate_from_codes(alpha_codes, padded_len, hyphen_value, word, out);
596                 return;
597             }
598             // TODO: try NFC normalization
599             // TODO: handle non-BMP Unicode (requires remapping of offsets)
600         }
601         // Note that we will always get here if the word contains a hyphen or a soft hyphen, because
602         // the alphabet is not expected to contain a hyphen or a soft hyphen character, so
603         // alphabetLookup would return DONT_BREAK.
604         self.hyphenate_with_no_pattern(word, out);
605     }
606 
607     /// This function determines whether a character is like U+2010 HYPHEN in line breaking and
608     /// usage: a character immediately after which line breaks are allowed, but words containing
609     /// it should not be automatically hyphenated using patterns. This is a curated set, created by
610     /// manually inspecting all the characters that have the Unicode line breaking property of BA or
611     /// HY and seeing which ones are hyphens.
is_line_breaking_hyphen(c: u16) -> bool612     fn is_line_breaking_hyphen(c: u16) -> bool {
613         c == 0x002D ||  // HYPHEN-MINUS
614             c == 0x058A ||  // ARMENIAN HYPHEN
615             c == 0x05BE ||  // HEBREW PUNCTUATION MAQAF
616             c == 0x1400 ||  // CANADIAN SYLLABICS HYPHEN
617             c == 0x2010 ||  // HYPHEN
618             c == 0x2013 ||  // EN DASH
619             c == 0x2027 ||  // HYPHENATION POINT
620             c == 0x2E17 ||  // DOUBLE OBLIQUE HYPHEN
621             c == 0x2E40 // DOUBLE HYPHEN
622     }
623 
624     /// Resolves the hyphenation type for Arabic text.
625     /// In case of Arabic text, the letter form should not be changed by hyphenation.
626     /// So, if the hyphenation is in the middle of the joining context, insert ZWJ for keeping the
627     /// form from the original text.
get_hyph_type_for_arabic(word: &[u16], location: u32) -> HyphenationType628     fn get_hyph_type_for_arabic(word: &[u16], location: u32) -> HyphenationType {
629         let mut i = location;
630         let mut join_type: u8 = U_JT_NON_JOINING;
631         while i < word.len().try_into().unwrap() {
632             join_type = getJoiningType(word[i as usize].into());
633             if join_type != U_JT_TRANSPARENT {
634                 break;
635             }
636             i += 1;
637         }
638         if join_type == U_JT_DUAL_JOINING
639             || join_type == U_JT_RIGHT_JOINING
640             || join_type == U_JT_JOIN_CAUSING
641         {
642             // The next character is of the type that may join the last character. See if the last
643             // character is also of the right type.
644             join_type = U_JT_NON_JOINING;
645             if i >= 2 {
646                 i = location - 2; // skip the soft hyphen
647                 loop {
648                     join_type = getJoiningType(word[i as usize].into());
649                     if join_type != U_JT_TRANSPARENT {
650                         break;
651                     }
652                     if i == 0 {
653                         break;
654                     }
655                     i -= 1;
656                 }
657             }
658             if join_type == U_JT_DUAL_JOINING
659                 || join_type == U_JT_LEFT_JOINING
660                 || join_type == U_JT_JOIN_CAUSING
661             {
662                 return HyphenationType::BreakAndInsertHyphenAndZwj;
663             }
664         }
665         HyphenationType::BreakAndInsertHyphen
666     }
667 
668     /// Performs the hyphenation without pattern files.
hyphenate_with_no_pattern(&self, word: &[u16], out: &mut [u8])669     fn hyphenate_with_no_pattern(&self, word: &[u16], out: &mut [u8]) {
670         let word_len: u32 = word.len().try_into().unwrap();
671         out[0] = HyphenationType::DontBreak as u8;
672         for i in 1..word_len {
673             let prev_char = word[i as usize - 1];
674             if i > 1 && Self::is_line_breaking_hyphen(prev_char) {
675                 if (prev_char == CHAR_HYPHEN_MINUS || prev_char == CHAR_HYPHEN)
676                     && (self.locale == HyphenationLocale::Polish
677                         || self.locale == HyphenationLocale::Slovenian)
678                     && getScript(word[i as usize].into()) == USCRIPT_LATIN
679                 {
680                     // In Polish and Slovenian, hyphens get repeated at the next line. To be safe,
681                     // we will do this only if the next character is Latin.
682                     out[i as usize] = HyphenationType::BreakAndInsertHyphenAtNextLine as u8;
683                 } else {
684                     out[i as usize] = HyphenationType::BreakAndDontInsertHyphen as u8;
685                 }
686             } else if i > 1 && prev_char == CHAR_SOFT_HYPHEN {
687                 // Break after soft hyphens, but only if they don't start the word (a soft hyphen
688                 // starting the word doesn't give any useful break opportunities). The type of the
689                 // break is based on the script of the character we break on.
690                 if getScript(word[i as usize].into()) == USCRIPT_ARABIC {
691                     // For Arabic, we need to look and see if the characters around the soft hyphen
692                     // actually join. If they don't, we'll just insert a normal hyphen.
693                     out[i as usize] = Self::get_hyph_type_for_arabic(word, i) as u8;
694                 } else {
695                     out[i as usize] =
696                         Self::hyphenation_type_based_on_script(word[i as usize] as u32) as u8;
697                 }
698             } else if prev_char == CHAR_MIDDLE_DOT
699                 && self.min_prefix < i
700                 && i <= word_len - self.min_suffix
701                 && ((word[i as usize - 2] == 'l' as u16 && word[i as usize] == 'l' as u16)
702                     || (word[i as usize - 2] == 'L' as u16 && word[i as usize] == 'L' as u16))
703                 && self.locale == HyphenationLocale::Catalan
704             {
705                 // In Catalan, "l·l" should break as "l-" on the first line
706                 // and "l" on the next line.
707                 out[i as usize] = HyphenationType::BreakAndReplaceWithHyphen as u8;
708             } else {
709                 out[i as usize] = HyphenationType::DontBreak as u8;
710             }
711         }
712     }
713 
714     /// Performs the hyphenation with pattern file.
hyphenate_from_codes( &self, codes: [u16; MAX_HYPHEN_SIZE as usize], len: u32, hyphen_value: HyphenationType, word: &[u16], out: &mut [u8], )715     fn hyphenate_from_codes(
716         &self,
717         codes: [u16; MAX_HYPHEN_SIZE as usize],
718         len: u32,
719         hyphen_value: HyphenationType,
720         word: &[u16],
721         out: &mut [u8],
722     ) {
723         let header = Header::new(self.data);
724         let trie = header.trie_table();
725         let pattern = header.pattern_table();
726         let char_mask = trie.char_mask();
727         let link_shift = trie.link_shift();
728         let link_mask = trie.link_mask();
729         let pattern_shift = trie.pattern_shift();
730         let max_offset = len - self.min_suffix - 1;
731 
732         for i in 0..(len - 1) {
733             let mut node: u32 = 0; // index into Trie table
734             for j in i..len {
735                 let c: u32 = codes[j as usize].into();
736                 let entry = trie.get_at(node + c);
737                 if (entry & char_mask) == c {
738                     node = (entry & link_mask) >> link_shift;
739                 } else {
740                     break;
741                 }
742                 let pat_ix = trie.get_at(node) >> pattern_shift;
743                 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an
744                 // offset into the buf pool. This is the pattern for the substring (i..j) we just
745                 // matched, which we combine (via point-wise max) into the buffer vector.
746                 if pat_ix != 0 {
747                     let pat_entry = pattern.entry_at(pat_ix);
748                     let pat_len = pat_entry.len();
749                     let pat_shift = pat_entry.shift();
750                     let offset = j + 1 - (pat_len + pat_shift);
751                     // offset is the index within buffer that lines up with the start of pat_buf
752                     let start = if self.min_prefix < offset { 0 } else { self.min_prefix - offset };
753                     if offset > max_offset {
754                         continue;
755                     }
756                     let end = cmp::min(pat_len, max_offset - offset);
757                     for k in start..end {
758                         out[(offset + k) as usize] =
759                             cmp::max(out[(offset + k) as usize], pat_entry.value_at(k));
760                     }
761                 }
762             }
763         }
764 
765         // Since the above calculation does not modify values outside
766         // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK.
767         for i in self.min_prefix as usize..max_offset as usize {
768             if out[i] & 1 == 0 {
769                 out[i] = HyphenationType::DontBreak as u8;
770                 continue;
771             }
772 
773             if i == 0 || !Self::is_line_breaking_hyphen(word[i - 1]) {
774                 out[i] = hyphen_value as u8;
775                 continue;
776             }
777 
778             if self.locale == HyphenationLocale::Portuguese {
779                 // In Portuguese, prefer to break before the hyphen, i.e. the line start with
780                 // the hyphen. If we see hyphenation break point after the hyphen character,
781                 // prefer to break before the hyphen.
782                 out[i - 1] = HyphenationType::BreakAndDontInsertHyphen as u8;
783                 out[i] = HyphenationType::DontBreak as u8; // Not prefer to break here because
784                                                            // this character is just after the
785                                                            // hyphen character.
786             } else {
787                 // If we see hyphen character just before this character, add hyphenation break
788                 // point and don't break here.
789                 out[i - 1] = HyphenationType::DontBreak as u8;
790                 out[i] = HyphenationType::BreakAndDontInsertHyphen as u8;
791             }
792         }
793     }
794 
hyphenation_type_based_on_script(code_point: u32) -> HyphenationType795     fn hyphenation_type_based_on_script(code_point: u32) -> HyphenationType {
796         let script = getScript(code_point);
797         if script == USCRIPT_KANNADA
798             || script == USCRIPT_MALAYALAM
799             || script == USCRIPT_TAMIL
800             || script == USCRIPT_TELUGU
801         {
802             HyphenationType::BreakAndDontInsertHyphen
803         } else if script == USCRIPT_ARMENIAN {
804             HyphenationType::BreakAndInsertArmenianHyphen
805         } else if script == USCRIPT_CANADIAN_ABORIGINAL {
806             HyphenationType::BreakAndInsertUcasHyphen
807         } else {
808             HyphenationType::BreakAndInsertHyphen
809         }
810     }
811 }
812