1 /* 2 * Copyright 2024 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 use std::cmp; 18 19 /// An implementation of hyphenation for Android. 20 /// 21 /// The hyphenation in Android is done with two steps: first performs the Knuth-Liang hyphenation 22 /// algorithm for populating all possible hyphenation points. Then, resolve hyphenation type from 23 /// the scripts and locales. 24 /// 25 /// The Knuth-Liang hyphenation works like as follows: 26 /// The Knuth-Liang hyphenation uses two dictionary: pattern dictionary and exception files. The 27 /// files end with ".hyp.txt" are exception files and the files end with ".pat.txt" are pattern 28 /// files. If the word is in exception file, the hyphenation is performed as it is specified in the 29 /// exception file. 30 /// 31 /// Then, if the word is not in the exception file, the Knuth-Liang hyphenation is performed with 32 /// hyphenation pattern dictionary. The hyphenation pattern dictionary is a list of sub-word with 33 /// hyphenation level as number. The level values are assigned between each letters including before 34 /// the first letter and last letter. If the value is odd number, then the position is a hyphenation 35 /// point. If the value is even number, then the position is not a hyphenation point. In the pattern 36 /// file, the 0 is dropped, so the meaning of "re4ti4z" is level values "0040040" for sub-word 37 /// "retiz". The hyphenation is performed by iterating all patterns and assigning level values to 38 /// the possible break points. If the break point can be assigned from multiple patterns, the 39 /// maximum value is used. If none of the pattern matches the break point, the level is zero, 40 /// therefore do not break. And finally the odd numbered positions are the break points. 41 /// 42 /// Here is an example how the "hyphenation" is broken into "hy-phen-ation". 43 /// The relevant patterns in the pattern dictionary are 44 /// - hy3ph 45 /// - he2n 46 /// - hena4 47 /// - hen5at 48 /// - 1na 49 /// - n2at 50 /// - 1tio 51 /// - 2io 52 /// - o2n 53 /// 54 /// Then when these patterns are applied to the word "hyphenation", it becomes like 55 /// 56 /// h y p h e n a t i o n 57 /// 0 0 3 0 0 : hy3ph 58 /// 0 0 2 0 : he2n 59 /// 0 0 0 0 4 : hena4 60 /// 0 0 0 5 0 0 : hen5at 61 /// 1 0 0 : 1na 62 /// 0 2 0 0 : n2at 63 /// 1 0 0 0 : 1tio 64 /// 2 0 0 : 2io 65 /// 0 2 0: o2n 66 /// --------------------------------- 67 /// 0 0 3 0 0 2 5 4 2 0 2 0: max 68 /// 69 /// Then, the odd-numbered break points are hyphenation allowed break points, so the result is 70 /// "hy-phen-ation". 71 /// 72 /// In the Android implementation, the hyphenation pattern file is preprocessed to Trie in build 73 /// time. For the detailed file format, see comments of HyphenationData struct. 74 /// 75 /// Once the all possible hyphenation break points are collected, the decide the hyphenation break 76 /// type is determined based on the script and locale. For example, in case of Arabic, the letter 77 /// form should not be changed by hyphenation, so ZWJ can be inserted before and after hyphen 78 /// letter. 79 80 const CHAR_SOFT_HYPHEN: u16 = 0x00AD; 81 const CHAR_MIDDLE_DOT: u16 = 0x00B7; 82 const CHAR_HYPHEN_MINUS: u16 = 0x002D; 83 const CHAR_HYPHEN: u16 = 0x2010; 84 85 // The following U_JT_* constants must be same to the ones defined in 86 // frameworks/minikin/lib/minikin/ffi/IciBridge.h 87 // TODO: Replace with ICU4X once it becomes available in Android. 88 const U_JT_NON_JOINING: u8 = 0; 89 const U_JT_DUAL_JOINING: u8 = 1; 90 const U_JT_RIGHT_JOINING: u8 = 2; 91 const U_JT_LEFT_JOINING: u8 = 3; 92 const U_JT_JOIN_CAUSING: u8 = 4; 93 const U_JT_TRANSPARENT: u8 = 5; 94 95 // The following USCRIPT_* constants must be same to the ones defined in 96 // frameworks/minikin/lib/minikin/ffi/IciBridge.h 97 // TODO: Replace with ICU4X once it becomes available in Android. 98 const USCRIPT_LATIN: u8 = 0; 99 const USCRIPT_ARABIC: u8 = 1; 100 const USCRIPT_KANNADA: u8 = 2; 101 const USCRIPT_MALAYALAM: u8 = 3; 102 const USCRIPT_TAMIL: u8 = 4; 103 const USCRIPT_TELUGU: u8 = 5; 104 const USCRIPT_ARMENIAN: u8 = 6; 105 const USCRIPT_CANADIAN_ABORIGINAL: u8 = 7; 106 107 use crate::ffi::getJoiningType; 108 use crate::ffi::getScript; 109 110 /// Hyphenation types 111 /// The following values must be equal to the ones in 112 /// frameworks/minikin/include/minikin/Hyphenator.h 113 #[repr(u8)] 114 #[derive(PartialEq, Copy, Clone)] 115 pub enum HyphenationType { 116 /// Do not break. 117 DontBreak = 0, 118 /// Break the line and insert a normal hyphen. 119 BreakAndInsertHyphen = 1, 120 /// Break the line and insert an Armenian hyphen (U+058A). 121 BreakAndInsertArmenianHyphen = 2, 122 /// Break the line and insert a Canadian Syllabics hyphen (U+1400). 123 BreakAndInsertUcasHyphen = 4, 124 /// Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen 125 /// present or the script does not use a hyphen (e.g. in Malayalam). 126 BreakAndDontInsertHyphen = 5, 127 /// Break and replace the last code unit with hyphen. Used for Catalan "l·l" which hyphenates 128 /// as "l-/l". 129 BreakAndReplaceWithHyphen = 6, 130 /// Break the line, and repeat the hyphen (which is the last character) at the beginning of the 131 /// next line. Used in Polish (where "czerwono-niebieska" should hyphenate as 132 /// "czerwono-/-niebieska") and Slovenian. 133 BreakAndInsertHyphenAtNextLine = 7, 134 /// Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line. 135 /// This is used in Arabic script, mostly for writing systems of Central Asia. It's our default 136 /// behavior when a soft hyphen is used in Arabic script. 137 BreakAndInsertHyphenAndZwj = 8, 138 } 139 140 /// Hyphenation locale 141 #[repr(u8)] 142 #[derive(PartialEq, Copy, Clone)] 143 pub enum HyphenationLocale { 144 /// Other locale 145 Other = 0, 146 /// Catalan 147 Catalan = 1, 148 /// Polish 149 Polish = 2, 150 /// Slovenian 151 Slovenian = 3, 152 /// Portuguese 153 Portuguese = 4, 154 } 155 156 const MAX_HYPHEN_SIZE: u32 = 64; 157 158 struct HyphenationData<'a> { 159 bytes: &'a [u8], 160 } 161 162 /// The Hyphenation pattern file is encoded into binary format during build time. 163 /// The hyphenation pattern file is encoded into three objects: AlphabetTable, Trie, Patterns. 164 /// 165 /// First, to avoid high value of utf16 char values in Trie object, char values are mapped to 166 /// internal alphabet codes. The AlphabetTable0 and AndroidTable1 has a map from utf16 char values 167 /// to internal u16 alphabet codes. The AlphabetTable0 is used if the min and max used code points 168 /// has less than 1024, i.e. max_codepoint - min_codepoint < 1024. The AlphabetTable1 is used 169 /// otherwise. 170 /// 171 /// Then, the pattern file is encoded with Trie and Pattern object with using internal 172 /// alphabet code. For example, in case of the entry "ef5i5nite", the hyphenation score "00550000" 173 /// is stored in the Pattern object and the subword "efinite" is stored in the Trie object. 174 /// 175 /// The Trie object is encoded as one dimensional u32 arrays. Each u32 integer contains packed 176 /// index to the Pattern object, index to the next node entry and alphabet code. 177 /// Trie Entry: 178 /// 0 1 2 3 179 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) 180 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 181 /// | index to pattern data | index to the next node | code | 182 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 183 /// Note: the layout is as an example of pattern_shift = 19, link_shift = 5. 184 /// 185 /// The Pattern object is encoded into two data: entry list and data payload. The entry is a packed 186 /// u32 integer that contains length of the pattern, an amount of shift of the pattern index and 187 /// an offset from the payload head. 188 /// 189 /// Pattern Entry: 190 /// 0 1 2 3 191 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) 192 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 193 /// |len of pat | pat shift | offset to the pattern data | 194 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 195 /// 196 /// The pattern data and related information can be obtained as follows: 197 /// 198 /// // Pattern information 199 /// let entry = pattern[16 + pattern_index * 4] // read u32 as little endian. 200 /// let pattern_length = entry >> 26 201 /// let pattern_shift = (entry > 20) & 0x3f 202 /// 203 /// // Pattern value retrieval: i-th offset in the word. 204 /// let pattern_offset = pattern[8] // read u32 as little endian. 205 /// let pattern_value = pattern[pattern_offset + (entry & 0xfffff) + i] 206 impl<'a> HyphenationData<'a> { new(bytes: &'a [u8]) -> Self207 pub const fn new(bytes: &'a [u8]) -> Self { 208 HyphenationData { bytes } 209 } 210 read_u32(&self, offset: u32) -> u32211 pub fn read_u32(&self, offset: u32) -> u32 { 212 let usize_offset = offset as usize; 213 self.bytes 214 .get(usize_offset..usize_offset + 4) 215 .map(|x: &[u8]| u32::from_le_bytes(x.try_into().unwrap())) 216 .unwrap() 217 } 218 } 219 220 /// Header struct of the hyphenation pattern file. 221 /// The object layout follows: 222 /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) 223 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 224 /// | magic | version |alphabet offset| trie offset | 225 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 226 /// |pattern offset | file size | 227 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 228 pub struct Header<'a> { 229 data: HyphenationData<'a>, 230 } 231 232 /// Alphabet Table version 0 struct of the hyphenation pattern file. 233 /// The object layout follows: 234 /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) 235 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 236 /// | version | min codepoint | max codepoint | payload 237 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 238 pub struct AlphabetTable0<'a> { 239 data: HyphenationData<'a>, 240 min_codepoint: u32, 241 max_codepoint: u32, 242 } 243 244 /// Alphabet Table version 1 struct of the hyphenation pattern file. 245 /// The object layout follows: 246 /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) 247 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 248 /// | version | num of entries| payload 249 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 250 pub struct AlphabetTable1<'a> { 251 data: HyphenationData<'a>, 252 num_entries: u32, 253 } 254 255 /// An entry of alphabet table version 1 struct of the hyphenation pattern file. 256 /// The entry is packed u32 value: the high 21 bits are code point and low 11 bits 257 /// are alphabet code value. 258 /// 0 1 2 3 259 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) 260 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 261 /// | code point | code value | 262 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 263 pub struct AlphabetTable1Entry { 264 entry: u32, 265 } 266 267 /// Trie struct of the hyphenation pattern file. 268 /// The object layout follows: 269 /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) 270 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 271 /// | version | char mask | link shift | link mask | 272 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 273 /// | pattern shift | num entries | payload 274 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 275 pub struct Trie<'a> { 276 data: HyphenationData<'a>, 277 } 278 279 /// Pattern struct of the hyphenation pattern file. 280 /// The object layout follows: 281 /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) 282 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 283 /// | version | num entries | pattern offset| pattern size | 284 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 285 /// | payload 286 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 287 pub struct Pattern<'a> { 288 data: HyphenationData<'a>, 289 pattern_offset: u32, 290 } 291 292 /// An entry of pattern struct of the hyphenation pattern file. 293 /// The entry is packed u32 value: the highest 6 bits are for length, next 6 bits are amount of 294 /// shift, and lowest 20 bits are offset of the first value from the pattern offset value. 295 /// 0 1 2 3 296 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) 297 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 298 /// | length | shift | offset of the first value | 299 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 300 pub struct PatternEntry<'a> { 301 data: HyphenationData<'a>, 302 pattern_offset: u32, 303 entry: u32, 304 } 305 306 impl<'a> Header<'a> { 307 /// Construct a reader of the Header struct from the byte array. new(bytes: &'a [u8]) -> Self308 pub const fn new(bytes: &'a [u8]) -> Self { 309 Header { data: HyphenationData::new(bytes) } 310 } 311 312 /// Returns the reader of the alphabet code. alphabet_table(&self) -> Option<Box<dyn AlphabetLookup + 'a>>313 pub fn alphabet_table(&self) -> Option<Box<dyn AlphabetLookup + 'a>> { 314 let offset = self.data.read_u32(8); 315 let version = self.data.read_u32(offset); 316 return match version { 317 0 => Some(Box::new(AlphabetTable0::new(self.read_offset_and_slice(8)))), 318 1 => Some(Box::new(AlphabetTable1::new(self.read_offset_and_slice(8)))), 319 _ => None, 320 }; 321 } 322 323 /// Returns the reader of the trie struct. trie_table(&self) -> Trie<'a>324 pub fn trie_table(&self) -> Trie<'a> { 325 Trie::new(self.read_offset_and_slice(12)) 326 } 327 328 /// Returns the reader of the pattern struct. pattern_table(&self) -> Pattern<'a>329 pub fn pattern_table(&self) -> Pattern<'a> { 330 Pattern::new(self.read_offset_and_slice(16)) 331 } 332 read_offset_and_slice(&self, offset: u32) -> &'a [u8]333 fn read_offset_and_slice(&self, offset: u32) -> &'a [u8] { 334 let offset = self.data.read_u32(offset) as usize; 335 self.data.bytes.get(offset..).unwrap() 336 } 337 } 338 339 pub trait AlphabetLookup { 340 /// Get the alphabet code for the code point. get_at(&self, c: u32) -> Option<u16>341 fn get_at(&self, c: u32) -> Option<u16>; 342 343 /// Lookup the internal alphabet codes from UTF-16 character codes. lookup( &self, alpha_codes: &mut [u16; MAX_HYPHEN_SIZE as usize], word: &[u16], ) -> HyphenationType344 fn lookup( 345 &self, 346 alpha_codes: &mut [u16; MAX_HYPHEN_SIZE as usize], 347 word: &[u16], 348 ) -> HyphenationType { 349 let mut result = HyphenationType::BreakAndInsertHyphen; 350 alpha_codes[0] = 0; // word start 351 for i in 0..word.len() { 352 let c = word[i] as u32; 353 if let Some(code) = self.get_at(c) { 354 alpha_codes[i + 1] = code; 355 } else { 356 return HyphenationType::DontBreak; 357 } 358 if result == HyphenationType::BreakAndInsertHyphen { 359 result = Hyphenator::hyphenation_type_based_on_script(c); 360 } 361 } 362 alpha_codes[word.len() + 1] = 0; // word termination 363 result 364 } 365 } 366 367 /// Map from utf16 code unit to the internal alphabet code. 368 impl<'a> AlphabetTable0<'a> { 369 /// Construct a reader of the Alphabet Table version 0 struct from the byte array. new(bytes: &'a [u8]) -> Self370 pub fn new(bytes: &'a [u8]) -> Self { 371 let data = HyphenationData::new(bytes); 372 let min_codepoint = data.read_u32(4); 373 let max_codepoint = data.read_u32(8); 374 AlphabetTable0 { data, min_codepoint, max_codepoint } 375 } 376 } 377 378 impl<'a> AlphabetLookup for AlphabetTable0<'a> { 379 /// Returns an entry of the specified offset. get_at(&self, offset: u32) -> Option<u16>380 fn get_at(&self, offset: u32) -> Option<u16> { 381 if offset < self.min_codepoint || offset >= self.max_codepoint { 382 None 383 } else { 384 let code = self.data.bytes[(offset - self.min_codepoint) as usize + 12] as u16; 385 if code == 0 { 386 None 387 } else { 388 Some(code) 389 } 390 } 391 } 392 } 393 394 /// Map from utf16 code unit to the internal alphabet code. 395 impl<'a> AlphabetTable1<'a> { 396 /// Construct a reader of the Alphabet Table version 1 struct from the byte array. new(bytes: &'a [u8]) -> Self397 pub fn new(bytes: &'a [u8]) -> Self { 398 let data = HyphenationData::new(bytes); 399 let num_entries = data.read_u32(4); 400 AlphabetTable1 { data, num_entries } 401 } 402 lower_bounds(&self, value: u32) -> Option<u32>403 fn lower_bounds(&self, value: u32) -> Option<u32> { 404 let mut b = 0; 405 let mut e = self.num_entries; 406 while b != e { 407 let m = b + (e - b) / 2; 408 let c = self.data.read_u32(8 + m * 4); 409 if c >= value { 410 e = m; 411 } else { 412 b = m + 1; 413 } 414 } 415 if b == self.num_entries { 416 None 417 } else { 418 Some(b) 419 } 420 } 421 } 422 423 impl<'a> AlphabetLookup for AlphabetTable1<'a> { get_at(&self, c: u32) -> Option<u16>424 fn get_at(&self, c: u32) -> Option<u16> { 425 if let Some(r) = self.lower_bounds(c << 11) { 426 let entry = AlphabetTable1Entry::new(self.data.read_u32(8 + r * 4)); 427 if entry.codepoint() == c { 428 Some(entry.value()) 429 } else { 430 None 431 } 432 } else { 433 None 434 } 435 } 436 } 437 438 /// A packed u32 entry of the AlphabetTable1. 439 impl AlphabetTable1Entry { new(entry_value: u32) -> Self440 pub const fn new(entry_value: u32) -> Self { 441 AlphabetTable1Entry { entry: entry_value } 442 } 443 444 /// Unpack code point from entry value. codepoint(&self) -> u32445 pub fn codepoint(&self) -> u32 { 446 self.entry >> 11 447 } 448 449 /// Unpack value from entry value. value(&self) -> u16450 pub fn value(&self) -> u16 { 451 (self.entry & 0x7ff).try_into().unwrap() 452 } 453 } 454 455 /// A Trie object. 456 /// See the function comment of HyphenationData for the details. 457 impl<'a> Trie<'a> { 458 /// Construct a reader of the Trie struct from the byte array. new(bytes: &'a [u8]) -> Self459 pub const fn new(bytes: &'a [u8]) -> Self { 460 Trie { data: HyphenationData::new(bytes) } 461 } 462 463 /// Returns an entry of at the offset. 464 /// The entry of the next alphabet code is 465 /// 466 /// let entry = trie.get_at(node + alphabet_codes[char]) get_at(&self, offset: u32) -> u32467 pub fn get_at(&self, offset: u32) -> u32 { 468 self.data.read_u32(24 + offset * 4) 469 } 470 471 /// Returns the bit mask for the character code point of the node. 472 /// You can get node's character code point by 473 /// 474 /// let node_character = entry & char_mask. char_mask(&self) -> u32475 pub fn char_mask(&self) -> u32 { 476 self.data.read_u32(4) 477 } 478 479 /// Returns the amount of shift of the node index. 480 /// You can get node number as following 481 /// 482 /// let next_node = (entry & link_mask) >> link_shift link_shift(&self) -> u32483 pub fn link_shift(&self) -> u32 { 484 self.data.read_u32(8) 485 } 486 487 /// Returns the mask for the node index. 488 /// You can get node number as following 489 /// 490 /// let next_node = (entry & link_mask) >> link_shift link_mask(&self) -> u32491 pub fn link_mask(&self) -> u32 { 492 self.data.read_u32(12) 493 } 494 495 /// Returns the amount of shift of the pattern index. 496 /// You can get pattern index as following 497 /// 498 /// let pattern_index = entry >> pattern_shift pattern_shift(&self) -> u32499 pub fn pattern_shift(&self) -> u32 { 500 self.data.read_u32(16) 501 } 502 } 503 504 /// A Pattern object. 505 /// See the function comment of HyphenationData for the details. 506 impl<'a> Pattern<'a> { 507 /// Construct a reader of the Pattern struct from the byte array. new(bytes: &'a [u8]) -> Self508 pub fn new(bytes: &'a [u8]) -> Self { 509 let data = HyphenationData::new(bytes); 510 let pattern_offset = data.read_u32(8); 511 Pattern { data, pattern_offset } 512 } 513 514 /// Returns a packed u32 entry at the given offset. entry_at(&self, offset: u32) -> PatternEntry<'a>515 pub fn entry_at(&self, offset: u32) -> PatternEntry<'a> { 516 let entry = self.data.read_u32(16 + offset * 4); 517 PatternEntry::new(self.data.bytes, self.pattern_offset, entry) 518 } 519 } 520 521 /// An entry of the pattern object. 522 impl<'a> PatternEntry<'a> { 523 /// Construct a reader of the Pattern struct from the byte array. new(bytes: &'a [u8], pattern_offset: u32, entry: u32) -> Self524 pub const fn new(bytes: &'a [u8], pattern_offset: u32, entry: u32) -> Self { 525 PatternEntry { data: HyphenationData::new(bytes), pattern_offset, entry } 526 } 527 528 /// Unpack length of the pattern from the packed entry value. len(&self) -> u32529 pub fn len(&self) -> u32 { 530 self.entry >> 26 531 } 532 533 /// Unpack an amount of shift of the pattern data from the packed entry value. shift(&self) -> u32534 pub fn shift(&self) -> u32 { 535 (self.entry >> 20) & 0x3f 536 } 537 538 /// Returns a hyphenation score value at the offset in word with the entry. value_at(&self, offset: u32) -> u8539 pub fn value_at(&self, offset: u32) -> u8 { 540 self.data.bytes[(self.pattern_offset + (self.entry & 0xfffff) + offset) as usize] 541 } 542 } 543 544 /// Performs hyphenation 545 pub struct Hyphenator { 546 data: &'static [u8], 547 min_prefix: u32, 548 min_suffix: u32, 549 locale: HyphenationLocale, 550 } 551 552 impl Hyphenator { 553 /// Create a new hyphenator instance new(data: &'static [u8], min_prefix: u32, min_suffix: u32, locale: &str) -> Self554 pub fn new(data: &'static [u8], min_prefix: u32, min_suffix: u32, locale: &str) -> Self { 555 logger::init( 556 logger::Config::default() 557 .with_tag_on_device("Minikin") 558 .with_max_level(log::LevelFilter::Trace), 559 ); 560 Self { 561 data, 562 min_prefix, 563 min_suffix, 564 locale: if locale == "pl" { 565 HyphenationLocale::Polish 566 } else if locale == "ca" { 567 HyphenationLocale::Catalan 568 } else if locale == "sl" { 569 HyphenationLocale::Slovenian 570 } else if locale == "pt" { 571 HyphenationLocale::Portuguese 572 } else { 573 HyphenationLocale::Other 574 }, 575 } 576 } 577 578 /// Performs a hyphenation hyphenate(&self, word: &[u16], out: &mut [u8])579 pub fn hyphenate(&self, word: &[u16], out: &mut [u8]) { 580 let len: u32 = word.len().try_into().unwrap(); 581 let padded_len = len + 2; 582 if !self.data.is_empty() 583 && len >= self.min_prefix + self.min_suffix 584 && padded_len <= MAX_HYPHEN_SIZE 585 { 586 let header = Header::new(self.data); 587 let mut alpha_codes: [u16; MAX_HYPHEN_SIZE as usize] = [0; MAX_HYPHEN_SIZE as usize]; 588 let hyphen_value = if let Some(alphabet) = header.alphabet_table() { 589 alphabet.lookup(&mut alpha_codes, word) 590 } else { 591 HyphenationType::DontBreak 592 }; 593 594 if hyphen_value != HyphenationType::DontBreak { 595 self.hyphenate_from_codes(alpha_codes, padded_len, hyphen_value, word, out); 596 return; 597 } 598 // TODO: try NFC normalization 599 // TODO: handle non-BMP Unicode (requires remapping of offsets) 600 } 601 // Note that we will always get here if the word contains a hyphen or a soft hyphen, because 602 // the alphabet is not expected to contain a hyphen or a soft hyphen character, so 603 // alphabetLookup would return DONT_BREAK. 604 self.hyphenate_with_no_pattern(word, out); 605 } 606 607 /// This function determines whether a character is like U+2010 HYPHEN in line breaking and 608 /// usage: a character immediately after which line breaks are allowed, but words containing 609 /// it should not be automatically hyphenated using patterns. This is a curated set, created by 610 /// manually inspecting all the characters that have the Unicode line breaking property of BA or 611 /// HY and seeing which ones are hyphens. is_line_breaking_hyphen(c: u16) -> bool612 fn is_line_breaking_hyphen(c: u16) -> bool { 613 c == 0x002D || // HYPHEN-MINUS 614 c == 0x058A || // ARMENIAN HYPHEN 615 c == 0x05BE || // HEBREW PUNCTUATION MAQAF 616 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN 617 c == 0x2010 || // HYPHEN 618 c == 0x2013 || // EN DASH 619 c == 0x2027 || // HYPHENATION POINT 620 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN 621 c == 0x2E40 // DOUBLE HYPHEN 622 } 623 624 /// Resolves the hyphenation type for Arabic text. 625 /// In case of Arabic text, the letter form should not be changed by hyphenation. 626 /// So, if the hyphenation is in the middle of the joining context, insert ZWJ for keeping the 627 /// form from the original text. get_hyph_type_for_arabic(word: &[u16], location: u32) -> HyphenationType628 fn get_hyph_type_for_arabic(word: &[u16], location: u32) -> HyphenationType { 629 let mut i = location; 630 let mut join_type: u8 = U_JT_NON_JOINING; 631 while i < word.len().try_into().unwrap() { 632 join_type = getJoiningType(word[i as usize].into()); 633 if join_type != U_JT_TRANSPARENT { 634 break; 635 } 636 i += 1; 637 } 638 if join_type == U_JT_DUAL_JOINING 639 || join_type == U_JT_RIGHT_JOINING 640 || join_type == U_JT_JOIN_CAUSING 641 { 642 // The next character is of the type that may join the last character. See if the last 643 // character is also of the right type. 644 join_type = U_JT_NON_JOINING; 645 if i >= 2 { 646 i = location - 2; // skip the soft hyphen 647 loop { 648 join_type = getJoiningType(word[i as usize].into()); 649 if join_type != U_JT_TRANSPARENT { 650 break; 651 } 652 if i == 0 { 653 break; 654 } 655 i -= 1; 656 } 657 } 658 if join_type == U_JT_DUAL_JOINING 659 || join_type == U_JT_LEFT_JOINING 660 || join_type == U_JT_JOIN_CAUSING 661 { 662 return HyphenationType::BreakAndInsertHyphenAndZwj; 663 } 664 } 665 HyphenationType::BreakAndInsertHyphen 666 } 667 668 /// Performs the hyphenation without pattern files. hyphenate_with_no_pattern(&self, word: &[u16], out: &mut [u8])669 fn hyphenate_with_no_pattern(&self, word: &[u16], out: &mut [u8]) { 670 let word_len: u32 = word.len().try_into().unwrap(); 671 out[0] = HyphenationType::DontBreak as u8; 672 for i in 1..word_len { 673 let prev_char = word[i as usize - 1]; 674 if i > 1 && Self::is_line_breaking_hyphen(prev_char) { 675 if (prev_char == CHAR_HYPHEN_MINUS || prev_char == CHAR_HYPHEN) 676 && (self.locale == HyphenationLocale::Polish 677 || self.locale == HyphenationLocale::Slovenian) 678 && getScript(word[i as usize].into()) == USCRIPT_LATIN 679 { 680 // In Polish and Slovenian, hyphens get repeated at the next line. To be safe, 681 // we will do this only if the next character is Latin. 682 out[i as usize] = HyphenationType::BreakAndInsertHyphenAtNextLine as u8; 683 } else { 684 out[i as usize] = HyphenationType::BreakAndDontInsertHyphen as u8; 685 } 686 } else if i > 1 && prev_char == CHAR_SOFT_HYPHEN { 687 // Break after soft hyphens, but only if they don't start the word (a soft hyphen 688 // starting the word doesn't give any useful break opportunities). The type of the 689 // break is based on the script of the character we break on. 690 if getScript(word[i as usize].into()) == USCRIPT_ARABIC { 691 // For Arabic, we need to look and see if the characters around the soft hyphen 692 // actually join. If they don't, we'll just insert a normal hyphen. 693 out[i as usize] = Self::get_hyph_type_for_arabic(word, i) as u8; 694 } else { 695 out[i as usize] = 696 Self::hyphenation_type_based_on_script(word[i as usize] as u32) as u8; 697 } 698 } else if prev_char == CHAR_MIDDLE_DOT 699 && self.min_prefix < i 700 && i <= word_len - self.min_suffix 701 && ((word[i as usize - 2] == 'l' as u16 && word[i as usize] == 'l' as u16) 702 || (word[i as usize - 2] == 'L' as u16 && word[i as usize] == 'L' as u16)) 703 && self.locale == HyphenationLocale::Catalan 704 { 705 // In Catalan, "l·l" should break as "l-" on the first line 706 // and "l" on the next line. 707 out[i as usize] = HyphenationType::BreakAndReplaceWithHyphen as u8; 708 } else { 709 out[i as usize] = HyphenationType::DontBreak as u8; 710 } 711 } 712 } 713 714 /// Performs the hyphenation with pattern file. hyphenate_from_codes( &self, codes: [u16; MAX_HYPHEN_SIZE as usize], len: u32, hyphen_value: HyphenationType, word: &[u16], out: &mut [u8], )715 fn hyphenate_from_codes( 716 &self, 717 codes: [u16; MAX_HYPHEN_SIZE as usize], 718 len: u32, 719 hyphen_value: HyphenationType, 720 word: &[u16], 721 out: &mut [u8], 722 ) { 723 let header = Header::new(self.data); 724 let trie = header.trie_table(); 725 let pattern = header.pattern_table(); 726 let char_mask = trie.char_mask(); 727 let link_shift = trie.link_shift(); 728 let link_mask = trie.link_mask(); 729 let pattern_shift = trie.pattern_shift(); 730 let max_offset = len - self.min_suffix - 1; 731 732 for i in 0..(len - 1) { 733 let mut node: u32 = 0; // index into Trie table 734 for j in i..len { 735 let c: u32 = codes[j as usize].into(); 736 let entry = trie.get_at(node + c); 737 if (entry & char_mask) == c { 738 node = (entry & link_mask) >> link_shift; 739 } else { 740 break; 741 } 742 let pat_ix = trie.get_at(node) >> pattern_shift; 743 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an 744 // offset into the buf pool. This is the pattern for the substring (i..j) we just 745 // matched, which we combine (via point-wise max) into the buffer vector. 746 if pat_ix != 0 { 747 let pat_entry = pattern.entry_at(pat_ix); 748 let pat_len = pat_entry.len(); 749 let pat_shift = pat_entry.shift(); 750 let offset = j + 1 - (pat_len + pat_shift); 751 // offset is the index within buffer that lines up with the start of pat_buf 752 let start = if self.min_prefix < offset { 0 } else { self.min_prefix - offset }; 753 if offset > max_offset { 754 continue; 755 } 756 let end = cmp::min(pat_len, max_offset - offset); 757 for k in start..end { 758 out[(offset + k) as usize] = 759 cmp::max(out[(offset + k) as usize], pat_entry.value_at(k)); 760 } 761 } 762 } 763 } 764 765 // Since the above calculation does not modify values outside 766 // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK. 767 for i in self.min_prefix as usize..max_offset as usize { 768 if out[i] & 1 == 0 { 769 out[i] = HyphenationType::DontBreak as u8; 770 continue; 771 } 772 773 if i == 0 || !Self::is_line_breaking_hyphen(word[i - 1]) { 774 out[i] = hyphen_value as u8; 775 continue; 776 } 777 778 if self.locale == HyphenationLocale::Portuguese { 779 // In Portuguese, prefer to break before the hyphen, i.e. the line start with 780 // the hyphen. If we see hyphenation break point after the hyphen character, 781 // prefer to break before the hyphen. 782 out[i - 1] = HyphenationType::BreakAndDontInsertHyphen as u8; 783 out[i] = HyphenationType::DontBreak as u8; // Not prefer to break here because 784 // this character is just after the 785 // hyphen character. 786 } else { 787 // If we see hyphen character just before this character, add hyphenation break 788 // point and don't break here. 789 out[i - 1] = HyphenationType::DontBreak as u8; 790 out[i] = HyphenationType::BreakAndDontInsertHyphen as u8; 791 } 792 } 793 } 794 hyphenation_type_based_on_script(code_point: u32) -> HyphenationType795 fn hyphenation_type_based_on_script(code_point: u32) -> HyphenationType { 796 let script = getScript(code_point); 797 if script == USCRIPT_KANNADA 798 || script == USCRIPT_MALAYALAM 799 || script == USCRIPT_TAMIL 800 || script == USCRIPT_TELUGU 801 { 802 HyphenationType::BreakAndDontInsertHyphen 803 } else if script == USCRIPT_ARMENIAN { 804 HyphenationType::BreakAndInsertArmenianHyphen 805 } else if script == USCRIPT_CANADIAN_ABORIGINAL { 806 HyphenationType::BreakAndInsertUcasHyphen 807 } else { 808 HyphenationType::BreakAndInsertHyphen 809 } 810 } 811 } 812