1 // Copyright 2023 The Mozilla Foundation. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use super::TextSource;
11 
12 use alloc::borrow::Cow;
13 use alloc::vec::Vec;
14 use core::char;
15 use core::ops::Range;
16 
17 use crate::{
18     compute_bidi_info_for_para, compute_initial_info, level, para_direction, reorder_levels,
19     reorder_visual, visual_runs_for_line,
20 };
21 use crate::{BidiClass, BidiDataSource, Direction, Level, LevelRun, ParagraphInfo};
22 
23 #[cfg(feature = "hardcoded-data")]
24 use crate::HardcodedBidiData;
25 
/// Initial bidi information of the text (UTF-16 version).
///
/// Contains the text paragraphs and `BidiClass` of its characters.
/// The `original_classes` vector is indexed by code unit offsets into the text.
#[derive(PartialEq, Debug)]
pub struct InitialInfo<'text> {
    /// The text, as a slice of UTF-16 code units.
    pub text: &'text [u16],

    /// The BidiClass of the character at each code unit in the text.
    /// If a character is multiple code units, its class will appear multiple times in the vector.
    pub original_classes: Vec<BidiClass>,

    /// The boundaries and level of each paragraph within the text.
    pub paragraphs: Vec<ParagraphInfo>,
}
41 
42 impl<'text> InitialInfo<'text> {
43     /// Find the paragraphs and BidiClasses in a string of text.
44     ///
45     /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
46     ///
47     /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
48     /// character is found before the matching PDI.  If no strong character is found, the class will
49     /// remain FSI, and it's up to later stages to treat these as LRI when needed.
50     ///
51     /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
52     #[cfg_attr(feature = "flame_it", flamer::flame)]
53     #[cfg(feature = "hardcoded-data")]
new(text: &[u16], default_para_level: Option<Level>) -> InitialInfo<'_>54     pub fn new(text: &[u16], default_para_level: Option<Level>) -> InitialInfo<'_> {
55         Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
56     }
57 
58     /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`]
59     /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`]
60     /// instead (enabled with tbe default `hardcoded-data` Cargo feature)
61     ///
62     /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
63     ///
64     /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
65     /// character is found before the matching PDI.  If no strong character is found, the class will
66     /// remain FSI, and it's up to later stages to treat these as LRI when needed.
67     #[cfg_attr(feature = "flame_it", flamer::flame)]
new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a [u16], default_para_level: Option<Level>, ) -> InitialInfo<'a>68     pub fn new_with_data_source<'a, D: BidiDataSource>(
69         data_source: &D,
70         text: &'a [u16],
71         default_para_level: Option<Level>,
72     ) -> InitialInfo<'a> {
73         InitialInfoExt::new_with_data_source(data_source, text, default_para_level).base
74     }
75 }
76 
/// Extended version of InitialInfo (not public API).
///
/// Pairs the public `InitialInfo` with a per-paragraph "pure LTR" flag.
#[derive(PartialEq, Debug)]
struct InitialInfoExt<'text> {
    /// The base InitialInfo for the text, recording its paragraphs and bidi classes.
    base: InitialInfo<'text>,

    /// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that
    /// requires no further bidi processing (i.e. there are no RTL characters or bidi
    /// control codes present).
    pure_ltr: Vec<bool>,
}
88 
89 impl<'text> InitialInfoExt<'text> {
90     /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`]
91     /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`]
92     /// instead (enabled with tbe default `hardcoded-data` Cargo feature)
93     ///
94     /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
95     ///
96     /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
97     /// character is found before the matching PDI.  If no strong character is found, the class will
98     /// remain FSI, and it's up to later stages to treat these as LRI when needed.
99     #[cfg_attr(feature = "flame_it", flamer::flame)]
new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a [u16], default_para_level: Option<Level>, ) -> InitialInfoExt<'a>100     pub fn new_with_data_source<'a, D: BidiDataSource>(
101         data_source: &D,
102         text: &'a [u16],
103         default_para_level: Option<Level>,
104     ) -> InitialInfoExt<'a> {
105         let mut paragraphs = Vec::<ParagraphInfo>::new();
106         let mut pure_ltr = Vec::<bool>::new();
107         let (original_classes, _, _) = compute_initial_info(
108             data_source,
109             text,
110             default_para_level,
111             Some((&mut paragraphs, &mut pure_ltr)),
112         );
113 
114         InitialInfoExt {
115             base: InitialInfo {
116                 text,
117                 original_classes,
118                 paragraphs,
119             },
120             pure_ltr,
121         }
122     }
123 }
124 
/// Bidi information of the text (UTF-16 version).
///
/// The `original_classes` and `levels` vectors are indexed by code unit offsets into the text.  If a
/// character is multiple code units wide, then its class and level will appear multiple times in these
/// vectors.
// TODO: Impl `struct StringProperty<T> { values: Vec<T> }` and use instead of Vec<T>
#[derive(Debug, PartialEq)]
pub struct BidiInfo<'text> {
    /// The text, as a slice of UTF-16 code units.
    pub text: &'text [u16],

    /// The BidiClass of the character at each code unit in the text.
    pub original_classes: Vec<BidiClass>,

    /// The directional embedding level of each code unit in the text.
    pub levels: Vec<Level>,

    /// The boundaries and paragraph embedding level of each paragraph within the text.
    ///
    /// TODO: Use SmallVec or similar to avoid overhead when there are only one or two paragraphs?
    /// Or just don't include the first paragraph, which always starts at 0?
    pub paragraphs: Vec<ParagraphInfo>,
}
148 
149 impl<'text> BidiInfo<'text> {
150     /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph.
151     ///
152     ///
153     /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
154     ///
155     /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
156     /// text that is entirely LTR.  See the `nsBidi` class from Gecko for comparison.
157     ///
158     /// TODO: Support auto-RTL base direction
159     #[cfg_attr(feature = "flame_it", flamer::flame)]
160     #[cfg(feature = "hardcoded-data")]
161     #[inline]
new(text: &[u16], default_para_level: Option<Level>) -> BidiInfo<'_>162     pub fn new(text: &[u16], default_para_level: Option<Level>) -> BidiInfo<'_> {
163         Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
164     }
165 
166     /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph, with a custom [`BidiDataSource`]
167     /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`]
168     /// instead (enabled with tbe default `hardcoded-data` Cargo feature).
169     ///
170     /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
171     /// text that is entirely LTR.  See the `nsBidi` class from Gecko for comparison.
172     ///
173     /// TODO: Support auto-RTL base direction
174     #[cfg_attr(feature = "flame_it", flamer::flame)]
new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a [u16], default_para_level: Option<Level>, ) -> BidiInfo<'a>175     pub fn new_with_data_source<'a, D: BidiDataSource>(
176         data_source: &D,
177         text: &'a [u16],
178         default_para_level: Option<Level>,
179     ) -> BidiInfo<'a> {
180         let InitialInfoExt { base, pure_ltr, .. } =
181             InitialInfoExt::new_with_data_source(data_source, text, default_para_level);
182 
183         let mut levels = Vec::<Level>::with_capacity(text.len());
184         let mut processing_classes = base.original_classes.clone();
185 
186         for (para, is_pure_ltr) in base.paragraphs.iter().zip(pure_ltr.iter()) {
187             let text = &text[para.range.clone()];
188             let original_classes = &base.original_classes[para.range.clone()];
189 
190             compute_bidi_info_for_para(
191                 data_source,
192                 para,
193                 *is_pure_ltr,
194                 text,
195                 original_classes,
196                 &mut processing_classes,
197                 &mut levels,
198             );
199         }
200 
201         BidiInfo {
202             text,
203             original_classes: base.original_classes,
204             paragraphs: base.paragraphs,
205             levels,
206         }
207     }
208 
209     /// Produce the levels for this paragraph as needed for reordering, one level per *byte*
210     /// in the paragraph. The returned vector includes bytes that are not included
211     /// in the `line`, but will not adjust them.
212     ///
213     /// This runs [Rule L1], you can run
214     /// [Rule L2] by calling [`Self::reorder_visual()`].
215     /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
216     /// to avoid non-byte indices.
217     ///
218     /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
219     ///
220     /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
221     /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
222     #[cfg_attr(feature = "flame_it", flamer::flame)]
reordered_levels(&self, para: &ParagraphInfo, line: Range<usize>) -> Vec<Level>223     pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range<usize>) -> Vec<Level> {
224         assert!(line.start <= self.levels.len());
225         assert!(line.end <= self.levels.len());
226 
227         let mut levels = self.levels.clone();
228         let line_classes = &self.original_classes[line.clone()];
229         let line_levels = &mut levels[line.clone()];
230         let line_str: &[u16] = &self.text[line.clone()];
231 
232         reorder_levels(line_classes, line_levels, line_str, para.level);
233 
234         levels
235     }
236 
237     /// Produce the levels for this paragraph as needed for reordering, one level per *character*
238     /// in the paragraph. The returned vector includes characters that are not included
239     /// in the `line`, but will not adjust them.
240     ///
241     /// This runs [Rule L1], you can run
242     /// [Rule L2] by calling [`Self::reorder_visual()`].
243     /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
244     /// to avoid non-byte indices.
245     ///
246     /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
247     ///
248     /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
249     /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
250     #[cfg_attr(feature = "flame_it", flamer::flame)]
reordered_levels_per_char( &self, para: &ParagraphInfo, line: Range<usize>, ) -> Vec<Level>251     pub fn reordered_levels_per_char(
252         &self,
253         para: &ParagraphInfo,
254         line: Range<usize>,
255     ) -> Vec<Level> {
256         let levels = self.reordered_levels(para, line);
257         self.text.char_indices().map(|(i, _)| levels[i]).collect()
258     }
259 
260     /// Re-order a line based on resolved levels and return the line in display order.
261     ///
262     /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
263     ///
264     /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
265     /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
266     #[cfg_attr(feature = "flame_it", flamer::flame)]
reorder_line(&self, para: &ParagraphInfo, line: Range<usize>) -> Cow<'text, [u16]>267     pub fn reorder_line(&self, para: &ParagraphInfo, line: Range<usize>) -> Cow<'text, [u16]> {
268         if !level::has_rtl(&self.levels[line.clone()]) {
269             return self.text[line].into();
270         }
271         let (levels, runs) = self.visual_runs(para, line.clone());
272         reorder_line(self.text, line, levels, runs)
273     }
274 
275     /// Reorders pre-calculated levels of a sequence of characters.
276     ///
277     /// NOTE: This is a convenience method that does not use a `Paragraph`  object. It is
278     /// intended to be used when an application has determined the levels of the objects (character sequences)
279     /// and just needs to have them reordered.
280     ///
281     /// the index map will result in `indexMap[visualIndex]==logicalIndex`.
282     ///
283     /// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have
284     /// information about the actual text.
285     ///
286     /// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be
287     /// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level`
288     /// is for a single code point.
289     ///
290     ///
291     ///   # # Example
292     /// ```
293     /// use unicode_bidi::BidiInfo;
294     /// use unicode_bidi::Level;
295     ///
296     /// let l0 = Level::from(0);
297     /// let l1 = Level::from(1);
298     /// let l2 = Level::from(2);
299     ///
300     /// let levels = vec![l0, l0, l0, l0];
301     /// let index_map = BidiInfo::reorder_visual(&levels);
302     /// assert_eq!(levels.len(), index_map.len());
303     /// assert_eq!(index_map, [0, 1, 2, 3]);
304     ///
305     /// let levels: Vec<Level> = vec![l0, l0, l0, l1, l1, l1, l2, l2];
306     /// let index_map = BidiInfo::reorder_visual(&levels);
307     /// assert_eq!(levels.len(), index_map.len());
308     /// assert_eq!(index_map, [0, 1, 2, 6, 7, 5, 4, 3]);
309     /// ```
310     #[cfg_attr(feature = "flame_it", flamer::flame)]
311     #[inline]
reorder_visual(levels: &[Level]) -> Vec<usize>312     pub fn reorder_visual(levels: &[Level]) -> Vec<usize> {
313         reorder_visual(levels)
314     }
315 
316     /// Find the level runs within a line and return them in visual order.
317     ///
318     /// `line` is a range of bytes indices within `levels`.
319     ///
320     /// The first return value is a vector of levels used by the reordering algorithm,
321     /// i.e. the result of [Rule L1]. The second return value is a vector of level runs,
322     /// the result of [Rule L2], showing the visual order that each level run (a run of text with the
323     /// same level) should be displayed. Within each run, the display order can be checked
324     /// against the Level vector.
325     ///
326     /// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring),
327     /// as that should be handled by the engine using this API.
328     ///
329     /// Conceptually, this is the same as running [`Self::reordered_levels()`] followed by
330     /// [`Self::reorder_visual()`], however it returns the result as a list of level runs instead
331     /// of producing a level map, since one may wish to deal with the fact that this is operating on
332     /// byte rather than character indices.
333     ///
334     /// <http://www.unicode.org/reports/tr9/#Reordering_Resolved_Levels>
335     ///
336     /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
337     /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
338     /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
339     /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
340     #[cfg_attr(feature = "flame_it", flamer::flame)]
341     #[inline]
visual_runs( &self, para: &ParagraphInfo, line: Range<usize>, ) -> (Vec<Level>, Vec<LevelRun>)342     pub fn visual_runs(
343         &self,
344         para: &ParagraphInfo,
345         line: Range<usize>,
346     ) -> (Vec<Level>, Vec<LevelRun>) {
347         let levels = self.reordered_levels(para, line.clone());
348         visual_runs_for_line(levels, &line)
349     }
350 
351     /// If processed text has any computed RTL levels
352     ///
353     /// This information is usually used to skip re-ordering of text when no RTL level is present
354     #[inline]
has_rtl(&self) -> bool355     pub fn has_rtl(&self) -> bool {
356         level::has_rtl(&self.levels)
357     }
358 }
359 
/// Bidi information of text treated as a single paragraph (UTF-16 version).
///
/// The `original_classes` and `levels` vectors are indexed by code unit offsets into the text.  If a
/// character is multiple code units wide, then its class and level will appear multiple times in these
/// vectors.
#[derive(Debug, PartialEq)]
pub struct ParagraphBidiInfo<'text> {
    /// The text, as a slice of UTF-16 code units.
    pub text: &'text [u16],

    /// The BidiClass of the character at each code unit in the text.
    pub original_classes: Vec<BidiClass>,

    /// The directional embedding level of each code unit in the text.
    pub levels: Vec<Level>,

    /// The paragraph embedding level.
    pub paragraph_level: Level,

    /// Whether the paragraph is purely LTR.
    pub is_pure_ltr: bool,
}
382 
383 impl<'text> ParagraphBidiInfo<'text> {
384     /// Determine the bidi embedding level.
385     ///
386     ///
387     /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
388     ///
389     /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
390     /// text that is entirely LTR.  See the `nsBidi` class from Gecko for comparison.
391     ///
392     /// TODO: Support auto-RTL base direction
393     #[cfg_attr(feature = "flame_it", flamer::flame)]
394     #[cfg(feature = "hardcoded-data")]
395     #[inline]
new(text: &[u16], default_para_level: Option<Level>) -> ParagraphBidiInfo<'_>396     pub fn new(text: &[u16], default_para_level: Option<Level>) -> ParagraphBidiInfo<'_> {
397         Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
398     }
399 
400     /// Determine the bidi embedding level, with a custom [`BidiDataSource`]
401     /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`]
402     /// instead (enabled with tbe default `hardcoded-data` Cargo feature).
403     ///
404     /// (This is the single-paragraph equivalent of BidiInfo::new_with_data_source,
405     /// and should be kept in sync with it.
406     #[cfg_attr(feature = "flame_it", flamer::flame)]
new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a [u16], default_para_level: Option<Level>, ) -> ParagraphBidiInfo<'a>407     pub fn new_with_data_source<'a, D: BidiDataSource>(
408         data_source: &D,
409         text: &'a [u16],
410         default_para_level: Option<Level>,
411     ) -> ParagraphBidiInfo<'a> {
412         // Here we could create a ParagraphInitialInfo struct to parallel the one
413         // used by BidiInfo, but there doesn't seem any compelling reason for it.
414         let (original_classes, paragraph_level, is_pure_ltr) =
415             compute_initial_info(data_source, text, default_para_level, None);
416 
417         let mut levels = Vec::<Level>::with_capacity(text.len());
418         let mut processing_classes = original_classes.clone();
419 
420         let para_info = ParagraphInfo {
421             range: Range {
422                 start: 0,
423                 end: text.len(),
424             },
425             level: paragraph_level,
426         };
427 
428         compute_bidi_info_for_para(
429             data_source,
430             &para_info,
431             is_pure_ltr,
432             text,
433             &original_classes,
434             &mut processing_classes,
435             &mut levels,
436         );
437 
438         ParagraphBidiInfo {
439             text,
440             original_classes,
441             levels,
442             paragraph_level,
443             is_pure_ltr,
444         }
445     }
446 
447     /// Produce the levels for this paragraph as needed for reordering, one level per *code unit*
448     /// in the paragraph. The returned vector includes code units that are not included
449     /// in the `line`, but will not adjust them.
450     ///
451     /// See BidiInfo::reordered_levels for details.
452     ///
453     /// (This should be kept in sync with BidiInfo::reordered_levels.)
454     #[cfg_attr(feature = "flame_it", flamer::flame)]
reordered_levels(&self, line: Range<usize>) -> Vec<Level>455     pub fn reordered_levels(&self, line: Range<usize>) -> Vec<Level> {
456         assert!(line.start <= self.levels.len());
457         assert!(line.end <= self.levels.len());
458 
459         let mut levels = self.levels.clone();
460         let line_classes = &self.original_classes[line.clone()];
461         let line_levels = &mut levels[line.clone()];
462 
463         reorder_levels(
464             line_classes,
465             line_levels,
466             self.text.subrange(line),
467             self.paragraph_level,
468         );
469 
470         levels
471     }
472 
473     /// Produce the levels for this paragraph as needed for reordering, one level per *character*
474     /// in the paragraph. The returned vector includes characters that are not included
475     /// in the `line`, but will not adjust them.
476     ///
477     /// See BidiInfo::reordered_levels_per_char for details.
478     ///
479     /// (This should be kept in sync with BidiInfo::reordered_levels_per_char.)
480     #[cfg_attr(feature = "flame_it", flamer::flame)]
reordered_levels_per_char(&self, line: Range<usize>) -> Vec<Level>481     pub fn reordered_levels_per_char(&self, line: Range<usize>) -> Vec<Level> {
482         let levels = self.reordered_levels(line);
483         self.text.char_indices().map(|(i, _)| levels[i]).collect()
484     }
485 
486     /// Re-order a line based on resolved levels and return the line in display order.
487     ///
488     /// See BidiInfo::reorder_line for details.
489     ///
490     /// (This should be kept in sync with BidiInfo::reorder_line.)
491     #[cfg_attr(feature = "flame_it", flamer::flame)]
reorder_line(&self, line: Range<usize>) -> Cow<'text, [u16]>492     pub fn reorder_line(&self, line: Range<usize>) -> Cow<'text, [u16]> {
493         if !level::has_rtl(&self.levels[line.clone()]) {
494             return self.text[line].into();
495         }
496         let (levels, runs) = self.visual_runs(line.clone());
497         reorder_line(self.text, line, levels, runs)
498     }
499 
500     /// Reorders pre-calculated levels of a sequence of characters.
501     ///
502     /// See BidiInfo::reorder_visual for details.
503     #[cfg_attr(feature = "flame_it", flamer::flame)]
504     #[inline]
reorder_visual(levels: &[Level]) -> Vec<usize>505     pub fn reorder_visual(levels: &[Level]) -> Vec<usize> {
506         reorder_visual(levels)
507     }
508 
509     /// Find the level runs within a line and return them in visual order.
510     ///
511     /// `line` is a range of code-unit indices within `levels`.
512     ///
513     /// See `BidiInfo::visual_runs` for details.
514     ///
515     /// (This should be kept in sync with BidiInfo::visual_runs.)
516     #[cfg_attr(feature = "flame_it", flamer::flame)]
517     #[inline]
visual_runs(&self, line: Range<usize>) -> (Vec<Level>, Vec<LevelRun>)518     pub fn visual_runs(&self, line: Range<usize>) -> (Vec<Level>, Vec<LevelRun>) {
519         let levels = self.reordered_levels(line.clone());
520         visual_runs_for_line(levels, &line)
521     }
522 
523     /// If processed text has any computed RTL levels
524     ///
525     /// This information is usually used to skip re-ordering of text when no RTL level is present
526     #[inline]
has_rtl(&self) -> bool527     pub fn has_rtl(&self) -> bool {
528         !self.is_pure_ltr
529     }
530 
531     /// Return the paragraph's Direction (Ltr, Rtl, or Mixed) based on its levels.
532     #[inline]
direction(&self) -> Direction533     pub fn direction(&self) -> Direction {
534         para_direction(&self.levels)
535     }
536 }
537 
538 /// Return a line of the text in display order based on resolved levels.
539 ///
540 /// `text`   the full text passed to the `BidiInfo` or `ParagraphBidiInfo` for analysis
541 /// `line`   a range of byte indices within `text` corresponding to one line
542 /// `levels` array of `Level` values, with `line`'s levels reordered into visual order
543 /// `runs`   array of `LevelRun`s in visual order
544 ///
545 /// (`levels` and `runs` are the result of calling `BidiInfo::visual_runs()` or
546 /// `ParagraphBidiInfo::visual_runs()` for the line of interest.)
547 ///
548 /// Returns: the reordered text of the line.
549 ///
550 /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
551 ///
552 /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
553 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
reorder_line<'text>( text: &'text [u16], line: Range<usize>, levels: Vec<Level>, runs: Vec<LevelRun>, ) -> Cow<'text, [u16]>554 fn reorder_line<'text>(
555     text: &'text [u16],
556     line: Range<usize>,
557     levels: Vec<Level>,
558     runs: Vec<LevelRun>,
559 ) -> Cow<'text, [u16]> {
560     // If all isolating run sequences are LTR, no reordering is needed
561     if runs.iter().all(|run| levels[run.start].is_ltr()) {
562         return text[line].into();
563     }
564 
565     let mut result = Vec::<u16>::with_capacity(line.len());
566     for run in runs {
567         if levels[run.start].is_rtl() {
568             let mut buf = [0; 2];
569             for c in text[run].chars().rev() {
570                 result.extend(c.encode_utf16(&mut buf).iter());
571             }
572         } else {
573             result.extend(text[run].iter());
574         }
575     }
576     result.into()
577 }
578 
/// Contains a reference to a `BidiInfo` and to one of its `paragraphs`.
/// It supports the operations on a paragraph that also need its
/// `BidiInfo`, such as `direction`.
#[derive(Debug)]
pub struct Paragraph<'a, 'text> {
    /// The bidi information of the text.
    pub info: &'a BidiInfo<'text>,
    /// The paragraph this value refers to.
    pub para: &'a ParagraphInfo,
}
587 
588 impl<'a, 'text> Paragraph<'a, 'text> {
589     #[inline]
new(info: &'a BidiInfo<'text>, para: &'a ParagraphInfo) -> Paragraph<'a, 'text>590     pub fn new(info: &'a BidiInfo<'text>, para: &'a ParagraphInfo) -> Paragraph<'a, 'text> {
591         Paragraph { info, para }
592     }
593 
594     /// Returns if the paragraph is Left direction, right direction or mixed.
595     #[inline]
direction(&self) -> Direction596     pub fn direction(&self) -> Direction {
597         para_direction(&self.info.levels[self.para.range.clone()])
598     }
599 
600     /// Returns the `Level` of a certain character in the paragraph.
601     #[inline]
level_at(&self, pos: usize) -> Level602     pub fn level_at(&self, pos: usize) -> Level {
603         let actual_position = self.para.range.start + pos;
604         self.info.levels[actual_position]
605     }
606 }
607 
// Implementation of `TextSource` for UTF-16 text in a `[u16]` slice (see the `impl` below).
// Note that there could be unpaired surrogates present!

// Convenience functions to check whether a UTF-16 code unit is a surrogate.
/// Returns `true` when `code` is a UTF-16 high (leading) surrogate, i.e. lies in
/// `0xD800..=0xDBFF`.
#[inline]
fn is_high_surrogate(code: u16) -> bool {
    matches!(code, 0xD800..=0xDBFF)
}
/// Returns `true` when `code` is a UTF-16 low (trailing) surrogate, i.e. lies in
/// `0xDC00..=0xDFFF`.
#[inline]
fn is_low_surrogate(code: u16) -> bool {
    matches!(code, 0xDC00..=0xDFFF)
}
620 
621 impl<'text> TextSource<'text> for [u16] {
622     type CharIter = Utf16CharIter<'text>;
623     type CharIndexIter = Utf16CharIndexIter<'text>;
624     type IndexLenIter = Utf16IndexLenIter<'text>;
625 
626     #[inline]
len(&self) -> usize627     fn len(&self) -> usize {
628         (self as &[u16]).len()
629     }
char_at(&self, index: usize) -> Option<(char, usize)>630     fn char_at(&self, index: usize) -> Option<(char, usize)> {
631         if index >= self.len() {
632             return None;
633         }
634         // Get the indicated code unit and try simply converting it to a char;
635         // this will fail if it is half of a surrogate pair.
636         let c = self[index];
637         if let Some(ch) = char::from_u32(c.into()) {
638             return Some((ch, 1));
639         }
640         // If it's a low surrogate, and was immediately preceded by a high surrogate,
641         // then we're in the middle of a (valid) character, and should return None.
642         if is_low_surrogate(c) && index > 0 && is_high_surrogate(self[index - 1]) {
643             return None;
644         }
645         // Otherwise, try to decode, returning REPLACEMENT_CHARACTER for errors.
646         if let Some(ch) = char::decode_utf16(self[index..].iter().cloned()).next() {
647             if let Ok(ch) = ch {
648                 // This must be a surrogate pair, otherwise char::from_u32() above should
649                 // have succeeded!
650                 debug_assert!(ch.len_utf16() == 2, "BMP should have already been handled");
651                 return Some((ch, ch.len_utf16()));
652             }
653         } else {
654             debug_assert!(
655                 false,
656                 "Why did decode_utf16 return None when we're not at the end?"
657             );
658             return None;
659         }
660         // Failed to decode UTF-16: we must have encountered an unpaired surrogate.
661         // Return REPLACEMENT_CHARACTER (not None), to continue processing the following text
662         // and keep indexing correct.
663         Some((char::REPLACEMENT_CHARACTER, 1))
664     }
665     #[inline]
subrange(&self, range: Range<usize>) -> &Self666     fn subrange(&self, range: Range<usize>) -> &Self {
667         &(self as &[u16])[range]
668     }
669     #[inline]
chars(&'text self) -> Self::CharIter670     fn chars(&'text self) -> Self::CharIter {
671         Utf16CharIter::new(&self)
672     }
673     #[inline]
char_indices(&'text self) -> Self::CharIndexIter674     fn char_indices(&'text self) -> Self::CharIndexIter {
675         Utf16CharIndexIter::new(&self)
676     }
677     #[inline]
indices_lengths(&'text self) -> Self::IndexLenIter678     fn indices_lengths(&'text self) -> Self::IndexLenIter {
679         Utf16IndexLenIter::new(&self)
680     }
681     #[inline]
char_len(ch: char) -> usize682     fn char_len(ch: char) -> usize {
683         ch.len_utf16()
684     }
685 }
686 
687 /// Iterator over UTF-16 text in a [u16] slice, returning (index, char_len) tuple.
688 #[derive(Debug)]
689 pub struct Utf16IndexLenIter<'text> {
690     text: &'text [u16],
691     cur_pos: usize,
692 }
693 
694 impl<'text> Utf16IndexLenIter<'text> {
695     #[inline]
new(text: &'text [u16]) -> Self696     pub fn new(text: &'text [u16]) -> Self {
697         Utf16IndexLenIter { text, cur_pos: 0 }
698     }
699 }
700 
701 impl Iterator for Utf16IndexLenIter<'_> {
702     type Item = (usize, usize);
703 
704     #[inline]
next(&mut self) -> Option<Self::Item>705     fn next(&mut self) -> Option<Self::Item> {
706         if let Some((_, char_len)) = self.text.char_at(self.cur_pos) {
707             let result = (self.cur_pos, char_len);
708             self.cur_pos += char_len;
709             return Some(result);
710         }
711         None
712     }
713 }
714 
715 /// Iterator over UTF-16 text in a [u16] slice, returning (index, char) tuple.
716 #[derive(Debug)]
717 pub struct Utf16CharIndexIter<'text> {
718     text: &'text [u16],
719     cur_pos: usize,
720 }
721 
722 impl<'text> Utf16CharIndexIter<'text> {
new(text: &'text [u16]) -> Self723     pub fn new(text: &'text [u16]) -> Self {
724         Utf16CharIndexIter { text, cur_pos: 0 }
725     }
726 }
727 
728 impl Iterator for Utf16CharIndexIter<'_> {
729     type Item = (usize, char);
730 
next(&mut self) -> Option<Self::Item>731     fn next(&mut self) -> Option<Self::Item> {
732         if let Some((ch, char_len)) = self.text.char_at(self.cur_pos) {
733             let result = (self.cur_pos, ch);
734             self.cur_pos += char_len;
735             return Some(result);
736         }
737         None
738     }
739 }
740 
741 /// Iterator over UTF-16 text in a [u16] slice, returning Unicode chars.
742 /// (Unlike the other iterators above, this also supports reverse iteration.)
743 #[derive(Debug)]
744 pub struct Utf16CharIter<'text> {
745     text: &'text [u16],
746     cur_pos: usize,
747     end_pos: usize,
748 }
749 
750 impl<'text> Utf16CharIter<'text> {
new(text: &'text [u16]) -> Self751     pub fn new(text: &'text [u16]) -> Self {
752         Utf16CharIter {
753             text,
754             cur_pos: 0,
755             end_pos: text.len(),
756         }
757     }
758 }
759 
760 impl Iterator for Utf16CharIter<'_> {
761     type Item = char;
762 
next(&mut self) -> Option<Self::Item>763     fn next(&mut self) -> Option<Self::Item> {
764         if let Some((ch, char_len)) = self.text.char_at(self.cur_pos) {
765             self.cur_pos += char_len;
766             return Some(ch);
767         }
768         None
769     }
770 }
771 
772 impl DoubleEndedIterator for Utf16CharIter<'_> {
next_back(&mut self) -> Option<Self::Item>773     fn next_back(&mut self) -> Option<Self::Item> {
774         if self.end_pos <= self.cur_pos {
775             return None;
776         }
777         self.end_pos -= 1;
778         if let Some(ch) = char::from_u32(self.text[self.end_pos] as u32) {
779             return Some(ch);
780         }
781         if self.end_pos > self.cur_pos {
782             if let Some((ch, char_len)) = self.text.char_at(self.end_pos - 1) {
783                 if char_len == 2 {
784                     self.end_pos -= 1;
785                     return Some(ch);
786                 }
787             }
788         }
789         Some(char::REPLACEMENT_CHARACTER)
790     }
791 }
792