1 // Copyright 2015 The Servo Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! 3.3.3 Preparations for Implicit Processing
11 //!
12 //! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing>
13 
14 use alloc::vec::Vec;
15 use core::cmp::max;
16 use core::ops::Range;
17 
18 use super::level::Level;
19 use super::BidiClass::{self, *};
20 
21 /// A maximal substring of characters with the same embedding level.
22 ///
23 /// Represented as a range of byte indices.
24 pub type LevelRun = Range<usize>;
25 
26 /// Output of `isolating_run_sequences` (steps X9-X10)
27 #[derive(Debug, PartialEq)]
28 pub struct IsolatingRunSequence {
29     pub runs: Vec<LevelRun>,
30     pub sos: BidiClass, // Start-of-sequence type.
31     pub eos: BidiClass, // End-of-sequence type.
32 }
33 
34 /// Compute the set of isolating run sequences.
35 ///
36 /// An isolating run sequence is a maximal sequence of level runs such that for all level runs
37 /// except the last one in the sequence, the last character of the run is an isolate initiator
38 /// whose matching PDI is the first character of the next level run in the sequence.
39 ///
40 /// Note: This function does *not* return the sequences in order by their first characters.
41 #[cfg_attr(feature = "flame_it", flamer::flame)]
isolating_run_sequences( para_level: Level, original_classes: &[BidiClass], levels: &[Level], ) -> Vec<IsolatingRunSequence>42 pub fn isolating_run_sequences(
43     para_level: Level,
44     original_classes: &[BidiClass],
45     levels: &[Level],
46 ) -> Vec<IsolatingRunSequence> {
47     let runs = level_runs(levels, original_classes);
48 
49     // Compute the set of isolating run sequences.
50     // <http://www.unicode.org/reports/tr9/#BD13>
51     let mut sequences = Vec::with_capacity(runs.len());
52 
53     // When we encounter an isolate initiator, we push the current sequence onto the
54     // stack so we can resume it after the matching PDI.
55     let mut stack = vec![Vec::new()];
56 
57     for run in runs {
58         assert!(run.len() > 0);
59         assert!(!stack.is_empty());
60 
61         let start_class = original_classes[run.start];
62         // > In rule X10, [..] skip over any BNs when [..].
63         // > Do the same when determining if the last character of the sequence is an isolate initiator.
64         //
65         // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
66         let end_class = original_classes[run.start..run.end]
67             .iter()
68             .copied()
69             .rev()
70             .filter(not_removed_by_x9)
71             .next()
72             .unwrap_or(start_class);
73 
74         let mut sequence = if start_class == PDI && stack.len() > 1 {
75             // Continue a previous sequence interrupted by an isolate.
76             stack.pop().unwrap()
77         } else {
78             // Start a new sequence.
79             Vec::new()
80         };
81 
82         sequence.push(run);
83 
84         if let RLI | LRI | FSI = end_class {
85             // Resume this sequence after the isolate.
86             stack.push(sequence);
87         } else {
88             // This sequence is finished.
89             sequences.push(sequence);
90         }
91     }
92     // Pop any remaning sequences off the stack.
93     sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
94 
95     // Determine the `sos` and `eos` class for each sequence.
96     // <http://www.unicode.org/reports/tr9/#X10>
97     sequences
98         .into_iter()
99         .map(|sequence: Vec<LevelRun>| {
100             assert!(!sequence.is_empty());
101 
102             let mut result = IsolatingRunSequence {
103                 runs: sequence,
104                 sos: L,
105                 eos: L,
106             };
107 
108             let start_of_seq = result.runs[0].start;
109             let runs_len = result.runs.len();
110             let end_of_seq = result.runs[runs_len - 1].end;
111 
112             // > (not counting characters removed by X9)
113             let seq_level = result
114                 .iter_forwards_from(start_of_seq, 0)
115                 .filter(|i| not_removed_by_x9(&original_classes[*i]))
116                 .map(|i| levels[i])
117                 .next()
118                 .unwrap_or(levels[start_of_seq]);
119 
120             // XXXManishearth the spec talks of a start and end level,
121             // but for a given IRS the two should be equivalent, yes?
122             let end_level = result
123                 .iter_backwards_from(end_of_seq, runs_len - 1)
124                 .filter(|i| not_removed_by_x9(&original_classes[*i]))
125                 .map(|i| levels[i])
126                 .next()
127                 .unwrap_or(levels[end_of_seq - 1]);
128 
129             #[cfg(test)]
130             for run in result.runs.clone() {
131                 for idx in run {
132                     if not_removed_by_x9(&original_classes[idx]) {
133                         assert_eq!(seq_level, levels[idx]);
134                     }
135                 }
136             }
137 
138             // Get the level of the last non-removed char before the runs.
139             let pred_level = match original_classes[..start_of_seq]
140                 .iter()
141                 .rposition(not_removed_by_x9)
142             {
143                 Some(idx) => levels[idx],
144                 None => para_level,
145             };
146 
147             // Get the last non-removed character to check if it is an isolate initiator.
148             // The spec calls for an unmatched one, but matched isolate initiators
149             // will never be at the end of a level run (otherwise there would be more to the run).
150             // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check.
151             let last_non_removed = original_classes[..end_of_seq]
152                 .iter()
153                 .copied()
154                 .rev()
155                 .find(not_removed_by_x9)
156                 .unwrap_or(BN);
157 
158             // Get the level of the next non-removed char after the runs.
159             let succ_level = if let RLI | LRI | FSI = last_non_removed {
160                 para_level
161             } else {
162                 match original_classes[end_of_seq..]
163                     .iter()
164                     .position(not_removed_by_x9)
165                 {
166                     Some(idx) => levels[end_of_seq + idx],
167                     None => para_level,
168                 }
169             };
170 
171             result.sos = max(seq_level, pred_level).bidi_class();
172             result.eos = max(end_level, succ_level).bidi_class();
173             result
174         })
175         .collect()
176 }
177 
178 impl IsolatingRunSequence {
179     /// Given a text-relative position `pos` and an index of the level run it is in,
180     /// produce an iterator of all characters after and pos (`pos..`) that are in this
181     /// run sequence
iter_forwards_from( &self, pos: usize, level_run_index: usize, ) -> impl Iterator<Item = usize> + '_182     pub(crate) fn iter_forwards_from(
183         &self,
184         pos: usize,
185         level_run_index: usize,
186     ) -> impl Iterator<Item = usize> + '_ {
187         let runs = &self.runs[level_run_index..];
188 
189         // Check that it is in range
190         // (we can't use contains() since we want an inclusive range)
191         #[cfg(feature = "std")]
192         debug_assert!(runs[0].start <= pos && pos <= runs[0].end);
193 
194         (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone))
195     }
196 
197     /// Given a text-relative position `pos` and an index of the level run it is in,
198     /// produce an iterator of all characters before and excludingpos (`..pos`) that are in this
199     /// run sequence
iter_backwards_from( &self, pos: usize, level_run_index: usize, ) -> impl Iterator<Item = usize> + '_200     pub(crate) fn iter_backwards_from(
201         &self,
202         pos: usize,
203         level_run_index: usize,
204     ) -> impl Iterator<Item = usize> + '_ {
205         let prev_runs = &self.runs[..level_run_index];
206         let current = &self.runs[level_run_index];
207 
208         // Check that it is in range
209         // (we can't use contains() since we want an inclusive range)
210         #[cfg(feature = "std")]
211         debug_assert!(current.start <= pos && pos <= current.end);
212 
213         (current.start..pos)
214             .rev()
215             .chain(prev_runs.iter().rev().flat_map(Clone::clone))
216     }
217 }
218 
219 /// Finds the level runs in a paragraph.
220 ///
221 /// <http://www.unicode.org/reports/tr9/#BD7>
level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun>222 fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
223     assert_eq!(levels.len(), original_classes.len());
224 
225     let mut runs = Vec::new();
226     if levels.is_empty() {
227         return runs;
228     }
229 
230     let mut current_run_level = levels[0];
231     let mut current_run_start = 0;
232     for i in 1..levels.len() {
233         if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
234             // End the last run and start a new one.
235             runs.push(current_run_start..i);
236             current_run_level = levels[i];
237             current_run_start = i;
238         }
239     }
240     runs.push(current_run_start..levels.len());
241 
242     runs
243 }
244 
245 /// Should this character be ignored in steps after X9?
246 ///
247 /// <http://www.unicode.org/reports/tr9/#X9>
removed_by_x9(class: BidiClass) -> bool248 pub fn removed_by_x9(class: BidiClass) -> bool {
249     match class {
250         RLE | LRE | RLO | LRO | PDF | BN => true,
251         _ => false,
252     }
253 }
254 
255 // For use as a predicate for `position` / `rposition`
not_removed_by_x9(class: &BidiClass) -> bool256 pub fn not_removed_by_x9(class: &BidiClass) -> bool {
257     !removed_by_x9(*class)
258 }
259 
260 #[cfg(test)]
261 mod tests {
262     use super::*;
263 
264     #[test]
test_level_runs()265     fn test_level_runs() {
266         assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]);
267         assert_eq!(
268             level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]),
269             &[0..3, 3..5, 5..6, 6..8]
270         );
271     }
272 
273     // From <http://www.unicode.org/reports/tr9/#BD13>
274     #[rustfmt::skip]
275     #[test]
test_isolating_run_sequences()276     fn test_isolating_run_sequences() {
277 
278         // == Example 1 ==
279         // text1·RLE·text2·PDF·RLE·text3·PDF·text4
280         // index        0    1  2    3    4  5    6  7
281         let classes = &[L, RLE, L, PDF, RLE, L, PDF, L];
282         let levels =  &[0,   1, 1,   1,   1, 1,   1, 0];
283         let para_level = Level::ltr();
284         let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
285         sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
286         assert_eq!(
287             sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
288             vec![vec![0..2], vec![2..7], vec![7..8]]
289         );
290 
291         // == Example 2 ==
292         // text1·RLI·text2·PDI·RLI·text3·PDI·text4
293         // index        0    1  2    3    4  5    6  7
294         let classes = &[L, RLI, L, PDI, RLI, L, PDI, L];
295         let levels =  &[0,   0, 1,   0,   0, 1,   0, 0];
296         let para_level = Level::ltr();
297         let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
298         sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
299         assert_eq!(
300             sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
301             vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]]
302         );
303 
304         // == Example 3 ==
305         // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7
306         // index        0    1  2    3  4    5  6    7  8    9  10  11  12
307         let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI,  L];
308         let levels =  &[0,   0, 1,   1, 2,   3, 3,   3, 2,   1, 1,   0,  0];
309         let para_level = Level::ltr();
310         let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
311         sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
312         assert_eq!(
313             sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
314             vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]]
315         );
316     }
317 
318     // From <http://www.unicode.org/reports/tr9/#X10>
319     #[rustfmt::skip]
320     #[test]
test_isolating_run_sequences_sos_and_eos()321     fn test_isolating_run_sequences_sos_and_eos() {
322 
323         // == Example 1 ==
324         // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6
325         // index        0    1  2    3  4    5  6    7    8  9   10  11
326         let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF,  L];
327         let levels =  &[0,   1, 1,   2, 2,   2, 1,   1,   1, 1,   1,  0];
328         let para_level = Level::ltr();
329         let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
330         sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
331 
332         // text1
333         assert_eq!(
334             &sequences[0],
335             &IsolatingRunSequence {
336                 runs: vec![0..2],
337                 sos: L,
338                 eos: R,
339             }
340         );
341 
342         // text2
343         assert_eq!(
344             &sequences[1],
345             &IsolatingRunSequence {
346                 runs: vec![2..4],
347                 sos: R,
348                 eos: L,
349             }
350         );
351 
352         // text3
353         assert_eq!(
354             &sequences[2],
355             &IsolatingRunSequence {
356                 runs: vec![4..6],
357                 sos: L,
358                 eos: L,
359             }
360         );
361 
362         // text4 text5
363         assert_eq!(
364             &sequences[3],
365             &IsolatingRunSequence {
366                 runs: vec![6..11],
367                 sos: L,
368                 eos: R,
369             }
370         );
371 
372         // text6
373         assert_eq!(
374             &sequences[4],
375             &IsolatingRunSequence {
376                 runs: vec![11..12],
377                 sos: R,
378                 eos: L,
379             }
380         );
381 
382         // == Example 2 ==
383         // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6
384         // index        0    1  2    3  4    5  6    7    8  9   10  11
385         let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI,  L];
386         let levels =  &[0,   0, 1,   1, 2,   1, 1,   0,   0, 1,   0,  0];
387         let para_level = Level::ltr();
388         let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
389         sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
390 
391         // text1·RLI·PDI·RLI·PDI·text6
392         assert_eq!(
393             &sequences[0],
394             &IsolatingRunSequence {
395                 runs: vec![0..2, 7..9, 10..12],
396                 sos: L,
397                 eos: L,
398             }
399         );
400 
401         // text2·LRI·PDI·text4
402         assert_eq!(
403             &sequences[1],
404             &IsolatingRunSequence {
405                 runs: vec![2..4, 5..7],
406                 sos: R,
407                 eos: R,
408             }
409         );
410 
411         // text3
412         assert_eq!(
413             &sequences[2],
414             &IsolatingRunSequence {
415                 runs: vec![4..5],
416                 sos: L,
417                 eos: L,
418             }
419         );
420 
421         // text5
422         assert_eq!(
423             &sequences[3],
424             &IsolatingRunSequence {
425                 runs: vec![9..10],
426                 sos: R,
427                 eos: R,
428             }
429         );
430     }
431 
432     #[test]
test_removed_by_x9()433     fn test_removed_by_x9() {
434         let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN];
435         let not_classes = &[L, RLI, AL, LRI, PDI];
436         for x in rem_classes {
437             assert_eq!(removed_by_x9(*x), true);
438         }
439         for x in not_classes {
440             assert_eq!(removed_by_x9(*x), false);
441         }
442     }
443 
444     #[test]
test_not_removed_by_x9()445     fn test_not_removed_by_x9() {
446         let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI];
447         for x in non_x9_classes {
448             assert_eq!(not_removed_by_x9(&x), true);
449         }
450     }
451 }
452