1 //! Building blocks for advanced wrapping functionality.
2 //!
3 //! The functions and structs in this module can be used to implement
4 //! advanced wrapping functionality when the [`wrap`](super::wrap) and
//! [`fill`](super::fill) functions don't do what you want.
6 //!
7 //! In general, you want to follow these steps when wrapping
8 //! something:
9 //!
10 //! 1. Split your input into [`Fragment`]s. These are abstract blocks
11 //!    of text or content which can be wrapped into lines. See
12 //!    [`WordSeparator`](crate::word_separators::WordSeparator) for
13 //!    how to do this for text.
14 //!
15 //! 2. Potentially split your fragments into smaller pieces. This
16 //!    allows you to implement things like hyphenation. If you use the
//!    `Word` type, you can use the [`WordSplitter`](crate::WordSplitter)
//!    enum for this.
19 //!
20 //! 3. Potentially break apart fragments that are still too large to
21 //!    fit on a single line. This is implemented in [`break_words`].
22 //!
23 //! 4. Finally take your fragments and put them into lines. There are
24 //!    two algorithms for this in the
25 //!    [`wrap_algorithms`](crate::wrap_algorithms) module:
26 //!    [`wrap_optimal_fit`](crate::wrap_algorithms::wrap_optimal_fit)
27 //!    and [`wrap_first_fit`](crate::wrap_algorithms::wrap_first_fit).
28 //!    The former produces better line breaks, the latter is faster.
29 //!
30 //! 5. Iterate through the slices returned by the wrapping functions
31 //!    and construct your lines of output.
32 //!
33 //! Please [open an issue](https://github.com/mgeisler/textwrap/) if
34 //! the functionality here is not sufficient or if you have ideas for
35 //! improving it. We would love to hear from you!
36 
/// The CSI or “Control Sequence Introducer” introduces an ANSI escape
/// sequence. This is typically used for colored text and will be
/// ignored when computing the text width.
const CSI: (char, char) = ('\x1b', '[');
/// The final bytes of an ANSI escape sequence must be in this range.
const ANSI_FINAL_BYTE: std::ops::RangeInclusive<char> = '\x40'..='\x7e';

/// Skip ANSI escape sequences.
///
/// The `ch` is the current `char`, the `chars` provide the following
/// characters. If `ch` is not the escape character, `chars` is left
/// untouched and `false` is returned.
///
/// If `ch` is the escape character, the sequence it introduces is
/// consumed from `chars` and `true` is returned so that the caller
/// ignores `ch` as well:
///
/// * For a CSI sequence (escape followed by `'['`), everything up to
///   and including the first "final byte" in the range 0x40–0x7E is
///   consumed. An unterminated sequence consumes the rest of `chars`.
///
/// * For any other escape, the single character following the escape
///   is consumed.
///
/// Since `chars` is a plain iterator, the character following the
/// escape cannot be put back once it has been inspected. The return
/// value is therefore `true` whenever something may have been
/// consumed: otherwise the caller would treat the escape character as
/// text while the consumed characters silently vanish.
#[inline]
pub(crate) fn skip_ansi_escape_sequence<I: Iterator<Item = char>>(ch: char, chars: &mut I) -> bool {
    if ch != CSI.0 {
        return false; // Nothing to skip, `chars` has not been touched.
    }

    if chars.next() == Some(CSI.1) {
        // We have found the start of an ANSI escape code, typically
        // used for colored terminal text. We skip until we find a
        // "final byte" in the range 0x40–0x7E.
        for ch in chars {
            if ANSI_FINAL_BYTE.contains(&ch) {
                break;
            }
        }
    }

    // The escape character (and whatever followed it) was skipped.
    true
}
61 
/// Width of `ch` in columns as reported by the `unicode-width` crate.
/// A `char` for which the crate reports no width counts as zero
/// columns.
#[cfg(feature = "unicode-width")]
#[inline]
fn ch_width(ch: char) -> usize {
    match unicode_width::UnicodeWidthChar::width(ch) {
        Some(columns) => columns,
        None => 0,
    }
}

/// First character which [`ch_width`] will classify as double-width.
/// Please see [`display_width`].
#[cfg(not(feature = "unicode-width"))]
const DOUBLE_WIDTH_CUTOFF: char = '\u{1100}';

/// Approximate width of `ch` in columns: everything from
/// [`DOUBLE_WIDTH_CUTOFF`] and up takes two columns, everything below
/// takes one.
#[cfg(not(feature = "unicode-width"))]
#[inline]
fn ch_width(ch: char) -> usize {
    match ch {
        wide if wide >= DOUBLE_WIDTH_CUTOFF => 2,
        _ => 1,
    }
}
82 
83 /// Compute the display width of `text` while skipping over ANSI
84 /// escape sequences.
85 ///
86 /// # Examples
87 ///
88 /// ```
89 /// use textwrap::core::display_width;
90 ///
91 /// assert_eq!(display_width("Café Plain"), 10);
92 /// assert_eq!(display_width("\u{1b}[31mCafé Rouge\u{1b}[0m"), 10);
93 /// ```
94 ///
95 /// **Note:** When the `unicode-width` Cargo feature is disabled, the
96 /// width of a `char` is determined by a crude approximation which
97 /// simply counts chars below U+1100 as 1 column wide, and all other
/// characters as 2 columns wide. With the feature enabled, this
/// function will correctly deal with [combining characters] in their
/// decomposed form (see [Unicode equivalence]).
101 ///
102 /// An example of a decomposed character is “é”, which can be
103 /// decomposed into: “e” followed by a combining acute accent: “◌́”.
104 /// Without the `unicode-width` Cargo feature, every `char` below
105 /// U+1100 has a width of 1. This includes the combining accent:
106 ///
107 /// ```
108 /// use textwrap::core::display_width;
109 ///
110 /// assert_eq!(display_width("Cafe Plain"), 10);
111 /// #[cfg(feature = "unicode-width")]
112 /// assert_eq!(display_width("Cafe\u{301} Plain"), 10);
113 /// #[cfg(not(feature = "unicode-width"))]
114 /// assert_eq!(display_width("Cafe\u{301} Plain"), 11);
115 /// ```
116 ///
117 /// ## Emojis and CJK Characters
118 ///
/// Characters such as emojis and [CJK characters] used in the
/// Chinese, Japanese, and Korean languages are seen as double-width,
/// even if the `unicode-width` feature is disabled:
122 ///
123 /// ```
124 /// use textwrap::core::display_width;
125 ///
/// assert_eq!(display_width("😂😭🥺🤣✨😍🙏🥰😊🔥"), 20);
127 /// assert_eq!(display_width("你好"), 4);  // “Nǐ hǎo” or “Hello” in Chinese
128 /// ```
129 ///
130 /// # Limitations
131 ///
132 /// The displayed width of a string cannot always be computed from the
133 /// string alone. This is because the width depends on the rendering
134 /// engine used. This is particularly visible with [emoji modifier
135 /// sequences] where a base emoji is modified with, e.g., skin tone or
136 /// hair color modifiers. It is up to the rendering engine to detect
137 /// this and to produce a suitable emoji.
138 ///
139 /// A simple example is “❤️”, which consists of “❤” (U+2764: Black
140 /// Heart Symbol) followed by U+FE0F (Variation Selector-16). By
141 /// itself, “❤” is a black heart, but if you follow it with the
142 /// variant selector, you may get a wider red heart.
143 ///
/// A more complex example would be “👨‍🦰” which should depict a man
145 /// with red hair. Here the computed width is too large — and the
146 /// width differs depending on the use of the `unicode-width` feature:
147 ///
148 /// ```
149 /// use textwrap::core::display_width;
150 ///
/// assert_eq!("👨‍🦰".chars().collect::<Vec<char>>(), ['\u{1f468}', '\u{200d}', '\u{1f9b0}']);
/// #[cfg(feature = "unicode-width")]
/// assert_eq!(display_width("👨‍🦰"), 4);
/// #[cfg(not(feature = "unicode-width"))]
/// assert_eq!(display_width("👨‍🦰"), 6);
156 /// ```
157 ///
/// This happens because the grapheme consists of three code points:
/// “👨” (U+1F468: Man), Zero Width Joiner (U+200D), and “🦰”
/// (U+1F9B0: Red Hair). You can see them above in the test. With
/// `unicode-width` enabled, the ZWJ is correctly seen as having zero
/// width; without it, the ZWJ is counted as a double-width character.
163 ///
164 /// ## Terminal Support
165 ///
166 /// Modern browsers typically do a great job at combining characters
167 /// as shown above, but terminals often struggle more. As an example,
/// Gnome Terminal version 3.38.1, shows “❤️” as a big red heart, but
/// shows "👨‍🦰" as “👨🦰”.
170 ///
171 /// [combining characters]: https://en.wikipedia.org/wiki/Combining_character
172 /// [Unicode equivalence]: https://en.wikipedia.org/wiki/Unicode_equivalence
173 /// [CJK characters]: https://en.wikipedia.org/wiki/CJK_characters
174 /// [emoji modifier sequences]: https://unicode.org/emoji/charts/full-emoji-modifiers.html
display_width(text: &str) -> usize175 pub fn display_width(text: &str) -> usize {
176     let mut chars = text.chars();
177     let mut width = 0;
178     while let Some(ch) = chars.next() {
179         if skip_ansi_escape_sequence(ch, &mut chars) {
180             continue;
181         }
182         width += ch_width(ch);
183     }
184     width
185 }
186 
/// A (text) fragment denotes the unit which we wrap into lines.
///
/// Fragments represent an abstract _word_ plus the _whitespace_
/// following the word. In case the word falls at the end of the line,
/// the whitespace is dropped and a so-called _penalty_ is inserted
/// instead (typically `"-"` if the word was hyphenated).
///
/// For wrapping purposes, the precise content of the word, the
/// whitespace, and the penalty is irrelevant. All we need to know is
/// the displayed width of each part, which this trait provides.
///
/// All widths are reported as `f64` values. Implementors must also
/// implement [`std::fmt::Debug`].
pub trait Fragment: std::fmt::Debug {
    /// Displayed width of word represented by this fragment.
    ///
    /// This excludes the trailing whitespace and the penalty, which
    /// are reported separately by
    /// [`whitespace_width`](Fragment::whitespace_width) and
    /// [`penalty_width`](Fragment::penalty_width).
    fn width(&self) -> f64;

    /// Displayed width of the whitespace that must follow the word
    /// when the word is not at the end of a line.
    fn whitespace_width(&self) -> f64;

    /// Displayed width of the penalty that must be inserted if the
    /// word falls at the end of a line.
    fn penalty_width(&self) -> f64;
}
209 
/// A piece of wrappable text, including any trailing whitespace.
///
/// A `Word` is an example of a [`Fragment`], so it has a width,
/// trailing whitespace, and potentially a penalty item.
///
/// All three string parts borrow from the same lifetime `'a`, so a
/// `Word` is cheap to copy.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Word<'a> {
    /// Word content.
    pub word: &'a str,
    /// Whitespace to insert if the word does not fall at the end of a line.
    pub whitespace: &'a str,
    /// Penalty string to insert if the word falls at the end of a line.
    pub penalty: &'a str,
    // Cached display width of `word` in columns. The whitespace and
    // the penalty are not included in this number.
    pub(crate) width: usize,
}
225 
226 impl std::ops::Deref for Word<'_> {
227     type Target = str;
228 
deref(&self) -> &Self::Target229     fn deref(&self) -> &Self::Target {
230         self.word
231     }
232 }
233 
234 impl<'a> Word<'a> {
235     /// Construct a `Word` from a string.
236     ///
237     /// A trailing stretch of `' '` is automatically taken to be the
238     /// whitespace part of the word.
from(word: &str) -> Word<'_>239     pub fn from(word: &str) -> Word<'_> {
240         let trimmed = word.trim_end_matches(' ');
241         Word {
242             word: trimmed,
243             width: display_width(trimmed),
244             whitespace: &word[trimmed.len()..],
245             penalty: "",
246         }
247     }
248 
249     /// Break this word into smaller words with a width of at most
250     /// `line_width`. The whitespace and penalty from this `Word` is
251     /// added to the last piece.
252     ///
253     /// # Examples
254     ///
255     /// ```
256     /// use textwrap::core::Word;
257     /// assert_eq!(
258     ///     Word::from("Hello!  ").break_apart(3).collect::<Vec<_>>(),
259     ///     vec![Word::from("Hel"), Word::from("lo!  ")]
260     /// );
261     /// ```
break_apart<'b>(&'b self, line_width: usize) -> impl Iterator<Item = Word<'a>> + 'b262     pub fn break_apart<'b>(&'b self, line_width: usize) -> impl Iterator<Item = Word<'a>> + 'b {
263         let mut char_indices = self.word.char_indices();
264         let mut offset = 0;
265         let mut width = 0;
266 
267         std::iter::from_fn(move || {
268             while let Some((idx, ch)) = char_indices.next() {
269                 if skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
270                     continue;
271                 }
272 
273                 if width > 0 && width + ch_width(ch) > line_width {
274                     let word = Word {
275                         word: &self.word[offset..idx],
276                         width: width,
277                         whitespace: "",
278                         penalty: "",
279                     };
280                     offset = idx;
281                     width = ch_width(ch);
282                     return Some(word);
283                 }
284 
285                 width += ch_width(ch);
286             }
287 
288             if offset < self.word.len() {
289                 let word = Word {
290                     word: &self.word[offset..],
291                     width: width,
292                     whitespace: self.whitespace,
293                     penalty: self.penalty,
294                 };
295                 offset = self.word.len();
296                 return Some(word);
297             }
298 
299             None
300         })
301     }
302 }
303 
304 impl Fragment for Word<'_> {
305     #[inline]
width(&self) -> f64306     fn width(&self) -> f64 {
307         self.width as f64
308     }
309 
310     // We assume the whitespace consist of ' ' only. This allows us to
311     // compute the display width in constant time.
312     #[inline]
whitespace_width(&self) -> f64313     fn whitespace_width(&self) -> f64 {
314         self.whitespace.len() as f64
315     }
316 
317     // We assume the penalty is `""` or `"-"`. This allows us to
318     // compute the display width in constant time.
319     #[inline]
penalty_width(&self) -> f64320     fn penalty_width(&self) -> f64 {
321         self.penalty.len() as f64
322     }
323 }
324 
325 /// Forcibly break words wider than `line_width` into smaller words.
326 ///
327 /// This simply calls [`Word::break_apart`] on words that are too
328 /// wide. This means that no extra `'-'` is inserted, the word is
329 /// simply broken into smaller pieces.
break_words<'a, I>(words: I, line_width: usize) -> Vec<Word<'a>> where I: IntoIterator<Item = Word<'a>>,330 pub fn break_words<'a, I>(words: I, line_width: usize) -> Vec<Word<'a>>
331 where
332     I: IntoIterator<Item = Word<'a>>,
333 {
334     let mut shortened_words = Vec::new();
335     for word in words {
336         if word.width() > line_width as f64 {
337             shortened_words.extend(word.break_apart(line_width));
338         } else {
339             shortened_words.push(word);
340         }
341     }
342     shortened_words
343 }
344 
#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(feature = "unicode-width")]
    use unicode_width::UnicodeWidthChar;

    // After skipping the leading color code, the iterator must be
    // positioned at the first visible character.
    #[test]
    fn skip_ansi_escape_sequence_works() {
        let blue_text = "\u{1b}[34mHello\u{1b}[0m";
        let mut chars = blue_text.chars();
        let ch = chars.next().unwrap();
        assert!(skip_ansi_escape_sequence(ch, &mut chars));
        assert_eq!(chars.next(), Some('H'));
    }

    #[test]
    fn emojis_have_correct_width() {
        use unic_emoji_char::is_emoji;

        // Emojis in the Basic Latin (ASCII) and Latin-1 Supplement
        // blocks all have a width of 1 column. This includes
        // characters such as '#' and '©'.
        for ch in '\u{1}'..'\u{FF}' {
            if is_emoji(ch) {
                let desc = format!("{:?} U+{:04X}", ch, ch as u32);

                #[cfg(feature = "unicode-width")]
                assert_eq!(ch.width().unwrap(), 1, "char: {}", desc);

                #[cfg(not(feature = "unicode-width"))]
                assert_eq!(ch_width(ch), 1, "char: {}", desc);
            }
        }

        // Emojis in the remaining blocks of the Basic Multilingual
        // Plane (BMP), in the Supplementary Multilingual Plane (SMP),
        // and in the Supplementary Ideographic Plane (SIP), are all 1
        // or 2 columns wide when unicode-width is used, and always 2
        // columns wide otherwise. This includes all of our favorite
        // emojis such as 😊.
        for ch in '\u{FF}'..'\u{2FFFF}' {
            if is_emoji(ch) {
                let desc = format!("{:?} U+{:04X}", ch, ch as u32);

                #[cfg(feature = "unicode-width")]
                assert!(ch.width().unwrap() <= 2, "char: {}", desc);

                #[cfg(not(feature = "unicode-width"))]
                assert_eq!(ch_width(ch), 2, "char: {}", desc);
            }
        }

        // The remaining planes contain almost no assigned code points
        // and thus also no emojis.
    }

    #[test]
    fn display_width_works() {
        assert_eq!("Café Plain".len(), 11); // “é” is two bytes
        assert_eq!(display_width("Café Plain"), 10);
        assert_eq!(display_width("\u{1b}[31mCafé Rouge\u{1b}[0m"), 10);
    }

    #[test]
    fn display_width_narrow_emojis() {
        #[cfg(feature = "unicode-width")]
        assert_eq!(display_width("⁉"), 1);

        // The ⁉ character is above DOUBLE_WIDTH_CUTOFF.
        #[cfg(not(feature = "unicode-width"))]
        assert_eq!(display_width("⁉"), 2);
    }

    #[test]
    fn display_width_narrow_emojis_variant_selector() {
        #[cfg(feature = "unicode-width")]
        assert_eq!(display_width("⁉\u{fe0f}"), 1);

        // The variant selector-16 is also counted.
        #[cfg(not(feature = "unicode-width"))]
        assert_eq!(display_width("⁉\u{fe0f}"), 4);
    }

    // Ten emojis, each two columns wide regardless of whether the
    // `unicode-width` feature is enabled.
    #[test]
    fn display_width_emojis() {
        assert_eq!(display_width("😂😭🥺🤣✨😍🙏🥰😊🔥"), 20);
    }
}
434