1 //! Word splitting functionality.
2 //!
3 //! To wrap text into lines, long words sometimes need to be split
4 //! across lines. The [`WordSplitter`] enum defines this
5 //! functionality.
6 
7 use crate::core::{display_width, Word};
8 
/// The `WordSplitter` enum describes where words can be split.
///
/// Each variant selects a different strategy for finding split
/// points inside a word; see [`WordSplitter::split_points`] for how
/// the split points are computed.
///
/// If the textwrap crate has been compiled with the `hyphenation`
/// Cargo feature enabled, you will find a
/// [`WordSplitter::Hyphenation`] variant. Use this variant for
/// language-aware hyphenation:
///
/// ```
/// #[cfg(feature = "hyphenation")] {
///     use hyphenation::{Language, Load, Standard};
///     use textwrap::{wrap, Options, WordSplitter};
///
///     let text = "Oxidation is the loss of electrons.";
///     let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap();
///     let options = Options::new(8).word_splitter(WordSplitter::Hyphenation(dictionary));
///     assert_eq!(wrap(text, &options), vec!["Oxida-",
///                                           "tion is",
///                                           "the loss",
///                                           "of elec-",
///                                           "trons."]);
/// }
/// ```
///
/// Please see the documentation for the [hyphenation] crate for more
/// details.
///
/// [hyphenation]: https://docs.rs/hyphenation/
#[derive(Clone)]
pub enum WordSplitter {
    /// Use this as a [`Options.word_splitter`] to avoid any kind of
    /// hyphenation:
    ///
    /// ```
    /// use textwrap::{wrap, Options, WordSplitter};
    ///
    /// let options = Options::new(8).word_splitter(WordSplitter::NoHyphenation);
    /// assert_eq!(wrap("foo bar-baz", &options),
    ///            vec!["foo", "bar-baz"]);
    /// ```
    ///
    /// [`Options.word_splitter`]: super::Options::word_splitter
    NoHyphenation,

    /// `HyphenSplitter` is the default `WordSplitter` used by
    /// [`Options::new`](super::Options::new). It will split words on
    /// existing hyphens in the word.
    ///
    /// It will only use hyphens that are surrounded by alphanumeric
    /// characters, which prevents a word like `"--foo-bar"` from
    /// being split into `"--"` and `"foo-bar"`.
    ///
    /// # Examples
    ///
    /// ```
    /// use textwrap::WordSplitter;
    ///
    /// assert_eq!(WordSplitter::HyphenSplitter.split_points("--foo-bar"),
    ///            vec![6]);
    /// ```
    HyphenSplitter,

    /// Use a custom function as the word splitter.
    ///
    /// This variant lets you implement a custom word splitter using
    /// your own function. The function receives the word and returns
    /// the indices at which the word may be split.
    ///
    /// The returned indices are used to slice the word when the
    /// split points are consumed by [`split_words`], so they should
    /// be increasing byte offsets that lie on character boundaries
    /// and do not exceed `word.len()`. Slicing with any other index
    /// will panic.
    ///
    /// # Examples
    ///
    /// ```
    /// use textwrap::WordSplitter;
    ///
    /// fn split_at_underscore(word: &str) -> Vec<usize> {
    ///     word.match_indices('_').map(|(idx, _)| idx + 1).collect()
    /// }
    ///
    /// let word_splitter = WordSplitter::Custom(split_at_underscore);
    /// assert_eq!(word_splitter.split_points("a_long_identifier"),
    ///            vec![2, 7]);
    /// ```
    Custom(fn(word: &str) -> Vec<usize>),

    /// A hyphenation dictionary can be used to do language-specific
    /// hyphenation using patterns from the [hyphenation] crate.
    ///
    /// **Note:** Only available when the `hyphenation` Cargo feature is
    /// enabled.
    ///
    /// [hyphenation]: https://docs.rs/hyphenation/
    #[cfg(feature = "hyphenation")]
    Hyphenation(hyphenation::Standard),
}
100 
101 impl std::fmt::Debug for WordSplitter {
fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result102     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
103         match self {
104             WordSplitter::NoHyphenation => f.write_str("NoHyphenation"),
105             WordSplitter::HyphenSplitter => f.write_str("HyphenSplitter"),
106             WordSplitter::Custom(_) => f.write_str("Custom(...)"),
107             #[cfg(feature = "hyphenation")]
108             WordSplitter::Hyphenation(dict) => write!(f, "Hyphenation({})", dict.language()),
109         }
110     }
111 }
112 
113 impl PartialEq<WordSplitter> for WordSplitter {
eq(&self, other: &WordSplitter) -> bool114     fn eq(&self, other: &WordSplitter) -> bool {
115         match (self, other) {
116             (WordSplitter::NoHyphenation, WordSplitter::NoHyphenation) => true,
117             (WordSplitter::HyphenSplitter, WordSplitter::HyphenSplitter) => true,
118             #[cfg(feature = "hyphenation")]
119             (WordSplitter::Hyphenation(this_dict), WordSplitter::Hyphenation(other_dict)) => {
120                 this_dict.language() == other_dict.language()
121             }
122             (_, _) => false,
123         }
124     }
125 }
126 
127 impl WordSplitter {
128     /// Return all possible indices where `word` can be split.
129     ///
130     /// The indices are in the range `0..word.len()`. They point to
131     /// the index _after_ the split point, i.e., after `-` if
132     /// splitting on hyphens. This way, `word.split_at(idx)` will
133     /// break the word into two well-formed pieces.
134     ///
135     /// # Examples
136     ///
137     /// ```
138     /// use textwrap::WordSplitter;
139     /// assert_eq!(WordSplitter::NoHyphenation.split_points("cannot-be-split"), vec![]);
140     /// assert_eq!(WordSplitter::HyphenSplitter.split_points("can-be-split"), vec![4, 7]);
141     /// assert_eq!(WordSplitter::Custom(|word| vec![word.len()/2]).split_points("middle"), vec![3]);
142     /// ```
split_points(&self, word: &str) -> Vec<usize>143     pub fn split_points(&self, word: &str) -> Vec<usize> {
144         match self {
145             WordSplitter::NoHyphenation => Vec::new(),
146             WordSplitter::HyphenSplitter => {
147                 let mut splits = Vec::new();
148 
149                 for (idx, _) in word.match_indices('-') {
150                     // We only use hyphens that are surrounded by alphanumeric
151                     // characters. This is to avoid splitting on repeated hyphens,
152                     // such as those found in --foo-bar.
153                     let prev = word[..idx].chars().next_back();
154                     let next = word[idx + 1..].chars().next();
155 
156                     if prev.filter(|ch| ch.is_alphanumeric()).is_some()
157                         && next.filter(|ch| ch.is_alphanumeric()).is_some()
158                     {
159                         splits.push(idx + 1); // +1 due to width of '-'.
160                     }
161                 }
162 
163                 splits
164             }
165             WordSplitter::Custom(splitter_func) => splitter_func(word),
166             #[cfg(feature = "hyphenation")]
167             WordSplitter::Hyphenation(dictionary) => {
168                 use hyphenation::Hyphenator;
169                 dictionary.hyphenate(word).breaks
170             }
171         }
172     }
173 }
174 
175 /// Split words into smaller words according to the split points given
176 /// by `word_splitter`.
177 ///
178 /// Note that we split all words, regardless of their length. This is
179 /// to more cleanly separate the business of splitting (including
180 /// automatic hyphenation) from the business of word wrapping.
split_words<'a, I>( words: I, word_splitter: &'a WordSplitter, ) -> impl Iterator<Item = Word<'a>> where I: IntoIterator<Item = Word<'a>>,181 pub fn split_words<'a, I>(
182     words: I,
183     word_splitter: &'a WordSplitter,
184 ) -> impl Iterator<Item = Word<'a>>
185 where
186     I: IntoIterator<Item = Word<'a>>,
187 {
188     words.into_iter().flat_map(move |word| {
189         let mut prev = 0;
190         let mut split_points = word_splitter.split_points(&word).into_iter();
191         std::iter::from_fn(move || {
192             if let Some(idx) = split_points.next() {
193                 let need_hyphen = !word[..idx].ends_with('-');
194                 let w = Word {
195                     word: &word.word[prev..idx],
196                     width: display_width(&word[prev..idx]),
197                     whitespace: "",
198                     penalty: if need_hyphen { "-" } else { "" },
199                 };
200                 prev = idx;
201                 return Some(w);
202             }
203 
204             if prev < word.word.len() || prev == 0 {
205                 let w = Word {
206                     word: &word.word[prev..],
207                     width: display_width(&word[prev..]),
208                     whitespace: word.whitespace,
209                     penalty: word.penalty,
210                 };
211                 prev = word.word.len() + 1;
212                 return Some(w);
213             }
214 
215             None
216         })
217     })
218 }
219 
#[cfg(test)]
mod tests {
    use super::*;

    // Like assert_eq!, but the left-hand side is an iterator which
    // is collected into a vector before the comparison.
    macro_rules! assert_iter_eq {
        ($actual_iter:expr, $expected:expr) => {
            assert_eq!($actual_iter.collect::<Vec<_>>(), $expected);
        };
    }

    // Custom splitter which always reports one split point at index 3.
    fn split_at_three(_: &str) -> Vec<usize> {
        vec![3]
    }

    // Builds a three-column fragment without trailing whitespace.
    fn fragment<'a>(word: &'a str, penalty: &'a str) -> Word<'a> {
        Word {
            word,
            width: 3,
            whitespace: "",
            penalty,
        }
    }

    #[test]
    fn split_words_no_words() {
        let splitter = WordSplitter::HyphenSplitter;
        assert_iter_eq!(split_words(vec![], &splitter), vec![]);
    }

    #[test]
    fn split_words_empty_word() {
        let splitter = WordSplitter::HyphenSplitter;
        let input = vec![Word::from("   ")];
        assert_iter_eq!(split_words(input, &splitter), vec![Word::from("   ")]);
    }

    #[test]
    fn split_words_single_word() {
        let splitter = WordSplitter::HyphenSplitter;
        let input = vec![Word::from("foobar")];
        assert_iter_eq!(split_words(input, &splitter), vec![Word::from("foobar")]);
    }

    #[test]
    fn split_words_hyphen_splitter() {
        let splitter = WordSplitter::HyphenSplitter;
        let input = vec![Word::from("foo-bar")];
        assert_iter_eq!(
            split_words(input, &splitter),
            vec![Word::from("foo-"), Word::from("bar")]
        );
    }

    #[test]
    fn split_words_no_hyphenation() {
        let splitter = WordSplitter::NoHyphenation;
        let input = vec![Word::from("foo-bar")];
        assert_iter_eq!(split_words(input, &splitter), vec![Word::from("foo-bar")]);
    }

    #[test]
    fn split_words_adds_penalty() {
        let splitter = WordSplitter::Custom(split_at_three);

        // A split in the middle of plain text gets a "-" penalty.
        assert_iter_eq!(
            split_words(vec![Word::from("foobar")], &splitter),
            vec![fragment("foo", "-"), fragment("bar", "")]
        );

        // No penalty is added when the split lands right after an
        // existing hyphen.
        assert_iter_eq!(
            split_words(vec![Word::from("fo-bar")], &splitter),
            vec![fragment("fo-", ""), fragment("bar", "")]
        );
    }
}
315