1 use core::{char, cmp, fmt, str};
2 
3 #[cfg(feature = "std")]
4 use std::error;
5 
6 use crate::{ascii, bstr::BStr, ext_slice::ByteSlice};
7 
8 // The UTF-8 decoder provided here is based on the one presented here:
9 // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
10 //
11 // We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}`
12 // using regex-automata that is roughly the same size. The real benefit of
13 // Hoehrmann's formulation is that the byte class mapping below is manually
14 // tailored such that each byte's class doubles as a shift to mask out the
15 // bits necessary for constructing the leading bits of each codepoint value
16 // from the initial byte.
17 //
18 // There are some minor differences between this implementation and Hoehrmann's
19 // formulation.
20 //
21 // Firstly, we make REJECT have state ID 0, since it makes the state table
22 // itself a little easier to read and is consistent with the notion that 0
23 // means "false" or "bad."
24 //
25 // Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast
26 // path.
27 //
28 // Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction
29 // in the core decoding loop. (Which is what regex-automata would do by
30 // default.)
31 //
32 // Fourthly, we split the byte class mapping and transition table into two
33 // arrays because it's clearer.
34 //
35 // It is unlikely that this is the fastest way to do UTF-8 decoding, however,
36 // it is fairly simple.
37 
// State IDs are pre-multiplied by the number of byte classes (12, one per
// column of `STATES_FORWARD`), so a state ID is directly the offset of its
// row in the transition table.

/// The state in which a complete UTF-8 sequence has just been consumed (row 1
/// of the transition table, i.e. `1 * 12`). It is also the start state of
/// every decode/validate loop in this file.
const ACCEPT: usize = 12;
/// The dead state: every transition out of row 0 of the table leads back to
/// row 0, so once entered the input can no longer be accepted.
const REJECT: usize = 0;
40 
/// Maps every possible byte value to its equivalence class, a value in
/// `0..=11`.
///
/// The class selects the column of `STATES_FORWARD` to transition on. For
/// the first byte of a sequence it also doubles as the shift applied to
/// `0xFF` in `decode_step` to mask out the codepoint bits carried by that
/// byte. Each row below covers 32 consecutive byte values, so the first four
/// rows are the ASCII range (class 0) and the fifth and sixth rows are the
/// continuation bytes `0x80..=0xBF` (classes 1, 9 and 7).
///
/// SAFETY: The decode functions below rely on the correctness of these
/// equivalence classes.
#[cfg_attr(rustfmt, rustfmt::skip)]
const CLASSES: [u8; 256] = [
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
];
54 
/// The forward UTF-8 transition table, laid out row-major as 9 states by 12
/// byte classes (108 entries).
///
/// It is indexed by `state + class`, where `state` is a pre-multiplied state
/// ID (a multiple of 12, at most 96) and `class` comes from `CLASSES` (at
/// most 11). Every entry is itself a pre-multiplied state ID, so the result
/// of a lookup can be used directly as the next `state`. Row 0 is the
/// `REJECT` state and row 1 is the `ACCEPT` state.
///
/// SAFETY: The decode functions below rely on the correctness of this state
/// machine. In particular, the unchecked lookup in `validate` relies on this
/// table having 108 entries, none of which exceeds 96.
#[cfg_attr(rustfmt, rustfmt::skip)]
const STATES_FORWARD: &[u8] = &[
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
  0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
  0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
  0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
  0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
  0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
69 
/// An iterator over Unicode scalar values in a byte string.
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// The iterator can be advanced from either end, since it also implements
/// `DoubleEndedIterator`.
///
/// This iterator is created by the
/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
/// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
#[derive(Clone, Debug)]
pub struct Chars<'a> {
    // The bytes that have not been yielded yet. Forward iteration trims this
    // slice from the front and backward iteration trims it from the back.
    bs: &'a [u8],
}
83 
84 impl<'a> Chars<'a> {
new(bs: &'a [u8]) -> Chars<'a>85     pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> {
86         Chars { bs }
87     }
88 
89     /// View the underlying data as a subslice of the original data.
90     ///
91     /// The slice returned has the same lifetime as the original slice, and so
92     /// the iterator can continue to be used while this exists.
93     ///
94     /// # Examples
95     ///
96     /// ```
97     /// use bstr::ByteSlice;
98     ///
99     /// let mut chars = b"abc".chars();
100     ///
101     /// assert_eq!(b"abc", chars.as_bytes());
102     /// chars.next();
103     /// assert_eq!(b"bc", chars.as_bytes());
104     /// chars.next();
105     /// chars.next();
106     /// assert_eq!(b"", chars.as_bytes());
107     /// ```
108     #[inline]
as_bytes(&self) -> &'a [u8]109     pub fn as_bytes(&self) -> &'a [u8] {
110         self.bs
111     }
112 }
113 
114 impl<'a> Iterator for Chars<'a> {
115     type Item = char;
116 
117     #[inline]
next(&mut self) -> Option<char>118     fn next(&mut self) -> Option<char> {
119         let (ch, size) = decode_lossy(self.bs);
120         if size == 0 {
121             return None;
122         }
123         self.bs = &self.bs[size..];
124         Some(ch)
125     }
126 }
127 
128 impl<'a> DoubleEndedIterator for Chars<'a> {
129     #[inline]
next_back(&mut self) -> Option<char>130     fn next_back(&mut self) -> Option<char> {
131         let (ch, size) = decode_last_lossy(self.bs);
132         if size == 0 {
133             return None;
134         }
135         self.bs = &self.bs[..self.bs.len() - size];
136         Some(ch)
137     }
138 }
139 
/// An iterator over Unicode scalar values in a byte string and their
/// byte index positions.
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// Note that this is slightly different from the `CharIndices` iterator
/// provided by the standard library. Aside from working on possibly invalid
/// UTF-8, this iterator provides both the corresponding starting and ending
/// byte indices of each codepoint yielded. The ending position is necessary to
/// slice the original byte string when invalid UTF-8 bytes are converted into
/// a Unicode replacement codepoint, since a single replacement codepoint can
/// substitute anywhere from 1 to 3 invalid bytes (inclusive).
///
/// Each item is a `(start, end, char)` tuple, where `start..end` is the byte
/// range in the original byte string that the codepoint was decoded from.
///
/// This iterator is created by the
/// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided
/// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
#[derive(Clone, Debug)]
pub struct CharIndices<'a> {
    // The bytes that have not been yielded yet, trimmed from either end as
    // the iterator advances.
    bs: &'a [u8],
    // Offset, in the original byte string, of the first byte of `bs`.
    forward_index: usize,
    // Offset, in the original byte string, one past the last byte of `bs`.
    reverse_index: usize,
}
164 
165 impl<'a> CharIndices<'a> {
new(bs: &'a [u8]) -> CharIndices<'a>166     pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
167         CharIndices { bs, forward_index: 0, reverse_index: bs.len() }
168     }
169 
170     /// View the underlying data as a subslice of the original data.
171     ///
172     /// The slice returned has the same lifetime as the original slice, and so
173     /// the iterator can continue to be used while this exists.
174     ///
175     /// # Examples
176     ///
177     /// ```
178     /// use bstr::ByteSlice;
179     ///
180     /// let mut it = b"abc".char_indices();
181     ///
182     /// assert_eq!(b"abc", it.as_bytes());
183     /// it.next();
184     /// assert_eq!(b"bc", it.as_bytes());
185     /// it.next();
186     /// it.next();
187     /// assert_eq!(b"", it.as_bytes());
188     /// ```
189     #[inline]
as_bytes(&self) -> &'a [u8]190     pub fn as_bytes(&self) -> &'a [u8] {
191         self.bs
192     }
193 }
194 
195 impl<'a> Iterator for CharIndices<'a> {
196     type Item = (usize, usize, char);
197 
198     #[inline]
next(&mut self) -> Option<(usize, usize, char)>199     fn next(&mut self) -> Option<(usize, usize, char)> {
200         let index = self.forward_index;
201         let (ch, size) = decode_lossy(self.bs);
202         if size == 0 {
203             return None;
204         }
205         self.bs = &self.bs[size..];
206         self.forward_index += size;
207         Some((index, index + size, ch))
208     }
209 }
210 
211 impl<'a> DoubleEndedIterator for CharIndices<'a> {
212     #[inline]
next_back(&mut self) -> Option<(usize, usize, char)>213     fn next_back(&mut self) -> Option<(usize, usize, char)> {
214         let (ch, size) = decode_last_lossy(self.bs);
215         if size == 0 {
216             return None;
217         }
218         self.bs = &self.bs[..self.bs.len() - size];
219         self.reverse_index -= size;
220         Some((self.reverse_index, self.reverse_index + size, ch))
221     }
222 }
223 
// `next` and `next_back` only return `None` when the decoder consumed zero
// bytes, and `bs` is only ever replaced by a sub-slice of itself, so the
// remaining input never grows back after the iterator is exhausted.
impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {}
225 
/// An iterator over chunks of valid UTF-8 in a byte slice.
///
/// Each item is a [`Utf8Chunk`](struct.Utf8Chunk.html): a run of valid UTF-8
/// followed by the invalid bytes (if any) that terminated it.
///
/// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
#[derive(Clone, Debug)]
pub struct Utf8Chunks<'a> {
    // The bytes that have not been split into chunks yet.
    pub(super) bytes: &'a [u8],
}
233 
/// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
///
/// This is yielded by the
/// [`Utf8Chunks`](struct.Utf8Chunks.html)
/// iterator, which can be created via the
/// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks)
/// method.
///
/// The `'a` lifetime parameter corresponds to the lifetime of the bytes that
/// are being iterated over.
// `Debug` and `PartialEq` are only derived in test builds.
#[cfg_attr(test, derive(Debug, PartialEq))]
pub struct Utf8Chunk<'a> {
    /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes.
    ///
    /// This is empty between adjacent invalid UTF-8 byte sequences.
    valid: &'a str,
    /// A sequence of invalid UTF-8 bytes that immediately follows `valid`.
    ///
    /// Can only be empty in the last chunk.
    ///
    /// Should be replaced by a single Unicode replacement character, if not
    /// empty.
    invalid: &'a BStr,
    /// Indicates whether the invalid sequence could've been valid if there
    /// were more bytes.
    ///
    /// Can only be true in the last chunk.
    incomplete: bool,
}
263 
impl<'a> Utf8Chunk<'a> {
    /// Returns the (possibly empty) valid UTF-8 bytes in this chunk.
    ///
    /// This may be empty if there are consecutive sequences of invalid UTF-8
    /// bytes.
    #[inline]
    pub fn valid(&self) -> &'a str {
        self.valid
    }

    /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that
    /// immediately follow the valid UTF-8 bytes in this chunk.
    ///
    /// This is only empty when this chunk corresponds to the last chunk in
    /// the original bytes.
    ///
    /// The maximum length of this slice is 3. That is, an invalid UTF-8 byte
    /// sequence longer than 1 byte always corresponds to a valid _prefix_ of
    /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution
    /// of maximal subparts" strategy that is described in more detail in the
    /// docs for the
    /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
    /// method.
    #[inline]
    pub fn invalid(&self) -> &'a [u8] {
        // The field is stored as a `BStr`; callers get the raw bytes.
        self.invalid.as_bytes()
    }

    /// Returns whether the invalid sequence might still become valid if more
    /// bytes are added.
    ///
    /// Returns true if the end of the input was reached unexpectedly,
    /// without encountering an unexpected byte.
    ///
    /// This can only be the case for the last chunk.
    #[inline]
    pub fn incomplete(&self) -> bool {
        self.incomplete
    }
}
304 
305 impl<'a> Iterator for Utf8Chunks<'a> {
306     type Item = Utf8Chunk<'a>;
307 
308     #[inline]
next(&mut self) -> Option<Utf8Chunk<'a>>309     fn next(&mut self) -> Option<Utf8Chunk<'a>> {
310         if self.bytes.is_empty() {
311             return None;
312         }
313         match validate(self.bytes) {
314             Ok(()) => {
315                 let valid = self.bytes;
316                 self.bytes = &[];
317                 Some(Utf8Chunk {
318                     // SAFETY: This is safe because of the guarantees provided
319                     // by utf8::validate.
320                     valid: unsafe { str::from_utf8_unchecked(valid) },
321                     invalid: [].as_bstr(),
322                     incomplete: false,
323                 })
324             }
325             Err(e) => {
326                 let (valid, rest) = self.bytes.split_at(e.valid_up_to());
327                 // SAFETY: This is safe because of the guarantees provided by
328                 // utf8::validate.
329                 let valid = unsafe { str::from_utf8_unchecked(valid) };
330                 let (invalid_len, incomplete) = match e.error_len() {
331                     Some(n) => (n, false),
332                     None => (rest.len(), true),
333                 };
334                 let (invalid, rest) = rest.split_at(invalid_len);
335                 self.bytes = rest;
336                 Some(Utf8Chunk {
337                     valid,
338                     invalid: invalid.as_bstr(),
339                     incomplete,
340                 })
341             }
342         }
343     }
344 
345     #[inline]
size_hint(&self) -> (usize, Option<usize>)346     fn size_hint(&self) -> (usize, Option<usize>) {
347         if self.bytes.is_empty() {
348             (0, Some(0))
349         } else {
350             (1, Some(self.bytes.len()))
351         }
352     }
353 }
354 
// `next` returns `None` exactly when `bytes` is empty, and `bytes` is only
// ever replaced by a suffix of itself (or an empty slice), so it stays empty
// once the iterator is exhausted.
impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
356 
/// An error that occurs when UTF-8 decoding fails.
///
/// This error occurs when attempting to convert a non-UTF-8 byte
/// string to a Rust string that must be valid UTF-8. For example,
/// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method.
///
/// # Example
///
/// This example shows what happens when a given byte sequence is invalid,
/// but ends with a sequence that is a possible prefix of valid UTF-8.
///
/// ```
/// use bstr::{B, ByteSlice};
///
/// let s = B(b"foobar\xF1\x80\x80");
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), None);
/// ```
///
/// This example shows what happens when a given byte sequence contains
/// invalid UTF-8.
///
/// ```
/// use bstr::ByteSlice;
///
/// let s = b"foobar\xF1\x80\x80quux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// // The error length reports the maximum number of bytes that correspond to
/// // a valid prefix of a UTF-8 encoded codepoint.
/// assert_eq!(err.error_len(), Some(3));
///
/// // In contrast to the above which contains a single invalid prefix,
/// // consider the case of multiple individual bytes that are never valid
/// // prefixes. Note how the value of error_len changes!
/// let s = b"foobar\xFF\xFFquux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
///
/// // The fact that it's an invalid prefix does not change error_len even
/// // when it immediately precedes the end of the string.
/// let s = b"foobar\xFF";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
/// ```
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Utf8Error {
    // Byte offset one past the last byte known to be valid UTF-8.
    valid_up_to: usize,
    // Number of invalid bytes starting at `valid_up_to`, or `None` when the
    // input ended before a codepoint could be completed.
    error_len: Option<usize>,
}
410 
impl Utf8Error {
    /// Returns the byte index of the position immediately following the last
    /// valid UTF-8 byte.
    ///
    /// # Example
    ///
    /// This example shows how `valid_up_to` can be used to retrieve a
    /// possibly empty prefix that is guaranteed to be valid UTF-8:
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let s = b"foobar\xF1\x80\x80quux";
    /// let err = s.to_str().unwrap_err();
    ///
    /// // This is guaranteed to never panic.
    /// let string = s[..err.valid_up_to()].to_str().unwrap();
    /// assert_eq!(string, "foobar");
    /// ```
    #[inline]
    pub fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }

    /// Returns the total number of invalid UTF-8 bytes immediately following
    /// the position returned by `valid_up_to`. This value is always at least
    /// `1`, but can be up to `3` if the bytes form a valid prefix of some
    /// UTF-8 encoded codepoint.
    ///
    /// If the end of the original input was found before a valid UTF-8 encoded
    /// codepoint could be completed, then this returns `None`. This is useful
    /// when processing streams, where a `None` value signals that more input
    /// might be needed.
    #[inline]
    pub fn error_len(&self) -> Option<usize> {
        self.error_len
    }
}
449 
// Only available with the `std` feature, since the `error` module is imported
// from `std` at the top of this file.
//
// NOTE(review): `Error::description` is deprecated in the standard library in
// favor of `Display`; confirm whether this override still needs to be kept.
#[cfg(feature = "std")]
impl error::Error for Utf8Error {
    fn description(&self) -> &str {
        "invalid UTF-8"
    }
}
456 
457 impl fmt::Display for Utf8Error {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result458     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
459         write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to)
460     }
461 }
462 
/// Returns OK if and only if the given slice is completely valid UTF-8.
///
/// If the slice isn't valid UTF-8, then an error is returned that explains
/// the first location at which invalid UTF-8 was detected.
///
/// # Errors
///
/// Returns a `Utf8Error` describing the first invalid position. Its
/// `error_len` is `None` when the input ended in the middle of a sequence,
/// and `Some(n)` with `n >= 1` when a byte was rejected by the automaton.
pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
    // The fast path for validating UTF-8. It steps through a UTF-8 automaton
    // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
    // detected, it backs up and runs the slower version of the UTF-8 automaton
    // to determine correct error information.
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
        let mut state = ACCEPT;
        let mut i = 0;

        while i < slice.len() {
            let b = slice[i];

            // ASCII fast path. If we see two consecutive ASCII bytes, then try
            // to validate as much ASCII as possible very quickly.
            //
            // This is only attempted from the ACCEPT state, i.e. when we are
            // not in the middle of a multi-byte sequence.
            if state == ACCEPT
                && b <= 0x7F
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
            {
                // NOTE(review): presumably returns the offset of the first
                // non-ASCII byte in the given slice (or its length if there
                // is none), so that `i` lands on the next byte that needs
                // the automaton. Confirm against the `ascii` module.
                i += ascii::first_non_ascii_byte(&slice[i..]);
                continue;
            }

            state = step(state, b);
            if state == REJECT {
                // `i` is the index of the byte that caused the rejection.
                return Err(find_valid_up_to(slice, i));
            }
            i += 1;
        }
        // Ending anywhere other than ACCEPT means the input stopped in the
        // middle of a multi-byte sequence.
        if state != ACCEPT {
            Err(find_valid_up_to(slice, slice.len()))
        } else {
            Ok(())
        }
    }

    // Given the first position at which a UTF-8 sequence was determined to be
    // invalid, return an error that correctly reports the position at which
    // the last complete UTF-8 sequence ends.
    #[inline(never)]
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
        // In order to find the last valid byte, we need to back up an amount
        // that guarantees every preceding byte is part of a valid UTF-8
        // code unit sequence. To do this, we simply locate the last leading
        // byte that occurs before rejected_at.
        let mut backup = rejected_at.saturating_sub(1);
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
            backup -= 1;
        }
        // Re-run the slow automaton on the window that starts at the backed-up
        // position and ends just past the rejected byte (clamped to the end of
        // the input). `slow` is expected to fail on this window; `unwrap_err`
        // panics if it does not.
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
        let mut err = slow(&slice[backup..upto]).unwrap_err();
        // `slow` reports offsets relative to the window, so translate back to
        // an offset into the full slice.
        err.valid_up_to += backup;
        err
    }

    // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
    // when an invalid sequence is found. This is split out from validate so
    // that the fast path doesn't need to keep track of the position of the
    // last valid UTF-8 byte. In particular, tracking this requires checking
    // for an ACCEPT state on each byte, which degrades throughput pretty
    // badly.
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
        let mut state = ACCEPT;
        let mut valid_up_to = 0;
        for (i, &b) in slice.iter().enumerate() {
            state = step(state, b);
            if state == ACCEPT {
                // A complete sequence ended at byte `i`.
                valid_up_to = i + 1;
            } else if state == REJECT {
                // Our error length must always be at least 1.
                let error_len = Some(cmp::max(1, i - valid_up_to));
                return Err(Utf8Error { valid_up_to, error_len });
            }
        }
        // Running out of input mid-sequence is reported without an error
        // length.
        if state != ACCEPT {
            Err(Utf8Error { valid_up_to, error_len: None })
        } else {
            Ok(())
        }
    }

    // Advance to the next state given the current state and current byte.
    fn step(state: usize, b: u8) -> usize {
        let class = CLASSES[b as usize];
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
        // valid by construction of the state machine and the byte equivalence
        // classes.
        unsafe {
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
        }
    }

    fast(slice)
}
562 
563 /// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
564 ///
565 /// When successful, the corresponding Unicode scalar value is returned along
566 /// with the number of bytes it was encoded with. The number of bytes consumed
567 /// for a successful decode is always between 1 and 4, inclusive.
568 ///
569 /// When unsuccessful, `None` is returned along with the number of bytes that
570 /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
571 /// the number of bytes consumed is always between 0 and 3, inclusive, where
572 /// 0 is only returned when `slice` is empty.
573 ///
574 /// # Examples
575 ///
576 /// Basic usage:
577 ///
578 /// ```
579 /// use bstr::decode_utf8;
580 ///
581 /// // Decoding a valid codepoint.
582 /// let (ch, size) = decode_utf8(b"\xE2\x98\x83");
583 /// assert_eq!(Some('☃'), ch);
584 /// assert_eq!(3, size);
585 ///
586 /// // Decoding an incomplete codepoint.
587 /// let (ch, size) = decode_utf8(b"\xE2\x98");
588 /// assert_eq!(None, ch);
589 /// assert_eq!(2, size);
590 /// ```
591 ///
592 /// This example shows how to iterate over all codepoints in UTF-8 encoded
593 /// bytes, while replacing invalid UTF-8 sequences with the replacement
594 /// codepoint:
595 ///
596 /// ```
597 /// use bstr::{B, decode_utf8};
598 ///
599 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
600 /// let mut chars = vec![];
601 /// while !bytes.is_empty() {
602 ///     let (ch, size) = decode_utf8(bytes);
603 ///     bytes = &bytes[size..];
604 ///     chars.push(ch.unwrap_or('\u{FFFD}'));
605 /// }
606 /// assert_eq!(vec!['☃', '\u{FFFD}', '��', '\u{FFFD}', 'a'], chars);
607 /// ```
608 #[inline]
decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize)609 pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
610     let slice = slice.as_ref();
611     match slice.get(0) {
612         None => return (None, 0),
613         Some(&b) if b <= 0x7F => return (Some(b as char), 1),
614         _ => {}
615     }
616 
617     let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
618     while i < slice.len() {
619         decode_step(&mut state, &mut cp, slice[i]);
620         i += 1;
621 
622         if state == ACCEPT {
623             // SAFETY: This is safe because `decode_step` guarantees that
624             // `cp` is a valid Unicode scalar value in an ACCEPT state.
625             let ch = unsafe { char::from_u32_unchecked(cp) };
626             return (Some(ch), i);
627         } else if state == REJECT {
628             // At this point, we always want to advance at least one byte.
629             return (None, cmp::max(1, i.saturating_sub(1)));
630         }
631     }
632     (None, i)
633 }
634 
635 /// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a
636 /// slice.
637 ///
638 /// When successful, the corresponding Unicode scalar value is returned along
639 /// with the number of bytes it was encoded with. The number of bytes consumed
640 /// for a successful decode is always between 1 and 4, inclusive.
641 ///
642 /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
643 /// along with the number of bytes that make up a maximal prefix of a valid
644 /// UTF-8 code unit sequence. In this case, the number of bytes consumed is
645 /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
646 /// empty.
647 ///
648 /// # Examples
649 ///
650 /// Basic usage:
651 ///
652 /// ```ignore
653 /// use bstr::decode_utf8_lossy;
654 ///
655 /// // Decoding a valid codepoint.
656 /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83");
657 /// assert_eq!('☃', ch);
658 /// assert_eq!(3, size);
659 ///
660 /// // Decoding an incomplete codepoint.
661 /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98");
662 /// assert_eq!('\u{FFFD}', ch);
663 /// assert_eq!(2, size);
664 /// ```
665 ///
666 /// This example shows how to iterate over all codepoints in UTF-8 encoded
667 /// bytes, while replacing invalid UTF-8 sequences with the replacement
668 /// codepoint:
669 ///
670 /// ```ignore
671 /// use bstr::{B, decode_utf8_lossy};
672 ///
673 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
674 /// let mut chars = vec![];
675 /// while !bytes.is_empty() {
676 ///     let (ch, size) = decode_utf8_lossy(bytes);
677 ///     bytes = &bytes[size..];
678 ///     chars.push(ch);
679 /// }
680 /// assert_eq!(vec!['☃', '\u{FFFD}', '��', '\u{FFFD}', 'a'], chars);
681 /// ```
682 #[inline]
decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize)683 pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
684     match decode(slice) {
685         (Some(ch), size) => (ch, size),
686         (None, size) => ('\u{FFFD}', size),
687     }
688 }
689 
690 /// UTF-8 decode a single Unicode scalar value from the end of a slice.
691 ///
692 /// When successful, the corresponding Unicode scalar value is returned along
693 /// with the number of bytes it was encoded with. The number of bytes consumed
694 /// for a successful decode is always between 1 and 4, inclusive.
695 ///
696 /// When unsuccessful, `None` is returned along with the number of bytes that
697 /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
698 /// the number of bytes consumed is always between 0 and 3, inclusive, where
699 /// 0 is only returned when `slice` is empty.
700 ///
701 /// # Examples
702 ///
703 /// Basic usage:
704 ///
705 /// ```
706 /// use bstr::decode_last_utf8;
707 ///
708 /// // Decoding a valid codepoint.
709 /// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83");
710 /// assert_eq!(Some('☃'), ch);
711 /// assert_eq!(3, size);
712 ///
713 /// // Decoding an incomplete codepoint.
714 /// let (ch, size) = decode_last_utf8(b"\xE2\x98");
715 /// assert_eq!(None, ch);
716 /// assert_eq!(2, size);
717 /// ```
718 ///
719 /// This example shows how to iterate over all codepoints in UTF-8 encoded
720 /// bytes in reverse, while replacing invalid UTF-8 sequences with the
721 /// replacement codepoint:
722 ///
723 /// ```
724 /// use bstr::{B, decode_last_utf8};
725 ///
726 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
727 /// let mut chars = vec![];
728 /// while !bytes.is_empty() {
729 ///     let (ch, size) = decode_last_utf8(bytes);
730 ///     bytes = &bytes[..bytes.len()-size];
731 ///     chars.push(ch.unwrap_or('\u{FFFD}'));
732 /// }
733 /// assert_eq!(vec!['a', '\u{FFFD}', '��', '\u{FFFD}', '☃'], chars);
734 /// ```
735 #[inline]
decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize)736 pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
737     // TODO: We could implement this by reversing the UTF-8 automaton, but for
738     // now, we do it the slow way by using the forward automaton.
739 
740     let slice = slice.as_ref();
741     if slice.is_empty() {
742         return (None, 0);
743     }
744     let mut start = slice.len() - 1;
745     let limit = slice.len().saturating_sub(4);
746     while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) {
747         start -= 1;
748     }
749     let (ch, size) = decode(&slice[start..]);
750     // If we didn't consume all of the bytes, then that means there's at least
751     // one stray byte that never occurs in a valid code unit prefix, so we can
752     // advance by one byte.
753     if start + size != slice.len() {
754         (None, 1)
755     } else {
756         (ch, size)
757     }
758 }
759 
760 /// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice.
761 ///
762 /// When successful, the corresponding Unicode scalar value is returned along
763 /// with the number of bytes it was encoded with. The number of bytes consumed
764 /// for a successful decode is always between 1 and 4, inclusive.
765 ///
766 /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
767 /// along with the number of bytes that make up a maximal prefix of a valid
768 /// UTF-8 code unit sequence. In this case, the number of bytes consumed is
769 /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
770 /// empty.
771 ///
772 /// # Examples
773 ///
774 /// Basic usage:
775 ///
776 /// ```ignore
777 /// use bstr::decode_last_utf8_lossy;
778 ///
779 /// // Decoding a valid codepoint.
780 /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83");
781 /// assert_eq!('☃', ch);
782 /// assert_eq!(3, size);
783 ///
784 /// // Decoding an incomplete codepoint.
785 /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98");
786 /// assert_eq!('\u{FFFD}', ch);
787 /// assert_eq!(2, size);
788 /// ```
789 ///
790 /// This example shows how to iterate over all codepoints in UTF-8 encoded
791 /// bytes in reverse, while replacing invalid UTF-8 sequences with the
792 /// replacement codepoint:
793 ///
794 /// ```ignore
795 /// use bstr::decode_last_utf8_lossy;
796 ///
797 /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
798 /// let mut chars = vec![];
799 /// while !bytes.is_empty() {
800 ///     let (ch, size) = decode_last_utf8_lossy(bytes);
801 ///     bytes = &bytes[..bytes.len()-size];
802 ///     chars.push(ch);
803 /// }
804 /// assert_eq!(vec!['a', '\u{FFFD}', '��', '\u{FFFD}', '☃'], chars);
805 /// ```
806 #[inline]
decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize)807 pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
808     match decode_last(slice) {
809         (Some(ch), size) => (ch, size),
810         (None, size) => ('\u{FFFD}', size),
811     }
812 }
813 
814 /// SAFETY: The decode function relies on state being equal to ACCEPT only if
815 /// cp is a valid Unicode scalar value.
816 #[inline]
decode_step(state: &mut usize, cp: &mut u32, b: u8)817 pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
818     let class = CLASSES[b as usize];
819     if *state == ACCEPT {
820         *cp = (0xFF >> class) & (b as u32);
821     } else {
822         *cp = (b as u32 & 0b111111) | (*cp << 6);
823     }
824     *state = STATES_FORWARD[*state + class as usize] as usize;
825 }
826 
/// Reports whether `b` is anything other than a UTF-8 continuation byte.
///
/// That is, this returns true if and only if `b` is either a valid leading
/// UTF-8 byte (ASCII included), or a byte that can never appear anywhere in
/// a valid UTF-8 sequence.
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
    // Only the two most significant bits matter:
    //
    //     0b00, 0b01 :: ASCII (\x00-\x7F)
    //     0b10       :: continuation byte (\x80-\xBF)
    //     0b11       :: leading byte of a 2/3/4-byte sequence, or one of the
    //                   bytes that never occur in valid UTF-8 (\xC0, \xC1
    //                   and \xF5-\xFF all start with 0b11)
    //
    // So everything except the 0b10 pattern qualifies.
    b >> 6 != 0b10
}
852 
/// Unit tests for UTF-8 validation, forward/reverse decoding (strict and
/// lossy), the `chars`/`char_indices` iterators and `Utf8Chunks`.
#[cfg(all(test, feature = "std"))]
mod tests {
    use std::char;

    use crate::{
        ext_slice::{ByteSlice, B},
        tests::LOSSY_TESTS,
        utf8::{self, Utf8Error},
    };

    /// Builds a `Utf8Error` with no `error_len`, as expected by the
    /// "incomplete (EOF)" cases in `validate_errors`.
    fn utf8e(valid_up_to: usize) -> Utf8Error {
        Utf8Error { valid_up_to, error_len: None }
    }

    /// Builds a `Utf8Error` carrying an explicit `error_len`.
    fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error {
        Utf8Error { valid_up_to, error_len: Some(error_len) }
    }

    /// Every Unicode scalar value, encoded by the standard library, must
    /// pass validation. Skipped under miri.
    #[test]
    #[cfg(not(miri))]
    fn validate_all_codepoints() {
        for i in 0..(0x10FFFF + 1) {
            // `char::from_u32` rejects values that are not scalar values.
            let cp = match char::from_u32(i) {
                None => continue,
                Some(cp) => cp,
            };
            let mut buf = [0; 4];
            let s = cp.encode_utf8(&mut buf);
            assert_eq!(Ok(()), utf8::validate(s.as_bytes()));
        }
    }

    /// Valid inputs mixing 1-, 3- and 4-byte sequences must validate.
    #[test]
    fn validate_multiple_codepoints() {
        assert_eq!(Ok(()), utf8::validate(b"abc"));
        assert_eq!(Ok(()), utf8::validate(b"a\xE2\x98\x83a"));
        assert_eq!(Ok(()), utf8::validate(b"a\xF0\x9D\x9C\xB7a"));
        assert_eq!(Ok(()), utf8::validate(b"\xE2\x98\x83\xF0\x9D\x9C\xB7"));
        assert_eq!(
            Ok(()),
            utf8::validate(b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a")
        );
        assert_eq!(
            Ok(()),
            utf8::validate(b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD")
        );
    }

    /// Invalid inputs must report the expected `valid_up_to` and
    /// `error_len`.
    #[test]
    fn validate_errors() {
        // single invalid byte
        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xFF"));
        // single invalid byte after ASCII
        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xFF"));
        // single invalid byte after 2 byte sequence
        assert_eq!(Err(utf8e2(2, 1)), utf8::validate(b"\xCE\xB2\xFF"));
        // single invalid byte after 3 byte sequence
        assert_eq!(Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xFF"));
        // single invalid byte after 4 byte sequence
        assert_eq!(Err(utf8e2(4, 1)), utf8::validate(b"\xF0\x9D\x9D\xB1\xFF"));

        // An invalid 2-byte sequence with a valid 1-byte prefix.
        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCE\xF0"));
        // An invalid 3-byte sequence with a valid 2-byte prefix.
        assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98\xF0"));
        // An invalid 4-byte sequence with a valid 3-byte prefix.
        assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9D\xF0"));

        // An overlong sequence. Should be \xE2\x82\xAC, but we encode the
        // same codepoint value in 4 bytes. This not only tests that we reject
        // overlong sequences, but that we get valid_up_to correct.
        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xF0\x82\x82\xAC"));
        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xF0\x82\x82\xAC"));
        assert_eq!(
            Err(utf8e2(3, 1)),
            utf8::validate(b"\xE2\x98\x83\xF0\x82\x82\xAC")
        );

        // Check that encoding a surrogate codepoint using the UTF-8 scheme
        // fails validation.
        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xED\xA0\x80"));
        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xED\xA0\x80"));
        assert_eq!(
            Err(utf8e2(3, 1)),
            utf8::validate(b"\xE2\x98\x83\xED\xA0\x80")
        );

        // Check that an incomplete 2-byte sequence fails.
        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCEa"));
        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xCEa"));
        assert_eq!(
            Err(utf8e2(3, 1)),
            utf8::validate(b"\xE2\x98\x83\xCE\xE2\x98\x83")
        );
        // Check that an incomplete 3-byte sequence fails.
        assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98a"));
        assert_eq!(Err(utf8e2(1, 2)), utf8::validate(b"a\xE2\x98a"));
        assert_eq!(
            Err(utf8e2(3, 2)),
            utf8::validate(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83")
        );
        // Check that an incomplete 4-byte sequence fails.
        assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9Ca"));
        assert_eq!(Err(utf8e2(1, 3)), utf8::validate(b"a\xF0\x9D\x9Ca"));
        assert_eq!(
            Err(utf8e2(4, 3)),
            utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83")
        );
        assert_eq!(
            Err(utf8e2(6, 3)),
            utf8::validate(b"foobar\xF1\x80\x80quux")
        );

        // Check that an incomplete (EOF) 2-byte sequence fails.
        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xCE"));
        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xCE"));
        assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xCE"));
        // Check that an incomplete (EOF) 3-byte sequence fails.
        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xE2\x98"));
        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xE2\x98"));
        assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xE2\x98"));
        // Check that an incomplete (EOF) 4-byte sequence fails.
        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xF0\x9D\x9C"));
        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xF0\x9D\x9C"));
        assert_eq!(
            Err(utf8e(4)),
            utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C")
        );

        // Test that we error correctly even after long valid sequences. This
        // checks that our "backup" logic for detecting errors is correct.
        assert_eq!(
            Err(utf8e2(8, 1)),
            utf8::validate(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF")
        );
    }

    /// Forward decoding of valid strings yields their chars in order.
    #[test]
    fn decode_valid() {
        /// Repeatedly decodes the first codepoint of `s` until it is empty.
        fn d(mut s: &str) -> Vec<char> {
            let mut chars = vec![];
            while !s.is_empty() {
                let (ch, size) = utf8::decode(s.as_bytes());
                s = &s[size..];
                chars.push(ch.unwrap());
            }
            chars
        }

        assert_eq!(vec!['☃'], d("☃"));
        assert_eq!(vec!['☃', '☃'], d("☃☃"));
        assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
        assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
        // Supplementary-plane codepoints (U+1D5EE..=U+1D5F2), which take four
        // bytes each in UTF-8. Spelled as escapes to keep the source robust
        // against encoding damage.
        assert_eq!(
            vec![
                '\u{1D5EE}',
                '\u{1D5EF}',
                '\u{1D5F0}',
                '\u{1D5F1}',
                '\u{1D5F2}',
            ],
            d("\u{1D5EE}\u{1D5EF}\u{1D5F0}\u{1D5F1}\u{1D5F2}")
        );
    }

    /// Forward decoding of invalid input yields `None` plus the number of
    /// bytes consumed.
    #[test]
    fn decode_invalid() {
        let (ch, size) = utf8::decode(b"");
        assert_eq!(None, ch);
        assert_eq!(0, size);

        let (ch, size) = utf8::decode(b"\xFF");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode(b"\xCE\xF0");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode(b"\xE2\x98\xF0");
        assert_eq!(None, ch);
        assert_eq!(2, size);

        let (ch, size) = utf8::decode(b"\xF0\x9D\x9D");
        assert_eq!(None, ch);
        assert_eq!(3, size);

        let (ch, size) = utf8::decode(b"\xF0\x9D\x9D\xF0");
        assert_eq!(None, ch);
        assert_eq!(3, size);

        let (ch, size) = utf8::decode(b"\xF0\x82\x82\xAC");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode(b"\xED\xA0\x80");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode(b"\xCEa");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode(b"\xE2\x98a");
        assert_eq!(None, ch);
        assert_eq!(2, size);

        let (ch, size) = utf8::decode(b"\xF0\x9D\x9Ca");
        assert_eq!(None, ch);
        assert_eq!(3, size);
    }

    /// Lossy forward decoding of invalid input yields U+FFFD plus the number
    /// of bytes consumed.
    #[test]
    fn decode_lossy() {
        let (ch, size) = utf8::decode_lossy(b"");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(0, size);

        let (ch, size) = utf8::decode_lossy(b"\xFF");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_lossy(b"\xCE\xF0");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_lossy(b"\xE2\x98\xF0");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(2, size);

        let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9D\xF0");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(3, size);

        let (ch, size) = utf8::decode_lossy(b"\xF0\x82\x82\xAC");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_lossy(b"\xED\xA0\x80");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_lossy(b"\xCEa");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_lossy(b"\xE2\x98a");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(2, size);

        let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9Ca");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(3, size);
    }

    /// Reverse decoding of valid strings yields their chars back to front.
    #[test]
    fn decode_last_valid() {
        /// Repeatedly decodes the last codepoint of `s` until it is empty.
        fn d(mut s: &str) -> Vec<char> {
            let mut chars = vec![];
            while !s.is_empty() {
                let (ch, size) = utf8::decode_last(s.as_bytes());
                s = &s[..s.len() - size];
                chars.push(ch.unwrap());
            }
            chars
        }

        assert_eq!(vec!['☃'], d("☃"));
        assert_eq!(vec!['☃', '☃'], d("☃☃"));
        assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε"));
        assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇"));
        // Supplementary-plane codepoints (U+1D5EE..=U+1D5F2), which take four
        // bytes each in UTF-8. Spelled as escapes to keep the source robust
        // against encoding damage.
        assert_eq!(
            vec![
                '\u{1D5F2}',
                '\u{1D5F1}',
                '\u{1D5F0}',
                '\u{1D5EF}',
                '\u{1D5EE}',
            ],
            d("\u{1D5EE}\u{1D5EF}\u{1D5F0}\u{1D5F1}\u{1D5F2}")
        );
    }

    /// Reverse decoding of invalid input yields `None` plus the number of
    /// trailing bytes consumed.
    #[test]
    fn decode_last_invalid() {
        let (ch, size) = utf8::decode_last(b"");
        assert_eq!(None, ch);
        assert_eq!(0, size);

        let (ch, size) = utf8::decode_last(b"\xFF");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last(b"\xCE\xF0");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last(b"\xCE");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last(b"\xE2\x98\xF0");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last(b"\xE2\x98");
        assert_eq!(None, ch);
        assert_eq!(2, size);

        let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D\xF0");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D");
        assert_eq!(None, ch);
        assert_eq!(3, size);

        let (ch, size) = utf8::decode_last(b"\xF0\x82\x82\xAC");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last(b"\xED\xA0\x80");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last(b"\xED\xA0");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last(b"\xED");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last(b"a\xCE");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last(b"a\xE2\x98");
        assert_eq!(None, ch);
        assert_eq!(2, size);

        let (ch, size) = utf8::decode_last(b"a\xF0\x9D\x9C");
        assert_eq!(None, ch);
        assert_eq!(3, size);
    }

    /// Lossy reverse decoding of invalid input yields U+FFFD plus the number
    /// of trailing bytes consumed.
    #[test]
    fn decode_last_lossy() {
        let (ch, size) = utf8::decode_last_lossy(b"");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(0, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xFF");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xCE\xF0");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xCE");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98\xF0");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(2, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D\xF0");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(3, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x82\x82\xAC");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0\x80");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last_lossy(b"\xED");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last_lossy(b"a\xCE");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = utf8::decode_last_lossy(b"a\xE2\x98");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(2, size);

        let (ch, size) = utf8::decode_last_lossy(b"a\xF0\x9D\x9C");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(3, size);
    }

    /// `chars` and `char_indices`, iterated forward and in reverse, must
    /// agree with every entry of the shared `LOSSY_TESTS` table.
    #[test]
    fn chars() {
        for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
            let got: String = B(input).chars().collect();
            assert_eq!(
                expected, got,
                "chars(ith: {:?}, given: {:?})",
                i, input,
            );
            let got: String =
                B(input).char_indices().map(|(_, _, ch)| ch).collect();
            assert_eq!(
                expected, got,
                "char_indices(ith: {:?}, given: {:?})",
                i, input,
            );

            // Reverse iteration must produce the expected chars back to
            // front.
            let expected: String = expected.chars().rev().collect();

            let got: String = B(input).chars().rev().collect();
            assert_eq!(
                expected, got,
                "chars.rev(ith: {:?}, given: {:?})",
                i, input,
            );
            let got: String =
                B(input).char_indices().rev().map(|(_, _, ch)| ch).collect();
            assert_eq!(
                expected, got,
                "char_indices.rev(ith: {:?}, given: {:?})",
                i, input,
            );
        }
    }

    /// `Utf8Chunks` must split input into valid/invalid pairs and set
    /// `incomplete` only for a truncated sequence at the end of input.
    #[test]
    fn utf8_chunks() {
        let mut c = utf8::Utf8Chunks { bytes: b"123\xC0" };
        assert_eq!(
            (c.next(), c.next()),
            (
                Some(utf8::Utf8Chunk {
                    valid: "123",
                    invalid: b"\xC0".as_bstr(),
                    incomplete: false,
                }),
                None,
            )
        );

        let mut c = utf8::Utf8Chunks { bytes: b"123\xFF\xFF" };
        assert_eq!(
            (c.next(), c.next(), c.next()),
            (
                Some(utf8::Utf8Chunk {
                    valid: "123",
                    invalid: b"\xFF".as_bstr(),
                    incomplete: false,
                }),
                Some(utf8::Utf8Chunk {
                    valid: "",
                    invalid: b"\xFF".as_bstr(),
                    incomplete: false,
                }),
                None,
            )
        );

        let mut c = utf8::Utf8Chunks { bytes: b"123\xD0" };
        assert_eq!(
            (c.next(), c.next()),
            (
                Some(utf8::Utf8Chunk {
                    valid: "123",
                    invalid: b"\xD0".as_bstr(),
                    incomplete: true,
                }),
                None,
            )
        );

        let mut c = utf8::Utf8Chunks { bytes: b"123\xD0456" };
        assert_eq!(
            (c.next(), c.next(), c.next()),
            (
                Some(utf8::Utf8Chunk {
                    valid: "123",
                    invalid: b"\xD0".as_bstr(),
                    incomplete: false,
                }),
                Some(utf8::Utf8Chunk {
                    valid: "456",
                    invalid: b"".as_bstr(),
                    incomplete: false,
                }),
                None,
            )
        );

        let mut c = utf8::Utf8Chunks { bytes: b"123\xE2\x98" };
        assert_eq!(
            (c.next(), c.next()),
            (
                Some(utf8::Utf8Chunk {
                    valid: "123",
                    invalid: b"\xE2\x98".as_bstr(),
                    incomplete: true,
                }),
                None,
            )
        );

        let mut c = utf8::Utf8Chunks { bytes: b"123\xF4\x8F\xBF" };
        assert_eq!(
            (c.next(), c.next()),
            (
                Some(utf8::Utf8Chunk {
                    valid: "123",
                    invalid: b"\xF4\x8F\xBF".as_bstr(),
                    incomplete: true,
                }),
                None,
            )
        );
    }
}
1370