1 mod lossy;
2 mod read;
3 
4 pub use lossy::LossyDecoder;
5 pub use read::{BufReadDecoder, BufReadDecoderError};
6 
7 use std::cmp;
8 use std::error::Error;
9 use std::fmt;
10 use std::str;
11 
/// The replacement character, U+FFFD, as a string slice.
///
/// In lossy decoding, insert it once for every decoding error.
// The `'static` lifetime is implied for references in `const` items, so it is not spelled out.
pub const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
14 
/// The reason `decode()` could not return its whole input as a `&str`.
///
/// Both variants carry `valid_prefix`, the part of the input before the
/// problem that was successfully decoded, so callers never lose valid text.
#[derive(Debug, Copy, Clone)]
pub enum DecodeError<'a> {
    /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
    /// then call `decode()` again with `remaining_input`.
    Invalid {
        /// The input that precedes the invalid sequence, as decoded text.
        valid_prefix: &'a str,
        /// The bytes that form the invalid sequence, directly after `valid_prefix`.
        invalid_sequence: &'a [u8],
        /// Everything after `invalid_sequence`; it has not been examined yet.
        remaining_input: &'a [u8],
    },

    /// Call the `incomplete_suffix.try_complete` method with more input when available.
    /// If no more input is available, this is an invalid byte sequence.
    Incomplete {
        /// The input that precedes the unfinished sequence, as decoded text.
        valid_prefix: &'a str,
        /// An owned copy of the trailing bytes that follow `valid_prefix`.
        incomplete_suffix: Incomplete,
    },
}
32 
33 impl<'a> fmt::Display for DecodeError<'a> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result34     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
35         match *self {
36             DecodeError::Invalid {
37                 valid_prefix,
38                 invalid_sequence,
39                 remaining_input,
40             } => write!(
41                 f,
42                 "found invalid byte sequence {invalid_sequence:02x?} after \
43                  {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \
44                  unprocessed bytes",
45                 invalid_sequence = invalid_sequence,
46                 valid_byte_count = valid_prefix.len(),
47                 unprocessed_byte_count = remaining_input.len()
48             ),
49             DecodeError::Incomplete {
50                 valid_prefix,
51                 incomplete_suffix,
52             } => write!(
53                 f,
54                 "found incomplete byte sequence {incomplete_suffix:02x?} after \
55                  {valid_byte_count} bytes",
56                 incomplete_suffix = incomplete_suffix,
57                 valid_byte_count = valid_prefix.len()
58             ),
59         }
60     }
61 }
62 
// Marks `DecodeError` as a standard error type. No method is overridden:
// the message comes from the `Display` impl and there is no underlying source.
impl<'a> Error for DecodeError<'a> {}
64 
/// Owned storage for the leading bytes of a sequence that was cut off by the
/// end of the input, to be completed later with `try_complete`.
///
/// The methods of this type index `buffer` with `buffer_len`, so they panic
/// if `buffer_len` is set to a value greater than 4.
#[derive(Debug, Copy, Clone)]
pub struct Incomplete {
    /// Byte storage; only the first `buffer_len` bytes are meaningful.
    pub buffer: [u8; 4],
    /// Number of bytes of `buffer` currently in use.
    pub buffer_len: u8,
}
70 
decode(input: &[u8]) -> Result<&str, DecodeError>71 pub fn decode(input: &[u8]) -> Result<&str, DecodeError> {
72     let error = match str::from_utf8(input) {
73         Ok(valid) => return Ok(valid),
74         Err(error) => error,
75     };
76 
77     // FIXME: separate function from here to guide inlining?
78     let (valid, after_valid) = input.split_at(error.valid_up_to());
79     let valid = unsafe {
80         str::from_utf8_unchecked(valid)
81     };
82 
83     match error.error_len() {
84         Some(invalid_sequence_length) => {
85             let (invalid, rest) = after_valid.split_at(invalid_sequence_length);
86             Err(DecodeError::Invalid {
87                 valid_prefix: valid,
88                 invalid_sequence: invalid,
89                 remaining_input: rest
90             })
91         }
92         None => {
93             Err(DecodeError::Incomplete {
94                 valid_prefix: valid,
95                 incomplete_suffix: Incomplete::new(after_valid),
96             })
97         }
98     }
99 }
100 
101 impl Incomplete {
empty() -> Self102     pub fn empty() -> Self {
103         Incomplete {
104             buffer: [0, 0, 0, 0],
105             buffer_len: 0,
106         }
107     }
108 
is_empty(&self) -> bool109     pub fn is_empty(&self) -> bool {
110         self.buffer_len == 0
111     }
112 
new(bytes: &[u8]) -> Self113     pub fn new(bytes: &[u8]) -> Self {
114         let mut buffer = [0, 0, 0, 0];
115         let len = bytes.len();
116         buffer[..len].copy_from_slice(bytes);
117         Incomplete {
118             buffer: buffer,
119             buffer_len: len as u8,
120         }
121     }
122 
123     /// * `None`: still incomplete, call `try_complete` again with more input.
124     ///   If no more input is available, this is invalid byte sequence.
125     /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`.
126     ///   To keep decoding, pass `remaining_input` to `decode()`.
try_complete<'input>(&mut self, input: &'input [u8]) -> Option<(Result<&str, &[u8]>, &'input [u8])>127     pub fn try_complete<'input>(&mut self, input: &'input [u8])
128                                 -> Option<(Result<&str, &[u8]>, &'input [u8])> {
129         let (consumed, opt_result) = self.try_complete_offsets(input);
130         let result = opt_result?;
131         let remaining_input = &input[consumed..];
132         let result_bytes = self.take_buffer();
133         let result = match result {
134             Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }),
135             Err(()) => Err(result_bytes),
136         };
137         Some((result, remaining_input))
138     }
139 
take_buffer(&mut self) -> &[u8]140     fn take_buffer(&mut self) -> &[u8] {
141         let len = self.buffer_len as usize;
142         self.buffer_len = 0;
143         &self.buffer[..len as usize]
144     }
145 
146     /// (consumed_from_input, None): not enough input
147     /// (consumed_from_input, Some(Err(()))): error bytes in buffer
148     /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer
try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>)149     fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
150         let initial_buffer_len = self.buffer_len as usize;
151         let copied_from_input;
152         {
153             let unwritten = &mut self.buffer[initial_buffer_len..];
154             copied_from_input = cmp::min(unwritten.len(), input.len());
155             unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
156         }
157         let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
158         match str::from_utf8(spliced) {
159             Ok(_) => {
160                 self.buffer_len = spliced.len() as u8;
161                 (copied_from_input, Some(Ok(())))
162             }
163             Err(error) => {
164                 let valid_up_to = error.valid_up_to();
165                 if valid_up_to > 0 {
166                     let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
167                     self.buffer_len = valid_up_to as u8;
168                     (consumed, Some(Ok(())))
169                 } else {
170                     match error.error_len() {
171                         Some(invalid_sequence_length) => {
172                             let consumed = invalid_sequence_length
173                                 .checked_sub(initial_buffer_len).unwrap();
174                             self.buffer_len = invalid_sequence_length as u8;
175                             (consumed, Some(Err(())))
176                         }
177                         None => {
178                             self.buffer_len = spliced.len() as u8;
179                             (copied_from_input, None)
180                         }
181                     }
182                 }
183             }
184         }
185     }
186 }
187