1 extern crate utf8;
2 
3 use std::borrow::Cow;
4 use std::collections::VecDeque;
5 use std::io;
6 use utf8::*;
7 
8 /// A re-implementation of std::str::from_utf8
str_from_utf8(input: &[u8]) -> Result<&str, usize>9 pub fn str_from_utf8(input: &[u8]) -> Result<&str, usize> {
10     match decode(input) {
11         Ok(s) => return Ok(s),
12         Err(DecodeError::Invalid { valid_prefix, .. }) |
13         Err(DecodeError::Incomplete { valid_prefix, .. }) => Err(valid_prefix.len()),
14     }
15 }
16 
/// Spot-checks `str_from_utf8` on ASCII, multi-byte text and a bad byte.
#[test]
fn test_str_from_utf8() {
    // Plain ASCII comes back unchanged.
    assert_eq!(str_from_utf8(b"hello"), Ok("hello"));

    // Text mixing several scripts round-trips through its own bytes.
    let text = "ศไทย中华Việt Nam";
    assert_eq!(str_from_utf8(text.as_bytes()), Ok(text));

    // A trailing 0xFF byte makes the whole input an error.
    assert!(str_from_utf8(b"hello\xFF").is_err());
}
28 
/// Checks which byte sequences `str_from_utf8` accepts and which it rejects.
#[test]
fn test_is_utf8() {
    /// True when the bytes decode successfully.
    fn accepted(bytes: &[u8]) -> bool {
        str_from_utf8(bytes).is_ok()
    }

    /// True when the bytes are refused.
    fn rejected(bytes: &[u8]) -> bool {
        str_from_utf8(bytes).is_err()
    }

    // Chars of 1, 2, 3, and 4 bytes
    assert!(accepted("eé€\u{10000}".as_bytes()));

    // invalid prefix
    assert!(rejected(&[0x80]));

    // invalid 2 byte prefix
    assert!(rejected(&[0xc0]));
    assert!(rejected(&[0xc0, 0x10]));

    // invalid 3 byte prefix
    assert!(rejected(&[0xe0]));
    assert!(rejected(&[0xe0, 0x10]));
    assert!(rejected(&[0xe0, 0xff, 0x10]));

    // invalid 4 byte prefix
    assert!(rejected(&[0xf0]));
    assert!(rejected(&[0xf0, 0x10]));
    assert!(rejected(&[0xf0, 0xff, 0x10]));
    assert!(rejected(&[0xf0, 0xff, 0xff, 0x10]));

    // deny overlong encodings
    // (the final sequence would instead encode a value above U+10FFFF)
    assert!(rejected(&[0xc0, 0x80]));
    assert!(rejected(&[0xc0, 0xae]));
    assert!(rejected(&[0xe0, 0x80, 0x80]));
    assert!(rejected(&[0xe0, 0x80, 0xaf]));
    assert!(rejected(&[0xe0, 0x81, 0x81]));
    assert!(rejected(&[0xf0, 0x82, 0x82, 0xac]));
    assert!(rejected(&[0xf4, 0x90, 0x80, 0x80]));

    // deny surrogates
    assert!(rejected(&[0xED, 0xA0, 0x80]));
    assert!(rejected(&[0xED, 0xBF, 0xBF]));

    // sequences that must be accepted
    assert!(accepted(&[0xC2, 0x80]));
    assert!(accepted(&[0xDF, 0xBF]));
    assert!(accepted(&[0xE0, 0xA0, 0x80]));
    assert!(accepted(&[0xED, 0x9F, 0xBF]));
    assert!(accepted(&[0xEE, 0x80, 0x80]));
    assert!(accepted(&[0xEF, 0xBF, 0xBF]));
    assert!(accepted(&[0xF0, 0x90, 0x80, 0x80]));
    assert!(accepted(&[0xF4, 0x8F, 0xBF, 0xBF]));
}
70 
71 /// A re-implementation of String::from_utf8_lossy
string_from_utf8_lossy(input: &[u8]) -> Cow<str>72 pub fn string_from_utf8_lossy(input: &[u8]) -> Cow<str> {
73     let mut result = decode(input);
74     if let Ok(s) = result {
75         return s.into()
76     }
77     let mut string = String::with_capacity(input.len() + REPLACEMENT_CHARACTER.len());
78     loop {
79         match result {
80             Ok(s) => {
81                 string.push_str(s);
82                 return string.into()
83             }
84             Err(DecodeError::Incomplete { valid_prefix, .. }) => {
85                 string.push_str(valid_prefix);
86                 string.push_str(REPLACEMENT_CHARACTER);
87                 return string.into()
88             }
89             Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => {
90                 string.push_str(valid_prefix);
91                 string.push_str(REPLACEMENT_CHARACTER);
92                 result = decode(remaining_input);
93             }
94         }
95     }
96 }
97 
/// Table of `(raw input bytes, expected lossy decoding)` pairs.
///
/// Shared by the lossy-decoding tests in this file: each input is expected to
/// decode to the paired string, with U+FFFD standing in for whatever could
/// not be decoded.
pub const DECODED_LOSSY: &'static [(&'static [u8], &'static str)] = &[
    // Inputs that decode without any replacement.
    (b"hello", "hello"),
    (b"\xe0\xb8\xa8\xe0\xb9\x84\xe0\xb8\x97\xe0\xb8\xa2\xe4\xb8\xad\xe5\x8d\x8e", "ศไทย中华"),
    (b"Vi\xe1\xbb\x87t Nam", "Việt Nam"),
    // Stray bytes in the middle of otherwise valid text.
    (b"Hello\xC2 There\xFF ", "Hello\u{FFFD} There\u{FFFD} "),
    // Here each of the two bad bytes gets its own replacement character.
    (b"Hello\xC0\x80 There", "Hello\u{FFFD}\u{FFFD} There"),
    // A two-byte run collapses into a single replacement character.
    (b"\xE6\x83 Goodbye", "\u{FFFD} Goodbye"),
    // Four-byte lead bytes (0xF0..0xF5) followed by various continuations.
    (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"),
    (b"\xF5foo\xF5\xC2", "\u{FFFD}foo\u{FFFD}\u{FFFD}"),
    (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"),
    (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"),
    (b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar", "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar"),
    (b"\xF0\x90\x80foo", "\u{FFFD}foo"),
    // surrogates
    (b"\xED\xA0\x80foo\xED\xBF\xBFbar", "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar"),
];
114 
/// Runs every `DECODED_LOSSY` case through `string_from_utf8_lossy`.
#[test]
fn test_string_from_utf8_lossy() {
    for case in DECODED_LOSSY.iter() {
        let (bytes, lossy) = *case;
        assert_eq!(string_from_utf8_lossy(bytes), lossy);
    }
}
121 
/// Calls `f` once for every way of cutting `input` into consecutive,
/// non-empty chunks.
///
/// The chunks passed to `f` always concatenate back to `input`. For an empty
/// `input`, `f` is called exactly once with an empty chunk list. Partitions
/// are visited with shorter leading chunks first.
pub fn all_partitions<'a, F>(input: &'a [u8], f: F)
    where F: Fn(&[&[u8]])
{
    /// Extends the partial partition in `prefix` with every possible split of
    /// `rest`, reporting each completed partition to `f`.
    fn extend_partition<'a, F>(prefix: &mut Vec<&'a [u8]>, rest: &'a [u8], f: &F)
        where F: Fn(&[&[u8]])
    {
        if rest.is_empty() {
            // Nothing left to split: `prefix` is a complete partition.
            f(&prefix[..]);
            return;
        }
        for shorter_by_one in 0..rest.len() {
            let (head, tail) = rest.split_at(shorter_by_one + 1);
            prefix.push(head);
            extend_partition(prefix, tail, f);
            prefix.pop();
        }
    }

    let mut prefix = Vec::new();
    extend_partition(&mut prefix, input, &f);
    // Every push above is paired with a pop, so the stack must be empty again.
    assert_eq!(prefix.len(), 0);
}
143 
/// Feeds every `DECODED_LOSSY` input to a `LossyDecoder` in every possible
/// chunking and checks that the output never depends on where the cuts fall.
#[test]
fn test_incremental_decoder() {
    for &(bytes, lossy) in DECODED_LOSSY.iter() {
        all_partitions(bytes, |pieces| {
            let mut decoded = String::new();
            {
                // The decoder lives in its own scope so that it is dropped,
                // and its borrow of `decoded` released, before the assertion.
                // NOTE(review): dropping presumably also flushes a pending
                // incomplete sequence; confirm against the utf8 crate.
                let mut decoder = LossyDecoder::new(|fragment| decoded.push_str(fragment));
                for piece in pieces.iter() {
                    decoder.feed(*piece);
                }
            }
            assert_eq!(decoded, lossy);
        });
    }
}
159 
/// Reads every `DECODED_LOSSY` input through `BufReadDecoder`, with the bytes
/// served by a `Chunks` reader in every possible chunking.
#[test]
fn test_bufread_decoder() {
    for &(bytes, lossy) in DECODED_LOSSY.iter() {
        all_partitions(bytes, |pieces| {
            // Queue the chunks in order; `Chunks` hands them out one at a time.
            let queue: VecDeque<&[u8]> = pieces.iter().cloned().collect();
            let decoded = BufReadDecoder::read_to_string_lossy(Chunks(queue)).unwrap();
            assert_eq!(decoded, lossy)
        });
    }
}
170 
/// A test-only buffered reader that serves a queue of byte chunks one chunk
/// at a time, so a consumer sees its input split at chosen boundaries.
struct Chunks<'a>(VecDeque<&'a [u8]>);

impl<'a> io::Read for Chunks<'a> {
    /// Deliberately left as a stub: `io::BufRead` requires `io::Read`, but
    /// this reader is only meant to be driven through `fill_buf`/`consume`.
    ///
    /// # Panics
    ///
    /// Always panics if called.
    fn read(&mut self, _: &mut [u8]) -> io::Result<usize> {
        unimplemented!()
    }
}

impl<'a> io::BufRead for Chunks<'a> {
    /// Returns the unread part of the current chunk.
    ///
    /// An empty buffer is only returned once all chunks are exhausted (or if
    /// the queue was empty to begin with), since `BufRead` callers treat an
    /// empty buffer as end of input.
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        // Skip empty chunks that still have data queued behind them;
        // returning one would look like a premature end of input.
        while self.0.len() > 1 && self.0.front().map_or(false, |chunk| chunk.is_empty()) {
            self.0.pop_front();
        }
        match self.0.front() {
            Some(chunk) => Ok(*chunk),
            // No chunks at all: report end of input instead of panicking.
            None => Ok(&b""[..]),
        }
    }

    /// Marks `bytes` bytes of the current chunk as read, moving on to the
    /// next chunk once the current one is used up.
    ///
    /// # Panics
    ///
    /// Panics if `bytes` exceeds what the current chunk still holds.
    fn consume(&mut self, bytes: usize) {
        {
            let front = match self.0.front_mut() {
                Some(front) => front,
                None => {
                    // Nothing was offered by `fill_buf`, so only a zero-length
                    // consume is legitimate here.
                    assert_eq!(bytes, 0, "consume({}) called on an exhausted Chunks reader", bytes);
                    return
                }
            };
            *front = &front[bytes..];
            if !front.is_empty() {
                return
            }
        }
        // Keep the final (now empty) chunk in place so that later `fill_buf`
        // calls keep reporting end of input.
        if self.0.len() > 1 {
            self.0.pop_front();
        }
    }
}
198