1 extern crate utf8;
2
3 use std::borrow::Cow;
4 use std::collections::VecDeque;
5 use std::io;
6 use utf8::*;
7
8 /// A re-implementation of std::str::from_utf8
str_from_utf8(input: &[u8]) -> Result<&str, usize>9 pub fn str_from_utf8(input: &[u8]) -> Result<&str, usize> {
10 match decode(input) {
11 Ok(s) => return Ok(s),
12 Err(DecodeError::Invalid { valid_prefix, .. }) |
13 Err(DecodeError::Incomplete { valid_prefix, .. }) => Err(valid_prefix.len()),
14 }
15 }
16
#[test]
fn test_str_from_utf8() {
    // Plain ASCII decodes to itself.
    assert_eq!(str_from_utf8(b"hello"), Ok("hello"));

    // Multi-byte text survives a round trip through its own bytes.
    let text = "ศไทย中华Việt Nam";
    assert_eq!(str_from_utf8(text.as_bytes()), Ok(text));

    // 0xFF never appears in UTF-8. The error must report the five valid
    // bytes of "hello" that precede it, not merely that decoding failed.
    assert_eq!(str_from_utf8(b"hello\xFF"), Err(5));
}
28
#[test]
fn test_is_utf8() {
    // Chars of 1, 2, 3, and 4 bytes
    assert!(str_from_utf8("eé€\u{10000}".as_bytes()).is_ok());

    // Byte sequences that must be refused.
    let rejected: &[&[u8]] = &[
        // invalid prefix
        &[0x80],
        // invalid 2 byte prefix
        &[0xc0],
        &[0xc0, 0x10],
        // invalid 3 byte prefix
        &[0xe0],
        &[0xe0, 0x10],
        &[0xe0, 0xff, 0x10],
        // invalid 4 byte prefix
        &[0xf0],
        &[0xf0, 0x10],
        &[0xf0, 0xff, 0x10],
        &[0xf0, 0xff, 0xff, 0x10],
        // deny overlong encodings
        &[0xc0, 0x80],
        &[0xc0, 0xae],
        &[0xe0, 0x80, 0x80],
        &[0xe0, 0x80, 0xaf],
        &[0xe0, 0x81, 0x81],
        &[0xf0, 0x82, 0x82, 0xac],
        // deny code points above U+10FFFF (this one would be U+110000)
        &[0xf4, 0x90, 0x80, 0x80],
        // deny surrogates
        &[0xED, 0xA0, 0x80],
        &[0xED, 0xBF, 0xBF],
    ];
    for &bytes in rejected.iter() {
        assert!(str_from_utf8(bytes).is_err(), "{:?} should be rejected", bytes);
    }

    // Boundary values of each encoded length that must be accepted.
    let accepted: &[&[u8]] = &[
        &[0xC2, 0x80],
        &[0xDF, 0xBF],
        &[0xE0, 0xA0, 0x80],
        &[0xED, 0x9F, 0xBF],
        &[0xEE, 0x80, 0x80],
        &[0xEF, 0xBF, 0xBF],
        &[0xF0, 0x90, 0x80, 0x80],
        &[0xF4, 0x8F, 0xBF, 0xBF],
    ];
    for &bytes in accepted.iter() {
        assert!(str_from_utf8(bytes).is_ok(), "{:?} should be accepted", bytes);
    }
}
70
71 /// A re-implementation of String::from_utf8_lossy
string_from_utf8_lossy(input: &[u8]) -> Cow<str>72 pub fn string_from_utf8_lossy(input: &[u8]) -> Cow<str> {
73 let mut result = decode(input);
74 if let Ok(s) = result {
75 return s.into()
76 }
77 let mut string = String::with_capacity(input.len() + REPLACEMENT_CHARACTER.len());
78 loop {
79 match result {
80 Ok(s) => {
81 string.push_str(s);
82 return string.into()
83 }
84 Err(DecodeError::Incomplete { valid_prefix, .. }) => {
85 string.push_str(valid_prefix);
86 string.push_str(REPLACEMENT_CHARACTER);
87 return string.into()
88 }
89 Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => {
90 string.push_str(valid_prefix);
91 string.push_str(REPLACEMENT_CHARACTER);
92 result = decode(remaining_input);
93 }
94 }
95 }
96 }
97
/// Test vectors shared by the lossy-decoding tests.
///
/// Each entry pairs raw input bytes with the string expected after lossy
/// decoding; `\u{FFFD}` appears in the expected text wherever the input is
/// not well-formed UTF-8.
pub const DECODED_LOSSY: &[(&[u8], &str)] = &[
    // Well-formed input passes through unchanged.
    (b"hello", "hello"),
    (b"\xe0\xb8\xa8\xe0\xb9\x84\xe0\xb8\x97\xe0\xb8\xa2\xe4\xb8\xad\xe5\x8d\x8e", "ศไทย中华"),
    (b"Vi\xe1\xbb\x87t Nam", "Việt Nam"),

    // Malformed input: valid text is kept, bad bytes become U+FFFD.
    (b"Hello\xC2 There\xFF ", "Hello\u{FFFD} There\u{FFFD} "),
    (b"Hello\xC0\x80 There", "Hello\u{FFFD}\u{FFFD} There"),
    (b"\xE6\x83 Goodbye", "\u{FFFD} Goodbye"),
    (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"),
    (b"\xF5foo\xF5\xC2", "\u{FFFD}foo\u{FFFD}\u{FFFD}"),
    (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"),
    (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"),
    (b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar", "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar"),
    (b"\xF0\x90\x80foo", "\u{FFFD}foo"),

    // surrogates
    (b"\xED\xA0\x80foo\xED\xBF\xBFbar", "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar"),
];
114
#[test]
fn test_string_from_utf8_lossy() {
    // Every shared vector must decode, in one shot, to its expected text.
    for case in DECODED_LOSSY.iter() {
        let (bytes, lossy) = *case;
        assert_eq!(string_from_utf8_lossy(bytes), lossy);
    }
}
121
/// Calls `f` once for every way of cutting `input` into consecutive,
/// non-empty chunks, passing the chunks in input order.
///
/// An input of `n > 0` bytes has `2^(n - 1)` such partitions (each gap
/// between two bytes is either a cut or not). An empty input produces a
/// single call with an empty chunk list.
///
/// `f` may be any `FnMut`, so a caller can accumulate into captured state
/// directly; closures that are `Fn` keep working unchanged.
pub fn all_partitions<'a, F>(input: &'a [u8], mut f: F)
    where F: FnMut(&[&[u8]])
{
    /// Extends the partition held in `chunks` with every possible split of
    /// `rest`, reporting each completed partition to `f`.
    fn visit<'a, F>(chunks: &mut Vec<&'a [u8]>, rest: &'a [u8], f: &mut F)
        where F: FnMut(&[&[u8]])
    {
        if rest.is_empty() {
            // Nothing left to cut: `chunks` is a complete partition.
            f(&chunks[..]);
            return;
        }
        for cut in 1..rest.len() + 1 {
            let (head, tail) = rest.split_at(cut);
            chunks.push(head);
            visit(chunks, tail, f);
            // Backtrack so the next cut position starts from the same prefix.
            chunks.pop();
        }
    }

    let mut chunks = Vec::new();
    visit(&mut chunks, input, &mut f);
    // Every push is paired with a pop, so the scratch vector ends up empty.
    assert_eq!(chunks.len(), 0);
}
143
#[test]
fn test_incremental_decoder() {
    // Feeding the input piecewise must give the same text as one-shot
    // decoding, no matter where the chunk boundaries fall.
    for &(input, expected) in DECODED_LOSSY {
        all_partitions(input, |chunks| {
            let mut decoded = String::new();
            {
                // Scoped so the decoder, which mutably borrows `decoded`
                // through its callback, is gone before the comparison below.
                // NOTE(review): presumably dropping it also flushes a pending
                // incomplete sequence — confirm against the utf8 crate docs.
                let mut decoder = LossyDecoder::new(|s| decoded.push_str(s));
                for &chunk in chunks.iter() {
                    decoder.feed(chunk);
                }
            }
            // Name the failing split; without it a failure cannot be reproduced.
            assert_eq!(decoded, expected, "input {:?} fed as chunks {:?}", input, chunks);
        });
    }
}
159
#[test]
fn test_bufread_decoder() {
    // Reading the input through a `BufRead` that yields one chunk at a time
    // must give the same text as one-shot decoding, for every chunking.
    for &(input, expected) in DECODED_LOSSY {
        all_partitions(input, |chunks| {
            let reader = Chunks(chunks.iter().cloned().collect());
            let decoded = BufReadDecoder::read_to_string_lossy(reader).unwrap();
            // Name the failing split; without it a failure cannot be reproduced.
            assert_eq!(decoded, expected, "input {:?} read as chunks {:?}", input, chunks);
        });
    }
}
170
/// A `BufRead` test double that serves a fixed queue of byte chunks, exposing
/// one chunk per `fill_buf` call so chunk boundaries reach the reader intact.
struct Chunks<'a>(VecDeque<&'a [u8]>);

impl<'a> io::Read for Chunks<'a> {
    /// Not implemented: calling it panics.
    ///
    /// NOTE(review): presumably left unimplemented on purpose so a test fails
    /// loudly if the code under test bypasses `BufRead` — confirm.
    fn read(&mut self, _: &mut [u8]) -> io::Result<usize> {
        unimplemented!()
    }
}

impl<'a> io::BufRead for Chunks<'a> {
    /// Returns the unconsumed part of the front chunk.
    ///
    /// An empty queue yields an empty slice (end of input) instead of
    /// panicking, so a reader built from zero chunks behaves like an empty
    /// stream.
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        let chunk: &[u8] = match self.0.front() {
            Some(chunk) => *chunk,
            None => &[],
        };
        Ok(chunk)
    }

    /// Drops `bytes` bytes from the front chunk and moves on to the next
    /// chunk once the front one is used up.
    ///
    /// The last chunk is kept even when empty, so later `fill_buf` calls keep
    /// returning an empty slice.
    ///
    /// # Panics
    ///
    /// Panics if `bytes` exceeds what the front chunk still holds, including
    /// any non-zero `bytes` on an empty queue.
    fn consume(&mut self, bytes: usize) {
        let drained = match self.0.front_mut() {
            Some(front) => {
                *front = &front[bytes..];
                front.is_empty()
            }
            None => {
                assert_eq!(bytes, 0, "consumed {} bytes from an empty reader", bytes);
                false
            }
        };
        if drained && self.0.len() > 1 {
            self.0.pop_front();
        }
    }
}
198