1 //! Utility functions for the UCS-2 character encoding.
2 
3 #![no_std]
4 #![deny(missing_docs)]
5 #![deny(clippy::all)]
6 
7 mod macros;
8 
9 /// These need to be public for the `ucs2_cstr!` macro, but are not
10 /// intended to be called directly.
11 #[doc(hidden)]
12 pub use macros::{str_num_ucs2_chars, str_to_ucs2};
13 
14 use bit_field::BitField;
15 use core::fmt::{self, Display, Formatter};
16 
/// Possible errors returned by the API.
///
/// The variant order is part of the derived `Ord`/`PartialOrd`
/// implementations, so new variants must only ever be appended.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum Error {
    /// Not enough space left in the output buffer to hold the converted data.
    BufferOverflow,
    /// Input contained a character which cannot be represented in UCS-2,
    /// i.e. one that does not fit into a single 16-bit code unit.
    MultiByte,
}
25 
26 impl Display for Error {
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result27     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
28         match self {
29             Self::BufferOverflow => f.write_str("output buffer is too small"),
30             Self::MultiByte => {
31                 f.write_str("input contains a character which cannot be represented in UCS-2")
32             }
33         }
34     }
35 }
36 
37 type Result<T> = core::result::Result<T, Error>;
38 
/// Value returned by `ucs2_from_utf8_at_offset`.
///
/// Pairs the decoded code unit with the length of its UTF-8 form so the
/// caller knows how far to advance in the input.
struct Ucs2CharFromUtf8 {
    /// UCS-2 character.
    val: u16,
    /// Number of bytes needed to encode the character in UTF-8 (1 to 3).
    num_bytes: u8,
}
46 
47 /// Get a UCS-2 character from a UTF-8 byte slice at the given offset.
48 ///
49 /// # Safety
50 ///
51 /// The input `bytes` must be valid UTF-8.
ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result<Ucs2CharFromUtf8>52 const unsafe fn ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result<Ucs2CharFromUtf8> {
53     let len = bytes.len();
54     let ch;
55     let ch_len;
56 
57     if bytes[offset] & 0b1000_0000 == 0b0000_0000 {
58         ch = bytes[offset] as u16;
59         ch_len = 1;
60     } else if bytes[offset] & 0b1110_0000 == 0b1100_0000 {
61         // 2 byte codepoint
62         if offset + 1 >= len {
63             // safe: len is the length of bytes,
64             // and bytes is a direct view into the
65             // buffer of input, which in order to be a valid
66             // utf-8 string _must_ contain `i + 1`.
67             unsafe { core::hint::unreachable_unchecked() }
68         }
69 
70         let a = (bytes[offset] & 0b0001_1111) as u16;
71         let b = (bytes[offset + 1] & 0b0011_1111) as u16;
72         ch = a << 6 | b;
73         ch_len = 2;
74     } else if bytes[offset] & 0b1111_0000 == 0b1110_0000 {
75         // 3 byte codepoint
76         if offset + 2 >= len || offset + 1 >= len {
77             // safe: impossible utf-8 string.
78             unsafe { core::hint::unreachable_unchecked() }
79         }
80 
81         let a = (bytes[offset] & 0b0000_1111) as u16;
82         let b = (bytes[offset + 1] & 0b0011_1111) as u16;
83         let c = (bytes[offset + 2] & 0b0011_1111) as u16;
84         ch = a << 12 | b << 6 | c;
85         ch_len = 3;
86     } else if bytes[offset] & 0b1111_0000 == 0b1111_0000 {
87         return Err(Error::MultiByte); // UTF-16
88     } else {
89         // safe: impossible utf-8 string.
90         unsafe { core::hint::unreachable_unchecked() }
91     }
92 
93     Ok(Ucs2CharFromUtf8 {
94         val: ch,
95         num_bytes: ch_len,
96     })
97 }
98 
99 /// Encodes an input UTF-8 string into a UCS-2 string.
100 ///
101 /// The returned `usize` represents the length of the returned buffer,
102 /// measured in 2-byte characters.
encode(input: &str, buffer: &mut [u16]) -> Result<usize>103 pub fn encode(input: &str, buffer: &mut [u16]) -> Result<usize> {
104     let buffer_size = buffer.len();
105     let mut i = 0;
106 
107     encode_with(input, |ch| {
108         if i >= buffer_size {
109             Err(Error::BufferOverflow)
110         } else {
111             buffer[i] = ch;
112             i += 1;
113             Ok(())
114         }
115     })?;
116 
117     Ok(i)
118 }
119 
120 /// Encode UTF-8 string to UCS-2 with a custom callback function.
121 ///
122 /// `output` is a function which receives every encoded character.
encode_with<F>(input: &str, mut output: F) -> Result<()> where F: FnMut(u16) -> Result<()>,123 pub fn encode_with<F>(input: &str, mut output: F) -> Result<()>
124 where
125     F: FnMut(u16) -> Result<()>,
126 {
127     let bytes = input.as_bytes();
128     let len = bytes.len();
129     let mut i = 0;
130 
131     while i < len {
132         // SAFETY: `bytes` is valid UTF-8.
133         let ch = unsafe { ucs2_from_utf8_at_offset(bytes, i) }?;
134         i += usize::from(ch.num_bytes);
135         output(ch.val)?;
136     }
137     Ok(())
138 }
139 
140 /// Decode UCS-2 string to UTF-8 with a custom callback function.
141 ///
142 /// `output` is a function which receives every decoded character.
143 /// Due to the nature of UCS-2, the function can receive an UTF-8 character
144 /// of up to three bytes, for every input character.
decode_with<F>(input: &[u16], mut output: F) -> Result<usize> where F: FnMut(&[u8]) -> Result<()>,145 pub fn decode_with<F>(input: &[u16], mut output: F) -> Result<usize>
146 where
147     F: FnMut(&[u8]) -> Result<()>,
148 {
149     let mut written = 0;
150 
151     for ch in input.iter() {
152         /*
153          * We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode
154          * the Basic Multilingual Plane, a maximum of three bytes are needed.
155          */
156         if (0x000..0x0080).contains(ch) {
157             output(&[*ch as u8])?;
158 
159             written += 1;
160         } else if (0x0080..0x0800).contains(ch) {
161             let first = 0b1100_0000 + ch.get_bits(6..11) as u8;
162             let last = 0b1000_0000 + ch.get_bits(0..6) as u8;
163 
164             output(&[first, last])?;
165 
166             written += 2;
167         } else {
168             let first = 0b1110_0000 + ch.get_bits(12..16) as u8;
169             let mid = 0b1000_0000 + ch.get_bits(6..12) as u8;
170             let last = 0b1000_0000 + ch.get_bits(0..6) as u8;
171 
172             output(&[first, mid, last])?;
173 
174             written += 3;
175         }
176     }
177 
178     Ok(written)
179 }
180 
181 /// Decode an input UCS-2 string into a UTF-8 string.
182 ///
183 /// The returned `usize` represents the length of the returned buffer,
184 /// in bytes. Due to the nature of UCS-2, the output buffer could end up with
185 /// three bytes for every character in the input buffer.
decode(input: &[u16], output: &mut [u8]) -> Result<usize>186 pub fn decode(input: &[u16], output: &mut [u8]) -> Result<usize> {
187     let buffer_size = output.len();
188     let mut i = 0;
189 
190     decode_with(input, |bytes| {
191         if bytes.len() == 1 {
192             // Can be encoded in a single byte
193             if i >= buffer_size {
194                 return Err(Error::BufferOverflow);
195             }
196 
197             output[i] = bytes[0];
198 
199             i += 1;
200         } else if bytes.len() == 2 {
201             // Can be encoded two bytes
202             if i + 1 >= buffer_size {
203                 return Err(Error::BufferOverflow);
204             }
205 
206             output[i] = bytes[0];
207             output[i + 1] = bytes[1];
208 
209             i += 2;
210         } else if bytes.len() == 3 {
211             // Can be encoded three bytes
212             if i + 2 >= buffer_size {
213                 return Err(Error::BufferOverflow);
214             }
215 
216             output[i] = bytes[0];
217             output[i + 1] = bytes[1];
218             output[i + 2] = bytes[2];
219 
220             i += 3;
221         } else {
222             unreachable!("More than three bytes per UCS-2 character.");
223         }
224 
225         Ok(())
226     })
227 }
228