1 // Copyright 2018 Google LLC.
2 // Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
3
4 #include "src/base/SkUTF.h"
5
6 #include "include/private/base/SkTFitsIn.h"
7
// Left-shifts `value` by `shift` bits. The shift itself is carried out on the unsigned
// representation, and the resulting bit pattern is converted back to int32_t.
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return static_cast<int32_t>(static_cast<uint32_t>(value) << shift);
}
11
// True when the lowest bit of `x` is clear, i.e. `x` is a multiple of 2.
template <typename T>
static constexpr bool is_align2(T x) {
    return !(x & 1);
}
13
// True when the two lowest bits of `x` are clear, i.e. `x` is a multiple of 4.
template <typename T>
static constexpr bool is_align4(T x) {
    return !(x & 3);
}
15
// True when `c` lies in the high (leading) surrogate range 0xD800..0xDBFF.
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) {
    return c >= 0xD800 && c <= 0xDBFF;
}
17
// True when `c` lies in the low (trailing) surrogate range 0xDC00..0xDFFF.
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) {
    return c >= 0xDC00 && c <= 0xDFFF;
}
19
/** Classifies a single byte of a UTF-8 stream.

    @returns -1 iff invalid UTF8 byte,
              0 iff UTF8 continuation byte,
              1 iff ASCII byte,
              2 iff leading byte of 2-byte sequence,
              3 iff leading byte of 3-byte sequence, and
              4 iff leading byte of 4-byte sequence.
    I.e.: if return value > 0, then gives length of sequence.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC0) {
        return 0;
    }
    if (c >= 0xF5 || (c & 0xFE) == 0xC0) {  // "octet values c0, c1, f5 to ff never appear"
        return -1;
    }
    // Only 0xC2..0xF4 remain. Plain range comparisons are used here rather than a
    // packed-table shift on a signed constant, so the result does not depend on how
    // the compiler treats signed shifts.
    if (c < 0xE0) {
        return 2;
    }
    if (c < 0xF0) {
        return 3;
    }
    return 4;
}
// Interprets a utf8_byte_type() result: any value of 1 or more denotes a byte that may
// start a sequence (and is that sequence's length).
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
42
utf8_byte_is_continuation(uint8_t c)43 static bool utf8_byte_is_continuation(uint8_t c) { return utf8_byte_type(c) == 0; }
44
45 ////////////////////////////////////////////////////////////////////////////////
46
CountUTF8(const char * utf8,size_t byteLength)47 int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
48 if (!utf8 && byteLength) {
49 return -1;
50 }
51 int count = 0;
52 const char* stop = utf8 + byteLength;
53 while (utf8 < stop) {
54 int type = utf8_byte_type(*(const uint8_t*)utf8);
55 if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
56 return -1; // Sequence extends beyond end.
57 }
58 while(type-- > 1) {
59 ++utf8;
60 if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
61 return -1;
62 }
63 }
64 ++utf8;
65 ++count;
66 }
67 return count;
68 }
69
CountUTF16(const uint16_t * utf16,size_t byteLength)70 int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
71 if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
72 return -1;
73 }
74 const uint16_t* src = (const uint16_t*)utf16;
75 const uint16_t* stop = src + (byteLength >> 1);
76 int count = 0;
77 while (src < stop) {
78 unsigned c = *src++;
79 if (utf16_is_low_surrogate(c)) {
80 return -1;
81 }
82 if (utf16_is_high_surrogate(c)) {
83 if (src >= stop) {
84 return -1;
85 }
86 c = *src++;
87 if (!utf16_is_low_surrogate(c)) {
88 return -1;
89 }
90 }
91 count += 1;
92 }
93 return count;
94 }
95
CountUTF32(const int32_t * utf32,size_t byteLength)96 int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
97 if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || !SkTFitsIn<int>(byteLength >> 2)) {
98 return -1;
99 }
100 const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits
101 const uint32_t* ptr = (const uint32_t*)utf32;
102 const uint32_t* stop = ptr + (byteLength >> 2);
103 while (ptr < stop) {
104 if (*ptr & kInvalidUnicharMask) {
105 return -1;
106 }
107 ptr += 1;
108 }
109 return (int)(byteLength >> 2);
110 }
111
112 template <typename T>
next_fail(const T ** ptr,const T * end)113 static SkUnichar next_fail(const T** ptr, const T* end) {
114 *ptr = end;
115 return -1;
116 }
117
NextUTF8(const char ** ptr,const char * end)118 SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
119 if (!ptr || !end ) {
120 return -1;
121 }
122 const uint8_t* p = (const uint8_t*)*ptr;
123 if (!p || p >= (const uint8_t*)end) {
124 return next_fail(ptr, end);
125 }
126 int c = *p;
127 int hic = c << 24;
128
129 if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
130 return next_fail(ptr, end);
131 }
132 if (hic < 0) {
133 uint32_t mask = (uint32_t)~0x3F;
134 hic = left_shift(hic, 1);
135 do {
136 ++p;
137 if (p >= (const uint8_t*)end) {
138 return next_fail(ptr, end);
139 }
140 // check before reading off end of array.
141 uint8_t nextByte = *p;
142 if (!utf8_byte_is_continuation(nextByte)) {
143 return next_fail(ptr, end);
144 }
145 c = (c << 6) | (nextByte & 0x3F);
146 mask <<= 5;
147 } while ((hic = left_shift(hic, 1)) < 0);
148 c &= ~mask;
149 }
150 *ptr = (const char*)p + 1;
151 return c;
152 }
153
NextUTF8WithReplacement(const char ** ptr,const char * end)154 SkUnichar SkUTF::NextUTF8WithReplacement(const char** ptr, const char* end) {
155 SkUnichar val = SkUTF::NextUTF8(ptr, end);
156 return val < 0 ? 0xFFFD : val;
157 }
158
NextUTF16(const uint16_t ** ptr,const uint16_t * end)159 SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
160 if (!ptr || !end ) {
161 return -1;
162 }
163 const uint16_t* src = *ptr;
164 if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
165 return next_fail(ptr, end);
166 }
167 uint16_t c = *src++;
168 SkUnichar result = c;
169 if (utf16_is_low_surrogate(c)) {
170 return next_fail(ptr, end); // srcPtr should never point at low surrogate.
171 }
172 if (utf16_is_high_surrogate(c)) {
173 if (src + 1 > end) {
174 return next_fail(ptr, end); // Truncated string.
175 }
176 uint16_t low = *src++;
177 if (!utf16_is_low_surrogate(low)) {
178 return next_fail(ptr, end);
179 }
180 /*
181 [paraphrased from wikipedia]
182 Take the high surrogate and subtract 0xD800, then multiply by 0x400.
183 Take the low surrogate and subtract 0xDC00. Add these two results
184 together, and finally add 0x10000 to get the final decoded codepoint.
185
186 unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
187 unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
188 unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
189 unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
190 */
191 result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
192 }
193 *ptr = src;
194 return result;
195 }
196
NextUTF32(const int32_t ** ptr,const int32_t * end)197 SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
198 if (!ptr || !end ) {
199 return -1;
200 }
201 const int32_t* s = *ptr;
202 if (!s || s + 1 > end || !is_align4(intptr_t(s))) {
203 return next_fail(ptr, end);
204 }
205 int32_t value = *s;
206 const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits
207 if (value & kInvalidUnicharMask) {
208 return next_fail(ptr, end);
209 }
210 *ptr = s + 1;
211 return value;
212 }
213
ToUTF8(SkUnichar uni,char utf8[SkUTF::kMaxBytesInUTF8Sequence])214 size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
215 if ((uint32_t)uni > 0x10FFFF) {
216 return 0;
217 }
218 if (uni <= 127) {
219 if (utf8) {
220 *utf8 = (char)uni;
221 }
222 return 1;
223 }
224 char tmp[4];
225 char* p = tmp;
226 size_t count = 1;
227 while (uni > 0x7F >> count) {
228 *p++ = (char)(0x80 | (uni & 0x3F));
229 uni >>= 6;
230 count += 1;
231 }
232 if (utf8) {
233 p = tmp;
234 utf8 += count;
235 while (p < tmp + count - 1) {
236 *--utf8 = *p++;
237 }
238 *--utf8 = (char)(~(0xFF >> count) | uni);
239 }
240 return count;
241 }
242
ToUTF16(SkUnichar uni,uint16_t utf16[2])243 size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
244 if ((uint32_t)uni > 0x10FFFF) {
245 return 0;
246 }
247 int extra = (uni > 0xFFFF);
248 if (utf16) {
249 if (extra) {
250 utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));
251 utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
252 } else {
253 utf16[0] = (uint16_t)uni;
254 }
255 }
256 return 1 + extra;
257 }
258
UTF8ToUTF16(uint16_t dst[],int dstCapacity,const char src[],size_t srcByteLength)259 int SkUTF::UTF8ToUTF16(uint16_t dst[], int dstCapacity, const char src[], size_t srcByteLength) {
260 if (!dst) {
261 dstCapacity = 0;
262 }
263
264 int dstLength = 0;
265 uint16_t* endDst = dst + dstCapacity;
266 const char* endSrc = src + srcByteLength;
267 while (src < endSrc) {
268 SkUnichar uni = NextUTF8(&src, endSrc);
269 if (uni < 0) {
270 return -1;
271 }
272
273 uint16_t utf16[2];
274 size_t count = ToUTF16(uni, utf16);
275 if (count == 0) {
276 return -1;
277 }
278 dstLength += count;
279
280 if (dst) {
281 uint16_t* elems = utf16;
282 while (dst < endDst && count > 0) {
283 *dst++ = *elems++;
284 count -= 1;
285 }
286 }
287 }
288 return dstLength;
289 }
290
UTF16ToUTF8(char dst[],int dstCapacity,const uint16_t src[],size_t srcLength)291 int SkUTF::UTF16ToUTF8(char dst[], int dstCapacity, const uint16_t src[], size_t srcLength) {
292 if (!dst) {
293 dstCapacity = 0;
294 }
295
296 int dstLength = 0;
297 const char* endDst = dst + dstCapacity;
298 const uint16_t* endSrc = src + srcLength;
299 while (src < endSrc) {
300 SkUnichar uni = NextUTF16(&src, endSrc);
301 if (uni < 0) {
302 return -1;
303 }
304
305 char utf8[SkUTF::kMaxBytesInUTF8Sequence];
306 size_t count = ToUTF8(uni, utf8);
307 if (count == 0) {
308 return -1;
309 }
310 dstLength += count;
311
312 if (dst) {
313 const char* elems = utf8;
314 while (dst < endDst && count > 0) {
315 *dst++ = *elems++;
316 count -= 1;
317 }
318 }
319 }
320 return dstLength;
321 }
322