xref: /aosp_15_r20/external/skia/src/base/SkUTF.cpp (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
// Copyright 2018 Google LLC.
// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
3 
4 #include "src/base/SkUTF.h"
5 
6 #include "include/private/base/SkTFitsIn.h"
7 
// Shifts `value` left by `shift` bits. The shift itself is carried out on the
// uint32_t representation so that a negative `value` is never the operand of a
// signed left shift; the result is converted back to int32_t.
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    const uint32_t bits = static_cast<uint32_t>(value);
    return static_cast<int32_t>(bits << shift);
}
11 
// True when the lowest bit of x is clear, i.e. x is a multiple of 2.
template <typename T> static constexpr bool is_align2(T x) {
    return (x & 1) == 0;
}
13 
// True when the two lowest bits of x are clear, i.e. x is a multiple of 4.
template <typename T> static constexpr bool is_align4(T x) {
    return (x & 3) == 0;
}
15 
// True when c lies in 0xD800..0xDBFF, the first unit of a UTF-16 surrogate pair.
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) {
    return c >= 0xD800 && c <= 0xDBFF;
}
17 
// True when c lies in 0xDC00..0xDFFF, the second unit of a UTF-16 surrogate pair.
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) {
    return c >= 0xDC00 && c <= 0xDFFF;
}
19 
/** Classifies a single byte of UTF-8 text.

    @returns   -1  for a byte that never appears in UTF-8 (0xC0, 0xC1, 0xF5..0xFF),
                0  for a continuation byte (0x80..0xBF),
                1  for an ASCII byte (0x00..0x7F),
                2  for the leading byte of a 2-byte sequence (0xC2..0xDF),
                3  for the leading byte of a 3-byte sequence (0xE0..0xEF), and
                4  for the leading byte of a 4-byte sequence (0xF0..0xF4).
      A positive return value is therefore the length of the sequence.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC0) {
        return 0;
    }
    // "octet values c0, c1, f5 to ff never appear"
    if (c == 0xC0 || c == 0xC1 || c >= 0xF5) {
        return -1;
    }
    if (c < 0xE0) {
        return 2;
    }
    if (c < 0xF0) {
        return 3;
    }
    return 4;
}
// `type` is a result of utf8_byte_type(); values 1..4 mark a byte that may
// start a sequence, while 0 (continuation) and -1 (invalid) do not.
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
42 
// True for bytes of the form 10xxxxxx (0x80..0xBF), the only bytes that may
// follow the leading byte of a multi-byte UTF-8 sequence.
static bool utf8_byte_is_continuation(uint8_t c) {
    return (c & 0xC0) == 0x80;
}
44 
45 ////////////////////////////////////////////////////////////////////////////////
46 
CountUTF8(const char * utf8,size_t byteLength)47 int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
48     if (!utf8 && byteLength) {
49         return -1;
50     }
51     int count = 0;
52     const char* stop = utf8 + byteLength;
53     while (utf8 < stop) {
54         int type = utf8_byte_type(*(const uint8_t*)utf8);
55         if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
56             return -1;  // Sequence extends beyond end.
57         }
58         while(type-- > 1) {
59             ++utf8;
60             if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
61                 return -1;
62             }
63         }
64         ++utf8;
65         ++count;
66     }
67     return count;
68 }
69 
CountUTF16(const uint16_t * utf16,size_t byteLength)70 int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
71     if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
72         return -1;
73     }
74     const uint16_t* src = (const uint16_t*)utf16;
75     const uint16_t* stop = src + (byteLength >> 1);
76     int count = 0;
77     while (src < stop) {
78         unsigned c = *src++;
79         if (utf16_is_low_surrogate(c)) {
80             return -1;
81         }
82         if (utf16_is_high_surrogate(c)) {
83             if (src >= stop) {
84                 return -1;
85             }
86             c = *src++;
87             if (!utf16_is_low_surrogate(c)) {
88                 return -1;
89             }
90         }
91         count += 1;
92     }
93     return count;
94 }
95 
CountUTF32(const int32_t * utf32,size_t byteLength)96 int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
97     if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || !SkTFitsIn<int>(byteLength >> 2)) {
98         return -1;
99     }
100     const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
101     const uint32_t* ptr = (const uint32_t*)utf32;
102     const uint32_t* stop = ptr + (byteLength >> 2);
103     while (ptr < stop) {
104         if (*ptr & kInvalidUnicharMask) {
105             return -1;
106         }
107         ptr += 1;
108     }
109     return (int)(byteLength >> 2);
110 }
111 
112 template <typename T>
next_fail(const T ** ptr,const T * end)113 static SkUnichar next_fail(const T** ptr, const T* end) {
114     *ptr = end;
115     return -1;
116 }
117 
NextUTF8(const char ** ptr,const char * end)118 SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
119     if (!ptr || !end ) {
120         return -1;
121     }
122     const uint8_t*  p = (const uint8_t*)*ptr;
123     if (!p || p >= (const uint8_t*)end) {
124         return next_fail(ptr, end);
125     }
126     int             c = *p;
127     int             hic = c << 24;
128 
129     if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
130         return next_fail(ptr, end);
131     }
132     if (hic < 0) {
133         uint32_t mask = (uint32_t)~0x3F;
134         hic = left_shift(hic, 1);
135         do {
136             ++p;
137             if (p >= (const uint8_t*)end) {
138                 return next_fail(ptr, end);
139             }
140             // check before reading off end of array.
141             uint8_t nextByte = *p;
142             if (!utf8_byte_is_continuation(nextByte)) {
143                 return next_fail(ptr, end);
144             }
145             c = (c << 6) | (nextByte & 0x3F);
146             mask <<= 5;
147         } while ((hic = left_shift(hic, 1)) < 0);
148         c &= ~mask;
149     }
150     *ptr = (const char*)p + 1;
151     return c;
152 }
153 
NextUTF8WithReplacement(const char ** ptr,const char * end)154 SkUnichar SkUTF::NextUTF8WithReplacement(const char** ptr, const char* end) {
155     SkUnichar val = SkUTF::NextUTF8(ptr, end);
156     return val < 0 ? 0xFFFD : val;
157 }
158 
NextUTF16(const uint16_t ** ptr,const uint16_t * end)159 SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
160     if (!ptr || !end ) {
161         return -1;
162     }
163     const uint16_t* src = *ptr;
164     if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
165         return next_fail(ptr, end);
166     }
167     uint16_t c = *src++;
168     SkUnichar result = c;
169     if (utf16_is_low_surrogate(c)) {
170         return next_fail(ptr, end);  // srcPtr should never point at low surrogate.
171     }
172     if (utf16_is_high_surrogate(c)) {
173         if (src + 1 > end) {
174             return next_fail(ptr, end);  // Truncated string.
175         }
176         uint16_t low = *src++;
177         if (!utf16_is_low_surrogate(low)) {
178             return next_fail(ptr, end);
179         }
180         /*
181         [paraphrased from wikipedia]
182         Take the high surrogate and subtract 0xD800, then multiply by 0x400.
183         Take the low surrogate and subtract 0xDC00.  Add these two results
184         together, and finally add 0x10000 to get the final decoded codepoint.
185 
186         unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
187         unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
188         unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
189         unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
190         */
191         result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
192     }
193     *ptr = src;
194     return result;
195 }
196 
NextUTF32(const int32_t ** ptr,const int32_t * end)197 SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
198     if (!ptr || !end ) {
199         return -1;
200     }
201     const int32_t* s = *ptr;
202     if (!s || s + 1 > end || !is_align4(intptr_t(s))) {
203         return next_fail(ptr, end);
204     }
205     int32_t value = *s;
206     const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
207     if (value & kInvalidUnicharMask) {
208         return next_fail(ptr, end);
209     }
210     *ptr = s + 1;
211     return value;
212 }
213 
ToUTF8(SkUnichar uni,char utf8[SkUTF::kMaxBytesInUTF8Sequence])214 size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
215     if ((uint32_t)uni > 0x10FFFF) {
216         return 0;
217     }
218     if (uni <= 127) {
219         if (utf8) {
220             *utf8 = (char)uni;
221         }
222         return 1;
223     }
224     char    tmp[4];
225     char*   p = tmp;
226     size_t  count = 1;
227     while (uni > 0x7F >> count) {
228         *p++ = (char)(0x80 | (uni & 0x3F));
229         uni >>= 6;
230         count += 1;
231     }
232     if (utf8) {
233         p = tmp;
234         utf8 += count;
235         while (p < tmp + count - 1) {
236             *--utf8 = *p++;
237         }
238         *--utf8 = (char)(~(0xFF >> count) | uni);
239     }
240     return count;
241 }
242 
ToUTF16(SkUnichar uni,uint16_t utf16[2])243 size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
244     if ((uint32_t)uni > 0x10FFFF) {
245         return 0;
246     }
247     int extra = (uni > 0xFFFF);
248     if (utf16) {
249         if (extra) {
250             utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));
251             utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
252         } else {
253             utf16[0] = (uint16_t)uni;
254         }
255     }
256     return 1 + extra;
257 }
258 
UTF8ToUTF16(uint16_t dst[],int dstCapacity,const char src[],size_t srcByteLength)259 int SkUTF::UTF8ToUTF16(uint16_t dst[], int dstCapacity, const char src[], size_t srcByteLength) {
260     if (!dst) {
261         dstCapacity = 0;
262     }
263 
264     int dstLength = 0;
265     uint16_t* endDst = dst + dstCapacity;
266     const char* endSrc = src + srcByteLength;
267     while (src < endSrc) {
268         SkUnichar uni = NextUTF8(&src, endSrc);
269         if (uni < 0) {
270             return -1;
271         }
272 
273         uint16_t utf16[2];
274         size_t count = ToUTF16(uni, utf16);
275         if (count == 0) {
276             return -1;
277         }
278         dstLength += count;
279 
280         if (dst) {
281             uint16_t* elems = utf16;
282             while (dst < endDst && count > 0) {
283                 *dst++ = *elems++;
284                 count -= 1;
285             }
286         }
287     }
288     return dstLength;
289 }
290 
UTF16ToUTF8(char dst[],int dstCapacity,const uint16_t src[],size_t srcLength)291 int SkUTF::UTF16ToUTF8(char dst[], int dstCapacity, const uint16_t src[], size_t srcLength) {
292     if (!dst) {
293         dstCapacity = 0;
294     }
295 
296     int dstLength = 0;
297     const char* endDst = dst + dstCapacity;
298     const uint16_t* endSrc = src + srcLength;
299     while (src < endSrc) {
300         SkUnichar uni = NextUTF16(&src, endSrc);
301         if (uni < 0) {
302             return -1;
303         }
304 
305         char utf8[SkUTF::kMaxBytesInUTF8Sequence];
306         size_t count = ToUTF8(uni, utf8);
307         if (count == 0) {
308             return -1;
309         }
310         dstLength += count;
311 
312         if (dst) {
313             const char* elems = utf8;
314             while (dst < endDst && count > 0) {
315                 *dst++ = *elems++;
316                 count -= 1;
317             }
318         }
319     }
320     return dstLength;
321 }
322