1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/utf_string_conversions.h"
6
7 #include <limits.h>
8 #include <stdint.h>
9
10 #include <concepts>
11 #include <ostream>
12 #include <string_view>
13 #include <type_traits>
14
15 #include "base/strings/string_piece.h"
16 #include "base/strings/string_util.h"
17 #include "base/strings/utf_ostream_operators.h"
18 #include "base/strings/utf_string_conversion_utils.h"
19 #include "base/third_party/icu/icu_utf.h"
20 #include "build/build_config.h"
21
22 namespace base {
23
24 namespace {
25
26 constexpr base_icu::UChar32 kErrorCodePoint = 0xFFFD;
27
28 // Size coefficient ----------------------------------------------------------
29 // The maximum number of codeunits in the destination encoding corresponding to
30 // one codeunit in the source encoding.
31
// Primary template: only usable when the destination code unit is strictly
// wider than the source code unit. Narrowing conversions must provide an
// explicit specialization.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  // In the widening direction one source code unit never needs more than one
  // destination code unit.
  static constexpr int value{1};

  static_assert(sizeof(DestChar) > sizeof(SrcChar),
                "Default case: from a smaller encoding to the bigger one");
};
40
41 template <>
42 struct SizeCoefficient<char16_t, char> {
43 // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
44 static constexpr int value = 3;
45 };
46
47 #if defined(WCHAR_T_IS_32_BIT)
48 template <>
49 struct SizeCoefficient<wchar_t, char> {
50 // UTF-8 uses at most 4 codeunits per character.
51 static constexpr int value = 4;
52 };
53
54 template <>
55 struct SizeCoefficient<wchar_t, char16_t> {
56 // UTF-16 uses at most 2 codeunits per character.
57 static constexpr int value = 2;
58 };
59 #endif // defined(WCHAR_T_IS_32_BIT)
60
61 template <typename SrcChar, typename DestChar>
62 constexpr int size_coefficient_v =
63 SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
64
65 // UnicodeAppendUnsafe --------------------------------------------------------
66 // Function overloads that write code_point to the output string. Output string
67 // has to have enough space for the codepoint.
68
69 // Convenience typedef that checks whether the passed in type is integral (i.e.
70 // bool, char, int or their extended versions) and is of the correct size.
71 template <typename Char, size_t N>
72 concept BitsAre = std::integral<Char> && CHAR_BIT * sizeof(Char) == N;
73
74 template <typename Char>
75 requires(BitsAre<Char, 8>)
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)76 void UnicodeAppendUnsafe(Char* out,
77 size_t* size,
78 base_icu::UChar32 code_point) {
79 CBU8_APPEND_UNSAFE(reinterpret_cast<uint8_t*>(out), *size, code_point);
80 }
81
82 template <typename Char>
83 requires(BitsAre<Char, 16>)
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)84 void UnicodeAppendUnsafe(Char* out,
85 size_t* size,
86 base_icu::UChar32 code_point) {
87 CBU16_APPEND_UNSAFE(out, *size, code_point);
88 }
89
90 template <typename Char>
91 requires(BitsAre<Char, 32>)
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)92 void UnicodeAppendUnsafe(Char* out,
93 size_t* size,
94 base_icu::UChar32 code_point) {
95 out[(*size)++] = static_cast<Char>(code_point);
96 }
97
98 // DoUTFConversion ------------------------------------------------------------
99 // Main driver of UTFConversion specialized for different Src encodings.
100 // dest has to have enough room for the converted text.
101
102 template <typename DestChar>
DoUTFConversion(const char * src,size_t src_len,DestChar * dest,size_t * dest_len)103 bool DoUTFConversion(const char* src,
104 size_t src_len,
105 DestChar* dest,
106 size_t* dest_len) {
107 bool success = true;
108
109 for (size_t i = 0; i < src_len;) {
110 base_icu::UChar32 code_point;
111 CBU8_NEXT(reinterpret_cast<const uint8_t*>(src), i, src_len, code_point);
112
113 if (!IsValidCodepoint(code_point)) {
114 success = false;
115 code_point = kErrorCodePoint;
116 }
117
118 UnicodeAppendUnsafe(dest, dest_len, code_point);
119 }
120
121 return success;
122 }
123
124 template <typename DestChar>
DoUTFConversion(const char16_t * src,size_t src_len,DestChar * dest,size_t * dest_len)125 bool DoUTFConversion(const char16_t* src,
126 size_t src_len,
127 DestChar* dest,
128 size_t* dest_len) {
129 bool success = true;
130
131 auto ConvertSingleChar = [&success](char16_t in) -> base_icu::UChar32 {
132 if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
133 success = false;
134 return kErrorCodePoint;
135 }
136 return in;
137 };
138
139 size_t i = 0;
140
141 // Always have another symbol in order to avoid checking boundaries in the
142 // middle of the surrogate pair.
143 while (i + 1 < src_len) {
144 base_icu::UChar32 code_point;
145
146 if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
147 code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
148 if (!IsValidCodepoint(code_point)) {
149 code_point = kErrorCodePoint;
150 success = false;
151 }
152 i += 2;
153 } else {
154 code_point = ConvertSingleChar(src[i]);
155 ++i;
156 }
157
158 UnicodeAppendUnsafe(dest, dest_len, code_point);
159 }
160
161 if (i < src_len) {
162 UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
163 }
164
165 return success;
166 }
167
168 #if defined(WCHAR_T_IS_32_BIT)
169
170 template <typename DestChar>
DoUTFConversion(const wchar_t * src,size_t src_len,DestChar * dest,size_t * dest_len)171 bool DoUTFConversion(const wchar_t* src,
172 size_t src_len,
173 DestChar* dest,
174 size_t* dest_len) {
175 bool success = true;
176
177 for (size_t i = 0; i < src_len; ++i) {
178 auto code_point = static_cast<base_icu::UChar32>(src[i]);
179
180 if (!IsValidCodepoint(code_point)) {
181 success = false;
182 code_point = kErrorCodePoint;
183 }
184
185 UnicodeAppendUnsafe(dest, dest_len, code_point);
186 }
187
188 return success;
189 }
190
191 #endif // defined(WCHAR_T_IS_32_BIT)
192
193 // UTFConversion --------------------------------------------------------------
194 // Function template for generating all UTF conversions.
195
196 template <typename InputString, typename DestString>
UTFConversion(const InputString & src_str,DestString * dest_str)197 bool UTFConversion(const InputString& src_str, DestString* dest_str) {
198 if (IsStringASCII(src_str)) {
199 dest_str->assign(src_str.begin(), src_str.end());
200 return true;
201 }
202
203 dest_str->resize(src_str.length() *
204 size_coefficient_v<typename InputString::value_type,
205 typename DestString::value_type>);
206
207 // Empty string is ASCII => it OK to call operator[].
208 auto* dest = &(*dest_str)[0];
209
210 // ICU requires 32 bit numbers.
211 size_t src_len = src_str.length();
212 size_t dest_len = 0;
213
214 bool res = DoUTFConversion(src_str.data(), src_len, dest, &dest_len);
215
216 dest_str->resize(dest_len);
217 dest_str->shrink_to_fit();
218
219 return res;
220 }
221
222 } // namespace
223
224 // UTF16 <-> UTF8 --------------------------------------------------------------
225
UTF8ToUTF16(const char * src,size_t src_len,std::u16string * output)226 bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
227 return UTFConversion(StringPiece(src, src_len), output);
228 }
229
UTF8ToUTF16(StringPiece utf8)230 std::u16string UTF8ToUTF16(StringPiece utf8) {
231 std::u16string ret;
232 // Ignore the success flag of this call, it will do the best it can for
233 // invalid input, which is what we want here.
234 UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
235 return ret;
236 }
237
UTF16ToUTF8(const char16_t * src,size_t src_len,std::string * output)238 bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
239 return UTFConversion(StringPiece16(src, src_len), output);
240 }
241
UTF16ToUTF8(StringPiece16 utf16)242 std::string UTF16ToUTF8(StringPiece16 utf16) {
243 std::string ret;
244 // Ignore the success flag of this call, it will do the best it can for
245 // invalid input, which is what we want here.
246 UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
247 return ret;
248 }
249
250 // UTF-16 <-> Wide -------------------------------------------------------------
251
252 #if defined(WCHAR_T_IS_16_BIT)
253 // When wide == UTF-16 the conversions are a NOP.
254
// 16-bit wchar_t: wide strings already hold UTF-16, so the code units are
// copied unchanged into |*output|. Always returns true.
bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
  const wchar_t* const end = src + src_len;
  output->assign(src, end);
  return true;
}
259
// 16-bit wchar_t: returns a UTF-16 string holding a code-unit-for-code-unit
// copy of |wide|.
std::u16string WideToUTF16(std::wstring_view wide) {
  std::u16string copy(wide.begin(), wide.end());
  return copy;
}
263
// 16-bit wchar_t: wide strings hold UTF-16, so the code units are copied
// unchanged into |*output|. Always returns true.
bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
  const char16_t* const end = src + src_len;
  output->assign(src, end);
  return true;
}
268
UTF16ToWide(StringPiece16 utf16)269 std::wstring UTF16ToWide(StringPiece16 utf16) {
270 return std::wstring(utf16.begin(), utf16.end());
271 }
272
273 #elif defined(WCHAR_T_IS_32_BIT)
274
WideToUTF16(const wchar_t * src,size_t src_len,std::u16string * output)275 bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
276 return UTFConversion(std::wstring_view(src, src_len), output);
277 }
278
WideToUTF16(std::wstring_view wide)279 std::u16string WideToUTF16(std::wstring_view wide) {
280 std::u16string ret;
281 // Ignore the success flag of this call, it will do the best it can for
282 // invalid input, which is what we want here.
283 WideToUTF16(wide.data(), wide.length(), &ret);
284 return ret;
285 }
286
UTF16ToWide(const char16_t * src,size_t src_len,std::wstring * output)287 bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
288 return UTFConversion(StringPiece16(src, src_len), output);
289 }
290
UTF16ToWide(StringPiece16 utf16)291 std::wstring UTF16ToWide(StringPiece16 utf16) {
292 std::wstring ret;
293 // Ignore the success flag of this call, it will do the best it can for
294 // invalid input, which is what we want here.
295 UTF16ToWide(utf16.data(), utf16.length(), &ret);
296 return ret;
297 }
298
299 #endif // defined(WCHAR_T_IS_32_BIT)
300
301 // UTF-8 <-> Wide --------------------------------------------------------------
302
303 // UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits
304
UTF8ToWide(const char * src,size_t src_len,std::wstring * output)305 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
306 return UTFConversion(StringPiece(src, src_len), output);
307 }
308
UTF8ToWide(StringPiece utf8)309 std::wstring UTF8ToWide(StringPiece utf8) {
310 std::wstring ret;
311 // Ignore the success flag of this call, it will do the best it can for
312 // invalid input, which is what we want here.
313 UTF8ToWide(utf8.data(), utf8.length(), &ret);
314 return ret;
315 }
316
317 #if defined(WCHAR_T_IS_16_BIT)
318 // Easy case since we can use the "utf" versions we already wrote above.
319
WideToUTF8(const wchar_t * src,size_t src_len,std::string * output)320 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
321 return UTF16ToUTF8(as_u16cstr(src), src_len, output);
322 }
323
WideToUTF8(std::wstring_view wide)324 std::string WideToUTF8(std::wstring_view wide) {
325 return UTF16ToUTF8(StringPiece16(as_u16cstr(wide), wide.size()));
326 }
327
328 #elif defined(WCHAR_T_IS_32_BIT)
329
WideToUTF8(const wchar_t * src,size_t src_len,std::string * output)330 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
331 return UTFConversion(std::wstring_view(src, src_len), output);
332 }
333
WideToUTF8(std::wstring_view wide)334 std::string WideToUTF8(std::wstring_view wide) {
335 std::string ret;
336 // Ignore the success flag of this call, it will do the best it can for
337 // invalid input, which is what we want here.
338 WideToUTF8(wide.data(), wide.length(), &ret);
339 return ret;
340 }
341
342 #endif // defined(WCHAR_T_IS_32_BIT)
343
ASCIIToUTF16(StringPiece ascii)344 std::u16string ASCIIToUTF16(StringPiece ascii) {
345 DCHECK(IsStringASCII(ascii)) << ascii;
346 return std::u16string(ascii.begin(), ascii.end());
347 }
348
UTF16ToASCII(StringPiece16 utf16)349 std::string UTF16ToASCII(StringPiece16 utf16) {
350 DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
351 return std::string(utf16.begin(), utf16.end());
352 }
353
354 #if defined(WCHAR_T_IS_16_BIT)
ASCIIToWide(StringPiece ascii)355 std::wstring ASCIIToWide(StringPiece ascii) {
356 DCHECK(IsStringASCII(ascii)) << ascii;
357 return std::wstring(ascii.begin(), ascii.end());
358 }
359
WideToASCII(std::wstring_view wide)360 std::string WideToASCII(std::wstring_view wide) {
361 DCHECK(IsStringASCII(wide)) << wide;
362 return std::string(wide.begin(), wide.end());
363 }
364 #endif // defined(WCHAR_T_IS_16_BIT)
365
366 } // namespace base
367