// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/strings/utf_string_conversions.h"

#include <limits.h>
#include <stdint.h>

#include <concepts>
#include <ostream>
#include <string_view>
#include <type_traits>

#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_ostream_operators.h"
#include "base/strings/utf_string_conversion_utils.h"
#include "base/third_party/icu/icu_utf.h"
#include "build/build_config.h"

namespace base {

namespace {

constexpr base_icu::UChar32 kErrorCodePoint = 0xFFFD;

// Size coefficient ----------------------------------------------------------
// The maximum number of codeunits in the destination encoding corresponding to
// one codeunit in the source encoding.

template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  static_assert(sizeof(SrcChar) < sizeof(DestChar),
                "Default case: from a smaller encoding to the bigger one");

  // ASCII symbols are encoded by one codeunit in all encodings.
  static constexpr int value = 1;
};

template <>
struct SizeCoefficient<char16_t, char> {
  // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
  static constexpr int value = 3;
};

#if defined(WCHAR_T_IS_32_BIT)
template <>
struct SizeCoefficient<wchar_t, char> {
  // UTF-8 uses at most 4 codeunits per character.
  static constexpr int value = 4;
};

template <>
struct SizeCoefficient<wchar_t, char16_t> {
  // UTF-16 uses at most 2 codeunits per character.
  static constexpr int value = 2;
};
#endif  // defined(WCHAR_T_IS_32_BIT)

template <typename SrcChar, typename DestChar>
constexpr int size_coefficient_v =
    SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
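
// Illustrative sanity checks (added here for exposition; they are not part of
// the upstream file, but they follow from the definitions above): converting
// UTF-16 to UTF-8 may need up to three destination code units per source code
// unit, while converting to a wider encoding never needs more than one.
static_assert(size_coefficient_v<char16_t, char> == 3);
static_assert(size_coefficient_v<char, char16_t> == 1);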

// UnicodeAppendUnsafe --------------------------------------------------------
// Function overloads that write code_point to the output string. Output string
// has to have enough space for the codepoint.

// Convenience concept that checks whether the passed-in type is integral (i.e.
// bool, char, int or their extended versions) and is of the correct size.
template <typename Char, size_t N>
concept BitsAre = std::integral<Char> && CHAR_BIT * sizeof(Char) == N;

template <typename Char>
  requires(BitsAre<Char, 8>)
void UnicodeAppendUnsafe(Char* out,
                         size_t* size,
                         base_icu::UChar32 code_point) {
  CBU8_APPEND_UNSAFE(reinterpret_cast<uint8_t*>(out), *size, code_point);
}

template <typename Char>
  requires(BitsAre<Char, 16>)
void UnicodeAppendUnsafe(Char* out,
                         size_t* size,
                         base_icu::UChar32 code_point) {
  CBU16_APPEND_UNSAFE(out, *size, code_point);
}

template <typename Char>
  requires(BitsAre<Char, 32>)
void UnicodeAppendUnsafe(Char* out,
                         size_t* size,
                         base_icu::UChar32 code_point) {
  out[(*size)++] = static_cast<Char>(code_point);
}
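
// Worked example (illustrative, not in the original file): appending the code
// point U+1F600 via the overloads above writes
//   4 code units in UTF-8  (0xF0 0x9F 0x98 0x80),
//   2 code units in UTF-16 (0xD83D 0xDE00, a surrogate pair),
//   1 code unit  in UTF-32 (0x0001F600),
// and advances *size by that count in each case.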

// DoUTFConversion ------------------------------------------------------------
// Main driver of UTFConversion specialized for different Src encodings.
// dest has to have enough room for the converted text.

template <typename DestChar>
bool DoUTFConversion(const char* src,
                     size_t src_len,
                     DestChar* dest,
                     size_t* dest_len) {
  bool success = true;

  for (size_t i = 0; i < src_len;) {
    base_icu::UChar32 code_point;
    CBU8_NEXT(reinterpret_cast<const uint8_t*>(src), i, src_len, code_point);

    if (!IsValidCodepoint(code_point)) {
      success = false;
      code_point = kErrorCodePoint;
    }

    UnicodeAppendUnsafe(dest, dest_len, code_point);
  }

  return success;
}

template <typename DestChar>
bool DoUTFConversion(const char16_t* src,
                     size_t src_len,
                     DestChar* dest,
                     size_t* dest_len) {
  bool success = true;

  auto ConvertSingleChar = [&success](char16_t in) -> base_icu::UChar32 {
    if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
      success = false;
      return kErrorCodePoint;
    }
    return in;
  };

  size_t i = 0;

  // Always have another symbol in order to avoid checking boundaries in the
  // middle of the surrogate pair.
  while (i + 1 < src_len) {
    base_icu::UChar32 code_point;

    if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
      code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
      if (!IsValidCodepoint(code_point)) {
        code_point = kErrorCodePoint;
        success = false;
      }
      i += 2;
    } else {
      code_point = ConvertSingleChar(src[i]);
      ++i;
    }

    UnicodeAppendUnsafe(dest, dest_len, code_point);
  }

  if (i < src_len) {
    UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
  }

  return success;
}
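
// Worked example (illustrative, not in the original file): feeding this driver
// the invalid UTF-16 sequence { 0xD800, u'a' } (a lone lead surrogate) emits
// U+FFFD followed by 'a' and returns false, whereas the well-formed pair
// { 0xD83D, 0xDE00 } decodes to the single code point U+1F600.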

#if defined(WCHAR_T_IS_32_BIT)

template <typename DestChar>
bool DoUTFConversion(const wchar_t* src,
                     size_t src_len,
                     DestChar* dest,
                     size_t* dest_len) {
  bool success = true;

  for (size_t i = 0; i < src_len; ++i) {
    auto code_point = static_cast<base_icu::UChar32>(src[i]);

    if (!IsValidCodepoint(code_point)) {
      success = false;
      code_point = kErrorCodePoint;
    }

    UnicodeAppendUnsafe(dest, dest_len, code_point);
  }

  return success;
}

#endif  // defined(WCHAR_T_IS_32_BIT)

// UTFConversion --------------------------------------------------------------
// Function template for generating all UTF conversions.

template <typename InputString, typename DestString>
bool UTFConversion(const InputString& src_str, DestString* dest_str) {
  if (IsStringASCII(src_str)) {
    dest_str->assign(src_str.begin(), src_str.end());
    return true;
  }

  dest_str->resize(src_str.length() *
                   size_coefficient_v<typename InputString::value_type,
                                      typename DestString::value_type>);

  // An empty string is ASCII and was handled above, so it's OK to call
  // operator[] here.
  auto* dest = &(*dest_str)[0];

  // ICU requires 32 bit numbers.
  size_t src_len = src_str.length();
  size_t dest_len = 0;

  bool res = DoUTFConversion(src_str.data(), src_len, dest, &dest_len);

  dest_str->resize(dest_len);
  dest_str->shrink_to_fit();

  return res;
}
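
// Sizing sketch for the function above (illustrative numbers, not from the
// original file): converting u"\u00e9" (one UTF-16 code unit) to UTF-8 first
// resizes the destination to 1 * 3 = 3 bytes (the worst case), writes the two
// bytes 0xC3 0xA9, and then the final resize()/shrink_to_fit() drop the slack.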

}  // namespace

// UTF16 <-> UTF8 --------------------------------------------------------------

bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
  return UTFConversion(StringPiece(src, src_len), output);
}

std::u16string UTF8ToUTF16(StringPiece utf8) {
  std::u16string ret;
  // Ignore the success flag of this call, it will do the best it can for
  // invalid input, which is what we want here.
  UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
  return ret;
}

bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
  return UTFConversion(StringPiece16(src, src_len), output);
}

std::string UTF16ToUTF8(StringPiece16 utf16) {
  std::string ret;
  // Ignore the success flag of this call, it will do the best it can for
  // invalid input, which is what we want here.
  UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
  return ret;
}
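
// Example call sites (illustrative only, not part of the original file):
//   std::u16string utf16 = base::UTF8ToUTF16("caf\xC3\xA9");  // "café"
//   std::string utf8 = base::UTF16ToUTF8(utf16);              // round-trips
// Invalid input is replaced with U+FFFD rather than rejected, so the
// StringPiece overloads never fail; use the bool-returning overloads when the
// caller needs to know whether any replacement happened.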

// UTF-16 <-> Wide -------------------------------------------------------------

#if defined(WCHAR_T_IS_16_BIT)
// When wide == UTF-16 the conversions are a NOP.

bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
  output->assign(src, src + src_len);
  return true;
}

std::u16string WideToUTF16(std::wstring_view wide) {
  return std::u16string(wide.begin(), wide.end());
}

bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
  output->assign(src, src + src_len);
  return true;
}

std::wstring UTF16ToWide(StringPiece16 utf16) {
  return std::wstring(utf16.begin(), utf16.end());
}

#elif defined(WCHAR_T_IS_32_BIT)

bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
  return UTFConversion(std::wstring_view(src, src_len), output);
}

std::u16string WideToUTF16(std::wstring_view wide) {
  std::u16string ret;
  // Ignore the success flag of this call, it will do the best it can for
  // invalid input, which is what we want here.
  WideToUTF16(wide.data(), wide.length(), &ret);
  return ret;
}

bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
  return UTFConversion(StringPiece16(src, src_len), output);
}

std::wstring UTF16ToWide(StringPiece16 utf16) {
  std::wstring ret;
  // Ignore the success flag of this call, it will do the best it can for
  // invalid input, which is what we want here.
  UTF16ToWide(utf16.data(), utf16.length(), &ret);
  return ret;
}

#endif  // defined(WCHAR_T_IS_32_BIT)
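
// Note (added for exposition): in the WCHAR_T_IS_16_BIT branch above these
// conversions are plain copies and always return true, while in the 32-bit
// branch each code point is validated and anything invalid is replaced with
// U+FFFD, in which case the bool-returning overloads report false.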

// UTF-8 <-> Wide --------------------------------------------------------------

// UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits

bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
  return UTFConversion(StringPiece(src, src_len), output);
}

std::wstring UTF8ToWide(StringPiece utf8) {
  std::wstring ret;
  // Ignore the success flag of this call, it will do the best it can for
  // invalid input, which is what we want here.
  UTF8ToWide(utf8.data(), utf8.length(), &ret);
  return ret;
}

#if defined(WCHAR_T_IS_16_BIT)
// Easy case since we can use the "utf" versions we already wrote above.

bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
  return UTF16ToUTF8(as_u16cstr(src), src_len, output);
}

std::string WideToUTF8(std::wstring_view wide) {
  return UTF16ToUTF8(StringPiece16(as_u16cstr(wide), wide.size()));
}

#elif defined(WCHAR_T_IS_32_BIT)

bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
  return UTFConversion(std::wstring_view(src, src_len), output);
}

std::string WideToUTF8(std::wstring_view wide) {
  std::string ret;
  // Ignore the success flag of this call, it will do the best it can for
  // invalid input, which is what we want here.
  WideToUTF8(wide.data(), wide.length(), &ret);
  return ret;
}

#endif  // defined(WCHAR_T_IS_32_BIT)

std::u16string ASCIIToUTF16(StringPiece ascii) {
  DCHECK(IsStringASCII(ascii)) << ascii;
  return std::u16string(ascii.begin(), ascii.end());
}

std::string UTF16ToASCII(StringPiece16 utf16) {
  DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
  return std::string(utf16.begin(), utf16.end());
}
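
// Note on the ASCII helpers (added for exposition): callers are expected to
// pass ASCII-only data. The DCHECKs fire on anything else; in release builds
// the code units are simply widened or narrowed one at a time with no UTF-8
// decoding, so non-ASCII input produces mojibake rather than an error.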

#if defined(WCHAR_T_IS_16_BIT)
std::wstring ASCIIToWide(StringPiece ascii) {
  DCHECK(IsStringASCII(ascii)) << ascii;
  return std::wstring(ascii.begin(), ascii.end());
}

std::string WideToASCII(std::wstring_view wide) {
  DCHECK(IsStringASCII(wide)) << wide;
  return std::string(wide.begin(), wide.end());
}
#endif  // defined(WCHAR_T_IS_16_BIT)

}  // namespace base