xref: /aosp_15_r20/external/pigweed/pw_string/public/pw_string/utf_codecs.h (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1 // Copyright 2024 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 
15 #pragma once
16 
17 /// Provides basic helpers for reading and writing UTF-8 encoded strings.
18 
19 #include <array>
20 #include <cstdint>
21 #include <string_view>
22 
23 #include "pw_result/result.h"
24 #include "pw_status/status.h"
25 #include "pw_string/string_builder.h"
26 
27 namespace pw {
28 namespace utf {
/// Reports whether `code_point` lies in the range of encodable codepoints.
///
/// A value is accepted when it does not exceed `0x10FFFF` (the highest
/// codepoint allowed) and is not one of the surrogate code points
/// (`[0xD800, 0xDFFF]`). Non-characters and unassigned codepoints are
/// accepted.
constexpr inline bool IsValidCodepoint(uint32_t code_point) {
  const bool within_limit = code_point <= 0x10FFFFu;
  const bool is_surrogate = code_point >= 0xD800u && code_point <= 0xDFFFu;
  return within_limit && !is_surrogate;
}
38 
/// Reports whether `code_point` is a valid character.
///
/// On top of the range and surrogate restrictions, this rejects the
/// non-characters: `U+FDD0..U+FDEF` and every codepoint whose low 16 bits are
/// `0xFFFE` or `0xFFFF`.
constexpr inline bool IsValidCharacter(uint32_t code_point) {
  // Beyond the highest codepoint.
  if (code_point > 0x10FFFFu) {
    return false;
  }
  // Surrogate block.
  if (code_point >= 0xD800u && code_point <= 0xDFFFu) {
    return false;
  }
  // Contiguous non-character block.
  if (code_point >= 0xFDD0u && code_point <= 0xFDEFu) {
    return false;
  }
  // Masking with 0xFFFE matches both ...FFFE and ...FFFF in any plane.
  return (code_point & 0xFFFEu) != 0xFFFEu;
}
49 
/// @class CodePointAndSize
///
/// Provides a combined view of a valid codepoint and the number of bytes its
/// encoding requires. The maximum valid codepoint is `0x10FFFF`, which requires
/// 21 bits to represent. This combined view packs both values into one 32-bit
/// word: the low 28 bits hold the codepoint and the top 4 bits hold the number
/// of bytes required to represent the codepoint when UTF encoded.
class CodePointAndSize final {
 public:
  /// Creates a combined view of a `code_point` and its encoded `size`.
  ///
  /// `code_point` is masked to the low 28 bits so that stray high bits cannot
  /// overwrite the size field. Only the low 4 bits of `size` are retained.
  explicit constexpr CodePointAndSize(uint32_t code_point, size_t size)
      : code_point_((static_cast<uint32_t>(size) << kSizeShift) |
                    (code_point & kCodePointMask)) {}

  constexpr CodePointAndSize(const CodePointAndSize&) = default;
  constexpr CodePointAndSize& operator=(const CodePointAndSize&) = default;
  constexpr CodePointAndSize(CodePointAndSize&&) = default;
  constexpr CodePointAndSize& operator=(CodePointAndSize&&) = default;

  /// Returns the code point this represents.
  constexpr uint32_t code_point() const { return code_point_ & kCodePointMask; }

  /// Returns the number of bytes required to encode this codepoint.
  constexpr size_t size() const {
    return (code_point_ & kSizeMask) >> kSizeShift;
  }

 private:
  // Number of high bits reserved for the encoded size.
  static constexpr size_t kSizeBits = 4;
  // Bit position at which the size field starts.
  static constexpr size_t kSizeShift = sizeof(uint32_t) * 8 - kSizeBits;
  // Selects the codepoint field (everything below the size field).
  static constexpr uint32_t kCodePointMask = (uint32_t{1} << kSizeShift) - 1u;
  // Selects the size field.
  static constexpr uint32_t kSizeMask = ~kCodePointMask;
  // Packed storage: size in the top kSizeBits bits, codepoint in the rest.
  uint32_t code_point_;
};
83 }  // namespace utf
84 
85 namespace utf8 {
86 /// @brief Reads the first code point from a UTF-8 encoded `str`.
87 ///
88 /// This is a very basic decoder without much thought for performance and very
89 /// basic validation that the correct number of bytes are available and that
90 /// each byte of a multibyte sequence has a continuation character. See
91 /// `pw::utf8::EncodeCharacter()` for encoding details.
92 ///
93 /// @return @rst
94 ///
95 /// .. pw-status-codes::
96 ///
97 ///    OK: The decoded code point and the number of bytes read.
98 ///
99 ///    INVALID_ARGUMENT: The string was empty or malformed.
100 ///
101 ///    OUT_OF_RANGE: The decoded code point was not in the valid range.
102 ///
103 /// @endrst
ReadCodePoint(std::string_view str)104 constexpr pw::Result<utf::CodePointAndSize> ReadCodePoint(
105     std::string_view str) {
106   if (str.empty()) {
107     return pw::Status::InvalidArgument();
108   }
109 
110   const uint8_t leading_byte = static_cast<uint8_t>(str.front());
111   size_t byte_count = 0;
112   uint32_t code_point = 0xFFFFFFFFu;
113 
114   if (leading_byte <= 0x7F) {
115     byte_count = 1;
116     // b0xxx xxxx
117     code_point = leading_byte;
118   } else if (leading_byte <= 0xDF) {
119     byte_count = 2;
120     if (str.size() < byte_count) {
121       return pw::Status::InvalidArgument();
122     }
123     // b110x xxxx 10xx xxxx
124     if ((str[1] & 0xC0) != 0x80) {
125       // Invalid continuation
126       return pw::Status::InvalidArgument();
127     }
128     code_point = (static_cast<uint32_t>(str[0] & 0x1F) << 6) +
129                  static_cast<uint32_t>(str[1] & 0x3F);
130   } else if (leading_byte <= 0xEF) {
131     byte_count = 3;
132     if (str.size() < byte_count) {
133       return pw::Status::InvalidArgument();
134     }
135     if ((str[1] & 0xC0) != 0x80 || (str[2] & 0xC0) != 0x80) {
136       // Invalid continuation
137       return pw::Status::InvalidArgument();
138     }
139     // b1110 xxxx 10xx xxxx 10xx xxxx
140     code_point = (static_cast<uint32_t>(str[0] & 0x0F) << 12) +
141                  (static_cast<uint32_t>(str[1] & 0x3F) << 6) +
142                  static_cast<uint32_t>(str[2] & 0x3F);
143   } else if (leading_byte <= 0xF7) {
144     byte_count = 4;
145     if (str.size() < byte_count) {
146       return pw::Status::InvalidArgument();
147     }
148     if ((str[1] & 0xC0) != 0x80 || (str[2] & 0xC0) != 0x80 ||
149         (str[3] & 0xC0) != 0x80) {
150       // Invalid continuation
151       return pw::Status::InvalidArgument();
152     }
153     // b1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
154     code_point = (static_cast<uint32_t>(str[0] & 0x07) << 18) +
155                  (static_cast<uint32_t>(str[1] & 0x3F) << 12) +
156                  (static_cast<uint32_t>(str[2] & 0x3F) << 6) +
157                  static_cast<uint32_t>(str[3] & 0x3F);
158   } else {
159     return pw::Status::InvalidArgument();
160   }
161 
162   // Validate the decoded value.
163   if (utf::IsValidCodepoint(code_point)) {
164     return utf::CodePointAndSize(code_point, byte_count);
165   }
166 
167   return pw::Status::OutOfRange();
168 }
169 
170 /// Determines if `str` is a valid UTF-8 string.
IsStringValid(std::string_view str)171 constexpr bool IsStringValid(std::string_view str) {
172   while (!str.empty()) {
173     auto rslt = utf8::ReadCodePoint(str);
174     if (!rslt.ok() || !utf::IsValidCharacter(rslt->code_point())) {
175       return false;
176     }
177     str = str.substr(rslt->size());
178   }
179   return true;
180 }
181 
/// Holds the UTF-8 bytes produced by encoding a single code point.
///
/// Up to four bytes are stored inline; `as_view()` exposes only the first
/// `size` of them.
class EncodedCodePoint {
 public:
  /// Stores `data` along with the count of its bytes that are meaningful.
  constexpr EncodedCodePoint(uint32_t size, std::array<char, 4> data)
      : size_(size), data_(data) {}
  constexpr EncodedCodePoint(EncodedCodePoint&&) = default;

  /// Returns a view over the encoded bytes held by this object.
  constexpr std::string_view as_view() const {
    return std::string_view(data_.data(), size_);
  }

 private:
  // Number of leading bytes of `data_` that belong to the encoding.
  uint32_t size_;
  // Inline storage for the encoded bytes.
  std::array<char, 4> data_;
};
194 
195 /// @brief Encodes a single code point as UTF-8.
196 ///
197 /// UTF-8 encodes as 1-4 bytes from a range of `[0, 0x10FFFF]`.
198 ///
199 /// 1-byte encoding has a top bit of zero:
200 /// @code
201 /// [0, 0x7F] 1-bytes: b0xxx xxxx
202 /// @endcode
203 /// N-bytes sequences are denoted by annotating the top N+1 bits of the leading
204 /// byte and then using a 2-bit continuation marker on the following bytes.
205 /// @code
206 /// [0x00080, 0x0007FF] 2-bytes: b110x xxxx 10xx xxxx
207 /// [0x00800, 0x00FFFF] 3-bytes: b1110 xxxx 10xx xxxx 10xx xxxx
208 /// [0x10000, 0x10FFFF] 4-bytes: b1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
209 /// @endcode
210 ///
211 /// @return @rst
212 ///
213 /// .. pw-status-codes::
214 ///
215 ///    OK: The codepoint encoded as UTF-8.
216 ///
217 ///    OUT_OF_RANGE: The code point was not in the valid range for UTF-8
218 ///    encoding.
219 ///
220 /// @endrst
EncodeCodePoint(uint32_t code_point)221 constexpr Result<EncodedCodePoint> EncodeCodePoint(uint32_t code_point) {
222   if (code_point <= 0x7F) {
223     return EncodedCodePoint{1, {static_cast<char>(code_point)}};
224   }
225   if (code_point <= 0x7FF) {
226     return EncodedCodePoint{2,
227                             {static_cast<char>(0xC0 | (code_point >> 6)),
228                              static_cast<char>(0x80 | (code_point & 0x3F))}};
229   }
230   if (code_point <= 0xFFFF) {
231     return EncodedCodePoint{
232         3,
233         {static_cast<char>(0xE0 | (code_point >> 12)),
234          static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)),
235          static_cast<char>(0x80 | (code_point & 0x3F))}};
236   }
237   if (code_point <= 0x10FFFF) {
238     return EncodedCodePoint{
239         4,
240         {static_cast<char>(0xF0 | (code_point >> 18)),
241          static_cast<char>(0x80 | ((code_point >> 12) & 0x3F)),
242          static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)),
243          static_cast<char>(0x80 | (code_point & 0x3F))}};
244   }
245 
246   return pw::Status::OutOfRange();
247 }
248 
/// Helper that writes a code point to the provided `pw::StringBuilder`.
///
/// Only the declaration is visible in this header; the definition lives in
/// another translation unit.
///
/// @param code_point The code point to write.
/// @param output The builder the code point is written to.
/// @return A `pw::Status` describing the outcome.
///
/// NOTE(review): presumably the code point is UTF-8 encoded via
/// `EncodeCodePoint()` and any encoding or append failure is reported through
/// the returned status -- confirm against the definition.
Status WriteCodePoint(uint32_t code_point, pw::StringBuilder& output);
251 
252 }  // namespace utf8
253 
254 }  // namespace pw
255