// xref: /aosp_15_r20/external/pdfium/core/fxcrt/utf16.h (revision 3ac0a46f773bac49fa9476ec2b1cf3f8da5ec3a4)
// Copyright 2023 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #ifndef CORE_FXCRT_UTF16_H_
6 #define CORE_FXCRT_UTF16_H_
7 
8 #include "third_party/base/check.h"
9 
10 namespace pdfium {
11 
// Constants describing the UTF-16 surrogate mechanism. Everything below is
// derived from `kSurrogateBits` and the two base values `0x10000` and
// `0xd800`; the `static_assert`s pin each derived constant to the value named
// in its comment so that an edit to one definition cannot silently shift the
// others.

// The number of suffix bits in a UTF-16 surrogate.
inline constexpr int kSurrogateBits = 10;

// A bitmask for the suffix of a UTF-16 surrogate, i.e. the low
// `kSurrogateBits` bits (`0x3ff`).
inline constexpr char16_t kSurrogateMask = (1 << kSurrogateBits) - 1;
static_assert(kSurrogateMask == 0x3ff);

// The first supplementary code point, `U+10000`.
inline constexpr char32_t kMinimumSupplementaryCodePoint = 0x10000;

// The last supplementary code point, `U+10FFFF`. This is the first
// supplementary code point plus the largest offset expressible with two
// surrogate suffixes (a high suffix shifted above a low suffix).
inline constexpr char32_t kMaximumSupplementaryCodePoint =
    kMinimumSupplementaryCodePoint +
    ((kSurrogateMask << kSurrogateBits) | kSurrogateMask);
static_assert(kMaximumSupplementaryCodePoint == 0x10ffff);

// The first UTF-16 high surrogate code unit, `U+D800`.
inline constexpr char16_t kMinimumHighSurrogateCodeUnit = 0xd800;

// The last UTF-16 high surrogate code unit, `U+DBFF`.
inline constexpr char16_t kMaximumHighSurrogateCodeUnit =
    kMinimumHighSurrogateCodeUnit | kSurrogateMask;
static_assert(kMaximumHighSurrogateCodeUnit == 0xdbff);

// The first UTF-16 low surrogate code unit, `U+DC00`. The low surrogate range
// starts immediately after the high surrogate range.
inline constexpr char16_t kMinimumLowSurrogateCodeUnit =
    kMaximumHighSurrogateCodeUnit + 1;
static_assert(kMinimumLowSurrogateCodeUnit == 0xdc00);

// The last UTF-16 low surrogate code unit, `U+DFFF`.
inline constexpr char16_t kMaximumLowSurrogateCodeUnit =
    kMinimumLowSurrogateCodeUnit | kSurrogateMask;
static_assert(kMaximumLowSurrogateCodeUnit == 0xdfff);
40 
41 // Returns `true` if `code_point` is in a supplementary plane, and therefore
42 // requires encoding as a UTF-16 surrogate pair.
IsSupplementary(char32_t code_point)43 constexpr bool IsSupplementary(char32_t code_point) {
44   return code_point >= kMinimumSupplementaryCodePoint &&
45          code_point <= kMaximumSupplementaryCodePoint;
46 }
47 
48 // Returns `true` if `code_point` is a UTF-16 high surrogate.
IsHighSurrogate(char32_t code_point)49 constexpr bool IsHighSurrogate(char32_t code_point) {
50   return code_point >= kMinimumHighSurrogateCodeUnit &&
51          code_point <= kMaximumHighSurrogateCodeUnit;
52 }
53 
54 // Returns `true` if `code_point` is a UTF-16 low surrogate.
IsLowSurrogate(char32_t code_point)55 constexpr bool IsLowSurrogate(char32_t code_point) {
56   return code_point >= kMinimumLowSurrogateCodeUnit &&
57          code_point <= kMaximumLowSurrogateCodeUnit;
58 }
59 
60 // A UTF-16 surrogate pair.
61 class SurrogatePair final {
62  public:
63   // Constructs a surrogate pair from a high and a low surrogate.
SurrogatePair(char16_t high,char16_t low)64   constexpr SurrogatePair(char16_t high, char16_t low)
65       : high_(high), low_(low) {
66     DCHECK(IsHighSurrogate(high_));
67     DCHECK(IsLowSurrogate(low_));
68   }
69 
70   // Constructs a surrogate pair from a code point.
SurrogatePair(char32_t code_point)71   explicit constexpr SurrogatePair(char32_t code_point)
72       : high_(GetHighSurrogate(code_point)), low_(GetLowSurrogate(code_point)) {
73     // This constructor initializes `high_` and `low_` using helper functions
74     // because C++17 requires it for `constexpr` constructors.
75     DCHECK(IsSupplementary(code_point));
76   }
77 
high()78   constexpr char16_t high() const { return high_; }
low()79   constexpr char16_t low() const { return low_; }
80 
81   // Decodes this surrogate pair to a code point.
ToCodePoint()82   constexpr char32_t ToCodePoint() const {
83     char32_t code_point = low_ & kSurrogateMask;
84     code_point |= (high_ & kSurrogateMask) << kSurrogateBits;
85     return kMinimumSupplementaryCodePoint + code_point;
86   }
87 
88  private:
GetHighSurrogate(char32_t code_point)89   static constexpr char16_t GetHighSurrogate(char32_t code_point) {
90     code_point -= kMinimumSupplementaryCodePoint;
91     char16_t code_unit = (code_point >> kSurrogateBits) & kSurrogateMask;
92     return kMinimumHighSurrogateCodeUnit | code_unit;
93   }
94 
GetLowSurrogate(char32_t code_point)95   static constexpr char16_t GetLowSurrogate(char32_t code_point) {
96     code_point -= kMinimumSupplementaryCodePoint;
97     char16_t code_unit = code_point & kSurrogateMask;
98     return kMinimumLowSurrogateCodeUnit | code_unit;
99   }
100 
101   char16_t high_;
102   char16_t low_;
103 };
104 
105 }  // namespace pdfium
106 
107 #endif  // CORE_FXCRT_UTF16_H_
108