xref: /aosp_15_r20/external/angle/third_party/abseil-cpp/absl/strings/ascii.cc (revision 8975f5c5ed3d1c378011245431ada316dfb6f244)
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/ascii.h"
16 
17 #include <climits>
18 #include <cstddef>
19 #include <cstring>
20 #include <string>
21 
22 #include "absl/base/attributes.h"
23 #include "absl/base/config.h"
24 #include "absl/base/nullability.h"
25 #include "absl/base/optimization.h"
26 
27 namespace absl {
28 ABSL_NAMESPACE_BEGIN
29 namespace ascii_internal {
30 
31 // # Table generated by this Python code (bit 0x02 is currently unused):
32 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
33 
34 // NOTE: The kAsciiPropertyBits table used within this code was generated by
35 // Python code of the following form. (Bit 0x02 is currently unused and
36 // available.)
37 //
38 // def Hex2(n):
39 //   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
40 // def IsPunct(ch):
41 //   return (ord(ch) >= 32 and ord(ch) < 127 and
42 //           not ch.isspace() and not ch.isalnum())
43 // def IsBlank(ch):
44 //   return ch in ' \t'
45 // def IsCntrl(ch):
46 //   return ord(ch) < 32 or ord(ch) == 127
47 // def IsXDigit(ch):
48 //   return ch.isdigit() or ch.lower() in 'abcdef'
49 // for i in range(128):
50 //   ch = chr(i)
51 //   mask = ((ch.isalpha() and 0x01 or 0) |
52 //           (ch.isalnum() and 0x04 or 0) |
53 //           (ch.isspace() and 0x08 or 0) |
54 //           (IsPunct(ch) and 0x10 or 0) |
55 //           (IsBlank(ch) and 0x20 or 0) |
56 //           (IsCntrl(ch) and 0x40 or 0) |
57 //           (IsXDigit(ch) and 0x80 or 0))
58 //   print Hex2(mask) + ',',
59 //   if i % 16 == 7:
60 //     print ' //', Hex2(i & 0x78)
61 //   elif i % 16 == 15:
62 //     print
63 
64 // clang-format off
65 // Array of bitfields holding character information. Each bit value corresponds
66 // to a particular character feature. For readability, and because the value
67 // of these bits is tightly coupled to this implementation, the individual bits
68 // are not named. Note that bitfields for all characters above ASCII 127 are
69 // zero-initialized.
70 ABSL_DLL const unsigned char kPropertyBits[256] = {
71     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
72     0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
73     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
74     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
75     0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
76     0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
77     0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
78     0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
79     0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
80     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
81     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
82     0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
83     0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
84     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
85     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
86     0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
87 };
88 
89 // Array of characters for the ascii_tolower() function. For values 'A'
90 // through 'Z', return the lower-case character; otherwise, return the
91 // identity of the passed character.
92 ABSL_DLL const char kToLower[256] = {
93   '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
94   '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
95   '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
96   '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
97   '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
98   '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
99   '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
100   '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
101   '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
102      'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
103      'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
104      'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
105   '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
106   '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
107   '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
108   '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
109   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
110   '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
111   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
112   '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
113   '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
114   '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
115   '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
116   '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
117   '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
118   '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
119   '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
120   '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
121   '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
122   '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
123   '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
124   '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
125 };
126 
127 // Array of characters for the ascii_toupper() function. For values 'a'
128 // through 'z', return the upper-case character; otherwise, return the
129 // identity of the passed character.
130 ABSL_DLL const char kToUpper[256] = {
131   '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
132   '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
133   '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
134   '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
135   '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
136   '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
137   '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
138   '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
139   '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
140   '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
141   '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
142   '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
143   '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
144      'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
145      'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
146      'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
147   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
148   '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
149   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
150   '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
151   '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
152   '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
153   '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
154   '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
155   '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
156   '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
157   '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
158   '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
159   '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
160   '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
161   '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
162   '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
163 };
164 // clang-format on
165 
166 // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
167 // Implemented by:
168 //  1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
169 //  2. Comparing to SCHAR_MIN + 26.
170 template <bool ToUpper>
AsciiInAZRange(unsigned char c)171 constexpr bool AsciiInAZRange(unsigned char c) {
172   constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
173   constexpr signed char threshold = SCHAR_MIN + 26;  // 26 = alphabet size.
174   // Using unsigned arithmetic as overflows/underflows are well defined.
175   unsigned char u = c - sub;
176   // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
177   return static_cast<signed char>(u) < threshold;
178 }
179 
180 // Force-inline so the compiler won't merge the short and long implementations.
181 // `src` may be null iff `size` is zero.
182 template <bool ToUpper>
AsciiStrCaseFoldImpl(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t size)183 ABSL_ATTRIBUTE_ALWAYS_INLINE inline constexpr void AsciiStrCaseFoldImpl(
184     absl::Nonnull<char*> dst, absl::Nullable<const char*> src, size_t size) {
185   // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
186   // When we need to flip the case, we can xor with this bit to achieve the
187   // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
188   // could have chosen 'z' and 'Z', or any other pair of characters as they all
189   // have the same single bit difference.
190   constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
191 
192   for (size_t i = 0; i < size; ++i) {
193     unsigned char v = static_cast<unsigned char>(src[i]);
194     v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
195     dst[i] = static_cast<char>(v);
196   }
197 }
198 
199 // The string size threshold for starting using the long string version.
200 constexpr size_t kCaseFoldThreshold = 16;
201 
202 // No-inline so the compiler won't merge the short and long implementations.
203 // `src` may be null iff `size` is zero.
204 template <bool ToUpper>
AsciiStrCaseFoldLong(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t size)205 ABSL_ATTRIBUTE_NOINLINE constexpr void AsciiStrCaseFoldLong(
206     absl::Nonnull<char*> dst, absl::Nullable<const char*> src, size_t size) {
207   ABSL_ASSUME(size >= kCaseFoldThreshold);
208   AsciiStrCaseFoldImpl<ToUpper>(dst, src, size);
209 }
210 
211 // Splitting to short and long strings to allow vectorization decisions
212 // to be made separately in the long and short cases.
213 // `src` may be null iff `size` is zero.
214 template <bool ToUpper>
AsciiStrCaseFold(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t size)215 constexpr void AsciiStrCaseFold(absl::Nonnull<char*> dst,
216                                 absl::Nullable<const char*> src, size_t size) {
217   size < kCaseFoldThreshold ? AsciiStrCaseFoldImpl<ToUpper>(dst, src, size)
218                             : AsciiStrCaseFoldLong<ToUpper>(dst, src, size);
219 }
220 
AsciiStrToLower(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t n)221 void AsciiStrToLower(absl::Nonnull<char*> dst, absl::Nullable<const char*> src,
222                      size_t n) {
223   return AsciiStrCaseFold<false>(dst, src, n);
224 }
225 
AsciiStrToUpper(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t n)226 void AsciiStrToUpper(absl::Nonnull<char*> dst, absl::Nullable<const char*> src,
227                      size_t n) {
228   return AsciiStrCaseFold<true>(dst, src, n);
229 }
230 
ValidateAsciiCasefold()231 static constexpr size_t ValidateAsciiCasefold() {
232   constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
233   size_t incorrect_index = 0;
234   char lowered[num_chars] = {};
235   char uppered[num_chars] = {};
236   for (unsigned int i = 0; i < num_chars; ++i) {
237     uppered[i] = lowered[i] = static_cast<char>(i);
238   }
239   AsciiStrCaseFold<false>(&lowered[0], &lowered[0], num_chars);
240   AsciiStrCaseFold<true>(&uppered[0], &uppered[0], num_chars);
241   for (size_t i = 0; i < num_chars; ++i) {
242     const char ch = static_cast<char>(i),
243                ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
244                ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
245     if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
246       incorrect_index = i > 0 ? i : num_chars;
247       break;
248     }
249   }
250   return incorrect_index;
251 }
252 
253 static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
254 
255 }  // namespace ascii_internal
256 
AsciiStrToLower(absl::Nonnull<std::string * > s)257 void AsciiStrToLower(absl::Nonnull<std::string*> s) {
258   char* p = &(*s)[0];
259   return ascii_internal::AsciiStrCaseFold<false>(p, p, s->size());
260 }
261 
AsciiStrToUpper(absl::Nonnull<std::string * > s)262 void AsciiStrToUpper(absl::Nonnull<std::string*> s) {
263   char* p = &(*s)[0];
264   return ascii_internal::AsciiStrCaseFold<true>(p, p, s->size());
265 }
266 
RemoveExtraAsciiWhitespace(absl::Nonnull<std::string * > str)267 void RemoveExtraAsciiWhitespace(absl::Nonnull<std::string*> str) {
268   auto stripped = StripAsciiWhitespace(*str);
269 
270   if (stripped.empty()) {
271     str->clear();
272     return;
273   }
274 
275   auto input_it = stripped.begin();
276   auto input_end = stripped.end();
277   auto output_it = &(*str)[0];
278   bool is_ws = false;
279 
280   for (; input_it < input_end; ++input_it) {
281     if (is_ws) {
282       // Consecutive whitespace?  Keep only the last.
283       is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
284       if (is_ws) --output_it;
285     } else {
286       is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
287     }
288 
289     *output_it = *input_it;
290     ++output_it;
291   }
292 
293   str->erase(static_cast<size_t>(output_it - &(*str)[0]));
294 }
295 
296 ABSL_NAMESPACE_END
297 }  // namespace absl
298