xref: /aosp_15_r20/external/llvm-libc/src/stdio/scanf_core/parser.h (revision 71db0c75aadcf003ffe3238005f61d7618a3fead)
1 //===-- Format string parser for scanf -------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
10 #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
11 
12 #include "src/__support/arg_list.h"
13 #include "src/__support/ctype_utils.h"
14 #include "src/__support/macros/config.h"
15 #include "src/__support/str_to_integer.h"
16 #include "src/stdio/scanf_core/core_structs.h"
17 #include "src/stdio/scanf_core/scanf_config.h"
18 
19 #include <stddef.h>
20 
21 namespace LIBC_NAMESPACE_DECL {
22 namespace scanf_core {
23 
// GET_ARG_VAL_SIMPLEST(arg_type, index) expands to the Parser member call used
// to fetch an argument of type arg_type. When index mode is compiled out the
// second macro argument is discarded and the next argument is always read;
// otherwise the index is forwarded to get_arg_value.
#ifdef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
#define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>()
#else
#define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
29 
30 template <typename ArgProvider> class Parser {
31   const char *__restrict str;
32 
33   size_t cur_pos = 0;
34   ArgProvider args_cur;
35 
36 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
37   // args_start stores the start of the va_args, which is used when a previous
38   // argument is needed. In that case, we have to read the arguments from the
39   // beginning since they don't support reading backwards.
40   ArgProvider args_start;
41   size_t args_index = 1;
42 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
43 
44 public:
45 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
Parser(const char * __restrict new_str,internal::ArgList & args)46   LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args)
47       : str(new_str), args_cur(args), args_start(args) {}
48 #else
49   LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args)
50       : str(new_str), args_cur(args) {}
51 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
52 
53   // get_next_section will parse the format string until it has a fully
54   // specified format section. This can either be a raw format section with no
55   // conversion, or a format section with a conversion that has all of its
56   // variables stored in the format section.
get_next_section()57   LIBC_INLINE FormatSection get_next_section() {
58     FormatSection section;
59     size_t starting_pos = cur_pos;
60     if (str[cur_pos] == '%') {
61       // format section
62       section.has_conv = true;
63 
64       ++cur_pos;
65       [[maybe_unused]] size_t conv_index = 0;
66 
67 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
68       conv_index = parse_index(&cur_pos);
69 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
70 
71       if (str[cur_pos] == '*') {
72         ++cur_pos;
73         section.flags = FormatFlags::NO_WRITE;
74       }
75 
76       // handle width
77       section.max_width = -1;
78       if (internal::isdigit(str[cur_pos])) {
79         auto result = internal::strtointeger<int>(str + cur_pos, 10);
80         section.max_width = result.value;
81         cur_pos = cur_pos + result.parsed_len;
82       }
83 
84       // TODO(michaelrj): add posix allocate flag support.
85       // if (str[cur_pos] == 'm') {
86       //   ++cur_pos;
87       //   section.flags = FormatFlags::ALLOCATE;
88       // }
89 
90       LengthModifier lm = parse_length_modifier(&cur_pos);
91       section.length_modifier = lm;
92 
93       section.conv_name = str[cur_pos];
94 
95       // If NO_WRITE is not set, then read the next arg as the output pointer.
96       if ((section.flags & FormatFlags::NO_WRITE) == 0) {
97         // Since all outputs are pointers, there's no need to distinguish when
98         // reading from va_args. They're all the same size and stored the same.
99         section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index);
100       }
101 
102       // If the end of the format section is on the '\0'. This means we need to
103       // not advance the cur_pos and we should not count this has having a
104       // conversion.
105       if (str[cur_pos] != '\0') {
106         ++cur_pos;
107       } else {
108         section.has_conv = false;
109       }
110 
111       // If the format is a bracketed one, then we need to parse out the insides
112       // of the brackets.
113       if (section.conv_name == '[') {
114         constexpr char CLOSING_BRACKET = ']';
115         constexpr char INVERT_FLAG = '^';
116         constexpr char RANGE_OPERATOR = '-';
117 
118         cpp::bitset<256> scan_set;
119         bool invert = false;
120 
121         // The circumflex in the first position represents the inversion flag,
122         // but it's easier to apply that at the end so we just store it for now.
123         if (str[cur_pos] == INVERT_FLAG) {
124           invert = true;
125           ++cur_pos;
126         }
127 
128         // This is used to determine if a hyphen is being used as a literal or
129         // as a range operator.
130         size_t set_start_pos = cur_pos;
131 
132         // Normally the right bracket closes the set, but if it's the first
133         // character (possibly after the inversion flag) then it's instead
134         // included as a character in the set and the second right bracket
135         // closes the set.
136         if (str[cur_pos] == CLOSING_BRACKET) {
137           scan_set.set(CLOSING_BRACKET);
138           ++cur_pos;
139         }
140 
141         while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) {
142           // If a hyphen is being used as a range operator, since it's neither
143           // at the beginning nor end of the set.
144           if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos &&
145               str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') {
146             // Technically there is no requirement to correct the ordering of
147             // the range, but since the range operator is entirely
148             // implementation defined it seems like a good convenience.
149             char a = str[cur_pos - 1];
150             char b = str[cur_pos + 1];
151             char start = (a < b ? a : b);
152             char end = (a < b ? b : a);
153             scan_set.set_range(start, end);
154             cur_pos += 2;
155           } else {
156             scan_set.set(str[cur_pos]);
157             ++cur_pos;
158           }
159         }
160         if (invert)
161           scan_set.flip();
162 
163         if (str[cur_pos] == CLOSING_BRACKET) {
164           ++cur_pos;
165           section.scan_set = scan_set;
166         } else {
167           // if the end of the string was encountered, this is not a valid set.
168           section.has_conv = false;
169         }
170       }
171     } else {
172       // raw section
173       section.has_conv = false;
174       while (str[cur_pos] != '%' && str[cur_pos] != '\0')
175         ++cur_pos;
176     }
177     section.raw_string = {str + starting_pos, cur_pos - starting_pos};
178     return section;
179   }
180 
181 private:
182   // parse_length_modifier parses the length modifier inside a format string. It
183   // assumes that str[*local_pos] is inside a format specifier. It returns a
184   // LengthModifier with the length modifier it found. It will advance local_pos
185   // after the format specifier if one is found.
parse_length_modifier(size_t * local_pos)186   LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) {
187     switch (str[*local_pos]) {
188     case ('l'):
189       if (str[*local_pos + 1] == 'l') {
190         *local_pos += 2;
191         return LengthModifier::ll;
192       } else {
193         ++*local_pos;
194         return LengthModifier::l;
195       }
196     case ('h'):
197       if (str[*local_pos + 1] == 'h') {
198         *local_pos += 2;
199         return LengthModifier::hh;
200       } else {
201         ++*local_pos;
202         return LengthModifier::h;
203       }
204     case ('L'):
205       ++*local_pos;
206       return LengthModifier::L;
207     case ('j'):
208       ++*local_pos;
209       return LengthModifier::j;
210     case ('z'):
211       ++*local_pos;
212       return LengthModifier::z;
213     case ('t'):
214       ++*local_pos;
215       return LengthModifier::t;
216     default:
217       return LengthModifier::NONE;
218     }
219   }
220 
221   // get_next_arg_value gets the next value from the arg list as type T.
get_next_arg_value()222   template <class T> LIBC_INLINE T get_next_arg_value() {
223     return args_cur.template next_var<T>();
224   }
225 
226   //----------------------------------------------------
227   // INDEX MODE ONLY FUNCTIONS AFTER HERE:
228   //----------------------------------------------------
229 
230 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
231 
232   // parse_index parses the index of a value inside a format string. It
233   // assumes that str[*local_pos] points to character after a '%' or '*', and
234   // returns 0 if there is no closing $, or if it finds no number. If it finds a
235   // number, it will move local_pos past the end of the $, else it will not move
236   // local_pos.
parse_index(size_t * local_pos)237   LIBC_INLINE size_t parse_index(size_t *local_pos) {
238     if (internal::isdigit(str[*local_pos])) {
239       auto result = internal::strtointeger<int>(str + *local_pos, 10);
240       size_t index = result.value;
241       if (str[*local_pos + result.parsed_len] != '$')
242         return 0;
243       *local_pos = 1 + result.parsed_len + *local_pos;
244       return index;
245     }
246     return 0;
247   }
248 
249   // get_arg_value gets the value from the arg list at index (starting at 1).
250   // This may require parsing the format string. An index of 0 is interpreted as
251   // the next value.
get_arg_value(size_t index)252   template <class T> LIBC_INLINE T get_arg_value(size_t index) {
253     if (!(index == 0 || index == args_index))
254       args_to_index(index);
255 
256     ++args_index;
257     return get_next_arg_value<T>();
258   }
259 
260   // the ArgList can only return the next item in the list. This function is
261   // used in index mode when the item that needs to be read is not the next one.
262   // It moves cur_args to the index requested so the appropriate value may
263   // be read. This may involve parsing the format string, and is in the worst
264   // case an O(n^2) operation.
args_to_index(size_t index)265   LIBC_INLINE void args_to_index(size_t index) {
266     if (args_index > index) {
267       args_index = 1;
268       args_cur = args_start;
269     }
270 
271     while (args_index < index) {
272       // Since all arguments must be pointers, we can just read all of them as
273       // void * and not worry about type issues.
274       args_cur.template next_var<void *>();
275       ++args_index;
276     }
277   }
278 
279 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
280 };
281 
282 } // namespace scanf_core
283 } // namespace LIBC_NAMESPACE_DECL
284 
285 #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
286