1 //===-- Format string parser for scanf -------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H 10 #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H 11 12 #include "src/__support/arg_list.h" 13 #include "src/__support/ctype_utils.h" 14 #include "src/__support/macros/config.h" 15 #include "src/__support/str_to_integer.h" 16 #include "src/stdio/scanf_core/core_structs.h" 17 #include "src/stdio/scanf_core/scanf_config.h" 18 19 #include <stddef.h> 20 21 namespace LIBC_NAMESPACE_DECL { 22 namespace scanf_core { 23 24 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 25 #define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index) 26 #else 27 #define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>() 28 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 29 30 template <typename ArgProvider> class Parser { 31 const char *__restrict str; 32 33 size_t cur_pos = 0; 34 ArgProvider args_cur; 35 36 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 37 // args_start stores the start of the va_args, which is used when a previous 38 // argument is needed. In that case, we have to read the arguments from the 39 // beginning since they don't support reading backwards. 40 ArgProvider args_start; 41 size_t args_index = 1; 42 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 43 44 public: 45 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE Parser(const char * __restrict new_str,internal::ArgList & args)46 LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) 47 : str(new_str), args_cur(args), args_start(args) {} 48 #else 49 LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) 50 : str(new_str), args_cur(args) {} 51 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 52 53 // get_next_section will parse the format string until it has a fully 54 // specified format section. This can either be a raw format section with no 55 // conversion, or a format section with a conversion that has all of its 56 // variables stored in the format section. get_next_section()57 LIBC_INLINE FormatSection get_next_section() { 58 FormatSection section; 59 size_t starting_pos = cur_pos; 60 if (str[cur_pos] == '%') { 61 // format section 62 section.has_conv = true; 63 64 ++cur_pos; 65 [[maybe_unused]] size_t conv_index = 0; 66 67 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 68 conv_index = parse_index(&cur_pos); 69 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 70 71 if (str[cur_pos] == '*') { 72 ++cur_pos; 73 section.flags = FormatFlags::NO_WRITE; 74 } 75 76 // handle width 77 section.max_width = -1; 78 if (internal::isdigit(str[cur_pos])) { 79 auto result = internal::strtointeger<int>(str + cur_pos, 10); 80 section.max_width = result.value; 81 cur_pos = cur_pos + result.parsed_len; 82 } 83 84 // TODO(michaelrj): add posix allocate flag support. 85 // if (str[cur_pos] == 'm') { 86 // ++cur_pos; 87 // section.flags = FormatFlags::ALLOCATE; 88 // } 89 90 LengthModifier lm = parse_length_modifier(&cur_pos); 91 section.length_modifier = lm; 92 93 section.conv_name = str[cur_pos]; 94 95 // If NO_WRITE is not set, then read the next arg as the output pointer. 96 if ((section.flags & FormatFlags::NO_WRITE) == 0) { 97 // Since all outputs are pointers, there's no need to distinguish when 98 // reading from va_args. They're all the same size and stored the same. 99 section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index); 100 } 101 102 // If the end of the format section is on the '\0'. This means we need to 103 // not advance the cur_pos and we should not count this has having a 104 // conversion. 105 if (str[cur_pos] != '\0') { 106 ++cur_pos; 107 } else { 108 section.has_conv = false; 109 } 110 111 // If the format is a bracketed one, then we need to parse out the insides 112 // of the brackets. 113 if (section.conv_name == '[') { 114 constexpr char CLOSING_BRACKET = ']'; 115 constexpr char INVERT_FLAG = '^'; 116 constexpr char RANGE_OPERATOR = '-'; 117 118 cpp::bitset<256> scan_set; 119 bool invert = false; 120 121 // The circumflex in the first position represents the inversion flag, 122 // but it's easier to apply that at the end so we just store it for now. 123 if (str[cur_pos] == INVERT_FLAG) { 124 invert = true; 125 ++cur_pos; 126 } 127 128 // This is used to determine if a hyphen is being used as a literal or 129 // as a range operator. 130 size_t set_start_pos = cur_pos; 131 132 // Normally the right bracket closes the set, but if it's the first 133 // character (possibly after the inversion flag) then it's instead 134 // included as a character in the set and the second right bracket 135 // closes the set. 136 if (str[cur_pos] == CLOSING_BRACKET) { 137 scan_set.set(CLOSING_BRACKET); 138 ++cur_pos; 139 } 140 141 while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) { 142 // If a hyphen is being used as a range operator, since it's neither 143 // at the beginning nor end of the set. 144 if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos && 145 str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') { 146 // Technically there is no requirement to correct the ordering of 147 // the range, but since the range operator is entirely 148 // implementation defined it seems like a good convenience. 149 char a = str[cur_pos - 1]; 150 char b = str[cur_pos + 1]; 151 char start = (a < b ? a : b); 152 char end = (a < b ? b : a); 153 scan_set.set_range(start, end); 154 cur_pos += 2; 155 } else { 156 scan_set.set(str[cur_pos]); 157 ++cur_pos; 158 } 159 } 160 if (invert) 161 scan_set.flip(); 162 163 if (str[cur_pos] == CLOSING_BRACKET) { 164 ++cur_pos; 165 section.scan_set = scan_set; 166 } else { 167 // if the end of the string was encountered, this is not a valid set. 168 section.has_conv = false; 169 } 170 } 171 } else { 172 // raw section 173 section.has_conv = false; 174 while (str[cur_pos] != '%' && str[cur_pos] != '\0') 175 ++cur_pos; 176 } 177 section.raw_string = {str + starting_pos, cur_pos - starting_pos}; 178 return section; 179 } 180 181 private: 182 // parse_length_modifier parses the length modifier inside a format string. It 183 // assumes that str[*local_pos] is inside a format specifier. It returns a 184 // LengthModifier with the length modifier it found. It will advance local_pos 185 // after the format specifier if one is found. parse_length_modifier(size_t * local_pos)186 LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) { 187 switch (str[*local_pos]) { 188 case ('l'): 189 if (str[*local_pos + 1] == 'l') { 190 *local_pos += 2; 191 return LengthModifier::ll; 192 } else { 193 ++*local_pos; 194 return LengthModifier::l; 195 } 196 case ('h'): 197 if (str[*local_pos + 1] == 'h') { 198 *local_pos += 2; 199 return LengthModifier::hh; 200 } else { 201 ++*local_pos; 202 return LengthModifier::h; 203 } 204 case ('L'): 205 ++*local_pos; 206 return LengthModifier::L; 207 case ('j'): 208 ++*local_pos; 209 return LengthModifier::j; 210 case ('z'): 211 ++*local_pos; 212 return LengthModifier::z; 213 case ('t'): 214 ++*local_pos; 215 return LengthModifier::t; 216 default: 217 return LengthModifier::NONE; 218 } 219 } 220 221 // get_next_arg_value gets the next value from the arg list as type T. get_next_arg_value()222 template <class T> LIBC_INLINE T get_next_arg_value() { 223 return args_cur.template next_var<T>(); 224 } 225 226 //---------------------------------------------------- 227 // INDEX MODE ONLY FUNCTIONS AFTER HERE: 228 //---------------------------------------------------- 229 230 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 231 232 // parse_index parses the index of a value inside a format string. It 233 // assumes that str[*local_pos] points to character after a '%' or '*', and 234 // returns 0 if there is no closing $, or if it finds no number. If it finds a 235 // number, it will move local_pos past the end of the $, else it will not move 236 // local_pos. parse_index(size_t * local_pos)237 LIBC_INLINE size_t parse_index(size_t *local_pos) { 238 if (internal::isdigit(str[*local_pos])) { 239 auto result = internal::strtointeger<int>(str + *local_pos, 10); 240 size_t index = result.value; 241 if (str[*local_pos + result.parsed_len] != '$') 242 return 0; 243 *local_pos = 1 + result.parsed_len + *local_pos; 244 return index; 245 } 246 return 0; 247 } 248 249 // get_arg_value gets the value from the arg list at index (starting at 1). 250 // This may require parsing the format string. An index of 0 is interpreted as 251 // the next value. get_arg_value(size_t index)252 template <class T> LIBC_INLINE T get_arg_value(size_t index) { 253 if (!(index == 0 || index == args_index)) 254 args_to_index(index); 255 256 ++args_index; 257 return get_next_arg_value<T>(); 258 } 259 260 // the ArgList can only return the next item in the list. This function is 261 // used in index mode when the item that needs to be read is not the next one. 262 // It moves cur_args to the index requested so the appropriate value may 263 // be read. This may involve parsing the format string, and is in the worst 264 // case an O(n^2) operation. args_to_index(size_t index)265 LIBC_INLINE void args_to_index(size_t index) { 266 if (args_index > index) { 267 args_index = 1; 268 args_cur = args_start; 269 } 270 271 while (args_index < index) { 272 // Since all arguments must be pointers, we can just read all of them as 273 // void * and not worry about type issues. 274 args_cur.template next_var<void *>(); 275 ++args_index; 276 } 277 } 278 279 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 280 }; 281 282 } // namespace scanf_core 283 } // namespace LIBC_NAMESPACE_DECL 284 285 #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H 286