1 // tokeniser_helper.hpp 2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See accompanying 5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6 #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HELPER_HPP 7 #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HELPER_HPP 8 9 #include "../../char_traits.hpp" 10 // strlen() 11 #include <cstring> 12 #include "../../size_t.hpp" 13 #include "re_tokeniser_state.hpp" 14 15 namespace boost 16 { 17 namespace lexer 18 { 19 namespace detail 20 { 21 template<typename CharT, typename Traits = char_traits<CharT> > 22 class basic_re_tokeniser_helper 23 { 24 public: 25 typedef basic_re_tokeniser_state<CharT> state; 26 typedef std::basic_string<CharT> string; 27 escape_sequence(state & state_,CharT & ch_,std::size_t & str_len_)28 static const CharT *escape_sequence (state &state_, CharT &ch_, 29 std::size_t &str_len_) 30 { 31 bool eos_ = state_.eos (); 32 33 if (eos_) 34 { 35 throw runtime_error ("Unexpected end of regex " 36 "following '\\'."); 37 } 38 39 const CharT *str_ = charset_shortcut (*state_._curr, str_len_); 40 41 if (str_) 42 { 43 state_.increment (); 44 } 45 else 46 { 47 ch_ = chr (state_); 48 } 49 50 return str_; 51 } 52 53 // This function can call itself. charset(state & state_,string & chars_,bool & negated_)54 static void charset (state &state_, string &chars_, bool &negated_) 55 { 56 CharT ch_ = 0; 57 bool eos_ = state_.next (ch_); 58 59 if (eos_) 60 { 61 // Pointless returning index if at end of string 62 throw runtime_error ("Unexpected end of regex " 63 "following '['."); 64 } 65 66 negated_ = ch_ == '^'; 67 68 if (negated_) 69 { 70 eos_ = state_.next (ch_); 71 72 if (eos_) 73 { 74 // Pointless returning index if at end of string 75 throw runtime_error ("Unexpected end of regex " 76 "following '^'."); 77 } 78 } 79 80 bool chset_ = false; 81 CharT prev_ = 0; 82 83 while (ch_ != ']') 84 { 85 if (ch_ == '\\') 86 { 87 std::size_t str_len_ = 0; 88 const CharT *str_ = escape_sequence (state_, prev_, str_len_); 89 90 chset_ = str_ != 0; 91 92 if (chset_) 93 { 94 state temp_state_ (str_ + 1, str_ + str_len_, 95 state_._flags, state_._locale); 96 string temp_chars_; 97 bool temp_negated_ = false; 98 99 charset (temp_state_, temp_chars_, temp_negated_); 100 101 if (negated_ != temp_negated_) 102 { 103 std::ostringstream ss_; 104 105 ss_ << "Mismatch in charset negation preceding " 106 "index " << state_.index () << '.'; 107 throw runtime_error (ss_.str ().c_str ()); 108 } 109 110 chars_ += temp_chars_; 111 } 112 } 113 /* 114 else if (ch_ == '[' && !state_.eos () && *state_._curr == ':') 115 { 116 // TODO: POSIX charsets 117 } 118 */ 119 else 120 { 121 chset_ = false; 122 prev_ = ch_; 123 } 124 125 eos_ = state_.next (ch_); 126 127 // Covers preceding if, else if and else 128 if (eos_) 129 { 130 // Pointless returning index if at end of string 131 throw runtime_error ("Unexpected end of regex " 132 "(missing ']')."); 133 } 134 135 if (ch_ == '-') 136 { 137 charset_range (chset_, state_, eos_, ch_, prev_, chars_); 138 } 139 else if (!chset_) 140 { 141 if ((state_._flags & icase) && 142 (std::isupper (prev_, state_._locale) || 143 std::islower (prev_, state_._locale))) 144 { 145 CharT upper_ = std::toupper (prev_, state_._locale); 146 CharT lower_ = std::tolower (prev_, state_._locale); 147 148 chars_ += upper_; 149 chars_ += lower_; 150 } 151 else 152 { 153 chars_ += prev_; 154 } 155 } 156 } 157 158 if (!negated_ && chars_.empty ()) 159 { 160 throw runtime_error ("Empty charsets not allowed."); 161 } 162 } 163 chr(state & state_)164 static CharT chr (state &state_) 165 { 166 CharT ch_ = 0; 167 168 // eos_ has already been checked for. 169 switch (*state_._curr) 170 { 171 case '0': 172 case '1': 173 case '2': 174 case '3': 175 case '4': 176 case '5': 177 case '6': 178 case '7': 179 ch_ = decode_octal (state_); 180 break; 181 case 'a': 182 ch_ = '\a'; 183 state_.increment (); 184 break; 185 case 'b': 186 ch_ = '\b'; 187 state_.increment (); 188 break; 189 case 'c': 190 ch_ = decode_control_char (state_); 191 break; 192 case 'e': 193 ch_ = 27; // '\e' not recognised by compiler 194 state_.increment (); 195 break; 196 case 'f': 197 ch_ = '\f'; 198 state_.increment (); 199 break; 200 case 'n': 201 ch_ = '\n'; 202 state_.increment (); 203 break; 204 case 'r': 205 ch_ = '\r'; 206 state_.increment (); 207 break; 208 case 't': 209 ch_ = '\t'; 210 state_.increment (); 211 break; 212 case 'v': 213 ch_ = '\v'; 214 state_.increment (); 215 break; 216 case 'x': 217 ch_ = decode_hex (state_); 218 break; 219 default: 220 ch_ = *state_._curr; 221 state_.increment (); 222 break; 223 } 224 225 return ch_; 226 } 227 228 private: charset_shortcut(const char ch_,std::size_t & str_len_)229 static const char *charset_shortcut (const char ch_, 230 std::size_t &str_len_) 231 { 232 const char *str_ = 0; 233 234 switch (ch_) 235 { 236 case 'd': 237 str_ = "[0-9]"; 238 break; 239 case 'D': 240 str_ = "[^0-9]"; 241 break; 242 case 's': 243 str_ = "[ \t\n\r\f\v]"; 244 break; 245 case 'S': 246 str_ = "[^ \t\n\r\f\v]"; 247 break; 248 case 'w': 249 str_ = "[_0-9A-Za-z]"; 250 break; 251 case 'W': 252 str_ = "[^_0-9A-Za-z]"; 253 break; 254 } 255 256 if (str_) 257 { 258 // Some systems have strlen in namespace std. 259 using namespace std; 260 261 str_len_ = strlen (str_); 262 } 263 else 264 { 265 str_len_ = 0; 266 } 267 268 return str_; 269 } 270 charset_shortcut(const wchar_t ch_,std::size_t & str_len_)271 static const wchar_t *charset_shortcut (const wchar_t ch_, 272 std::size_t &str_len_) 273 { 274 const wchar_t *str_ = 0; 275 276 switch (ch_) 277 { 278 case 'd': 279 str_ = L"[0-9]"; 280 break; 281 case 'D': 282 str_ = L"[^0-9]"; 283 break; 284 case 's': 285 str_ = L"[ \t\n\r\f\v]"; 286 break; 287 case 'S': 288 str_ = L"[^ \t\n\r\f\v]"; 289 break; 290 case 'w': 291 str_ = L"[_0-9A-Za-z]"; 292 break; 293 case 'W': 294 str_ = L"[^_0-9A-Za-z]"; 295 break; 296 } 297 298 if (str_) 299 { 300 // Some systems have wcslen in namespace std. 301 using namespace std; 302 303 str_len_ = wcslen (str_); 304 } 305 else 306 { 307 str_len_ = 0; 308 } 309 310 return str_; 311 } 312 decode_octal(state & state_)313 static CharT decode_octal (state &state_) 314 { 315 std::size_t accumulator_ = 0; 316 CharT ch_ = *state_._curr; 317 unsigned short count_ = 3; 318 bool eos_ = false; 319 320 for (;;) 321 { 322 accumulator_ *= 8; 323 accumulator_ += ch_ - '0'; 324 --count_; 325 state_.increment (); 326 eos_ = state_.eos (); 327 328 if (!count_ || eos_) break; 329 330 ch_ = *state_._curr; 331 332 // Don't consume invalid chars! 333 if (ch_ < '0' || ch_ > '7') 334 { 335 break; 336 } 337 } 338 339 return static_cast<CharT> (accumulator_); 340 } 341 decode_control_char(state & state_)342 static CharT decode_control_char (state &state_) 343 { 344 // Skip over 'c' 345 state_.increment (); 346 347 CharT ch_ = 0; 348 bool eos_ = state_.next (ch_); 349 350 if (eos_) 351 { 352 // Pointless returning index if at end of string 353 throw runtime_error ("Unexpected end of regex following \\c."); 354 } 355 else 356 { 357 if (ch_ >= 'a' && ch_ <= 'z') 358 { 359 ch_ -= 'a' - 1; 360 } 361 else if (ch_ >= 'A' && ch_ <= 'Z') 362 { 363 ch_ -= 'A' - 1; 364 } 365 else if (ch_ == '@') 366 { 367 // Apparently... 368 ch_ = 0; 369 } 370 else 371 { 372 std::ostringstream ss_; 373 374 ss_ << "Invalid control char at index " << 375 state_.index () - 1 << '.'; 376 throw runtime_error (ss_.str ().c_str ()); 377 } 378 } 379 380 return ch_; 381 } 382 decode_hex(state & state_)383 static CharT decode_hex (state &state_) 384 { 385 // Skip over 'x' 386 state_.increment (); 387 388 CharT ch_ = 0; 389 bool eos_ = state_.next (ch_); 390 391 if (eos_) 392 { 393 // Pointless returning index if at end of string 394 throw runtime_error ("Unexpected end of regex following \\x."); 395 } 396 397 if (!((ch_ >= '0' && ch_ <= '9') || (ch_ >= 'a' && ch_ <= 'f') || 398 (ch_ >= 'A' && ch_ <= 'F'))) 399 { 400 std::ostringstream ss_; 401 402 ss_ << "Illegal char following \\x at index " << 403 state_.index () - 1 << '.'; 404 throw runtime_error (ss_.str ().c_str ()); 405 } 406 407 std::size_t hex_ = 0; 408 409 do 410 { 411 hex_ *= 16; 412 413 if (ch_ >= '0' && ch_ <= '9') 414 { 415 hex_ += ch_ - '0'; 416 } 417 else if (ch_ >= 'a' && ch_ <= 'f') 418 { 419 hex_ += 10 + (ch_ - 'a'); 420 } 421 else 422 { 423 hex_ += 10 + (ch_ - 'A'); 424 } 425 426 eos_ = state_.eos (); 427 428 if (!eos_) 429 { 430 ch_ = *state_._curr; 431 432 // Don't consume invalid chars! 433 if (((ch_ >= '0' && ch_ <= '9') || 434 (ch_ >= 'a' && ch_ <= 'f') || (ch_ >= 'A' && ch_ <= 'F'))) 435 { 436 state_.increment (); 437 } 438 else 439 { 440 eos_ = true; 441 } 442 } 443 } while (!eos_); 444 445 return static_cast<CharT> (hex_); 446 } 447 charset_range(const bool chset_,state & state_,bool & eos_,CharT & ch_,const CharT prev_,string & chars_)448 static void charset_range (const bool chset_, state &state_, bool &eos_, 449 CharT &ch_, const CharT prev_, string &chars_) 450 { 451 if (chset_) 452 { 453 std::ostringstream ss_; 454 455 ss_ << "Charset cannot form start of range preceding " 456 "index " << state_.index () - 1 << '.'; 457 throw runtime_error (ss_.str ().c_str ()); 458 } 459 460 eos_ = state_.next (ch_); 461 462 if (eos_) 463 { 464 // Pointless returning index if at end of string 465 throw runtime_error ("Unexpected end of regex " 466 "following '-'."); 467 } 468 469 CharT curr_ = 0; 470 471 if (ch_ == '\\') 472 { 473 std::size_t str_len_ = 0; 474 475 if (escape_sequence (state_, curr_, str_len_)) 476 { 477 std::ostringstream ss_; 478 479 ss_ << "Charset cannot form end of range preceding index " 480 << state_.index () << '.'; 481 throw runtime_error (ss_.str ().c_str ()); 482 } 483 } 484 /* 485 else if (ch_ == '[' && !state_.eos () && *state_._curr == ':') 486 { 487 std::ostringstream ss_; 488 489 ss_ << "POSIX char class cannot form end of range at " 490 "index " << state_.index () - 1 << '.'; 491 throw runtime_error (ss_.str ().c_str ()); 492 } 493 */ 494 else 495 { 496 curr_ = ch_; 497 } 498 499 eos_ = state_.next (ch_); 500 501 // Covers preceding if and else 502 if (eos_) 503 { 504 // Pointless returning index if at end of string 505 throw runtime_error ("Unexpected end of regex " 506 "(missing ']')."); 507 } 508 509 std::size_t start_ = static_cast<typename Traits::index_type> (prev_); 510 std::size_t end_ = static_cast<typename Traits::index_type> (curr_); 511 512 // Semanic check 513 if (end_ < start_) 514 { 515 std::ostringstream ss_; 516 517 ss_ << "Invalid range in charset preceding index " << 518 state_.index () - 1 << '.'; 519 throw runtime_error (ss_.str ().c_str ()); 520 } 521 522 chars_.reserve (chars_.size () + (end_ + 1 - start_)); 523 524 for (; start_ <= end_; ++start_) 525 { 526 CharT ch_ = static_cast<CharT> (start_); 527 528 if ((state_._flags & icase) && 529 (std::isupper (ch_, state_._locale) || 530 std::islower (ch_, state_._locale))) 531 { 532 CharT upper_ = std::toupper (ch_, state_._locale); 533 CharT lower_ = std::tolower (ch_, state_._locale); 534 535 chars_ += (upper_); 536 chars_ += (lower_); 537 } 538 else 539 { 540 chars_ += (ch_); 541 } 542 } 543 } 544 }; 545 } 546 } 547 } 548 549 #endif 550