1 // tokeniser_helper.hpp
2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HELPER_HPP
7 #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HELPER_HPP
8 
9 #include "../../char_traits.hpp"
10 // strlen()
11 #include <cstring>
12 #include "../../size_t.hpp"
13 #include "re_tokeniser_state.hpp"
14 
15 namespace boost
16 {
17 namespace lexer
18 {
19 namespace detail
20 {
21 template<typename CharT, typename Traits = char_traits<CharT> >
22 class basic_re_tokeniser_helper
23 {
24 public:
25     typedef basic_re_tokeniser_state<CharT> state;
26     typedef std::basic_string<CharT> string;
27 
escape_sequence(state & state_,CharT & ch_,std::size_t & str_len_)28     static const CharT *escape_sequence (state &state_, CharT &ch_,
29         std::size_t &str_len_)
30     {
31         bool eos_ = state_.eos ();
32 
33         if (eos_)
34         {
35             throw runtime_error ("Unexpected end of regex "
36                 "following '\\'.");
37         }
38 
39         const CharT *str_ = charset_shortcut (*state_._curr, str_len_);
40 
41         if (str_)
42         {
43             state_.increment ();
44         }
45         else
46         {
47             ch_ = chr (state_);
48         }
49 
50         return str_;
51     }
52 
53     // This function can call itself.
charset(state & state_,string & chars_,bool & negated_)54     static void charset (state &state_, string &chars_, bool &negated_)
55     {
56         CharT ch_ = 0;
57         bool eos_ = state_.next (ch_);
58 
59         if (eos_)
60         {
61             // Pointless returning index if at end of string
62             throw runtime_error ("Unexpected end of regex "
63                 "following '['.");
64         }
65 
66         negated_ = ch_ == '^';
67 
68         if (negated_)
69         {
70             eos_ = state_.next (ch_);
71 
72             if (eos_)
73             {
74                 // Pointless returning index if at end of string
75                 throw runtime_error ("Unexpected end of regex "
76                     "following '^'.");
77             }
78         }
79 
80         bool chset_ = false;
81         CharT prev_ = 0;
82 
83         while (ch_ != ']')
84         {
85             if (ch_ == '\\')
86             {
87                 std::size_t str_len_ = 0;
88                 const CharT *str_ = escape_sequence (state_, prev_, str_len_);
89 
90                 chset_ = str_ != 0;
91 
92                 if (chset_)
93                 {
94                     state temp_state_ (str_ + 1, str_ + str_len_,
95                         state_._flags, state_._locale);
96                     string temp_chars_;
97                     bool temp_negated_ = false;
98 
99                     charset (temp_state_, temp_chars_, temp_negated_);
100 
101                     if (negated_ != temp_negated_)
102                     {
103                         std::ostringstream ss_;
104 
105                         ss_ << "Mismatch in charset negation preceding "
106                             "index " << state_.index () << '.';
107                         throw runtime_error (ss_.str ().c_str ());
108                     }
109 
110                     chars_ += temp_chars_;
111                 }
112             }
113 /*
114             else if (ch_ == '[' && !state_.eos () && *state_._curr == ':')
115             {
116                 // TODO: POSIX charsets
117             }
118 */
119             else
120             {
121                 chset_ = false;
122                 prev_ = ch_;
123             }
124 
125             eos_ = state_.next (ch_);
126 
127             // Covers preceding if, else if and else
128             if (eos_)
129             {
130                 // Pointless returning index if at end of string
131                 throw runtime_error ("Unexpected end of regex "
132                     "(missing ']').");
133             }
134 
135             if (ch_ == '-')
136             {
137                 charset_range (chset_, state_, eos_, ch_, prev_, chars_);
138             }
139             else if (!chset_)
140             {
141                 if ((state_._flags & icase) &&
142                     (std::isupper (prev_, state_._locale) ||
143                     std::islower (prev_, state_._locale)))
144                 {
145                     CharT upper_ = std::toupper (prev_, state_._locale);
146                     CharT lower_ = std::tolower (prev_, state_._locale);
147 
148                     chars_ += upper_;
149                     chars_ += lower_;
150                 }
151                 else
152                 {
153                     chars_ += prev_;
154                 }
155             }
156         }
157 
158         if (!negated_ && chars_.empty ())
159         {
160             throw runtime_error ("Empty charsets not allowed.");
161         }
162     }
163 
chr(state & state_)164     static CharT chr (state &state_)
165     {
166         CharT ch_ = 0;
167 
168         // eos_ has already been checked for.
169         switch (*state_._curr)
170         {
171             case '0':
172             case '1':
173             case '2':
174             case '3':
175             case '4':
176             case '5':
177             case '6':
178             case '7':
179                 ch_ = decode_octal (state_);
180                 break;
181             case 'a':
182                 ch_ = '\a';
183                 state_.increment ();
184                 break;
185             case 'b':
186                 ch_ = '\b';
187                 state_.increment ();
188                 break;
189             case 'c':
190                 ch_ = decode_control_char (state_);
191                 break;
192             case 'e':
193                 ch_ = 27; // '\e' not recognised by compiler
194                 state_.increment ();
195                 break;
196             case 'f':
197                 ch_ = '\f';
198                 state_.increment ();
199                 break;
200             case 'n':
201                 ch_ = '\n';
202                 state_.increment ();
203                 break;
204             case 'r':
205                 ch_ = '\r';
206                 state_.increment ();
207                 break;
208             case 't':
209                 ch_ = '\t';
210                 state_.increment ();
211                 break;
212             case 'v':
213                 ch_ = '\v';
214                 state_.increment ();
215                 break;
216             case 'x':
217                 ch_ = decode_hex (state_);
218                 break;
219             default:
220                 ch_ = *state_._curr;
221                 state_.increment ();
222                 break;
223         }
224 
225         return ch_;
226     }
227 
228 private:
// Maps a shortcut letter (d, D, s, S, w, W) to the bracket expression it
// abbreviates, for narrow characters.
//
// Returns the expansion text and stores its length in str_len_; for any
// other character returns a null pointer and stores 0.
static const char *charset_shortcut (const char ch_,
    std::size_t &str_len_)
{
    // Some systems have strlen in namespace std.
    using namespace std;

    const char *expansion_ = 0;

    if (ch_ == 'd')
    {
        expansion_ = "[0-9]";
    }
    else if (ch_ == 'D')
    {
        expansion_ = "[^0-9]";
    }
    else if (ch_ == 's')
    {
        expansion_ = "[ \t\n\r\f\v]";
    }
    else if (ch_ == 'S')
    {
        expansion_ = "[^ \t\n\r\f\v]";
    }
    else if (ch_ == 'w')
    {
        expansion_ = "[_0-9A-Za-z]";
    }
    else if (ch_ == 'W')
    {
        expansion_ = "[^_0-9A-Za-z]";
    }

    str_len_ = expansion_ ? strlen (expansion_) : 0;
    return expansion_;
}
270 
// Maps a shortcut letter (d, D, s, S, w, W) to the bracket expression it
// abbreviates, for wide characters.
//
// Returns the expansion text and stores its length in str_len_; for any
// other character returns a null pointer and stores 0.
static const wchar_t *charset_shortcut (const wchar_t ch_,
    std::size_t &str_len_)
{
    // Some systems have wcslen in namespace std.
    using namespace std;

    const wchar_t *expansion_ = 0;

    if (ch_ == 'd')
    {
        expansion_ = L"[0-9]";
    }
    else if (ch_ == 'D')
    {
        expansion_ = L"[^0-9]";
    }
    else if (ch_ == 's')
    {
        expansion_ = L"[ \t\n\r\f\v]";
    }
    else if (ch_ == 'S')
    {
        expansion_ = L"[^ \t\n\r\f\v]";
    }
    else if (ch_ == 'w')
    {
        expansion_ = L"[_0-9A-Za-z]";
    }
    else if (ch_ == 'W')
    {
        expansion_ = L"[^_0-9A-Za-z]";
    }

    str_len_ = expansion_ ? wcslen (expansion_) : 0;
    return expansion_;
}
312 
decode_octal(state & state_)313     static CharT decode_octal (state &state_)
314     {
315         std::size_t accumulator_ = 0;
316         CharT ch_ = *state_._curr;
317         unsigned short count_ = 3;
318         bool eos_ = false;
319 
320         for (;;)
321         {
322             accumulator_ *= 8;
323             accumulator_ += ch_ - '0';
324             --count_;
325             state_.increment ();
326             eos_ = state_.eos ();
327 
328             if (!count_ || eos_) break;
329 
330             ch_ = *state_._curr;
331 
332             // Don't consume invalid chars!
333             if (ch_ < '0' || ch_ > '7')
334             {
335                 break;
336             }
337         }
338 
339         return static_cast<CharT> (accumulator_);
340     }
341 
decode_control_char(state & state_)342     static CharT decode_control_char (state &state_)
343     {
344         // Skip over 'c'
345         state_.increment ();
346 
347         CharT ch_ = 0;
348         bool eos_ = state_.next (ch_);
349 
350         if (eos_)
351         {
352             // Pointless returning index if at end of string
353             throw runtime_error ("Unexpected end of regex following \\c.");
354         }
355         else
356         {
357             if (ch_ >= 'a' && ch_ <= 'z')
358             {
359                 ch_ -= 'a' - 1;
360             }
361             else if (ch_ >= 'A' && ch_ <= 'Z')
362             {
363                 ch_ -= 'A' - 1;
364             }
365             else if (ch_ == '@')
366             {
367                 // Apparently...
368                 ch_ = 0;
369             }
370             else
371             {
372                 std::ostringstream ss_;
373 
374                 ss_ << "Invalid control char at index " <<
375                     state_.index () - 1 << '.';
376                 throw runtime_error (ss_.str ().c_str ());
377             }
378         }
379 
380         return ch_;
381     }
382 
decode_hex(state & state_)383     static CharT decode_hex (state &state_)
384     {
385         // Skip over 'x'
386         state_.increment ();
387 
388         CharT ch_ = 0;
389         bool eos_ = state_.next (ch_);
390 
391         if (eos_)
392         {
393             // Pointless returning index if at end of string
394             throw runtime_error ("Unexpected end of regex following \\x.");
395         }
396 
397         if (!((ch_ >= '0' && ch_ <= '9') || (ch_ >= 'a' && ch_ <= 'f') ||
398             (ch_ >= 'A' && ch_ <= 'F')))
399         {
400             std::ostringstream ss_;
401 
402             ss_ << "Illegal char following \\x at index " <<
403                 state_.index () - 1 << '.';
404             throw runtime_error (ss_.str ().c_str ());
405         }
406 
407         std::size_t hex_ = 0;
408 
409         do
410         {
411             hex_ *= 16;
412 
413             if (ch_ >= '0' && ch_ <= '9')
414             {
415                 hex_ += ch_ - '0';
416             }
417             else if (ch_ >= 'a' && ch_ <= 'f')
418             {
419                 hex_ += 10 + (ch_ - 'a');
420             }
421             else
422             {
423                 hex_ += 10 + (ch_ - 'A');
424             }
425 
426             eos_ = state_.eos ();
427 
428             if (!eos_)
429             {
430                 ch_ = *state_._curr;
431 
432                 // Don't consume invalid chars!
433                 if (((ch_ >= '0' && ch_ <= '9') ||
434                     (ch_ >= 'a' && ch_ <= 'f') || (ch_ >= 'A' && ch_ <= 'F')))
435                 {
436                     state_.increment ();
437                 }
438                 else
439                 {
440                     eos_ = true;
441                 }
442             }
443         } while (!eos_);
444 
445         return static_cast<CharT> (hex_);
446     }
447 
charset_range(const bool chset_,state & state_,bool & eos_,CharT & ch_,const CharT prev_,string & chars_)448     static void charset_range (const bool chset_, state &state_, bool &eos_,
449         CharT &ch_, const CharT prev_, string &chars_)
450     {
451         if (chset_)
452         {
453             std::ostringstream ss_;
454 
455             ss_ << "Charset cannot form start of range preceding "
456                 "index " << state_.index () - 1 << '.';
457             throw runtime_error (ss_.str ().c_str ());
458         }
459 
460         eos_ = state_.next (ch_);
461 
462         if (eos_)
463         {
464             // Pointless returning index if at end of string
465             throw runtime_error ("Unexpected end of regex "
466                 "following '-'.");
467         }
468 
469         CharT curr_ = 0;
470 
471         if (ch_ == '\\')
472         {
473             std::size_t str_len_ = 0;
474 
475             if (escape_sequence (state_, curr_, str_len_))
476             {
477                 std::ostringstream ss_;
478 
479                 ss_ << "Charset cannot form end of range preceding index "
480                     << state_.index () << '.';
481                 throw runtime_error (ss_.str ().c_str ());
482             }
483         }
484 /*
485         else if (ch_ == '[' && !state_.eos () && *state_._curr == ':')
486         {
487             std::ostringstream ss_;
488 
489             ss_ << "POSIX char class cannot form end of range at "
490                 "index " << state_.index () - 1 << '.';
491             throw runtime_error (ss_.str ().c_str ());
492         }
493 */
494         else
495         {
496             curr_ = ch_;
497         }
498 
499         eos_ = state_.next (ch_);
500 
501         // Covers preceding if and else
502         if (eos_)
503         {
504             // Pointless returning index if at end of string
505             throw runtime_error ("Unexpected end of regex "
506                 "(missing ']').");
507         }
508 
509         std::size_t start_ = static_cast<typename Traits::index_type> (prev_);
510         std::size_t end_ = static_cast<typename Traits::index_type> (curr_);
511 
512         // Semanic check
513         if (end_ < start_)
514         {
515             std::ostringstream ss_;
516 
517             ss_ << "Invalid range in charset preceding index " <<
518                 state_.index () - 1 << '.';
519             throw runtime_error (ss_.str ().c_str ());
520         }
521 
522         chars_.reserve (chars_.size () + (end_ + 1 - start_));
523 
524         for (; start_ <= end_; ++start_)
525         {
526             CharT ch_ = static_cast<CharT> (start_);
527 
528             if ((state_._flags & icase) &&
529                 (std::isupper (ch_, state_._locale) ||
530                 std::islower (ch_, state_._locale)))
531             {
532                 CharT upper_ = std::toupper (ch_, state_._locale);
533                 CharT lower_ = std::tolower (ch_, state_._locale);
534 
535                 chars_ += (upper_);
536                 chars_ += (lower_);
537             }
538             else
539             {
540                 chars_ += (ch_);
541             }
542         }
543     }
544 };
545 }
546 }
547 }
548 
549 #endif
550