1 /*
2 regexec.c - TRE POSIX compatible matching functions (and more).
3
4 Copyright (c) 2001-2009 Ville Laurikari <[email protected]>
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 */
31
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include <wctype.h>
36 #include <limits.h>
37 #include <stdint.h>
38
39 #include <regex.h>
40
41 #include "tre.h"
42
43 #include <assert.h>
44
45 static void
46 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
47 const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
48
49 /***********************************************************************
50 from tre-match-utils.h
51 ***********************************************************************/
52
53 #define GET_NEXT_WCHAR() do { \
54 prev_c = next_c; pos += pos_add_next; \
55 if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
56 if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \
57 else pos_add_next++; \
58 } \
59 str_byte += pos_add_next; \
60 } while (0)
61
62 #define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c))
63
64 #define CHECK_ASSERTIONS(assertions) \
65 (((assertions & ASSERT_AT_BOL) \
66 && (pos > 0 || reg_notbol) \
67 && (prev_c != L'\n' || !reg_newline)) \
68 || ((assertions & ASSERT_AT_EOL) \
69 && (next_c != L'\0' || reg_noteol) \
70 && (next_c != L'\n' || !reg_newline)) \
71 || ((assertions & ASSERT_AT_BOW) \
72 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \
73 || ((assertions & ASSERT_AT_EOW) \
74 && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \
75 || ((assertions & ASSERT_AT_WB) \
76 && (pos != 0 && next_c != L'\0' \
77 && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \
78 || ((assertions & ASSERT_AT_WB_NEG) \
79 && (pos == 0 || next_c == L'\0' \
80 || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
81
82 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \
83 (((trans_i->assertions & ASSERT_CHAR_CLASS) \
84 && !(tnfa->cflags & REG_ICASE) \
85 && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) \
86 || ((trans_i->assertions & ASSERT_CHAR_CLASS) \
87 && (tnfa->cflags & REG_ICASE) \
88 && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \
89 && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \
90 || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \
91 && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
92 tnfa->cflags & REG_ICASE)))
93
94
95
96
97 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
98 static int
tre_tag_order(int num_tags,tre_tag_direction_t * tag_directions,regoff_t * t1,regoff_t * t2)99 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
100 regoff_t *t1, regoff_t *t2)
101 {
102 int i;
103 for (i = 0; i < num_tags; i++)
104 {
105 if (tag_directions[i] == TRE_TAG_MINIMIZE)
106 {
107 if (t1[i] < t2[i])
108 return 1;
109 if (t1[i] > t2[i])
110 return 0;
111 }
112 else
113 {
114 if (t1[i] > t2[i])
115 return 1;
116 if (t1[i] < t2[i])
117 return 0;
118 }
119 }
120 /* assert(0);*/
121 return 0;
122 }
123
124 static int
tre_neg_char_classes_match(tre_ctype_t * classes,tre_cint_t wc,int icase)125 tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
126 {
127 while (*classes != (tre_ctype_t)0)
128 if ((!icase && tre_isctype(wc, *classes))
129 || (icase && (tre_isctype(tre_toupper(wc), *classes)
130 || tre_isctype(tre_tolower(wc), *classes))))
131 return 1; /* Match. */
132 else
133 classes++;
134 return 0; /* No match. */
135 }
136
137
138 /***********************************************************************
139 from tre-match-parallel.c
140 ***********************************************************************/
141
142 /*
143 This algorithm searches for matches basically by reading characters
144 in the searched string one by one, starting at the beginning. All
145 matching paths in the TNFA are traversed in parallel. When two or
146 more paths reach the same state, exactly one is chosen according to
147 tag ordering rules; if returning submatches is not required it does
148 not matter which path is chosen.
149
150 The worst case time required for finding the leftmost and longest
151 match, or determining that there is no match, is always linearly
152 dependent on the length of the text being searched.
153
154 This algorithm cannot handle TNFAs with back referencing nodes.
155 See `tre-match-backtrack.c'.
156 */
157
158 typedef struct {
159 tre_tnfa_transition_t *state;
160 regoff_t *tags;
161 } tre_tnfa_reach_t;
162
163 typedef struct {
164 regoff_t pos;
165 regoff_t **tags;
166 } tre_reach_pos_t;
167
168
169 static reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t * tnfa,const void * string,regoff_t * match_tags,int eflags,regoff_t * match_end_ofs)170 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
171 regoff_t *match_tags, int eflags,
172 regoff_t *match_end_ofs)
173 {
174 /* State variables required by GET_NEXT_WCHAR. */
175 tre_char_t prev_c = 0, next_c = 0;
176 const char *str_byte = string;
177 regoff_t pos = -1;
178 regoff_t pos_add_next = 1;
179 #ifdef TRE_MBSTATE
180 mbstate_t mbstate;
181 #endif /* TRE_MBSTATE */
182 int reg_notbol = eflags & REG_NOTBOL;
183 int reg_noteol = eflags & REG_NOTEOL;
184 int reg_newline = tnfa->cflags & REG_NEWLINE;
185 reg_errcode_t ret;
186
187 char *buf;
188 tre_tnfa_transition_t *trans_i;
189 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
190 tre_reach_pos_t *reach_pos;
191 int *tag_i;
192 int num_tags, i;
193
194 regoff_t match_eo = -1; /* end offset of match (-1 if no match found yet) */
195 int new_match = 0;
196 regoff_t *tmp_tags = NULL;
197 regoff_t *tmp_iptr;
198
199 #ifdef TRE_MBSTATE
200 memset(&mbstate, '\0', sizeof(mbstate));
201 #endif /* TRE_MBSTATE */
202
203 if (!match_tags)
204 num_tags = 0;
205 else
206 num_tags = tnfa->num_tags;
207
208 /* Allocate memory for temporary data required for matching. This needs to
209 be done for every matching operation to be thread safe. This allocates
210 everything in a single large block with calloc(). */
211 {
212 size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
213 char *tmp_buf;
214
215 /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
216 * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
217 if (num_tags > SIZE_MAX/(8 * sizeof(regoff_t) * tnfa->num_states))
218 return REG_ESPACE;
219
220 /* Likewise check rbytes. */
221 if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next)))
222 return REG_ESPACE;
223
224 /* Likewise check pbytes. */
225 if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos)))
226 return REG_ESPACE;
227
228 /* Compute the length of the block we need. */
229 tbytes = sizeof(*tmp_tags) * num_tags;
230 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
231 pbytes = sizeof(*reach_pos) * tnfa->num_states;
232 xbytes = sizeof(regoff_t) * num_tags;
233 total_bytes =
234 (sizeof(long) - 1) * 4 /* for alignment paddings */
235 + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
236
237 /* Allocate the memory. */
238 buf = calloc(total_bytes, 1);
239 if (buf == NULL)
240 return REG_ESPACE;
241
242 /* Get the various pointers within tmp_buf (properly aligned). */
243 tmp_tags = (void *)buf;
244 tmp_buf = buf + tbytes;
245 tmp_buf += ALIGN(tmp_buf, long);
246 reach_next = (void *)tmp_buf;
247 tmp_buf += rbytes;
248 tmp_buf += ALIGN(tmp_buf, long);
249 reach = (void *)tmp_buf;
250 tmp_buf += rbytes;
251 tmp_buf += ALIGN(tmp_buf, long);
252 reach_pos = (void *)tmp_buf;
253 tmp_buf += pbytes;
254 tmp_buf += ALIGN(tmp_buf, long);
255 for (i = 0; i < tnfa->num_states; i++)
256 {
257 reach[i].tags = (void *)tmp_buf;
258 tmp_buf += xbytes;
259 reach_next[i].tags = (void *)tmp_buf;
260 tmp_buf += xbytes;
261 }
262 }
263
264 for (i = 0; i < tnfa->num_states; i++)
265 reach_pos[i].pos = -1;
266
267 GET_NEXT_WCHAR();
268 pos = 0;
269
270 reach_next_i = reach_next;
271 while (1)
272 {
273 /* If no match found yet, add the initial states to `reach_next'. */
274 if (match_eo < 0)
275 {
276 trans_i = tnfa->initial;
277 while (trans_i->state != NULL)
278 {
279 if (reach_pos[trans_i->state_id].pos < pos)
280 {
281 if (trans_i->assertions
282 && CHECK_ASSERTIONS(trans_i->assertions))
283 {
284 trans_i++;
285 continue;
286 }
287
288 reach_next_i->state = trans_i->state;
289 for (i = 0; i < num_tags; i++)
290 reach_next_i->tags[i] = -1;
291 tag_i = trans_i->tags;
292 if (tag_i)
293 while (*tag_i >= 0)
294 {
295 if (*tag_i < num_tags)
296 reach_next_i->tags[*tag_i] = pos;
297 tag_i++;
298 }
299 if (reach_next_i->state == tnfa->final)
300 {
301 match_eo = pos;
302 new_match = 1;
303 for (i = 0; i < num_tags; i++)
304 match_tags[i] = reach_next_i->tags[i];
305 }
306 reach_pos[trans_i->state_id].pos = pos;
307 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
308 reach_next_i++;
309 }
310 trans_i++;
311 }
312 reach_next_i->state = NULL;
313 }
314 else
315 {
316 if (num_tags == 0 || reach_next_i == reach_next)
317 /* We have found a match. */
318 break;
319 }
320
321 /* Check for end of string. */
322 if (!next_c) break;
323
324 GET_NEXT_WCHAR();
325
326 /* Swap `reach' and `reach_next'. */
327 reach_i = reach;
328 reach = reach_next;
329 reach_next = reach_i;
330
331 /* For each state in `reach', weed out states that don't fulfill the
332 minimal matching conditions. */
333 if (tnfa->num_minimals && new_match)
334 {
335 new_match = 0;
336 reach_next_i = reach_next;
337 for (reach_i = reach; reach_i->state; reach_i++)
338 {
339 int skip = 0;
340 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
341 {
342 int end = tnfa->minimal_tags[i];
343 int start = tnfa->minimal_tags[i + 1];
344 if (end >= num_tags)
345 {
346 skip = 1;
347 break;
348 }
349 else if (reach_i->tags[start] == match_tags[start]
350 && reach_i->tags[end] < match_tags[end])
351 {
352 skip = 1;
353 break;
354 }
355 }
356 if (!skip)
357 {
358 reach_next_i->state = reach_i->state;
359 tmp_iptr = reach_next_i->tags;
360 reach_next_i->tags = reach_i->tags;
361 reach_i->tags = tmp_iptr;
362 reach_next_i++;
363 }
364 }
365 reach_next_i->state = NULL;
366
367 /* Swap `reach' and `reach_next'. */
368 reach_i = reach;
369 reach = reach_next;
370 reach_next = reach_i;
371 }
372
373 /* For each state in `reach' see if there is a transition leaving with
374 the current input symbol to a state not yet in `reach_next', and
375 add the destination states to `reach_next'. */
376 reach_next_i = reach_next;
377 for (reach_i = reach; reach_i->state; reach_i++)
378 {
379 for (trans_i = reach_i->state; trans_i->state; trans_i++)
380 {
381 /* Does this transition match the input symbol? */
382 if (trans_i->code_min <= (tre_cint_t)prev_c &&
383 trans_i->code_max >= (tre_cint_t)prev_c)
384 {
385 if (trans_i->assertions
386 && (CHECK_ASSERTIONS(trans_i->assertions)
387 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
388 {
389 continue;
390 }
391
392 /* Compute the tags after this transition. */
393 for (i = 0; i < num_tags; i++)
394 tmp_tags[i] = reach_i->tags[i];
395 tag_i = trans_i->tags;
396 if (tag_i != NULL)
397 while (*tag_i >= 0)
398 {
399 if (*tag_i < num_tags)
400 tmp_tags[*tag_i] = pos;
401 tag_i++;
402 }
403
404 if (reach_pos[trans_i->state_id].pos < pos)
405 {
406 /* Found an unvisited node. */
407 reach_next_i->state = trans_i->state;
408 tmp_iptr = reach_next_i->tags;
409 reach_next_i->tags = tmp_tags;
410 tmp_tags = tmp_iptr;
411 reach_pos[trans_i->state_id].pos = pos;
412 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
413
414 if (reach_next_i->state == tnfa->final
415 && (match_eo == -1
416 || (num_tags > 0
417 && reach_next_i->tags[0] <= match_tags[0])))
418 {
419 match_eo = pos;
420 new_match = 1;
421 for (i = 0; i < num_tags; i++)
422 match_tags[i] = reach_next_i->tags[i];
423 }
424 reach_next_i++;
425
426 }
427 else
428 {
429 assert(reach_pos[trans_i->state_id].pos == pos);
430 /* Another path has also reached this state. We choose
431 the winner by examining the tag values for both
432 paths. */
433 if (tre_tag_order(num_tags, tnfa->tag_directions,
434 tmp_tags,
435 *reach_pos[trans_i->state_id].tags))
436 {
437 /* The new path wins. */
438 tmp_iptr = *reach_pos[trans_i->state_id].tags;
439 *reach_pos[trans_i->state_id].tags = tmp_tags;
440 if (trans_i->state == tnfa->final)
441 {
442 match_eo = pos;
443 new_match = 1;
444 for (i = 0; i < num_tags; i++)
445 match_tags[i] = tmp_tags[i];
446 }
447 tmp_tags = tmp_iptr;
448 }
449 }
450 }
451 }
452 }
453 reach_next_i->state = NULL;
454 }
455
456 *match_end_ofs = match_eo;
457 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
458 error_exit:
459 xfree(buf);
460 return ret;
461 }
462
463
464
465 /***********************************************************************
466 from tre-match-backtrack.c
467 ***********************************************************************/
468
469 /*
470 This matcher is for regexps that use back referencing. Regexp matching
471 with back referencing is an NP-complete problem on the number of back
472 references. The easiest way to match them is to use a backtracking
473 routine which basically goes through all possible paths in the TNFA
474 and chooses the one which results in the best (leftmost and longest)
475 match. This can be spectacularly expensive and may run out of stack
476 space, but there really is no better known generic algorithm. Quoting
477 Henry Spencer from comp.compilers:
478 <URL: http://compilers.iecc.com/comparch/article/93-03-102>
479
480 POSIX.2 REs require longest match, which is really exciting to
481 implement since the obsolete ("basic") variant also includes
482 \<digit>. I haven't found a better way of tackling this than doing
483 a preliminary match using a DFA (or simulation) on a modified RE
484 that just replicates subREs for \<digit>, and then doing a
485 backtracking match to determine whether the subRE matches were
486 right. This can be rather slow, but I console myself with the
487 thought that people who use \<digit> deserve very slow execution.
488 (Pun unintentional but very appropriate.)
489
490 */
491
492 typedef struct {
493 regoff_t pos;
494 const char *str_byte;
495 tre_tnfa_transition_t *state;
496 int state_id;
497 int next_c;
498 regoff_t *tags;
499 #ifdef TRE_MBSTATE
500 mbstate_t mbstate;
501 #endif /* TRE_MBSTATE */
502 } tre_backtrack_item_t;
503
504 typedef struct tre_backtrack_struct {
505 tre_backtrack_item_t item;
506 struct tre_backtrack_struct *prev;
507 struct tre_backtrack_struct *next;
508 } *tre_backtrack_t;
509
510 #ifdef TRE_MBSTATE
511 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
512 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
513 #else /* !TRE_MBSTATE */
514 #define BT_STACK_MBSTATE_IN
515 #define BT_STACK_MBSTATE_OUT
516 #endif /* !TRE_MBSTATE */
517
518 #define tre_bt_mem_new tre_mem_new
519 #define tre_bt_mem_alloc tre_mem_alloc
520 #define tre_bt_mem_destroy tre_mem_destroy
521
522
523 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
524 do \
525 { \
526 int i; \
527 if (!stack->next) \
528 { \
529 tre_backtrack_t s; \
530 s = tre_bt_mem_alloc(mem, sizeof(*s)); \
531 if (!s) \
532 { \
533 tre_bt_mem_destroy(mem); \
534 if (tags) \
535 xfree(tags); \
536 if (pmatch) \
537 xfree(pmatch); \
538 if (states_seen) \
539 xfree(states_seen); \
540 return REG_ESPACE; \
541 } \
542 s->prev = stack; \
543 s->next = NULL; \
544 s->item.tags = tre_bt_mem_alloc(mem, \
545 sizeof(*tags) * tnfa->num_tags); \
546 if (!s->item.tags) \
547 { \
548 tre_bt_mem_destroy(mem); \
549 if (tags) \
550 xfree(tags); \
551 if (pmatch) \
552 xfree(pmatch); \
553 if (states_seen) \
554 xfree(states_seen); \
555 return REG_ESPACE; \
556 } \
557 stack->next = s; \
558 stack = s; \
559 } \
560 else \
561 stack = stack->next; \
562 stack->item.pos = (_pos); \
563 stack->item.str_byte = (_str_byte); \
564 stack->item.state = (_state); \
565 stack->item.state_id = (_state_id); \
566 stack->item.next_c = (_next_c); \
567 for (i = 0; i < tnfa->num_tags; i++) \
568 stack->item.tags[i] = (_tags)[i]; \
569 BT_STACK_MBSTATE_IN; \
570 } \
571 while (0)
572
573 #define BT_STACK_POP() \
574 do \
575 { \
576 int i; \
577 assert(stack->prev); \
578 pos = stack->item.pos; \
579 str_byte = stack->item.str_byte; \
580 state = stack->item.state; \
581 next_c = stack->item.next_c; \
582 for (i = 0; i < tnfa->num_tags; i++) \
583 tags[i] = stack->item.tags[i]; \
584 BT_STACK_MBSTATE_OUT; \
585 stack = stack->prev; \
586 } \
587 while (0)
588
589 #ifndef MIN
590 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
591 #endif
592
593 static reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,regoff_t * match_tags,int eflags,regoff_t * match_end_ofs)594 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
595 regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
596 {
597 /* State variables required by GET_NEXT_WCHAR. */
598 tre_char_t prev_c = 0, next_c = 0;
599 const char *str_byte = string;
600 regoff_t pos = 0;
601 regoff_t pos_add_next = 1;
602 #ifdef TRE_MBSTATE
603 mbstate_t mbstate;
604 #endif /* TRE_MBSTATE */
605 int reg_notbol = eflags & REG_NOTBOL;
606 int reg_noteol = eflags & REG_NOTEOL;
607 int reg_newline = tnfa->cflags & REG_NEWLINE;
608
609 /* These are used to remember the necessary values of the above
610 variables to return to the position where the current search
611 started from. */
612 int next_c_start;
613 const char *str_byte_start;
614 regoff_t pos_start = -1;
615 #ifdef TRE_MBSTATE
616 mbstate_t mbstate_start;
617 #endif /* TRE_MBSTATE */
618
619 /* End offset of best match so far, or -1 if no match found yet. */
620 regoff_t match_eo = -1;
621 /* Tag arrays. */
622 int *next_tags;
623 regoff_t *tags = NULL;
624 /* Current TNFA state. */
625 tre_tnfa_transition_t *state;
626 int *states_seen = NULL;
627
628 /* Memory allocator to for allocating the backtracking stack. */
629 tre_mem_t mem = tre_bt_mem_new();
630
631 /* The backtracking stack. */
632 tre_backtrack_t stack;
633
634 tre_tnfa_transition_t *trans_i;
635 regmatch_t *pmatch = NULL;
636 int ret;
637
638 #ifdef TRE_MBSTATE
639 memset(&mbstate, '\0', sizeof(mbstate));
640 #endif /* TRE_MBSTATE */
641
642 if (!mem)
643 return REG_ESPACE;
644 stack = tre_bt_mem_alloc(mem, sizeof(*stack));
645 if (!stack)
646 {
647 ret = REG_ESPACE;
648 goto error_exit;
649 }
650 stack->prev = NULL;
651 stack->next = NULL;
652
653 if (tnfa->num_tags)
654 {
655 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
656 if (!tags)
657 {
658 ret = REG_ESPACE;
659 goto error_exit;
660 }
661 }
662 if (tnfa->num_submatches)
663 {
664 pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
665 if (!pmatch)
666 {
667 ret = REG_ESPACE;
668 goto error_exit;
669 }
670 }
671 if (tnfa->num_states)
672 {
673 states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
674 if (!states_seen)
675 {
676 ret = REG_ESPACE;
677 goto error_exit;
678 }
679 }
680
681 retry:
682 {
683 int i;
684 for (i = 0; i < tnfa->num_tags; i++)
685 {
686 tags[i] = -1;
687 if (match_tags)
688 match_tags[i] = -1;
689 }
690 for (i = 0; i < tnfa->num_states; i++)
691 states_seen[i] = 0;
692 }
693
694 state = NULL;
695 pos = pos_start;
696 GET_NEXT_WCHAR();
697 pos_start = pos;
698 next_c_start = next_c;
699 str_byte_start = str_byte;
700 #ifdef TRE_MBSTATE
701 mbstate_start = mbstate;
702 #endif /* TRE_MBSTATE */
703
704 /* Handle initial states. */
705 next_tags = NULL;
706 for (trans_i = tnfa->initial; trans_i->state; trans_i++)
707 {
708 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
709 {
710 continue;
711 }
712 if (state == NULL)
713 {
714 /* Start from this state. */
715 state = trans_i->state;
716 next_tags = trans_i->tags;
717 }
718 else
719 {
720 /* Backtrack to this state. */
721 BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
722 trans_i->state_id, next_c, tags, mbstate);
723 {
724 int *tmp = trans_i->tags;
725 if (tmp)
726 while (*tmp >= 0)
727 stack->item.tags[*tmp++] = pos;
728 }
729 }
730 }
731
732 if (next_tags)
733 for (; *next_tags >= 0; next_tags++)
734 tags[*next_tags] = pos;
735
736
737 if (state == NULL)
738 goto backtrack;
739
740 while (1)
741 {
742 tre_tnfa_transition_t *next_state;
743 int empty_br_match;
744
745 if (state == tnfa->final)
746 {
747 if (match_eo < pos
748 || (match_eo == pos
749 && match_tags
750 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
751 tags, match_tags)))
752 {
753 int i;
754 /* This match wins the previous match. */
755 match_eo = pos;
756 if (match_tags)
757 for (i = 0; i < tnfa->num_tags; i++)
758 match_tags[i] = tags[i];
759 }
760 /* Our TNFAs never have transitions leaving from the final state,
761 so we jump right to backtracking. */
762 goto backtrack;
763 }
764
765 /* Go to the next character in the input string. */
766 empty_br_match = 0;
767 trans_i = state;
768 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
769 {
770 /* This is a back reference state. All transitions leaving from
771 this state have the same back reference "assertion". Instead
772 of reading the next character, we match the back reference. */
773 regoff_t so, eo;
774 int bt = trans_i->u.backref;
775 regoff_t bt_len;
776 int result;
777
778 /* Get the substring we need to match against. Remember to
779 turn off REG_NOSUB temporarily. */
780 tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
781 tnfa, tags, pos);
782 so = pmatch[bt].rm_so;
783 eo = pmatch[bt].rm_eo;
784 bt_len = eo - so;
785
786 result = strncmp((const char*)string + so, str_byte - 1,
787 (size_t)bt_len);
788
789 if (result == 0)
790 {
791 /* Back reference matched. Check for infinite loop. */
792 if (bt_len == 0)
793 empty_br_match = 1;
794 if (empty_br_match && states_seen[trans_i->state_id])
795 {
796 goto backtrack;
797 }
798
799 states_seen[trans_i->state_id] = empty_br_match;
800
801 /* Advance in input string and resync `prev_c', `next_c'
802 and pos. */
803 str_byte += bt_len - 1;
804 pos += bt_len - 1;
805 GET_NEXT_WCHAR();
806 }
807 else
808 {
809 goto backtrack;
810 }
811 }
812 else
813 {
814 /* Check for end of string. */
815 if (next_c == L'\0')
816 goto backtrack;
817
818 /* Read the next character. */
819 GET_NEXT_WCHAR();
820 }
821
822 next_state = NULL;
823 for (trans_i = state; trans_i->state; trans_i++)
824 {
825 if (trans_i->code_min <= (tre_cint_t)prev_c
826 && trans_i->code_max >= (tre_cint_t)prev_c)
827 {
828 if (trans_i->assertions
829 && (CHECK_ASSERTIONS(trans_i->assertions)
830 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
831 {
832 continue;
833 }
834
835 if (next_state == NULL)
836 {
837 /* First matching transition. */
838 next_state = trans_i->state;
839 next_tags = trans_i->tags;
840 }
841 else
842 {
843 /* Second matching transition. We may need to backtrack here
844 to take this transition instead of the first one, so we
845 push this transition in the backtracking stack so we can
846 jump back here if needed. */
847 BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
848 trans_i->state_id, next_c, tags, mbstate);
849 {
850 int *tmp;
851 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
852 stack->item.tags[*tmp] = pos;
853 }
854 #if 0 /* XXX - it's important not to look at all transitions here to keep
855 the stack small! */
856 break;
857 #endif
858 }
859 }
860 }
861
862 if (next_state != NULL)
863 {
864 /* Matching transitions were found. Take the first one. */
865 state = next_state;
866
867 /* Update the tag values. */
868 if (next_tags)
869 while (*next_tags >= 0)
870 tags[*next_tags++] = pos;
871 }
872 else
873 {
874 backtrack:
875 /* A matching transition was not found. Try to backtrack. */
876 if (stack->prev)
877 {
878 if (stack->item.state->assertions & ASSERT_BACKREF)
879 {
880 states_seen[stack->item.state_id] = 0;
881 }
882
883 BT_STACK_POP();
884 }
885 else if (match_eo < 0)
886 {
887 /* Try starting from a later position in the input string. */
888 /* Check for end of string. */
889 if (next_c == L'\0')
890 {
891 break;
892 }
893 next_c = next_c_start;
894 #ifdef TRE_MBSTATE
895 mbstate = mbstate_start;
896 #endif /* TRE_MBSTATE */
897 str_byte = str_byte_start;
898 goto retry;
899 }
900 else
901 {
902 break;
903 }
904 }
905 }
906
907 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
908 *match_end_ofs = match_eo;
909
910 error_exit:
911 tre_bt_mem_destroy(mem);
912 #ifndef TRE_USE_ALLOCA
913 if (tags)
914 xfree(tags);
915 if (pmatch)
916 xfree(pmatch);
917 if (states_seen)
918 xfree(states_seen);
919 #endif /* !TRE_USE_ALLOCA */
920
921 return ret;
922 }
923
924 /***********************************************************************
925 from regexec.c
926 ***********************************************************************/
927
928 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
929 endpoint values. */
930 static void
tre_fill_pmatch(size_t nmatch,regmatch_t pmatch[],int cflags,const tre_tnfa_t * tnfa,regoff_t * tags,regoff_t match_eo)931 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
932 const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
933 {
934 tre_submatch_data_t *submatch_data;
935 unsigned int i, j;
936 int *parents;
937
938 i = 0;
939 if (match_eo >= 0 && !(cflags & REG_NOSUB))
940 {
941 /* Construct submatch offsets from the tags. */
942 submatch_data = tnfa->submatch_data;
943 while (i < tnfa->num_submatches && i < nmatch)
944 {
945 if (submatch_data[i].so_tag == tnfa->end_tag)
946 pmatch[i].rm_so = match_eo;
947 else
948 pmatch[i].rm_so = tags[submatch_data[i].so_tag];
949
950 if (submatch_data[i].eo_tag == tnfa->end_tag)
951 pmatch[i].rm_eo = match_eo;
952 else
953 pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
954
955 /* If either of the endpoints were not used, this submatch
956 was not part of the match. */
957 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
958 pmatch[i].rm_so = pmatch[i].rm_eo = -1;
959
960 i++;
961 }
962 /* Reset all submatches that are not within all of their parent
963 submatches. */
964 i = 0;
965 while (i < tnfa->num_submatches && i < nmatch)
966 {
967 if (pmatch[i].rm_eo == -1)
968 assert(pmatch[i].rm_so == -1);
969 assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
970
971 parents = submatch_data[i].parents;
972 if (parents != NULL)
973 for (j = 0; parents[j] >= 0; j++)
974 {
975 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
976 || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
977 pmatch[i].rm_so = pmatch[i].rm_eo = -1;
978 }
979 i++;
980 }
981 }
982
983 while (i < nmatch)
984 {
985 pmatch[i].rm_so = -1;
986 pmatch[i].rm_eo = -1;
987 i++;
988 }
989 }
990
991
992 /*
993 Wrapper functions for POSIX compatible regexp matching.
994 */
995
996 int
regexec(const regex_t * restrict preg,const char * restrict string,size_t nmatch,regmatch_t pmatch[restrict],int eflags)997 regexec(const regex_t *restrict preg, const char *restrict string,
998 size_t nmatch, regmatch_t pmatch[restrict], int eflags)
999 {
1000 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
1001 reg_errcode_t status;
1002 regoff_t *tags = NULL, eo;
1003 if (tnfa->cflags & REG_NOSUB) nmatch = 0;
1004 if (tnfa->num_tags > 0 && nmatch > 0)
1005 {
1006 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
1007 if (tags == NULL)
1008 return REG_ESPACE;
1009 }
1010
1011 /* Dispatch to the appropriate matcher. */
1012 if (tnfa->have_backrefs)
1013 {
1014 /* The regex has back references, use the backtracking matcher. */
1015 status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
1016 }
1017 else
1018 {
1019 /* Exact matching, no back references, use the parallel matcher. */
1020 status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
1021 }
1022
1023 if (status == REG_OK)
1024 /* A match was found, so fill the submatch registers. */
1025 tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
1026 if (tags)
1027 xfree(tags);
1028 return status;
1029 }
1030