1 /*
2   regexec.c - TRE POSIX compatible matching functions (and more).
3 
4   Copyright (c) 2001-2009 Ville Laurikari <[email protected]>
5   All rights reserved.
6 
7   Redistribution and use in source and binary forms, with or without
8   modification, are permitted provided that the following conditions
9   are met:
10 
11     1. Redistributions of source code must retain the above copyright
12        notice, this list of conditions and the following disclaimer.
13 
14     2. Redistributions in binary form must reproduce the above copyright
15        notice, this list of conditions and the following disclaimer in the
16        documentation and/or other materials provided with the distribution.
17 
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 */
31 
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include <wctype.h>
36 #include <limits.h>
37 #include <stdint.h>
38 
39 #include <regex.h>
40 
41 #include "tre.h"
42 
43 #include <assert.h>
44 
45 static void
46 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
47 		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
48 
49 /***********************************************************************
50  from tre-match-utils.h
51 ***********************************************************************/
52 
53 #define GET_NEXT_WCHAR() do {                                                 \
54     prev_c = next_c; pos += pos_add_next;                                     \
55     if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
56         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
57         else pos_add_next++;                                                  \
58     }                                                                         \
59     str_byte += pos_add_next;                                                 \
60   } while (0)
61 
62 #define IS_WORD_CHAR(c)	 ((c) == L'_' || tre_isalnum(c))
63 
64 #define CHECK_ASSERTIONS(assertions)					      \
65   (((assertions & ASSERT_AT_BOL)					      \
66     && (pos > 0 || reg_notbol)						      \
67     && (prev_c != L'\n' || !reg_newline))				      \
68    || ((assertions & ASSERT_AT_EOL)					      \
69        && (next_c != L'\0' || reg_noteol)				      \
70        && (next_c != L'\n' || !reg_newline))				      \
71    || ((assertions & ASSERT_AT_BOW)					      \
72        && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))	              \
73    || ((assertions & ASSERT_AT_EOW)					      \
74        && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))		      \
75    || ((assertions & ASSERT_AT_WB)					      \
76        && (pos != 0 && next_c != L'\0'					      \
77 	   && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))		      \
78    || ((assertions & ASSERT_AT_WB_NEG)					      \
79        && (pos == 0 || next_c == L'\0'					      \
80 	   || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
81 
82 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
83   (((trans_i->assertions & ASSERT_CHAR_CLASS)                                 \
84        && !(tnfa->cflags & REG_ICASE)                                         \
85        && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class))                 \
86     || ((trans_i->assertions & ASSERT_CHAR_CLASS)                             \
87         && (tnfa->cflags & REG_ICASE)                                         \
88         && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class)     \
89 	&& !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class))    \
90     || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)                         \
91         && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
92                                       tnfa->cflags & REG_ICASE)))
93 
94 
95 
96 
97 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
98 static int
tre_tag_order(int num_tags,tre_tag_direction_t * tag_directions,regoff_t * t1,regoff_t * t2)99 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
100 	      regoff_t *t1, regoff_t *t2)
101 {
102   int i;
103   for (i = 0; i < num_tags; i++)
104     {
105       if (tag_directions[i] == TRE_TAG_MINIMIZE)
106 	{
107 	  if (t1[i] < t2[i])
108 	    return 1;
109 	  if (t1[i] > t2[i])
110 	    return 0;
111 	}
112       else
113 	{
114 	  if (t1[i] > t2[i])
115 	    return 1;
116 	  if (t1[i] < t2[i])
117 	    return 0;
118 	}
119     }
120   /*  assert(0);*/
121   return 0;
122 }
123 
124 static int
tre_neg_char_classes_match(tre_ctype_t * classes,tre_cint_t wc,int icase)125 tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
126 {
127   while (*classes != (tre_ctype_t)0)
128     if ((!icase && tre_isctype(wc, *classes))
129 	|| (icase && (tre_isctype(tre_toupper(wc), *classes)
130 		      || tre_isctype(tre_tolower(wc), *classes))))
131       return 1; /* Match. */
132     else
133       classes++;
134   return 0; /* No match. */
135 }
136 
137 
138 /***********************************************************************
139  from tre-match-parallel.c
140 ***********************************************************************/
141 
142 /*
143   This algorithm searches for matches basically by reading characters
144   in the searched string one by one, starting at the beginning.	 All
145   matching paths in the TNFA are traversed in parallel.	 When two or
146   more paths reach the same state, exactly one is chosen according to
147   tag ordering rules; if returning submatches is not required it does
148   not matter which path is chosen.
149 
150   The worst case time required for finding the leftmost and longest
151   match, or determining that there is no match, is always linearly
152   dependent on the length of the text being searched.
153 
154   This algorithm cannot handle TNFAs with back referencing nodes.
155   See `tre-match-backtrack.c'.
156 */
157 
158 typedef struct {
159   tre_tnfa_transition_t *state;
160   regoff_t *tags;
161 } tre_tnfa_reach_t;
162 
163 typedef struct {
164   regoff_t pos;
165   regoff_t **tags;
166 } tre_reach_pos_t;
167 
168 
169 static reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t * tnfa,const void * string,regoff_t * match_tags,int eflags,regoff_t * match_end_ofs)170 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
171 		      regoff_t *match_tags, int eflags,
172 		      regoff_t *match_end_ofs)
173 {
174   /* State variables required by GET_NEXT_WCHAR. */
175   tre_char_t prev_c = 0, next_c = 0;
176   const char *str_byte = string;
177   regoff_t pos = -1;
178   regoff_t pos_add_next = 1;
179 #ifdef TRE_MBSTATE
180   mbstate_t mbstate;
181 #endif /* TRE_MBSTATE */
182   int reg_notbol = eflags & REG_NOTBOL;
183   int reg_noteol = eflags & REG_NOTEOL;
184   int reg_newline = tnfa->cflags & REG_NEWLINE;
185   reg_errcode_t ret;
186 
187   char *buf;
188   tre_tnfa_transition_t *trans_i;
189   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
190   tre_reach_pos_t *reach_pos;
191   int *tag_i;
192   int num_tags, i;
193 
194   regoff_t match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
195   int new_match = 0;
196   regoff_t *tmp_tags = NULL;
197   regoff_t *tmp_iptr;
198 
199 #ifdef TRE_MBSTATE
200   memset(&mbstate, '\0', sizeof(mbstate));
201 #endif /* TRE_MBSTATE */
202 
203   if (!match_tags)
204     num_tags = 0;
205   else
206     num_tags = tnfa->num_tags;
207 
208   /* Allocate memory for temporary data required for matching.	This needs to
209      be done for every matching operation to be thread safe.  This allocates
210      everything in a single large block with calloc(). */
211   {
212     size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
213     char *tmp_buf;
214 
215     /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
216      * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
217     if (num_tags > SIZE_MAX/(8 * sizeof(regoff_t) * tnfa->num_states))
218       return REG_ESPACE;
219 
220     /* Likewise check rbytes. */
221     if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next)))
222       return REG_ESPACE;
223 
224     /* Likewise check pbytes. */
225     if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos)))
226       return REG_ESPACE;
227 
228     /* Compute the length of the block we need. */
229     tbytes = sizeof(*tmp_tags) * num_tags;
230     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
231     pbytes = sizeof(*reach_pos) * tnfa->num_states;
232     xbytes = sizeof(regoff_t) * num_tags;
233     total_bytes =
234       (sizeof(long) - 1) * 4 /* for alignment paddings */
235       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
236 
237     /* Allocate the memory. */
238     buf = calloc(total_bytes, 1);
239     if (buf == NULL)
240       return REG_ESPACE;
241 
242     /* Get the various pointers within tmp_buf (properly aligned). */
243     tmp_tags = (void *)buf;
244     tmp_buf = buf + tbytes;
245     tmp_buf += ALIGN(tmp_buf, long);
246     reach_next = (void *)tmp_buf;
247     tmp_buf += rbytes;
248     tmp_buf += ALIGN(tmp_buf, long);
249     reach = (void *)tmp_buf;
250     tmp_buf += rbytes;
251     tmp_buf += ALIGN(tmp_buf, long);
252     reach_pos = (void *)tmp_buf;
253     tmp_buf += pbytes;
254     tmp_buf += ALIGN(tmp_buf, long);
255     for (i = 0; i < tnfa->num_states; i++)
256       {
257 	reach[i].tags = (void *)tmp_buf;
258 	tmp_buf += xbytes;
259 	reach_next[i].tags = (void *)tmp_buf;
260 	tmp_buf += xbytes;
261       }
262   }
263 
264   for (i = 0; i < tnfa->num_states; i++)
265     reach_pos[i].pos = -1;
266 
267   GET_NEXT_WCHAR();
268   pos = 0;
269 
270   reach_next_i = reach_next;
271   while (1)
272     {
273       /* If no match found yet, add the initial states to `reach_next'. */
274       if (match_eo < 0)
275 	{
276 	  trans_i = tnfa->initial;
277 	  while (trans_i->state != NULL)
278 	    {
279 	      if (reach_pos[trans_i->state_id].pos < pos)
280 		{
281 		  if (trans_i->assertions
282 		      && CHECK_ASSERTIONS(trans_i->assertions))
283 		    {
284 		      trans_i++;
285 		      continue;
286 		    }
287 
288 		  reach_next_i->state = trans_i->state;
289 		  for (i = 0; i < num_tags; i++)
290 		    reach_next_i->tags[i] = -1;
291 		  tag_i = trans_i->tags;
292 		  if (tag_i)
293 		    while (*tag_i >= 0)
294 		      {
295 			if (*tag_i < num_tags)
296 			  reach_next_i->tags[*tag_i] = pos;
297 			tag_i++;
298 		      }
299 		  if (reach_next_i->state == tnfa->final)
300 		    {
301 		      match_eo = pos;
302 		      new_match = 1;
303 		      for (i = 0; i < num_tags; i++)
304 			match_tags[i] = reach_next_i->tags[i];
305 		    }
306 		  reach_pos[trans_i->state_id].pos = pos;
307 		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
308 		  reach_next_i++;
309 		}
310 	      trans_i++;
311 	    }
312 	  reach_next_i->state = NULL;
313 	}
314       else
315 	{
316 	  if (num_tags == 0 || reach_next_i == reach_next)
317 	    /* We have found a match. */
318 	    break;
319 	}
320 
321       /* Check for end of string. */
322       if (!next_c) break;
323 
324       GET_NEXT_WCHAR();
325 
326       /* Swap `reach' and `reach_next'. */
327       reach_i = reach;
328       reach = reach_next;
329       reach_next = reach_i;
330 
331       /* For each state in `reach', weed out states that don't fulfill the
332 	 minimal matching conditions. */
333       if (tnfa->num_minimals && new_match)
334 	{
335 	  new_match = 0;
336 	  reach_next_i = reach_next;
337 	  for (reach_i = reach; reach_i->state; reach_i++)
338 	    {
339 	      int skip = 0;
340 	      for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
341 		{
342 		  int end = tnfa->minimal_tags[i];
343 		  int start = tnfa->minimal_tags[i + 1];
344 		  if (end >= num_tags)
345 		    {
346 		      skip = 1;
347 		      break;
348 		    }
349 		  else if (reach_i->tags[start] == match_tags[start]
350 			   && reach_i->tags[end] < match_tags[end])
351 		    {
352 		      skip = 1;
353 		      break;
354 		    }
355 		}
356 	      if (!skip)
357 		{
358 		  reach_next_i->state = reach_i->state;
359 		  tmp_iptr = reach_next_i->tags;
360 		  reach_next_i->tags = reach_i->tags;
361 		  reach_i->tags = tmp_iptr;
362 		  reach_next_i++;
363 		}
364 	    }
365 	  reach_next_i->state = NULL;
366 
367 	  /* Swap `reach' and `reach_next'. */
368 	  reach_i = reach;
369 	  reach = reach_next;
370 	  reach_next = reach_i;
371 	}
372 
373       /* For each state in `reach' see if there is a transition leaving with
374 	 the current input symbol to a state not yet in `reach_next', and
375 	 add the destination states to `reach_next'. */
376       reach_next_i = reach_next;
377       for (reach_i = reach; reach_i->state; reach_i++)
378 	{
379 	  for (trans_i = reach_i->state; trans_i->state; trans_i++)
380 	    {
381 	      /* Does this transition match the input symbol? */
382 	      if (trans_i->code_min <= (tre_cint_t)prev_c &&
383 		  trans_i->code_max >= (tre_cint_t)prev_c)
384 		{
385 		  if (trans_i->assertions
386 		      && (CHECK_ASSERTIONS(trans_i->assertions)
387 			  || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
388 		    {
389 		      continue;
390 		    }
391 
392 		  /* Compute the tags after this transition. */
393 		  for (i = 0; i < num_tags; i++)
394 		    tmp_tags[i] = reach_i->tags[i];
395 		  tag_i = trans_i->tags;
396 		  if (tag_i != NULL)
397 		    while (*tag_i >= 0)
398 		      {
399 			if (*tag_i < num_tags)
400 			  tmp_tags[*tag_i] = pos;
401 			tag_i++;
402 		      }
403 
404 		  if (reach_pos[trans_i->state_id].pos < pos)
405 		    {
406 		      /* Found an unvisited node. */
407 		      reach_next_i->state = trans_i->state;
408 		      tmp_iptr = reach_next_i->tags;
409 		      reach_next_i->tags = tmp_tags;
410 		      tmp_tags = tmp_iptr;
411 		      reach_pos[trans_i->state_id].pos = pos;
412 		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
413 
414 		      if (reach_next_i->state == tnfa->final
415 			  && (match_eo == -1
416 			      || (num_tags > 0
417 				  && reach_next_i->tags[0] <= match_tags[0])))
418 			{
419 			  match_eo = pos;
420 			  new_match = 1;
421 			  for (i = 0; i < num_tags; i++)
422 			    match_tags[i] = reach_next_i->tags[i];
423 			}
424 		      reach_next_i++;
425 
426 		    }
427 		  else
428 		    {
429 		      assert(reach_pos[trans_i->state_id].pos == pos);
430 		      /* Another path has also reached this state.  We choose
431 			 the winner by examining the tag values for both
432 			 paths. */
433 		      if (tre_tag_order(num_tags, tnfa->tag_directions,
434 					tmp_tags,
435 					*reach_pos[trans_i->state_id].tags))
436 			{
437 			  /* The new path wins. */
438 			  tmp_iptr = *reach_pos[trans_i->state_id].tags;
439 			  *reach_pos[trans_i->state_id].tags = tmp_tags;
440 			  if (trans_i->state == tnfa->final)
441 			    {
442 			      match_eo = pos;
443 			      new_match = 1;
444 			      for (i = 0; i < num_tags; i++)
445 				match_tags[i] = tmp_tags[i];
446 			    }
447 			  tmp_tags = tmp_iptr;
448 			}
449 		    }
450 		}
451 	    }
452 	}
453       reach_next_i->state = NULL;
454     }
455 
456   *match_end_ofs = match_eo;
457   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
458 error_exit:
459   xfree(buf);
460   return ret;
461 }
462 
463 
464 
465 /***********************************************************************
466  from tre-match-backtrack.c
467 ***********************************************************************/
468 
469 /*
470   This matcher is for regexps that use back referencing.  Regexp matching
471   with back referencing is an NP-complete problem on the number of back
472   references.  The easiest way to match them is to use a backtracking
473   routine which basically goes through all possible paths in the TNFA
474   and chooses the one which results in the best (leftmost and longest)
475   match.  This can be spectacularly expensive and may run out of stack
476   space, but there really is no better known generic algorithm.	 Quoting
477   Henry Spencer from comp.compilers:
478   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
479 
480     POSIX.2 REs require longest match, which is really exciting to
481     implement since the obsolete ("basic") variant also includes
482     \<digit>.  I haven't found a better way of tackling this than doing
483     a preliminary match using a DFA (or simulation) on a modified RE
484     that just replicates subREs for \<digit>, and then doing a
485     backtracking match to determine whether the subRE matches were
486     right.  This can be rather slow, but I console myself with the
487     thought that people who use \<digit> deserve very slow execution.
488     (Pun unintentional but very appropriate.)
489 
490 */
491 
492 typedef struct {
493   regoff_t pos;
494   const char *str_byte;
495   tre_tnfa_transition_t *state;
496   int state_id;
497   int next_c;
498   regoff_t *tags;
499 #ifdef TRE_MBSTATE
500   mbstate_t mbstate;
501 #endif /* TRE_MBSTATE */
502 } tre_backtrack_item_t;
503 
504 typedef struct tre_backtrack_struct {
505   tre_backtrack_item_t item;
506   struct tre_backtrack_struct *prev;
507   struct tre_backtrack_struct *next;
508 } *tre_backtrack_t;
509 
510 #ifdef TRE_MBSTATE
511 #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
512 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
513 #else /* !TRE_MBSTATE */
514 #define BT_STACK_MBSTATE_IN
515 #define BT_STACK_MBSTATE_OUT
516 #endif /* !TRE_MBSTATE */
517 
518 #define tre_bt_mem_new		  tre_mem_new
519 #define tre_bt_mem_alloc	  tre_mem_alloc
520 #define tre_bt_mem_destroy	  tre_mem_destroy
521 
522 
523 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
524   do									      \
525     {									      \
526       int i;								      \
527       if (!stack->next)							      \
528 	{								      \
529 	  tre_backtrack_t s;						      \
530 	  s = tre_bt_mem_alloc(mem, sizeof(*s));			      \
531 	  if (!s)							      \
532 	    {								      \
533 	      tre_bt_mem_destroy(mem);					      \
534 	      if (tags)							      \
535 		xfree(tags);						      \
536 	      if (pmatch)						      \
537 		xfree(pmatch);						      \
538 	      if (states_seen)						      \
539 		xfree(states_seen);					      \
540 	      return REG_ESPACE;					      \
541 	    }								      \
542 	  s->prev = stack;						      \
543 	  s->next = NULL;						      \
544 	  s->item.tags = tre_bt_mem_alloc(mem,				      \
545 					  sizeof(*tags) * tnfa->num_tags);    \
546 	  if (!s->item.tags)						      \
547 	    {								      \
548 	      tre_bt_mem_destroy(mem);					      \
549 	      if (tags)							      \
550 		xfree(tags);						      \
551 	      if (pmatch)						      \
552 		xfree(pmatch);						      \
553 	      if (states_seen)						      \
554 		xfree(states_seen);					      \
555 	      return REG_ESPACE;					      \
556 	    }								      \
557 	  stack->next = s;						      \
558 	  stack = s;							      \
559 	}								      \
560       else								      \
561 	stack = stack->next;						      \
562       stack->item.pos = (_pos);						      \
563       stack->item.str_byte = (_str_byte);				      \
564       stack->item.state = (_state);					      \
565       stack->item.state_id = (_state_id);				      \
566       stack->item.next_c = (_next_c);					      \
567       for (i = 0; i < tnfa->num_tags; i++)				      \
568 	stack->item.tags[i] = (_tags)[i];				      \
569       BT_STACK_MBSTATE_IN;						      \
570     }									      \
571   while (0)
572 
573 #define BT_STACK_POP()							      \
574   do									      \
575     {									      \
576       int i;								      \
577       assert(stack->prev);						      \
578       pos = stack->item.pos;						      \
579       str_byte = stack->item.str_byte;					      \
580       state = stack->item.state;					      \
581       next_c = stack->item.next_c;					      \
582       for (i = 0; i < tnfa->num_tags; i++)				      \
583 	tags[i] = stack->item.tags[i];					      \
584       BT_STACK_MBSTATE_OUT;						      \
585       stack = stack->prev;						      \
586     }									      \
587   while (0)
588 
589 #ifndef MIN
590 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
591 #endif
592 
593 static reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,regoff_t * match_tags,int eflags,regoff_t * match_end_ofs)594 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
595 		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
596 {
597   /* State variables required by GET_NEXT_WCHAR. */
598   tre_char_t prev_c = 0, next_c = 0;
599   const char *str_byte = string;
600   regoff_t pos = 0;
601   regoff_t pos_add_next = 1;
602 #ifdef TRE_MBSTATE
603   mbstate_t mbstate;
604 #endif /* TRE_MBSTATE */
605   int reg_notbol = eflags & REG_NOTBOL;
606   int reg_noteol = eflags & REG_NOTEOL;
607   int reg_newline = tnfa->cflags & REG_NEWLINE;
608 
609   /* These are used to remember the necessary values of the above
610      variables to return to the position where the current search
611      started from. */
612   int next_c_start;
613   const char *str_byte_start;
614   regoff_t pos_start = -1;
615 #ifdef TRE_MBSTATE
616   mbstate_t mbstate_start;
617 #endif /* TRE_MBSTATE */
618 
619   /* End offset of best match so far, or -1 if no match found yet. */
620   regoff_t match_eo = -1;
621   /* Tag arrays. */
622   int *next_tags;
623   regoff_t *tags = NULL;
624   /* Current TNFA state. */
625   tre_tnfa_transition_t *state;
626   int *states_seen = NULL;
627 
628   /* Memory allocator to for allocating the backtracking stack. */
629   tre_mem_t mem = tre_bt_mem_new();
630 
631   /* The backtracking stack. */
632   tre_backtrack_t stack;
633 
634   tre_tnfa_transition_t *trans_i;
635   regmatch_t *pmatch = NULL;
636   int ret;
637 
638 #ifdef TRE_MBSTATE
639   memset(&mbstate, '\0', sizeof(mbstate));
640 #endif /* TRE_MBSTATE */
641 
642   if (!mem)
643     return REG_ESPACE;
644   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
645   if (!stack)
646     {
647       ret = REG_ESPACE;
648       goto error_exit;
649     }
650   stack->prev = NULL;
651   stack->next = NULL;
652 
653   if (tnfa->num_tags)
654     {
655       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
656       if (!tags)
657 	{
658 	  ret = REG_ESPACE;
659 	  goto error_exit;
660 	}
661     }
662   if (tnfa->num_submatches)
663     {
664       pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
665       if (!pmatch)
666 	{
667 	  ret = REG_ESPACE;
668 	  goto error_exit;
669 	}
670     }
671   if (tnfa->num_states)
672     {
673       states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
674       if (!states_seen)
675 	{
676 	  ret = REG_ESPACE;
677 	  goto error_exit;
678 	}
679     }
680 
681  retry:
682   {
683     int i;
684     for (i = 0; i < tnfa->num_tags; i++)
685       {
686 	tags[i] = -1;
687 	if (match_tags)
688 	  match_tags[i] = -1;
689       }
690     for (i = 0; i < tnfa->num_states; i++)
691       states_seen[i] = 0;
692   }
693 
694   state = NULL;
695   pos = pos_start;
696   GET_NEXT_WCHAR();
697   pos_start = pos;
698   next_c_start = next_c;
699   str_byte_start = str_byte;
700 #ifdef TRE_MBSTATE
701   mbstate_start = mbstate;
702 #endif /* TRE_MBSTATE */
703 
704   /* Handle initial states. */
705   next_tags = NULL;
706   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
707     {
708       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
709 	{
710 	  continue;
711 	}
712       if (state == NULL)
713 	{
714 	  /* Start from this state. */
715 	  state = trans_i->state;
716 	  next_tags = trans_i->tags;
717 	}
718       else
719 	{
720 	  /* Backtrack to this state. */
721 	  BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
722 			trans_i->state_id, next_c, tags, mbstate);
723 	  {
724 	    int *tmp = trans_i->tags;
725 	    if (tmp)
726 	      while (*tmp >= 0)
727 		stack->item.tags[*tmp++] = pos;
728 	  }
729 	}
730     }
731 
732   if (next_tags)
733     for (; *next_tags >= 0; next_tags++)
734       tags[*next_tags] = pos;
735 
736 
737   if (state == NULL)
738     goto backtrack;
739 
740   while (1)
741     {
742       tre_tnfa_transition_t *next_state;
743       int empty_br_match;
744 
745       if (state == tnfa->final)
746 	{
747 	  if (match_eo < pos
748 	      || (match_eo == pos
749 		  && match_tags
750 		  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
751 				   tags, match_tags)))
752 	    {
753 	      int i;
754 	      /* This match wins the previous match. */
755 	      match_eo = pos;
756 	      if (match_tags)
757 		for (i = 0; i < tnfa->num_tags; i++)
758 		  match_tags[i] = tags[i];
759 	    }
760 	  /* Our TNFAs never have transitions leaving from the final state,
761 	     so we jump right to backtracking. */
762 	  goto backtrack;
763 	}
764 
765       /* Go to the next character in the input string. */
766       empty_br_match = 0;
767       trans_i = state;
768       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
769 	{
770 	  /* This is a back reference state.  All transitions leaving from
771 	     this state have the same back reference "assertion".  Instead
772 	     of reading the next character, we match the back reference. */
773 	  regoff_t so, eo;
774 	  int bt = trans_i->u.backref;
775 	  regoff_t bt_len;
776 	  int result;
777 
778 	  /* Get the substring we need to match against.  Remember to
779 	     turn off REG_NOSUB temporarily. */
780 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
781 			  tnfa, tags, pos);
782 	  so = pmatch[bt].rm_so;
783 	  eo = pmatch[bt].rm_eo;
784 	  bt_len = eo - so;
785 
786 	  result = strncmp((const char*)string + so, str_byte - 1,
787 				 (size_t)bt_len);
788 
789 	  if (result == 0)
790 	    {
791 	      /* Back reference matched.  Check for infinite loop. */
792 	      if (bt_len == 0)
793 		empty_br_match = 1;
794 	      if (empty_br_match && states_seen[trans_i->state_id])
795 		{
796 		  goto backtrack;
797 		}
798 
799 	      states_seen[trans_i->state_id] = empty_br_match;
800 
801 	      /* Advance in input string and resync `prev_c', `next_c'
802 		 and pos. */
803 	      str_byte += bt_len - 1;
804 	      pos += bt_len - 1;
805 	      GET_NEXT_WCHAR();
806 	    }
807 	  else
808 	    {
809 	      goto backtrack;
810 	    }
811 	}
812       else
813 	{
814 	  /* Check for end of string. */
815 	  if (next_c == L'\0')
816 		goto backtrack;
817 
818 	  /* Read the next character. */
819 	  GET_NEXT_WCHAR();
820 	}
821 
822       next_state = NULL;
823       for (trans_i = state; trans_i->state; trans_i++)
824 	{
825 	  if (trans_i->code_min <= (tre_cint_t)prev_c
826 	      && trans_i->code_max >= (tre_cint_t)prev_c)
827 	    {
828 	      if (trans_i->assertions
829 		  && (CHECK_ASSERTIONS(trans_i->assertions)
830 		      || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
831 		{
832 		  continue;
833 		}
834 
835 	      if (next_state == NULL)
836 		{
837 		  /* First matching transition. */
838 		  next_state = trans_i->state;
839 		  next_tags = trans_i->tags;
840 		}
841 	      else
842 		{
843 		  /* Second matching transition.  We may need to backtrack here
844 		     to take this transition instead of the first one, so we
845 		     push this transition in the backtracking stack so we can
846 		     jump back here if needed. */
847 		  BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
848 				trans_i->state_id, next_c, tags, mbstate);
849 		  {
850 		    int *tmp;
851 		    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
852 		      stack->item.tags[*tmp] = pos;
853 		  }
854 #if 0 /* XXX - it's important not to look at all transitions here to keep
855 	 the stack small! */
856 		  break;
857 #endif
858 		}
859 	    }
860 	}
861 
862       if (next_state != NULL)
863 	{
864 	  /* Matching transitions were found.  Take the first one. */
865 	  state = next_state;
866 
867 	  /* Update the tag values. */
868 	  if (next_tags)
869 	    while (*next_tags >= 0)
870 	      tags[*next_tags++] = pos;
871 	}
872       else
873 	{
874 	backtrack:
875 	  /* A matching transition was not found.  Try to backtrack. */
876 	  if (stack->prev)
877 	    {
878 	      if (stack->item.state->assertions & ASSERT_BACKREF)
879 		{
880 		  states_seen[stack->item.state_id] = 0;
881 		}
882 
883 	      BT_STACK_POP();
884 	    }
885 	  else if (match_eo < 0)
886 	    {
887 	      /* Try starting from a later position in the input string. */
888 	      /* Check for end of string. */
889 	      if (next_c == L'\0')
890 		    {
891 		      break;
892 		    }
893 	      next_c = next_c_start;
894 #ifdef TRE_MBSTATE
895 	      mbstate = mbstate_start;
896 #endif /* TRE_MBSTATE */
897 	      str_byte = str_byte_start;
898 	      goto retry;
899 	    }
900 	  else
901 	    {
902 	      break;
903 	    }
904 	}
905     }
906 
907   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
908   *match_end_ofs = match_eo;
909 
910  error_exit:
911   tre_bt_mem_destroy(mem);
912 #ifndef TRE_USE_ALLOCA
913   if (tags)
914     xfree(tags);
915   if (pmatch)
916     xfree(pmatch);
917   if (states_seen)
918     xfree(states_seen);
919 #endif /* !TRE_USE_ALLOCA */
920 
921   return ret;
922 }
923 
924 /***********************************************************************
925  from regexec.c
926 ***********************************************************************/
927 
928 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
929    endpoint values. */
930 static void
tre_fill_pmatch(size_t nmatch,regmatch_t pmatch[],int cflags,const tre_tnfa_t * tnfa,regoff_t * tags,regoff_t match_eo)931 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
932 		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
933 {
934   tre_submatch_data_t *submatch_data;
935   unsigned int i, j;
936   int *parents;
937 
938   i = 0;
939   if (match_eo >= 0 && !(cflags & REG_NOSUB))
940     {
941       /* Construct submatch offsets from the tags. */
942       submatch_data = tnfa->submatch_data;
943       while (i < tnfa->num_submatches && i < nmatch)
944 	{
945 	  if (submatch_data[i].so_tag == tnfa->end_tag)
946 	    pmatch[i].rm_so = match_eo;
947 	  else
948 	    pmatch[i].rm_so = tags[submatch_data[i].so_tag];
949 
950 	  if (submatch_data[i].eo_tag == tnfa->end_tag)
951 	    pmatch[i].rm_eo = match_eo;
952 	  else
953 	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
954 
955 	  /* If either of the endpoints were not used, this submatch
956 	     was not part of the match. */
957 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
958 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
959 
960 	  i++;
961 	}
962       /* Reset all submatches that are not within all of their parent
963 	 submatches. */
964       i = 0;
965       while (i < tnfa->num_submatches && i < nmatch)
966 	{
967 	  if (pmatch[i].rm_eo == -1)
968 	    assert(pmatch[i].rm_so == -1);
969 	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
970 
971 	  parents = submatch_data[i].parents;
972 	  if (parents != NULL)
973 	    for (j = 0; parents[j] >= 0; j++)
974 	      {
975 		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
976 		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
977 		  pmatch[i].rm_so = pmatch[i].rm_eo = -1;
978 	      }
979 	  i++;
980 	}
981     }
982 
983   while (i < nmatch)
984     {
985       pmatch[i].rm_so = -1;
986       pmatch[i].rm_eo = -1;
987       i++;
988     }
989 }
990 
991 
992 /*
993   Wrapper functions for POSIX compatible regexp matching.
994 */
995 
996 int
regexec(const regex_t * restrict preg,const char * restrict string,size_t nmatch,regmatch_t pmatch[restrict],int eflags)997 regexec(const regex_t *restrict preg, const char *restrict string,
998 	  size_t nmatch, regmatch_t pmatch[restrict], int eflags)
999 {
1000   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
1001   reg_errcode_t status;
1002   regoff_t *tags = NULL, eo;
1003   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
1004   if (tnfa->num_tags > 0 && nmatch > 0)
1005     {
1006       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
1007       if (tags == NULL)
1008 	return REG_ESPACE;
1009     }
1010 
1011   /* Dispatch to the appropriate matcher. */
1012   if (tnfa->have_backrefs)
1013     {
1014       /* The regex has back references, use the backtracking matcher. */
1015       status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
1016     }
1017   else
1018     {
1019       /* Exact matching, no back references, use the parallel matcher. */
1020       status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
1021     }
1022 
1023   if (status == REG_OK)
1024     /* A match was found, so fill the submatch registers. */
1025     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
1026   if (tags)
1027     xfree(tags);
1028   return status;
1029 }
1030