1 // Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Tested by search_test.cc.
6 //
7 // Prog::SearchNFA, an NFA search.
8 // This is an actual NFA like the theorists talk about,
9 // not the pseudo-NFA found in backtracking regexp implementations.
10 //
11 // IMPLEMENTATION
12 //
13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14 // which is a variant of the one described in Thompson's 1968 CACM paper.
15 // See http://swtch.com/~rsc/regexp/ for various history. The main feature
16 // over the DFA implementation is that it tracks submatch boundaries.
17 //
18 // When the choice of submatch boundaries is ambiguous, this particular
19 // implementation makes the same choices that traditional backtracking
20 // implementations (in particular, Perl and PCRE) do.
21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22 // time in the length of the input.
23 //
24 // Like Thompson's original machine and like the DFA implementation, this
25 // implementation notices a match only once it is one byte past it.
26
27 #include <stdio.h>
28 #include <string.h>
29 #include <algorithm>
30 #include <string>
31 #include <utility>
32 #include <vector>
33
34 #include "re2/prog.h"
35 #include "re2/regexp.h"
36 #include "util/logging.h"
37 #include "util/pod_array.h"
38 #include "util/sparse_array.h"
39 #include "util/sparse_set.h"
40 #include "util/strutil.h"
41
42 namespace re2 {
43
44 static const bool ExtraDebug = false;
45
46 class NFA {
47 public:
48 NFA(Prog* prog);
49 ~NFA();
50
51 // Searches for a matching string.
52 // * If anchored is true, only considers matches starting at offset.
53 // Otherwise finds lefmost match at or after offset.
54 // * If longest is true, returns the longest match starting
55 // at the chosen start point. Otherwise returns the so-called
56 // left-biased match, the one traditional backtracking engines
57 // (like Perl and PCRE) find.
58 // Records submatch boundaries in submatch[1..nsubmatch-1].
59 // Submatch[0] is the entire match. When there is a choice in
60 // which text matches each subexpression, the submatch boundaries
61 // are chosen to match what a backtracking implementation would choose.
62 bool Search(const StringPiece& text, const StringPiece& context,
63 bool anchored, bool longest,
64 StringPiece* submatch, int nsubmatch);
65
66 private:
67 struct Thread {
68 union {
69 int ref;
70 Thread* next; // when on free list
71 };
72 const char** capture;
73 };
74
75 // State for explicit stack in AddToThreadq.
76 struct AddState {
77 int id; // Inst to process
78 Thread* t; // if not null, set t0 = t before processing id
79 };
80
81 // Threadq is a list of threads. The list is sorted by the order
82 // in which Perl would explore that particular state -- the earlier
83 // choices appear earlier in the list.
84 typedef SparseArray<Thread*> Threadq;
85
86 inline Thread* AllocThread();
87 inline Thread* Incref(Thread* t);
88 inline void Decref(Thread* t);
89
90 // Follows all empty arrows from id0 and enqueues all the states reached.
91 // Enqueues only the ByteRange instructions that match byte c.
92 // context is used (with p) for evaluating empty-width specials.
93 // p is the current input position, and t0 is the current thread.
94 void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
95 const char* p, Thread* t0);
96
97 // Run runq on byte c, appending new states to nextq.
98 // Updates matched_ and match_ as new, better matches are found.
99 // context is used (with p) for evaluating empty-width specials.
100 // p is the position of byte c in the input string for AddToThreadq;
101 // p-1 will be used when processing Match instructions.
102 // Frees all the threads on runq.
103 // If there is a shortcut to the end, returns that shortcut.
104 int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
105 const char* p);
106
107 // Returns text version of capture information, for debugging.
108 string FormatCapture(const char** capture);
109
110 inline void CopyCapture(const char** dst, const char** src);
111
112 Prog* prog_; // underlying program
113 int start_; // start instruction in program
114 int ncapture_; // number of submatches to track
115 bool longest_; // whether searching for longest match
116 bool endmatch_; // whether match must end at text.end()
117 const char* btext_; // beginning of text being matched (for FormatSubmatch)
118 const char* etext_; // end of text being matched (for endmatch_)
119 Threadq q0_, q1_; // pre-allocated for Search.
120 PODArray<AddState> stack_; // pre-allocated for AddToThreadq
121 Thread* free_threads_; // free list
122 const char** match_; // best match so far
123 bool matched_; // any match so far?
124
125 NFA(const NFA&) = delete;
126 NFA& operator=(const NFA&) = delete;
127 };
128
NFA(Prog * prog)129 NFA::NFA(Prog* prog) {
130 prog_ = prog;
131 start_ = prog_->start();
132 ncapture_ = 0;
133 longest_ = false;
134 endmatch_ = false;
135 btext_ = NULL;
136 etext_ = NULL;
137 q0_.resize(prog_->size());
138 q1_.resize(prog_->size());
139 // See NFA::AddToThreadq() for why this is so.
140 int nstack = 2*prog_->inst_count(kInstCapture) +
141 prog_->inst_count(kInstEmptyWidth) +
142 prog_->inst_count(kInstNop) + 1; // + 1 for start inst
143 stack_ = PODArray<AddState>(nstack);
144 free_threads_ = NULL;
145 match_ = NULL;
146 matched_ = false;
147 }
148
~NFA()149 NFA::~NFA() {
150 delete[] match_;
151 Thread* next;
152 for (Thread* t = free_threads_; t; t = next) {
153 next = t->next;
154 delete[] t->capture;
155 delete t;
156 }
157 }
158
AllocThread()159 NFA::Thread* NFA::AllocThread() {
160 Thread* t = free_threads_;
161 if (t == NULL) {
162 t = new Thread;
163 t->ref = 1;
164 t->capture = new const char*[ncapture_];
165 return t;
166 }
167 free_threads_ = t->next;
168 t->ref = 1;
169 return t;
170 }
171
Incref(Thread * t)172 NFA::Thread* NFA::Incref(Thread* t) {
173 DCHECK(t != NULL);
174 t->ref++;
175 return t;
176 }
177
Decref(Thread * t)178 void NFA::Decref(Thread* t) {
179 if (t == NULL)
180 return;
181 t->ref--;
182 if (t->ref > 0)
183 return;
184 DCHECK_EQ(t->ref, 0);
185 t->next = free_threads_;
186 free_threads_ = t;
187 }
188
CopyCapture(const char ** dst,const char ** src)189 void NFA::CopyCapture(const char** dst, const char** src) {
190 for (int i = 0; i < ncapture_; i+=2) {
191 dst[i] = src[i];
192 dst[i+1] = src[i+1];
193 }
194 }
195
196 // Follows all empty arrows from id0 and enqueues all the states reached.
197 // Enqueues only the ByteRange instructions that match byte c.
198 // context is used (with p) for evaluating empty-width specials.
199 // p is the current input position, and t0 is the current thread.
AddToThreadq(Threadq * q,int id0,int c,const StringPiece & context,const char * p,Thread * t0)200 void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
201 const char* p, Thread* t0) {
202 if (id0 == 0)
203 return;
204
205 // Use stack_ to hold our stack of instructions yet to process.
206 // It was preallocated as follows:
207 // two entries per Capture;
208 // one entry per EmptyWidth; and
209 // one entry per Nop.
210 // This reflects the maximum number of stack pushes that each can
211 // perform. (Each instruction can be processed at most once.)
212 AddState* stk = stack_.data();
213 int nstk = 0;
214
215 stk[nstk++] = {id0, NULL};
216 while (nstk > 0) {
217 DCHECK_LE(nstk, stack_.size());
218 AddState a = stk[--nstk];
219
220 Loop:
221 if (a.t != NULL) {
222 // t0 was a thread that we allocated and copied in order to
223 // record the capture, so we must now decref it.
224 Decref(t0);
225 t0 = a.t;
226 }
227
228 int id = a.id;
229 if (id == 0)
230 continue;
231 if (q->has_index(id)) {
232 if (ExtraDebug)
233 fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
234 continue;
235 }
236
237 // Create entry in q no matter what. We might fill it in below,
238 // or we might not. Even if not, it is necessary to have it,
239 // so that we don't revisit id0 during the recursion.
240 q->set_new(id, NULL);
241 Thread** tp = &q->get_existing(id);
242 int j;
243 Thread* t;
244 Prog::Inst* ip = prog_->inst(id);
245 switch (ip->opcode()) {
246 default:
247 LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
248 break;
249
250 case kInstFail:
251 break;
252
253 case kInstAltMatch:
254 // Save state; will pick up at next byte.
255 t = Incref(t0);
256 *tp = t;
257
258 DCHECK(!ip->last());
259 a = {id+1, NULL};
260 goto Loop;
261
262 case kInstNop:
263 if (!ip->last())
264 stk[nstk++] = {id+1, NULL};
265
266 // Continue on.
267 a = {ip->out(), NULL};
268 goto Loop;
269
270 case kInstCapture:
271 if (!ip->last())
272 stk[nstk++] = {id+1, NULL};
273
274 if ((j=ip->cap()) < ncapture_) {
275 // Push a dummy whose only job is to restore t0
276 // once we finish exploring this possibility.
277 stk[nstk++] = {0, t0};
278
279 // Record capture.
280 t = AllocThread();
281 CopyCapture(t->capture, t0->capture);
282 t->capture[j] = p;
283 t0 = t;
284 }
285 a = {ip->out(), NULL};
286 goto Loop;
287
288 case kInstByteRange:
289 if (!ip->Matches(c))
290 goto Next;
291 FALLTHROUGH_INTENDED;
292
293 case kInstMatch:
294 // Save state; will pick up at next byte.
295 t = Incref(t0);
296 *tp = t;
297 if (ExtraDebug)
298 fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
299
300 Next:
301 if (ip->last())
302 break;
303 a = {id+1, NULL};
304 goto Loop;
305
306 case kInstEmptyWidth:
307 if (!ip->last())
308 stk[nstk++] = {id+1, NULL};
309
310 // Continue on if we have all the right flag bits.
311 if (ip->empty() & ~Prog::EmptyFlags(context, p))
312 break;
313 a = {ip->out(), NULL};
314 goto Loop;
315 }
316 }
317 }
318
319 // Run runq on byte c, appending new states to nextq.
320 // Updates matched_ and match_ as new, better matches are found.
321 // context is used (with p) for evaluating empty-width specials.
322 // p is the position of byte c in the input string for AddToThreadq;
323 // p-1 will be used when processing Match instructions.
324 // Frees all the threads on runq.
325 // If there is a shortcut to the end, returns that shortcut.
Step(Threadq * runq,Threadq * nextq,int c,const StringPiece & context,const char * p)326 int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
327 const char* p) {
328 nextq->clear();
329
330 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
331 Thread* t = i->value();
332 if (t == NULL)
333 continue;
334
335 if (longest_) {
336 // Can skip any threads started after our current best match.
337 if (matched_ && match_[0] < t->capture[0]) {
338 Decref(t);
339 continue;
340 }
341 }
342
343 int id = i->index();
344 Prog::Inst* ip = prog_->inst(id);
345
346 switch (ip->opcode()) {
347 default:
348 // Should only see the values handled below.
349 LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
350 break;
351
352 case kInstByteRange:
353 AddToThreadq(nextq, ip->out(), c, context, p, t);
354 break;
355
356 case kInstAltMatch:
357 if (i != runq->begin())
358 break;
359 // The match is ours if we want it.
360 if (ip->greedy(prog_) || longest_) {
361 CopyCapture(match_, t->capture);
362 matched_ = true;
363
364 Decref(t);
365 for (++i; i != runq->end(); ++i)
366 Decref(i->value());
367 runq->clear();
368 if (ip->greedy(prog_))
369 return ip->out1();
370 return ip->out();
371 }
372 break;
373
374 case kInstMatch: {
375 // Avoid invoking undefined behavior when p happens
376 // to be null - and p-1 would be meaningless anyway.
377 if (p == NULL)
378 break;
379
380 if (endmatch_ && p-1 != etext_)
381 break;
382
383 if (longest_) {
384 // Leftmost-longest mode: save this match only if
385 // it is either farther to the left or at the same
386 // point but longer than an existing match.
387 if (!matched_ || t->capture[0] < match_[0] ||
388 (t->capture[0] == match_[0] && p-1 > match_[1])) {
389 CopyCapture(match_, t->capture);
390 match_[1] = p-1;
391 matched_ = true;
392 }
393 } else {
394 // Leftmost-biased mode: this match is by definition
395 // better than what we've already found (see next line).
396 CopyCapture(match_, t->capture);
397 match_[1] = p-1;
398 matched_ = true;
399
400 // Cut off the threads that can only find matches
401 // worse than the one we just found: don't run the
402 // rest of the current Threadq.
403 Decref(t);
404 for (++i; i != runq->end(); ++i)
405 Decref(i->value());
406 runq->clear();
407 return 0;
408 }
409 break;
410 }
411 }
412 Decref(t);
413 }
414 runq->clear();
415 return 0;
416 }
417
FormatCapture(const char ** capture)418 string NFA::FormatCapture(const char** capture) {
419 string s;
420
421 for (int i = 0; i < ncapture_; i+=2) {
422 if (capture[i] == NULL)
423 StringAppendF(&s, "(?,?)");
424 else if (capture[i+1] == NULL)
425 StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
426 else
427 StringAppendF(&s, "(%d,%d)",
428 (int)(capture[i] - btext_),
429 (int)(capture[i+1] - btext_));
430 }
431 return s;
432 }
433
Search(const StringPiece & text,const StringPiece & const_context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)434 bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
435 bool anchored, bool longest,
436 StringPiece* submatch, int nsubmatch) {
437 if (start_ == 0)
438 return false;
439
440 StringPiece context = const_context;
441 if (context.begin() == NULL)
442 context = text;
443
444 // Sanity check: make sure that text lies within context.
445 if (text.begin() < context.begin() || text.end() > context.end()) {
446 LOG(DFATAL) << "context does not contain text";
447 return false;
448 }
449
450 if (prog_->anchor_start() && context.begin() != text.begin())
451 return false;
452 if (prog_->anchor_end() && context.end() != text.end())
453 return false;
454 anchored |= prog_->anchor_start();
455 if (prog_->anchor_end()) {
456 longest = true;
457 endmatch_ = true;
458 etext_ = text.end();
459 }
460
461 if (nsubmatch < 0) {
462 LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
463 return false;
464 }
465
466 // Save search parameters.
467 ncapture_ = 2*nsubmatch;
468 longest_ = longest;
469
470 if (nsubmatch == 0) {
471 // We need to maintain match[0], both to distinguish the
472 // longest match (if longest is true) and also to tell
473 // whether we've seen any matches at all.
474 ncapture_ = 2;
475 }
476
477 match_ = new const char*[ncapture_];
478 matched_ = false;
479
480 // For debugging prints.
481 btext_ = context.begin();
482
483 if (ExtraDebug)
484 fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
485 string(text).c_str(), string(context).c_str(), anchored, longest);
486
487 // Set up search.
488 Threadq* runq = &q0_;
489 Threadq* nextq = &q1_;
490 runq->clear();
491 nextq->clear();
492 memset(&match_[0], 0, ncapture_*sizeof match_[0]);
493
494 // Loop over the text, stepping the machine.
495 for (const char* p = text.begin();; p++) {
496 if (ExtraDebug) {
497 int c = 0;
498 if (p == context.begin())
499 c = '^';
500 else if (p > text.end())
501 c = '$';
502 else if (p < text.end())
503 c = p[0] & 0xFF;
504
505 fprintf(stderr, "%c:", c);
506 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
507 Thread* t = i->value();
508 if (t == NULL)
509 continue;
510 fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
511 }
512 fprintf(stderr, "\n");
513 }
514
515 // This is a no-op the first time around the loop because runq is empty.
516 int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p);
517 DCHECK_EQ(runq->size(), 0);
518 using std::swap;
519 swap(nextq, runq);
520 nextq->clear();
521 if (id != 0) {
522 // We're done: full match ahead.
523 p = text.end();
524 for (;;) {
525 Prog::Inst* ip = prog_->inst(id);
526 switch (ip->opcode()) {
527 default:
528 LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
529 break;
530
531 case kInstCapture:
532 if (ip->cap() < ncapture_)
533 match_[ip->cap()] = p;
534 id = ip->out();
535 continue;
536
537 case kInstNop:
538 id = ip->out();
539 continue;
540
541 case kInstMatch:
542 match_[1] = p;
543 matched_ = true;
544 break;
545 }
546 break;
547 }
548 break;
549 }
550
551 if (p > text.end())
552 break;
553
554 // Start a new thread if there have not been any matches.
555 // (No point in starting a new thread if there have been
556 // matches, since it would be to the right of the match
557 // we already found.)
558 if (!matched_ && (!anchored || p == text.begin())) {
559 // If there's a required first byte for an unanchored search
560 // and we're not in the middle of any possible matches,
561 // use memchr to search for the byte quickly.
562 int fb = prog_->first_byte();
563 if (!anchored && runq->size() == 0 &&
564 fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
565 p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
566 if (p == NULL) {
567 p = text.end();
568 }
569 }
570
571 Thread* t = AllocThread();
572 CopyCapture(t->capture, match_);
573 t->capture[0] = p;
574 AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p,
575 t);
576 Decref(t);
577 }
578
579 // If all the threads have died, stop early.
580 if (runq->size() == 0) {
581 if (ExtraDebug)
582 fprintf(stderr, "dead\n");
583 break;
584 }
585 }
586
587 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
588 Decref(i->value());
589
590 if (matched_) {
591 for (int i = 0; i < nsubmatch; i++)
592 submatch[i] =
593 StringPiece(match_[2 * i],
594 static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
595 if (ExtraDebug)
596 fprintf(stderr, "match (%td,%td)\n",
597 match_[0] - btext_, match_[1] - btext_);
598 return true;
599 }
600 return false;
601 }
602
603 // Computes whether all successful matches have a common first byte,
604 // and if so, returns that byte. If not, returns -1.
ComputeFirstByte()605 int Prog::ComputeFirstByte() {
606 int b = -1;
607 SparseSet q(size());
608 q.insert(start());
609 for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
610 int id = *it;
611 Prog::Inst* ip = inst(id);
612 switch (ip->opcode()) {
613 default:
614 LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
615 break;
616
617 case kInstMatch:
618 // The empty string matches: no first byte.
619 return -1;
620
621 case kInstByteRange:
622 if (!ip->last())
623 q.insert(id+1);
624
625 // Must match only a single byte
626 if (ip->lo() != ip->hi())
627 return -1;
628 if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
629 return -1;
630 // If we haven't seen any bytes yet, record it;
631 // otherwise must match the one we saw before.
632 if (b == -1)
633 b = ip->lo();
634 else if (b != ip->lo())
635 return -1;
636 break;
637
638 case kInstNop:
639 case kInstCapture:
640 case kInstEmptyWidth:
641 if (!ip->last())
642 q.insert(id+1);
643
644 // Continue on.
645 // Ignore ip->empty() flags for kInstEmptyWidth
646 // in order to be as conservative as possible
647 // (assume all possible empty-width flags are true).
648 if (ip->out())
649 q.insert(ip->out());
650 break;
651
652 case kInstAltMatch:
653 DCHECK(!ip->last());
654 q.insert(id+1);
655 break;
656
657 case kInstFail:
658 break;
659 }
660 }
661 return b;
662 }
663
664 bool
SearchNFA(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)665 Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
666 Anchor anchor, MatchKind kind,
667 StringPiece* match, int nmatch) {
668 if (ExtraDebug)
669 Dump();
670
671 NFA nfa(this);
672 StringPiece sp;
673 if (kind == kFullMatch) {
674 anchor = kAnchored;
675 if (nmatch == 0) {
676 match = &sp;
677 nmatch = 1;
678 }
679 }
680 if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
681 return false;
682 if (kind == kFullMatch && match[0].end() != text.end())
683 return false;
684 return true;
685 }
686
687 // For each instruction i in the program reachable from the start, compute the
688 // number of instructions reachable from i by following only empty transitions
689 // and record that count as fanout[i].
690 //
691 // fanout holds the results and is also the work queue for the outer iteration.
692 // reachable holds the reached nodes for the inner iteration.
Fanout(SparseArray<int> * fanout)693 void Prog::Fanout(SparseArray<int>* fanout) {
694 DCHECK_EQ(fanout->max_size(), size());
695 SparseSet reachable(size());
696 fanout->clear();
697 fanout->set_new(start(), 0);
698 for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
699 int* count = &i->value();
700 reachable.clear();
701 reachable.insert(i->index());
702 for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
703 int id = *j;
704 Prog::Inst* ip = inst(id);
705 switch (ip->opcode()) {
706 default:
707 LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
708 break;
709
710 case kInstByteRange:
711 if (!ip->last())
712 reachable.insert(id+1);
713
714 (*count)++;
715 if (!fanout->has_index(ip->out())) {
716 fanout->set_new(ip->out(), 0);
717 }
718 break;
719
720 case kInstAltMatch:
721 DCHECK(!ip->last());
722 reachable.insert(id+1);
723 break;
724
725 case kInstCapture:
726 case kInstEmptyWidth:
727 case kInstNop:
728 if (!ip->last())
729 reachable.insert(id+1);
730
731 reachable.insert(ip->out());
732 break;
733
734 case kInstMatch:
735 if (!ip->last())
736 reachable.insert(id+1);
737 break;
738
739 case kInstFail:
740 break;
741 }
742 }
743 }
744 }
745
746 } // namespace re2
747