xref: /aosp_15_r20/external/regex-re2/re2/nfa.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc.
6 //
7 // Prog::SearchNFA, an NFA search.
8 // This is an actual NFA like the theorists talk about,
9 // not the pseudo-NFA found in backtracking regexp implementations.
10 //
11 // IMPLEMENTATION
12 //
13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14 // which is a variant of the one described in Thompson's 1968 CACM paper.
15 // See http://swtch.com/~rsc/regexp/ for various history.  The main feature
16 // over the DFA implementation is that it tracks submatch boundaries.
17 //
18 // When the choice of submatch boundaries is ambiguous, this particular
19 // implementation makes the same choices that traditional backtracking
20 // implementations (in particular, Perl and PCRE) do.
21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22 // time in the length of the input.
23 //
24 // Like Thompson's original machine and like the DFA implementation, this
25 // implementation notices a match only once it is one byte past it.
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include <algorithm>
30 #include <string>
31 #include <utility>
32 #include <vector>
33 
34 #include "re2/prog.h"
35 #include "re2/regexp.h"
36 #include "util/logging.h"
37 #include "util/pod_array.h"
38 #include "util/sparse_array.h"
39 #include "util/sparse_set.h"
40 #include "util/strutil.h"
41 
42 namespace re2 {
43 
44 static const bool ExtraDebug = false;
45 
46 class NFA {
47  public:
48   NFA(Prog* prog);
49   ~NFA();
50 
51   // Searches for a matching string.
52   //   * If anchored is true, only considers matches starting at offset.
53   //     Otherwise finds lefmost match at or after offset.
54   //   * If longest is true, returns the longest match starting
55   //     at the chosen start point.  Otherwise returns the so-called
56   //     left-biased match, the one traditional backtracking engines
57   //     (like Perl and PCRE) find.
58   // Records submatch boundaries in submatch[1..nsubmatch-1].
59   // Submatch[0] is the entire match.  When there is a choice in
60   // which text matches each subexpression, the submatch boundaries
61   // are chosen to match what a backtracking implementation would choose.
62   bool Search(const StringPiece& text, const StringPiece& context,
63               bool anchored, bool longest,
64               StringPiece* submatch, int nsubmatch);
65 
66  private:
67   struct Thread {
68     union {
69       int ref;
70       Thread* next;  // when on free list
71     };
72     const char** capture;
73   };
74 
75   // State for explicit stack in AddToThreadq.
76   struct AddState {
77     int id;     // Inst to process
78     Thread* t;  // if not null, set t0 = t before processing id
79   };
80 
81   // Threadq is a list of threads.  The list is sorted by the order
82   // in which Perl would explore that particular state -- the earlier
83   // choices appear earlier in the list.
84   typedef SparseArray<Thread*> Threadq;
85 
86   inline Thread* AllocThread();
87   inline Thread* Incref(Thread* t);
88   inline void Decref(Thread* t);
89 
90   // Follows all empty arrows from id0 and enqueues all the states reached.
91   // Enqueues only the ByteRange instructions that match byte c.
92   // context is used (with p) for evaluating empty-width specials.
93   // p is the current input position, and t0 is the current thread.
94   void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
95                     const char* p, Thread* t0);
96 
97   // Run runq on byte c, appending new states to nextq.
98   // Updates matched_ and match_ as new, better matches are found.
99   // context is used (with p) for evaluating empty-width specials.
100   // p is the position of byte c in the input string for AddToThreadq;
101   // p-1 will be used when processing Match instructions.
102   // Frees all the threads on runq.
103   // If there is a shortcut to the end, returns that shortcut.
104   int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
105            const char* p);
106 
107   // Returns text version of capture information, for debugging.
108   string FormatCapture(const char** capture);
109 
110   inline void CopyCapture(const char** dst, const char** src);
111 
112   Prog* prog_;                // underlying program
113   int start_;                 // start instruction in program
114   int ncapture_;              // number of submatches to track
115   bool longest_;              // whether searching for longest match
116   bool endmatch_;             // whether match must end at text.end()
117   const char* btext_;         // beginning of text being matched (for FormatSubmatch)
118   const char* etext_;         // end of text being matched (for endmatch_)
119   Threadq q0_, q1_;           // pre-allocated for Search.
120   PODArray<AddState> stack_;  // pre-allocated for AddToThreadq
121   Thread* free_threads_;      // free list
122   const char** match_;        // best match so far
123   bool matched_;              // any match so far?
124 
125   NFA(const NFA&) = delete;
126   NFA& operator=(const NFA&) = delete;
127 };
128 
NFA(Prog * prog)129 NFA::NFA(Prog* prog) {
130   prog_ = prog;
131   start_ = prog_->start();
132   ncapture_ = 0;
133   longest_ = false;
134   endmatch_ = false;
135   btext_ = NULL;
136   etext_ = NULL;
137   q0_.resize(prog_->size());
138   q1_.resize(prog_->size());
139   // See NFA::AddToThreadq() for why this is so.
140   int nstack = 2*prog_->inst_count(kInstCapture) +
141                prog_->inst_count(kInstEmptyWidth) +
142                prog_->inst_count(kInstNop) + 1;  // + 1 for start inst
143   stack_ = PODArray<AddState>(nstack);
144   free_threads_ = NULL;
145   match_ = NULL;
146   matched_ = false;
147 }
148 
~NFA()149 NFA::~NFA() {
150   delete[] match_;
151   Thread* next;
152   for (Thread* t = free_threads_; t; t = next) {
153     next = t->next;
154     delete[] t->capture;
155     delete t;
156   }
157 }
158 
AllocThread()159 NFA::Thread* NFA::AllocThread() {
160   Thread* t = free_threads_;
161   if (t == NULL) {
162     t = new Thread;
163     t->ref = 1;
164     t->capture = new const char*[ncapture_];
165     return t;
166   }
167   free_threads_ = t->next;
168   t->ref = 1;
169   return t;
170 }
171 
Incref(Thread * t)172 NFA::Thread* NFA::Incref(Thread* t) {
173   DCHECK(t != NULL);
174   t->ref++;
175   return t;
176 }
177 
Decref(Thread * t)178 void NFA::Decref(Thread* t) {
179   if (t == NULL)
180     return;
181   t->ref--;
182   if (t->ref > 0)
183     return;
184   DCHECK_EQ(t->ref, 0);
185   t->next = free_threads_;
186   free_threads_ = t;
187 }
188 
CopyCapture(const char ** dst,const char ** src)189 void NFA::CopyCapture(const char** dst, const char** src) {
190   for (int i = 0; i < ncapture_; i+=2) {
191     dst[i] = src[i];
192     dst[i+1] = src[i+1];
193   }
194 }
195 
196 // Follows all empty arrows from id0 and enqueues all the states reached.
197 // Enqueues only the ByteRange instructions that match byte c.
198 // context is used (with p) for evaluating empty-width specials.
199 // p is the current input position, and t0 is the current thread.
AddToThreadq(Threadq * q,int id0,int c,const StringPiece & context,const char * p,Thread * t0)200 void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
201                        const char* p, Thread* t0) {
202   if (id0 == 0)
203     return;
204 
205   // Use stack_ to hold our stack of instructions yet to process.
206   // It was preallocated as follows:
207   //   two entries per Capture;
208   //   one entry per EmptyWidth; and
209   //   one entry per Nop.
210   // This reflects the maximum number of stack pushes that each can
211   // perform. (Each instruction can be processed at most once.)
212   AddState* stk = stack_.data();
213   int nstk = 0;
214 
215   stk[nstk++] = {id0, NULL};
216   while (nstk > 0) {
217     DCHECK_LE(nstk, stack_.size());
218     AddState a = stk[--nstk];
219 
220   Loop:
221     if (a.t != NULL) {
222       // t0 was a thread that we allocated and copied in order to
223       // record the capture, so we must now decref it.
224       Decref(t0);
225       t0 = a.t;
226     }
227 
228     int id = a.id;
229     if (id == 0)
230       continue;
231     if (q->has_index(id)) {
232       if (ExtraDebug)
233         fprintf(stderr, "  [%d%s]\n", id, FormatCapture(t0->capture).c_str());
234       continue;
235     }
236 
237     // Create entry in q no matter what.  We might fill it in below,
238     // or we might not.  Even if not, it is necessary to have it,
239     // so that we don't revisit id0 during the recursion.
240     q->set_new(id, NULL);
241     Thread** tp = &q->get_existing(id);
242     int j;
243     Thread* t;
244     Prog::Inst* ip = prog_->inst(id);
245     switch (ip->opcode()) {
246     default:
247       LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
248       break;
249 
250     case kInstFail:
251       break;
252 
253     case kInstAltMatch:
254       // Save state; will pick up at next byte.
255       t = Incref(t0);
256       *tp = t;
257 
258       DCHECK(!ip->last());
259       a = {id+1, NULL};
260       goto Loop;
261 
262     case kInstNop:
263       if (!ip->last())
264         stk[nstk++] = {id+1, NULL};
265 
266       // Continue on.
267       a = {ip->out(), NULL};
268       goto Loop;
269 
270     case kInstCapture:
271       if (!ip->last())
272         stk[nstk++] = {id+1, NULL};
273 
274       if ((j=ip->cap()) < ncapture_) {
275         // Push a dummy whose only job is to restore t0
276         // once we finish exploring this possibility.
277         stk[nstk++] = {0, t0};
278 
279         // Record capture.
280         t = AllocThread();
281         CopyCapture(t->capture, t0->capture);
282         t->capture[j] = p;
283         t0 = t;
284       }
285       a = {ip->out(), NULL};
286       goto Loop;
287 
288     case kInstByteRange:
289       if (!ip->Matches(c))
290         goto Next;
291       FALLTHROUGH_INTENDED;
292 
293     case kInstMatch:
294       // Save state; will pick up at next byte.
295       t = Incref(t0);
296       *tp = t;
297       if (ExtraDebug)
298         fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
299 
300     Next:
301       if (ip->last())
302         break;
303       a = {id+1, NULL};
304       goto Loop;
305 
306     case kInstEmptyWidth:
307       if (!ip->last())
308         stk[nstk++] = {id+1, NULL};
309 
310       // Continue on if we have all the right flag bits.
311       if (ip->empty() & ~Prog::EmptyFlags(context, p))
312         break;
313       a = {ip->out(), NULL};
314       goto Loop;
315     }
316   }
317 }
318 
319 // Run runq on byte c, appending new states to nextq.
320 // Updates matched_ and match_ as new, better matches are found.
321 // context is used (with p) for evaluating empty-width specials.
322 // p is the position of byte c in the input string for AddToThreadq;
323 // p-1 will be used when processing Match instructions.
324 // Frees all the threads on runq.
325 // If there is a shortcut to the end, returns that shortcut.
Step(Threadq * runq,Threadq * nextq,int c,const StringPiece & context,const char * p)326 int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
327               const char* p) {
328   nextq->clear();
329 
330   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
331     Thread* t = i->value();
332     if (t == NULL)
333       continue;
334 
335     if (longest_) {
336       // Can skip any threads started after our current best match.
337       if (matched_ && match_[0] < t->capture[0]) {
338         Decref(t);
339         continue;
340       }
341     }
342 
343     int id = i->index();
344     Prog::Inst* ip = prog_->inst(id);
345 
346     switch (ip->opcode()) {
347       default:
348         // Should only see the values handled below.
349         LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
350         break;
351 
352       case kInstByteRange:
353         AddToThreadq(nextq, ip->out(), c, context, p, t);
354         break;
355 
356       case kInstAltMatch:
357         if (i != runq->begin())
358           break;
359         // The match is ours if we want it.
360         if (ip->greedy(prog_) || longest_) {
361           CopyCapture(match_, t->capture);
362           matched_ = true;
363 
364           Decref(t);
365           for (++i; i != runq->end(); ++i)
366             Decref(i->value());
367           runq->clear();
368           if (ip->greedy(prog_))
369             return ip->out1();
370           return ip->out();
371         }
372         break;
373 
374       case kInstMatch: {
375         // Avoid invoking undefined behavior when p happens
376         // to be null - and p-1 would be meaningless anyway.
377         if (p == NULL)
378           break;
379 
380         if (endmatch_ && p-1 != etext_)
381           break;
382 
383         if (longest_) {
384           // Leftmost-longest mode: save this match only if
385           // it is either farther to the left or at the same
386           // point but longer than an existing match.
387           if (!matched_ || t->capture[0] < match_[0] ||
388               (t->capture[0] == match_[0] && p-1 > match_[1])) {
389             CopyCapture(match_, t->capture);
390             match_[1] = p-1;
391             matched_ = true;
392           }
393         } else {
394           // Leftmost-biased mode: this match is by definition
395           // better than what we've already found (see next line).
396           CopyCapture(match_, t->capture);
397           match_[1] = p-1;
398           matched_ = true;
399 
400           // Cut off the threads that can only find matches
401           // worse than the one we just found: don't run the
402           // rest of the current Threadq.
403           Decref(t);
404           for (++i; i != runq->end(); ++i)
405             Decref(i->value());
406           runq->clear();
407           return 0;
408         }
409         break;
410       }
411     }
412     Decref(t);
413   }
414   runq->clear();
415   return 0;
416 }
417 
FormatCapture(const char ** capture)418 string NFA::FormatCapture(const char** capture) {
419   string s;
420 
421   for (int i = 0; i < ncapture_; i+=2) {
422     if (capture[i] == NULL)
423       StringAppendF(&s, "(?,?)");
424     else if (capture[i+1] == NULL)
425       StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
426     else
427       StringAppendF(&s, "(%d,%d)",
428                     (int)(capture[i] - btext_),
429                     (int)(capture[i+1] - btext_));
430   }
431   return s;
432 }
433 
Search(const StringPiece & text,const StringPiece & const_context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)434 bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
435             bool anchored, bool longest,
436             StringPiece* submatch, int nsubmatch) {
437   if (start_ == 0)
438     return false;
439 
440   StringPiece context = const_context;
441   if (context.begin() == NULL)
442     context = text;
443 
444   // Sanity check: make sure that text lies within context.
445   if (text.begin() < context.begin() || text.end() > context.end()) {
446     LOG(DFATAL) << "context does not contain text";
447     return false;
448   }
449 
450   if (prog_->anchor_start() && context.begin() != text.begin())
451     return false;
452   if (prog_->anchor_end() && context.end() != text.end())
453     return false;
454   anchored |= prog_->anchor_start();
455   if (prog_->anchor_end()) {
456     longest = true;
457     endmatch_ = true;
458     etext_ = text.end();
459   }
460 
461   if (nsubmatch < 0) {
462     LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
463     return false;
464   }
465 
466   // Save search parameters.
467   ncapture_ = 2*nsubmatch;
468   longest_ = longest;
469 
470   if (nsubmatch == 0) {
471     // We need to maintain match[0], both to distinguish the
472     // longest match (if longest is true) and also to tell
473     // whether we've seen any matches at all.
474     ncapture_ = 2;
475   }
476 
477   match_ = new const char*[ncapture_];
478   matched_ = false;
479 
480   // For debugging prints.
481   btext_ = context.begin();
482 
483   if (ExtraDebug)
484     fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
485             string(text).c_str(), string(context).c_str(), anchored, longest);
486 
487   // Set up search.
488   Threadq* runq = &q0_;
489   Threadq* nextq = &q1_;
490   runq->clear();
491   nextq->clear();
492   memset(&match_[0], 0, ncapture_*sizeof match_[0]);
493 
494   // Loop over the text, stepping the machine.
495   for (const char* p = text.begin();; p++) {
496     if (ExtraDebug) {
497       int c = 0;
498       if (p == context.begin())
499         c = '^';
500       else if (p > text.end())
501         c = '$';
502       else if (p < text.end())
503         c = p[0] & 0xFF;
504 
505       fprintf(stderr, "%c:", c);
506       for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
507         Thread* t = i->value();
508         if (t == NULL)
509           continue;
510         fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
511       }
512       fprintf(stderr, "\n");
513     }
514 
515     // This is a no-op the first time around the loop because runq is empty.
516     int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p);
517     DCHECK_EQ(runq->size(), 0);
518     using std::swap;
519     swap(nextq, runq);
520     nextq->clear();
521     if (id != 0) {
522       // We're done: full match ahead.
523       p = text.end();
524       for (;;) {
525         Prog::Inst* ip = prog_->inst(id);
526         switch (ip->opcode()) {
527           default:
528             LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
529             break;
530 
531           case kInstCapture:
532             if (ip->cap() < ncapture_)
533               match_[ip->cap()] = p;
534             id = ip->out();
535             continue;
536 
537           case kInstNop:
538             id = ip->out();
539             continue;
540 
541           case kInstMatch:
542             match_[1] = p;
543             matched_ = true;
544             break;
545         }
546         break;
547       }
548       break;
549     }
550 
551     if (p > text.end())
552       break;
553 
554     // Start a new thread if there have not been any matches.
555     // (No point in starting a new thread if there have been
556     // matches, since it would be to the right of the match
557     // we already found.)
558     if (!matched_ && (!anchored || p == text.begin())) {
559       // If there's a required first byte for an unanchored search
560       // and we're not in the middle of any possible matches,
561       // use memchr to search for the byte quickly.
562       int fb = prog_->first_byte();
563       if (!anchored && runq->size() == 0 &&
564           fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
565         p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
566         if (p == NULL) {
567           p = text.end();
568         }
569       }
570 
571       Thread* t = AllocThread();
572       CopyCapture(t->capture, match_);
573       t->capture[0] = p;
574       AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p,
575                    t);
576       Decref(t);
577     }
578 
579     // If all the threads have died, stop early.
580     if (runq->size() == 0) {
581       if (ExtraDebug)
582         fprintf(stderr, "dead\n");
583       break;
584     }
585   }
586 
587   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
588     Decref(i->value());
589 
590   if (matched_) {
591     for (int i = 0; i < nsubmatch; i++)
592       submatch[i] =
593           StringPiece(match_[2 * i],
594                       static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
595     if (ExtraDebug)
596       fprintf(stderr, "match (%td,%td)\n",
597               match_[0] - btext_, match_[1] - btext_);
598     return true;
599   }
600   return false;
601 }
602 
603 // Computes whether all successful matches have a common first byte,
604 // and if so, returns that byte.  If not, returns -1.
ComputeFirstByte()605 int Prog::ComputeFirstByte() {
606   int b = -1;
607   SparseSet q(size());
608   q.insert(start());
609   for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
610     int id = *it;
611     Prog::Inst* ip = inst(id);
612     switch (ip->opcode()) {
613       default:
614         LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
615         break;
616 
617       case kInstMatch:
618         // The empty string matches: no first byte.
619         return -1;
620 
621       case kInstByteRange:
622         if (!ip->last())
623           q.insert(id+1);
624 
625         // Must match only a single byte
626         if (ip->lo() != ip->hi())
627           return -1;
628         if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
629           return -1;
630         // If we haven't seen any bytes yet, record it;
631         // otherwise must match the one we saw before.
632         if (b == -1)
633           b = ip->lo();
634         else if (b != ip->lo())
635           return -1;
636         break;
637 
638       case kInstNop:
639       case kInstCapture:
640       case kInstEmptyWidth:
641         if (!ip->last())
642           q.insert(id+1);
643 
644         // Continue on.
645         // Ignore ip->empty() flags for kInstEmptyWidth
646         // in order to be as conservative as possible
647         // (assume all possible empty-width flags are true).
648         if (ip->out())
649           q.insert(ip->out());
650         break;
651 
652       case kInstAltMatch:
653         DCHECK(!ip->last());
654         q.insert(id+1);
655         break;
656 
657       case kInstFail:
658         break;
659     }
660   }
661   return b;
662 }
663 
664 bool
SearchNFA(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)665 Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
666                 Anchor anchor, MatchKind kind,
667                 StringPiece* match, int nmatch) {
668   if (ExtraDebug)
669     Dump();
670 
671   NFA nfa(this);
672   StringPiece sp;
673   if (kind == kFullMatch) {
674     anchor = kAnchored;
675     if (nmatch == 0) {
676       match = &sp;
677       nmatch = 1;
678     }
679   }
680   if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
681     return false;
682   if (kind == kFullMatch && match[0].end() != text.end())
683     return false;
684   return true;
685 }
686 
687 // For each instruction i in the program reachable from the start, compute the
688 // number of instructions reachable from i by following only empty transitions
689 // and record that count as fanout[i].
690 //
691 // fanout holds the results and is also the work queue for the outer iteration.
692 // reachable holds the reached nodes for the inner iteration.
Fanout(SparseArray<int> * fanout)693 void Prog::Fanout(SparseArray<int>* fanout) {
694   DCHECK_EQ(fanout->max_size(), size());
695   SparseSet reachable(size());
696   fanout->clear();
697   fanout->set_new(start(), 0);
698   for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
699     int* count = &i->value();
700     reachable.clear();
701     reachable.insert(i->index());
702     for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
703       int id = *j;
704       Prog::Inst* ip = inst(id);
705       switch (ip->opcode()) {
706         default:
707           LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
708           break;
709 
710         case kInstByteRange:
711           if (!ip->last())
712             reachable.insert(id+1);
713 
714           (*count)++;
715           if (!fanout->has_index(ip->out())) {
716             fanout->set_new(ip->out(), 0);
717           }
718           break;
719 
720         case kInstAltMatch:
721           DCHECK(!ip->last());
722           reachable.insert(id+1);
723           break;
724 
725         case kInstCapture:
726         case kInstEmptyWidth:
727         case kInstNop:
728           if (!ip->last())
729             reachable.insert(id+1);
730 
731           reachable.insert(ip->out());
732           break;
733 
734         case kInstMatch:
735           if (!ip->last())
736             reachable.insert(id+1);
737           break;
738 
739         case kInstFail:
740           break;
741       }
742     }
743   }
744 }
745 
746 }  // namespace re2
747