xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/regexp.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression representation.
6 // Tested by parse_test.cc
7 
8 #include "re2/regexp.h"
9 
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13 #include <algorithm>
14 #include <map>
15 #include <string>
16 #include <vector>
17 
18 #include "absl/base/call_once.h"
19 #include "absl/base/macros.h"
20 #include "absl/container/flat_hash_map.h"
21 #include "absl/synchronization/mutex.h"
22 #include "util/logging.h"
23 #include "util/utf.h"
24 #include "re2/pod_array.h"
25 #include "re2/walker-inl.h"
26 
27 namespace re2 {
28 
29 // Constructor.  Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)30 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
31   : op_(static_cast<uint8_t>(op)),
32     simple_(false),
33     parse_flags_(static_cast<uint16_t>(parse_flags)),
34     ref_(1),
35     nsub_(0),
36     down_(NULL) {
37   subone_ = NULL;
38   memset(the_union_, 0, sizeof the_union_);
39 }
40 
41 // Destructor.  Assumes already cleaned up children.
42 // Private: use Decref() instead of delete to destroy Regexps.
43 // Can't call Decref on the sub-Regexps here because
44 // that could cause arbitrarily deep recursion, so
45 // required Decref() to have handled them for us.
~Regexp()46 Regexp::~Regexp() {
47   if (nsub_ > 0)
48     LOG(DFATAL) << "Regexp not destroyed.";
49 
50   switch (op_) {
51     default:
52       break;
53     case kRegexpCapture:
54       delete name_;
55       break;
56     case kRegexpLiteralString:
57       delete[] runes_;
58       break;
59     case kRegexpCharClass:
60       if (cc_)
61         cc_->Delete();
62       delete ccb_;
63       break;
64   }
65 }
66 
67 // If it's possible to destroy this regexp without recurring,
68 // do so and return true.  Else return false.
QuickDestroy()69 bool Regexp::QuickDestroy() {
70   if (nsub_ == 0) {
71     delete this;
72     return true;
73   }
74   return false;
75 }
76 
77 // Similar to EmptyStorage in re2.cc.
78 struct RefStorage {
79   absl::Mutex ref_mutex;
80   absl::flat_hash_map<Regexp*, int> ref_map;
81 };
82 alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];
83 
ref_mutex()84 static inline absl::Mutex* ref_mutex() {
85   return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex;
86 }
87 
ref_map()88 static inline absl::flat_hash_map<Regexp*, int>* ref_map() {
89   return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map;
90 }
91 
Ref()92 int Regexp::Ref() {
93   if (ref_ < kMaxRef)
94     return ref_;
95 
96   absl::MutexLock l(ref_mutex());
97   return (*ref_map())[this];
98 }
99 
100 // Increments reference count, returns object as convenience.
Incref()101 Regexp* Regexp::Incref() {
102   if (ref_ >= kMaxRef-1) {
103     static absl::once_flag ref_once;
104     absl::call_once(ref_once, []() {
105       (void) new (ref_storage) RefStorage;
106     });
107 
108     // Store ref count in overflow map.
109     absl::MutexLock l(ref_mutex());
110     if (ref_ == kMaxRef) {
111       // already overflowed
112       (*ref_map())[this]++;
113     } else {
114       // overflowing now
115       (*ref_map())[this] = kMaxRef;
116       ref_ = kMaxRef;
117     }
118     return this;
119   }
120 
121   ref_++;
122   return this;
123 }
124 
125 // Decrements reference count and deletes this object if count reaches 0.
Decref()126 void Regexp::Decref() {
127   if (ref_ == kMaxRef) {
128     // Ref count is stored in overflow map.
129     absl::MutexLock l(ref_mutex());
130     int r = (*ref_map())[this] - 1;
131     if (r < kMaxRef) {
132       ref_ = static_cast<uint16_t>(r);
133       ref_map()->erase(this);
134     } else {
135       (*ref_map())[this] = r;
136     }
137     return;
138   }
139   ref_--;
140   if (ref_ == 0)
141     Destroy();
142 }
143 
144 // Deletes this object; ref count has count reached 0.
Destroy()145 void Regexp::Destroy() {
146   if (QuickDestroy())
147     return;
148 
149   // Handle recursive Destroy with explicit stack
150   // to avoid arbitrarily deep recursion on process stack [sigh].
151   down_ = NULL;
152   Regexp* stack = this;
153   while (stack != NULL) {
154     Regexp* re = stack;
155     stack = re->down_;
156     if (re->ref_ != 0)
157       LOG(DFATAL) << "Bad reference count " << re->ref_;
158     if (re->nsub_ > 0) {
159       Regexp** subs = re->sub();
160       for (int i = 0; i < re->nsub_; i++) {
161         Regexp* sub = subs[i];
162         if (sub == NULL)
163           continue;
164         if (sub->ref_ == kMaxRef)
165           sub->Decref();
166         else
167           --sub->ref_;
168         if (sub->ref_ == 0 && !sub->QuickDestroy()) {
169           sub->down_ = stack;
170           stack = sub;
171         }
172       }
173       if (re->nsub_ > 1)
174         delete[] subs;
175       re->nsub_ = 0;
176     }
177     delete re;
178   }
179 }
180 
AddRuneToString(Rune r)181 void Regexp::AddRuneToString(Rune r) {
182   DCHECK(op_ == kRegexpLiteralString);
183   if (nrunes_ == 0) {
184     // start with 8
185     runes_ = new Rune[8];
186   } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
187     // double on powers of two
188     Rune *old = runes_;
189     runes_ = new Rune[nrunes_ * 2];
190     for (int i = 0; i < nrunes_; i++)
191       runes_[i] = old[i];
192     delete[] old;
193   }
194 
195   runes_[nrunes_++] = r;
196 }
197 
HaveMatch(int match_id,ParseFlags flags)198 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
199   Regexp* re = new Regexp(kRegexpHaveMatch, flags);
200   re->match_id_ = match_id;
201   return re;
202 }
203 
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)204 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
205   // Squash **, ++ and ??.
206   if (op == sub->op() && flags == sub->parse_flags())
207     return sub;
208 
209   // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
210   // op is Star/Plus/Quest, we just have to check that sub->op() is too.
211   if ((sub->op() == kRegexpStar ||
212        sub->op() == kRegexpPlus ||
213        sub->op() == kRegexpQuest) &&
214       flags == sub->parse_flags()) {
215     // If sub is Star, no need to rewrite it.
216     if (sub->op() == kRegexpStar)
217       return sub;
218 
219     // Rewrite sub to Star.
220     Regexp* re = new Regexp(kRegexpStar, flags);
221     re->AllocSub(1);
222     re->sub()[0] = sub->sub()[0]->Incref();
223     sub->Decref();  // We didn't consume the reference after all.
224     return re;
225   }
226 
227   Regexp* re = new Regexp(op, flags);
228   re->AllocSub(1);
229   re->sub()[0] = sub;
230   return re;
231 }
232 
Plus(Regexp * sub,ParseFlags flags)233 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
234   return StarPlusOrQuest(kRegexpPlus, sub, flags);
235 }
236 
Star(Regexp * sub,ParseFlags flags)237 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
238   return StarPlusOrQuest(kRegexpStar, sub, flags);
239 }
240 
Quest(Regexp * sub,ParseFlags flags)241 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
242   return StarPlusOrQuest(kRegexpQuest, sub, flags);
243 }
244 
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)245 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
246                                   ParseFlags flags, bool can_factor) {
247   if (nsub == 1)
248     return sub[0];
249 
250   if (nsub == 0) {
251     if (op == kRegexpAlternate)
252       return new Regexp(kRegexpNoMatch, flags);
253     else
254       return new Regexp(kRegexpEmptyMatch, flags);
255   }
256 
257   PODArray<Regexp*> subcopy;
258   if (op == kRegexpAlternate && can_factor) {
259     // Going to edit sub; make a copy so we don't step on caller.
260     subcopy = PODArray<Regexp*>(nsub);
261     memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
262     sub = subcopy.data();
263     nsub = FactorAlternation(sub, nsub, flags);
264     if (nsub == 1) {
265       Regexp* re = sub[0];
266       return re;
267     }
268   }
269 
270   if (nsub > kMaxNsub) {
271     // Too many subexpressions to fit in a single Regexp.
272     // Make a two-level tree.  Two levels gets us to 65535^2.
273     int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
274     Regexp* re = new Regexp(op, flags);
275     re->AllocSub(nbigsub);
276     Regexp** subs = re->sub();
277     for (int i = 0; i < nbigsub - 1; i++)
278       subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
279     subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
280                                           nsub - (nbigsub-1)*kMaxNsub, flags,
281                                           false);
282     return re;
283   }
284 
285   Regexp* re = new Regexp(op, flags);
286   re->AllocSub(nsub);
287   Regexp** subs = re->sub();
288   for (int i = 0; i < nsub; i++)
289     subs[i] = sub[i];
290   return re;
291 }
292 
Concat(Regexp ** sub,int nsub,ParseFlags flags)293 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
294   return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
295 }
296 
Alternate(Regexp ** sub,int nsub,ParseFlags flags)297 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
298   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
299 }
300 
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)301 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
302   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
303 }
304 
Capture(Regexp * sub,ParseFlags flags,int cap)305 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
306   Regexp* re = new Regexp(kRegexpCapture, flags);
307   re->AllocSub(1);
308   re->sub()[0] = sub;
309   re->cap_ = cap;
310   return re;
311 }
312 
Repeat(Regexp * sub,ParseFlags flags,int min,int max)313 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
314   Regexp* re = new Regexp(kRegexpRepeat, flags);
315   re->AllocSub(1);
316   re->sub()[0] = sub;
317   re->min_ = min;
318   re->max_ = max;
319   return re;
320 }
321 
NewLiteral(Rune rune,ParseFlags flags)322 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
323   Regexp* re = new Regexp(kRegexpLiteral, flags);
324   re->rune_ = rune;
325   return re;
326 }
327 
LiteralString(Rune * runes,int nrunes,ParseFlags flags)328 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
329   if (nrunes <= 0)
330     return new Regexp(kRegexpEmptyMatch, flags);
331   if (nrunes == 1)
332     return NewLiteral(runes[0], flags);
333   Regexp* re = new Regexp(kRegexpLiteralString, flags);
334   for (int i = 0; i < nrunes; i++)
335     re->AddRuneToString(runes[i]);
336   return re;
337 }
338 
NewCharClass(CharClass * cc,ParseFlags flags)339 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
340   Regexp* re = new Regexp(kRegexpCharClass, flags);
341   re->cc_ = cc;
342   return re;
343 }
344 
Swap(Regexp * that)345 void Regexp::Swap(Regexp* that) {
346   // Regexp is not trivially copyable, so we cannot freely copy it with
347   // memmove(3), but swapping objects like so is safe for our purposes.
348   char tmp[sizeof *this];
349   void* vthis = reinterpret_cast<void*>(this);
350   void* vthat = reinterpret_cast<void*>(that);
351   memmove(tmp, vthis, sizeof *this);
352   memmove(vthis, vthat, sizeof *this);
353   memmove(vthat, tmp, sizeof *this);
354 }
355 
356 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)357 static bool TopEqual(Regexp* a, Regexp* b) {
358   if (a->op() != b->op())
359     return false;
360 
361   switch (a->op()) {
362     case kRegexpNoMatch:
363     case kRegexpEmptyMatch:
364     case kRegexpAnyChar:
365     case kRegexpAnyByte:
366     case kRegexpBeginLine:
367     case kRegexpEndLine:
368     case kRegexpWordBoundary:
369     case kRegexpNoWordBoundary:
370     case kRegexpBeginText:
371       return true;
372 
373     case kRegexpEndText:
374       // The parse flags remember whether it's \z or (?-m:$),
375       // which matters when testing against PCRE.
376       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
377 
378     case kRegexpLiteral:
379       return a->rune() == b->rune() &&
380              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
381 
382     case kRegexpLiteralString:
383       return a->nrunes() == b->nrunes() &&
384              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
385              memcmp(a->runes(), b->runes(),
386                     a->nrunes() * sizeof a->runes()[0]) == 0;
387 
388     case kRegexpAlternate:
389     case kRegexpConcat:
390       return a->nsub() == b->nsub();
391 
392     case kRegexpStar:
393     case kRegexpPlus:
394     case kRegexpQuest:
395       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
396 
397     case kRegexpRepeat:
398       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
399              a->min() == b->min() &&
400              a->max() == b->max();
401 
402     case kRegexpCapture:
403       if (a->name() == NULL || b->name() == NULL) {
404         // One pointer is null, so the other pointer should also be null.
405         return a->cap() == b->cap() && a->name() == b->name();
406       } else {
407         // Neither pointer is null, so compare the pointees for equality.
408         return a->cap() == b->cap() && *a->name() == *b->name();
409       }
410 
411     case kRegexpHaveMatch:
412       return a->match_id() == b->match_id();
413 
414     case kRegexpCharClass: {
415       CharClass* acc = a->cc();
416       CharClass* bcc = b->cc();
417       return acc->size() == bcc->size() &&
418              acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
419              memcmp(acc->begin(), bcc->begin(),
420                     (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
421     }
422   }
423 
424   LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
425   return 0;
426 }
427 
Equal(Regexp * a,Regexp * b)428 bool Regexp::Equal(Regexp* a, Regexp* b) {
429   if (a == NULL || b == NULL)
430     return a == b;
431 
432   if (!TopEqual(a, b))
433     return false;
434 
435   // Fast path:
436   // return without allocating vector if there are no subregexps.
437   switch (a->op()) {
438     case kRegexpAlternate:
439     case kRegexpConcat:
440     case kRegexpStar:
441     case kRegexpPlus:
442     case kRegexpQuest:
443     case kRegexpRepeat:
444     case kRegexpCapture:
445       break;
446 
447     default:
448       return true;
449   }
450 
451   // Committed to doing real work.
452   // The stack (vector) has pairs of regexps waiting to
453   // be compared.  The regexps are only equal if
454   // all the pairs end up being equal.
455   std::vector<Regexp*> stk;
456 
457   for (;;) {
458     // Invariant: TopEqual(a, b) == true.
459     Regexp* a2;
460     Regexp* b2;
461     switch (a->op()) {
462       default:
463         break;
464       case kRegexpAlternate:
465       case kRegexpConcat:
466         for (int i = 0; i < a->nsub(); i++) {
467           a2 = a->sub()[i];
468           b2 = b->sub()[i];
469           if (!TopEqual(a2, b2))
470             return false;
471           stk.push_back(a2);
472           stk.push_back(b2);
473         }
474         break;
475 
476       case kRegexpStar:
477       case kRegexpPlus:
478       case kRegexpQuest:
479       case kRegexpRepeat:
480       case kRegexpCapture:
481         a2 = a->sub()[0];
482         b2 = b->sub()[0];
483         if (!TopEqual(a2, b2))
484           return false;
485         // Really:
486         //   stk.push_back(a2);
487         //   stk.push_back(b2);
488         //   break;
489         // but faster to assign directly and loop.
490         a = a2;
491         b = b2;
492         continue;
493     }
494 
495     size_t n = stk.size();
496     if (n == 0)
497       break;
498 
499     DCHECK_GE(n, 2);
500     a = stk[n-2];
501     b = stk[n-1];
502     stk.resize(n-2);
503   }
504 
505   return true;
506 }
507 
// Human-readable text for each status code, indexed by the code's
// numeric value.  Keep in sync with enum RegexpStatusCode in regexp.h.
// The table is never modified, so the pointers themselves are const too.
static const char* const kErrorStrings[] = {
  "no error",
  "unexpected error",
  "invalid escape sequence",
  "invalid character class",
  "invalid character class range",
  "missing ]",
  "missing )",
  "unexpected )",
  "trailing \\",
  "no argument for repetition operator",
  "invalid repetition size",
  "bad repetition operator",
  "invalid perl operator",
  "invalid UTF-8",
  "invalid named capture group",
};
526 
CodeText(enum RegexpStatusCode code)527 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
528   if (code < 0 || code >= ABSL_ARRAYSIZE(kErrorStrings))
529     code = kRegexpInternalError;
530   return kErrorStrings[code];
531 }
532 
Text() const533 std::string RegexpStatus::Text() const {
534   if (error_arg_.empty())
535     return CodeText(code_);
536   std::string s;
537   s.append(CodeText(code_));
538   s.append(": ");
539   s.append(error_arg_.data(), error_arg_.size());
540   return s;
541 }
542 
Copy(const RegexpStatus & status)543 void RegexpStatus::Copy(const RegexpStatus& status) {
544   code_ = status.code_;
545   error_arg_ = status.error_arg_;
546 }
547 
// Placeholder result type for walkers that produce no per-node value
// (Walker<void> doesn't exist).
using Ignored = int;
549 
550 // Walker subclass to count capturing parens in regexp.
551 class NumCapturesWalker : public Regexp::Walker<Ignored> {
552  public:
NumCapturesWalker()553   NumCapturesWalker() : ncapture_(0) {}
ncapture()554   int ncapture() { return ncapture_; }
555 
PreVisit(Regexp * re,Ignored ignored,bool * stop)556   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
557     if (re->op() == kRegexpCapture)
558       ncapture_++;
559     return ignored;
560   }
561 
ShortVisit(Regexp * re,Ignored ignored)562   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
563     // Should never be called: we use Walk(), not WalkExponential().
564 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
565     LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
566 #endif
567     return ignored;
568   }
569 
570  private:
571   int ncapture_;
572 
573   NumCapturesWalker(const NumCapturesWalker&) = delete;
574   NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
575 };
576 
NumCaptures()577 int Regexp::NumCaptures() {
578   NumCapturesWalker w;
579   w.Walk(this, 0);
580   return w.ncapture();
581 }
582 
583 // Walker class to build map of named capture groups and their indices.
584 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
585  public:
NamedCapturesWalker()586   NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()587   ~NamedCapturesWalker() { delete map_; }
588 
TakeMap()589   std::map<std::string, int>* TakeMap() {
590     std::map<std::string, int>* m = map_;
591     map_ = NULL;
592     return m;
593   }
594 
PreVisit(Regexp * re,Ignored ignored,bool * stop)595   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
596     if (re->op() == kRegexpCapture && re->name() != NULL) {
597       // Allocate map once we find a name.
598       if (map_ == NULL)
599         map_ = new std::map<std::string, int>;
600 
601       // Record first occurrence of each name.
602       // (The rule is that if you have the same name
603       // multiple times, only the leftmost one counts.)
604       map_->insert({*re->name(), re->cap()});
605     }
606     return ignored;
607   }
608 
ShortVisit(Regexp * re,Ignored ignored)609   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
610     // Should never be called: we use Walk(), not WalkExponential().
611 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
612     LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
613 #endif
614     return ignored;
615   }
616 
617  private:
618   std::map<std::string, int>* map_;
619 
620   NamedCapturesWalker(const NamedCapturesWalker&) = delete;
621   NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
622 };
623 
NamedCaptures()624 std::map<std::string, int>* Regexp::NamedCaptures() {
625   NamedCapturesWalker w;
626   w.Walk(this, 0);
627   return w.TakeMap();
628 }
629 
630 // Walker class to build map from capture group indices to their names.
631 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
632  public:
CaptureNamesWalker()633   CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()634   ~CaptureNamesWalker() { delete map_; }
635 
TakeMap()636   std::map<int, std::string>* TakeMap() {
637     std::map<int, std::string>* m = map_;
638     map_ = NULL;
639     return m;
640   }
641 
PreVisit(Regexp * re,Ignored ignored,bool * stop)642   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
643     if (re->op() == kRegexpCapture && re->name() != NULL) {
644       // Allocate map once we find a name.
645       if (map_ == NULL)
646         map_ = new std::map<int, std::string>;
647 
648       (*map_)[re->cap()] = *re->name();
649     }
650     return ignored;
651   }
652 
ShortVisit(Regexp * re,Ignored ignored)653   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
654     // Should never be called: we use Walk(), not WalkExponential().
655 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
656     LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
657 #endif
658     return ignored;
659   }
660 
661  private:
662   std::map<int, std::string>* map_;
663 
664   CaptureNamesWalker(const CaptureNamesWalker&) = delete;
665   CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
666 };
667 
CaptureNames()668 std::map<int, std::string>* Regexp::CaptureNames() {
669   CaptureNamesWalker w;
670   w.Walk(this, 0);
671   return w.TakeMap();
672 }
673 
ConvertRunesToBytes(bool latin1,Rune * runes,int nrunes,std::string * bytes)674 void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
675                          std::string* bytes) {
676   if (latin1) {
677     bytes->resize(nrunes);
678     for (int i = 0; i < nrunes; i++)
679       (*bytes)[i] = static_cast<char>(runes[i]);
680   } else {
681     bytes->resize(nrunes * UTFmax);  // worst case
682     char* p = &(*bytes)[0];
683     for (int i = 0; i < nrunes; i++)
684       p += runetochar(p, &runes[i]);
685     bytes->resize(p - &(*bytes)[0]);
686     bytes->shrink_to_fit();
687   }
688 }
689 
690 // Determines whether regexp matches must be anchored
691 // with a fixed string prefix.  If so, returns the prefix and
692 // the regexp that remains after the prefix.  The prefix might
693 // be ASCII case-insensitive.
RequiredPrefix(std::string * prefix,bool * foldcase,Regexp ** suffix)694 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
695                             Regexp** suffix) {
696   prefix->clear();
697   *foldcase = false;
698   *suffix = NULL;
699 
700   // No need for a walker: the regexp must be of the form
701   // 1. some number of ^ anchors
702   // 2. a literal char or string
703   // 3. the rest
704   if (op_ != kRegexpConcat)
705     return false;
706   int i = 0;
707   while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
708     i++;
709   if (i == 0 || i >= nsub_)
710     return false;
711   Regexp* re = sub()[i];
712   if (re->op_ != kRegexpLiteral &&
713       re->op_ != kRegexpLiteralString)
714     return false;
715   i++;
716   if (i < nsub_) {
717     for (int j = i; j < nsub_; j++)
718       sub()[j]->Incref();
719     *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
720   } else {
721     *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
722   }
723 
724   bool latin1 = (re->parse_flags() & Latin1) != 0;
725   Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
726   int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
727   ConvertRunesToBytes(latin1, runes, nrunes, prefix);
728   *foldcase = (re->parse_flags() & FoldCase) != 0;
729   return true;
730 }
731 
732 // Determines whether regexp matches must be unanchored
733 // with a fixed string prefix.  If so, returns the prefix.
734 // The prefix might be ASCII case-insensitive.
RequiredPrefixForAccel(std::string * prefix,bool * foldcase)735 bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
736   prefix->clear();
737   *foldcase = false;
738 
739   // No need for a walker: the regexp must either begin with or be
740   // a literal char or string. We "see through" capturing groups,
741   // but make no effort to glue multiple prefix fragments together.
742   Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
743   while (re->op_ == kRegexpCapture) {
744     re = re->sub()[0];
745     if (re->op_ == kRegexpConcat && re->nsub_ > 0)
746       re = re->sub()[0];
747   }
748   if (re->op_ != kRegexpLiteral &&
749       re->op_ != kRegexpLiteralString)
750     return false;
751 
752   bool latin1 = (re->parse_flags() & Latin1) != 0;
753   Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
754   int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
755   ConvertRunesToBytes(latin1, runes, nrunes, prefix);
756   *foldcase = (re->parse_flags() & FoldCase) != 0;
757   return true;
758 }
759 
760 // Character class builder is a balanced binary tree (STL set)
761 // containing non-overlapping, non-abutting RuneRanges.
762 // The less-than operator used in the tree treats two
763 // ranges as equal if they overlap at all, so that
764 // lookups for a particular Rune are possible.
765 
CharClassBuilder()766 CharClassBuilder::CharClassBuilder() {
767   nrunes_ = 0;
768   upper_ = 0;
769   lower_ = 0;
770 }
771 
772 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)773 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
774   if (hi < lo)
775     return false;
776 
777   if (lo <= 'z' && hi >= 'A') {
778     // Overlaps some alpha, maybe not all.
779     // Update bitmaps telling which ASCII letters are in the set.
780     Rune lo1 = std::max<Rune>(lo, 'A');
781     Rune hi1 = std::min<Rune>(hi, 'Z');
782     if (lo1 <= hi1)
783       upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
784 
785     lo1 = std::max<Rune>(lo, 'a');
786     hi1 = std::min<Rune>(hi, 'z');
787     if (lo1 <= hi1)
788       lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
789   }
790 
791   {  // Check whether lo, hi is already in the class.
792     iterator it = ranges_.find(RuneRange(lo, lo));
793     if (it != end() && it->lo <= lo && hi <= it->hi)
794       return false;
795   }
796 
797   // Look for a range abutting lo on the left.
798   // If it exists, take it out and increase our range.
799   if (lo > 0) {
800     iterator it = ranges_.find(RuneRange(lo-1, lo-1));
801     if (it != end()) {
802       lo = it->lo;
803       if (it->hi > hi)
804         hi = it->hi;
805       nrunes_ -= it->hi - it->lo + 1;
806       ranges_.erase(it);
807     }
808   }
809 
810   // Look for a range abutting hi on the right.
811   // If it exists, take it out and increase our range.
812   if (hi < Runemax) {
813     iterator it = ranges_.find(RuneRange(hi+1, hi+1));
814     if (it != end()) {
815       hi = it->hi;
816       nrunes_ -= it->hi - it->lo + 1;
817       ranges_.erase(it);
818     }
819   }
820 
821   // Look for ranges between lo and hi.  Take them out.
822   // This is only safe because the set has no overlapping ranges.
823   // We've already removed any ranges abutting lo and hi, so
824   // any that overlap [lo, hi] must be contained within it.
825   for (;;) {
826     iterator it = ranges_.find(RuneRange(lo, hi));
827     if (it == end())
828       break;
829     nrunes_ -= it->hi - it->lo + 1;
830     ranges_.erase(it);
831   }
832 
833   // Finally, add [lo, hi].
834   nrunes_ += hi - lo + 1;
835   ranges_.insert(RuneRange(lo, hi));
836   return true;
837 }
838 
AddCharClass(CharClassBuilder * cc)839 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
840   for (iterator it = cc->begin(); it != cc->end(); ++it)
841     AddRange(it->lo, it->hi);
842 }
843 
Contains(Rune r)844 bool CharClassBuilder::Contains(Rune r) {
845   return ranges_.find(RuneRange(r, r)) != end();
846 }
847 
848 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()849 bool CharClassBuilder::FoldsASCII() {
850   return ((upper_ ^ lower_) & AlphaMask) == 0;
851 }
852 
Copy()853 CharClassBuilder* CharClassBuilder::Copy() {
854   CharClassBuilder* cc = new CharClassBuilder;
855   for (iterator it = begin(); it != end(); ++it)
856     cc->ranges_.insert(RuneRange(it->lo, it->hi));
857   cc->upper_ = upper_;
858   cc->lower_ = lower_;
859   cc->nrunes_ = nrunes_;
860   return cc;
861 }
862 
863 
864 
RemoveAbove(Rune r)865 void CharClassBuilder::RemoveAbove(Rune r) {
866   if (r >= Runemax)
867     return;
868 
869   if (r < 'z') {
870     if (r < 'a')
871       lower_ = 0;
872     else
873       lower_ &= AlphaMask >> ('z' - r);
874   }
875 
876   if (r < 'Z') {
877     if (r < 'A')
878       upper_ = 0;
879     else
880       upper_ &= AlphaMask >> ('Z' - r);
881   }
882 
883   for (;;) {
884 
885     iterator it = ranges_.find(RuneRange(r + 1, Runemax));
886     if (it == end())
887       break;
888     RuneRange rr = *it;
889     ranges_.erase(it);
890     nrunes_ -= rr.hi - rr.lo + 1;
891     if (rr.lo <= r) {
892       rr.hi = r;
893       ranges_.insert(rr);
894       nrunes_ += rr.hi - rr.lo + 1;
895     }
896   }
897 }
898 
Negate()899 void CharClassBuilder::Negate() {
900   // Build up negation and then copy in.
901   // Could edit ranges in place, but C++ won't let me.
902   std::vector<RuneRange> v;
903   v.reserve(ranges_.size() + 1);
904 
905   // In negation, first range begins at 0, unless
906   // the current class begins at 0.
907   iterator it = begin();
908   if (it == end()) {
909     v.push_back(RuneRange(0, Runemax));
910   } else {
911     int nextlo = 0;
912     if (it->lo == 0) {
913       nextlo = it->hi + 1;
914       ++it;
915     }
916     for (; it != end(); ++it) {
917       v.push_back(RuneRange(nextlo, it->lo - 1));
918       nextlo = it->hi + 1;
919     }
920     if (nextlo <= Runemax)
921       v.push_back(RuneRange(nextlo, Runemax));
922   }
923 
924   ranges_.clear();
925   for (size_t i = 0; i < v.size(); i++)
926     ranges_.insert(v[i]);
927 
928   upper_ = AlphaMask & ~upper_;
929   lower_ = AlphaMask & ~lower_;
930   nrunes_ = Runemax+1 - nrunes_;
931 }
932 
933 // Character class is a sorted list of ranges.
934 // The ranges are allocated in the same block as the header,
935 // necessitating a special allocator and Delete method.
936 
New(size_t maxranges)937 CharClass* CharClass::New(size_t maxranges) {
938   CharClass* cc;
939   uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
940   cc = reinterpret_cast<CharClass*>(data);
941   cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
942   cc->nranges_ = 0;
943   cc->folds_ascii_ = false;
944   cc->nrunes_ = 0;
945   return cc;
946 }
947 
Delete()948 void CharClass::Delete() {
949   uint8_t* data = reinterpret_cast<uint8_t*>(this);
950   delete[] data;
951 }
952 
Negate()953 CharClass* CharClass::Negate() {
954   CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
955   cc->folds_ascii_ = folds_ascii_;
956   cc->nrunes_ = Runemax + 1 - nrunes_;
957   int n = 0;
958   int nextlo = 0;
959   for (CharClass::iterator it = begin(); it != end(); ++it) {
960     if (it->lo == nextlo) {
961       nextlo = it->hi + 1;
962     } else {
963       cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
964       nextlo = it->hi + 1;
965     }
966   }
967   if (nextlo <= Runemax)
968     cc->ranges_[n++] = RuneRange(nextlo, Runemax);
969   cc->nranges_ = n;
970   return cc;
971 }
972 
Contains(Rune r) const973 bool CharClass::Contains(Rune r) const {
974   RuneRange* rr = ranges_;
975   int n = nranges_;
976   while (n > 0) {
977     int m = n/2;
978     if (rr[m].hi < r) {
979       rr += m+1;
980       n -= m+1;
981     } else if (r < rr[m].lo) {
982       n = m;
983     } else {  // rr[m].lo <= r && r <= rr[m].hi
984       return true;
985     }
986   }
987   return false;
988 }
989 
GetCharClass()990 CharClass* CharClassBuilder::GetCharClass() {
991   CharClass* cc = CharClass::New(ranges_.size());
992   int n = 0;
993   for (iterator it = begin(); it != end(); ++it)
994     cc->ranges_[n++] = *it;
995   cc->nranges_ = n;
996   DCHECK_LE(n, static_cast<int>(ranges_.size()));
997   cc->nrunes_ = nrunes_;
998   cc->folds_ascii_ = FoldsASCII();
999   return cc;
1000 }
1001 
1002 }  // namespace re2
1003