1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression representation.
6 // Tested by parse_test.cc
7
8 #include "re2/regexp.h"
9
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13 #include <algorithm>
14 #include <map>
15 #include <string>
16 #include <vector>
17
18 #include "absl/base/call_once.h"
19 #include "absl/base/macros.h"
20 #include "absl/container/flat_hash_map.h"
21 #include "absl/synchronization/mutex.h"
22 #include "util/logging.h"
23 #include "util/utf.h"
24 #include "re2/pod_array.h"
25 #include "re2/walker-inl.h"
26
27 namespace re2 {
28
29 // Constructor. Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)30 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
31 : op_(static_cast<uint8_t>(op)),
32 simple_(false),
33 parse_flags_(static_cast<uint16_t>(parse_flags)),
34 ref_(1),
35 nsub_(0),
36 down_(NULL) {
37 subone_ = NULL;
38 memset(the_union_, 0, sizeof the_union_);
39 }
40
41 // Destructor. Assumes already cleaned up children.
42 // Private: use Decref() instead of delete to destroy Regexps.
43 // Can't call Decref on the sub-Regexps here because
44 // that could cause arbitrarily deep recursion, so
45 // required Decref() to have handled them for us.
~Regexp()46 Regexp::~Regexp() {
47 if (nsub_ > 0)
48 LOG(DFATAL) << "Regexp not destroyed.";
49
50 switch (op_) {
51 default:
52 break;
53 case kRegexpCapture:
54 delete name_;
55 break;
56 case kRegexpLiteralString:
57 delete[] runes_;
58 break;
59 case kRegexpCharClass:
60 if (cc_)
61 cc_->Delete();
62 delete ccb_;
63 break;
64 }
65 }
66
67 // If it's possible to destroy this regexp without recurring,
68 // do so and return true. Else return false.
QuickDestroy()69 bool Regexp::QuickDestroy() {
70 if (nsub_ == 0) {
71 delete this;
72 return true;
73 }
74 return false;
75 }
76
77 // Similar to EmptyStorage in re2.cc.
78 struct RefStorage {
79 absl::Mutex ref_mutex;
80 absl::flat_hash_map<Regexp*, int> ref_map;
81 };
82 alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];
83
ref_mutex()84 static inline absl::Mutex* ref_mutex() {
85 return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex;
86 }
87
ref_map()88 static inline absl::flat_hash_map<Regexp*, int>* ref_map() {
89 return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map;
90 }
91
Ref()92 int Regexp::Ref() {
93 if (ref_ < kMaxRef)
94 return ref_;
95
96 absl::MutexLock l(ref_mutex());
97 return (*ref_map())[this];
98 }
99
100 // Increments reference count, returns object as convenience.
Incref()101 Regexp* Regexp::Incref() {
102 if (ref_ >= kMaxRef-1) {
103 static absl::once_flag ref_once;
104 absl::call_once(ref_once, []() {
105 (void) new (ref_storage) RefStorage;
106 });
107
108 // Store ref count in overflow map.
109 absl::MutexLock l(ref_mutex());
110 if (ref_ == kMaxRef) {
111 // already overflowed
112 (*ref_map())[this]++;
113 } else {
114 // overflowing now
115 (*ref_map())[this] = kMaxRef;
116 ref_ = kMaxRef;
117 }
118 return this;
119 }
120
121 ref_++;
122 return this;
123 }
124
125 // Decrements reference count and deletes this object if count reaches 0.
Decref()126 void Regexp::Decref() {
127 if (ref_ == kMaxRef) {
128 // Ref count is stored in overflow map.
129 absl::MutexLock l(ref_mutex());
130 int r = (*ref_map())[this] - 1;
131 if (r < kMaxRef) {
132 ref_ = static_cast<uint16_t>(r);
133 ref_map()->erase(this);
134 } else {
135 (*ref_map())[this] = r;
136 }
137 return;
138 }
139 ref_--;
140 if (ref_ == 0)
141 Destroy();
142 }
143
144 // Deletes this object; ref count has count reached 0.
Destroy()145 void Regexp::Destroy() {
146 if (QuickDestroy())
147 return;
148
149 // Handle recursive Destroy with explicit stack
150 // to avoid arbitrarily deep recursion on process stack [sigh].
151 down_ = NULL;
152 Regexp* stack = this;
153 while (stack != NULL) {
154 Regexp* re = stack;
155 stack = re->down_;
156 if (re->ref_ != 0)
157 LOG(DFATAL) << "Bad reference count " << re->ref_;
158 if (re->nsub_ > 0) {
159 Regexp** subs = re->sub();
160 for (int i = 0; i < re->nsub_; i++) {
161 Regexp* sub = subs[i];
162 if (sub == NULL)
163 continue;
164 if (sub->ref_ == kMaxRef)
165 sub->Decref();
166 else
167 --sub->ref_;
168 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
169 sub->down_ = stack;
170 stack = sub;
171 }
172 }
173 if (re->nsub_ > 1)
174 delete[] subs;
175 re->nsub_ = 0;
176 }
177 delete re;
178 }
179 }
180
AddRuneToString(Rune r)181 void Regexp::AddRuneToString(Rune r) {
182 DCHECK(op_ == kRegexpLiteralString);
183 if (nrunes_ == 0) {
184 // start with 8
185 runes_ = new Rune[8];
186 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
187 // double on powers of two
188 Rune *old = runes_;
189 runes_ = new Rune[nrunes_ * 2];
190 for (int i = 0; i < nrunes_; i++)
191 runes_[i] = old[i];
192 delete[] old;
193 }
194
195 runes_[nrunes_++] = r;
196 }
197
HaveMatch(int match_id,ParseFlags flags)198 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
199 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
200 re->match_id_ = match_id;
201 return re;
202 }
203
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)204 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
205 // Squash **, ++ and ??.
206 if (op == sub->op() && flags == sub->parse_flags())
207 return sub;
208
209 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
210 // op is Star/Plus/Quest, we just have to check that sub->op() is too.
211 if ((sub->op() == kRegexpStar ||
212 sub->op() == kRegexpPlus ||
213 sub->op() == kRegexpQuest) &&
214 flags == sub->parse_flags()) {
215 // If sub is Star, no need to rewrite it.
216 if (sub->op() == kRegexpStar)
217 return sub;
218
219 // Rewrite sub to Star.
220 Regexp* re = new Regexp(kRegexpStar, flags);
221 re->AllocSub(1);
222 re->sub()[0] = sub->sub()[0]->Incref();
223 sub->Decref(); // We didn't consume the reference after all.
224 return re;
225 }
226
227 Regexp* re = new Regexp(op, flags);
228 re->AllocSub(1);
229 re->sub()[0] = sub;
230 return re;
231 }
232
Plus(Regexp * sub,ParseFlags flags)233 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
234 return StarPlusOrQuest(kRegexpPlus, sub, flags);
235 }
236
Star(Regexp * sub,ParseFlags flags)237 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
238 return StarPlusOrQuest(kRegexpStar, sub, flags);
239 }
240
Quest(Regexp * sub,ParseFlags flags)241 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
242 return StarPlusOrQuest(kRegexpQuest, sub, flags);
243 }
244
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)245 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
246 ParseFlags flags, bool can_factor) {
247 if (nsub == 1)
248 return sub[0];
249
250 if (nsub == 0) {
251 if (op == kRegexpAlternate)
252 return new Regexp(kRegexpNoMatch, flags);
253 else
254 return new Regexp(kRegexpEmptyMatch, flags);
255 }
256
257 PODArray<Regexp*> subcopy;
258 if (op == kRegexpAlternate && can_factor) {
259 // Going to edit sub; make a copy so we don't step on caller.
260 subcopy = PODArray<Regexp*>(nsub);
261 memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
262 sub = subcopy.data();
263 nsub = FactorAlternation(sub, nsub, flags);
264 if (nsub == 1) {
265 Regexp* re = sub[0];
266 return re;
267 }
268 }
269
270 if (nsub > kMaxNsub) {
271 // Too many subexpressions to fit in a single Regexp.
272 // Make a two-level tree. Two levels gets us to 65535^2.
273 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
274 Regexp* re = new Regexp(op, flags);
275 re->AllocSub(nbigsub);
276 Regexp** subs = re->sub();
277 for (int i = 0; i < nbigsub - 1; i++)
278 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
279 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
280 nsub - (nbigsub-1)*kMaxNsub, flags,
281 false);
282 return re;
283 }
284
285 Regexp* re = new Regexp(op, flags);
286 re->AllocSub(nsub);
287 Regexp** subs = re->sub();
288 for (int i = 0; i < nsub; i++)
289 subs[i] = sub[i];
290 return re;
291 }
292
Concat(Regexp ** sub,int nsub,ParseFlags flags)293 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
294 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
295 }
296
Alternate(Regexp ** sub,int nsub,ParseFlags flags)297 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
298 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
299 }
300
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)301 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
302 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
303 }
304
Capture(Regexp * sub,ParseFlags flags,int cap)305 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
306 Regexp* re = new Regexp(kRegexpCapture, flags);
307 re->AllocSub(1);
308 re->sub()[0] = sub;
309 re->cap_ = cap;
310 return re;
311 }
312
Repeat(Regexp * sub,ParseFlags flags,int min,int max)313 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
314 Regexp* re = new Regexp(kRegexpRepeat, flags);
315 re->AllocSub(1);
316 re->sub()[0] = sub;
317 re->min_ = min;
318 re->max_ = max;
319 return re;
320 }
321
NewLiteral(Rune rune,ParseFlags flags)322 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
323 Regexp* re = new Regexp(kRegexpLiteral, flags);
324 re->rune_ = rune;
325 return re;
326 }
327
LiteralString(Rune * runes,int nrunes,ParseFlags flags)328 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
329 if (nrunes <= 0)
330 return new Regexp(kRegexpEmptyMatch, flags);
331 if (nrunes == 1)
332 return NewLiteral(runes[0], flags);
333 Regexp* re = new Regexp(kRegexpLiteralString, flags);
334 for (int i = 0; i < nrunes; i++)
335 re->AddRuneToString(runes[i]);
336 return re;
337 }
338
NewCharClass(CharClass * cc,ParseFlags flags)339 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
340 Regexp* re = new Regexp(kRegexpCharClass, flags);
341 re->cc_ = cc;
342 return re;
343 }
344
Swap(Regexp * that)345 void Regexp::Swap(Regexp* that) {
346 // Regexp is not trivially copyable, so we cannot freely copy it with
347 // memmove(3), but swapping objects like so is safe for our purposes.
348 char tmp[sizeof *this];
349 void* vthis = reinterpret_cast<void*>(this);
350 void* vthat = reinterpret_cast<void*>(that);
351 memmove(tmp, vthis, sizeof *this);
352 memmove(vthis, vthat, sizeof *this);
353 memmove(vthat, tmp, sizeof *this);
354 }
355
356 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)357 static bool TopEqual(Regexp* a, Regexp* b) {
358 if (a->op() != b->op())
359 return false;
360
361 switch (a->op()) {
362 case kRegexpNoMatch:
363 case kRegexpEmptyMatch:
364 case kRegexpAnyChar:
365 case kRegexpAnyByte:
366 case kRegexpBeginLine:
367 case kRegexpEndLine:
368 case kRegexpWordBoundary:
369 case kRegexpNoWordBoundary:
370 case kRegexpBeginText:
371 return true;
372
373 case kRegexpEndText:
374 // The parse flags remember whether it's \z or (?-m:$),
375 // which matters when testing against PCRE.
376 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
377
378 case kRegexpLiteral:
379 return a->rune() == b->rune() &&
380 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
381
382 case kRegexpLiteralString:
383 return a->nrunes() == b->nrunes() &&
384 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
385 memcmp(a->runes(), b->runes(),
386 a->nrunes() * sizeof a->runes()[0]) == 0;
387
388 case kRegexpAlternate:
389 case kRegexpConcat:
390 return a->nsub() == b->nsub();
391
392 case kRegexpStar:
393 case kRegexpPlus:
394 case kRegexpQuest:
395 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
396
397 case kRegexpRepeat:
398 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
399 a->min() == b->min() &&
400 a->max() == b->max();
401
402 case kRegexpCapture:
403 if (a->name() == NULL || b->name() == NULL) {
404 // One pointer is null, so the other pointer should also be null.
405 return a->cap() == b->cap() && a->name() == b->name();
406 } else {
407 // Neither pointer is null, so compare the pointees for equality.
408 return a->cap() == b->cap() && *a->name() == *b->name();
409 }
410
411 case kRegexpHaveMatch:
412 return a->match_id() == b->match_id();
413
414 case kRegexpCharClass: {
415 CharClass* acc = a->cc();
416 CharClass* bcc = b->cc();
417 return acc->size() == bcc->size() &&
418 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
419 memcmp(acc->begin(), bcc->begin(),
420 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
421 }
422 }
423
424 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
425 return 0;
426 }
427
Equal(Regexp * a,Regexp * b)428 bool Regexp::Equal(Regexp* a, Regexp* b) {
429 if (a == NULL || b == NULL)
430 return a == b;
431
432 if (!TopEqual(a, b))
433 return false;
434
435 // Fast path:
436 // return without allocating vector if there are no subregexps.
437 switch (a->op()) {
438 case kRegexpAlternate:
439 case kRegexpConcat:
440 case kRegexpStar:
441 case kRegexpPlus:
442 case kRegexpQuest:
443 case kRegexpRepeat:
444 case kRegexpCapture:
445 break;
446
447 default:
448 return true;
449 }
450
451 // Committed to doing real work.
452 // The stack (vector) has pairs of regexps waiting to
453 // be compared. The regexps are only equal if
454 // all the pairs end up being equal.
455 std::vector<Regexp*> stk;
456
457 for (;;) {
458 // Invariant: TopEqual(a, b) == true.
459 Regexp* a2;
460 Regexp* b2;
461 switch (a->op()) {
462 default:
463 break;
464 case kRegexpAlternate:
465 case kRegexpConcat:
466 for (int i = 0; i < a->nsub(); i++) {
467 a2 = a->sub()[i];
468 b2 = b->sub()[i];
469 if (!TopEqual(a2, b2))
470 return false;
471 stk.push_back(a2);
472 stk.push_back(b2);
473 }
474 break;
475
476 case kRegexpStar:
477 case kRegexpPlus:
478 case kRegexpQuest:
479 case kRegexpRepeat:
480 case kRegexpCapture:
481 a2 = a->sub()[0];
482 b2 = b->sub()[0];
483 if (!TopEqual(a2, b2))
484 return false;
485 // Really:
486 // stk.push_back(a2);
487 // stk.push_back(b2);
488 // break;
489 // but faster to assign directly and loop.
490 a = a2;
491 b = b2;
492 continue;
493 }
494
495 size_t n = stk.size();
496 if (n == 0)
497 break;
498
499 DCHECK_GE(n, 2);
500 a = stk[n-2];
501 b = stk[n-1];
502 stk.resize(n-2);
503 }
504
505 return true;
506 }
507
// Human-readable text for each status code, indexed by the numeric
// value of the code.
// Keep in sync with enum RegexpStatusCode in regexp.h
// Both the strings and the table itself are immutable.
static const char* const kErrorStrings[] = {
  "no error",
  "unexpected error",
  "invalid escape sequence",
  "invalid character class",
  "invalid character class range",
  "missing ]",
  "missing )",
  "unexpected )",
  "trailing \\",
  "no argument for repetition operator",
  "invalid repetition size",
  "bad repetition operator",
  "invalid perl operator",
  "invalid UTF-8",
  "invalid named capture group",
};
526
CodeText(enum RegexpStatusCode code)527 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
528 if (code < 0 || code >= ABSL_ARRAYSIZE(kErrorStrings))
529 code = kRegexpInternalError;
530 return kErrorStrings[code];
531 }
532
Text() const533 std::string RegexpStatus::Text() const {
534 if (error_arg_.empty())
535 return CodeText(code_);
536 std::string s;
537 s.append(CodeText(code_));
538 s.append(": ");
539 s.append(error_arg_.data(), error_arg_.size());
540 return s;
541 }
542
Copy(const RegexpStatus & status)543 void RegexpStatus::Copy(const RegexpStatus& status) {
544 code_ = status.code_;
545 error_arg_ = status.error_arg_;
546 }
547
548 typedef int Ignored; // Walker<void> doesn't exist
549
550 // Walker subclass to count capturing parens in regexp.
551 class NumCapturesWalker : public Regexp::Walker<Ignored> {
552 public:
NumCapturesWalker()553 NumCapturesWalker() : ncapture_(0) {}
ncapture()554 int ncapture() { return ncapture_; }
555
PreVisit(Regexp * re,Ignored ignored,bool * stop)556 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
557 if (re->op() == kRegexpCapture)
558 ncapture_++;
559 return ignored;
560 }
561
ShortVisit(Regexp * re,Ignored ignored)562 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
563 // Should never be called: we use Walk(), not WalkExponential().
564 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
565 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
566 #endif
567 return ignored;
568 }
569
570 private:
571 int ncapture_;
572
573 NumCapturesWalker(const NumCapturesWalker&) = delete;
574 NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
575 };
576
NumCaptures()577 int Regexp::NumCaptures() {
578 NumCapturesWalker w;
579 w.Walk(this, 0);
580 return w.ncapture();
581 }
582
583 // Walker class to build map of named capture groups and their indices.
584 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
585 public:
NamedCapturesWalker()586 NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()587 ~NamedCapturesWalker() { delete map_; }
588
TakeMap()589 std::map<std::string, int>* TakeMap() {
590 std::map<std::string, int>* m = map_;
591 map_ = NULL;
592 return m;
593 }
594
PreVisit(Regexp * re,Ignored ignored,bool * stop)595 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
596 if (re->op() == kRegexpCapture && re->name() != NULL) {
597 // Allocate map once we find a name.
598 if (map_ == NULL)
599 map_ = new std::map<std::string, int>;
600
601 // Record first occurrence of each name.
602 // (The rule is that if you have the same name
603 // multiple times, only the leftmost one counts.)
604 map_->insert({*re->name(), re->cap()});
605 }
606 return ignored;
607 }
608
ShortVisit(Regexp * re,Ignored ignored)609 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
610 // Should never be called: we use Walk(), not WalkExponential().
611 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
612 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
613 #endif
614 return ignored;
615 }
616
617 private:
618 std::map<std::string, int>* map_;
619
620 NamedCapturesWalker(const NamedCapturesWalker&) = delete;
621 NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
622 };
623
NamedCaptures()624 std::map<std::string, int>* Regexp::NamedCaptures() {
625 NamedCapturesWalker w;
626 w.Walk(this, 0);
627 return w.TakeMap();
628 }
629
630 // Walker class to build map from capture group indices to their names.
631 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
632 public:
CaptureNamesWalker()633 CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()634 ~CaptureNamesWalker() { delete map_; }
635
TakeMap()636 std::map<int, std::string>* TakeMap() {
637 std::map<int, std::string>* m = map_;
638 map_ = NULL;
639 return m;
640 }
641
PreVisit(Regexp * re,Ignored ignored,bool * stop)642 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
643 if (re->op() == kRegexpCapture && re->name() != NULL) {
644 // Allocate map once we find a name.
645 if (map_ == NULL)
646 map_ = new std::map<int, std::string>;
647
648 (*map_)[re->cap()] = *re->name();
649 }
650 return ignored;
651 }
652
ShortVisit(Regexp * re,Ignored ignored)653 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
654 // Should never be called: we use Walk(), not WalkExponential().
655 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
656 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
657 #endif
658 return ignored;
659 }
660
661 private:
662 std::map<int, std::string>* map_;
663
664 CaptureNamesWalker(const CaptureNamesWalker&) = delete;
665 CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
666 };
667
CaptureNames()668 std::map<int, std::string>* Regexp::CaptureNames() {
669 CaptureNamesWalker w;
670 w.Walk(this, 0);
671 return w.TakeMap();
672 }
673
ConvertRunesToBytes(bool latin1,Rune * runes,int nrunes,std::string * bytes)674 void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
675 std::string* bytes) {
676 if (latin1) {
677 bytes->resize(nrunes);
678 for (int i = 0; i < nrunes; i++)
679 (*bytes)[i] = static_cast<char>(runes[i]);
680 } else {
681 bytes->resize(nrunes * UTFmax); // worst case
682 char* p = &(*bytes)[0];
683 for (int i = 0; i < nrunes; i++)
684 p += runetochar(p, &runes[i]);
685 bytes->resize(p - &(*bytes)[0]);
686 bytes->shrink_to_fit();
687 }
688 }
689
690 // Determines whether regexp matches must be anchored
691 // with a fixed string prefix. If so, returns the prefix and
692 // the regexp that remains after the prefix. The prefix might
693 // be ASCII case-insensitive.
RequiredPrefix(std::string * prefix,bool * foldcase,Regexp ** suffix)694 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
695 Regexp** suffix) {
696 prefix->clear();
697 *foldcase = false;
698 *suffix = NULL;
699
700 // No need for a walker: the regexp must be of the form
701 // 1. some number of ^ anchors
702 // 2. a literal char or string
703 // 3. the rest
704 if (op_ != kRegexpConcat)
705 return false;
706 int i = 0;
707 while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
708 i++;
709 if (i == 0 || i >= nsub_)
710 return false;
711 Regexp* re = sub()[i];
712 if (re->op_ != kRegexpLiteral &&
713 re->op_ != kRegexpLiteralString)
714 return false;
715 i++;
716 if (i < nsub_) {
717 for (int j = i; j < nsub_; j++)
718 sub()[j]->Incref();
719 *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
720 } else {
721 *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
722 }
723
724 bool latin1 = (re->parse_flags() & Latin1) != 0;
725 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
726 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
727 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
728 *foldcase = (re->parse_flags() & FoldCase) != 0;
729 return true;
730 }
731
732 // Determines whether regexp matches must be unanchored
733 // with a fixed string prefix. If so, returns the prefix.
734 // The prefix might be ASCII case-insensitive.
RequiredPrefixForAccel(std::string * prefix,bool * foldcase)735 bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
736 prefix->clear();
737 *foldcase = false;
738
739 // No need for a walker: the regexp must either begin with or be
740 // a literal char or string. We "see through" capturing groups,
741 // but make no effort to glue multiple prefix fragments together.
742 Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
743 while (re->op_ == kRegexpCapture) {
744 re = re->sub()[0];
745 if (re->op_ == kRegexpConcat && re->nsub_ > 0)
746 re = re->sub()[0];
747 }
748 if (re->op_ != kRegexpLiteral &&
749 re->op_ != kRegexpLiteralString)
750 return false;
751
752 bool latin1 = (re->parse_flags() & Latin1) != 0;
753 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
754 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
755 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
756 *foldcase = (re->parse_flags() & FoldCase) != 0;
757 return true;
758 }
759
760 // Character class builder is a balanced binary tree (STL set)
761 // containing non-overlapping, non-abutting RuneRanges.
762 // The less-than operator used in the tree treats two
763 // ranges as equal if they overlap at all, so that
764 // lookups for a particular Rune are possible.
765
CharClassBuilder()766 CharClassBuilder::CharClassBuilder() {
767 nrunes_ = 0;
768 upper_ = 0;
769 lower_ = 0;
770 }
771
772 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)773 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
774 if (hi < lo)
775 return false;
776
777 if (lo <= 'z' && hi >= 'A') {
778 // Overlaps some alpha, maybe not all.
779 // Update bitmaps telling which ASCII letters are in the set.
780 Rune lo1 = std::max<Rune>(lo, 'A');
781 Rune hi1 = std::min<Rune>(hi, 'Z');
782 if (lo1 <= hi1)
783 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
784
785 lo1 = std::max<Rune>(lo, 'a');
786 hi1 = std::min<Rune>(hi, 'z');
787 if (lo1 <= hi1)
788 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
789 }
790
791 { // Check whether lo, hi is already in the class.
792 iterator it = ranges_.find(RuneRange(lo, lo));
793 if (it != end() && it->lo <= lo && hi <= it->hi)
794 return false;
795 }
796
797 // Look for a range abutting lo on the left.
798 // If it exists, take it out and increase our range.
799 if (lo > 0) {
800 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
801 if (it != end()) {
802 lo = it->lo;
803 if (it->hi > hi)
804 hi = it->hi;
805 nrunes_ -= it->hi - it->lo + 1;
806 ranges_.erase(it);
807 }
808 }
809
810 // Look for a range abutting hi on the right.
811 // If it exists, take it out and increase our range.
812 if (hi < Runemax) {
813 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
814 if (it != end()) {
815 hi = it->hi;
816 nrunes_ -= it->hi - it->lo + 1;
817 ranges_.erase(it);
818 }
819 }
820
821 // Look for ranges between lo and hi. Take them out.
822 // This is only safe because the set has no overlapping ranges.
823 // We've already removed any ranges abutting lo and hi, so
824 // any that overlap [lo, hi] must be contained within it.
825 for (;;) {
826 iterator it = ranges_.find(RuneRange(lo, hi));
827 if (it == end())
828 break;
829 nrunes_ -= it->hi - it->lo + 1;
830 ranges_.erase(it);
831 }
832
833 // Finally, add [lo, hi].
834 nrunes_ += hi - lo + 1;
835 ranges_.insert(RuneRange(lo, hi));
836 return true;
837 }
838
AddCharClass(CharClassBuilder * cc)839 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
840 for (iterator it = cc->begin(); it != cc->end(); ++it)
841 AddRange(it->lo, it->hi);
842 }
843
Contains(Rune r)844 bool CharClassBuilder::Contains(Rune r) {
845 return ranges_.find(RuneRange(r, r)) != end();
846 }
847
848 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()849 bool CharClassBuilder::FoldsASCII() {
850 return ((upper_ ^ lower_) & AlphaMask) == 0;
851 }
852
Copy()853 CharClassBuilder* CharClassBuilder::Copy() {
854 CharClassBuilder* cc = new CharClassBuilder;
855 for (iterator it = begin(); it != end(); ++it)
856 cc->ranges_.insert(RuneRange(it->lo, it->hi));
857 cc->upper_ = upper_;
858 cc->lower_ = lower_;
859 cc->nrunes_ = nrunes_;
860 return cc;
861 }
862
863
864
RemoveAbove(Rune r)865 void CharClassBuilder::RemoveAbove(Rune r) {
866 if (r >= Runemax)
867 return;
868
869 if (r < 'z') {
870 if (r < 'a')
871 lower_ = 0;
872 else
873 lower_ &= AlphaMask >> ('z' - r);
874 }
875
876 if (r < 'Z') {
877 if (r < 'A')
878 upper_ = 0;
879 else
880 upper_ &= AlphaMask >> ('Z' - r);
881 }
882
883 for (;;) {
884
885 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
886 if (it == end())
887 break;
888 RuneRange rr = *it;
889 ranges_.erase(it);
890 nrunes_ -= rr.hi - rr.lo + 1;
891 if (rr.lo <= r) {
892 rr.hi = r;
893 ranges_.insert(rr);
894 nrunes_ += rr.hi - rr.lo + 1;
895 }
896 }
897 }
898
Negate()899 void CharClassBuilder::Negate() {
900 // Build up negation and then copy in.
901 // Could edit ranges in place, but C++ won't let me.
902 std::vector<RuneRange> v;
903 v.reserve(ranges_.size() + 1);
904
905 // In negation, first range begins at 0, unless
906 // the current class begins at 0.
907 iterator it = begin();
908 if (it == end()) {
909 v.push_back(RuneRange(0, Runemax));
910 } else {
911 int nextlo = 0;
912 if (it->lo == 0) {
913 nextlo = it->hi + 1;
914 ++it;
915 }
916 for (; it != end(); ++it) {
917 v.push_back(RuneRange(nextlo, it->lo - 1));
918 nextlo = it->hi + 1;
919 }
920 if (nextlo <= Runemax)
921 v.push_back(RuneRange(nextlo, Runemax));
922 }
923
924 ranges_.clear();
925 for (size_t i = 0; i < v.size(); i++)
926 ranges_.insert(v[i]);
927
928 upper_ = AlphaMask & ~upper_;
929 lower_ = AlphaMask & ~lower_;
930 nrunes_ = Runemax+1 - nrunes_;
931 }
932
933 // Character class is a sorted list of ranges.
934 // The ranges are allocated in the same block as the header,
935 // necessitating a special allocator and Delete method.
936
New(size_t maxranges)937 CharClass* CharClass::New(size_t maxranges) {
938 CharClass* cc;
939 uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
940 cc = reinterpret_cast<CharClass*>(data);
941 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
942 cc->nranges_ = 0;
943 cc->folds_ascii_ = false;
944 cc->nrunes_ = 0;
945 return cc;
946 }
947
Delete()948 void CharClass::Delete() {
949 uint8_t* data = reinterpret_cast<uint8_t*>(this);
950 delete[] data;
951 }
952
Negate()953 CharClass* CharClass::Negate() {
954 CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
955 cc->folds_ascii_ = folds_ascii_;
956 cc->nrunes_ = Runemax + 1 - nrunes_;
957 int n = 0;
958 int nextlo = 0;
959 for (CharClass::iterator it = begin(); it != end(); ++it) {
960 if (it->lo == nextlo) {
961 nextlo = it->hi + 1;
962 } else {
963 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
964 nextlo = it->hi + 1;
965 }
966 }
967 if (nextlo <= Runemax)
968 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
969 cc->nranges_ = n;
970 return cc;
971 }
972
Contains(Rune r) const973 bool CharClass::Contains(Rune r) const {
974 RuneRange* rr = ranges_;
975 int n = nranges_;
976 while (n > 0) {
977 int m = n/2;
978 if (rr[m].hi < r) {
979 rr += m+1;
980 n -= m+1;
981 } else if (r < rr[m].lo) {
982 n = m;
983 } else { // rr[m].lo <= r && r <= rr[m].hi
984 return true;
985 }
986 }
987 return false;
988 }
989
GetCharClass()990 CharClass* CharClassBuilder::GetCharClass() {
991 CharClass* cc = CharClass::New(ranges_.size());
992 int n = 0;
993 for (iterator it = begin(); it != end(); ++it)
994 cc->ranges_[n++] = *it;
995 cc->nranges_ = n;
996 DCHECK_LE(n, static_cast<int>(ranges_.size()));
997 cc->nrunes_ = nrunes_;
998 cc->folds_ascii_ = FoldsASCII();
999 return cc;
1000 }
1001
1002 } // namespace re2
1003