xref: /aosp_15_r20/external/regex-re2/re2/simplify.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi // Rewrite POSIX and other features in re
6*ccdc9c3eSSadaf Ebrahimi // to use simple extended regular expression features.
7*ccdc9c3eSSadaf Ebrahimi // Also sort and simplify character classes.
8*ccdc9c3eSSadaf Ebrahimi 
9*ccdc9c3eSSadaf Ebrahimi #include <string>
10*ccdc9c3eSSadaf Ebrahimi 
11*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
12*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
13*ccdc9c3eSSadaf Ebrahimi #include "util/pod_array.h"
14*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
15*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
16*ccdc9c3eSSadaf Ebrahimi #include "re2/walker-inl.h"
17*ccdc9c3eSSadaf Ebrahimi 
18*ccdc9c3eSSadaf Ebrahimi namespace re2 {
19*ccdc9c3eSSadaf Ebrahimi 
20*ccdc9c3eSSadaf Ebrahimi // Parses the regexp src and then simplifies it and sets *dst to the
21*ccdc9c3eSSadaf Ebrahimi // string representation of the simplified form.  Returns true on success.
22*ccdc9c3eSSadaf Ebrahimi // Returns false and sets *error (if error != NULL) on error.
SimplifyRegexp(const StringPiece & src,ParseFlags flags,string * dst,RegexpStatus * status)23*ccdc9c3eSSadaf Ebrahimi bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
24*ccdc9c3eSSadaf Ebrahimi                             string* dst,
25*ccdc9c3eSSadaf Ebrahimi                             RegexpStatus* status) {
26*ccdc9c3eSSadaf Ebrahimi   Regexp* re = Parse(src, flags, status);
27*ccdc9c3eSSadaf Ebrahimi   if (re == NULL)
28*ccdc9c3eSSadaf Ebrahimi     return false;
29*ccdc9c3eSSadaf Ebrahimi   Regexp* sre = re->Simplify();
30*ccdc9c3eSSadaf Ebrahimi   re->Decref();
31*ccdc9c3eSSadaf Ebrahimi   if (sre == NULL) {
32*ccdc9c3eSSadaf Ebrahimi     // Should not happen, since Simplify never fails.
33*ccdc9c3eSSadaf Ebrahimi     LOG(ERROR) << "Simplify failed on " << src;
34*ccdc9c3eSSadaf Ebrahimi     if (status) {
35*ccdc9c3eSSadaf Ebrahimi       status->set_code(kRegexpInternalError);
36*ccdc9c3eSSadaf Ebrahimi       status->set_error_arg(src);
37*ccdc9c3eSSadaf Ebrahimi     }
38*ccdc9c3eSSadaf Ebrahimi     return false;
39*ccdc9c3eSSadaf Ebrahimi   }
40*ccdc9c3eSSadaf Ebrahimi   *dst = sre->ToString();
41*ccdc9c3eSSadaf Ebrahimi   sre->Decref();
42*ccdc9c3eSSadaf Ebrahimi   return true;
43*ccdc9c3eSSadaf Ebrahimi }
44*ccdc9c3eSSadaf Ebrahimi 
45*ccdc9c3eSSadaf Ebrahimi // Assuming the simple_ flags on the children are accurate,
46*ccdc9c3eSSadaf Ebrahimi // is this Regexp* simple?
ComputeSimple()47*ccdc9c3eSSadaf Ebrahimi bool Regexp::ComputeSimple() {
48*ccdc9c3eSSadaf Ebrahimi   Regexp** subs;
49*ccdc9c3eSSadaf Ebrahimi   switch (op_) {
50*ccdc9c3eSSadaf Ebrahimi     case kRegexpNoMatch:
51*ccdc9c3eSSadaf Ebrahimi     case kRegexpEmptyMatch:
52*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteral:
53*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteralString:
54*ccdc9c3eSSadaf Ebrahimi     case kRegexpBeginLine:
55*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndLine:
56*ccdc9c3eSSadaf Ebrahimi     case kRegexpBeginText:
57*ccdc9c3eSSadaf Ebrahimi     case kRegexpWordBoundary:
58*ccdc9c3eSSadaf Ebrahimi     case kRegexpNoWordBoundary:
59*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndText:
60*ccdc9c3eSSadaf Ebrahimi     case kRegexpAnyChar:
61*ccdc9c3eSSadaf Ebrahimi     case kRegexpAnyByte:
62*ccdc9c3eSSadaf Ebrahimi     case kRegexpHaveMatch:
63*ccdc9c3eSSadaf Ebrahimi       return true;
64*ccdc9c3eSSadaf Ebrahimi     case kRegexpConcat:
65*ccdc9c3eSSadaf Ebrahimi     case kRegexpAlternate:
66*ccdc9c3eSSadaf Ebrahimi       // These are simple as long as the subpieces are simple.
67*ccdc9c3eSSadaf Ebrahimi       subs = sub();
68*ccdc9c3eSSadaf Ebrahimi       for (int i = 0; i < nsub_; i++)
69*ccdc9c3eSSadaf Ebrahimi         if (!subs[i]->simple())
70*ccdc9c3eSSadaf Ebrahimi           return false;
71*ccdc9c3eSSadaf Ebrahimi       return true;
72*ccdc9c3eSSadaf Ebrahimi     case kRegexpCharClass:
73*ccdc9c3eSSadaf Ebrahimi       // Simple as long as the char class is not empty, not full.
74*ccdc9c3eSSadaf Ebrahimi       if (ccb_ != NULL)
75*ccdc9c3eSSadaf Ebrahimi         return !ccb_->empty() && !ccb_->full();
76*ccdc9c3eSSadaf Ebrahimi       return !cc_->empty() && !cc_->full();
77*ccdc9c3eSSadaf Ebrahimi     case kRegexpCapture:
78*ccdc9c3eSSadaf Ebrahimi       subs = sub();
79*ccdc9c3eSSadaf Ebrahimi       return subs[0]->simple();
80*ccdc9c3eSSadaf Ebrahimi     case kRegexpStar:
81*ccdc9c3eSSadaf Ebrahimi     case kRegexpPlus:
82*ccdc9c3eSSadaf Ebrahimi     case kRegexpQuest:
83*ccdc9c3eSSadaf Ebrahimi       subs = sub();
84*ccdc9c3eSSadaf Ebrahimi       if (!subs[0]->simple())
85*ccdc9c3eSSadaf Ebrahimi         return false;
86*ccdc9c3eSSadaf Ebrahimi       switch (subs[0]->op_) {
87*ccdc9c3eSSadaf Ebrahimi         case kRegexpStar:
88*ccdc9c3eSSadaf Ebrahimi         case kRegexpPlus:
89*ccdc9c3eSSadaf Ebrahimi         case kRegexpQuest:
90*ccdc9c3eSSadaf Ebrahimi         case kRegexpEmptyMatch:
91*ccdc9c3eSSadaf Ebrahimi         case kRegexpNoMatch:
92*ccdc9c3eSSadaf Ebrahimi           return false;
93*ccdc9c3eSSadaf Ebrahimi         default:
94*ccdc9c3eSSadaf Ebrahimi           break;
95*ccdc9c3eSSadaf Ebrahimi       }
96*ccdc9c3eSSadaf Ebrahimi       return true;
97*ccdc9c3eSSadaf Ebrahimi     case kRegexpRepeat:
98*ccdc9c3eSSadaf Ebrahimi       return false;
99*ccdc9c3eSSadaf Ebrahimi   }
100*ccdc9c3eSSadaf Ebrahimi   LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
101*ccdc9c3eSSadaf Ebrahimi   return false;
102*ccdc9c3eSSadaf Ebrahimi }
103*ccdc9c3eSSadaf Ebrahimi 
104*ccdc9c3eSSadaf Ebrahimi // Walker subclass used by Simplify.
105*ccdc9c3eSSadaf Ebrahimi // Coalesces runs of star/plus/quest/repeat of the same literal along with any
106*ccdc9c3eSSadaf Ebrahimi // occurrences of that literal into repeats of that literal. It also works for
107*ccdc9c3eSSadaf Ebrahimi // char classes, any char and any byte.
108*ccdc9c3eSSadaf Ebrahimi // PostVisit creates the coalesced result, which should then be simplified.
109*ccdc9c3eSSadaf Ebrahimi class CoalesceWalker : public Regexp::Walker<Regexp*> {
110*ccdc9c3eSSadaf Ebrahimi  public:
CoalesceWalker()111*ccdc9c3eSSadaf Ebrahimi   CoalesceWalker() {}
112*ccdc9c3eSSadaf Ebrahimi   virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
113*ccdc9c3eSSadaf Ebrahimi                             Regexp** child_args, int nchild_args);
114*ccdc9c3eSSadaf Ebrahimi   virtual Regexp* Copy(Regexp* re);
115*ccdc9c3eSSadaf Ebrahimi   virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
116*ccdc9c3eSSadaf Ebrahimi 
117*ccdc9c3eSSadaf Ebrahimi  private:
118*ccdc9c3eSSadaf Ebrahimi   // These functions are declared inside CoalesceWalker so that
119*ccdc9c3eSSadaf Ebrahimi   // they can edit the private fields of the Regexps they construct.
120*ccdc9c3eSSadaf Ebrahimi 
121*ccdc9c3eSSadaf Ebrahimi   // Returns true if r1 and r2 can be coalesced. In particular, ensures that
122*ccdc9c3eSSadaf Ebrahimi   // the parse flags are consistent. (They will not be checked again later.)
123*ccdc9c3eSSadaf Ebrahimi   static bool CanCoalesce(Regexp* r1, Regexp* r2);
124*ccdc9c3eSSadaf Ebrahimi 
125*ccdc9c3eSSadaf Ebrahimi   // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
126*ccdc9c3eSSadaf Ebrahimi   // will be empty match and the coalesced op. In other cases, where part of a
127*ccdc9c3eSSadaf Ebrahimi   // literal string was removed to be coalesced, the array elements afterwards
128*ccdc9c3eSSadaf Ebrahimi   // will be the coalesced op and the remainder of the literal string.
129*ccdc9c3eSSadaf Ebrahimi   static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
130*ccdc9c3eSSadaf Ebrahimi 
131*ccdc9c3eSSadaf Ebrahimi   CoalesceWalker(const CoalesceWalker&) = delete;
132*ccdc9c3eSSadaf Ebrahimi   CoalesceWalker& operator=(const CoalesceWalker&) = delete;
133*ccdc9c3eSSadaf Ebrahimi };
134*ccdc9c3eSSadaf Ebrahimi 
135*ccdc9c3eSSadaf Ebrahimi // Walker subclass used by Simplify.
136*ccdc9c3eSSadaf Ebrahimi // The simplify walk is purely post-recursive: given the simplified children,
137*ccdc9c3eSSadaf Ebrahimi // PostVisit creates the simplified result.
138*ccdc9c3eSSadaf Ebrahimi // The child_args are simplified Regexp*s.
139*ccdc9c3eSSadaf Ebrahimi class SimplifyWalker : public Regexp::Walker<Regexp*> {
140*ccdc9c3eSSadaf Ebrahimi  public:
SimplifyWalker()141*ccdc9c3eSSadaf Ebrahimi   SimplifyWalker() {}
142*ccdc9c3eSSadaf Ebrahimi   virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
143*ccdc9c3eSSadaf Ebrahimi   virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
144*ccdc9c3eSSadaf Ebrahimi                             Regexp** child_args, int nchild_args);
145*ccdc9c3eSSadaf Ebrahimi   virtual Regexp* Copy(Regexp* re);
146*ccdc9c3eSSadaf Ebrahimi   virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
147*ccdc9c3eSSadaf Ebrahimi 
148*ccdc9c3eSSadaf Ebrahimi  private:
149*ccdc9c3eSSadaf Ebrahimi   // These functions are declared inside SimplifyWalker so that
150*ccdc9c3eSSadaf Ebrahimi   // they can edit the private fields of the Regexps they construct.
151*ccdc9c3eSSadaf Ebrahimi 
152*ccdc9c3eSSadaf Ebrahimi   // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
153*ccdc9c3eSSadaf Ebrahimi   // Caller must Decref return value when done with it.
154*ccdc9c3eSSadaf Ebrahimi   static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
155*ccdc9c3eSSadaf Ebrahimi 
156*ccdc9c3eSSadaf Ebrahimi   // Simplifies the expression re{min,max} in terms of *, +, and ?.
157*ccdc9c3eSSadaf Ebrahimi   // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
158*ccdc9c3eSSadaf Ebrahimi   // Caller must Decref return value when done with it.
159*ccdc9c3eSSadaf Ebrahimi   static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
160*ccdc9c3eSSadaf Ebrahimi                                 Regexp::ParseFlags parse_flags);
161*ccdc9c3eSSadaf Ebrahimi 
162*ccdc9c3eSSadaf Ebrahimi   // Simplifies a character class by expanding any named classes
163*ccdc9c3eSSadaf Ebrahimi   // into rune ranges.  Does not edit re.  Does not consume ref to re.
164*ccdc9c3eSSadaf Ebrahimi   // Caller must Decref return value when done with it.
165*ccdc9c3eSSadaf Ebrahimi   static Regexp* SimplifyCharClass(Regexp* re);
166*ccdc9c3eSSadaf Ebrahimi 
167*ccdc9c3eSSadaf Ebrahimi   SimplifyWalker(const SimplifyWalker&) = delete;
168*ccdc9c3eSSadaf Ebrahimi   SimplifyWalker& operator=(const SimplifyWalker&) = delete;
169*ccdc9c3eSSadaf Ebrahimi };
170*ccdc9c3eSSadaf Ebrahimi 
171*ccdc9c3eSSadaf Ebrahimi // Simplifies a regular expression, returning a new regexp.
172*ccdc9c3eSSadaf Ebrahimi // The new regexp uses traditional Unix egrep features only,
173*ccdc9c3eSSadaf Ebrahimi // plus the Perl (?:) non-capturing parentheses.
174*ccdc9c3eSSadaf Ebrahimi // Otherwise, no POSIX or Perl additions.  The new regexp
175*ccdc9c3eSSadaf Ebrahimi // captures exactly the same subexpressions (with the same indices)
176*ccdc9c3eSSadaf Ebrahimi // as the original.
177*ccdc9c3eSSadaf Ebrahimi // Does not edit current object.
178*ccdc9c3eSSadaf Ebrahimi // Caller must Decref() return value when done with it.
179*ccdc9c3eSSadaf Ebrahimi 
Simplify()180*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::Simplify() {
181*ccdc9c3eSSadaf Ebrahimi   CoalesceWalker cw;
182*ccdc9c3eSSadaf Ebrahimi   Regexp* cre = cw.Walk(this, NULL);
183*ccdc9c3eSSadaf Ebrahimi   if (cre == NULL)
184*ccdc9c3eSSadaf Ebrahimi     return cre;
185*ccdc9c3eSSadaf Ebrahimi   SimplifyWalker sw;
186*ccdc9c3eSSadaf Ebrahimi   Regexp* sre = sw.Walk(cre, NULL);
187*ccdc9c3eSSadaf Ebrahimi   cre->Decref();
188*ccdc9c3eSSadaf Ebrahimi   return sre;
189*ccdc9c3eSSadaf Ebrahimi }
190*ccdc9c3eSSadaf Ebrahimi 
191*ccdc9c3eSSadaf Ebrahimi #define Simplify DontCallSimplify  // Avoid accidental recursion
192*ccdc9c3eSSadaf Ebrahimi 
193*ccdc9c3eSSadaf Ebrahimi // Utility function for PostVisit implementations that compares re->sub() with
194*ccdc9c3eSSadaf Ebrahimi // child_args to determine whether any child_args changed. In the common case,
195*ccdc9c3eSSadaf Ebrahimi // where nothing changed, calls Decref() for all child_args and returns false,
196*ccdc9c3eSSadaf Ebrahimi // so PostVisit must return re->Incref(). Otherwise, returns true.
ChildArgsChanged(Regexp * re,Regexp ** child_args)197*ccdc9c3eSSadaf Ebrahimi static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
198*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < re->nsub(); i++) {
199*ccdc9c3eSSadaf Ebrahimi     Regexp* sub = re->sub()[i];
200*ccdc9c3eSSadaf Ebrahimi     Regexp* newsub = child_args[i];
201*ccdc9c3eSSadaf Ebrahimi     if (newsub != sub)
202*ccdc9c3eSSadaf Ebrahimi       return true;
203*ccdc9c3eSSadaf Ebrahimi   }
204*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < re->nsub(); i++) {
205*ccdc9c3eSSadaf Ebrahimi     Regexp* newsub = child_args[i];
206*ccdc9c3eSSadaf Ebrahimi     newsub->Decref();
207*ccdc9c3eSSadaf Ebrahimi   }
208*ccdc9c3eSSadaf Ebrahimi   return false;
209*ccdc9c3eSSadaf Ebrahimi }
210*ccdc9c3eSSadaf Ebrahimi 
Copy(Regexp * re)211*ccdc9c3eSSadaf Ebrahimi Regexp* CoalesceWalker::Copy(Regexp* re) {
212*ccdc9c3eSSadaf Ebrahimi   return re->Incref();
213*ccdc9c3eSSadaf Ebrahimi }
214*ccdc9c3eSSadaf Ebrahimi 
ShortVisit(Regexp * re,Regexp * parent_arg)215*ccdc9c3eSSadaf Ebrahimi Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
216*ccdc9c3eSSadaf Ebrahimi   // This should never be called, since we use Walk and not
217*ccdc9c3eSSadaf Ebrahimi   // WalkExponential.
218*ccdc9c3eSSadaf Ebrahimi   LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
219*ccdc9c3eSSadaf Ebrahimi   return re->Incref();
220*ccdc9c3eSSadaf Ebrahimi }
221*ccdc9c3eSSadaf Ebrahimi 
PostVisit(Regexp * re,Regexp * parent_arg,Regexp * pre_arg,Regexp ** child_args,int nchild_args)222*ccdc9c3eSSadaf Ebrahimi Regexp* CoalesceWalker::PostVisit(Regexp* re,
223*ccdc9c3eSSadaf Ebrahimi                                   Regexp* parent_arg,
224*ccdc9c3eSSadaf Ebrahimi                                   Regexp* pre_arg,
225*ccdc9c3eSSadaf Ebrahimi                                   Regexp** child_args,
226*ccdc9c3eSSadaf Ebrahimi                                   int nchild_args) {
227*ccdc9c3eSSadaf Ebrahimi   if (re->nsub() == 0)
228*ccdc9c3eSSadaf Ebrahimi     return re->Incref();
229*ccdc9c3eSSadaf Ebrahimi 
230*ccdc9c3eSSadaf Ebrahimi   if (re->op() != kRegexpConcat) {
231*ccdc9c3eSSadaf Ebrahimi     if (!ChildArgsChanged(re, child_args))
232*ccdc9c3eSSadaf Ebrahimi       return re->Incref();
233*ccdc9c3eSSadaf Ebrahimi 
234*ccdc9c3eSSadaf Ebrahimi     // Something changed. Build a new op.
235*ccdc9c3eSSadaf Ebrahimi     Regexp* nre = new Regexp(re->op(), re->parse_flags());
236*ccdc9c3eSSadaf Ebrahimi     nre->AllocSub(re->nsub());
237*ccdc9c3eSSadaf Ebrahimi     Regexp** nre_subs = nre->sub();
238*ccdc9c3eSSadaf Ebrahimi     for (int i = 0; i < re->nsub(); i++)
239*ccdc9c3eSSadaf Ebrahimi       nre_subs[i] = child_args[i];
240*ccdc9c3eSSadaf Ebrahimi     // Repeats and Captures have additional data that must be copied.
241*ccdc9c3eSSadaf Ebrahimi     if (re->op() == kRegexpRepeat) {
242*ccdc9c3eSSadaf Ebrahimi       nre->min_ = re->min();
243*ccdc9c3eSSadaf Ebrahimi       nre->max_ = re->max();
244*ccdc9c3eSSadaf Ebrahimi     } else if (re->op() == kRegexpCapture) {
245*ccdc9c3eSSadaf Ebrahimi       nre->cap_ = re->cap();
246*ccdc9c3eSSadaf Ebrahimi     }
247*ccdc9c3eSSadaf Ebrahimi     return nre;
248*ccdc9c3eSSadaf Ebrahimi   }
249*ccdc9c3eSSadaf Ebrahimi 
250*ccdc9c3eSSadaf Ebrahimi   bool can_coalesce = false;
251*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < re->nsub(); i++) {
252*ccdc9c3eSSadaf Ebrahimi     if (i+1 < re->nsub() &&
253*ccdc9c3eSSadaf Ebrahimi         CanCoalesce(child_args[i], child_args[i+1])) {
254*ccdc9c3eSSadaf Ebrahimi       can_coalesce = true;
255*ccdc9c3eSSadaf Ebrahimi       break;
256*ccdc9c3eSSadaf Ebrahimi     }
257*ccdc9c3eSSadaf Ebrahimi   }
258*ccdc9c3eSSadaf Ebrahimi   if (!can_coalesce) {
259*ccdc9c3eSSadaf Ebrahimi     if (!ChildArgsChanged(re, child_args))
260*ccdc9c3eSSadaf Ebrahimi       return re->Incref();
261*ccdc9c3eSSadaf Ebrahimi 
262*ccdc9c3eSSadaf Ebrahimi     // Something changed. Build a new op.
263*ccdc9c3eSSadaf Ebrahimi     Regexp* nre = new Regexp(re->op(), re->parse_flags());
264*ccdc9c3eSSadaf Ebrahimi     nre->AllocSub(re->nsub());
265*ccdc9c3eSSadaf Ebrahimi     Regexp** nre_subs = nre->sub();
266*ccdc9c3eSSadaf Ebrahimi     for (int i = 0; i < re->nsub(); i++)
267*ccdc9c3eSSadaf Ebrahimi       nre_subs[i] = child_args[i];
268*ccdc9c3eSSadaf Ebrahimi     return nre;
269*ccdc9c3eSSadaf Ebrahimi   }
270*ccdc9c3eSSadaf Ebrahimi 
271*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < re->nsub(); i++) {
272*ccdc9c3eSSadaf Ebrahimi     if (i+1 < re->nsub() &&
273*ccdc9c3eSSadaf Ebrahimi         CanCoalesce(child_args[i], child_args[i+1]))
274*ccdc9c3eSSadaf Ebrahimi       DoCoalesce(&child_args[i], &child_args[i+1]);
275*ccdc9c3eSSadaf Ebrahimi   }
276*ccdc9c3eSSadaf Ebrahimi   // Determine how many empty matches were left by DoCoalesce.
277*ccdc9c3eSSadaf Ebrahimi   int n = 0;
278*ccdc9c3eSSadaf Ebrahimi   for (int i = n; i < re->nsub(); i++) {
279*ccdc9c3eSSadaf Ebrahimi     if (child_args[i]->op() == kRegexpEmptyMatch)
280*ccdc9c3eSSadaf Ebrahimi       n++;
281*ccdc9c3eSSadaf Ebrahimi   }
282*ccdc9c3eSSadaf Ebrahimi   // Build a new op.
283*ccdc9c3eSSadaf Ebrahimi   Regexp* nre = new Regexp(re->op(), re->parse_flags());
284*ccdc9c3eSSadaf Ebrahimi   nre->AllocSub(re->nsub() - n);
285*ccdc9c3eSSadaf Ebrahimi   Regexp** nre_subs = nre->sub();
286*ccdc9c3eSSadaf Ebrahimi   for (int i = 0, j = 0; i < re->nsub(); i++) {
287*ccdc9c3eSSadaf Ebrahimi     if (child_args[i]->op() == kRegexpEmptyMatch) {
288*ccdc9c3eSSadaf Ebrahimi       child_args[i]->Decref();
289*ccdc9c3eSSadaf Ebrahimi       continue;
290*ccdc9c3eSSadaf Ebrahimi     }
291*ccdc9c3eSSadaf Ebrahimi     nre_subs[j] = child_args[i];
292*ccdc9c3eSSadaf Ebrahimi     j++;
293*ccdc9c3eSSadaf Ebrahimi   }
294*ccdc9c3eSSadaf Ebrahimi   return nre;
295*ccdc9c3eSSadaf Ebrahimi }
296*ccdc9c3eSSadaf Ebrahimi 
CanCoalesce(Regexp * r1,Regexp * r2)297*ccdc9c3eSSadaf Ebrahimi bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
298*ccdc9c3eSSadaf Ebrahimi   // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
299*ccdc9c3eSSadaf Ebrahimi   // any byte.
300*ccdc9c3eSSadaf Ebrahimi   if ((r1->op() == kRegexpStar ||
301*ccdc9c3eSSadaf Ebrahimi        r1->op() == kRegexpPlus ||
302*ccdc9c3eSSadaf Ebrahimi        r1->op() == kRegexpQuest ||
303*ccdc9c3eSSadaf Ebrahimi        r1->op() == kRegexpRepeat) &&
304*ccdc9c3eSSadaf Ebrahimi       (r1->sub()[0]->op() == kRegexpLiteral ||
305*ccdc9c3eSSadaf Ebrahimi        r1->sub()[0]->op() == kRegexpCharClass ||
306*ccdc9c3eSSadaf Ebrahimi        r1->sub()[0]->op() == kRegexpAnyChar ||
307*ccdc9c3eSSadaf Ebrahimi        r1->sub()[0]->op() == kRegexpAnyByte)) {
308*ccdc9c3eSSadaf Ebrahimi     // r2 must be a star/plus/quest/repeat of the same literal, char class,
309*ccdc9c3eSSadaf Ebrahimi     // any char or any byte.
310*ccdc9c3eSSadaf Ebrahimi     if ((r2->op() == kRegexpStar ||
311*ccdc9c3eSSadaf Ebrahimi          r2->op() == kRegexpPlus ||
312*ccdc9c3eSSadaf Ebrahimi          r2->op() == kRegexpQuest ||
313*ccdc9c3eSSadaf Ebrahimi          r2->op() == kRegexpRepeat) &&
314*ccdc9c3eSSadaf Ebrahimi         Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
315*ccdc9c3eSSadaf Ebrahimi         // The parse flags must be consistent.
316*ccdc9c3eSSadaf Ebrahimi         ((r1->parse_flags() & Regexp::NonGreedy) ==
317*ccdc9c3eSSadaf Ebrahimi          (r2->parse_flags() & Regexp::NonGreedy))) {
318*ccdc9c3eSSadaf Ebrahimi       return true;
319*ccdc9c3eSSadaf Ebrahimi     }
320*ccdc9c3eSSadaf Ebrahimi     // ... OR an occurrence of that literal, char class, any char or any byte
321*ccdc9c3eSSadaf Ebrahimi     if (Regexp::Equal(r1->sub()[0], r2)) {
322*ccdc9c3eSSadaf Ebrahimi       return true;
323*ccdc9c3eSSadaf Ebrahimi     }
324*ccdc9c3eSSadaf Ebrahimi     // ... OR a literal string that begins with that literal.
325*ccdc9c3eSSadaf Ebrahimi     if (r1->sub()[0]->op() == kRegexpLiteral &&
326*ccdc9c3eSSadaf Ebrahimi         r2->op() == kRegexpLiteralString &&
327*ccdc9c3eSSadaf Ebrahimi         r2->runes()[0] == r1->sub()[0]->rune() &&
328*ccdc9c3eSSadaf Ebrahimi         // The parse flags must be consistent.
329*ccdc9c3eSSadaf Ebrahimi         ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
330*ccdc9c3eSSadaf Ebrahimi          (r2->parse_flags() & Regexp::FoldCase))) {
331*ccdc9c3eSSadaf Ebrahimi       return true;
332*ccdc9c3eSSadaf Ebrahimi     }
333*ccdc9c3eSSadaf Ebrahimi   }
334*ccdc9c3eSSadaf Ebrahimi   return false;
335*ccdc9c3eSSadaf Ebrahimi }
336*ccdc9c3eSSadaf Ebrahimi 
DoCoalesce(Regexp ** r1ptr,Regexp ** r2ptr)337*ccdc9c3eSSadaf Ebrahimi void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
338*ccdc9c3eSSadaf Ebrahimi   Regexp* r1 = *r1ptr;
339*ccdc9c3eSSadaf Ebrahimi   Regexp* r2 = *r2ptr;
340*ccdc9c3eSSadaf Ebrahimi 
341*ccdc9c3eSSadaf Ebrahimi   Regexp* nre = Regexp::Repeat(
342*ccdc9c3eSSadaf Ebrahimi       r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
343*ccdc9c3eSSadaf Ebrahimi 
344*ccdc9c3eSSadaf Ebrahimi   switch (r1->op()) {
345*ccdc9c3eSSadaf Ebrahimi     case kRegexpStar:
346*ccdc9c3eSSadaf Ebrahimi       nre->min_ = 0;
347*ccdc9c3eSSadaf Ebrahimi       nre->max_ = -1;
348*ccdc9c3eSSadaf Ebrahimi       break;
349*ccdc9c3eSSadaf Ebrahimi 
350*ccdc9c3eSSadaf Ebrahimi     case kRegexpPlus:
351*ccdc9c3eSSadaf Ebrahimi       nre->min_ = 1;
352*ccdc9c3eSSadaf Ebrahimi       nre->max_ = -1;
353*ccdc9c3eSSadaf Ebrahimi       break;
354*ccdc9c3eSSadaf Ebrahimi 
355*ccdc9c3eSSadaf Ebrahimi     case kRegexpQuest:
356*ccdc9c3eSSadaf Ebrahimi       nre->min_ = 0;
357*ccdc9c3eSSadaf Ebrahimi       nre->max_ = 1;
358*ccdc9c3eSSadaf Ebrahimi       break;
359*ccdc9c3eSSadaf Ebrahimi 
360*ccdc9c3eSSadaf Ebrahimi     case kRegexpRepeat:
361*ccdc9c3eSSadaf Ebrahimi       nre->min_ = r1->min();
362*ccdc9c3eSSadaf Ebrahimi       nre->max_ = r1->max();
363*ccdc9c3eSSadaf Ebrahimi       break;
364*ccdc9c3eSSadaf Ebrahimi 
365*ccdc9c3eSSadaf Ebrahimi     default:
366*ccdc9c3eSSadaf Ebrahimi       LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
367*ccdc9c3eSSadaf Ebrahimi       nre->Decref();
368*ccdc9c3eSSadaf Ebrahimi       return;
369*ccdc9c3eSSadaf Ebrahimi   }
370*ccdc9c3eSSadaf Ebrahimi 
371*ccdc9c3eSSadaf Ebrahimi   switch (r2->op()) {
372*ccdc9c3eSSadaf Ebrahimi     case kRegexpStar:
373*ccdc9c3eSSadaf Ebrahimi       nre->max_ = -1;
374*ccdc9c3eSSadaf Ebrahimi       goto LeaveEmpty;
375*ccdc9c3eSSadaf Ebrahimi 
376*ccdc9c3eSSadaf Ebrahimi     case kRegexpPlus:
377*ccdc9c3eSSadaf Ebrahimi       nre->min_++;
378*ccdc9c3eSSadaf Ebrahimi       nre->max_ = -1;
379*ccdc9c3eSSadaf Ebrahimi       goto LeaveEmpty;
380*ccdc9c3eSSadaf Ebrahimi 
381*ccdc9c3eSSadaf Ebrahimi     case kRegexpQuest:
382*ccdc9c3eSSadaf Ebrahimi       if (nre->max() != -1)
383*ccdc9c3eSSadaf Ebrahimi         nre->max_++;
384*ccdc9c3eSSadaf Ebrahimi       goto LeaveEmpty;
385*ccdc9c3eSSadaf Ebrahimi 
386*ccdc9c3eSSadaf Ebrahimi     case kRegexpRepeat:
387*ccdc9c3eSSadaf Ebrahimi       nre->min_ += r2->min();
388*ccdc9c3eSSadaf Ebrahimi       if (r2->max() == -1)
389*ccdc9c3eSSadaf Ebrahimi         nre->max_ = -1;
390*ccdc9c3eSSadaf Ebrahimi       else if (nre->max() != -1)
391*ccdc9c3eSSadaf Ebrahimi         nre->max_ += r2->max();
392*ccdc9c3eSSadaf Ebrahimi       goto LeaveEmpty;
393*ccdc9c3eSSadaf Ebrahimi 
394*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteral:
395*ccdc9c3eSSadaf Ebrahimi     case kRegexpCharClass:
396*ccdc9c3eSSadaf Ebrahimi     case kRegexpAnyChar:
397*ccdc9c3eSSadaf Ebrahimi     case kRegexpAnyByte:
398*ccdc9c3eSSadaf Ebrahimi       nre->min_++;
399*ccdc9c3eSSadaf Ebrahimi       if (nre->max() != -1)
400*ccdc9c3eSSadaf Ebrahimi         nre->max_++;
401*ccdc9c3eSSadaf Ebrahimi       goto LeaveEmpty;
402*ccdc9c3eSSadaf Ebrahimi 
403*ccdc9c3eSSadaf Ebrahimi     LeaveEmpty:
404*ccdc9c3eSSadaf Ebrahimi       *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
405*ccdc9c3eSSadaf Ebrahimi       *r2ptr = nre;
406*ccdc9c3eSSadaf Ebrahimi       break;
407*ccdc9c3eSSadaf Ebrahimi 
408*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteralString: {
409*ccdc9c3eSSadaf Ebrahimi       Rune r = r1->sub()[0]->rune();
410*ccdc9c3eSSadaf Ebrahimi       // Determine how much of the literal string is removed.
411*ccdc9c3eSSadaf Ebrahimi       // We know that we have at least one rune. :)
412*ccdc9c3eSSadaf Ebrahimi       int n = 1;
413*ccdc9c3eSSadaf Ebrahimi       while (n < r2->nrunes() && r2->runes()[n] == r)
414*ccdc9c3eSSadaf Ebrahimi         n++;
415*ccdc9c3eSSadaf Ebrahimi       nre->min_ += n;
416*ccdc9c3eSSadaf Ebrahimi       if (nre->max() != -1)
417*ccdc9c3eSSadaf Ebrahimi         nre->max_ += n;
418*ccdc9c3eSSadaf Ebrahimi       if (n == r2->nrunes())
419*ccdc9c3eSSadaf Ebrahimi         goto LeaveEmpty;
420*ccdc9c3eSSadaf Ebrahimi       *r1ptr = nre;
421*ccdc9c3eSSadaf Ebrahimi       *r2ptr = Regexp::LiteralString(
422*ccdc9c3eSSadaf Ebrahimi           &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
423*ccdc9c3eSSadaf Ebrahimi       break;
424*ccdc9c3eSSadaf Ebrahimi     }
425*ccdc9c3eSSadaf Ebrahimi 
426*ccdc9c3eSSadaf Ebrahimi     default:
427*ccdc9c3eSSadaf Ebrahimi       LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
428*ccdc9c3eSSadaf Ebrahimi       nre->Decref();
429*ccdc9c3eSSadaf Ebrahimi       return;
430*ccdc9c3eSSadaf Ebrahimi   }
431*ccdc9c3eSSadaf Ebrahimi 
432*ccdc9c3eSSadaf Ebrahimi   r1->Decref();
433*ccdc9c3eSSadaf Ebrahimi   r2->Decref();
434*ccdc9c3eSSadaf Ebrahimi }
435*ccdc9c3eSSadaf Ebrahimi 
Copy(Regexp * re)436*ccdc9c3eSSadaf Ebrahimi Regexp* SimplifyWalker::Copy(Regexp* re) {
437*ccdc9c3eSSadaf Ebrahimi   return re->Incref();
438*ccdc9c3eSSadaf Ebrahimi }
439*ccdc9c3eSSadaf Ebrahimi 
ShortVisit(Regexp * re,Regexp * parent_arg)440*ccdc9c3eSSadaf Ebrahimi Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
441*ccdc9c3eSSadaf Ebrahimi   // This should never be called, since we use Walk and not
442*ccdc9c3eSSadaf Ebrahimi   // WalkExponential.
443*ccdc9c3eSSadaf Ebrahimi   LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
444*ccdc9c3eSSadaf Ebrahimi   return re->Incref();
445*ccdc9c3eSSadaf Ebrahimi }
446*ccdc9c3eSSadaf Ebrahimi 
PreVisit(Regexp * re,Regexp * parent_arg,bool * stop)447*ccdc9c3eSSadaf Ebrahimi Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
448*ccdc9c3eSSadaf Ebrahimi   if (re->simple()) {
449*ccdc9c3eSSadaf Ebrahimi     *stop = true;
450*ccdc9c3eSSadaf Ebrahimi     return re->Incref();
451*ccdc9c3eSSadaf Ebrahimi   }
452*ccdc9c3eSSadaf Ebrahimi   return NULL;
453*ccdc9c3eSSadaf Ebrahimi }
454*ccdc9c3eSSadaf Ebrahimi 
PostVisit(Regexp * re,Regexp * parent_arg,Regexp * pre_arg,Regexp ** child_args,int nchild_args)455*ccdc9c3eSSadaf Ebrahimi Regexp* SimplifyWalker::PostVisit(Regexp* re,
456*ccdc9c3eSSadaf Ebrahimi                                   Regexp* parent_arg,
457*ccdc9c3eSSadaf Ebrahimi                                   Regexp* pre_arg,
458*ccdc9c3eSSadaf Ebrahimi                                   Regexp** child_args,
459*ccdc9c3eSSadaf Ebrahimi                                   int nchild_args) {
460*ccdc9c3eSSadaf Ebrahimi   switch (re->op()) {
461*ccdc9c3eSSadaf Ebrahimi     case kRegexpNoMatch:
462*ccdc9c3eSSadaf Ebrahimi     case kRegexpEmptyMatch:
463*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteral:
464*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteralString:
465*ccdc9c3eSSadaf Ebrahimi     case kRegexpBeginLine:
466*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndLine:
467*ccdc9c3eSSadaf Ebrahimi     case kRegexpBeginText:
468*ccdc9c3eSSadaf Ebrahimi     case kRegexpWordBoundary:
469*ccdc9c3eSSadaf Ebrahimi     case kRegexpNoWordBoundary:
470*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndText:
471*ccdc9c3eSSadaf Ebrahimi     case kRegexpAnyChar:
472*ccdc9c3eSSadaf Ebrahimi     case kRegexpAnyByte:
473*ccdc9c3eSSadaf Ebrahimi     case kRegexpHaveMatch:
474*ccdc9c3eSSadaf Ebrahimi       // All these are always simple.
475*ccdc9c3eSSadaf Ebrahimi       re->simple_ = true;
476*ccdc9c3eSSadaf Ebrahimi       return re->Incref();
477*ccdc9c3eSSadaf Ebrahimi 
478*ccdc9c3eSSadaf Ebrahimi     case kRegexpConcat:
479*ccdc9c3eSSadaf Ebrahimi     case kRegexpAlternate: {
480*ccdc9c3eSSadaf Ebrahimi       // These are simple as long as the subpieces are simple.
481*ccdc9c3eSSadaf Ebrahimi       if (!ChildArgsChanged(re, child_args)) {
482*ccdc9c3eSSadaf Ebrahimi         re->simple_ = true;
483*ccdc9c3eSSadaf Ebrahimi         return re->Incref();
484*ccdc9c3eSSadaf Ebrahimi       }
485*ccdc9c3eSSadaf Ebrahimi       Regexp* nre = new Regexp(re->op(), re->parse_flags());
486*ccdc9c3eSSadaf Ebrahimi       nre->AllocSub(re->nsub());
487*ccdc9c3eSSadaf Ebrahimi       Regexp** nre_subs = nre->sub();
488*ccdc9c3eSSadaf Ebrahimi       for (int i = 0; i < re->nsub(); i++)
489*ccdc9c3eSSadaf Ebrahimi         nre_subs[i] = child_args[i];
490*ccdc9c3eSSadaf Ebrahimi       nre->simple_ = true;
491*ccdc9c3eSSadaf Ebrahimi       return nre;
492*ccdc9c3eSSadaf Ebrahimi     }
493*ccdc9c3eSSadaf Ebrahimi 
494*ccdc9c3eSSadaf Ebrahimi     case kRegexpCapture: {
495*ccdc9c3eSSadaf Ebrahimi       Regexp* newsub = child_args[0];
496*ccdc9c3eSSadaf Ebrahimi       if (newsub == re->sub()[0]) {
497*ccdc9c3eSSadaf Ebrahimi         newsub->Decref();
498*ccdc9c3eSSadaf Ebrahimi         re->simple_ = true;
499*ccdc9c3eSSadaf Ebrahimi         return re->Incref();
500*ccdc9c3eSSadaf Ebrahimi       }
501*ccdc9c3eSSadaf Ebrahimi       Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
502*ccdc9c3eSSadaf Ebrahimi       nre->AllocSub(1);
503*ccdc9c3eSSadaf Ebrahimi       nre->sub()[0] = newsub;
504*ccdc9c3eSSadaf Ebrahimi       nre->cap_ = re->cap();
505*ccdc9c3eSSadaf Ebrahimi       nre->simple_ = true;
506*ccdc9c3eSSadaf Ebrahimi       return nre;
507*ccdc9c3eSSadaf Ebrahimi     }
508*ccdc9c3eSSadaf Ebrahimi 
509*ccdc9c3eSSadaf Ebrahimi     case kRegexpStar:
510*ccdc9c3eSSadaf Ebrahimi     case kRegexpPlus:
511*ccdc9c3eSSadaf Ebrahimi     case kRegexpQuest: {
512*ccdc9c3eSSadaf Ebrahimi       Regexp* newsub = child_args[0];
513*ccdc9c3eSSadaf Ebrahimi       // Special case: repeat the empty string as much as
514*ccdc9c3eSSadaf Ebrahimi       // you want, but it's still the empty string.
515*ccdc9c3eSSadaf Ebrahimi       if (newsub->op() == kRegexpEmptyMatch)
516*ccdc9c3eSSadaf Ebrahimi         return newsub;
517*ccdc9c3eSSadaf Ebrahimi 
518*ccdc9c3eSSadaf Ebrahimi       // These are simple as long as the subpiece is simple.
519*ccdc9c3eSSadaf Ebrahimi       if (newsub == re->sub()[0]) {
520*ccdc9c3eSSadaf Ebrahimi         newsub->Decref();
521*ccdc9c3eSSadaf Ebrahimi         re->simple_ = true;
522*ccdc9c3eSSadaf Ebrahimi         return re->Incref();
523*ccdc9c3eSSadaf Ebrahimi       }
524*ccdc9c3eSSadaf Ebrahimi 
525*ccdc9c3eSSadaf Ebrahimi       // These are also idempotent if flags are constant.
526*ccdc9c3eSSadaf Ebrahimi       if (re->op() == newsub->op() &&
527*ccdc9c3eSSadaf Ebrahimi           re->parse_flags() == newsub->parse_flags())
528*ccdc9c3eSSadaf Ebrahimi         return newsub;
529*ccdc9c3eSSadaf Ebrahimi 
530*ccdc9c3eSSadaf Ebrahimi       Regexp* nre = new Regexp(re->op(), re->parse_flags());
531*ccdc9c3eSSadaf Ebrahimi       nre->AllocSub(1);
532*ccdc9c3eSSadaf Ebrahimi       nre->sub()[0] = newsub;
533*ccdc9c3eSSadaf Ebrahimi       nre->simple_ = true;
534*ccdc9c3eSSadaf Ebrahimi       return nre;
535*ccdc9c3eSSadaf Ebrahimi     }
536*ccdc9c3eSSadaf Ebrahimi 
537*ccdc9c3eSSadaf Ebrahimi     case kRegexpRepeat: {
538*ccdc9c3eSSadaf Ebrahimi       Regexp* newsub = child_args[0];
539*ccdc9c3eSSadaf Ebrahimi       // Special case: repeat the empty string as much as
540*ccdc9c3eSSadaf Ebrahimi       // you want, but it's still the empty string.
541*ccdc9c3eSSadaf Ebrahimi       if (newsub->op() == kRegexpEmptyMatch)
542*ccdc9c3eSSadaf Ebrahimi         return newsub;
543*ccdc9c3eSSadaf Ebrahimi 
544*ccdc9c3eSSadaf Ebrahimi       Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
545*ccdc9c3eSSadaf Ebrahimi                                    re->parse_flags());
546*ccdc9c3eSSadaf Ebrahimi       newsub->Decref();
547*ccdc9c3eSSadaf Ebrahimi       nre->simple_ = true;
548*ccdc9c3eSSadaf Ebrahimi       return nre;
549*ccdc9c3eSSadaf Ebrahimi     }
550*ccdc9c3eSSadaf Ebrahimi 
551*ccdc9c3eSSadaf Ebrahimi     case kRegexpCharClass: {
552*ccdc9c3eSSadaf Ebrahimi       Regexp* nre = SimplifyCharClass(re);
553*ccdc9c3eSSadaf Ebrahimi       nre->simple_ = true;
554*ccdc9c3eSSadaf Ebrahimi       return nre;
555*ccdc9c3eSSadaf Ebrahimi     }
556*ccdc9c3eSSadaf Ebrahimi   }
557*ccdc9c3eSSadaf Ebrahimi 
558*ccdc9c3eSSadaf Ebrahimi   LOG(ERROR) << "Simplify case not handled: " << re->op();
559*ccdc9c3eSSadaf Ebrahimi   return re->Incref();
560*ccdc9c3eSSadaf Ebrahimi }
561*ccdc9c3eSSadaf Ebrahimi 
562*ccdc9c3eSSadaf Ebrahimi // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
563*ccdc9c3eSSadaf Ebrahimi // Returns a new Regexp, handing the ref to the caller.
Concat2(Regexp * re1,Regexp * re2,Regexp::ParseFlags parse_flags)564*ccdc9c3eSSadaf Ebrahimi Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
565*ccdc9c3eSSadaf Ebrahimi                                 Regexp::ParseFlags parse_flags) {
566*ccdc9c3eSSadaf Ebrahimi   Regexp* re = new Regexp(kRegexpConcat, parse_flags);
567*ccdc9c3eSSadaf Ebrahimi   re->AllocSub(2);
568*ccdc9c3eSSadaf Ebrahimi   Regexp** subs = re->sub();
569*ccdc9c3eSSadaf Ebrahimi   subs[0] = re1;
570*ccdc9c3eSSadaf Ebrahimi   subs[1] = re2;
571*ccdc9c3eSSadaf Ebrahimi   return re;
572*ccdc9c3eSSadaf Ebrahimi }
573*ccdc9c3eSSadaf Ebrahimi 
574*ccdc9c3eSSadaf Ebrahimi // Simplifies the expression re{min,max} in terms of *, +, and ?.
575*ccdc9c3eSSadaf Ebrahimi // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
576*ccdc9c3eSSadaf Ebrahimi // Caller must Decref return value when done with it.
577*ccdc9c3eSSadaf Ebrahimi // The result will *not* necessarily have the right capturing parens
578*ccdc9c3eSSadaf Ebrahimi // if you call ToString() and re-parse it: (x){2} becomes (x)(x),
579*ccdc9c3eSSadaf Ebrahimi // but in the Regexp* representation, both (x) are marked as $1.
SimplifyRepeat(Regexp * re,int min,int max,Regexp::ParseFlags f)580*ccdc9c3eSSadaf Ebrahimi Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
581*ccdc9c3eSSadaf Ebrahimi                                        Regexp::ParseFlags f) {
582*ccdc9c3eSSadaf Ebrahimi   // x{n,} means at least n matches of x.
583*ccdc9c3eSSadaf Ebrahimi   if (max == -1) {
584*ccdc9c3eSSadaf Ebrahimi     // Special case: x{0,} is x*
585*ccdc9c3eSSadaf Ebrahimi     if (min == 0)
586*ccdc9c3eSSadaf Ebrahimi       return Regexp::Star(re->Incref(), f);
587*ccdc9c3eSSadaf Ebrahimi 
588*ccdc9c3eSSadaf Ebrahimi     // Special case: x{1,} is x+
589*ccdc9c3eSSadaf Ebrahimi     if (min == 1)
590*ccdc9c3eSSadaf Ebrahimi       return Regexp::Plus(re->Incref(), f);
591*ccdc9c3eSSadaf Ebrahimi 
592*ccdc9c3eSSadaf Ebrahimi     // General case: x{4,} is xxxx+
593*ccdc9c3eSSadaf Ebrahimi     PODArray<Regexp*> nre_subs(min);
594*ccdc9c3eSSadaf Ebrahimi     for (int i = 0; i < min-1; i++)
595*ccdc9c3eSSadaf Ebrahimi       nre_subs[i] = re->Incref();
596*ccdc9c3eSSadaf Ebrahimi     nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
597*ccdc9c3eSSadaf Ebrahimi     return Regexp::Concat(nre_subs.data(), min, f);
598*ccdc9c3eSSadaf Ebrahimi   }
599*ccdc9c3eSSadaf Ebrahimi 
600*ccdc9c3eSSadaf Ebrahimi   // Special case: (x){0} matches only empty string.
601*ccdc9c3eSSadaf Ebrahimi   if (min == 0 && max == 0)
602*ccdc9c3eSSadaf Ebrahimi     return new Regexp(kRegexpEmptyMatch, f);
603*ccdc9c3eSSadaf Ebrahimi 
604*ccdc9c3eSSadaf Ebrahimi   // Special case: x{1} is just x.
605*ccdc9c3eSSadaf Ebrahimi   if (min == 1 && max == 1)
606*ccdc9c3eSSadaf Ebrahimi     return re->Incref();
607*ccdc9c3eSSadaf Ebrahimi 
608*ccdc9c3eSSadaf Ebrahimi   // General case: x{n,m} means n copies of x and m copies of x?.
609*ccdc9c3eSSadaf Ebrahimi   // The machine will do less work if we nest the final m copies,
610*ccdc9c3eSSadaf Ebrahimi   // so that x{2,5} = xx(x(x(x)?)?)?
611*ccdc9c3eSSadaf Ebrahimi 
612*ccdc9c3eSSadaf Ebrahimi   // Build leading prefix: xx.  Capturing only on the last one.
613*ccdc9c3eSSadaf Ebrahimi   Regexp* nre = NULL;
614*ccdc9c3eSSadaf Ebrahimi   if (min > 0) {
615*ccdc9c3eSSadaf Ebrahimi     PODArray<Regexp*> nre_subs(min);
616*ccdc9c3eSSadaf Ebrahimi     for (int i = 0; i < min; i++)
617*ccdc9c3eSSadaf Ebrahimi       nre_subs[i] = re->Incref();
618*ccdc9c3eSSadaf Ebrahimi     nre = Regexp::Concat(nre_subs.data(), min, f);
619*ccdc9c3eSSadaf Ebrahimi   }
620*ccdc9c3eSSadaf Ebrahimi 
621*ccdc9c3eSSadaf Ebrahimi   // Build and attach suffix: (x(x(x)?)?)?
622*ccdc9c3eSSadaf Ebrahimi   if (max > min) {
623*ccdc9c3eSSadaf Ebrahimi     Regexp* suf = Regexp::Quest(re->Incref(), f);
624*ccdc9c3eSSadaf Ebrahimi     for (int i = min+1; i < max; i++)
625*ccdc9c3eSSadaf Ebrahimi       suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
626*ccdc9c3eSSadaf Ebrahimi     if (nre == NULL)
627*ccdc9c3eSSadaf Ebrahimi       nre = suf;
628*ccdc9c3eSSadaf Ebrahimi     else
629*ccdc9c3eSSadaf Ebrahimi       nre = Concat2(nre, suf, f);
630*ccdc9c3eSSadaf Ebrahimi   }
631*ccdc9c3eSSadaf Ebrahimi 
632*ccdc9c3eSSadaf Ebrahimi   if (nre == NULL) {
633*ccdc9c3eSSadaf Ebrahimi     // Some degenerate case, like min > max, or min < max < 0.
634*ccdc9c3eSSadaf Ebrahimi     // This shouldn't happen, because the parser rejects such regexps.
635*ccdc9c3eSSadaf Ebrahimi     LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
636*ccdc9c3eSSadaf Ebrahimi     return new Regexp(kRegexpNoMatch, f);
637*ccdc9c3eSSadaf Ebrahimi   }
638*ccdc9c3eSSadaf Ebrahimi 
639*ccdc9c3eSSadaf Ebrahimi   return nre;
640*ccdc9c3eSSadaf Ebrahimi }
641*ccdc9c3eSSadaf Ebrahimi 
642*ccdc9c3eSSadaf Ebrahimi // Simplifies a character class.
643*ccdc9c3eSSadaf Ebrahimi // Caller must Decref return value when done with it.
SimplifyCharClass(Regexp * re)644*ccdc9c3eSSadaf Ebrahimi Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
645*ccdc9c3eSSadaf Ebrahimi   CharClass* cc = re->cc();
646*ccdc9c3eSSadaf Ebrahimi 
647*ccdc9c3eSSadaf Ebrahimi   // Special cases
648*ccdc9c3eSSadaf Ebrahimi   if (cc->empty())
649*ccdc9c3eSSadaf Ebrahimi     return new Regexp(kRegexpNoMatch, re->parse_flags());
650*ccdc9c3eSSadaf Ebrahimi   if (cc->full())
651*ccdc9c3eSSadaf Ebrahimi     return new Regexp(kRegexpAnyChar, re->parse_flags());
652*ccdc9c3eSSadaf Ebrahimi 
653*ccdc9c3eSSadaf Ebrahimi   return re->Incref();
654*ccdc9c3eSSadaf Ebrahimi }
655*ccdc9c3eSSadaf Ebrahimi 
656*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
657