xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/mimics_pcre.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Determine whether this library should match PCRE exactly
6 // for a particular Regexp.  (If so, the testing framework can
7 // check that it does.)
8 //
9 // This library matches PCRE except in these cases:
10 //   * the regexp contains a repetition of an empty string,
11 //     like (a*)* or (a*)+.  In this case, PCRE will treat
12 //     the repetition sequence as ending with an empty string,
13 //     while this library does not.
14 //   * Perl and PCRE differ on whether \v matches \n.
15 //     For historical reasons, this library implements the Perl behavior.
16 //   * Perl and PCRE allow $ in one-line mode to match either the very
17 //     end of the text or just before a \n at the end of the text.
18 //     This library requires it to match only the end of the text.
19 //   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20 //     match the end of the text if the last character is a \n.
21 //     This library does allow it.
22 //
23 // Regexp::MimicsPCRE checks for any of these conditions.
24 
25 #include "util/logging.h"
26 #include "re2/regexp.h"
27 #include "re2/walker-inl.h"
28 
29 namespace re2 {
30 
31 // Returns whether re might match an empty string.
32 static bool CanBeEmptyString(Regexp *re);
33 
34 // Walker class to compute whether library handles a regexp
35 // exactly as PCRE would.  See comment at top for conditions.
36 
37 class PCREWalker : public Regexp::Walker<bool> {
38  public:
PCREWalker()39   PCREWalker() {}
40 
41   virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
42                          bool* child_args, int nchild_args);
43 
ShortVisit(Regexp * re,bool a)44   virtual bool ShortVisit(Regexp* re, bool a) {
45     // Should never be called: we use Walk(), not WalkExponential().
46 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
47     LOG(DFATAL) << "PCREWalker::ShortVisit called";
48 #endif
49     return a;
50   }
51 
52  private:
53   PCREWalker(const PCREWalker&) = delete;
54   PCREWalker& operator=(const PCREWalker&) = delete;
55 };
56 
57 // Called after visiting each of re's children and accumulating
58 // the return values in child_args.  So child_args contains whether
59 // this library mimics PCRE for those subexpressions.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)60 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
61                            bool* child_args, int nchild_args) {
62   // If children failed, so do we.
63   for (int i = 0; i < nchild_args; i++)
64     if (!child_args[i])
65       return false;
66 
67   // Otherwise look for other reasons to fail.
68   switch (re->op()) {
69     // Look for repeated empty string.
70     case kRegexpStar:
71     case kRegexpPlus:
72     case kRegexpQuest:
73       if (CanBeEmptyString(re->sub()[0]))
74         return false;
75       break;
76     case kRegexpRepeat:
77       if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
78         return false;
79       break;
80 
81     // Look for \v
82     case kRegexpLiteral:
83       if (re->rune() == '\v')
84         return false;
85       break;
86 
87     // Look for $ in single-line mode.
88     case kRegexpEndText:
89     case kRegexpEmptyMatch:
90       if (re->parse_flags() & Regexp::WasDollar)
91         return false;
92       break;
93 
94     // Look for ^ in multi-line mode.
95     case kRegexpBeginLine:
96       // No condition: in single-line mode ^ becomes kRegexpBeginText.
97       return false;
98 
99     default:
100       break;
101   }
102 
103   // Not proven guilty.
104   return true;
105 }
106 
107 // Returns whether this regexp's behavior will mimic PCRE's exactly.
MimicsPCRE()108 bool Regexp::MimicsPCRE() {
109   PCREWalker w;
110   return w.Walk(this, true);
111 }
112 
113 
114 // Walker class to compute whether a Regexp can match an empty string.
115 // It is okay to overestimate.  For example, \b\B cannot match an empty
116 // string, because \b and \B are mutually exclusive, but this isn't
117 // that smart and will say it can.  Spurious empty strings
118 // will reduce the number of regexps we sanity check against PCRE,
119 // but they won't break anything.
120 
121 class EmptyStringWalker : public Regexp::Walker<bool> {
122  public:
EmptyStringWalker()123   EmptyStringWalker() {}
124 
125   virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
126                          bool* child_args, int nchild_args);
127 
ShortVisit(Regexp * re,bool a)128   virtual bool ShortVisit(Regexp* re, bool a) {
129     // Should never be called: we use Walk(), not WalkExponential().
130 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
131     LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
132 #endif
133     return a;
134   }
135 
136  private:
137   EmptyStringWalker(const EmptyStringWalker&) = delete;
138   EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
139 };
140 
141 // Called after visiting re's children.  child_args contains the return
142 // value from each of the children's PostVisits (i.e., whether each child
143 // can match an empty string).  Returns whether this clause can match an
144 // empty string.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)145 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
146                                   bool* child_args, int nchild_args) {
147   switch (re->op()) {
148     case kRegexpNoMatch:               // never empty
149     case kRegexpLiteral:
150     case kRegexpAnyChar:
151     case kRegexpAnyByte:
152     case kRegexpCharClass:
153     case kRegexpLiteralString:
154       return false;
155 
156     case kRegexpEmptyMatch:            // always empty
157     case kRegexpBeginLine:             // always empty, when they match
158     case kRegexpEndLine:
159     case kRegexpNoWordBoundary:
160     case kRegexpWordBoundary:
161     case kRegexpBeginText:
162     case kRegexpEndText:
163     case kRegexpStar:                  // can always be empty
164     case kRegexpQuest:
165     case kRegexpHaveMatch:
166       return true;
167 
168     case kRegexpConcat:                // can be empty if all children can
169       for (int i = 0; i < nchild_args; i++)
170         if (!child_args[i])
171           return false;
172       return true;
173 
174     case kRegexpAlternate:             // can be empty if any child can
175       for (int i = 0; i < nchild_args; i++)
176         if (child_args[i])
177           return true;
178       return false;
179 
180     case kRegexpPlus:                  // can be empty if the child can
181     case kRegexpCapture:
182       return child_args[0];
183 
184     case kRegexpRepeat:                // can be empty if child can or is x{0}
185       return child_args[0] || re->min() == 0;
186   }
187   return false;
188 }
189 
190 // Returns whether re can match an empty string.
CanBeEmptyString(Regexp * re)191 static bool CanBeEmptyString(Regexp* re) {
192   EmptyStringWalker w;
193   return w.Walk(re, true);
194 }
195 
196 }  // namespace re2
197