1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Determine whether this library should match PCRE exactly
6 // for a particular Regexp. (If so, the testing framework can
7 // check that it does.)
8 //
9 // This library matches PCRE except in these cases:
10 // * the regexp contains a repetition of an empty string,
11 // like (a*)* or (a*)+. In this case, PCRE will treat
12 // the repetition sequence as ending with an empty string,
13 // while this library does not.
14 // * Perl and PCRE differ on whether \v matches \n.
15 // For historical reasons, this library implements the Perl behavior.
16 // * Perl and PCRE allow $ in one-line mode to match either the very
17 // end of the text or just before a \n at the end of the text.
18 // This library requires it to match only the end of the text.
19 // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20 // match the end of the text if the last character is a \n.
21 // This library does allow it.
22 //
23 // Regexp::MimicsPCRE checks for any of these conditions.
24
25 #include "util/logging.h"
26 #include "re2/regexp.h"
27 #include "re2/walker-inl.h"
28
29 namespace re2 {
30
31 // Returns whether re might match an empty string.
32 static bool CanBeEmptyString(Regexp *re);
33
34 // Walker class to compute whether library handles a regexp
35 // exactly as PCRE would. See comment at top for conditions.
36
37 class PCREWalker : public Regexp::Walker<bool> {
38 public:
PCREWalker()39 PCREWalker() {}
40
41 virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
42 bool* child_args, int nchild_args);
43
ShortVisit(Regexp * re,bool a)44 virtual bool ShortVisit(Regexp* re, bool a) {
45 // Should never be called: we use Walk(), not WalkExponential().
46 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
47 LOG(DFATAL) << "PCREWalker::ShortVisit called";
48 #endif
49 return a;
50 }
51
52 private:
53 PCREWalker(const PCREWalker&) = delete;
54 PCREWalker& operator=(const PCREWalker&) = delete;
55 };
56
57 // Called after visiting each of re's children and accumulating
58 // the return values in child_args. So child_args contains whether
59 // this library mimics PCRE for those subexpressions.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)60 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
61 bool* child_args, int nchild_args) {
62 // If children failed, so do we.
63 for (int i = 0; i < nchild_args; i++)
64 if (!child_args[i])
65 return false;
66
67 // Otherwise look for other reasons to fail.
68 switch (re->op()) {
69 // Look for repeated empty string.
70 case kRegexpStar:
71 case kRegexpPlus:
72 case kRegexpQuest:
73 if (CanBeEmptyString(re->sub()[0]))
74 return false;
75 break;
76 case kRegexpRepeat:
77 if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
78 return false;
79 break;
80
81 // Look for \v
82 case kRegexpLiteral:
83 if (re->rune() == '\v')
84 return false;
85 break;
86
87 // Look for $ in single-line mode.
88 case kRegexpEndText:
89 case kRegexpEmptyMatch:
90 if (re->parse_flags() & Regexp::WasDollar)
91 return false;
92 break;
93
94 // Look for ^ in multi-line mode.
95 case kRegexpBeginLine:
96 // No condition: in single-line mode ^ becomes kRegexpBeginText.
97 return false;
98
99 default:
100 break;
101 }
102
103 // Not proven guilty.
104 return true;
105 }
106
107 // Returns whether this regexp's behavior will mimic PCRE's exactly.
MimicsPCRE()108 bool Regexp::MimicsPCRE() {
109 PCREWalker w;
110 return w.Walk(this, true);
111 }
112
113
114 // Walker class to compute whether a Regexp can match an empty string.
115 // It is okay to overestimate. For example, \b\B cannot match an empty
116 // string, because \b and \B are mutually exclusive, but this isn't
117 // that smart and will say it can. Spurious empty strings
118 // will reduce the number of regexps we sanity check against PCRE,
119 // but they won't break anything.
120
121 class EmptyStringWalker : public Regexp::Walker<bool> {
122 public:
EmptyStringWalker()123 EmptyStringWalker() {}
124
125 virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
126 bool* child_args, int nchild_args);
127
ShortVisit(Regexp * re,bool a)128 virtual bool ShortVisit(Regexp* re, bool a) {
129 // Should never be called: we use Walk(), not WalkExponential().
130 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
131 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
132 #endif
133 return a;
134 }
135
136 private:
137 EmptyStringWalker(const EmptyStringWalker&) = delete;
138 EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
139 };
140
141 // Called after visiting re's children. child_args contains the return
142 // value from each of the children's PostVisits (i.e., whether each child
143 // can match an empty string). Returns whether this clause can match an
144 // empty string.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)145 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
146 bool* child_args, int nchild_args) {
147 switch (re->op()) {
148 case kRegexpNoMatch: // never empty
149 case kRegexpLiteral:
150 case kRegexpAnyChar:
151 case kRegexpAnyByte:
152 case kRegexpCharClass:
153 case kRegexpLiteralString:
154 return false;
155
156 case kRegexpEmptyMatch: // always empty
157 case kRegexpBeginLine: // always empty, when they match
158 case kRegexpEndLine:
159 case kRegexpNoWordBoundary:
160 case kRegexpWordBoundary:
161 case kRegexpBeginText:
162 case kRegexpEndText:
163 case kRegexpStar: // can always be empty
164 case kRegexpQuest:
165 case kRegexpHaveMatch:
166 return true;
167
168 case kRegexpConcat: // can be empty if all children can
169 for (int i = 0; i < nchild_args; i++)
170 if (!child_args[i])
171 return false;
172 return true;
173
174 case kRegexpAlternate: // can be empty if any child can
175 for (int i = 0; i < nchild_args; i++)
176 if (child_args[i])
177 return true;
178 return false;
179
180 case kRegexpPlus: // can be empty if the child can
181 case kRegexpCapture:
182 return child_args[0];
183
184 case kRegexpRepeat: // can be empty if child can or is x{0}
185 return child_args[0] || re->min() == 0;
186 }
187 return false;
188 }
189
190 // Returns whether re can match an empty string.
CanBeEmptyString(Regexp * re)191 static bool CanBeEmptyString(Regexp* re) {
192 EmptyStringWalker w;
193 return w.Walk(re, true);
194 }
195
196 } // namespace re2
197