1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Determine whether this library should match PCRE exactly
6 // for a particular Regexp. (If so, the testing framework can
7 // check that it does.)
8 //
9 // This library matches PCRE except in these cases:
10 // * the regexp contains a repetition of an empty string,
11 // like (a*)* or (a*)+. In this case, PCRE will treat
12 // the repetition sequence as ending with an empty string,
13 // while this library does not.
14 // * Perl and PCRE differ on whether \v matches \n.
15 // For historical reasons, this library implements the Perl behavior.
16 // * Perl and PCRE allow $ in one-line mode to match either the very
17 // end of the text or just before a \n at the end of the text.
18 // This library requires it to match only the end of the text.
19 // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20 // match the end of the text if the last character is a \n.
21 // This library does allow it.
22 //
23 // Regexp::MimicsPCRE checks for any of these conditions.
24
25 #include "util/util.h"
26 #include "util/logging.h"
27 #include "re2/regexp.h"
28 #include "re2/walker-inl.h"
29
30 namespace re2 {
31
32 // Returns whether re might match an empty string.
33 static bool CanBeEmptyString(Regexp *re);
34
35 // Walker class to compute whether library handles a regexp
36 // exactly as PCRE would. See comment at top for conditions.
37
38 class PCREWalker : public Regexp::Walker<bool> {
39 public:
PCREWalker()40 PCREWalker() {}
41 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
42 int nchild_args);
43
ShortVisit(Regexp * re,bool a)44 bool ShortVisit(Regexp* re, bool a) {
45 // Should never be called: we use Walk not WalkExponential.
46 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
47 return a;
48 }
49 };
50
51 // Called after visiting each of re's children and accumulating
52 // the return values in child_args. So child_args contains whether
53 // this library mimics PCRE for those subexpressions.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)54 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
55 bool* child_args, int nchild_args) {
56 // If children failed, so do we.
57 for (int i = 0; i < nchild_args; i++)
58 if (!child_args[i])
59 return false;
60
61 // Otherwise look for other reasons to fail.
62 switch (re->op()) {
63 // Look for repeated empty string.
64 case kRegexpStar:
65 case kRegexpPlus:
66 case kRegexpQuest:
67 if (CanBeEmptyString(re->sub()[0]))
68 return false;
69 break;
70 case kRegexpRepeat:
71 if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
72 return false;
73 break;
74
75 // Look for \v
76 case kRegexpLiteral:
77 if (re->rune() == '\v')
78 return false;
79 break;
80
81 // Look for $ in single-line mode.
82 case kRegexpEndText:
83 case kRegexpEmptyMatch:
84 if (re->parse_flags() & Regexp::WasDollar)
85 return false;
86 break;
87
88 // Look for ^ in multi-line mode.
89 case kRegexpBeginLine:
90 // No condition: in single-line mode ^ becomes kRegexpBeginText.
91 return false;
92
93 default:
94 break;
95 }
96
97 // Not proven guilty.
98 return true;
99 }
100
101 // Returns whether this regexp's behavior will mimic PCRE's exactly.
MimicsPCRE()102 bool Regexp::MimicsPCRE() {
103 PCREWalker w;
104 return w.Walk(this, true);
105 }
106
107
108 // Walker class to compute whether a Regexp can match an empty string.
109 // It is okay to overestimate. For example, \b\B cannot match an empty
110 // string, because \b and \B are mutually exclusive, but this isn't
111 // that smart and will say it can. Spurious empty strings
112 // will reduce the number of regexps we sanity check against PCRE,
113 // but they won't break anything.
114
115 class EmptyStringWalker : public Regexp::Walker<bool> {
116 public:
EmptyStringWalker()117 EmptyStringWalker() { }
118 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
119 bool* child_args, int nchild_args);
120
ShortVisit(Regexp * re,bool a)121 bool ShortVisit(Regexp* re, bool a) {
122 // Should never be called: we use Walk not WalkExponential.
123 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
124 return a;
125 }
126
127 private:
128 EmptyStringWalker(const EmptyStringWalker&) = delete;
129 EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
130 };
131
132 // Called after visiting re's children. child_args contains the return
133 // value from each of the children's PostVisits (i.e., whether each child
134 // can match an empty string). Returns whether this clause can match an
135 // empty string.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)136 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
137 bool* child_args, int nchild_args) {
138 switch (re->op()) {
139 case kRegexpNoMatch: // never empty
140 case kRegexpLiteral:
141 case kRegexpAnyChar:
142 case kRegexpAnyByte:
143 case kRegexpCharClass:
144 case kRegexpLiteralString:
145 return false;
146
147 case kRegexpEmptyMatch: // always empty
148 case kRegexpBeginLine: // always empty, when they match
149 case kRegexpEndLine:
150 case kRegexpNoWordBoundary:
151 case kRegexpWordBoundary:
152 case kRegexpBeginText:
153 case kRegexpEndText:
154 case kRegexpStar: // can always be empty
155 case kRegexpQuest:
156 case kRegexpHaveMatch:
157 return true;
158
159 case kRegexpConcat: // can be empty if all children can
160 for (int i = 0; i < nchild_args; i++)
161 if (!child_args[i])
162 return false;
163 return true;
164
165 case kRegexpAlternate: // can be empty if any child can
166 for (int i = 0; i < nchild_args; i++)
167 if (child_args[i])
168 return true;
169 return false;
170
171 case kRegexpPlus: // can be empty if the child can
172 case kRegexpCapture:
173 return child_args[0];
174
175 case kRegexpRepeat: // can be empty if child can or is x{0}
176 return child_args[0] || re->min() == 0;
177 }
178 return false;
179 }
180
181 // Returns whether re can match an empty string.
CanBeEmptyString(Regexp * re)182 static bool CanBeEmptyString(Regexp* re) {
183 EmptyStringWalker w;
184 return w.Walk(re, true);
185 }
186
187 } // namespace re2
188