xref: /aosp_15_r20/external/regex-re2/re2/mimics_pcre.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Determine whether this library should match PCRE exactly
6 // for a particular Regexp.  (If so, the testing framework can
7 // check that it does.)
8 //
9 // This library matches PCRE except in these cases:
10 //   * the regexp contains a repetition of an empty string,
11 //     like (a*)* or (a*)+.  In this case, PCRE will treat
12 //     the repetition sequence as ending with an empty string,
13 //     while this library does not.
14 //   * Perl and PCRE differ on whether \v matches \n.
15 //     For historical reasons, this library implements the Perl behavior.
16 //   * Perl and PCRE allow $ in one-line mode to match either the very
17 //     end of the text or just before a \n at the end of the text.
18 //     This library requires it to match only the end of the text.
19 //   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20 //     match the end of the text if the last character is a \n.
21 //     This library does allow it.
22 //
23 // Regexp::MimicsPCRE checks for any of these conditions.
24 
25 #include "util/util.h"
26 #include "util/logging.h"
27 #include "re2/regexp.h"
28 #include "re2/walker-inl.h"
29 
30 namespace re2 {
31 
32 // Returns whether re might match an empty string.
33 static bool CanBeEmptyString(Regexp *re);
34 
35 // Walker class to compute whether library handles a regexp
36 // exactly as PCRE would.  See comment at top for conditions.
37 
38 class PCREWalker : public Regexp::Walker<bool> {
39  public:
PCREWalker()40   PCREWalker() {}
41   bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
42                  int nchild_args);
43 
ShortVisit(Regexp * re,bool a)44   bool ShortVisit(Regexp* re, bool a) {
45     // Should never be called: we use Walk not WalkExponential.
46     LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
47     return a;
48   }
49 };
50 
51 // Called after visiting each of re's children and accumulating
52 // the return values in child_args.  So child_args contains whether
53 // this library mimics PCRE for those subexpressions.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)54 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
55                            bool* child_args, int nchild_args) {
56   // If children failed, so do we.
57   for (int i = 0; i < nchild_args; i++)
58     if (!child_args[i])
59       return false;
60 
61   // Otherwise look for other reasons to fail.
62   switch (re->op()) {
63     // Look for repeated empty string.
64     case kRegexpStar:
65     case kRegexpPlus:
66     case kRegexpQuest:
67       if (CanBeEmptyString(re->sub()[0]))
68         return false;
69       break;
70     case kRegexpRepeat:
71       if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
72         return false;
73       break;
74 
75     // Look for \v
76     case kRegexpLiteral:
77       if (re->rune() == '\v')
78         return false;
79       break;
80 
81     // Look for $ in single-line mode.
82     case kRegexpEndText:
83     case kRegexpEmptyMatch:
84       if (re->parse_flags() & Regexp::WasDollar)
85         return false;
86       break;
87 
88     // Look for ^ in multi-line mode.
89     case kRegexpBeginLine:
90       // No condition: in single-line mode ^ becomes kRegexpBeginText.
91       return false;
92 
93     default:
94       break;
95   }
96 
97   // Not proven guilty.
98   return true;
99 }
100 
101 // Returns whether this regexp's behavior will mimic PCRE's exactly.
MimicsPCRE()102 bool Regexp::MimicsPCRE() {
103   PCREWalker w;
104   return w.Walk(this, true);
105 }
106 
107 
108 // Walker class to compute whether a Regexp can match an empty string.
109 // It is okay to overestimate.  For example, \b\B cannot match an empty
110 // string, because \b and \B are mutually exclusive, but this isn't
111 // that smart and will say it can.  Spurious empty strings
112 // will reduce the number of regexps we sanity check against PCRE,
113 // but they won't break anything.
114 
115 class EmptyStringWalker : public Regexp::Walker<bool> {
116  public:
EmptyStringWalker()117   EmptyStringWalker() { }
118   bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
119                  bool* child_args, int nchild_args);
120 
ShortVisit(Regexp * re,bool a)121   bool ShortVisit(Regexp* re, bool a) {
122     // Should never be called: we use Walk not WalkExponential.
123     LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
124     return a;
125   }
126 
127  private:
128   EmptyStringWalker(const EmptyStringWalker&) = delete;
129   EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
130 };
131 
132 // Called after visiting re's children.  child_args contains the return
133 // value from each of the children's PostVisits (i.e., whether each child
134 // can match an empty string).  Returns whether this clause can match an
135 // empty string.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)136 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
137                                   bool* child_args, int nchild_args) {
138   switch (re->op()) {
139     case kRegexpNoMatch:               // never empty
140     case kRegexpLiteral:
141     case kRegexpAnyChar:
142     case kRegexpAnyByte:
143     case kRegexpCharClass:
144     case kRegexpLiteralString:
145       return false;
146 
147     case kRegexpEmptyMatch:            // always empty
148     case kRegexpBeginLine:             // always empty, when they match
149     case kRegexpEndLine:
150     case kRegexpNoWordBoundary:
151     case kRegexpWordBoundary:
152     case kRegexpBeginText:
153     case kRegexpEndText:
154     case kRegexpStar:                  // can always be empty
155     case kRegexpQuest:
156     case kRegexpHaveMatch:
157       return true;
158 
159     case kRegexpConcat:                // can be empty if all children can
160       for (int i = 0; i < nchild_args; i++)
161         if (!child_args[i])
162           return false;
163       return true;
164 
165     case kRegexpAlternate:             // can be empty if any child can
166       for (int i = 0; i < nchild_args; i++)
167         if (child_args[i])
168           return true;
169       return false;
170 
171     case kRegexpPlus:                  // can be empty if the child can
172     case kRegexpCapture:
173       return child_args[0];
174 
175     case kRegexpRepeat:                // can be empty if child can or is x{0}
176       return child_args[0] || re->min() == 0;
177   }
178   return false;
179 }
180 
181 // Returns whether re can match an empty string.
CanBeEmptyString(Regexp * re)182 static bool CanBeEmptyString(Regexp* re) {
183   EmptyStringWalker w;
184   return w.Walk(re, true);
185 }
186 
187 }  // namespace re2
188